Mirror of https://github.com/Azure/MachineLearningNotebooks.git (synced 2025-12-22 18:42:41 -05:00)

Compare commits: 22 commits in `release_up...jeffshep/s`
| Author | SHA1 | Date |
|---|---|---|
|  | d0961b98bf |  |
|  | 302589b7f9 |  |
|  | cc85949d6d |  |
|  | 3a1824e3ad |  |
|  | 579643326d |  |
|  | 14f76f227e |  |
|  | 25baf5203a |  |
|  | 1178fcb0ba |  |
|  | e4d84c8e45 |  |
|  | 7a3ab1e44c |  |
|  | 598a293dfa |  |
|  | 40b3068462 |  |
|  | 0ecbbbce75 |  |
|  | 9b1e130d18 |  |
|  | 0e17b33d2a |  |
|  | 34d80abd26 |  |
|  | 249278ab77 |  |
|  | 25fdb17f80 |  |
|  | 3a02a27f1e |  |
|  | 4eed9d529f |  |
|  | f344d410a2 |  |
|  | 9dc1228063 |  |
@@ -1,6 +1,8 @@
|
||||
# Azure Machine Learning Python SDK notebooks
|
||||
|
||||
> A community-driven repository of examples that use MLflow for tracking can be found at https://github.com/Azure/azureml-examples.
|
||||
|
||||
**With the introduction of AzureML SDK v2, this samples repository for the v1 SDK is now deprecated and will not be monitored or updated. Users are encouraged to visit the [v2 SDK samples repository](https://github.com/Azure/azureml-examples) instead for up-to-date and enhanced examples of how to build, train, and deploy machine learning models with AzureML's newest features.**
|
||||
|
||||
|
||||
Welcome to the Azure Machine Learning Python SDK notebooks repository!
|
||||
|
||||
|
||||
@@ -103,7 +103,7 @@
|
||||
"source": [
|
||||
"import azureml.core\n",
|
||||
"\n",
|
||||
"print(\"This notebook was created using version 1.48.0 of the Azure ML SDK\")\n",
|
||||
"print(\"This notebook was created using version 1.51.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
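If the printed version is older than the 1.51.0 release these notebooks were updated for, the SDK can be upgraded in place before continuing. A minimal sketch, assuming a pip-managed notebook environment (the extras shown are optional and not taken from the diff above):

```python
# Hypothetical upgrade step: pin the AzureML SDK to the 1.51.0 line referenced by these
# notebooks. Restart the kernel after the install so the new version is picked up.
%pip install --upgrade "azureml-sdk[automl,notebooks]~=1.51.0"
```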
|
||||
|
||||
@@ -6,7 +6,8 @@ dependencies:
|
||||
- fairlearn>=0.6.2
|
||||
- joblib
|
||||
- liac-arff
|
||||
- raiwidgets~=0.23.0
|
||||
- raiwidgets~=0.26.0
|
||||
- itsdangerous==2.0.1
|
||||
- markupsafe<2.1.0
|
||||
- protobuf==3.20.0
|
||||
- numpy<1.24.0
|
||||
|
||||
@@ -6,7 +6,8 @@ dependencies:
|
||||
- fairlearn>=0.6.2
|
||||
- joblib
|
||||
- liac-arff
|
||||
- raiwidgets~=0.23.0
|
||||
- raiwidgets~=0.26.0
|
||||
- itsdangerous==2.0.1
|
||||
- markupsafe<2.1.0
|
||||
- protobuf==3.20.0
|
||||
- numpy<1.24.0
|
||||
|
||||
@@ -8,14 +8,18 @@ dependencies:
|
||||
# Azure ML only supports 3.7.0 and later.
|
||||
- pip==22.3.1
|
||||
- python>=3.7,<3.9
|
||||
- conda-forge::fbprophet==0.7.1
|
||||
- pandas==1.1.5
|
||||
- scipy==1.5.3
|
||||
- Cython==0.29.14
|
||||
- tqdm==4.65.0
|
||||
|
||||
- pip:
|
||||
# Required packages for AzureML execution, history, and data preparation.
|
||||
- azureml-widgets~=latest
|
||||
- azureml-defaults~=latest
|
||||
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/latest/validated_win32_requirements.txt [--no-deps]
|
||||
- azureml-widgets~=1.51.0
|
||||
- azureml-defaults~=1.51.0
|
||||
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.51.0/validated_win32_requirements.txt [--no-deps]
|
||||
- matplotlib==3.6.2
|
||||
- xgboost==1.3.3
|
||||
- arch==4.14
|
||||
- mlflow-skinny==1.30.0
|
||||
|
||||
- cmdstanpy==0.9.5
|
||||
- setuptools-git==1.2
|
||||
|
||||
@@ -6,7 +6,7 @@ channels:
|
||||
dependencies:
|
||||
# The python interpreter version.
|
||||
# Azure ML only supports 3.7 and later.
|
||||
- pip==20.1.1
|
||||
- pip==22.3.1
|
||||
- python>=3.7,<3.9
|
||||
- matplotlib==3.2.1
|
||||
- numpy>=1.21.6,<=1.22.3
|
||||
@@ -20,16 +20,13 @@ dependencies:
|
||||
- pytorch::pytorch=1.11.0
|
||||
- cudatoolkit=10.1.243
|
||||
- notebook
|
||||
- jinja2<=2.11.2
|
||||
- markupsafe<2.1.0
|
||||
|
||||
- pip:
|
||||
# Required packages for AzureML execution, history, and data preparation.
|
||||
- azureml-widgets~=1.48.0
|
||||
- azureml-defaults~=1.48.0
|
||||
- azureml-widgets~=1.51.0
|
||||
- azureml-defaults~=1.51.0
|
||||
- pytorch-transformers==1.0.0
|
||||
- spacy==2.2.4
|
||||
- pystan==2.19.1.1
|
||||
- https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz
|
||||
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.48.0/validated_linux_requirements.txt [--no-deps]
|
||||
- arch==4.14
|
||||
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.51.0/validated_linux_requirements.txt [--no-deps]
|
||||
|
||||
@@ -6,7 +6,7 @@ channels:
|
||||
dependencies:
|
||||
# The python interpreter version.
|
||||
# Currently Azure ML only supports 3.7 and later.
|
||||
- pip==20.1.1
|
||||
- pip==22.3.1
|
||||
- python>=3.7,<3.9
|
||||
- matplotlib==3.2.1
|
||||
- numpy>=1.21.6,<=1.22.3
|
||||
@@ -16,20 +16,17 @@ dependencies:
|
||||
- scikit-learn==0.22.1
|
||||
- py-xgboost<=1.3.3
|
||||
- holidays==0.10.3
|
||||
- conda-forge::fbprophet==0.7.1
|
||||
- pytorch::pytorch=1.11.0
|
||||
- cudatoolkit=9.0
|
||||
- notebook
|
||||
- jinja2<=2.11.2
|
||||
- markupsafe<2.1.0
|
||||
|
||||
- pip:
|
||||
# Required packages for AzureML execution, history, and data preparation.
|
||||
- azureml-widgets~=1.48.0
|
||||
- azureml-defaults~=1.48.0
|
||||
- azureml-widgets~=1.51.0
|
||||
- azureml-defaults~=1.51.0
|
||||
- pytorch-transformers==1.0.0
|
||||
- spacy==2.2.4
|
||||
- pystan==2.19.1.1
|
||||
- fbprophet==0.7.1
|
||||
- https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz
|
||||
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.48.0/validated_darwin_requirements.txt [--no-deps]
|
||||
- arch==4.14
|
||||
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.51.0/validated_darwin_requirements.txt [--no-deps]
|
||||
|
||||
@@ -712,7 +712,9 @@
|
||||
"from azureml.core.model import Model\n",
|
||||
"from azureml.core.environment import Environment\n",
|
||||
"\n",
|
||||
"inference_config = InferenceConfig(entry_script=script_file_name)\n",
|
||||
"inference_config = InferenceConfig(\n",
|
||||
" environment=best_run.get_environment(), entry_script=script_file_name\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"aciconfig = AciWebservice.deploy_configuration(\n",
|
||||
" cpu_cores=2,\n",
|
||||
|
||||
@@ -97,7 +97,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"This notebook was created using version 1.48.0 of the Azure ML SDK\")\n",
|
||||
"print(\"This notebook was created using version 1.51.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -97,7 +97,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"This notebook was created using version 1.48.0 of the Azure ML SDK\")\n",
|
||||
"print(\"This notebook was created using version 1.51.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
@@ -454,10 +454,13 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Note:** Not all datasets produce a y_transformer. The dataset used in the current notebook requires a transformer as the y column data is categorical."
|
||||
"**Note:** Not all datasets produce a y_transformer. The dataset used in the current notebook requires a transformer as the y column data is categorical. \n",
|
||||
"\n",
|
||||
"We will go ahead and download the mlflow transformer model and use it to transform test data that can be used for further experimentation below. To run the commented code, make sure the environment requirement is satisfied. You can go ahead and create the environment from the `conda.yaml` file under `/outputs/featurization/pipeline/` and run the given code in it."
|
||||
]
|
||||
},
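As a complement to the note above, here is a minimal, hypothetical sketch of registering that environment with the SDK. The file path mirrors the `/outputs/featurization/pipeline/` location mentioned in the text, and the environment name is an assumption:

```python
# Sketch only: build an AzureML Environment from the conda.yaml produced by the
# featurization pipeline, so the commented transformer code can run inside it.
from azureml.core import Environment

featurization_env = Environment.from_conda_specification(
    name="automl-featurization-env",  # assumed name, not from the notebook
    file_path="outputs/featurization/pipeline/conda.yaml",
)
```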
|
||||
{
|
||||
@@ -466,7 +469,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.automl.core.shared.constants import Transformers\n",
|
||||
"''' from azureml.automl.core.shared.constants import Transformers\n",
|
||||
"\n",
|
||||
"transformers = mlflow.sklearn.load_model(uri) # Using method 1\n",
|
||||
"data_transformers = transformers.get_transformers()\n",
|
||||
@@ -474,14 +477,15 @@
|
||||
"y_transformer = data_transformers[Transformers.Y_TRANSFORMER]\n",
|
||||
"\n",
|
||||
"X_test = x_transformer.transform(X_test_data)\n",
|
||||
"y_test = y_transformer.transform(y_test_data)"
|
||||
"y_test = y_transformer.transform(y_test_data) '''"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Run the following cell to see the featurization summary of X and y transformers. "
|
||||
"Run the following cell to see the featurization summary of X and y transformers. Uncomment to use. "
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -490,10 +494,10 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"X_data_summary = x_transformer.get_featurization_summary(is_user_friendly=False)\n",
|
||||
"''' X_data_summary = x_transformer.get_featurization_summary(is_user_friendly=False)\n",
|
||||
"\n",
|
||||
"summary_df = pd.DataFrame.from_records(X_data_summary)\n",
|
||||
"summary_df"
|
||||
"summary_df '''"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -544,10 +548,11 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Another way to load the data is to go to the above autofeaturization experiment and check for the featurized dataset ids under `Output datasets`. Uncomment and replace them accordingly below to use."
|
||||
"Another way to load the data is to go to the above autofeaturization experiment and check for the featurized dataset ids under `Output datasets`. Uncomment and replace them accordingly below, to use."
|
||||
]
|
||||
},
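For reference, a small hedged sketch of what loading by those ids typically looks like; the id strings are placeholders to be replaced with the values shown under `Output datasets`:

```python
# Sketch only: load the featurized train/test datasets by their registered ids.
from azureml.core import Dataset, Workspace

ws = Workspace.from_config()
train_featurized = Dataset.get_by_id(ws, id="<featurized-train-dataset-id>")  # placeholder id
test_featurized = Dataset.get_by_id(ws, id="<featurized-test-dataset-id>")    # placeholder id

X_train = train_featurized.to_pandas_dataframe()
X_test = test_featurized.to_pandas_dataframe()
```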
|
||||
{
|
||||
@@ -597,10 +602,20 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Here we are passing our training data to the lightgbm classifier, any custom model can be used with your data."
|
||||
"Here we are passing our training data to the lightgbm classifier, any custom model can be used with your data. Let us first install lightgbm."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"! pip install lightgbm"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -612,11 +627,27 @@
|
||||
"import lightgbm as lgb\n",
|
||||
"\n",
|
||||
"model = lgb.LGBMClassifier(learning_rate=0.08,max_depth=-5,random_state=42)\n",
|
||||
"model.fit(X_train, y_train, sample_weight=sample_weight, eval_set=[(X_test, y_test),(X_train, y_train)],\n",
|
||||
" verbose=20,eval_metric='logloss')\n",
|
||||
"\n",
|
||||
"model.fit(X_train, y_train, sample_weight=sample_weight)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Once training is done, the test data obtained after transforming from the above downloaded transformer can be used to calculate the accuracy "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print('Training accuracy {:.4f}'.format(model.score(X_train, y_train)))\n",
|
||||
"print('Testing accuracy {:.4f}'.format(model.score(X_test, y_test)))"
|
||||
"\n",
|
||||
"# Uncomment below to test the model on test data \n",
|
||||
"# print('Testing accuracy {:.4f}'.format(model.score(X_test, y_test)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -654,45 +685,8 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"y_pred = model.predict(X_test)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Calculate metrics for the prediction\n",
|
||||
"\n",
|
||||
"Now visualize the data on a scatter plot to show what our truth (actual) values are compared to the predicted values \n",
|
||||
"from the trained model that was returned."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.metrics import confusion_matrix\n",
|
||||
"from matplotlib import pyplot as plt\n",
|
||||
"import numpy as np\n",
|
||||
"import itertools\n",
|
||||
"\n",
|
||||
"cf =confusion_matrix(y_test,y_pred)\n",
|
||||
"plt.imshow(cf,cmap=plt.cm.Blues,interpolation='nearest')\n",
|
||||
"plt.colorbar()\n",
|
||||
"plt.title('Confusion Matrix')\n",
|
||||
"plt.xlabel('Predicted')\n",
|
||||
"plt.ylabel('Actual')\n",
|
||||
"class_labels = ['False','True']\n",
|
||||
"tick_marks = np.arange(len(class_labels))\n",
|
||||
"plt.xticks(tick_marks,class_labels)\n",
|
||||
"plt.yticks([-0.5,0,1,1.5],['','False','True',''])\n",
|
||||
"# plotting text value inside cells\n",
|
||||
"thresh = cf.max() / 2.\n",
|
||||
"for i,j in itertools.product(range(cf.shape[0]),range(cf.shape[1])):\n",
|
||||
" plt.text(j,i,format(cf[i,j],'d'),horizontalalignment='center',color='white' if cf[i,j] >thresh else 'black')\n",
|
||||
"plt.show()"
|
||||
"# Uncomment below to test the model on test data\n",
|
||||
"# y_pred = model.predict(X_test)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -1,9 +1,21 @@
|
||||
name: azure_automl_experimental
|
||||
dependencies:
|
||||
# The python interpreter version.
|
||||
<<<<<<< HEAD
|
||||
# Currently Azure ML only supports 3.6.0 and later.
|
||||
- pip<=20.2.4
|
||||
- python>=3.6.0,<3.10
|
||||
- cython==0.29.14
|
||||
- urllib3==1.26.7
|
||||
- PyJWT < 2.0.0
|
||||
- numpy==1.22.3
|
||||
- pywin32==227
|
||||
- cryptography<37.0.0
|
||||
=======
|
||||
# Currently Azure ML only supports 3.7.0 and later.
|
||||
- pip<=22.3.1
|
||||
- python>=3.7.0,<3.10
|
||||
- python>=3.7.0,<3.11
|
||||
>>>>>>> 4671acd451ce979c3cebcd3917804861a333b710
|
||||
|
||||
- pip:
|
||||
# Required packages for AzureML execution, history, and data preparation.
|
||||
@@ -13,3 +25,4 @@ dependencies:
|
||||
- azureml-mlflow
|
||||
- pandas
|
||||
- mlflow
|
||||
- docker<6.0.0
|
||||
|
||||
@@ -4,10 +4,10 @@ channels:
|
||||
- main
|
||||
dependencies:
|
||||
# The python interpreter version.
|
||||
# Currently Azure ML only supports 3.6.0 and later.
|
||||
# Currently Azure ML only supports 3.7.0 and later.
|
||||
- pip<=20.2.4
|
||||
- nomkl
|
||||
- python>=3.6.0,<3.10
|
||||
- python>=3.7.0,<3.11
|
||||
- urllib3==1.26.7
|
||||
- PyJWT < 2.0.0
|
||||
- numpy>=1.21.6,<=1.22.3
|
||||
|
||||
@@ -92,7 +92,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"This notebook was created using version 1.48.0 of the Azure ML SDK\")\n",
|
||||
"print(\"This notebook was created using version 1.51.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -91,7 +91,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"This notebook was created using version 1.48.0 of the Azure ML SDK\")\n",
|
||||
"print(\"This notebook was created using version 1.51.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -122,7 +122,10 @@ def calculate_scores_and_build_plots(
|
||||
input_dir: str, output_dir: str, automl_settings: Dict[str, Any]
|
||||
):
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
grains = automl_settings.get(constants.TimeSeries.TIME_SERIES_ID_COLUMN_NAMES)
|
||||
grains = automl_settings.get(
|
||||
constants.TimeSeries.TIME_SERIES_ID_COLUMN_NAMES,
|
||||
automl_settings.get(constants.TimeSeries.GRAIN_COLUMN_NAMES, None),
|
||||
)
|
||||
time_column_name = automl_settings.get(constants.TimeSeries.TIME_COLUMN_NAME)
|
||||
if grains is None:
|
||||
grains = []
|
||||
|
||||
@@ -33,6 +33,7 @@
|
||||
"For this notebook we are using a synthetic dataset to demonstrate the back testing in many model scenario. This allows us to check historical performance of AutoML on a historical data. To do that we step back on the backtesting period by the data set several times and split the data to train and test sets. Then these data sets are used for training and evaluation of model.<br>\n",
|
||||
"\n",
|
||||
"Thus, it is a quick way of evaluating AutoML as if it was in production. Here, we do not test historical performance of a particular model, for this see the [notebook](../forecasting-backtest-single-model/auto-ml-forecasting-backtest-single-model.ipynb). Instead, the best model for every backtest iteration can be different since AutoML chooses the best model for a given training set.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"**NOTE: There are limits on how many runs we can do in parallel per workspace, and we currently recommend to set the parallelism to maximum of 320 runs per experiment per workspace. If users want to have more parallelism and increase this limit they might encounter Too Many Requests errors (HTTP 429).**"
|
||||
@@ -43,7 +44,7 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Prerequisites\n",
|
||||
"You'll need to create a compute Instance by following the instructions in the [EnvironmentSetup.md](../Setup_Resources/EnvironmentSetup.md)."
|
||||
"You'll need to create a compute Instance by following [these](https://learn.microsoft.com/en-us/azure/machine-learning/v1/how-to-create-manage-compute-instance?tabs=python) instructions."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -313,22 +314,37 @@
|
||||
"source": [
|
||||
"### Set up training parameters\n",
|
||||
"\n",
|
||||
"This dictionary defines the AutoML and many models settings. For this forecasting task we need to define several settings including the name of the time column, the maximum forecast horizon, and the partition column name definition. Please note, that in this case we are setting grain_column_names to be the time series ID column plus iteration, because we want to train a separate model for each time series and iteration.\n",
|
||||
"We need to provide ``ForecastingParameters``, ``AutoMLConfig`` and ``ManyModelsTrainParameters`` objects. For the forecasting task we also need to define several settings including the name of the time column, the maximum forecast horizon, and the partition column name(s) definition.\n",
|
||||
"\n",
|
||||
"#### ``ForecastingParameters`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **forecast_horizon** | The forecast horizon is how many periods forward you would like to forecast. This integer horizon is in units of the timeseries frequency (e.g. daily, weekly). Periods are inferred from your data. |\n",
|
||||
"| **time_column_name** | The name of your time column. |\n",
|
||||
"| **time_series_id_column_names** | The column names used to uniquely identify timeseries in data that has multiple rows with the same timestamp. |\n",
|
||||
"| **cv_step_size** | Number of periods between two consecutive cross-validation folds. The default value is \\\"auto\\\", in which case AutoMl determines the cross-validation step size automatically, if a validation set is not provided. Or users could specify an integer value. |\n",
|
||||
"\n",
|
||||
"#### ``AutoMLConfig`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **task** | forecasting |\n",
|
||||
"| **primary_metric** | This is the metric that you want to optimize.<br> Forecasting supports the following primary metrics <br><i>normalized_root_mean_squared_error</i><br><i>normalized_mean_absolute_error</i> |\n",
|
||||
"| **primary_metric** | This is the metric that you want to optimize.<br> Forecasting supports the following primary metrics <br><i>spearman_correlation</i><br><i>normalized_root_mean_squared_error</i><br><i>r2_score</i><br><i>normalized_mean_absolute_error</i> |\n",
|
||||
"| **blocked_models** | Blocked models won't be used by AutoML. |\n",
|
||||
"| **iteration_timeout_minutes** | Maximum amount of time in minutes that the model can train. This is optional but provides customers with greater control on exit criteria. |\n",
|
||||
"| **iterations** | Number of models to train. This is optional but provides customers with greater control on exit criteria. |\n",
|
||||
"| **experiment_timeout_hours** | Maximum amount of time in hours that the experiment can take before it terminates. This is optional but provides customers with greater control on exit criteria. |\n",
|
||||
"| **experiment_timeout_hours** | Maximum amount of time in hours that each experiment can take before it terminates. This is optional but provides customers with greater control on exit criteria. **It does not control the overall timeout for the pipeline run, instead controls the timeout for each training run per partitioned time series.** |\n",
|
||||
"| **label_column_name** | The name of the label column. |\n",
|
||||
"| **forecast_horizon** | The forecast horizon is how many periods forward you would like to forecast. This integer horizon is in units of the timeseries frequency (e.g. daily, weekly). Periods are inferred from your data. |\n",
|
||||
"| **n_cross_validations** | Number of cross validation splits. The default value is \"auto\", in which case AutoMl determines the number of cross-validations automatically, if a validation set is not provided. Or users could specify an integer value. Rolling Origin Validation is used to split time-series in a temporally consistent way. |\n",
|
||||
"|**cv_step_size**|Number of periods between two consecutive cross-validation folds. The default value is \"auto\", in which case AutoMl determines the cross-validation step size automatically, if a validation set is not provided. Or users could specify an integer value.\n",
|
||||
"| **time_column_name** | The name of your time column. |\n",
|
||||
"| **time_series_id_column_names** | The column names used to uniquely identify timeseries in data that has multiple rows with the same timestamp. |\n",
|
||||
"| **n_cross_validations** | Number of cross validation splits. The default value is \\\"auto\\\", in which case AutoMl determines the number of cross-validations automatically, if a validation set is not provided. Or users could specify an integer value. Rolling Origin Validation is used to split time-series in a temporally consistent way. |\n",
|
||||
"| **enable_early_stopping** | Flag to enable early termination if the primary metric is no longer improving. |\n",
|
||||
"| **enable_engineered_explanations** | Engineered feature explanations will be downloaded if enable_engineered_explanations flag is set to True. By default it is set to False to save storage space. |\n",
|
||||
"| **track_child_runs** | Flag to disable tracking of child runs. Only best run is tracked if the flag is set to False (this includes the model and metrics of the run). |\n",
|
||||
"| **pipeline_fetch_max_batch_size** | Determines how many pipelines (training algorithms) to fetch at a time for training, this helps reduce throttling when training at large scale. |\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"#### ``ManyModelsTrainParameters`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **automl_settings** | The ``AutoMLConfig`` object defined above. |\n",
|
||||
"| **partition_column_names** | The names of columns used to group your models. For timeseries, the groups must not split up individual time-series. That is, each group must contain one or more whole time-series. |"
|
||||
]
|
||||
},
|
||||
@@ -345,22 +361,30 @@
|
||||
"from azureml.train.automl.runtime._many_models.many_models_parameters import (\n",
|
||||
" ManyModelsTrainParameters,\n",
|
||||
")\n",
|
||||
"from azureml.automl.core.forecasting_parameters import ForecastingParameters\n",
|
||||
"from azureml.train.automl.automlconfig import AutoMLConfig\n",
|
||||
"\n",
|
||||
"partition_column_names = [TIME_SERIES_ID_COLNAME, \"backtest_iteration\"]\n",
|
||||
"automl_settings = {\n",
|
||||
" \"task\": \"forecasting\",\n",
|
||||
" \"primary_metric\": \"normalized_root_mean_squared_error\",\n",
|
||||
" \"iteration_timeout_minutes\": 10, # This needs to be changed based on the dataset. We ask customer to explore how long training is taking before settings this value\n",
|
||||
" \"iterations\": 15,\n",
|
||||
" \"experiment_timeout_hours\": 0.25, # This also needs to be changed based on the dataset. For larger data set this number needs to be bigger.\n",
|
||||
" \"label_column_name\": TARGET_COLNAME,\n",
|
||||
" \"n_cross_validations\": \"auto\", # Feel free to set to a small integer (>=2) if runtime is an issue.\n",
|
||||
" \"cv_step_size\": \"auto\",\n",
|
||||
" \"time_column_name\": TIME_COLNAME,\n",
|
||||
" \"forecast_horizon\": 6,\n",
|
||||
" \"time_series_id_column_names\": partition_column_names,\n",
|
||||
" \"track_child_runs\": False,\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"forecasting_parameters = ForecastingParameters(\n",
|
||||
" time_column_name=TIME_COLNAME,\n",
|
||||
" forecast_horizon=6,\n",
|
||||
" time_series_id_column_names=partition_column_names,\n",
|
||||
" cv_step_size=\"auto\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"automl_settings = AutoMLConfig(\n",
|
||||
" task=\"forecasting\",\n",
|
||||
" primary_metric=\"normalized_root_mean_squared_error\",\n",
|
||||
" iteration_timeout_minutes=10,\n",
|
||||
" iterations=15,\n",
|
||||
" experiment_timeout_hours=0.25,\n",
|
||||
" label_column_name=TARGET_COLNAME,\n",
|
||||
" n_cross_validations=\"auto\", # Feel free to set to a small integer (>=2) if runtime is an issue.\n",
|
||||
" track_child_runs=False,\n",
|
||||
" forecasting_parameters=forecasting_parameters,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"mm_paramters = ManyModelsTrainParameters(\n",
|
||||
" automl_settings=automl_settings, partition_column_names=partition_column_names\n",
|
||||
@@ -389,7 +413,14 @@
|
||||
"| **train_pipeline_parameters** | The set of configuration parameters defined in the previous section. |\n",
|
||||
"| **run_invocation_timeout** | Maximum amount of time in seconds that the ``ParallelRunStep`` class is allowed. This is optional but provides customers with greater control on exit criteria. This must be greater than ``experiment_timeout_hours`` by at least 300 seconds. |\n",
|
||||
"\n",
|
||||
"Calling this method will create a new aggregated dataset which is generated dynamically on pipeline execution."
|
||||
"Calling this method will create a new aggregated dataset which is generated dynamically on pipeline execution.\n",
|
||||
"\n",
|
||||
"**Note**: Total time taken for the **training step** in the pipeline to complete = $ \\frac{t}{ p \\times n } \\times ts $\n",
|
||||
"where,\n",
|
||||
"- $ t $ is time taken for training one partition (can be viewed in the training logs)\n",
|
||||
"- $ p $ is ``process_count_per_node``\n",
|
||||
"- $ n $ is ``node_count``\n",
|
||||
"- $ ts $ is total number of partitions in time series based on ``partition_column_names``"
|
||||
]
|
||||
},
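A quick worked example of the estimate above; the numbers are illustrative assumptions, not values from the notebook:

```python
# t / (p * n) * ts, using the symbols defined in the note above.
t = 120    # seconds to train one partition (read from the training logs)
p = 4      # process_count_per_node
n = 2      # node_count
ts = 400   # total number of partitions implied by partition_column_names

estimated_seconds = t / (p * n) * ts  # 120 / 8 * 400 = 6000 seconds
print(f"Estimated training step time: {estimated_seconds / 3600:.2f} hours")  # ~1.67 hours
```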
|
||||
{
|
||||
@@ -492,25 +523,31 @@
|
||||
"source": [
|
||||
"For many models we need to provide the ManyModelsInferenceParameters object.\n",
|
||||
"\n",
|
||||
"#### ManyModelsInferenceParameters arguments\n",
|
||||
"#### ``ManyModelsInferenceParameters`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **partition_column_names** | List of column names that identifies groups. |\n",
|
||||
"| **target_column_name** | \\[Optional\\] Column name only if the inference dataset has the target. |\n",
|
||||
"| **time_column_name** | Column name only if it is timeseries. |\n",
|
||||
"| **many_models_run_id** | \\[Optional\\] Many models pipeline run id where models were trained. |\n",
|
||||
"| **partition_column_names** | List of column names that identifies groups. |\n",
|
||||
"| **target_column_name** | \\[Optional] Column name only if the inference dataset has the target. |\n",
|
||||
"| **time_column_name** | \\[Optional] Time column name only if it is timeseries. |\n",
|
||||
"| **inference_type** | \\[Optional] Which inference method to use on the model. Possible values are 'forecast', 'predict_proba', and 'predict'. |\n",
|
||||
"| **forecast_mode** | \\[Optional] The type of forecast to be used, either 'rolling' or 'recursive'; defaults to 'recursive'. |\n",
|
||||
"| **step** | \\[Optional] Number of periods to advance the forecasting window in each iteration **(for rolling forecast only)**; defaults to 1. |\n",
|
||||
"\n",
|
||||
"#### get_many_models_batch_inference_steps arguments\n",
|
||||
"#### ``get_many_models_batch_inference_steps`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **experiment** | The experiment used for inference run. |\n",
|
||||
"| **inference_data** | The data to use for inferencing. It should be the same schema as used for training.\n",
|
||||
"| **compute_target** | The compute target that runs the inference pipeline.|\n",
|
||||
"| **compute_target** | The compute target that runs the inference pipeline. |\n",
|
||||
"| **node_count** | The number of compute nodes to be used for running the user script. We recommend to start with the number of cores per node (varies by compute sku). |\n",
|
||||
"| **process_count_per_node** | The number of processes per node.\n",
|
||||
"| **train_run_id** | \\[Optional\\] The run id of the hierarchy training, by default it is the latest successful training many model run in the experiment. |\n",
|
||||
"| **train_experiment_name** | \\[Optional\\] The train experiment that contains the train pipeline. This one is only needed when the train pipeline is not in the same experiement as the inference pipeline. |\n",
|
||||
"| **process_count_per_node** | \\[Optional\\] The number of processes per node, by default it's 4. |"
|
||||
"| **process_count_per_node** | \\[Optional] The number of processes per node. By default it's 2 (should be at most half of the number of cores in a single node of the compute cluster that will be used for the experiment).\n",
|
||||
"| **inference_pipeline_parameters** | \\[Optional] The ``ManyModelsInferenceParameters`` object defined above. |\n",
|
||||
"| **append_row_file_name** | \\[Optional] The name of the output file (optional, default value is 'parallel_run_step.txt'). Supports 'txt' and 'csv' file extension. A 'txt' file extension generates the output in 'txt' format with space as separator without column names. A 'csv' file extension generates the output in 'csv' format with comma as separator and with column names. |\n",
|
||||
"| **train_run_id** | \\[Optional] The run id of the **training pipeline**. By default it is the latest successful training pipeline run in the experiment. |\n",
|
||||
"| **train_experiment_name** | \\[Optional] The train experiment that contains the train pipeline. This one is only needed when the train pipeline is not in the same experiement as the inference pipeline. |\n",
|
||||
"| **run_invocation_timeout** | \\[Optional] Maximum amount of time in seconds that the ``ParallelRunStep`` class is allowed. This is optional but provides customers with greater control on exit criteria. |\n",
|
||||
"| **output_datastore** | \\[Optional] The ``Datastore`` or ``OutputDatasetConfig`` to be used for output. If specified any pipeline output will be written to that location. If unspecified the default datastore will be used. |\n",
|
||||
"| **arguments** | \\[Optional] Arguments to be passed to inference script. Possible argument is '--forecast_quantiles' followed by quantile values. |"
|
||||
]
|
||||
},
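Putting the two tables together, a hedged sketch of the inference setup follows. The import paths mirror the training-side imports earlier in this diff, the argument names come from the tables above, and the variables `ws`, `experiment`, `inference_dataset`, and `compute_target` are assumptions to be replaced with your own objects:

```python
# Sketch only: configure many-models batch inference with the arguments described above.
from azureml.contrib.automl.pipeline.steps import AutoMLPipelineBuilder
from azureml.pipeline.core import Pipeline
from azureml.train.automl.runtime._many_models.many_models_parameters import (
    ManyModelsInferenceParameters,
)

mm_inference_parameters = ManyModelsInferenceParameters(
    partition_column_names=partition_column_names,
    time_column_name=TIME_COLNAME,
    target_column_name=TARGET_COLNAME,  # optional: only if the inference data has the target
)

inference_steps = AutoMLPipelineBuilder.get_many_models_batch_inference_steps(
    experiment=experiment,               # assumed Experiment object
    inference_data=inference_dataset,    # same schema as the training data
    compute_target=compute_target,
    node_count=2,
    process_count_per_node=2,
    run_invocation_timeout=300,
    inference_pipeline_parameters=mm_inference_parameters,
)

inference_pipeline = Pipeline(ws, steps=inference_steps)
inference_run = experiment.submit(inference_pipeline)
```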
|
||||
{
|
||||
@@ -629,7 +666,9 @@
|
||||
"backtesting_results = \"backtesting_mm_results\"\n",
|
||||
"os.makedirs(backtesting_results, exist_ok=True)\n",
|
||||
"calculate_scores_and_build_plots(\n",
|
||||
" forecasting_results_name, backtesting_results, automl_settings\n",
|
||||
" forecasting_results_name,\n",
|
||||
" backtesting_results,\n",
|
||||
" automl_settings.as_serializable_dict(),\n",
|
||||
")\n",
|
||||
"pd.DataFrame({\"File\": os.listdir(backtesting_results)})"
|
||||
]
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -10,6 +11,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -17,6 +19,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -34,6 +37,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -42,7 +46,7 @@
|
||||
"\n",
|
||||
"AutoML highlights here include built-in holiday featurization, accessing engineered feature names, and working with the `forecast` function. Please also look at the additional forecasting notebooks, which document lagging, rolling windows, forecast quantiles, other ways to use the forecast function, and forecaster deployment.\n",
|
||||
"\n",
|
||||
"Make sure you have executed the [configuration notebook](../../../configuration.ipynb) before running this notebook.\n",
|
||||
"Make sure you have executed the [configuration notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) before running this notebook.\n",
|
||||
"\n",
|
||||
"Notebook synopsis:\n",
|
||||
"1. Creating an Experiment in an existing Workspace\n",
|
||||
@@ -52,6 +56,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -61,7 +66,11 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1680248038565
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
@@ -77,6 +86,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -93,6 +103,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -126,6 +137,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -165,30 +177,12 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Data\n",
|
||||
"\n",
|
||||
"The [Machine Learning service workspace](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-workspace) is paired with the storage account, which contains the default data store. We will use it to upload the bike share data and create [tabular dataset](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.tabulardataset?view=azure-ml-py) for training. A tabular dataset defines a series of lazily-evaluated, immutable operations to load data from the data source into tabular representation."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"datastore = ws.get_default_datastore()\n",
|
||||
"datastore.upload_files(\n",
|
||||
" files=[\"./bike-no.csv\"], target_path=\"dataset/\", overwrite=True, show_progress=True\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Let's set up what we know about the dataset. \n",
|
||||
"\n",
|
||||
"**Target column** is what we want to forecast.\n",
|
||||
@@ -207,24 +201,51 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"dataset = Dataset.Tabular.from_delimited_files(\n",
|
||||
" path=[(datastore, \"dataset/bike-no.csv\")]\n",
|
||||
").with_timestamp_columns(fine_grain_timestamp=time_column_name)\n",
|
||||
"\n",
|
||||
"# Drop the columns 'casual' and 'registered' as these columns are a breakdown of the total and therefore a leak.\n",
|
||||
"dataset = dataset.drop_columns(columns=[\"casual\", \"registered\"])\n",
|
||||
"\n",
|
||||
"dataset.take(5).to_pandas_dataframe().reset_index(drop=True)"
|
||||
"You are now ready to load the historical bike share data. We will load the CSV file into a plain pandas DataFrame."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"jupyter": {
|
||||
"outputs_hidden": false,
|
||||
"source_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"all_data = pd.read_csv(\"bike-no.csv\", parse_dates=[time_column_name])\n",
|
||||
"\n",
|
||||
"# Drop the columns 'casual' and 'registered' as these columns are a breakdown of the total and therefore a leak.\n",
|
||||
"all_data.drop([\"casual\", \"registered\"], axis=1, inplace=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"### Split the data\n",
|
||||
"\n",
|
||||
@@ -234,25 +255,68 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1680247376789
|
||||
},
|
||||
"jupyter": {
|
||||
"outputs_hidden": false,
|
||||
"source_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# select data that occurs before a specified date\n",
|
||||
"train = dataset.time_before(datetime(2012, 8, 31), include_boundary=True)\n",
|
||||
"train.to_pandas_dataframe().tail(5).reset_index(drop=True)"
|
||||
"train = all_data[all_data[time_column_name] <= pd.Timestamp(\"2012-08-31\")].copy()\n",
|
||||
"test = all_data[all_data[time_column_name] >= pd.Timestamp(\"2012-09-01\")].copy()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Upload data to datastore\n",
|
||||
"\n",
|
||||
"The [Machine Learning service workspace](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-workspace) is paired with the storage account, which contains the default data store. We will use it to upload the bike share data and create [tabular dataset](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.tabulardataset?view=azure-ml-py) for training. A tabular dataset defines a series of lazily-evaluated, immutable operations to load data from the data source into tabular representation."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"jupyter": {
|
||||
"outputs_hidden": false,
|
||||
"source_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test = dataset.time_after(datetime(2012, 9, 1), include_boundary=True)\n",
|
||||
"test.to_pandas_dataframe().head(5).reset_index(drop=True)"
|
||||
"from azureml.data.dataset_factory import TabularDatasetFactory\n",
|
||||
"\n",
|
||||
"datastore = ws.get_default_datastore()\n",
|
||||
"\n",
|
||||
"train_dataset = TabularDatasetFactory.register_pandas_dataframe(\n",
|
||||
" train, target=(datastore, \"dataset/\"), name=\"bike_no_train\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"test_dataset = TabularDatasetFactory.register_pandas_dataframe(\n",
|
||||
" test, target=(datastore, \"dataset/\"), name=\"bike_no_test\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -270,6 +334,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -294,6 +359,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -312,6 +378,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -331,6 +398,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -360,7 +428,7 @@
|
||||
" featurization=featurization_config,\n",
|
||||
" blocked_models=[\"ExtremeRandomTrees\"],\n",
|
||||
" experiment_timeout_hours=0.3,\n",
|
||||
" training_data=train,\n",
|
||||
" training_data=train_dataset,\n",
|
||||
" label_column_name=target_column_name,\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" enable_early_stopping=True,\n",
|
||||
@@ -373,6 +441,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -398,6 +467,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -416,6 +486,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -441,6 +512,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -484,6 +556,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -491,6 +564,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -509,6 +583,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -531,6 +606,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -546,7 +622,7 @@
|
||||
"from run_forecast import run_rolling_forecast\n",
|
||||
"\n",
|
||||
"remote_run = run_rolling_forecast(\n",
|
||||
" test_experiment, compute_target, best_run, test, target_column_name\n",
|
||||
" test_experiment, compute_target, best_run, test_dataset, target_column_name\n",
|
||||
")\n",
|
||||
"remote_run"
|
||||
]
|
||||
@@ -561,6 +637,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -579,6 +656,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -595,6 +673,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -626,6 +705,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -635,6 +715,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -666,6 +747,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -722,6 +804,9 @@
|
||||
],
|
||||
"friendly_name": "Forecasting BikeShare Demand",
|
||||
"index_order": 1,
|
||||
"kernel_info": {
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
@@ -737,11 +822,19 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.13"
|
||||
"version": "3.10.9"
|
||||
},
|
||||
"microsoft": {
|
||||
"ms_spell_check": {
|
||||
"ms_spell_check_language": "en"
|
||||
}
|
||||
},
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"npconvert_exporter": "python",
|
||||
"nteract": {
|
||||
"version": "nteract-front-end@1.0.0"
|
||||
},
|
||||
"pygments_lexer": "ipython3",
|
||||
"tags": [
|
||||
"Forecasting"
|
||||
|
||||
@@ -2,23 +2,22 @@
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Automated Machine Learning\n",
|
||||
"_**Forecasting using the Energy Demand Dataset**_\n",
|
||||
@@ -33,17 +32,17 @@
|
||||
"Advanced Forecasting\n",
|
||||
"1. [Advanced Training](#advanced_training)\n",
|
||||
"1. [Advanced Results](#advanced_results)"
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Introduction<a id=\"introduction\"></a>\n",
|
||||
"\n",
|
||||
"In this example we use the associated New York City energy demand dataset to showcase how you can use AutoML for a simple forecasting problem and explore the results. The goal is predict the energy demand for the next 48 hours based on historic time-series data.\n",
|
||||
"\n",
|
||||
"If you are using an Azure Machine Learning Compute Instance, you are all set. Otherwise, go through the [configuration notebook](../../../configuration.ipynb) first, if you haven't already, to establish your connection to the AzureML Workspace.\n",
|
||||
"If you are using an Azure Machine Learning Compute Instance, you are all set. Otherwise, go through the [configuration notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) first, if you haven't already, to establish your connection to the AzureML Workspace.\n",
|
||||
"\n",
|
||||
"In this notebook you will learn how to:\n",
|
||||
"1. Creating an Experiment using an existing Workspace\n",
|
||||
@@ -53,20 +52,18 @@
|
||||
"1. Generate the forecast and compute the out-of-sample accuracy metrics\n",
|
||||
"1. Configuration and remote run of AutoML for a time-series model with lag and rolling window features\n",
|
||||
"1. Run and explore the forecast with lagging features"
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Setup<a id=\"setup\"></a>"
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"import logging\n",
|
||||
@@ -85,36 +82,36 @@
|
||||
"from azureml.core import Experiment, Workspace, Dataset\n",
|
||||
"from azureml.train.automl import AutoMLConfig\n",
|
||||
"from datetime import datetime"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This notebook is compatible with Azure ML SDK version 1.35.0 or later."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"As part of the setup you have already created an Azure ML `Workspace` object. For Automated ML you will need to create an `Experiment` object, which is a named object in a `Workspace` used to run experiments."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ws = Workspace.from_config()\n",
|
||||
"\n",
|
||||
@@ -136,11 +133,13 @@
|
||||
"pd.set_option(\"display.max_colwidth\", None)\n",
|
||||
"outputDf = pd.DataFrame(data=output, index=[\"\"])\n",
|
||||
"outputDf.T"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create or Attach existing AmlCompute\n",
|
||||
"A compute target is required to execute a remote Automated ML run. \n",
|
||||
@@ -150,13 +149,11 @@
|
||||
"#### Creation of AmlCompute takes approximately 5 minutes. \n",
|
||||
"If the AmlCompute with that name is already in your workspace this code will skip the creation process.\n",
|
||||
"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
|
||||
"from azureml.core.compute_target import ComputeTargetException\n",
|
||||
@@ -175,22 +172,24 @@
|
||||
" compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)\n",
|
||||
"\n",
|
||||
"compute_target.wait_for_completion(show_output=True)"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Data<a id=\"data\"></a>\n",
|
||||
"\n",
|
||||
"We will use energy consumption [data from New York City](http://mis.nyiso.com/public/P-58Blist.htm) for model training. The data is stored in a tabular format and includes energy demand and basic weather data at an hourly frequency. \n",
|
||||
"\n",
|
||||
"With Azure Machine Learning datasets you can keep a single copy of data in your storage, easily access data during model training, share data and collaborate with other users. Below, we will upload the datatset and create a [tabular dataset](https://docs.microsoft.com/bs-latn-ba/azure/machine-learning/service/how-to-create-register-datasets#dataset-types) to be used training and prediction."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Let's set up what we know about the dataset.\n",
|
||||
"\n",
|
||||
@@ -198,86 +197,122 @@
|
||||
"<b>Time column</b> is the time axis along which to predict.\n",
|
||||
"\n",
|
||||
"The other columns, \"temp\" and \"precip\", are implicitly designated as features."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"target_column_name = \"demand\"\n",
|
||||
"time_column_name = \"timeStamp\""
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dataset = Dataset.Tabular.from_delimited_files(\n",
|
||||
" path=\"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/nyc_energy.csv\"\n",
|
||||
").with_timestamp_columns(fine_grain_timestamp=time_column_name)\n",
|
||||
"dataset.take(5).to_pandas_dataframe().reset_index(drop=True)"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The NYC Energy dataset is missing energy demand values for all datetimes later than August 10th, 2017 5AM. Below, we trim the rows containing these missing values from the end of the dataset."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Cut off the end of the dataset due to large number of nan values\n",
|
||||
"dataset = dataset.time_before(datetime(2017, 10, 10, 5))"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Split the data into train and test sets"
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The first split we make is into train and test sets. Note that we are splitting on time. Data before and including August 8th, 2017 5AM will be used for training, and data after will be used for testing."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# split into train based on time\n",
|
||||
"train = dataset.time_before(datetime(2017, 8, 8, 5), include_boundary=True)\n",
|
||||
"train.to_pandas_dataframe().reset_index(drop=True).sort_values(time_column_name).tail(5)"
|
||||
]
|
||||
"train = (\n",
|
||||
" dataset.time_before(datetime(2017, 8, 8, 5), include_boundary=True)\n",
|
||||
" .to_pandas_dataframe()\n",
|
||||
" .reset_index(drop=True)\n",
|
||||
")\n",
|
||||
"train.sort_values(time_column_name).tail(5)"
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# split into test based on time\n",
|
||||
"test = dataset.time_between(datetime(2017, 8, 8, 6), datetime(2017, 8, 10, 5))\n",
|
||||
"test.to_pandas_dataframe().reset_index(drop=True).head(5)"
|
||||
]
|
||||
"test = (\n",
|
||||
" dataset.time_between(datetime(2017, 8, 8, 6), datetime(2017, 8, 10, 5))\n",
|
||||
" .to_pandas_dataframe()\n",
|
||||
" .reset_index(drop=True)\n",
|
||||
")\n",
|
||||
"test.head(5)"
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# register the splitted train and test data in workspace storage\n",
|
||||
"from azureml.data.dataset_factory import TabularDatasetFactory\n",
|
||||
"\n",
|
||||
"datastore = ws.get_default_datastore()\n",
|
||||
"train_dataset = TabularDatasetFactory.register_pandas_dataframe(\n",
|
||||
" train, target=(datastore, \"dataset/\"), name=\"nyc_energy_train\"\n",
|
||||
")\n",
|
||||
"test_dataset = TabularDatasetFactory.register_pandas_dataframe(\n",
|
||||
" test, target=(datastore, \"dataset/\"), name=\"nyc_energy_test\"\n",
|
||||
")"
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"jupyter": {
|
||||
"source_hidden": false,
|
||||
"outputs_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Setting the maximum forecast horizon\n",
|
||||
"\n",
|
||||
@@ -286,20 +321,20 @@
|
||||
"Learn more about forecast horizons in our [Auto-train a time-series forecast model](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-auto-train-forecast#configure-and-run-experiment) guide.\n",
|
||||
"\n",
|
||||
"In this example, we set the horizon to 48 hours."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"forecast_horizon = 48"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Forecasting Parameters\n",
|
||||
"To define forecasting parameters for your experiment training, you can leverage the ForecastingParameters class. The table below details the forecasting parameter we will be passing into our experiment.\n",
|
||||
@@ -310,11 +345,11 @@
|
||||
"|**forecast_horizon**|The forecast horizon is how many periods forward you would like to forecast. This integer horizon is in units of the timeseries frequency (e.g. daily, weekly).|\n",
|
||||
"|**freq**|Forecast frequency. This optional parameter represents the period with which the forecast is desired, for example, daily, weekly, yearly, etc. Use this parameter for the correction of time series containing irregular data points or for padding of short time series. The frequency needs to be a pandas offset alias. Please refer to [pandas documentation](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects) for more information.\n",
|
||||
"|**cv_step_size**|Number of periods between two consecutive cross-validation folds. The default value is \"auto\", in which case AutoMl determines the cross-validation step size automatically, if a validation set is not provided. Or users could specify an integer value."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
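The snippet below is a minimal sketch of how the `ForecastingParameters` object described in the table above can be constructed. Variable names follow the ones defined earlier in this notebook; the hourly frequency (`freq="H"`) is an assumption based on the energy dataset, not a value taken from this notebook's code.

```python
# Hedged sketch: assemble the forecasting parameters described in the table above.
from azureml.automl.core.forecasting_parameters import ForecastingParameters

forecasting_parameters = ForecastingParameters(
    time_column_name=time_column_name,  # time column defined earlier in the notebook
    forecast_horizon=forecast_horizon,  # 48 hourly periods, as set above
    freq="H",                           # pandas offset alias for hourly data (assumed)
    cv_step_size="auto",                # let AutoML choose the cross-validation step size
)
```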
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Train<a id=\"train\"></a>\n",
|
||||
"\n",
|
||||
@@ -332,20 +367,18 @@
|
||||
"|**n_cross_validations**|Number of cross-validation folds to use for model/pipeline selection. The default value is \"auto\", in which case AutoMl determines the number of cross-validations automatically, if a validation set is not provided. Or users could specify an integer value.\n",
|
||||
"|**enable_early_stopping**|Flag to enble early termination if the score is not improving in the short term.|\n",
|
||||
"|**forecasting_parameters**|A class holds all the forecasting related parameters.|\n"
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This notebook uses the blocked_models parameter to exclude some models that take a longer time to train on this dataset. You can choose to remove models from the blocked_models list but you may need to increase the experiment_timeout_hours parameter value to get results."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.automl.core.forecasting_parameters import ForecastingParameters\n",
|
||||
"\n",
|
||||
@@ -361,7 +394,7 @@
|
||||
" primary_metric=\"normalized_root_mean_squared_error\",\n",
|
||||
" blocked_models=[\"ExtremeRandomTrees\", \"AutoArima\", \"Prophet\"],\n",
|
||||
" experiment_timeout_hours=0.3,\n",
|
||||
" training_data=train,\n",
|
||||
" training_data=train_dataset,\n",
|
||||
" label_column_name=target_column_name,\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" enable_early_stopping=True,\n",
|
||||
@@ -369,65 +402,65 @@
|
||||
" verbosity=logging.INFO,\n",
|
||||
" forecasting_parameters=forecasting_parameters,\n",
|
||||
")"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Call the `submit` method on the experiment object and pass the run configuration. Depending on the data and the number of iterations this can run for a while.\n",
|
||||
"One may specify `show_output = True` to print currently running iterations to the console."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"remote_run = experiment.submit(automl_config, show_output=False)"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"remote_run.wait_for_completion()"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Retrieve the Best Run details\n",
|
||||
"Below we retrieve the best Run object from among all the runs in the experiment."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"best_run = remote_run.get_best_child()\n",
|
||||
"best_run"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Featurization\n",
|
||||
"We can look at the engineered feature names generated in time-series featurization via. the JSON file named 'engineered_feature_names.json' under the run outputs."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Download the JSON file locally\n",
|
||||
"best_run.download_file(\n",
|
||||
@@ -437,11 +470,13 @@
|
||||
" records = json.load(f)\n",
|
||||
"\n",
|
||||
"records"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### View featurization summary\n",
|
||||
"You can also see what featurization steps were performed on different raw features in the user data. For each raw feature in the user data, the following information is displayed:\n",
|
||||
@@ -451,13 +486,11 @@
|
||||
"+ Type detected\n",
|
||||
"+ If feature was dropped\n",
|
||||
"+ List of feature transformations for the raw feature"
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Download the featurization summary JSON file locally\n",
|
||||
"best_run.download_file(\n",
|
||||
@@ -479,41 +512,41 @@
|
||||
" \"Transformations\",\n",
|
||||
" ]\n",
|
||||
"]"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Forecasting<a id=\"forecast\"></a>\n",
|
||||
"\n",
|
||||
"Now that we have retrieved the best pipeline/model, it can be used to make predictions on test data. We will do batch scoring on the test dataset which should have the same schema as training dataset.\n",
|
||||
"\n",
|
||||
"The inference will run on a remote compute. In this example, it will re-use the training compute."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test_experiment = Experiment(ws, experiment_name + \"_inference\")"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Retrieving forecasts from the model\n",
|
||||
"We have created a function called `run_forecast` that submits the test data to the best model determined during the training run and retrieves forecasts. This function uses a helper script `forecasting_script` which is uploaded and expecuted on the remote compute."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from run_forecast import run_remote_inference\n",
|
||||
"\n",
|
||||
@@ -521,39 +554,39 @@
|
||||
" test_experiment=test_experiment,\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" train_run=best_run,\n",
|
||||
" test_dataset=test,\n",
|
||||
" test_dataset=test_dataset,\n",
|
||||
" target_column_name=target_column_name,\n",
|
||||
")\n",
|
||||
"remote_run_infer.wait_for_completion(show_output=False)\n",
|
||||
"\n",
|
||||
"# download the inference output file to the local machine\n",
|
||||
"remote_run_infer.download_file(\"outputs/predictions.csv\", \"predictions.csv\")"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Evaluate\n",
|
||||
"To evaluate the accuracy of the forecast, we'll compare against the actual sales quantities for some select metrics, included the mean absolute percentage error (MAPE). For more metrics that can be used for evaluation after training, please see [supported metrics](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-understand-automated-ml#regressionforecasting-metrics), and [how to calculate residuals](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-understand-automated-ml#residuals)."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
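A simplified MAPE calculation is sketched below. It assumes the downloaded `predictions.csv` contains the actual target column and a `predicted` column (the column name is an assumption); the SDK's scoring utilities used in the following cells compute a fuller set of forecasting metrics.

```python
# Hedged sketch: compute MAPE directly from the downloaded predictions file.
import numpy as np
import pandas as pd

fcst_df = pd.read_csv("predictions.csv", parse_dates=[time_column_name])
actual = fcst_df[target_column_name].to_numpy()  # ground-truth values (assumed present)
predicted = fcst_df["predicted"].to_numpy()      # forecast column name is an assumption

# MAPE assumes the actuals are non-zero, which holds for energy demand data.
mape = np.mean(np.abs((actual - predicted) / actual)) * 100
print(f"MAPE: {mape:.2f}%")
```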
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# load forecast data frame\n",
|
||||
"fcst_df = pd.read_csv(\"predictions.csv\", parse_dates=[time_column_name])\n",
|
||||
"fcst_df.head()"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.automl.core.shared import constants\n",
|
||||
"from azureml.automl.runtime.shared.score import scoring\n",
|
||||
@@ -580,31 +613,31 @@
|
||||
" (test_pred, test_test), (\"prediction\", \"truth\"), loc=\"upper left\", fontsize=8\n",
|
||||
")\n",
|
||||
"plt.show()"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Advanced Training <a id=\"advanced_training\"></a>\n",
|
||||
"We did not use lags in the previous model specification. In effect, the prediction was the result of a simple regression on date, time series identifier columns and any additional features. This is often a very good prediction as common time series patterns like seasonality and trends can be captured in this manner. Such simple regression is horizon-less: it doesn't matter how far into the future we are predicting, because we are not using past data. In the previous example, the horizon was only used to split the data for cross-validation."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Using lags and rolling window features\n",
|
||||
"Now we will configure the target lags, that is the previous values of the target variables, meaning the prediction is no longer horizon-less. We therefore must still specify the `forecast_horizon` that the model will learn to forecast. The `target_lags` keyword specifies how far back we will construct the lags of the target variable, and the `target_rolling_window_size` specifies the size of the rolling window over which we will generate the `max`, `min` and `sum` features.\n",
|
||||
"\n",
|
||||
"This notebook uses the blocked_models parameter to exclude some models that take a longer time to train on this dataset. You can choose to remove models from the blocked_models list but you may need to increase the iteration_timeout_minutes parameter value to get results."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
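A hedged sketch of the advanced configuration is shown below. The keyword names come from the description above; the lag list and window size are illustrative values rather than this notebook's exact settings.

```python
# Hedged sketch: forecasting parameters with target lags and a rolling window.
from azureml.automl.core.forecasting_parameters import ForecastingParameters

advanced_forecasting_parameters = ForecastingParameters(
    time_column_name=time_column_name,
    forecast_horizon=forecast_horizon,
    target_lags=[12],              # how far back to build lags of the target (illustrative)
    target_rolling_window_size=4,  # window for rolling max/min/sum features (illustrative)
    cv_step_size="auto",
)
```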
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"advanced_forecasting_parameters = ForecastingParameters(\n",
|
||||
" time_column_name=time_column_name,\n",
|
||||
@@ -627,7 +660,7 @@
|
||||
" \"Prophet\",\n",
|
||||
" ], # These models are blocked for tutorial purposes, remove this for real use cases.\n",
|
||||
" experiment_timeout_hours=0.3,\n",
|
||||
" training_data=train,\n",
|
||||
" training_data=train_dataset,\n",
|
||||
" label_column_name=target_column_name,\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" enable_early_stopping=True,\n",
|
||||
@@ -635,70 +668,70 @@
|
||||
" verbosity=logging.INFO,\n",
|
||||
" forecasting_parameters=advanced_forecasting_parameters,\n",
|
||||
")"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We now start a new remote run, this time with lag and rolling window featurization. AutoML applies featurizations in the setup stage, prior to iterating over ML models. The full training set is featurized first, followed by featurization of each of the CV splits. Lag and rolling window features introduce additional complexity, so the run will take longer than in the previous example that lacked these featurizations."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"advanced_remote_run = experiment.submit(automl_config, show_output=False)"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"advanced_remote_run.wait_for_completion()"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Retrieve the Best Run details"
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"best_run_lags = remote_run.get_best_child()\n",
|
||||
"best_run_lags"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Advanced Results<a id=\"advanced_results\"></a>\n",
|
||||
"We did not use lags in the previous model specification. In effect, the prediction was the result of a simple regression on date, time series identifier columns and any additional features. This is often a very good prediction as common time series patterns like seasonality and trends can be captured in this manner. Such simple regression is horizon-less: it doesn't matter how far into the future we are predicting, because we are not using past data. In the previous example, the horizon was only used to split the data for cross-validation."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test_experiment_advanced = Experiment(ws, experiment_name + \"_inference_advanced\")\n",
|
||||
"advanced_remote_run_infer = run_remote_inference(\n",
|
||||
" test_experiment=test_experiment_advanced,\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" train_run=best_run_lags,\n",
|
||||
" test_dataset=test,\n",
|
||||
" test_dataset=test_dataset,\n",
|
||||
" target_column_name=target_column_name,\n",
|
||||
" inference_folder=\"./forecast_advanced\",\n",
|
||||
")\n",
|
||||
@@ -708,23 +741,23 @@
|
||||
"advanced_remote_run_infer.download_file(\n",
|
||||
" \"outputs/predictions.csv\", \"predictions_advanced.csv\"\n",
|
||||
")"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"fcst_adv_df = pd.read_csv(\"predictions_advanced.csv\", parse_dates=[time_column_name])\n",
|
||||
"fcst_adv_df.head()"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.automl.core.shared import constants\n",
|
||||
"from azureml.automl.runtime.shared.score import scoring\n",
|
||||
@@ -753,7 +786,10 @@
|
||||
" (test_pred, test_test), (\"prediction\", \"truth\"), loc=\"upper left\", fontsize=8\n",
|
||||
")\n",
|
||||
"plt.show()"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
@@ -767,26 +803,37 @@
|
||||
"automated-machine-learning"
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"name": "python38-azureml",
|
||||
"language": "python",
|
||||
"name": "python38-azureml"
|
||||
"display_name": "Python 3.8 - AzureML"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python",
|
||||
"version": "3.8.5",
|
||||
"mimetype": "text/x-python",
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.5"
|
||||
"nbconvert_exporter": "python",
|
||||
"file_extension": ".py"
|
||||
},
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
"hash": "6bd77c88278e012ef31757c15997a7bea8c943977c43d6909403c00ae11d43ca"
|
||||
}
|
||||
},
|
||||
"microsoft": {
|
||||
"ms_spell_check": {
|
||||
"ms_spell_check_language": "en"
|
||||
}
|
||||
},
|
||||
"kernel_info": {
|
||||
"name": "python3"
|
||||
},
|
||||
"nteract": {
|
||||
"version": "nteract-front-end@1.0.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -52,7 +52,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Please make sure you have followed the `configuration.ipynb` notebook so that your ML workspace information is saved in the config file."
|
||||
"Please make sure you have followed the [configuration notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) so that your ML workspace information is saved in the config file."
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -52,7 +52,7 @@
|
||||
"\n",
|
||||
"AutoML highlights here include using Deep Learning forecasts, Arima, Prophet, Remote Execution and Remote Inferencing, and working with the `forecast` function. Please also look at the additional forecasting notebooks, which document lagging, rolling windows, forecast quantiles, other ways to use the forecast function, and forecaster deployment.\n",
|
||||
"\n",
|
||||
"Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n",
|
||||
"Make sure you have executed the [configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) before running this notebook.\n",
|
||||
"\n",
|
||||
"Notebook synopsis:\n",
|
||||
"\n",
|
||||
|
||||
@@ -40,7 +40,7 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Prerequisites\n",
|
||||
"You'll need to create a compute Instance by following the instructions in the [EnvironmentSetup.md](../Setup_Resources/EnvironmentSetup.md)."
|
||||
"You'll need to create a compute Instance by following [these](https://learn.microsoft.com/en-us/azure/machine-learning/v1/how-to-create-manage-compute-instance?tabs=python) instructions."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -259,6 +259,7 @@
|
||||
"| **forecast_horizon** | The forecast horizon is how many periods forward you would like to forecast. This integer horizon is in units of the timeseries frequency (e.g. daily, weekly). Periods are inferred from your data. |\n",
|
||||
"| **time_column_name** | The name of your time column. |\n",
|
||||
"| **time_series_id_column_names** | The column names used to uniquely identify timeseries in data that has multiple rows with the same timestamp. |\n",
|
||||
"| **cv_step_size** | Number of periods between two consecutive cross-validation folds. The default value is \\\"auto\\\", in which case AutoMl determines the cross-validation step size automatically, if a validation set is not provided. Or users could specify an integer value. |\n",
|
||||
"\n",
|
||||
"#### ``AutoMLConfig`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
@@ -268,11 +269,10 @@
|
||||
"| **blocked_models** | Blocked models won't be used by AutoML. |\n",
|
||||
"| **iteration_timeout_minutes** | Maximum amount of time in minutes that the model can train. This is optional but provides customers with greater control on exit criteria. |\n",
|
||||
"| **iterations** | Number of models to train. This is optional but provides customers with greater control on exit criteria. |\n",
|
||||
"| **experiment_timeout_hours** | Maximum amount of time in hours that the experiment can take before it terminates. This is optional but provides customers with greater control on exit criteria. |\n",
|
||||
"| **experiment_timeout_hours** | Maximum amount of time in hours that each experiment can take before it terminates. This is optional but provides customers with greater control on exit criteria. **It does not control the overall timeout for the pipeline run, instead controls the timeout for each training run per partitioned time series.** |\n",
|
||||
"| **label_column_name** | The name of the label column. |\n",
|
||||
"| **n_cross_validations** | Number of cross-validation folds to use for model/pipeline selection. The default value is \\\"auto\\\", in which case AutoMl determines the number of cross-validations automatically, if a validation set is not provided. Or users could specify an integer value. |\n",
|
||||
"| **cv_step_size** | Number of periods between two consecutive cross-validation folds. The default value is \\\"auto\\\", in which case AutoMl determines the cross-validation step size automatically, if a validation set is not provided. Or users could specify an integer value. |\n",
|
||||
"| **enable_early_stopping** | Flag to enable early termination if the score is not improving in the short term. |\n",
|
||||
"| **n_cross_validations** | Number of cross validation splits. The default value is \\\"auto\\\", in which case AutoMl determines the number of cross-validations automatically, if a validation set is not provided. Or users could specify an integer value. Rolling Origin Validation is used to split time-series in a temporally consistent way. |\n",
|
||||
"| **enable_early_stopping** | Flag to enable early termination if the primary metric is no longer improving. |\n",
|
||||
"| **enable_engineered_explanations** | Engineered feature explanations will be downloaded if enable_engineered_explanations flag is set to True. By default it is set to False to save storage space. |\n",
|
||||
"| **track_child_runs** | Flag to disable tracking of child runs. Only best run is tracked if the flag is set to False (this includes the model and metrics of the run). |\n",
|
||||
"| **pipeline_fetch_max_batch_size** | Determines how many pipelines (training algorithms) to fetch at a time for training, this helps reduce throttling when training at large scale. |\n",
|
||||
@@ -281,7 +281,7 @@
|
||||
"#### ``HTSTrainParameters`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **automl_settings** | ``AutoMLConfig`` object.\n",
|
||||
"| **automl_settings** | The ``AutoMLConfig`` object defined above. |\n",
|
||||
"| **hierarchy_column_names** | The names of columns that define the hierarchical structure of the data from highest level to most granular. |\n",
|
||||
"| **training_level** | The level of the hierarchy to be used for training models. |\n",
|
||||
"| **enable_engineered_explanations** | The switch controls engineered explanations. |"
|
||||
@@ -354,16 +354,25 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Parallel run step is leveraged to train the hierarchy. To configure the ParallelRunConfig you will need to determine the appropriate number of workers and nodes for your use case. The `process_count_per_node` is based off the number of cores of the compute VM. The node_count will determine the number of master nodes to use, increasing the node count will speed up the training process.\n",
|
||||
"Parallel run step is leveraged to train multiple models at once. To configure the ParallelRunConfig you will need to determine the appropriate number of workers and nodes for your use case. The ``process_count_per_node`` is based off the number of cores of the compute VM. The node_count will determine the number of master nodes to use, increasing the node count will speed up the training process.\n",
|
||||
"\n",
|
||||
"* **experiment:** The experiment used for training.\n",
|
||||
"* **train_data:** The tabular dataset to be used as input to the training run.\n",
|
||||
"* **node_count:** The number of compute nodes to be used for running the user script. We recommend to start with 3 and increase the node_count if the training time is taking too long.\n",
|
||||
"* **process_count_per_node:** Process count per node, we recommend 2:1 ratio for number of cores: number of processes per node. eg. If node has 16 cores then configure 8 or less process count per node or optimal performance.\n",
|
||||
"* **train_pipeline_parameters:** The set of configuration parameters defined in the previous section. \n",
|
||||
"* **run_invocation_timeout:** Maximum amount of time in seconds that the ``ParallelRunStep`` class is allowed. This is optional but provides customers with greater control on exit criteria. This must be greater than ``experiment_timeout_hours`` by at least 300 seconds.\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **experiment** | The experiment used for training. |\n",
|
||||
"| **train_data** | The file dataset to be used as input to the training run. |\n",
|
||||
"| **node_count** | The number of compute nodes to be used for running the user script. We recommend to start with 3 and increase the node_count if the training time is taking too long. |\n",
|
||||
"| **process_count_per_node** | Process count per node, we recommend 2:1 ratio for number of cores: number of processes per node. eg. If node has 16 cores then configure 8 or less process count per node for optimal performance. |\n",
|
||||
"| **train_pipeline_parameters** | The set of configuration parameters defined in the previous section. |\n",
|
||||
"| **run_invocation_timeout** | Maximum amount of time in seconds that the ``ParallelRunStep`` class is allowed. This is optional but provides customers with greater control on exit criteria. This must be greater than ``experiment_timeout_hours`` by at least 300 seconds. |\n",
|
||||
"\n",
|
||||
"Calling this method will create a new aggregated dataset which is generated dynamically on pipeline execution."
|
||||
"Calling this method will create a new aggregated dataset which is generated dynamically on pipeline execution.\n",
|
||||
"\n",
|
||||
"**Note**: Total time taken for the **training step** in the pipeline to complete = $ \\frac{t}{ p \\times n } \\times ts $\n",
|
||||
"where,\n",
|
||||
"- $ t $ is time taken for training one partition (can be viewed in the training logs)\n",
|
||||
"- $ p $ is ``process_count_per_node``\n",
|
||||
"- $ n $ is ``node_count``\n",
|
||||
"- $ ts $ is total number of partitions in time series based on ``partition_column_names``"
|
||||
]
|
||||
},
|
|
||||
@@ -527,19 +536,24 @@
|
||||
"source": [
|
||||
"## 5.0 Forecasting\n",
|
||||
"For hierarchical forecasting we need to provide the HTSInferenceParameters object.\n",
|
||||
"#### HTSInferenceParameters arguments\n",
|
||||
"* **hierarchy_forecast_level:** The default level of the hierarchy to produce prediction/forecast on.\n",
|
||||
"* **allocation_method:** \\[Optional] The disaggregation method to use if the hierarchy forecast level specified is below the define hierarchy training level. <br><i>(average historical proportions) 'average_historical_proportions'</i><br><i>(proportions of the historical averages) 'proportions_of_historical_average'</i>\n",
|
||||
"#### ``HTSInferenceParameters`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **hierarchy_forecast_level:** | The default level of the hierarchy to produce prediction/forecast on. |\n",
|
||||
"| **allocation_method:** | \\[Optional] The disaggregation method to use if the hierarchy forecast level specified is below the define hierarchy training level. <br><i>(average historical proportions) 'average_historical_proportions'</i><br><i>(proportions of the historical averages) 'proportions_of_historical_average'</i> |\n",
|
||||
"\n",
|
||||
"#### get_many_models_batch_inference_steps arguments\n",
|
||||
"* **experiment:** The experiment used for inference run.\n",
|
||||
"* **inference_data:** The data to use for inferencing. It should be the same schema as used for training.\n",
|
||||
"* **compute_target:** The compute target that runs the inference pipeline.\n",
|
||||
"* **node_count:** The number of compute nodes to be used for running the user script. We recommend to start with the number of cores per node (varies by compute sku).\n",
|
||||
"* **process_count_per_node:** The number of processes per node.\n",
|
||||
"* **train_run_id:** \\[Optional] The run id of the hierarchy training, by default it is the latest successful training hts run in the experiment.\n",
|
||||
"* **train_experiment_name:** \\[Optional] The train experiment that contains the train pipeline. This one is only needed when the train pipeline is not in the same experiement as the inference pipeline.\n",
|
||||
"* **process_count_per_node:** \\[Optional] The number of processes per node, by default it's 4."
|
||||
"#### ``get_many_models_batch_inference_steps`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **experiment** | The experiment used for inference run. |\n",
|
||||
"| **inference_data** | The data to use for inferencing. It should be the same schema as used for training.\n",
|
||||
"| **compute_target** | The compute target that runs the inference pipeline. |\n",
|
||||
"| **node_count** | The number of compute nodes to be used for running the user script. We recommend to start with the number of cores per node (varies by compute sku). |\n",
|
||||
"| **process_count_per_node** | \\[Optional] The number of processes per node. By default it's 2 (should be at most half of the number of cores in a single node of the compute cluster that will be used for the experiment).\n",
|
||||
"| **inference_pipeline_parameters** | \\[Optional] The ``HTSInferenceParameters`` object defined above. |\n",
|
||||
"| **train_run_id** | \\[Optional] The run id of the **training pipeline**. By default it is the latest successful training pipeline run in the experiment. |\n",
|
||||
"| **train_experiment_name** | \\[Optional] The train experiment that contains the train pipeline. This one is only needed when the train pipeline is not in the same experiement as the inference pipeline. |\n",
|
||||
"| **run_invocation_timeout** | \\[Optional] Maximum amount of time in seconds that the ``ParallelRunStep`` class is allowed. This is optional but provides customers with greater control on exit criteria. |"
|
||||
]
|
||||
},
|
|
||||
|
||||
@@ -40,7 +40,7 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Prerequisites\n",
|
||||
"You'll need to create a compute Instance by following the instructions in the [EnvironmentSetup.md](../Setup_Resources/EnvironmentSetup.md)."
|
||||
"You'll need to create a compute Instance by following [these](https://learn.microsoft.com/en-us/azure/machine-learning/v1/how-to-create-manage-compute-instance?tabs=python) instructions."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -379,7 +379,7 @@
|
||||
"source": [
|
||||
"### Set up training parameters\n",
|
||||
"\n",
|
||||
"We need to provide ``ForecastingParameters``, ``AutoMLConfig`` and ``ManyModelsTrainParameters`` objects. For the forecasting task we also need to define several settings including the name of the time column, the maximum forecast horizon, and the partition column name definition.\n",
|
||||
"We need to provide ``ForecastingParameters``, ``AutoMLConfig`` and ``ManyModelsTrainParameters`` objects. For the forecasting task we also need to define several settings including the name of the time column, the maximum forecast horizon, and the partition column name(s) definition.\n",
|
||||
"\n",
|
||||
"#### ``ForecastingParameters`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
@@ -387,6 +387,7 @@
|
||||
"| **forecast_horizon** | The forecast horizon is how many periods forward you would like to forecast. This integer horizon is in units of the timeseries frequency (e.g. daily, weekly). Periods are inferred from your data. |\n",
|
||||
"| **time_column_name** | The name of your time column. |\n",
|
||||
"| **time_series_id_column_names** | The column names used to uniquely identify timeseries in data that has multiple rows with the same timestamp. |\n",
|
||||
"| **cv_step_size** | Number of periods between two consecutive cross-validation folds. The default value is \\\"auto\\\", in which case AutoMl determines the cross-validation step size automatically, if a validation set is not provided. Or users could specify an integer value. |\n",
|
||||
"\n",
|
||||
"#### ``AutoMLConfig`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
@@ -396,14 +397,19 @@
|
||||
"| **blocked_models** | Blocked models won't be used by AutoML. |\n",
|
||||
"| **iteration_timeout_minutes** | Maximum amount of time in minutes that the model can train. This is optional but provides customers with greater control on exit criteria. |\n",
|
||||
"| **iterations** | Number of models to train. This is optional but provides customers with greater control on exit criteria. |\n",
|
||||
"| **experiment_timeout_hours** | Maximum amount of time in hours that the experiment can take before it terminates. This is optional but provides customers with greater control on exit criteria. |\n",
|
||||
"| **experiment_timeout_hours** | Maximum amount of time in hours that each experiment can take before it terminates. This is optional but provides customers with greater control on exit criteria. **It does not control the overall timeout for the pipeline run, instead controls the timeout for each training run per partitioned time series.** |\n",
|
||||
"| **label_column_name** | The name of the label column. |\n",
|
||||
"| **n_cross_validations** | Number of cross validation splits. The default value is \\\"auto\\\", in which case AutoMl determines the number of cross-validations automatically, if a validation set is not provided. Or users could specify an integer value. Rolling Origin Validation is used to split time-series in a temporally consistent way. |\n",
|
||||
"| **cv_step_size** |Number of periods between two consecutive cross-validation folds. The default value is \\\"auto\\\", in which case AutoMl determines the cross-validation step size automatically, if a validation set is not provided. Or users could specify an integer value. |\n",
|
||||
"| **enable_early_stopping** | Flag to enable early termination if the score is not improving in the short term. |\n",
|
||||
"| **enable_early_stopping** | Flag to enable early termination if the primary metric is no longer improving. |\n",
|
||||
"| **enable_engineered_explanations** | Engineered feature explanations will be downloaded if enable_engineered_explanations flag is set to True. By default it is set to False to save storage space. |\n",
|
||||
"| **track_child_runs** | Flag to disable tracking of child runs. Only best run is tracked if the flag is set to False (this includes the model and metrics of the run). |\n",
|
||||
"| **pipeline_fetch_max_batch_size** | Determines how many pipelines (training algorithms) to fetch at a time for training, this helps reduce throttling when training at large scale. |\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"#### ``ManyModelsTrainParameters`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **automl_settings** | The ``AutoMLConfig`` object defined above. |\n",
|
||||
"| **partition_column_names** | The names of columns used to group your models. For timeseries, the groups must not split up individual time-series. That is, each group must contain one or more whole time-series. |"
|
||||
]
|
||||
},
|
||||
@@ -427,9 +433,9 @@
|
||||
"\n",
|
||||
"forecasting_parameters = ForecastingParameters(\n",
|
||||
" time_column_name=\"WeekStarting\",\n",
|
||||
" drop_column_names=\"Revenue\",\n",
|
||||
" forecast_horizon=6,\n",
|
||||
" time_series_id_column_names=partition_column_names,\n",
|
||||
" cv_step_size=\"auto\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"automl_settings = AutoMLConfig(\n",
|
||||
@@ -440,7 +446,6 @@
|
||||
" experiment_timeout_hours=0.25,\n",
|
||||
" label_column_name=\"Quantity\",\n",
|
||||
" n_cross_validations=\"auto\", # Feel free to set to a small integer (>=2) if runtime is an issue.\n",
|
||||
" cv_step_size=\"auto\",\n",
|
||||
" track_child_runs=False,\n",
|
||||
" forecasting_parameters=forecasting_parameters,\n",
|
||||
")\n",
|
||||
@@ -463,7 +468,9 @@
|
||||
"\n",
|
||||
"Reuse of previous results (``allow_reuse``) is key when using pipelines in a collaborative environment since eliminating unnecessary reruns offers agility. Reuse is the default behavior when the ``script_name``, ``inputs``, and the parameters of a step remain the same. When reuse is allowed, results from the previous run are immediately sent to the next step. If ``allow_reuse`` is set to False, a new run will always be generated for this step during pipeline execution.\n",
|
||||
"\n",
|
||||
"> Note that we only support partitioned FileDataset and TabularDataset without partition when using such output as input."
|
||||
"> Note that we only support partitioned FileDataset and TabularDataset without partition when using such output as input.\n",
|
||||
"\n",
|
||||
"> Note that we **drop column** \"Revenue\" from the dataset in this step to avoid information leak as \"Quantity\" = \"Revenue\" / \"Price\". **Please modify the logic based on your data**."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -501,18 +508,25 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Parallel run step is leveraged to train multiple models at once. To configure the ParallelRunConfig you will need to determine the appropriate number of workers and nodes for your use case. The process_count_per_node is based off the number of cores of the compute VM. The node_count will determine the number of master nodes to use, increasing the node count will speed up the training process.\n",
|
||||
"Parallel run step is leveraged to train multiple models at once. To configure the ParallelRunConfig you will need to determine the appropriate number of workers and nodes for your use case. The ``process_count_per_node`` is based off the number of cores of the compute VM. The node_count will determine the number of master nodes to use, increasing the node count will speed up the training process.\n",
|
||||
"\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **experiment** | The experiment used for training. |\n",
|
||||
"| **train_data** | The file dataset to be used as input to the training run. |\n",
|
||||
"| **node_count** | The number of compute nodes to be used for running the user script. We recommend to start with 3 and increase the node_count if the training time is taking too long. |\n",
|
||||
"| **process_count_per_node** | Process count per node, we recommend 2:1 ratio for number of cores: number of processes per node. eg. If node has 16 cores then configure 8 or less process count per node or optimal performance. |\n",
|
||||
"| **process_count_per_node** | Process count per node, we recommend 2:1 ratio for number of cores: number of processes per node. eg. If node has 16 cores then configure 8 or less process count per node for optimal performance. |\n",
|
||||
"| **train_pipeline_parameters** | The set of configuration parameters defined in the previous section. |\n",
|
||||
"| **run_invocation_timeout** | Maximum amount of time in seconds that the ``ParallelRunStep`` class is allowed. This is optional but provides customers with greater control on exit criteria. This must be greater than ``experiment_timeout_hours`` by at least 300 seconds. |\n",
|
||||
"\n",
|
||||
"Calling this method will create a new aggregated dataset which is generated dynamically on pipeline execution."
|
||||
"Calling this method will create a new aggregated dataset which is generated dynamically on pipeline execution.\n",
|
||||
"\n",
|
||||
"**Note**: Total time taken for the **training step** in the pipeline to complete = $ \\frac{t}{ p \\times n } \\times ts $\n",
|
||||
"where,\n",
|
||||
"- $ t $ is time taken for training one partition (can be viewed in the training logs)\n",
|
||||
"- $ p $ is ``process_count_per_node``\n",
|
||||
"- $ n $ is ``node_count``\n",
|
||||
"- $ ts $ is total number of partitions in time series based on ``partition_column_names``"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -611,7 +625,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 7.2 Schedule the pipeline\n",
|
||||
"### 5.2 Schedule the pipeline\n",
|
||||
"You can also [schedule the pipeline](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-schedule-pipelines) to run on a time-based or change-based schedule. This could be used to automatically retrain models every month or based on another trigger such as data drift."
|
||||
]
|
||||
},
|
||||
@@ -667,25 +681,31 @@
|
||||
"source": [
|
||||
"For many models we need to provide the ManyModelsInferenceParameters object.\n",
|
||||
"\n",
|
||||
"#### ManyModelsInferenceParameters arguments\n",
|
||||
"#### ``ManyModelsInferenceParameters`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **partition_column_names** | List of column names that identifies groups. |\n",
|
||||
"| **partition_column_names** | List of column names that identifies groups. |\n",
|
||||
"| **target_column_name** | \\[Optional] Column name only if the inference dataset has the target. |\n",
|
||||
"| **time_column_name** | \\[Optional] Column name only if it is timeseries. |\n",
|
||||
"| **many_models_run_id** | \\[Optional] Many models run id where models were trained. |\n",
|
||||
"| **time_column_name** | \\[Optional] Time column name only if it is timeseries. |\n",
|
||||
"| **inference_type** | \\[Optional] Which inference method to use on the model. Possible values are 'forecast', 'predict_proba', and 'predict'. |\n",
|
||||
"| **forecast_mode** | \\[Optional] The type of forecast to be used, either 'rolling' or 'recursive'; defaults to 'recursive'. |\n",
|
||||
"| **step** | \\[Optional] Number of periods to advance the forecasting window in each iteration **(for rolling forecast only)**; defaults to 1. |\n",
|
||||
"\n",
|
||||
"#### get_many_models_batch_inference_steps arguments\n",
|
||||
"#### ``get_many_models_batch_inference_steps`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **experiment** | The experiment used for inference run. |\n",
|
||||
"| **inference_data** | The data to use for inferencing. It should be the same schema as used for training.\n",
|
||||
"| **compute_target** | The compute target that runs the inference pipeline. |\n",
|
||||
"| **node_count** | The number of compute nodes to be used for running the user script. We recommend to start with the number of cores per node (varies by compute sku). |\n",
|
||||
"| **process_count_per_node** | The number of processes per node (should be at most half of the number of cores of the compute cluster that will be used for the experiment).\n",
|
||||
"| **train_run_id** | \\[Optional] The run id of the hierarchy training, by default it is the latest successful training many model run in the experiment. |\n",
|
||||
"| **process_count_per_node** | \\[Optional] The number of processes per node. By default it's 2 (should be at most half of the number of cores in a single node of the compute cluster that will be used for the experiment).\n",
|
||||
"| **inference_pipeline_parameters** | \\[Optional] The ``ManyModelsInferenceParameters`` object defined above. |\n",
|
||||
"| **append_row_file_name** | \\[Optional] The name of the output file (optional, default value is 'parallel_run_step.txt'). Supports 'txt' and 'csv' file extension. A 'txt' file extension generates the output in 'txt' format with space as separator without column names. A 'csv' file extension generates the output in 'csv' format with comma as separator and with column names. |\n",
|
||||
"| **train_run_id** | \\[Optional] The run id of the **training pipeline**. By default it is the latest successful training pipeline run in the experiment. |\n",
|
||||
"| **train_experiment_name** | \\[Optional] The train experiment that contains the train pipeline. This one is only needed when the train pipeline is not in the same experiement as the inference pipeline. |\n",
|
||||
"| **process_count_per_node** | \\[Optional] The number of processes per node, by default it's 4. |"
|
||||
"| **run_invocation_timeout** | \\[Optional] Maximum amount of time in seconds that the ``ParallelRunStep`` class is allowed. This is optional but provides customers with greater control on exit criteria. |\n",
|
||||
"| **output_datastore** | \\[Optional] The ``Datastore`` or ``OutputDatasetConfig`` to be used for output. If specified any pipeline output will be written to that location. If unspecified the default datastore will be used. |\n",
|
||||
"| **arguments** | \\[Optional] Arguments to be passed to inference script. Possible argument is '--forecast_quantiles' followed by quantile values. |"
|
||||
]
|
||||
},
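Similarly, here is a hedged sketch of many-models batch inference using the arguments listed above; import paths, column names, and quantile values are assumptions for illustration only.

```python
# Hedged sketch: configure many-models batch inference (values are assumptions).
from azureml.train.automl.runtime._many_models.many_models_parameters import (
    ManyModelsInferenceParameters,
)
from azureml.contrib.automl.pipeline.steps import AutoMLPipelineBuilder

mm_inference_parameters = ManyModelsInferenceParameters(
    partition_column_names=["Store", "Brand"],  # assumed partition columns
    target_column_name="Quantity",
    time_column_name="WeekStarting",
    inference_type="forecast",
    forecast_mode="recursive",                  # or "rolling" with a step size
)

inference_steps = AutoMLPipelineBuilder.get_many_models_batch_inference_steps(
    experiment=experiment,             # assumed inference experiment object
    inference_data=inference_dataset,  # same schema as the training data (assumed name)
    compute_target=compute_target,
    node_count=2,
    process_count_per_node=2,
    inference_pipeline_parameters=mm_inference_parameters,
    append_row_file_name="forecasts.csv",
    arguments=["--forecast_quantiles", 0.1, 0.9],  # optional forecast quantiles
)
```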
{
|
||||
|
||||
@@ -11,6 +11,12 @@ def main(args):
|
||||
dataset = run_context.input_datasets["train_10_models"]
|
||||
df = dataset.to_pandas_dataframe()
|
||||
|
||||
# Drop the column "Revenue" from the dataset to avoid information leak as
|
||||
# "Quantity" = "Revenue" / "Price". Please modify the logic based on your data.
|
||||
drop_column_name = "Revenue"
|
||||
if drop_column_name in df.columns:
|
||||
df.drop(drop_column_name, axis=1, inplace=True)
|
||||
|
||||
# Apply any data pre-processing techniques here
|
||||
|
||||
df.to_parquet(output / "data_prepared_result.parquet", compression=None)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -10,6 +11,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -17,6 +19,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -34,18 +37,20 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Introduction<a id=\"introduction\"></a>\n",
|
||||
"In this example, we use AutoML to train, select, and operationalize a time-series forecasting model for multiple time-series.\n",
|
||||
"\n",
|
||||
"Make sure you have executed the [configuration notebook](../../../configuration.ipynb) before running this notebook.\n",
|
||||
"Make sure you have executed the [configuration notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) before running this notebook.\n",
|
||||
"\n",
|
||||
"The examples in the follow code samples use the University of Chicago's Dominick's Finer Foods dataset to forecast orange juice sales. Dominick's was a grocery chain in the Chicago metropolitan area."
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -70,6 +75,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -86,6 +92,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -119,6 +126,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -158,6 +166,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -181,6 +190,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -201,6 +211,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -220,6 +231,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -250,6 +262,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -275,6 +288,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -291,6 +305,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -318,6 +333,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -356,6 +372,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -373,6 +390,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -446,6 +464,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -472,6 +491,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -491,6 +511,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -528,6 +549,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -548,6 +570,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -556,6 +579,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -584,6 +608,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -639,6 +664,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -646,6 +672,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -668,6 +695,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -687,6 +715,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -715,7 +744,7 @@
|
||||
" description=\"Automl forecasting sample service\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"aci_service_name = \"automl-oj-forecast-01\"\n",
|
||||
"aci_service_name = \"automl-oj-forecast-03\"\n",
|
||||
"print(aci_service_name)\n",
|
||||
"aci_service = Model.deploy(ws, aci_service_name, [model], inference_config, aciconfig)\n",
|
||||
"aci_service.wait_for_deployment(True)\n",
|
||||
@@ -732,6 +761,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -780,6 +810,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
@@ -792,7 +823,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"serv = Webservice(ws, \"automl-oj-forecast-01\")\n",
|
||||
"serv = Webservice(ws, \"automl-oj-forecast-03\")\n",
|
||||
"serv.delete() # don't do it accidentally"
|
||||
]
|
||||
}
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
"source": [
|
||||
"## Introduction\n",
|
||||
"\n",
|
||||
"In this notebook, we demonstrate how to use piplines to train and inference on AutoML Forecasting model. Two pipelines will be created: one for training AutoML model, and the other is for inference on AutoML model. We'll also demonstrate how to schedule the inference pipeline so you can get inference results periodically (with refreshed test dataset). Make sure you have executed the configuration notebook before running this notebook. In this notebook you will learn how to:\n",
|
||||
"In this notebook, we demonstrate how to use piplines to train and inference on AutoML Forecasting model. Two pipelines will be created: one for training AutoML model, and the other is for inference on AutoML model. We'll also demonstrate how to schedule the inference pipeline so you can get inference results periodically (with refreshed test dataset). Make sure you have executed the [configuration notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) before running this notebook. In this notebook you will learn how to:\n",
|
||||
"\n",
|
||||
"- Configure AutoML using AutoMLConfig for forecasting tasks using pipeline AutoMLSteps.\n",
|
||||
"- Create and register an AutoML model using AzureML pipeline.\n",
|
||||
|
||||
@@ -2,25 +2,24 @@
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In this notebook we will explore the univaraite time-series data to determine the settings for an automated ML experiment. We will follow the thought process depicted in the following diagram:<br/>\n",
|
||||
"In this notebook we will explore the univariate time-series data to determine the settings for an automated ML experiment. We will follow the thought process depicted in the following diagram:<br/>\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"The objective is to answer the following questions:\n",
|
||||
@@ -32,22 +31,20 @@
|
||||
" </ul>\n",
|
||||
" <li>Is the data stationary? </li>\n",
|
||||
" <ul style=\"margin-top:-1px; list-style-type:none\"> \n",
|
||||
" <li> Importance: In the absense of features that capture trend behavior, ML models (regression and tree based) are not well equiped to predict stochastic trends. Working with stationary data solves this problem. </li>\n",
|
||||
" <li> Importance: In the absence of features that capture trend behavior, ML models (regression and tree based) are not well equipped to predict stochastic trends. Working with stationary data solves this problem. </li>\n",
|
||||
" </ul>\n",
|
||||
" <li>Is there a detectable auto-regressive pattern in the stationary data? </li>\n",
|
||||
" <ul style=\"margin-top:-1px; list-style-type:none\"> \n",
|
||||
" <li> Importance: The accuracy of ML models can be improved if serial correlation is modeled by including lags of the dependent/target varaible as features. Including target lags in every experiment by default will result in a regression in accuracy scores if such setting is not warranted. </li>\n",
|
||||
" <li> Importance: The accuracy of ML models can be improved if serial correlation is modeled by including lags of the dependent/target variable as features. Including target lags in every experiment by default will result in a regression in accuracy scores if such setting is not warranted. </li>\n",
|
||||
" </ul>\n",
|
||||
"</ol>\n",
|
||||
"\n",
|
||||
"The answers to these questions will help determine the appropriate settings for the automated ML experiment.\n"
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import warnings\n",
|
||||
@@ -68,13 +65,13 @@
|
||||
"# set printing options\n",
|
||||
"pd.set_option(\"display.max_columns\", 500)\n",
|
||||
"pd.set_option(\"display.width\", 1000)"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# load data\n",
|
||||
"main_data_loc = \"data\"\n",
|
||||
@@ -89,13 +86,13 @@
|
||||
"df.sort_values(by=TIME_COLNAME, inplace=True)\n",
|
||||
"df.set_index(TIME_COLNAME, inplace=True)\n",
|
||||
"df.head(2)"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# plot the entire dataset\n",
|
||||
"fig, ax = plt.subplots(figsize=(6, 2), dpi=180)\n",
|
||||
@@ -103,20 +100,20 @@
|
||||
"ax.title.set_text(\"Original Data Series\")\n",
|
||||
"locs, labels = plt.xticks()\n",
|
||||
"plt.xticks(rotation=45)"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The graph plots the alcohol sales in the United States. Because the data is trending, it can be difficult to see cycles, seasonality or other interestng behaviors due to the scaling issues. For example, if there is a seasonal pattern, which we will discuss later, we cannot see them on the trending data. In such case, it is worth plotting the same data in first differences."
|
||||
]
|
||||
"The graph plots the alcohol sales in the United States. Because the data is trending, it can be difficult to see cycles, seasonality or other interesting behaviors due to the scaling issues. For example, if there is a seasonal pattern, which we will discuss later, we cannot see them on the trending data. In such case, it is worth plotting the same data in first differences."
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# plot the entire dataset in first differences\n",
|
||||
"fig, ax = plt.subplots(figsize=(6, 2), dpi=180)\n",
|
||||
@@ -124,18 +121,20 @@
|
||||
"ax.title.set_text(\"Data in first differences\")\n",
|
||||
"locs, labels = plt.xticks()\n",
|
||||
"plt.xticks(rotation=45)"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In the previous plot we observe that the data is more volatile towards the end of the series. This period coincides with the Covid-19 period, so we will exclude it from our experiment. Since in this example there are no user-provided features it is hard to make an argument that a model trained on the less volatile pre-covid data will be able to accurately predict the covid period."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# 1. Seasonality\n",
|
||||
"\n",
|
||||
@@ -144,13 +143,11 @@
|
||||
"2. If it's seasonal, does the data exhibit a trend (up or down)?\n",
|
||||
"\n",
|
||||
"It is hard to visually detect seasonality when the data is trending. The reason being is scale of seasonal fluctuations is dwarfed by the range of the trend in the data. One way to deal with this is to de-trend the data by taking the first differences. We will discuss this in more detail in the next section."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# plot the entire dataset in first differences\n",
|
||||
"fig, ax = plt.subplots(figsize=(6, 2), dpi=180)\n",
|
||||
@@ -158,20 +155,20 @@
|
||||
"ax.title.set_text(\"Data in first differences\")\n",
|
||||
"locs, labels = plt.xticks()\n",
|
||||
"plt.xticks(rotation=45)"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"For the next plot, we will exclude the Covid period again. We will also shorten the length of data because plotting a very long time series may prevent us from seeing seasonal patterns, if there are any, because the plot may look like a random walk."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# remove COVID period\n",
|
||||
"df = df[:COVID_PERIOD_START]\n",
|
||||
@@ -182,11 +179,13 @@
|
||||
"ax.title.set_text(\"Data in first differences\")\n",
|
||||
"locs, labels = plt.xticks()\n",
|
||||
"plt.xticks(rotation=45)"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<p style=\"font-size:150%; color:blue\"> Conclusion </p>\n",
|
||||
"\n",
|
||||
@@ -205,11 +204,11 @@
|
||||
" <li> In the first case, by taking first differences we are removing stochastic trend, but we do not remove seasonal patterns. In the second case, we do not remove the stochastic trend and it can be captured by the trend component of the STL decomposition. It is hard to say which option will work best in your case, hence you will need to run both options to see which one results in more accurate forecasts. </li>\n",
|
||||
" </ul>\n",
|
||||
"</ol>"
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# 2. Stationarity\n",
|
||||
"If the data does not exhibit seasonal patterns, we would like to see if the data is non-stationary. Particularly, we want to see if there is a clear trending behavior. If such behavior is observed, we would like to first difference the data and examine the plot of an auto-correlation function (ACF) known as correlogram. If the data is seasonal, differencing it will not get rid off the seasonality and this will be shown on the correlogram as well.\n",
|
||||
@@ -237,13 +236,11 @@
|
||||
"</ol>\n",
|
||||
"\n",
|
||||
"To answer the first question, we run a series of tests (we call them unit root tests)."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# unit root tests\n",
|
||||
"test = unit_root_test_wrapper(df[TARGET_COLNAME])\n",
|
||||
@@ -251,11 +248,13 @@
|
||||
"print(\"Summary table\", \"\\n\", test[\"summary\"], \"\\n\")\n",
|
||||
"print(\"Is the {} series stationary?: {}\".format(TARGET_COLNAME, test[\"stationary\"]))\n",
|
||||
"print(\"---------------\", \"\\n\")"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In the previous cell, we ran a series of unit root tests. The summary table contains the following columns:\n",
|
||||
"<ul> \n",
|
||||
@@ -277,13 +276,11 @@
|
||||
"Each of the tests shows that the original time series is non-stationary. The final decision is based on the majority rule. If, there is a split decision, the algorithm will claim it is stationary. We run a series of tests because each test by itself may not be accurate. In many cases when there are conflicting test results, the user needs to make determination if the series is stationary or not.\n",
|
||||
"\n",
|
||||
"Since we found the series to be non-stationary, we will difference it and then test if the differenced series is stationary."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# unit root tests\n",
|
||||
"test = unit_root_test_wrapper(df[TARGET_COLNAME].diff().dropna())\n",
|
||||
@@ -291,20 +288,20 @@
|
||||
"print(\"Summary table\", \"\\n\", test[\"summary\"], \"\\n\")\n",
|
||||
"print(\"Is the {} series stationary?: {}\".format(TARGET_COLNAME, test[\"stationary\"]))\n",
|
||||
"print(\"---------------\", \"\\n\")"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Four out of five tests show that the series in first differences is stationary. Notice that this decision is not unanimous. Next, let's plot the original series in first-differences to illustrate the difference between non-stationary (unit root) process vs the stationary one."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# plot original and stationary data\n",
|
||||
"fig = plt.figure(figsize=(10, 10))\n",
|
||||
@@ -314,29 +311,31 @@
|
||||
"ax2.plot(df[TARGET_COLNAME].diff().dropna(), \"-b\")\n",
|
||||
"ax1.title.set_text(\"Original data\")\n",
|
||||
"ax2.title.set_text(\"Data in first differences\")"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If you were asked a question \"What is the mean of the series before and after 2008?\", for the series titled \"Original data\" the mean values will be significantly different. This implies that the first moment of the series (in this case, it is the mean) is time dependent, i.e., mean changes depending on the interval one is looking at. Thus, the series is deemed to be non-stationary. On the other hand, for the series titled \"Data in first differences\" the means for both periods are roughly the same. Hence, the first moment is time invariant; meaning it does not depend on the interval of time one is looking at. In this example it is easy to visually distinguish between stationary and non-stationary data. Often this distinction is not easy to make, therefore we rely on the statistical tests described above to help us make an informed decision. "
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<p style=\"font-size:150%; color:blue\"> Conclusion </p>\n",
|
||||
"Since we found the original process to be non-stationary (contains unit root), we will have to model the data in first differences. As a result, we will set the DIFFERENCE_SERIES parameter to True."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# 3 Check if there is a clear autoregressive pattern\n",
|
||||
"We need to determine if we should include lags of the target variable as features in order to improve forecast accuracy. To do this, we will examine the ACF and partial ACF (PACF) plots of the stationary series. In our case, it is a series in first diffrences.\n",
|
||||
"# 3 Check if there is a clear auto-regressive pattern\n",
|
||||
"We need to determine if we should include lags of the target variable as features in order to improve forecast accuracy. To do this, we will examine the ACF and partial ACF (PACF) plots of the stationary series. In our case, it is a series in first differences.\n",
|
||||
"\n",
|
||||
"<ul>\n",
|
||||
" <li> Question: What is an Auto-regressive pattern? What are we looking for? </li>\n",
|
||||
@@ -347,11 +346,11 @@
|
||||
" The lag order is on the x-axis while the auto- and partial-correlation coefficients are on the y-axis. Vertical lines that are outside the shaded area represent statistically significant lags. Notice, the ACF function decays to zero and the PACF shows 2 significant spikes (we ignore the first spike for lag 0 in both plots since the linear relationship of any series with itself is always 1). <li/>\n",
|
||||
" </ul>\n",
|
||||
"<ul/>"
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<ul>\n",
|
||||
" <li> Question: What do I do if I observe an auto-regressive behavior? </li>\n",
|
||||
@@ -365,32 +364,32 @@
|
||||
" <br/>\n",
|
||||
" <li> Next, let's examine the ACF and PACF plots of the stationary target variable (depicted below). Here, we do not see a decay in the ACF, instead we see a decay in PACF. It is hard to make an argument the the target variable exhibits auto-regressive behavior. </li>\n",
|
||||
" </ul>"
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Plot the ACF/PACF for the series in differences\n",
|
||||
"fig, ax = plt.subplots(1, 2, figsize=(10, 5))\n",
|
||||
"plot_acf(df[TARGET_COLNAME].diff().dropna().values.squeeze(), ax=ax[0])\n",
|
||||
"plot_pacf(df[TARGET_COLNAME].diff().dropna().values.squeeze(), ax=ax[1])\n",
|
||||
"plt.show()"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<p style=\"font-size:150%; color:blue\"> Conclusion </p>\n",
|
||||
"Since we do not see a clear indication of an AR(p) process, we will not be using target lags and will set the TARGET_LAGS parameter to None."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<p style=\"font-size:150%; color:blue; font-weight: bold\"> AutoML Experiment Settings </p>\n",
|
||||
"Based on the analysis performed, we should try the following settings for the AutoML experiment and use them in the \"2_run_experiment\" notebook.\n",
|
||||
@@ -399,11 +398,11 @@
|
||||
" <li> DIFFERENCE_SERIES=True </li>\n",
|
||||
" <li> TARGET_LAGS=None </li>\n",
|
||||
"</ul>"
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Appendix: ACF, PACF and Lag Selection\n",
|
||||
"To do this, we will examine the ACF and partial ACF (PACF) plots of the differenced series. \n",
|
||||
@@ -418,23 +417,23 @@
|
||||
" </li>\n",
|
||||
" where $\\sigma_{xzy}$ is the covariance between two random variables $X$ and $Z$; $\\sigma_x$ and $\\sigma_z$ is the variance for $X$ and $Z$, respectively. The correlation coefficient measures the strength of linear relationship between two random variables. This metric can take any value from -1 to 1. <li/>\n",
|
||||
" <br/>\n",
|
||||
" <li> The auto-correlation coefficient $\\rho_{Y_{t} Y_{t-k}}$ is the time series equivalent of the correlation coefficient, except instead of measuring linear association between two random variables $X$ and $Z$, it measures the strength of a linear relationship between a random variable $Y_t$ and its lag $Y_{t-k}$ for any positive interger value of $k$. </li> \n",
|
||||
" <li> The auto-correlation coefficient $\\rho_{Y_{t} Y_{t-k}}$ is the time series equivalent of the correlation coefficient, except instead of measuring linear association between two random variables $X$ and $Z$, it measures the strength of a linear relationship between a random variable $Y_t$ and its lag $Y_{t-k}$ for any positive integer value of $k$. </li> \n",
|
||||
" <br />\n",
|
||||
" <li> To visualize the ACF for a particular lag, say lag 2, plot the second lag of a series $y_{t-2}$ on the x-axis, and plot the series itself $y_t$ on the y-axis. The autocorrelation coefficient is the slope of the best fitted regression line and can be interpreted as follows. A one unit increase in the lag of a variable one period ago leads to a $\\rho_{Y_{t} Y_{t-2}}$ units change in the variable in the current period. This interpreation can be applied to any lag. </li> \n",
|
||||
" <li> To visualize the ACF for a particular lag, say lag 2, plot the second lag of a series $y_{t-2}$ on the x-axis, and plot the series itself $y_t$ on the y-axis. The autocorrelation coefficient is the slope of the best fitted regression line and can be interpreted as follows. A one unit increase in the lag of a variable one period ago leads to a $\\rho_{Y_{t} Y_{t-2}}$ units change in the variable in the current period. This interpretation can be applied to any lag. </li> \n",
|
||||
" <br />\n",
|
||||
" <li> In the interpretation posted above we need to be careful not to confuse the word \"leads\" with \"causes\" since these are not the same thing. We do not know the lagged value of the varaible causes it to change. Afterall, there are probably many other features that may explain the movement in $Y_t$. All we are trying to do in this section is to identify situations when the variable contains the strong auto-regressive components that needs to be included in the model to improve forecast accuracy. </li>\n",
|
||||
" <li> In the interpretation posted above we need to be careful not to confuse the word \"leads\" with \"causes\" since these are not the same thing. We do not know the lagged value of the variable causes it to change. After all, there are probably many other features that may explain the movement in $Y_t$. All we are trying to do in this section is to identify situations when the variable contains the strong auto-regressive components that needs to be included in the model to improve forecast accuracy. </li>\n",
|
||||
" </ul>\n",
|
||||
"</ul>"
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<ul>\n",
|
||||
" <li> Question: What is the PACF? </li>\n",
|
||||
" <ul style=\"list-style-type:none;\">\n",
|
||||
" <li> When describing the ACF we essentially running a regression between a partigular lag of a series, say, lag 4, and the series itself. What this implies is the regression coefficient for lag 4 captures the impact of everything that happens in lags 1, 2 and 3. In other words, if lag 1 is the most important lag and we exclude it from the regression, naturally, the regression model will assign the importance of the 1st lag to the 4th one. Partial auto-correlation function fixes this problem since it measures the contribution of each lag accounting for the information added by the intermediary lags. If we were to illustrate ACF and PACF for the fourth lag using the regression analogy, the difference is a follows: \n",
|
||||
" <li> When describing the ACF we essentially running a regression between a particular lag of a series, say, lag 4, and the series itself. What this implies is the regression coefficient for lag 4 captures the impact of everything that happens in lags 1, 2 and 3. In other words, if lag 1 is the most important lag and we exclude it from the regression, naturally, the regression model will assign the importance of the 1st lag to the 4th one. Partial auto-correlation function fixes this problem since it measures the contribution of each lag accounting for the information added by the intermediary lags. If we were to illustrate ACF and PACF for the fourth lag using the regression analogy, the difference is a follows: \n",
|
||||
" \\begin{align}\n",
|
||||
" Y_{t} &= a_{0} + a_{4} Y_{t-4} + e_{t} \\\\\n",
|
||||
" Y_{t} &= b_{0} + b_{1} Y_{t-1} + b_{2} Y_{t-2} + b_{3} Y_{t-3} + b_{4} Y_{t-4} + \\varepsilon_{t} \\\\\n",
|
||||
@@ -442,27 +441,28 @@
|
||||
" </li>\n",
|
||||
" <br/>\n",
|
||||
" <li>\n",
|
||||
" Here, you can think of $a_4$ and $b_{4}$ as the auto- and partial auto-correlation coefficients for lag 4. Notice, in the second equation we explicitely accounting for the intermediate lags by adding them as regrerssors.\n",
|
||||
" Here, you can think of $a_4$ and $b_{4}$ as the auto- and partial auto-correlation coefficients for lag 4. Notice, in the second equation we explicitly accounting for the intermediate lags by adding them as regressors.\n",
|
||||
" </li>\n",
|
||||
" </ul>\n",
|
||||
"</ul>"
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<ul>\n",
|
||||
" <li> Question: Auto-regressive pattern? What are we looking for? </li>\n",
|
||||
" <ul style=\"list-style-type:none;\">\n",
|
||||
" <li> We are looking for a classical profiles for an AR(p) process such as an exponential decay of an ACF and a the first $p$ significant lags of the PACF. Let's examine the ACF/PACF profiles of the same simulated AR(2) shown in Section 3, and check if the ACF/PACF explanation are refelcted in these plots. <li/>\n",
|
||||
" <li> We are looking for a classical profiles for an AR(p) process such as an exponential decay of an ACF and a the first $p$ significant lags of the PACF. Let's examine the ACF/PACF profiles of the same simulated AR(2) shown in Section 3, and check if the ACF/PACF explanation are reflected in these plots. <li/>\n",
|
||||
" <li><img src=\"figures/ACF_PACF_for_AR2.png\" class=\"img_class\">\n",
|
||||
" <li> The autocorrelation coefficient for the 3rd lag is 0.6, which can be interpreted that a one unit increase in the value of the target varaible three periods ago leads to 0.6 units increase in the current period. However, the PACF plot shows that the partial autocorrealtion coefficient is zero (from a statistical point of view since it lies within the shaded region). This is happening because the 1st and 2nd lags are good predictors of the target variable. Ommiting these two lags from the regression results in the misleading conclusion that the third lag is a good prediciton. <li/>\n",
|
||||
" <li> The autocorrelation coefficient for the 3rd lag is 0.6, which can be interpreted that a one unit increase in the value of the target variable three periods ago leads to 0.6 units increase in the current period. However, the PACF plot shows that the partial autocorrelation coefficient is zero (from a statistical point of view since it lies within the shaded region). This is happening because the 1st and 2nd lags are good predictors of the target variable. Omitting these two lags from the regression results in the misleading conclusion that the third lag is a good prediction. <li/>\n",
|
||||
" <br/>\n",
|
||||
" <li> This is why it is important to examine both the ACF and the PACF plots when tring to determine the auto regressive order for the variable in question. <li/>\n",
|
||||
" <li> This is why it is important to examine both the ACF and the PACF plots when trying to determine the auto regressive order for the variable in question. <li/>\n",
|
||||
" </ul>\n",
|
||||
"</ul> "
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
@@ -472,21 +472,32 @@
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"name": "python38-azureml",
|
||||
"language": "python",
|
||||
"name": "python38-azureml"
|
||||
"display_name": "Python 3.8 - AzureML"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python",
|
||||
"version": "3.8.10",
|
||||
"mimetype": "text/x-python",
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.9"
|
||||
"nbconvert_exporter": "python",
|
||||
"file_extension": ".py"
|
||||
},
|
||||
"microsoft": {
|
||||
"ms_spell_check": {
|
||||
"ms_spell_check_language": "en"
|
||||
}
|
||||
},
|
||||
"kernel_info": {
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"nteract": {
|
||||
"version": "nteract-front-end@1.0.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -2,23 +2,22 @@
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Running AutoML experiments\n",
|
||||
"\n",
|
||||
@@ -27,20 +26,18 @@
|
||||
"<br/>\n",
|
||||
"\n",
|
||||
"The output generated by this notebook is saved in the `experiment_output`folder."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Setup"
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import logging\n",
|
||||
@@ -63,21 +60,21 @@
|
||||
"np.set_printoptions(precision=4, suppress=True, linewidth=100)\n",
|
||||
"pd.set_option(\"display.max_columns\", 500)\n",
|
||||
"pd.set_option(\"display.width\", 1000)"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"As part of the setup you have already created a **Workspace**. You will also need to create a [compute target](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets#amlcompute) for your AutoML run. In this tutorial, you create AmlCompute as your training compute resource.\n",
|
||||
"> Note that if you have an AzureML Data Scientist role, you will not have permission to create compute resources. Talk to your workspace or IT admin to create the compute targets described in this section, if they do not already exist."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ws = Workspace.from_config()\n",
|
||||
"amlcompute_cluster_name = \"recipe-cluster\"\n",
|
||||
@@ -107,22 +104,22 @@
|
||||
"compute_target.wait_for_completion(\n",
|
||||
" show_output=True, min_node_count=None, timeout_in_minutes=20\n",
|
||||
")"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Data\n",
|
||||
"\n",
|
||||
"Here, we will load the data from the csv file and drop the Covid period."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"main_data_loc = \"data\"\n",
|
||||
"train_file_name = \"S4248SM144SCEN.csv\"\n",
|
||||
@@ -140,32 +137,34 @@
|
||||
"\n",
|
||||
"# remove the Covid period\n",
|
||||
"df = df.query('{} <= \"{}\"'.format(TIME_COLNAME, COVID_PERIOD_START))"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Set parameters\n",
|
||||
"\n",
|
||||
"The first set of parameters is based on the analysis performed in the `auto-ml-forecasting-univariate-recipe-experiment-settings` notebook. "
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# set parameters based on the settings notebook analysis\n",
|
||||
"DIFFERENCE_SERIES = True\n",
|
||||
"TARGET_LAGS = None\n",
|
||||
"STL_TYPE = None"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Next, define additional parameters to be used in the <a href=\"https://docs.microsoft.com/en-us/python/api/azureml-train-automl-client/azureml.train.automl.automlconfig?view=azure-ml-py\"> AutoML config </a> class.\n",
|
||||
"\n",
|
||||
@@ -180,32 +179,30 @@
|
||||
" </ul>\n",
|
||||
" </li>\n",
|
||||
"</ul>\n"
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# set other parameters\n",
|
||||
"FORECAST_HORIZON = 12\n",
|
||||
"TIME_SERIES_ID_COLNAMES = []\n",
|
||||
"BLOCKED_MODELS = []"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"To run AutoML, you also need to create an **Experiment**. An Experiment corresponds to a prediction problem you are trying to solve, while a Run corresponds to a specific approach to the problem."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# choose a name for the run history container in the workspace\n",
|
||||
"if isinstance(TARGET_LAGS, list):\n",
|
||||
@@ -232,38 +229,38 @@
|
||||
"pd.set_option(\"display.max_colwidth\", None)\n",
|
||||
"outputDf = pd.DataFrame(data=output, index=[\"\"])\n",
|
||||
"print(outputDf.T)"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# create output directory\n",
|
||||
"output_dir = \"experiment_output/{}\".format(experiment_desc)\n",
|
||||
"if not os.path.exists(output_dir):\n",
|
||||
" os.makedirs(output_dir)"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# difference data and test for unit root\n",
|
||||
"if DIFFERENCE_SERIES:\n",
|
||||
" df_delta = df.copy()\n",
|
||||
" df_delta[TARGET_COLNAME] = df[TARGET_COLNAME].diff()\n",
|
||||
" df_delta.dropna(axis=0, inplace=True)"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# split the data into train and test set\n",
|
||||
"if DIFFERENCE_SERIES:\n",
|
||||
@@ -281,64 +278,51 @@
|
||||
" time_colname=TIME_COLNAME,\n",
|
||||
" ts_id_colnames=TIME_SERIES_ID_COLNAMES,\n",
|
||||
" )"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Upload files to the Datastore\n",
|
||||
"The [Machine Learning service workspace](https://docs.microsoft.com/en-us/azure/machine-learning/service/concept-workspace) is paired with the storage account, which contains the default data store. We will use it to upload the bike share data and create [tabular dataset](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.tabulardataset?view=azure-ml-py) for training. A tabular dataset defines a series of lazily-evaluated, immutable operations to load data from the data source into tabular representation."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_train.to_csv(\"train.csv\", index=False)\n",
|
||||
"df_test.to_csv(\"test.csv\", index=False)\n",
|
||||
"\n",
|
||||
"from azureml.data.dataset_factory import TabularDatasetFactory\n",
|
||||
"\n",
|
||||
"datastore = ws.get_default_datastore()\n",
|
||||
"datastore.upload_files(\n",
|
||||
" files=[\"./train.csv\"],\n",
|
||||
" target_path=\"uni-recipe-dataset/tabular/\",\n",
|
||||
" overwrite=True,\n",
|
||||
" show_progress=True,\n",
|
||||
"train_dataset = TabularDatasetFactory.register_pandas_dataframe(\n",
|
||||
" df_train, target=(datastore, \"dataset/\"), name=\"train\"\n",
|
||||
")\n",
|
||||
"datastore.upload_files(\n",
|
||||
" files=[\"./test.csv\"],\n",
|
||||
" target_path=\"uni-recipe-dataset/tabular/\",\n",
|
||||
" overwrite=True,\n",
|
||||
" show_progress=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"from azureml.core import Dataset\n",
|
||||
"\n",
|
||||
"train_dataset = Dataset.Tabular.from_delimited_files(\n",
|
||||
" path=[(datastore, \"uni-recipe-dataset/tabular/train.csv\")]\n",
|
||||
")\n",
|
||||
"test_dataset = Dataset.Tabular.from_delimited_files(\n",
|
||||
" path=[(datastore, \"uni-recipe-dataset/tabular/test.csv\")]\n",
|
||||
"test_dataset = TabularDatasetFactory.register_pandas_dataframe(\n",
|
||||
" df_test, target=(datastore, \"dataset/\"), name=\"test\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# print the first 5 rows of the Dataset\n",
|
||||
"train_dataset.to_pandas_dataframe().reset_index(drop=True).head(5)"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Config AutoML"
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"time_series_settings = {\n",
|
||||
" \"time_column_name\": TIME_COLNAME,\n",
|
||||
@@ -365,76 +349,76 @@
|
||||
" compute_target=compute_target,\n",
|
||||
" **time_series_settings,\n",
|
||||
")"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We will now run the experiment, you can go to Azure ML portal to view the run details."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"remote_run = experiment.submit(automl_config, show_output=False)\n",
|
||||
"remote_run.wait_for_completion()"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Retrieve the Best Run details\n",
|
||||
"Below we retrieve the best Run object from among all the runs in the experiment."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"best_run = remote_run.get_best_child()\n",
|
||||
"best_run"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Inference\n",
|
||||
"\n",
|
||||
"We now use the best fitted model from the AutoML Run to make forecasts for the test set. We will do batch scoring on the test dataset which should have the same schema as training dataset.\n",
|
||||
"\n",
|
||||
"The inference will run on a remote compute. In this example, it will re-use the training compute."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test_experiment = Experiment(ws, experiment_name + \"_inference\")"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Retreiving forecasts from the model\n",
|
||||
"We have created a function called `run_forecast` that submits the test data to the best model determined during the training run and retrieves forecasts. This function uses a helper script `forecasting_script` which is uploaded and expecuted on the remote compute."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from run_forecast import run_remote_inference\n",
|
||||
"\n",
|
||||
@@ -448,31 +432,31 @@
|
||||
"remote_run.wait_for_completion(show_output=False)\n",
|
||||
"\n",
|
||||
"remote_run.download_file(\"outputs/predictions.csv\", f\"{output_dir}/predictions.csv\")"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Download the prediction result for metrics calcuation\n",
|
||||
"The test data with predictions are saved in artifact `outputs/predictions.csv`. We will use it to calculate accuracy metrics and vizualize predictions versus actuals."
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"X_trans = pd.read_csv(f\"{output_dir}/predictions.csv\", parse_dates=[TIME_COLNAME])\n",
|
||||
"X_trans.head()"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# convert forecast in differences to levels\n",
|
||||
"def convert_fcst_diff_to_levels(fcst, yt, df_orig):\n",
|
||||
@@ -486,13 +470,13 @@
|
||||
" )\n",
|
||||
" out.rename(columns={TARGET_COLNAME: \"actual_level\"}, inplace=True)\n",
|
||||
" return out"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if DIFFERENCE_SERIES:\n",
|
||||
" # convert forecast in differences to the levels\n",
|
||||
@@ -506,20 +490,20 @@
|
||||
" fcst_df[\"predicted_level\"] = y_predictions\n",
|
||||
"\n",
|
||||
"del X_trans"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Calculate metrics and save output"
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# compute metrics\n",
|
||||
"metrics_df = compute_metrics(fcst_df=fcst_df, metric_name=None, ts_id_colnames=None)\n",
|
||||
@@ -530,20 +514,20 @@
|
||||
"\n",
|
||||
"metrics_df.to_csv(os.path.join(output_dir, metrics_file_name), index=True)\n",
|
||||
"fcst_df.to_csv(os.path.join(output_dir, fcst_file_name), index=True)"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Generate and save visuals"
|
||||
]
|
||||
],
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"plot_df = df.query('{} > \"2010-01-01\"'.format(TIME_COLNAME))\n",
|
||||
"plot_df.set_index(TIME_COLNAME, inplace=True)\n",
|
||||
@@ -562,7 +546,10 @@
|
||||
"\n",
|
||||
"plt.setp(labels, rotation=45)\n",
|
||||
"plt.savefig(os.path.join(output_dir, plot_file_name))"
|
||||
]
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"metadata": {}
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
@@ -572,26 +559,37 @@
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"name": "python38-azureml",
|
||||
"language": "python",
|
||||
"name": "python38-azureml"
|
||||
"display_name": "Python 3.8 - AzureML"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python",
|
||||
"version": "3.8.5",
|
||||
"mimetype": "text/x-python",
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.5"
|
||||
"nbconvert_exporter": "python",
|
||||
"file_extension": ".py"
|
||||
},
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
"hash": "6bd77c88278e012ef31757c15997a7bea8c943977c43d6909403c00ae11d43ca"
|
||||
}
|
||||
},
|
||||
"microsoft": {
|
||||
"ms_spell_check": {
|
||||
"ms_spell_check_language": "en"
|
||||
}
|
||||
},
|
||||
"kernel_info": {
|
||||
"name": "python3"
|
||||
},
|
||||
"nteract": {
|
||||
"version": "nteract-front-end@1.0.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -239,7 +239,7 @@
|
||||
"\n",
|
||||
"env = Environment(\"deploytocloudenv\")\n",
|
||||
"env.python.conda_dependencies.add_pip_package(\"joblib\")\n",
|
||||
"env.python.conda_dependencies.add_pip_package(\"numpy\")\n",
|
||||
"env.python.conda_dependencies.add_pip_package(\"numpy==1.23\")\n",
|
||||
"env.python.conda_dependencies.add_pip_package(\"scikit-learn=={}\".format(sklearn.__version__))"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -91,7 +91,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import joblib\n",
|
||||
"import dill\n",
|
||||
"\n",
|
||||
"from sklearn.datasets import load_diabetes\n",
|
||||
"from sklearn.linear_model import Ridge\n",
|
||||
@@ -101,7 +101,7 @@
|
||||
"\n",
|
||||
"model = Ridge().fit(dataset_x, dataset_y)\n",
|
||||
"\n",
|
||||
"joblib.dump(model, 'sklearn_regression_model.pkl')"
|
||||
"dill.dump(model, open('sklearn_regression_model.pkl', 'wb'))"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -285,7 +285,8 @@
|
||||
" 'azureml-defaults',\n",
|
||||
" 'inference-schema[numpy-support]',\n",
|
||||
" 'joblib',\n",
|
||||
" 'numpy',\n",
|
||||
" 'dill==0.3.6',\n",
|
||||
" 'numpy==1.23',\n",
|
||||
" 'scikit-learn=={}'.format(sklearn.__version__)\n",
|
||||
"])"
|
||||
]
|
||||
@@ -486,7 +487,8 @@
|
||||
" 'azureml-defaults',\n",
|
||||
" 'inference-schema[numpy-support]',\n",
|
||||
" 'joblib',\n",
|
||||
" 'numpy',\n",
|
||||
" 'dill==0.3.6',\n",
|
||||
" 'numpy==1.23',\n",
|
||||
" 'scikit-learn=={}'.format(sklearn.__version__)\n",
|
||||
"])\n",
|
||||
"inference_config = InferenceConfig(entry_script='score.py', environment=environment)\n",
|
||||
|
||||
@@ -1,373 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Deploy models to Azure Kubernetes Service (AKS) using controlled roll out\n",
|
||||
"This notebook will show you how to deploy mulitple AKS webservices with the same scoring endpoint and how to roll out your models in a controlled manner by configuring % of scoring traffic going to each webservice. If you are using a Notebook VM, you are all set. Otherwise, go through the [configuration notebook](../../../configuration.ipynb) to install the Azure Machine Learning Python SDK and create an Azure ML Workspace."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Check for latest version\n",
|
||||
"import azureml.core\n",
|
||||
"print(azureml.core.VERSION)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialize workspace\n",
|
||||
"Create a [Workspace](https://docs.microsoft.com/python/api/azureml-core/azureml.core.workspace%28class%29?view=azure-ml-py) object from your persisted configuration."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.workspace import Workspace\n",
|
||||
"\n",
|
||||
"ws = Workspace.from_config()\n",
|
||||
"print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\\n')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Register the model\n",
|
||||
"Register a file or folder as a model by calling [Model.register()](https://docs.microsoft.com/python/api/azureml-core/azureml.core.model.model?view=azure-ml-py#register-workspace--model-path--model-name--tags-none--properties-none--description-none--datasets-none--model-framework-none--model-framework-version-none--child-paths-none-).\n",
|
||||
"In addition to the content of the model file itself, your registered model will also store model metadata -- model description, tags, and framework information -- that will be useful when managing and deploying models in your workspace. Using tags, for instance, you can categorize your models and apply filters when listing models in your workspace."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Model\n",
|
||||
"\n",
|
||||
"model = Model.register(workspace=ws,\n",
|
||||
" model_name='sklearn_regression_model.pkl', # Name of the registered model in your workspace.\n",
|
||||
" model_path='./sklearn_regression_model.pkl', # Local file to upload and register as a model.\n",
|
||||
" model_framework=Model.Framework.SCIKITLEARN, # Framework used to create the model.\n",
|
||||
" model_framework_version='0.19.1', # Version of scikit-learn used to create the model.\n",
|
||||
" description='Ridge regression model to predict diabetes progression.',\n",
|
||||
" tags={'area': 'diabetes', 'type': 'regression'})\n",
|
||||
"\n",
|
||||
"print('Name:', model.name)\n",
|
||||
"print('Version:', model.version)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Register an environment (for all models)\n",
|
||||
"\n",
|
||||
"If you control over how your model is run, or if it has special runtime requirements, you can specify your own environment and scoring method.\n",
|
||||
"\n",
|
||||
"Specify the model's runtime environment by creating an [Environment](https://docs.microsoft.com/python/api/azureml-core/azureml.core.environment%28class%29?view=azure-ml-py) object and providing the [CondaDependencies](https://docs.microsoft.com/python/api/azureml-core/azureml.core.conda_dependencies.condadependencies?view=azure-ml-py) needed by your model."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Environment\n",
|
||||
"from azureml.core.conda_dependencies import CondaDependencies\n",
|
||||
"\n",
|
||||
"environment=Environment('my-sklearn-environment')\n",
|
||||
"environment.python.conda_dependencies = CondaDependencies.create(conda_packages=[\n",
|
||||
" 'pip==20.2.4'],\n",
|
||||
" pip_packages=[\n",
|
||||
" 'azureml-defaults',\n",
|
||||
" 'inference-schema[numpy-support]',\n",
|
||||
" 'numpy',\n",
|
||||
" 'scikit-learn==0.22.1',\n",
|
||||
" 'scipy'\n",
|
||||
"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"When using a custom environment, you must also provide Python code for initializing and running your model. An example script is included with this notebook."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open('score.py') as f:\n",
|
||||
" print(f.read())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create the InferenceConfig\n",
|
||||
"Create the inference configuration to reference your environment and entry script during deployment"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.model import InferenceConfig\n",
|
||||
"\n",
|
||||
"inference_config = InferenceConfig(entry_script='score.py', \n",
|
||||
" source_directory='.',\n",
|
||||
" environment=environment)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Provision the AKS Cluster\n",
|
||||
"If you already have an AKS cluster attached to this workspace, skip the step below and provide the name of the cluster.\n",
|
||||
"\n",
|
||||
"> Note that if you have an AzureML Data Scientist role, you will not have permission to create compute resources. Talk to your workspace or IT admin to create the compute targets described in this section, if they do not already exist."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.compute import AksCompute\n",
|
||||
"from azureml.core.compute import ComputeTarget\n",
|
||||
"# Use the default configuration (can also provide parameters to customize)\n",
|
||||
"prov_config = AksCompute.provisioning_configuration()\n",
|
||||
"\n",
|
||||
"aks_name = 'my-aks' \n",
|
||||
"# Create the cluster\n",
|
||||
"aks_target = ComputeTarget.create(workspace = ws, \n",
|
||||
" name = aks_name, \n",
|
||||
" provisioning_configuration = prov_config) \n",
|
||||
"aks_target.wait_for_completion(show_output=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create an Endpoint and add a version (AKS service)\n",
|
||||
"This creates a new endpoint and adds a version behind it. By default the first version added is the default version. You can specify the traffic percentile a version takes behind an endpoint. \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# deploying the model and create a new endpoint\n",
|
||||
"from azureml.core.webservice import AksEndpoint\n",
|
||||
"# from azureml.core.compute import ComputeTarget\n",
|
||||
"\n",
|
||||
"#select a created compute\n",
|
||||
"compute = ComputeTarget(ws, 'my-aks')\n",
|
||||
"namespace_name=\"endpointnamespace\"\n",
|
||||
"# define the endpoint name\n",
|
||||
"endpoint_name = \"myendpoint1\"\n",
|
||||
"# define the service name\n",
|
||||
"version_name= \"versiona\"\n",
|
||||
"\n",
|
||||
"endpoint_deployment_config = AksEndpoint.deploy_configuration(tags = {'modelVersion':'firstversion', 'department':'finance'}, \n",
|
||||
" description = \"my first version\", namespace = namespace_name, \n",
|
||||
" version_name = version_name, traffic_percentile = 40)\n",
|
||||
"\n",
|
||||
"endpoint = Model.deploy(ws, endpoint_name, [model], inference_config, endpoint_deployment_config, compute)\n",
|
||||
"endpoint.wait_for_deployment(True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"endpoint.get_logs()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Add another version of the service to an existing endpoint\n",
|
||||
"This adds another version behind an existing endpoint. You can specify the traffic percentile the new version takes. If no traffic_percentile is specified then it defaults to 0. All the unspecified traffic percentile (in this example 50) across all versions goes to default version."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Adding a new version to an existing Endpoint.\n",
|
||||
"version_name_add=\"versionb\" \n",
|
||||
"\n",
|
||||
"endpoint.create_version(version_name = version_name_add, inference_config=inference_config, models=[model], tags = {'modelVersion':'secondversion', 'department':'finance'}, \n",
|
||||
" description = \"my second version\", traffic_percentile = 10)\n",
|
||||
"endpoint.wait_for_deployment(True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Update an existing version in an endpoint\n",
|
||||
"There are two types of versions: control and treatment. An endpoint contains one or more treatment versions but only one control version. This categorization helps compare the different versions against the defined control version."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"endpoint.update_version(version_name=endpoint.versions[version_name_add].name, description=\"my second version update\", traffic_percentile=40, is_default=True, is_control_version_type=True)\n",
|
||||
"endpoint.wait_for_deployment(True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Test the web service using run method\n",
|
||||
"Test the web sevice by passing in data. Run() method retrieves API keys behind the scenes to make sure that call is authenticated."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Scoring on endpoint\n",
|
||||
"import json\n",
|
||||
"test_sample = json.dumps({'data': [\n",
|
||||
" [1,2,3,4,5,6,7,8,9,10], \n",
|
||||
" [10,9,8,7,6,5,4,3,2,1]\n",
|
||||
"]})\n",
|
||||
"\n",
|
||||
"test_sample_encoded = bytes(test_sample, encoding='utf8')\n",
|
||||
"prediction = endpoint.run(input_data=test_sample_encoded)\n",
|
||||
"print(prediction)"
|
||||
]
|
||||
},
|
||||
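If you prefer to score over raw HTTP instead of `endpoint.run()`, the minimal sketch below sends the same payload with an authentication key. It assumes key-based auth is enabled on the AKS endpoint and that `scoring_uri` and `get_keys()` are available on the endpoint object; none of this is from the original notebook.

```python
import json
import requests

# Retrieve the endpoint's primary/secondary keys (assumed available via get_keys()).
primary_key, _secondary_key = endpoint.get_keys()

headers = {
    "Content-Type": "application/json",
    "Authorization": "Bearer " + primary_key,
}

test_sample = json.dumps({"data": [
    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
]})

# POST directly to the endpoint's scoring URI.
response = requests.post(endpoint.scoring_uri, data=test_sample, headers=headers)
print(response.status_code, response.json())
```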
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Delete Resources"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# deleting a version in an endpoint\n",
|
||||
"endpoint.delete_version(version_name=version_name)\n",
|
||||
"endpoint.wait_for_deployment(True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# deleting an endpoint, this will delete all versions in the endpoint and the endpoint itself\n",
|
||||
"endpoint.delete()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"authors": [
|
||||
{
|
||||
"name": "shipatel"
|
||||
}
|
||||
],
|
||||
"category": "deployment",
|
||||
"compute": [
|
||||
"None"
|
||||
],
|
||||
"datasets": [
|
||||
"Diabetes"
|
||||
],
|
||||
"deployment": [
|
||||
"Azure Kubernetes Service"
|
||||
],
|
||||
"exclude_from_index": false,
|
||||
"framework": [
|
||||
"Scikit-learn"
|
||||
],
|
||||
"friendly_name": "Deploy models to AKS using controlled roll out",
|
||||
"index_order": 3,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.0"
|
||||
},
|
||||
"star_tag": [
|
||||
"featured"
|
||||
],
|
||||
"tags": [
|
||||
"None"
|
||||
],
|
||||
"task": "Deploy a model with Azure Machine Learning"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -1,4 +0,0 @@
|
||||
name: deploy-aks-with-controlled-rollout
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
@@ -1,28 +0,0 @@
|
||||
import pickle
|
||||
import json
|
||||
import numpy
|
||||
from sklearn.externals import joblib
|
||||
from sklearn.linear_model import Ridge
|
||||
from azureml.core.model import Model
|
||||
|
||||
|
||||
def init():
|
||||
global model
|
||||
# note here "sklearn_regression_model.pkl" is the name of the model registered under
|
||||
# this is a different behavior than before when the code is run locally, even though the code is the same.
|
||||
model_path = Model.get_model_path('sklearn_regression_model.pkl')
|
||||
# deserialize the model file back into a sklearn model
|
||||
model = joblib.load(model_path)
|
||||
|
||||
|
||||
# note you can pass in multiple rows for scoring
|
||||
def run(raw_data):
|
||||
try:
|
||||
data = json.loads(raw_data)['data']
|
||||
data = numpy.array(data)
|
||||
result = model.predict(data)
|
||||
# you can return any data type as long as it is JSON-serializable
|
||||
return result.tolist()
|
||||
except Exception as e:
|
||||
error = str(e)
|
||||
return error
|
||||
Binary file not shown.
@@ -5,4 +5,4 @@ dependencies:
|
||||
- matplotlib
|
||||
- tqdm
|
||||
- scipy
|
||||
- sklearn
|
||||
- scikit-learn
|
||||
|
||||
@@ -5,4 +5,4 @@ dependencies:
|
||||
- matplotlib
|
||||
- tqdm
|
||||
- scipy
|
||||
- sklearn
|
||||
- scikit-learn
|
||||
|
||||
@@ -106,7 +106,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"This notebook was created using version 1.48.0 of the Azure ML SDK\")\n",
|
||||
"print(\"This notebook was created using version 1.51.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
@@ -239,6 +239,7 @@
|
||||
"available_packages = pkg_resources.working_set\n",
|
||||
"pandas_ver = None\n",
|
||||
"numpy_ver = None\n",
|
||||
"sklearn_ver = None\n",
|
||||
"for dist in list(available_packages):\n",
|
||||
" if dist.key == 'pandas':\n",
|
||||
" pandas_ver = dist.version\n",
|
||||
@@ -247,16 +248,21 @@
|
||||
" numpy_ver = dist.version\n",
|
||||
" else:\n",
|
||||
" numpy_ver = '1.21.6'\n",
|
||||
" if dist.key == 'scikit-learn':\n",
|
||||
" sklearn_ver = dist.version\n",
|
||||
"pandas_dep = 'pandas'\n",
|
||||
"numpy_dep = 'numpy'\n",
|
||||
"sklearn_dep = 'scikit-learn'\n",
|
||||
"if pandas_ver:\n",
|
||||
" pandas_dep = 'pandas=={}'.format(pandas_ver)\n",
|
||||
"if numpy_ver:\n",
|
||||
" numpy_dep = 'numpy=={}'.format(numpy_ver)\n",
|
||||
"if sklearn_ver:\n",
|
||||
" sklearn_dep = 'scikit-learn=={}'.format(sklearn_ver)\n",
|
||||
"\n",
|
||||
"# Note: we build shap at commit 690245 for Tesla K80 GPUs\n",
|
||||
"env.docker.base_dockerfile = f\"\"\"\n",
|
||||
"FROM nvidia/cuda:10.2-devel-ubuntu20.04\n",
|
||||
"FROM nvidia/cuda:10.2-devel-ubuntu18.04\n",
|
||||
"ENV PATH=\"/root/miniconda3/bin:${{PATH}}\"\n",
|
||||
"ARG PATH=\"/root/miniconda3/bin:${{PATH}}\"\n",
|
||||
"RUN apt-get update && \\\n",
|
||||
@@ -292,7 +298,9 @@
|
||||
"pip uninstall -y xgboost && \\\n",
|
||||
"conda install py-xgboost==1.3.3 && \\\n",
|
||||
"pip uninstall -y numpy && \\\n",
|
||||
"pip install {numpy_dep} \\\n",
|
||||
"pip install {numpy_dep} && \\\n",
|
||||
"pip install {sklearn_dep} && \\\n",
|
||||
"pip install chardet \\\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"env.python.user_managed_dependencies = True\n",
|
||||
|
||||
@@ -10,7 +10,7 @@ dependencies:
|
||||
- ipython
|
||||
- matplotlib
|
||||
- ipywidgets
|
||||
- raiwidgets~=0.23.0
|
||||
- raiwidgets~=0.26.0
|
||||
- itsdangerous==2.0.1
|
||||
- markupsafe<2.1.0
|
||||
- scipy>=1.5.3
|
||||
|
||||
@@ -10,7 +10,7 @@ dependencies:
|
||||
- matplotlib
|
||||
- azureml-dataset-runtime
|
||||
- ipywidgets
|
||||
- raiwidgets~=0.23.0
|
||||
- raiwidgets~=0.26.0
|
||||
- itsdangerous==2.0.1
|
||||
- markupsafe<2.1.0
|
||||
- scipy>=1.5.3
|
||||
|
||||
@@ -9,7 +9,7 @@ dependencies:
|
||||
- ipython
|
||||
- matplotlib
|
||||
- ipywidgets
|
||||
- raiwidgets~=0.23.0
|
||||
- raiwidgets~=0.26.0
|
||||
- packaging>=20.9
|
||||
- itsdangerous==2.0.1
|
||||
- markupsafe<2.1.0
|
||||
|
||||
@@ -9,7 +9,7 @@ dependencies:
|
||||
- ipython
|
||||
- matplotlib
|
||||
- ipywidgets
|
||||
- raiwidgets~=0.23.0
|
||||
- raiwidgets~=0.26.0
|
||||
- packaging>=20.9
|
||||
- itsdangerous==2.0.1
|
||||
- markupsafe<2.1.0
|
||||
|
||||
@@ -11,7 +11,7 @@ dependencies:
|
||||
- azureml-dataset-runtime
|
||||
- azureml-core
|
||||
- ipywidgets
|
||||
- raiwidgets~=0.23.0
|
||||
- raiwidgets~=0.26.0
|
||||
- itsdangerous==2.0.1
|
||||
- markupsafe<2.1.0
|
||||
- scipy>=1.5.3
|
||||
|
||||
@@ -175,7 +175,7 @@
|
||||
"store_name=os.getenv(\"ADL_STORENAME_62\", \"<my-datastore-name>\") # ADLS account name\n",
|
||||
"tenant_id=os.getenv(\"ADL_TENANT_62\", \"<my-tenant-id>\") # tenant id of service principal\n",
|
||||
"client_id=os.getenv(\"ADL_CLIENTID_62\", \"<my-client-id>\") # client id of service principal\n",
|
||||
"client_secret=os.getenv(\"ADL_CLIENT_SECRET_62\", \"<my-client-secret>\") # the secret of service principal\n",
|
||||
"client_st=os.getenv(\"ADL_CLIENT_SECRET_62\", \"<my-client-secret>\") # the secret of service principal\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" adls_datastore = Datastore.get(ws, datastore_name)\n",
|
||||
@@ -189,7 +189,7 @@
|
||||
" store_name=store_name, # ADLS account name\n",
|
||||
" tenant_id=tenant_id, # tenant id of service principal\n",
|
||||
" client_id=client_id, # client id of service principal\n",
|
||||
" client_secret=client_secret) # the secret of service principal\n",
|
||||
" client_secret=client_st) # the secret of service principal\n",
|
||||
" print(\"Registered datastore with name: %s\" % datastore_name)\n",
|
||||
"\n",
|
||||
"adls_data_ref = DataReference(\n",
|
||||
|
||||
@@ -147,7 +147,7 @@
|
||||
"store_name = os.getenv(\"ADL_STORENAME_62\", \"<my-datastore-name>\") # ADLS account name\n",
|
||||
"tenant_id = os.getenv(\"ADL_TENANT_62\", \"<my-tenant-id>\") # tenant id of service principal\n",
|
||||
"client_id = os.getenv(\"ADL_CLIENTID_62\", \"<my-client-id>\") # client id of service principal\n",
|
||||
"client_secret = os.getenv(\"ADL_CLIENT_62_SECRET\", \"<my-client-secret>\") # the secret of service principal\n",
|
||||
"client_st = os.getenv(\"ADL_CLIENT_62_SECRET\", \"<my-client-secret>\") # the secret of service principal\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" adls_datastore = Datastore.get(ws, datastore_name)\n",
|
||||
@@ -161,7 +161,7 @@
|
||||
" store_name=store_name, # ADLS account name\n",
|
||||
" tenant_id=tenant_id, # tenant id of service principal\n",
|
||||
" client_id=client_id, # client id of service principal\n",
|
||||
" client_secret=client_secret) # the secret of service principal\n",
|
||||
" client_secret=client_st) # the secret of service principal\n",
|
||||
" print(\"registered datastore with name: %s\" % datastore_name)"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -330,7 +330,7 @@
|
||||
"- **inputs:** List of input connections for data consumed by this step. Fetch this inside the notebook using dbutils.widgets.get(\"input\")\n",
|
||||
"- **outputs:** List of output port definitions for outputs produced by this step. Fetch this inside the notebook using dbutils.widgets.get(\"output\")\n",
|
||||
"- **existing_cluster_id:** Cluster ID of an existing Interactive cluster on the Databricks workspace. If you are providing this, do not provide any of the parameters below that are used to create a new cluster such as spark_version, node_type, etc.\n",
|
||||
"- **spark_version:** Version of spark for the databricks run cluster. You can refer to [DataBricks runtime version](https://learn.microsoft.com/azure/databricks/dev-tools/api/#--runtime-version-strings) to specify the spark version. default value: 4.0.x-scala2.11\n",
|
||||
"- **spark_version:** Version of spark for the databricks run cluster. You can refer to [DataBricks runtime version](https://learn.microsoft.com/azure/databricks/dev-tools/api/#--runtime-version-strings) to specify the spark version. default value: 10.4.x-scala2.12\n",
|
||||
"- **node_type:** Azure vm node types for the databricks run cluster. default value: Standard_D3_v2\n",
|
||||
"- **num_workers:** Specifies a static number of workers for the databricks run cluster\n",
|
||||
"- **min_workers:** Specifies a min number of workers to use for auto-scaling the databricks run cluster\n",
|
||||
|
||||
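For reference, here is a hedged sketch of a `DatabricksStep` wired up with the parameters documented above; the attached compute name, notebook path, and parameter values are illustrative assumptions rather than values taken from this repository.

```python
from azureml.core import Workspace
from azureml.core.compute import DatabricksCompute
from azureml.pipeline.steps import DatabricksStep

ws = Workspace.from_config()

# Assumes a Databricks workspace has already been attached under this name.
databricks_compute = DatabricksCompute(workspace=ws, name="my-databricks-compute")

notebook_step = DatabricksStep(
    name="run_databricks_notebook",
    notebook_path="/Users/someone@example.com/my_notebook",  # assumed notebook path
    notebook_params={"myparam": "value"},
    run_name="databricks_notebook_run",
    compute_target=databricks_compute,
    inputs=[],    # DataReference/Dataset inputs, read in the notebook via dbutils.widgets.get("input")
    outputs=[],   # PipelineData outputs, read in the notebook via dbutils.widgets.get("output")
    spark_version="10.4.x-scala2.12",   # default noted above
    node_type="Standard_D3_v2",
    num_workers=2,
    allow_reuse=True,
)
```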
@@ -86,7 +86,7 @@
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"oj_sales_path = \"./oj.csv\"\n",
|
||||
"r = requests.get(\"http://www.cs.unitn.it/~taufer/Data/oj.csv\")\n",
|
||||
"r = requests.get(\"https://raw.githubusercontent.com/Azure/azureml-examples/main/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-orange-juice-sales/data/dominicks_OJ.csv\")\n",
|
||||
"open(oj_sales_path, \"wb\").write(r.content)"
|
||||
]
|
||||
},
|
||||
@@ -140,7 +140,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"partitioned_dataset = dataset.partition_by(partition_keys=['store', 'brand'], target=(datastore, \"partition_by_key_res\"), name=\"partitioned_oj_data\")\n",
|
||||
"partitioned_dataset = dataset.partition_by(partition_keys=['Store', 'Brand'], target=(datastore, \"partition_by_key_res\"), name=\"partitioned_oj_data\")\n",
|
||||
"partitioned_dataset.partition_keys"
|
||||
]
|
||||
},
|
||||
@@ -274,7 +274,7 @@
|
||||
"parallel_run_config = ParallelRunConfig(\n",
|
||||
" source_directory=scripts_folder,\n",
|
||||
" entry_script=script_file, # the user script to run against each input\n",
|
||||
" partition_keys=['store', 'brand'],\n",
|
||||
" partition_keys=['Store', 'Brand'],\n",
|
||||
" error_threshold=5,\n",
|
||||
" output_action='append_row',\n",
|
||||
" append_row_file_name=\"revenue_outputs.txt\",\n",
|
||||
@@ -362,8 +362,8 @@
|
||||
"result_file = os.path.join(target_dir, batch_output.path_on_datastore, parallel_run_config.append_row_file_name)\n",
|
||||
"\n",
|
||||
"df = pd.read_csv(result_file, delimiter=\" \", header=None)\n",
|
||||
"df.columns=[\"WeekStarting\", \"Quantity\", \"logQuantity\", \"Advert\", \"Price\", \"Age60\", \"COLLEGE\", \"INCOME\", \"Hincome150\", \"Large HH\", \"Minorities\", \"WorkingWoman\", \"SSTRDIST\", \"SSTRVOL\", \"CPDIST5\", \"CPWVOL5\", \"Store\", \"Brand\", \"total_income\"]\n",
|
||||
"\n",
|
||||
"df.columns = [\"week\", \"logmove\", \"feat\", \"price\", \"AGE60\", \"EDUC\", \"ETHNIC\", \"INCOME\", \"HHLARGE\", \"WORKWOM\", \"HVAL150\", \"SSTRDIST\", \"SSTRVOL\", \"CPDIST5\", \"CPWVOL5\", \"store\", \"brand\", \"total_income\"]\n",
|
||||
"print(\"Prediction has \", df.shape[0], \" rows\")\n",
|
||||
"df.head(10)"
|
||||
]
|
||||
@@ -413,7 +413,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.9"
|
||||
"version": "3.8.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -1,358 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Distributed Chainer\n",
|
||||
"In this tutorial, you will run a Chainer training example on the [MNIST](http://yann.lecun.com/exdb/mnist/) dataset using ChainerMN distributed training across a GPU cluster."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Prerequisites\n",
|
||||
"* If you are using an Azure Machine Learning compute instance, you are all set. Otherwise, go through the [Configuration](../../../../configuration.ipynb) notebook to install the Azure Machine Learning Python SDK and create an Azure ML `Workspace`"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Check core SDK version number\n",
|
||||
"import azureml.core\n",
|
||||
"\n",
|
||||
"print(\"SDK version:\", azureml.core.VERSION)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Diagnostics\n",
|
||||
"Opt-in diagnostics for better experience, quality, and security of future releases."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"Diagnostics"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.telemetry import set_diagnostics_collection\n",
|
||||
"\n",
|
||||
"set_diagnostics_collection(send_diagnostics=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialize workspace\n",
|
||||
"\n",
|
||||
"Initialize a [Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace) object from the existing workspace you created in the Prerequisites step. `Workspace.from_config()` creates a workspace object from the details stored in `config.json`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.workspace import Workspace\n",
|
||||
"\n",
|
||||
"ws = Workspace.from_config()\n",
|
||||
"print('Workspace name: ' + ws.name, \n",
|
||||
" 'Azure region: ' + ws.location, \n",
|
||||
" 'Subscription id: ' + ws.subscription_id, \n",
|
||||
" 'Resource group: ' + ws.resource_group, sep = '\\n')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create or attach existing AmlCompute\n",
|
||||
"You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for training your model. In this tutorial, we use Azure ML managed compute ([AmlCompute](https://docs.microsoft.com/azure/machine-learning/service/how-to-set-up-training-targets#amlcompute)) for our remote training compute resource. Specifically, the below code creates an `STANDARD_NC6` GPU cluster that autoscales from `0` to `4` nodes.\n",
|
||||
"\n",
|
||||
"> Note that if you have an AzureML Data Scientist role, you will not have permission to create compute resources. Talk to your workspace or IT admin to create the compute targets described in this section, if they do not already exist.\n",
|
||||
"\n",
|
||||
"**Creation of AmlCompute takes approximately 5 minutes.** If the AmlCompute with that name is already in your workspace, this code will skip the creation process.\n",
|
||||
"\n",
|
||||
"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
|
||||
"from azureml.core.compute_target import ComputeTargetException\n",
|
||||
"\n",
|
||||
"# choose a name for your cluster\n",
|
||||
"cluster_name = \"gpu-cluster\"\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n",
|
||||
" print('Found existing compute target.')\n",
|
||||
"except ComputeTargetException:\n",
|
||||
" print('Creating a new compute target...')\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6',\n",
|
||||
" max_nodes=4)\n",
|
||||
"\n",
|
||||
" # create the cluster\n",
|
||||
" compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n",
|
||||
"\n",
|
||||
" compute_target.wait_for_completion(show_output=True)\n",
|
||||
"\n",
|
||||
"# use get_status() to get a detailed status for the current AmlCompute. \n",
|
||||
"print(compute_target.get_status().serialize())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The above code creates GPU compute. If you instead want to create CPU compute, provide a different VM size to the `vm_size` parameter, such as `STANDARD_D2_V2`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Train model on the remote compute\n",
|
||||
"Now that we have the AmlCompute ready to go, let's run our distributed training job."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create a project directory\n",
|
||||
"Create a directory that will contain all the necessary code from your local machine that you will need access to on the remote resource. This includes the training script and any additional files your training script depends on."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"\n",
|
||||
"project_folder = './chainer-distr'\n",
|
||||
"os.makedirs(project_folder, exist_ok=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Prepare training script\n",
|
||||
"Now you will need to create your training script. In this tutorial, the script for distributed training of MNIST is already provided for you at `train_mnist.py`. In practice, you should be able to take any custom Chainer training script as is and run it with Azure ML without having to modify your code."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Once your script is ready, copy the training script `train_mnist.py` into the project directory."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import shutil\n",
|
||||
"\n",
|
||||
"shutil.copy('train_mnist.py', project_folder)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create an experiment\n",
|
||||
"Create an [Experiment](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#experiment) to track all the runs in your workspace for this distributed Chainer tutorial. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Experiment\n",
|
||||
"\n",
|
||||
"experiment_name = 'chainer-distr'\n",
|
||||
"experiment = Experiment(ws, name=experiment_name)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create an environment\n",
|
||||
"\n",
|
||||
"In this tutorial, we will use one of the Azure ML Chainer curated environments for training."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Environment\n",
|
||||
"\n",
|
||||
"chainer_env = Environment.get(ws, name='AzureML-Chainer-5.1.0-GPU')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Configure your training job\n",
|
||||
"\n",
|
||||
"Create a ScriptRunConfig object to specify the configuration details of your training job, including your training script, environment to use, and the compute target to run on.\n",
|
||||
"\n",
|
||||
"In order to execute a distributed run using MPI, you must create an `MpiConfiguration` object and pass it to the `distributed_job_config` parameter. The code below configures a 2-node distributed job. If you would also like to run multiple processes per node (i.e. if your cluster SKU has multiple GPUs), additionally specify the `process_count_per_node` parameter in MpiConfiguration; a hedged sketch of that variant follows the next cell."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import ScriptRunConfig\n",
|
||||
"from azureml.core.runconfig import MpiConfiguration\n",
|
||||
"\n",
|
||||
"src = ScriptRunConfig(source_directory=project_folder,\n",
|
||||
" script='train_mnist.py',\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" environment=chainer_env,\n",
|
||||
" distributed_job_config=MpiConfiguration(node_count=2))"
|
||||
]
|
||||
},
|
||||
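If your cluster SKU has multiple GPUs per node, a hedged variant of the configuration above also sets `process_count_per_node`; the value of 4 processes per node is an assumption for illustration, not something the original notebook used.

```python
from azureml.core import ScriptRunConfig
from azureml.core.runconfig import MpiConfiguration

# One worker process per GPU on each of the 2 nodes (4 GPUs per node assumed).
distr_config = MpiConfiguration(process_count_per_node=4, node_count=2)

src_multi = ScriptRunConfig(source_directory=project_folder,
                            script='train_mnist.py',
                            compute_target=compute_target,
                            environment=chainer_env,
                            distributed_job_config=distr_config)
```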
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Submit job\n",
|
||||
"Run your experiment by submitting your ScriptRunConfig object. Note that this call is asynchronous."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"run = experiment.submit(src)\n",
|
||||
"print(run)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Monitor your run\n",
|
||||
"You can monitor the progress of the run with a Jupyter widget. Like the run submission, the widget is asynchronous and provides live updates every 10-15 seconds until the job completes. You can see that the widget automatically plots and visualizes the loss metric that we logged to the Azure ML run."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.widgets import RunDetails\n",
|
||||
"\n",
|
||||
"RunDetails(run).show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"run.wait_for_completion(show_output=True)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"authors": [
|
||||
{
|
||||
"name": "ninhu"
|
||||
}
|
||||
],
|
||||
"category": "training",
|
||||
"compute": [
|
||||
"AML Compute"
|
||||
],
|
||||
"datasets": [
|
||||
"MNIST"
|
||||
],
|
||||
"deployment": [
|
||||
"None"
|
||||
],
|
||||
"exclude_from_index": false,
|
||||
"framework": [
|
||||
"Chainer"
|
||||
],
|
||||
"friendly_name": "Distributed Training with Chainer",
|
||||
"index_order": 1,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.7"
|
||||
},
|
||||
"tags": [
|
||||
"None"
|
||||
],
|
||||
"task": "Use the Chainer estimator to perform distributed training"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -1,5 +0,0 @@
|
||||
name: distributed-chainer
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-widgets
|
||||
@@ -1,125 +0,0 @@
|
||||
# Official ChainerMN example taken from
|
||||
# https://github.com/chainer/chainer/blob/master/examples/chainermn/mnist/train_mnist.py
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import argparse
|
||||
|
||||
import chainer
|
||||
import chainer.functions as F
|
||||
import chainer.links as L
|
||||
from chainer import training
|
||||
from chainer.training import extensions
|
||||
|
||||
import chainermn
|
||||
|
||||
|
||||
class MLP(chainer.Chain):
|
||||
|
||||
def __init__(self, n_units, n_out):
|
||||
super(MLP, self).__init__(
|
||||
# the size of the inputs to each layer will be inferred
|
||||
l1=L.Linear(784, n_units), # n_in -> n_units
|
||||
l2=L.Linear(n_units, n_units), # n_units -> n_units
|
||||
l3=L.Linear(n_units, n_out), # n_units -> n_out
|
||||
)
|
||||
|
||||
def __call__(self, x):
|
||||
h1 = F.relu(self.l1(x))
|
||||
h2 = F.relu(self.l2(h1))
|
||||
return self.l3(h2)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description='ChainerMN example: MNIST')
|
||||
parser.add_argument('--batchsize', '-b', type=int, default=100,
|
||||
help='Number of images in each mini-batch')
|
||||
parser.add_argument('--communicator', type=str,
|
||||
default='non_cuda_aware', help='Type of communicator')
|
||||
parser.add_argument('--epoch', '-e', type=int, default=20,
|
||||
help='Number of sweeps over the dataset to train')
|
||||
parser.add_argument('--gpu', '-g', default=True,
|
||||
help='Use GPU')
|
||||
parser.add_argument('--out', '-o', default='result',
|
||||
help='Directory to output the result')
|
||||
parser.add_argument('--resume', '-r', default='',
|
||||
help='Resume the training from snapshot')
|
||||
parser.add_argument('--unit', '-u', type=int, default=1000,
|
||||
help='Number of units')
|
||||
args = parser.parse_args()
|
||||
|
||||
# Prepare ChainerMN communicator.
|
||||
|
||||
if args.gpu:
|
||||
if args.communicator == 'naive':
|
||||
print("Error: 'naive' communicator does not support GPU.\n")
|
||||
exit(-1)
|
||||
comm = chainermn.create_communicator(args.communicator)
|
||||
device = comm.intra_rank
|
||||
else:
|
||||
if args.communicator != 'naive':
|
||||
print('Warning: using naive communicator '
|
||||
'because only naive supports CPU-only execution')
|
||||
comm = chainermn.create_communicator('naive')
|
||||
device = -1
|
||||
|
||||
if comm.rank == 0:
|
||||
print('==========================================')
|
||||
print('Num process (COMM_WORLD): {}'.format(comm.size))
|
||||
if args.gpu:
|
||||
print('Using GPUs')
|
||||
print('Using {} communicator'.format(args.communicator))
|
||||
print('Num unit: {}'.format(args.unit))
|
||||
print('Num Minibatch-size: {}'.format(args.batchsize))
|
||||
print('Num epoch: {}'.format(args.epoch))
|
||||
print('==========================================')
|
||||
|
||||
model = L.Classifier(MLP(args.unit, 10))
|
||||
if device >= 0:
|
||||
chainer.cuda.get_device_from_id(device).use()
|
||||
model.to_gpu()
|
||||
|
||||
# Create a multi node optimizer from a standard Chainer optimizer.
|
||||
optimizer = chainermn.create_multi_node_optimizer(
|
||||
chainer.optimizers.Adam(), comm)
|
||||
optimizer.setup(model)
|
||||
|
||||
# Split and distribute the dataset. Only worker 0 loads the whole dataset.
|
||||
# Datasets of worker 0 are evenly split and distributed to all workers.
|
||||
if comm.rank == 0:
|
||||
train, test = chainer.datasets.get_mnist()
|
||||
else:
|
||||
train, test = None, None
|
||||
train = chainermn.scatter_dataset(train, comm, shuffle=True)
|
||||
test = chainermn.scatter_dataset(test, comm, shuffle=True)
|
||||
|
||||
train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
|
||||
test_iter = chainer.iterators.SerialIterator(test, args.batchsize,
|
||||
repeat=False, shuffle=False)
|
||||
|
||||
updater = training.StandardUpdater(train_iter, optimizer, device=device)
|
||||
trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
|
||||
|
||||
# Create a multi node evaluator from a standard Chainer evaluator.
|
||||
evaluator = extensions.Evaluator(test_iter, model, device=device)
|
||||
evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
|
||||
trainer.extend(evaluator)
|
||||
|
||||
# Some display and output extensions are necessary only for one worker.
|
||||
# (Otherwise, there would just be repeated outputs.)
|
||||
if comm.rank == 0:
|
||||
trainer.extend(extensions.dump_graph('main/loss'))
|
||||
trainer.extend(extensions.LogReport())
|
||||
trainer.extend(extensions.PrintReport(
|
||||
['epoch', 'main/loss', 'validation/main/loss',
|
||||
'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))
|
||||
trainer.extend(extensions.ProgressBar())
|
||||
|
||||
if args.resume:
|
||||
chainer.serializers.load_npz(args.resume, trainer)
|
||||
|
||||
trainer.run()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -1,138 +0,0 @@
|
||||
|
||||
import argparse
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
|
||||
from datautils import download_mnist
|
||||
|
||||
import chainer
|
||||
from chainer import backend
|
||||
from chainer import backends
|
||||
from chainer.backends import cuda
|
||||
from chainer import Function, gradient_check, report, training, utils, Variable
|
||||
from chainer import datasets, iterators, optimizers, serializers
|
||||
from chainer import Link, Chain, ChainList
|
||||
import chainer.functions as F
|
||||
import chainer.links as L
|
||||
from chainer.training import extensions
|
||||
from chainer.dataset import concat_examples
|
||||
from chainer.backends.cuda import to_cpu
|
||||
|
||||
|
||||
from azureml.core.run import Run
|
||||
run = Run.get_context()
|
||||
|
||||
|
||||
class MyNetwork(Chain):
|
||||
|
||||
def __init__(self, n_mid_units=100, n_out=10):
|
||||
super(MyNetwork, self).__init__()
|
||||
with self.init_scope():
|
||||
self.l1 = L.Linear(None, n_mid_units)
|
||||
self.l2 = L.Linear(n_mid_units, n_mid_units)
|
||||
self.l3 = L.Linear(n_mid_units, n_out)
|
||||
|
||||
def forward(self, x):
|
||||
h = F.relu(self.l1(x))
|
||||
h = F.relu(self.l2(h))
|
||||
return self.l3(h)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description='Chainer example: MNIST')
|
||||
parser.add_argument('--batchsize', '-b', type=int, default=100,
|
||||
help='Number of images in each mini-batch')
|
||||
parser.add_argument('--epochs', '-e', type=int, default=20,
|
||||
help='Number of sweeps over the dataset to train')
|
||||
parser.add_argument('--output_dir', '-o', default='./outputs',
|
||||
help='Directory to output the result')
|
||||
parser.add_argument('--gpu_id', '-g', default=0,
|
||||
help='ID of the GPU to be used. Set to -1 if you use CPU')
|
||||
args = parser.parse_args()
|
||||
|
||||
# Download the MNIST data if you haven't downloaded it yet
|
||||
train, test = download_mnist()
|
||||
|
||||
gpu_id = args.gpu_id
|
||||
batchsize = args.batchsize
|
||||
epochs = args.epochs
|
||||
run.log('Batch size', np.int(batchsize))
|
||||
run.log('Epochs', np.int(epochs))
|
||||
|
||||
train_iter = iterators.SerialIterator(train, batchsize)
|
||||
|
||||
model = MyNetwork()
|
||||
|
||||
if gpu_id >= 0:
|
||||
# Make a specified GPU current
|
||||
        chainer.backends.cuda.get_device_from_id(gpu_id).use()
|
||||
model.to_gpu() # Copy the model to the GPU
|
||||
|
||||
# Choose an optimizer algorithm
|
||||
optimizer = optimizers.MomentumSGD(lr=0.01, momentum=0.9)
|
||||
|
||||
# Give the optimizer a reference to the model so that it
|
||||
# can locate the model's parameters.
|
||||
optimizer.setup(model)
|
||||
|
||||
while train_iter.epoch < epochs:
|
||||
# ---------- One iteration of the training loop ----------
|
||||
train_batch = train_iter.next()
|
||||
image_train, target_train = concat_examples(train_batch, gpu_id)
|
||||
|
||||
# Calculate the prediction of the network
|
||||
prediction_train = model(image_train)
|
||||
|
||||
# Calculate the loss with softmax_cross_entropy
|
||||
loss = F.softmax_cross_entropy(prediction_train, target_train)
|
||||
|
||||
# Calculate the gradients in the network
|
||||
model.cleargrads()
|
||||
loss.backward()
|
||||
|
||||
# Update all the trainable parameters
|
||||
optimizer.update()
|
||||
# --------------------- until here ---------------------
|
||||
|
||||
# Check the validation accuracy of prediction after every epoch
|
||||
if train_iter.is_new_epoch: # If this iteration is the final iteration of the current epoch
|
||||
|
||||
# Display the training loss
|
||||
print('epoch:{:02d} train_loss:{:.04f} '.format(
|
||||
train_iter.epoch, float(to_cpu(loss.array))), end='')
|
||||
|
||||
test_losses = []
|
||||
test_accuracies = []
|
||||
test_iter = iterators.SerialIterator(test, batchsize,
|
||||
repeat=False, shuffle=False)
|
||||
while True:
|
||||
test_batch = test_iter.next()
|
||||
image_test, target_test = concat_examples(test_batch, gpu_id)
|
||||
|
||||
# Forward the test data
|
||||
prediction_test = model(image_test)
|
||||
|
||||
# Calculate the loss
|
||||
loss_test = F.softmax_cross_entropy(prediction_test, target_test)
|
||||
test_losses.append(to_cpu(loss_test.array))
|
||||
|
||||
# Calculate the accuracy
|
||||
accuracy = F.accuracy(prediction_test, target_test)
|
||||
accuracy.to_cpu()
|
||||
test_accuracies.append(accuracy.array)
|
||||
|
||||
if test_iter.is_new_epoch:
|
||||
break
|
||||
|
||||
val_accuracy = np.mean(test_accuracies)
|
||||
print('val_loss:{:.04f} val_accuracy:{:.04f}'.format(
|
||||
np.mean(test_losses), val_accuracy))
|
||||
|
||||
run.log("Accuracy", np.float(val_accuracy))
|
||||
|
||||
serializers.save_npz(os.path.join(args.output_dir, 'model.npz'), model)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -1,50 +0,0 @@
|
||||
import numpy as np
|
||||
import os
|
||||
import json
|
||||
|
||||
from datautils import download_mnist
|
||||
|
||||
from chainer import serializers, using_config, Variable, datasets
|
||||
import chainer.functions as F
|
||||
import chainer.links as L
|
||||
from chainer import Chain
|
||||
|
||||
from azureml.core.model import Model
|
||||
|
||||
|
||||
class MyNetwork(Chain):
|
||||
|
||||
def __init__(self, n_mid_units=100, n_out=10):
|
||||
super(MyNetwork, self).__init__()
|
||||
with self.init_scope():
|
||||
self.l1 = L.Linear(None, n_mid_units)
|
||||
self.l2 = L.Linear(n_mid_units, n_mid_units)
|
||||
self.l3 = L.Linear(n_mid_units, n_out)
|
||||
|
||||
def forward(self, x):
|
||||
h = F.relu(self.l1(x))
|
||||
h = F.relu(self.l2(h))
|
||||
return self.l3(h)
|
||||
|
||||
|
||||
def init():
|
||||
global model
|
||||
|
||||
# AZUREML_MODEL_DIR is an environment variable created during deployment.
|
||||
# It is the path to the model folder (./azureml-models/$MODEL_NAME/$VERSION)
|
||||
# For multiple models, it points to the folder containing all deployed models (./azureml-models)
|
||||
model_root = os.path.join(os.getenv('AZUREML_MODEL_DIR'), 'model.npz')
|
||||
|
||||
# Load our saved artifacts
|
||||
model = MyNetwork()
|
||||
serializers.load_npz(model_root, model)
|
||||
|
||||
|
||||
def run(input_data):
|
||||
i = np.array(json.loads(input_data)['data'])
|
||||
|
||||
_, test = download_mnist()
|
||||
x = Variable(np.asarray([test[i][0]]))
|
||||
y = model(x)
|
||||
|
||||
return np.ndarray.tolist(y.data.argmax(axis=1))
|
||||
@@ -1,50 +0,0 @@
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import glob
|
||||
import gzip
|
||||
import numpy as np
|
||||
import os
|
||||
import struct
|
||||
|
||||
from azureml.core import Dataset
|
||||
from azureml.opendatasets import MNIST
|
||||
from chainer.datasets import tuple_dataset
|
||||
|
||||
|
||||
# load compressed MNIST gz files and return numpy arrays
|
||||
def load_data(filename, label=False):
|
||||
with gzip.open(filename) as gz:
|
||||
struct.unpack('I', gz.read(4))
|
||||
n_items = struct.unpack('>I', gz.read(4))
|
||||
if not label:
|
||||
n_rows = struct.unpack('>I', gz.read(4))[0]
|
||||
n_cols = struct.unpack('>I', gz.read(4))[0]
|
||||
res = np.frombuffer(gz.read(n_items[0] * n_rows * n_cols), dtype=np.uint8)
|
||||
res = res.reshape(n_items[0], n_rows * n_cols)
|
||||
else:
|
||||
res = np.frombuffer(gz.read(n_items[0]), dtype=np.uint8)
|
||||
res = res.reshape(n_items[0], 1)
|
||||
return res
|
||||
|
||||
|
||||
def download_mnist():
|
||||
data_folder = os.path.join(os.getcwd(), 'data/mnist')
|
||||
os.makedirs(data_folder, exist_ok=True)
|
||||
|
||||
mnist_file_dataset = MNIST.get_file_dataset()
|
||||
mnist_file_dataset.download(data_folder, overwrite=True)
|
||||
|
||||
X_train = load_data(glob.glob(os.path.join(data_folder, "**/train-images-idx3-ubyte.gz"),
|
||||
recursive=True)[0], False) / 255.0
|
||||
X_test = load_data(glob.glob(os.path.join(data_folder, "**/t10k-images-idx3-ubyte.gz"),
|
||||
recursive=True)[0], False) / 255.0
|
||||
y_train = load_data(glob.glob(os.path.join(data_folder, "**/train-labels-idx1-ubyte.gz"),
|
||||
recursive=True)[0], True).reshape(-1)
|
||||
y_test = load_data(glob.glob(os.path.join(data_folder, "**/t10k-labels-idx1-ubyte.gz"),
|
||||
recursive=True)[0], True).reshape(-1)
|
||||
|
||||
train = tuple_dataset.TupleDataset(X_train.astype(np.float32), y_train.astype(np.int32))
|
||||
test = tuple_dataset.TupleDataset(X_test.astype(np.float32), y_test.astype(np.int32))
|
||||
|
||||
return train, test
|
||||
@@ -1,808 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved. \n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Train and hyperparameter tune with Chainer\n",
|
||||
"\n",
|
||||
"In this tutorial, we demonstrate how to use the Azure ML Python SDK to train a Convolutional Neural Network (CNN) on a single-node GPU with Chainer to perform handwritten digit recognition on the popular MNIST dataset. We will also demonstrate how to perform hyperparameter tuning of the model using Azure ML's HyperDrive service."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Prerequisites\n",
|
||||
"* If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, go through the [Configuration](../../../../configuration.ipynb) notebook to install the Azure Machine Learning Python SDK and create an Azure ML `Workspace`"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Check core SDK version number\n",
|
||||
"import azureml.core\n",
|
||||
"\n",
|
||||
"print(\"SDK version:\", azureml.core.VERSION)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Diagnostics\n",
|
||||
"Opt-in diagnostics for better experience, quality, and security of future releases."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"Diagnostics"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.telemetry import set_diagnostics_collection\n",
|
||||
"\n",
|
||||
"set_diagnostics_collection(send_diagnostics=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialize workspace\n",
|
||||
"Initialize a [Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace) object from the existing workspace you created in the Prerequisites step. `Workspace.from_config()` creates a workspace object from the details stored in `config.json`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.workspace import Workspace\n",
|
||||
"\n",
|
||||
"ws = Workspace.from_config()\n",
|
||||
"print('Workspace name: ' + ws.name, \n",
|
||||
" 'Azure region: ' + ws.location, \n",
|
||||
" 'Subscription id: ' + ws.subscription_id, \n",
|
||||
" 'Resource group: ' + ws.resource_group, sep = '\\n')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create or Attach existing AmlCompute\n",
|
||||
"You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for training your model. In this tutorial, we use Azure ML managed compute ([AmlCompute](https://docs.microsoft.com/azure/machine-learning/service/how-to-set-up-training-targets#amlcompute)) for our remote training compute resource.\n",
|
||||
"\n",
|
||||
"> Note that if you have an AzureML Data Scientist role, you will not have permission to create compute resources. Talk to your workspace or IT admin to create the compute targets described in this section, if they do not already exist.\n",
|
||||
"\n",
|
||||
"**Creation of AmlCompute takes approximately 5 minutes.** If the AmlCompute with that name is already in your workspace, this code will skip the creation process.\n",
|
||||
"\n",
|
||||
"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
|
||||
"from azureml.core.compute_target import ComputeTargetException\n",
|
||||
"\n",
|
||||
"# choose a name for your cluster\n",
|
||||
"cluster_name = \"gpu-cluster\"\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n",
|
||||
" print('Found existing compute target.')\n",
|
||||
"except ComputeTargetException:\n",
|
||||
" print('Creating a new compute target...')\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6',\n",
|
||||
" max_nodes=4)\n",
|
||||
"\n",
|
||||
" # create the cluster\n",
|
||||
" compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n",
|
||||
"\n",
|
||||
"compute_target.wait_for_completion(show_output=True)\n",
|
||||
"\n",
|
||||
"# use get_status() to get a detailed status for the current cluster. \n",
|
||||
"print(compute_target.get_status().serialize())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The above code creates a GPU cluster. If you instead want to create a CPU cluster, provide a different VM size to the `vm_size` parameter, such as `STANDARD_D2_V2`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Train model on the remote compute\n",
|
||||
"Now that you have your data and training script prepared, you are ready to train on your remote compute cluster. You can take advantage of Azure compute to leverage GPUs to cut down your training time. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create a project directory\n",
|
||||
"Create a directory that will contain all the necessary code from your local machine that you will need access to on the remote resource. This includes the training script and any additional files your training script depends on."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"\n",
|
||||
"project_folder = './chainer-mnist'\n",
|
||||
"os.makedirs(project_folder, exist_ok=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Prepare training script\n",
|
||||
"Now you will need to create your training script. In this tutorial, the training script is already provided for you at `chainer_mnist.py`. In practice, you should be able to take any custom training script as is and run it with Azure ML without having to modify your code.\n",
|
||||
"\n",
|
||||
"However, if you would like to use Azure ML's [tracking and metrics](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#metrics) capabilities, you will have to add a small amount of Azure ML code inside your training script. \n",
|
||||
"\n",
|
||||
"In `chainer_mnist.py`, we will log some metrics to our Azure ML run. To do so, we will access the Azure ML `Run` object within the script:\n",
|
||||
"```Python\n",
|
||||
"from azureml.core.run import Run\n",
|
||||
"run = Run.get_context()\n",
|
||||
"```\n",
|
||||
"Further within `chainer_mnist.py`, we log the batchsize and epochs parameters, and the highest accuracy the model achieves:\n",
|
||||
"```Python\n",
|
||||
"run.log('Batch size', np.int(args.batchsize))\n",
|
||||
"run.log('Epochs', np.int(args.epochs))\n",
|
||||
"\n",
|
||||
"run.log('Accuracy', np.float(val_accuracy))\n",
|
||||
"```\n",
|
||||
"These run metrics will become particularly important when we begin hyperparameter tuning our model in the \"Tune model hyperparameters\" section."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Once your script is ready, copy the training script `chainer_mnist.py` into your project directory."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import shutil\n",
|
||||
"\n",
|
||||
"shutil.copy('chainer_mnist.py', project_folder)\n",
|
||||
"shutil.copy('chainer_score.py', project_folder)\n",
|
||||
"shutil.copy('datautils.py', project_folder)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create an experiment\n",
|
||||
"Create an [Experiment](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#experiment) to track all the runs in your workspace for this Chainer tutorial. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Experiment\n",
|
||||
"\n",
|
||||
"experiment_name = 'chainer-mnist'\n",
|
||||
"experiment = Experiment(ws, name=experiment_name)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create an environment\n",
|
||||
"\n",
|
||||
"Define a conda environment YAML file with your training script dependencies and create an Azure ML environment."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%writefile conda_dependencies.yml\n",
|
||||
"\n",
|
||||
"channels:\n",
|
||||
"- conda-forge\n",
|
||||
"dependencies:\n",
|
||||
"- python=3.8.12\n",
|
||||
"- pip=22.3.1\n",
|
||||
"- pip:\n",
|
||||
" - azureml-defaults\n",
|
||||
" - azureml-opendatasets\n",
|
||||
" - chainer\n",
|
||||
" - cupy-cuda111\n",
|
||||
" - pytest"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Environment\n",
|
||||
"from azureml.core.runconfig import DockerConfiguration\n",
|
||||
"\n",
|
||||
"chainer_env = Environment.from_conda_specification(name = 'chainer-gpu', file_path = './conda_dependencies.yml')\n",
|
||||
"\n",
|
||||
"# Specify a GPU base image\n",
|
||||
"chainer_env.docker.base_image = 'mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.1-cudnn8-ubuntu20.04'\n",
|
||||
"\n",
|
||||
"docker_config = DockerConfiguration(use_docker=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Configure your training job\n",
|
||||
"\n",
|
||||
"Create a ScriptRunConfig object to specify the configuration details of your training job, including your training script, environment to use, and the compute target to run on."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import ScriptRunConfig\n",
|
||||
"\n",
|
||||
"src = ScriptRunConfig(source_directory=project_folder,\n",
|
||||
" script='chainer_mnist.py',\n",
|
||||
" arguments=['--epochs', 10, '--batchsize', 128, '--output_dir', './outputs'],\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" environment=chainer_env,\n",
|
||||
" docker_runtime_config=docker_config)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Submit job\n",
|
||||
"Run your experiment by submitting your ScriptRunConfig object. Note that this call is asynchronous."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"run = experiment.submit(src)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Monitor your run\n",
|
||||
"You can monitor the progress of the run with a Jupyter widget. Like the run submission, the widget is asynchronous and provides live updates every 10-15 seconds until the job completes."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.widgets import RunDetails\n",
|
||||
"\n",
|
||||
"RunDetails(run).show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# to get more details of your run\n",
|
||||
"print(run.get_details())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Tune model hyperparameters\n",
|
||||
"Now that we've seen how to do a simple Chainer training run using the SDK, let's see if we can further improve the accuracy of our model. We can optimize our model's hyperparameters using Azure Machine Learning's hyperparameter tuning capabilities."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Start a hyperparameter sweep\n",
|
||||
"First, we will define the hyperparameter space to sweep over. Let's tune the batch size and epochs parameters. In this example we will use random sampling to try different configuration sets of hyperparameters to maximize our primary metric, accuracy.\n",
|
||||
"\n",
|
||||
"Then, we specify the early termination policy used to terminate poorly performing runs early. Here we use the `BanditPolicy`, which terminates any run whose primary metric does not fall within the slack factor of the best run so far; for example, with `slack_factor=0.1` and a best accuracy of 0.90, runs reporting less than 0.90 / 1.1 (about 0.82) are terminated. In this tutorial, we will apply this policy every epoch (since we report our `Accuracy` metric every epoch and `evaluation_interval=1`). Notice we will delay the first policy evaluation until after the first `3` epochs (`delay_evaluation=3`).\n",
|
||||
"Refer [here](https://docs.microsoft.com/azure/machine-learning/service/how-to-tune-hyperparameters#specify-an-early-termination-policy) for more information on the BanditPolicy and other policies available."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.train.hyperdrive.runconfig import HyperDriveConfig\n",
|
||||
"from azureml.train.hyperdrive.sampling import RandomParameterSampling\n",
|
||||
"from azureml.train.hyperdrive.policy import BanditPolicy\n",
|
||||
"from azureml.train.hyperdrive.run import PrimaryMetricGoal\n",
|
||||
"from azureml.train.hyperdrive.parameter_expressions import choice\n",
|
||||
" \n",
|
||||
"\n",
|
||||
"param_sampling = RandomParameterSampling( {\n",
|
||||
" \"--batchsize\": choice(128, 256),\n",
|
||||
" \"--epochs\": choice(5, 10, 20, 40)\n",
|
||||
" }\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"hyperdrive_config = HyperDriveConfig(run_config=src,\n",
|
||||
" hyperparameter_sampling=param_sampling, \n",
|
||||
" primary_metric_name='Accuracy',\n",
|
||||
" policy=BanditPolicy(evaluation_interval=1, slack_factor=0.1, delay_evaluation=3),\n",
|
||||
" primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,\n",
|
||||
" max_total_runs=8,\n",
|
||||
" max_concurrent_runs=4)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Finally, launch the hyperparameter tuning job."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# start the HyperDrive run\n",
|
||||
"hyperdrive_run = experiment.submit(hyperdrive_config)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Monitor HyperDrive runs\n",
|
||||
"You can monitor the progress of the runs with the following Jupyter widget. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"RunDetails(hyperdrive_run).show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"hyperdrive_run.wait_for_completion(show_output=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"assert(hyperdrive_run.get_status() == \"Completed\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Warm start a Hyperparameter Tuning experiment and resuming child runs\n",
|
||||
"Finding the best hyperparameter values for your model is often an iterative process that needs multiple tuning runs, each learning from previous hyperparameter tuning runs. Reusing knowledge from these previous runs accelerates the hyperparameter tuning process, reducing the cost of tuning the model and potentially improving the primary metric of the resulting model. When warm starting a hyperparameter tuning experiment with Bayesian sampling, trials from the previous run are used as prior knowledge to intelligently pick new samples and improve the primary metric. Additionally, when using Random or Grid sampling, any early termination decisions leverage metrics from the previous runs to identify poorly performing training runs. A hedged configuration sketch of warm starting appears after this cell.\n",
|
||||
"\n",
|
||||
"Azure Machine Learning allows you to warm start your hyperparameter tuning run by leveraging knowledge from up to 5 previously completed hyperparameter tuning parent runs. \n",
|
||||
"\n",
|
||||
"Additionally, there might be occasions when individual training runs of a hyperparameter tuning experiment are cancelled due to budget constraints or fail due to other reasons. It is now possible to resume such individual training runs from the last checkpoint (assuming your training script handles checkpoints). Resuming an individual training run will use the same hyperparameter configuration and mount the storage used for that run. The training script should accept the \"--resume-from\" argument, which contains the checkpoint or model files from which to resume the training run. You can also resume individual runs as part of an experiment that spends additional budget on hyperparameter tuning. Any additional budget, after resuming the specified training runs is used for exploring additional configurations.\n",
|
||||
"\n",
|
||||
"For more information on warm starting and resuming hyperparameter tuning runs, please refer to the [Hyperparameter Tuning for Azure Machine Learning documentation](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-tune-hyperparameters) \n",
|
||||
"\n",
|
||||
"### Find and register best model\n",
|
||||
"When all jobs finish, we can find out the one that has the highest accuracy."
|
||||
]
|
||||
},
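{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a rough, illustrative sketch (not executed in this tutorial), a warm-started configuration could be built from the completed parent run above. This assumes the `HyperDriveRun` class and the `resume_from` parameter of `HyperDriveConfig` available in recent v1 SDK releases; verify both against the SDK version you are using.\n",
"\n",
"```python\n",
"# Hypothetical sketch: warm start a new tuning run from the completed parent run above.\n",
"from azureml.train.hyperdrive.runconfig import HyperDriveConfig\n",
"from azureml.train.hyperdrive.run import HyperDriveRun, PrimaryMetricGoal\n",
"\n",
"# Reuse the completed run from this notebook as prior knowledge.\n",
"previous_parent_run = HyperDriveRun(experiment, run_id=hyperdrive_run.id)\n",
"\n",
"warm_start_config = HyperDriveConfig(run_config=src,\n",
"                                     hyperparameter_sampling=param_sampling,\n",
"                                     primary_metric_name='Accuracy',\n",
"                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,\n",
"                                     max_total_runs=8,\n",
"                                     max_concurrent_runs=4,\n",
"                                     resume_from=[previous_parent_run])\n",
"\n",
"# warm_start_run = experiment.submit(warm_start_config)\n",
"```"
]
},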
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"best_run = hyperdrive_run.get_best_run_by_primary_metric()\n",
|
||||
"print(best_run.get_details()['runDefinition']['arguments'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now, let's list the model files uploaded during the run."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(best_run.get_file_names())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We can then register the folder (and all files in it) as a model named `chainer-dnn-mnist` under the workspace for deployment"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = best_run.register_model(model_name='chainer-dnn-mnist', model_path='outputs/model.npz')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Deploy the model in ACI\n",
|
||||
"Now, we are ready to deploy the model as a web service running in Azure Container Instance, [ACI](https://azure.microsoft.com/en-us/services/container-instances/). Azure Machine Learning accomplishes this by constructing a Docker image with the scoring logic and model baked in.\n",
|
||||
"\n",
|
||||
"### Create scoring script\n",
|
||||
"First, we will create a scoring script that will be invoked by the web service call.\n",
|
||||
"+ Now that the scoring script must have two required functions, `init()` and `run(input_data)`.\n",
|
||||
" + In `init()`, you typically load the model into a global object. This function is executed only once when the Docker contianer is started.\n",
|
||||
" + In `run(input_data)`, the model is used to predict a value based on the input data. The input and output to `run` uses NPZ as the serialization and de-serialization format because it is the preferred format for Chainer, but you are not limited to it.\n",
|
||||
" \n",
|
||||
"Refer to the scoring script `chainer_score.py` for this tutorial. Our web service will use this file to predict. When writing your own scoring script, don't forget to test it locally first before you go and deploy the web service."
|
||||
]
|
||||
},
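{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, the sketch below illustrates the `init()`/`run()` contract only; it is not the actual `chainer_score.py` used by this tutorial, and the model-loading and prediction lines are placeholders you would replace with real Chainer code.\n",
"\n",
"```python\n",
"# Illustrative sketch of the two required scoring functions.\n",
"import json\n",
"import os\n",
"import numpy as np\n",
"\n",
"model = None\n",
"\n",
"def init():\n",
"    # Runs once when the container starts: load the model into a global object.\n",
"    global model\n",
"    model_path = os.path.join(os.getenv('AZUREML_MODEL_DIR'), 'model.npz')\n",
"    model = model_path  # placeholder: load the Chainer model from model_path here\n",
"\n",
"def run(input_data):\n",
"    # Runs for every request: deserialize the input, score it, and return a result.\n",
"    data = np.array(json.loads(input_data)['data'])\n",
"    # prediction = model(...)  # placeholder: replace with a real forward pass\n",
"    return json.dumps({'received_shape': list(data.shape)})\n",
"```"
]
},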
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"shutil.copy('chainer_score.py', project_folder)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create myenv.yml\n",
|
||||
"We also need to create an environment file so that Azure Machine Learning can install the necessary packages in the Docker image which are required by your scoring script. In this case, we need to specify conda package `numpy` and pip install `chainer`. Please note that you must indicate azureml-defaults with verion >= 1.0.45 as a pip dependency, because it contains the functionality needed to host the model as a web service."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.runconfig import CondaDependencies\n",
|
||||
"\n",
|
||||
"cd = CondaDependencies.create()\n",
|
||||
"cd.add_conda_package('numpy')\n",
|
||||
"cd.add_pip_package('chainer')\n",
|
||||
"cd.add_pip_package(\"azureml-defaults\")\n",
|
||||
"cd.add_pip_package(\"azureml-opendatasets\")\n",
|
||||
"cd.save_to_file(base_directory='./', conda_file_path='myenv.yml')\n",
|
||||
"\n",
|
||||
"print(cd.serialize_to_string())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Deploy to ACI\n",
|
||||
"We are almost ready to deploy. Create the inference configuration and deployment configuration and deploy to ACI. This cell will run for about 7-8 minutes."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.webservice import AciWebservice\n",
|
||||
"from azureml.core.model import InferenceConfig\n",
|
||||
"from azureml.core.webservice import Webservice\n",
|
||||
"from azureml.core.model import Model\n",
|
||||
"from azureml.core.environment import Environment\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"myenv = Environment.from_conda_specification(name=\"myenv\", file_path=\"myenv.yml\")\n",
|
||||
"inference_config = InferenceConfig(entry_script=\"chainer_score.py\", environment=myenv,\n",
|
||||
" source_directory=project_folder)\n",
|
||||
"\n",
|
||||
"aciconfig = AciWebservice.deploy_configuration(cpu_cores=1,\n",
|
||||
" auth_enabled=True, # this flag generates API keys to secure access\n",
|
||||
" memory_gb=2,\n",
|
||||
" tags={'name': 'mnist', 'framework': 'Chainer'},\n",
|
||||
" description='Chainer DNN with MNIST')\n",
|
||||
"\n",
|
||||
"service = Model.deploy(workspace=ws,\n",
|
||||
" name='chainer-mnist-1',\n",
|
||||
" models=[model],\n",
|
||||
" inference_config=inference_config,\n",
|
||||
" deployment_config=aciconfig)\n",
|
||||
"service.wait_for_deployment(True)\n",
|
||||
"print(service.state)\n",
|
||||
"print(service.scoring_uri)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Tip: If something goes wrong with the deployment, the first thing to look at is the logs from the service by running the following command:** "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(service.get_logs())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This is the scoring web service endpoint:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(service.scoring_uri)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Test the deployed model\n",
|
||||
"Let's test the deployed model. Pick a random sample from the test set, and send it to the web service hosted in ACI for a prediction. Note, here we are using the an HTTP request to invoke the service.\n",
|
||||
"\n",
|
||||
"We can retrieve the API keys used for accessing the HTTP endpoint and construct a raw HTTP request to send to the service. Don't forget to add key to the HTTP header."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# retreive the API keys. two keys were generated.\n",
|
||||
"key1, Key2 = service.get_keys()\n",
|
||||
"print(key1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%matplotlib inline\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"import urllib\n",
|
||||
"import gzip\n",
|
||||
"import numpy as np\n",
|
||||
"import struct\n",
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# load compressed MNIST gz files and return numpy arrays\n",
|
||||
"def load_data(filename, label=False):\n",
|
||||
" with gzip.open(filename) as gz:\n",
|
||||
" struct.unpack('I', gz.read(4))\n",
|
||||
" n_items = struct.unpack('>I', gz.read(4))\n",
|
||||
" if not label:\n",
|
||||
" n_rows = struct.unpack('>I', gz.read(4))[0]\n",
|
||||
" n_cols = struct.unpack('>I', gz.read(4))[0]\n",
|
||||
" res = np.frombuffer(gz.read(n_items[0] * n_rows * n_cols), dtype=np.uint8)\n",
|
||||
" res = res.reshape(n_items[0], n_rows * n_cols)\n",
|
||||
" else:\n",
|
||||
" res = np.frombuffer(gz.read(n_items[0]), dtype=np.uint8)\n",
|
||||
" res = res.reshape(n_items[0], 1)\n",
|
||||
" return res\n",
|
||||
"\n",
|
||||
"data_folder = os.path.join(os.getcwd(), 'data/mnist')\n",
|
||||
"os.makedirs(data_folder, exist_ok=True)\n",
|
||||
"\n",
|
||||
"urllib.request.urlretrieve('https://azureopendatastorage.blob.core.windows.net/mnist/t10k-images-idx3-ubyte.gz',\n",
|
||||
" filename=os.path.join(data_folder, 't10k-images-idx3-ubyte.gz'))\n",
|
||||
"urllib.request.urlretrieve('https://azureopendatastorage.blob.core.windows.net/mnist/t10k-labels-idx1-ubyte.gz',\n",
|
||||
" filename=os.path.join(data_folder, 't10k-labels-idx1-ubyte.gz'))\n",
|
||||
"\n",
|
||||
"X_test = load_data(os.path.join(data_folder, 't10k-images-idx3-ubyte.gz'), False) / np.float32(255.0)\n",
|
||||
"y_test = load_data(os.path.join(data_folder, 't10k-labels-idx1-ubyte.gz'), True).reshape(-1)\n",
|
||||
"\n",
|
||||
"# send a random row from the test set to score\n",
|
||||
"random_index = np.random.randint(0, len(X_test)-1)\n",
|
||||
"input_data = \"{\\\"data\\\": [\" + str(random_index) + \"]}\"\n",
|
||||
"\n",
|
||||
"headers = {'Content-Type':'application/json', 'Authorization': 'Bearer ' + key1}\n",
|
||||
"\n",
|
||||
"# send sample to service for scoring\n",
|
||||
"resp = requests.post(service.scoring_uri, input_data, headers=headers)\n",
|
||||
"\n",
|
||||
"print(\"label:\", y_test[random_index])\n",
|
||||
"print(\"prediction:\", resp.text[1])\n",
|
||||
"\n",
|
||||
"plt.imshow(X_test[random_index].reshape((28,28)), cmap='gray')\n",
|
||||
"plt.axis('off')\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Let's look at the workspace after the web service was deployed. You should see\n",
|
||||
"\n",
|
||||
" + a registered model named 'chainer-dnn-mnist' and with the id 'chainer-dnn-mnist:1'\n",
|
||||
" + a webservice called 'chainer-mnist-svc' with some scoring URL"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = ws.models['chainer-dnn-mnist']\n",
|
||||
"print(\"Model: {}, ID: {}\".format('chainer-dnn-mnist', model.id))\n",
|
||||
" \n",
|
||||
"webservice = ws.webservices['chainer-mnist-1']\n",
|
||||
"print(\"Webservice: {}, scoring URI: {}\".format('chainer-mnist-1', webservice.scoring_uri))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Clean up"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You can delete the ACI deployment with a simple delete API call."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"service.delete()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"authors": [
|
||||
{
|
||||
"name": "nagaur"
|
||||
}
|
||||
],
|
||||
"category": "training",
|
||||
"compute": [
|
||||
"AML Compute"
|
||||
],
|
||||
"datasets": [
|
||||
"MNIST"
|
||||
],
|
||||
"deployment": [
|
||||
"Azure Container Instance"
|
||||
],
|
||||
"exclude_from_index": false,
|
||||
"framework": [
|
||||
"Chainer"
|
||||
],
|
||||
"friendly_name": "Train a model with hyperparameter tuning",
|
||||
"index_order": 1,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.7"
|
||||
},
|
||||
"tags": [
|
||||
"None"
|
||||
],
|
||||
"task": "Train a Convolutional Neural Network (CNN)"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -1,13 +0,0 @@
|
||||
name: train-hyperparameter-tune-deploy-with-chainer
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-widgets
|
||||
- numpy
|
||||
- matplotlib
|
||||
- json
|
||||
- urllib
|
||||
- gzip
|
||||
- struct
|
||||
- requests
|
||||
- azureml-opendatasets
|
||||
@@ -21,7 +21,7 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Distributed TensorFlow with Horovod\n",
|
||||
"In this tutorial, you will train a word2vec model in TensorFlow using distributed training via [Horovod](https://github.com/uber/horovod)."
|
||||
"In this tutorial, you will train a model in TensorFlow using distributed training via [Horovod](https://github.com/uber/horovod)."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -144,26 +144,6 @@
|
||||
"The above code creates a GPU cluster. If you instead want to create a CPU cluster, provide a different VM size to the `vm_size` parameter, such as `STANDARD_D2_V2`."
|
||||
]
|
||||
},
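{
"cell_type": "markdown",
"metadata": {},
"source": [
"For example, a small CPU cluster could be provisioned along these lines (an illustrative sketch; the cluster name and node counts are placeholders you would adapt to your workspace):\n",
"\n",
"```python\n",
"from azureml.core.compute import AmlCompute, ComputeTarget\n",
"\n",
"# Sketch: provision a small CPU cluster instead of the GPU cluster above.\n",
"cpu_cluster_name = 'cpu-cluster'  # hypothetical name\n",
"provisioning_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',\n",
"                                                            min_nodes=0,\n",
"                                                            max_nodes=2)\n",
"cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, provisioning_config)\n",
"cpu_cluster.wait_for_completion(show_output=True)\n",
"```"
]
},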
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create a Dataset for Files\n",
|
||||
"A Dataset can reference single or multiple files in your datastores or public urls. The files can be of any format. FileDataset provides you with the ability to download or mount the files to your compute. By creating a dataset, you create a reference to the data source location. The data remains in its existing location, so no extra storage cost is incurred. [Learn More](https://aka.ms/azureml/howto/createdatasets)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Dataset\n",
|
||||
"\n",
|
||||
"web_paths = ['https://azuremlexamples.blob.core.windows.net/datasets/text8.zip']\n",
|
||||
"dataset = Dataset.File.from_files(path=web_paths)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -171,28 +151,6 @@
|
||||
"You may want to register datasets using the register() method to your workspace so that the dataset can be shared with others, reused across various experiments, and referred to by name in your training script."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dataset = dataset.register(workspace=ws,\n",
|
||||
" name='wikipedia-text',\n",
|
||||
" description='Wikipedia text training and test dataset',\n",
|
||||
" create_new_version=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# list the files referenced by the dataset\n",
|
||||
"dataset.to_path()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -200,43 +158,6 @@
|
||||
"## Train model on the remote compute"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create a project directory\n",
|
||||
"Create a directory that will contain all the necessary code from your local machine that you will need access to on the remote resource. This includes the training script, and any additional files your training script depends on."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"project_folder = './tf-distr-hvd'\n",
|
||||
"os.makedirs(project_folder, exist_ok=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copy the training script `tf_horovod_word2vec.py` into this project directory."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import shutil\n",
|
||||
"\n",
|
||||
"shutil.copy('tf_horovod_word2vec.py', project_folder)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -274,7 +195,7 @@
|
||||
"source": [
|
||||
"from azureml.core import Environment\n",
|
||||
"\n",
|
||||
"tf_env = Environment.get(ws, name='AzureML-TensorFlow-1.13-GPU')"
|
||||
"tf_env = Environment.get(ws, name='AzureML-tensorflow-2.7-ubuntu20.04-py38-cuda11-gpu')"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -297,9 +218,8 @@
|
||||
"from azureml.core import ScriptRunConfig\n",
|
||||
"from azureml.core.runconfig import MpiConfiguration\n",
|
||||
"\n",
|
||||
"src = ScriptRunConfig(source_directory=project_folder,\n",
|
||||
" script='tf_horovod_word2vec.py',\n",
|
||||
" arguments=['--input_data', dataset.as_mount()],\n",
|
||||
"src = ScriptRunConfig(source_directory=\"src\",\n",
|
||||
" script='train.py',\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" environment=tf_env,\n",
|
||||
" distributed_job_config=MpiConfiguration(node_count=2))"
|
||||
|
||||
@@ -2,10 +2,3 @@ name: distributed-tensorflow-with-horovod
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-widgets
|
||||
- keras
|
||||
- tensorflow-gpu==1.13.2
|
||||
- horovod==0.19.1
|
||||
- matplotlib
|
||||
- pandas
|
||||
- fuse
|
||||
|
||||
@@ -0,0 +1,120 @@
|
||||
# Copyright 2019 Uber Technologies, Inc. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Script adapted from: https://github.com/horovod/horovod/blob/master/examples/tensorflow2_keras_mnist.py
|
||||
# ==============================================================================
|
||||
|
||||
import tensorflow as tf
|
||||
import horovod.tensorflow.keras as hvd
|
||||
|
||||
import os
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--learning-rate", "-lr", type=float, default=0.001)
|
||||
parser.add_argument("--epochs", type=int, default=24)
|
||||
parser.add_argument("--steps-per-epoch", type=int, default=500)
|
||||
args = parser.parse_args()
|
||||
|
||||
# Horovod: initialize Horovod.
|
||||
hvd.init()
|
||||
|
||||
# Horovod: pin GPU to be used to process local rank (one GPU per process)
|
||||
gpus = tf.config.experimental.list_physical_devices("GPU")
|
||||
for gpu in gpus:
|
||||
tf.config.experimental.set_memory_growth(gpu, True)
|
||||
if gpus:
|
||||
tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], "GPU")
|
||||
|
||||
(mnist_images, mnist_labels), _ = tf.keras.datasets.mnist.load_data(
|
||||
path="mnist-%d.npz" % hvd.rank()
|
||||
)
|
||||
|
||||
dataset = tf.data.Dataset.from_tensor_slices(
|
||||
(
|
||||
tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32),
|
||||
tf.cast(mnist_labels, tf.int64),
|
||||
)
|
||||
)
|
||||
dataset = dataset.repeat().shuffle(10000).batch(128)
|
||||
|
||||
mnist_model = tf.keras.Sequential(
|
||||
[
|
||||
tf.keras.layers.Conv2D(32, [3, 3], activation="relu"),
|
||||
tf.keras.layers.Conv2D(64, [3, 3], activation="relu"),
|
||||
tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
|
||||
tf.keras.layers.Dropout(0.25),
|
||||
tf.keras.layers.Flatten(),
|
||||
tf.keras.layers.Dense(128, activation="relu"),
|
||||
tf.keras.layers.Dropout(0.5),
|
||||
tf.keras.layers.Dense(10, activation="softmax"),
|
||||
]
|
||||
)
|
||||
|
||||
# Horovod: adjust learning rate based on number of GPUs.
|
||||
scaled_lr = args.learning_rate * hvd.size()
|
||||
opt = tf.optimizers.Adam(scaled_lr)
|
||||
|
||||
# Horovod: add Horovod DistributedOptimizer.
|
||||
opt = hvd.DistributedOptimizer(opt)
|
||||
|
||||
# Horovod: Specify `experimental_run_tf_function=False` to ensure TensorFlow
|
||||
# uses hvd.DistributedOptimizer() to compute gradients.
|
||||
mnist_model.compile(
|
||||
loss=tf.losses.SparseCategoricalCrossentropy(),
|
||||
optimizer=opt,
|
||||
metrics=["accuracy"],
|
||||
experimental_run_tf_function=False,
|
||||
)
|
||||
|
||||
callbacks = [
|
||||
# Horovod: broadcast initial variable states from rank 0 to all other processes.
|
||||
# This is necessary to ensure consistent initialization of all workers when
|
||||
# training is started with random weights or restored from a checkpoint.
|
||||
hvd.callbacks.BroadcastGlobalVariablesCallback(0),
|
||||
# Horovod: average metrics among workers at the end of every epoch.
|
||||
#
|
||||
# Note: This callback must be in the list before the ReduceLROnPlateau,
|
||||
# TensorBoard or other metrics-based callbacks.
|
||||
hvd.callbacks.MetricAverageCallback(),
|
||||
# Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
|
||||
# accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
|
||||
# the first three epochs. See https://arxiv.org/abs/1706.02677 for details.
|
||||
hvd.callbacks.LearningRateWarmupCallback(
|
||||
warmup_epochs=3, initial_lr=scaled_lr, verbose=1
|
||||
),
|
||||
]
|
||||
|
||||
# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
|
||||
if hvd.rank() == 0:
|
||||
output_dir = "./outputs"
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
callbacks.append(
|
||||
tf.keras.callbacks.ModelCheckpoint(
|
||||
os.path.join(output_dir, "checkpoint-{epoch}.h5")
|
||||
)
|
||||
)
|
||||
|
||||
# Horovod: write logs on worker 0.
|
||||
verbose = 1 if hvd.rank() == 0 else 0
|
||||
|
||||
# Train the model.
|
||||
# Horovod: adjust number of steps based on number of GPUs.
|
||||
mnist_model.fit(
|
||||
dataset,
|
||||
steps_per_epoch=args.steps_per_epoch // hvd.size(),
|
||||
callbacks=callbacks,
|
||||
epochs=args.epochs,
|
||||
verbose=verbose,
|
||||
)
|
||||
@@ -1,238 +0,0 @@
|
||||
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
|
||||
# Modifications copyright (C) 2017 Uber Technologies, Inc.
|
||||
# Additional modifications copyright (C) Microsoft Corporation
|
||||
# Licensed under the Apache License, Version 2.0
|
||||
# Script adapted from: https://github.com/uber/horovod/blob/master/examples/tensorflow_word2vec.py
|
||||
# ======================================
|
||||
"""Basic word2vec example."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import collections
|
||||
import math
|
||||
import os
|
||||
import random
|
||||
import zipfile
|
||||
import argparse
|
||||
import glob
|
||||
|
||||
import numpy as np
|
||||
from six.moves import urllib
|
||||
from six.moves import xrange # pylint: disable=redefined-builtin
|
||||
import tensorflow as tf
|
||||
import horovod.tensorflow as hvd
|
||||
from azureml.core.run import Run
|
||||
|
||||
# Horovod: initialize Horovod.
|
||||
hvd.init()
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--input_data', type=str, help='training data')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
input_data = args.input_data
|
||||
print("the input data is at %s" % input_data)
|
||||
|
||||
# Step 1: Read data.
|
||||
filename = input_data
|
||||
|
||||
|
||||
# Read the data into a list of strings.
|
||||
def read_data(filename):
|
||||
"""Extract the first file enclosed in a zip file as a list of words."""
|
||||
with zipfile.ZipFile(filename) as f:
|
||||
data = tf.compat.as_str(f.read(f.namelist()[0])).split()
|
||||
return data
|
||||
|
||||
|
||||
vocabulary = read_data(filename)
|
||||
print('Data size', len(vocabulary))
|
||||
|
||||
# Step 2: Build the dictionary and replace rare words with UNK token.
|
||||
vocabulary_size = 50000
|
||||
|
||||
|
||||
def build_dataset(words, n_words):
|
||||
"""Process raw inputs into a dataset."""
|
||||
count = [['UNK', -1]]
|
||||
count.extend(collections.Counter(words).most_common(n_words - 1))
|
||||
dictionary = dict()
|
||||
for word, _ in count:
|
||||
dictionary[word] = len(dictionary)
|
||||
data = list()
|
||||
unk_count = 0
|
||||
for word in words:
|
||||
if word in dictionary:
|
||||
index = dictionary[word]
|
||||
else:
|
||||
index = 0 # dictionary['UNK']
|
||||
unk_count += 1
|
||||
data.append(index)
|
||||
count[0][1] = unk_count
|
||||
reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
|
||||
return data, count, dictionary, reversed_dictionary
|
||||
|
||||
|
||||
data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,
|
||||
vocabulary_size)
|
||||
del vocabulary # Hint to reduce memory.
|
||||
print('Most common words (+UNK)', count[:5])
|
||||
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
|
||||
|
||||
|
||||
# Step 3: Function to generate a training batch for the skip-gram model.
|
||||
def generate_batch(batch_size, num_skips, skip_window):
|
||||
assert num_skips <= 2 * skip_window
|
||||
# Adjust batch_size to match num_skips
|
||||
batch_size = batch_size // num_skips * num_skips
|
||||
span = 2 * skip_window + 1 # [ skip_window target skip_window ]
|
||||
# Backtrack a little bit to avoid skipping words in the end of a batch
|
||||
data_index = random.randint(0, len(data) - span - 1)
|
||||
batch = np.ndarray(shape=(batch_size), dtype=np.int32)
|
||||
labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
|
||||
buffer = collections.deque(maxlen=span)
|
||||
for _ in range(span):
|
||||
buffer.append(data[data_index])
|
||||
data_index = (data_index + 1) % len(data)
|
||||
for i in range(batch_size // num_skips):
|
||||
target = skip_window # target label at the center of the buffer
|
||||
targets_to_avoid = [skip_window]
|
||||
for j in range(num_skips):
|
||||
while target in targets_to_avoid:
|
||||
target = random.randint(0, span - 1)
|
||||
targets_to_avoid.append(target)
|
||||
batch[i * num_skips + j] = buffer[skip_window]
|
||||
labels[i * num_skips + j, 0] = buffer[target]
|
||||
buffer.append(data[data_index])
|
||||
data_index = (data_index + 1) % len(data)
|
||||
return batch, labels
|
||||
|
||||
|
||||
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
|
||||
for i in range(8):
|
||||
print(batch[i], reverse_dictionary[batch[i]],
|
||||
'->', labels[i, 0], reverse_dictionary[labels[i, 0]])
|
||||
|
||||
# Step 4: Build and train a skip-gram model.
|
||||
|
||||
max_batch_size = 128
|
||||
embedding_size = 128 # Dimension of the embedding vector.
|
||||
skip_window = 1 # How many words to consider left and right.
|
||||
num_skips = 2 # How many times to reuse an input to generate a label.
|
||||
|
||||
# We pick a random validation set to sample nearest neighbors. Here we limit the
|
||||
# validation samples to the words that have a low numeric ID, which by
|
||||
# construction are also the most frequent.
|
||||
valid_size = 16 # Random set of words to evaluate similarity on.
|
||||
valid_window = 100 # Only pick dev samples in the head of the distribution.
|
||||
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
|
||||
num_sampled = 64 # Number of negative examples to sample.
|
||||
|
||||
graph = tf.Graph()
|
||||
|
||||
with graph.as_default():
|
||||
|
||||
# Input data.
|
||||
train_inputs = tf.placeholder(tf.int32, shape=[None])
|
||||
train_labels = tf.placeholder(tf.int32, shape=[None, 1])
|
||||
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
|
||||
|
||||
# Look up embeddings for inputs.
|
||||
embeddings = tf.Variable(
|
||||
tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
|
||||
embed = tf.nn.embedding_lookup(embeddings, train_inputs)
|
||||
|
||||
# Construct the variables for the NCE loss
|
||||
nce_weights = tf.Variable(
|
||||
tf.truncated_normal([vocabulary_size, embedding_size],
|
||||
stddev=1.0 / math.sqrt(embedding_size)))
|
||||
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
|
||||
|
||||
# Compute the average NCE loss for the batch.
|
||||
# tf.nce_loss automatically draws a new sample of the negative labels each
|
||||
# time we evaluate the loss.
|
||||
loss = tf.reduce_mean(
|
||||
tf.nn.nce_loss(weights=nce_weights,
|
||||
biases=nce_biases,
|
||||
labels=train_labels,
|
||||
inputs=embed,
|
||||
num_sampled=num_sampled,
|
||||
num_classes=vocabulary_size))
|
||||
|
||||
# Horovod: adjust learning rate based on number of GPUs.
|
||||
optimizer = tf.train.GradientDescentOptimizer(1.0 * hvd.size())
|
||||
|
||||
# Horovod: add Horovod Distributed Optimizer.
|
||||
optimizer = hvd.DistributedOptimizer(optimizer)
|
||||
|
||||
train_op = optimizer.minimize(loss)
|
||||
|
||||
# Compute the cosine similarity between minibatch examples and all embeddings.
|
||||
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
|
||||
normalized_embeddings = embeddings / norm
|
||||
valid_embeddings = tf.nn.embedding_lookup(
|
||||
normalized_embeddings, valid_dataset)
|
||||
similarity = tf.matmul(
|
||||
valid_embeddings, normalized_embeddings, transpose_b=True)
|
||||
|
||||
# Add variable initializer.
|
||||
init = tf.global_variables_initializer()
|
||||
|
||||
# Horovod: broadcast initial variable states from rank 0 to all other processes.
|
||||
# This is necessary to ensure consistent initialization of all workers when
|
||||
# training is started with random weights or restored from a checkpoint.
|
||||
bcast = hvd.broadcast_global_variables(0)
|
||||
|
||||
# Step 5: Begin training.
|
||||
|
||||
# Horovod: adjust number of steps based on number of GPUs.
|
||||
num_steps = 4000 // hvd.size() + 1
|
||||
|
||||
# Horovod: pin GPU to be used to process local rank (one GPU per process)
|
||||
config = tf.ConfigProto()
|
||||
config.gpu_options.allow_growth = True
|
||||
config.gpu_options.visible_device_list = str(hvd.local_rank())
|
||||
|
||||
with tf.Session(graph=graph, config=config) as session:
|
||||
# We must initialize all variables before we use them.
|
||||
init.run()
|
||||
bcast.run()
|
||||
print('Initialized')
|
||||
run = Run.get_context()
|
||||
average_loss = 0
|
||||
for step in xrange(num_steps):
|
||||
# simulate various sentence length by randomization
|
||||
batch_size = random.randint(max_batch_size // 2, max_batch_size)
|
||||
batch_inputs, batch_labels = generate_batch(
|
||||
batch_size, num_skips, skip_window)
|
||||
feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
|
||||
|
||||
# We perform one update step by evaluating the optimizer op (including it
|
||||
# in the list of returned values for session.run()
|
||||
_, loss_val = session.run([train_op, loss], feed_dict=feed_dict)
|
||||
average_loss += loss_val
|
||||
|
||||
if step % 2000 == 0:
|
||||
if step > 0:
|
||||
average_loss /= 2000
|
||||
# The average loss is an estimate of the loss over the last 2000 batches.
|
||||
print('Average loss at step ', step, ': ', average_loss)
|
||||
run.log("Loss", average_loss)
|
||||
average_loss = 0
|
||||
final_embeddings = normalized_embeddings.eval()
|
||||
|
||||
# Evaluate similarity in the end on worker 0.
|
||||
if hvd.rank() == 0:
|
||||
sim = similarity.eval()
|
||||
for i in xrange(valid_size):
|
||||
valid_word = reverse_dictionary[valid_examples[i]]
|
||||
top_k = 8 # number of nearest neighbors
|
||||
nearest = (-sim[i, :]).argsort()[1:top_k + 1]
|
||||
log_str = 'Nearest to %s:' % valid_word
|
||||
for k in xrange(top_k):
|
||||
close_word = reverse_dictionary[nearest[k]]
|
||||
log_str = '%s %s,' % (log_str, close_word)
|
||||
print(log_str)
|
||||
@@ -36,8 +36,6 @@ Using these samples, you will learn how to do the following.
|
||||
| [cartpole_ci.ipynb](cartpole-on-compute-instance/cartpole_ci.ipynb) | Notebook to train a Cartpole playing agent on an Azure Machine Learning Compute Instance |
|
||||
| [cartpole_sc.ipynb](cartpole-on-single-compute/cartpole_sc.ipynb) | Notebook to train a Cartpole playing agent on an Azure Machine Learning Compute Cluster (single node) |
|
||||
| [pong_rllib.ipynb](atari-on-distributed-compute/pong_rllib.ipynb) | Notebook for distributed training of Pong agent using RLlib on multiple compute targets |
|
||||
| [minecraft.ipynb](minecraft-on-distributed-compute/minecraft.ipynb) | Notebook to train an agent to navigate through a lava maze in the Minecraft game |
|
||||
| [particle.ipynb](multiagent-particle-envs/particle.ipynb) | Notebook to train policies in a multiagent cooperative navigation scenario based on OpenAI's Particle environments |
|
||||
|
||||
## Prerequisites
|
||||
|
||||
|
||||
@@ -1,14 +1,18 @@
|
||||
FROM mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.1-cudnn8-ubuntu20.04
|
||||
# NC-series GPUs only support the pytorch-1.11/cuda11.3 combo
|
||||
# See https://learn.microsoft.com/en-us/azure/machine-learning/resource-curated-environments
|
||||
FROM mcr.microsoft.com/azureml/curated/acpt-pytorch-1.11-cuda11.3
|
||||
|
||||
USER root
|
||||
RUN conda install -c anaconda python=3.7
|
||||
|
||||
# CUDA repository key rotation: https://forums.developer.nvidia.com/t/notice-cuda-linux-repository-key-rotation/212771
|
||||
RUN apt-key del 7fa2af80
|
||||
ENV distro ubuntu1804
|
||||
ENV arch x86_64
|
||||
RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/$distro/$arch/3bf863cc.pub
|
||||
# ENV AZUREML_CONDA_ENVIRONMENT_PATH /azureml-envs/ray-rllib
|
||||
# Create conda environment
|
||||
# RUN conda create -p $AZUREML_CONDA_ENVIRONMENT_PATH \
|
||||
# python=3.8.5
|
||||
|
||||
# Prepend path to AzureML conda environment
|
||||
# ENV PATH $AZUREML_CONDA_ENVIRONMENT_PATH/bin:$PATH
|
||||
|
||||
# Install necessary packages to support videos in rllib/gym
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
python-opengl \
|
||||
rsync \
|
||||
@@ -17,61 +21,35 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
rm -rf /var/lib/apt/lists/* && \
|
||||
rm -rf /usr/share/man/*
|
||||
|
||||
ENV AZUREML_CONDA_ENVIRONMENT_PATH /azureml-envs/tensorflow-2.4
|
||||
|
||||
# Create conda environment
|
||||
RUN conda create -p $AZUREML_CONDA_ENVIRONMENT_PATH \
|
||||
python=3.7 pip=20.2.4
|
||||
|
||||
# Prepend path to AzureML conda environment
|
||||
ENV PATH $AZUREML_CONDA_ENVIRONMENT_PATH/bin:$PATH
|
||||
|
||||
RUN pip --version
|
||||
RUN python --version
|
||||
|
||||
# Install ray-on-aml
|
||||
RUN pip install 'ray-on-aml==0.1.6'
|
||||
RUN pip install ray-on-aml==0.2.4 \
|
||||
ray==2.4.0 \
|
||||
ray[rllib]==2.4.0 \
|
||||
mlflow==2.3.1 \
|
||||
azureml-defaults==1.50.0 \
|
||||
azureml-dataset-runtime[fuse,pandas]==1.50.0 \
|
||||
azureml-contrib-reinforcementlearning==1.50.0 \
|
||||
gputil==1.4.0 \
|
||||
scipy==1.9.1 \
|
||||
pyglet==2.0.6 \
|
||||
cloudpickle==2.2.1 \
|
||||
tensorflow==2.11.0 \
|
||||
tensorflow-probability==0.19.0 \
|
||||
tabulate==0.9.0 \
|
||||
dm_tree==0.1.8 \
|
||||
lz4==4.3.2 \
|
||||
psutil==5.9.4 \
|
||||
setproctitle==1.3.2 \
|
||||
pygame==2.1.0 \
|
||||
gymnasium[classic_control]==0.26.3 \
|
||||
gymnasium[atari]==0.26.3 \
|
||||
gymnasium[accept-rom-license]==0.26.3 \
|
||||
gym==0.26.2 \
|
||||
gym[atari]==0.26.2 \
|
||||
gym[accept-rom-license]==0.26.2
|
||||
|
||||
RUN pip install ray==0.8.7
|
||||
RUN pip install gym[atari]==0.19.0
|
||||
RUN pip install gym[accept-rom-license]==0.19.0
|
||||
|
||||
# Install pip dependencies
|
||||
RUN pip install 'matplotlib>=3.3,<3.4' \
|
||||
'psutil>=5.8,<5.9' \
|
||||
'tqdm>=4.59,<4.60' \
|
||||
'pandas>=1.1,<1.2' \
|
||||
'scipy>=1.5,<1.6' \
|
||||
'numpy>=1.10,<1.20' \
|
||||
'ipykernel~=6.0' \
|
||||
'azureml-core==1.36.0.post2' \
|
||||
'azureml-defaults==1.36.0' \
|
||||
'azureml-mlflow==1.36.0' \
|
||||
'azureml-telemetry==1.36.0' \
|
||||
'tensorboard==2.4.0' \
|
||||
'tensorflow-gpu==2.4.1' \
|
||||
'tensorflow-datasets==4.3.0' \
|
||||
'onnxruntime-gpu>=1.7,<1.8' \
|
||||
'horovod[tensorflow-gpu]==0.21.3'
|
||||
|
||||
RUN pip install --no-cache-dir \
|
||||
azureml-defaults \
|
||||
azureml-dataset-runtime[fuse,pandas] \
|
||||
azureml-contrib-reinforcementlearning \
|
||||
gputil \
|
||||
cloudpickle==1.3.0 \
|
||||
tabulate \
|
||||
dm_tree \
|
||||
lz4 \
|
||||
psutil \
|
||||
setproctitle
|
||||
|
||||
# This is required for ray 0.8.7
|
||||
RUN pip install -U aiohttp==3.7.4
|
||||
|
||||
RUN pip install 'msrest<0.7.0'
|
||||
RUN pip install protobuf==3.20.0
|
||||
|
||||
# This is needed for mpi to locate libpython
|
||||
ENV LD_LIBRARY_PATH $AZUREML_CONDA_ENVIRONMENT_PATH/lib:$LD_LIBRARY_PATH
|
||||
# Display all versions
|
||||
RUN pip freeze
|
||||
|
||||
|
||||
@@ -0,0 +1,18 @@
|
||||
pong-impala-vectorized:
|
||||
env: ALE/Pong-v5
|
||||
run: IMPALA
|
||||
config:
|
||||
# Make analogous to old v4 + NoFrameskip.
|
||||
env_config:
|
||||
frameskip: 1
|
||||
full_action_space: false
|
||||
repeat_action_probability: 0.0
|
||||
rollout_fragment_length: 50
|
||||
train_batch_size: 500
|
||||
num_workers: 11
|
||||
num_envs_per_worker: 10
|
||||
framework: torch
|
||||
log_level: INFO
|
||||
stop:
|
||||
episode_reward_mean: 10
|
||||
time_total_s: 3600
|
||||
@@ -1,35 +1,35 @@
|
||||
from ray_on_aml.core import Ray_On_AML
|
||||
|
||||
import ray.tune as tune
|
||||
from ray.rllib import train
|
||||
|
||||
import yaml
|
||||
from ray.tune.tune import run_experiments
|
||||
from utils import callbacks
|
||||
import argparse
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--config', help='Path to yaml configuration file')
|
||||
args = parser.parse_args()
|
||||
|
||||
ray_on_aml = Ray_On_AML()
|
||||
ray = ray_on_aml.getRay()
|
||||
if ray: # in the headnode
|
||||
# Parse arguments
|
||||
train_parser = train.create_parser()
|
||||
ray.init(address="auto")
|
||||
print("Configuring run from file: ", args.config)
|
||||
experiment_config = None
|
||||
with open(args.config, "r") as file:
|
||||
experiment_config = yaml.safe_load(file)
|
||||
print(f'Config: {experiment_config}')
|
||||
|
||||
args = train_parser.parse_args()
|
||||
print("Algorithm config:", args.config)
|
||||
# Set local_dir in each experiment configuration to ensure generated logs get picked up
|
||||
# by Azure ML
|
||||
for experiment in experiment_config.values():
|
||||
experiment["local_dir"] = "./logs"
|
||||
|
||||
trials = run_experiments(
|
||||
experiment_config,
|
||||
callbacks=[callbacks.TrialCallback()],
|
||||
verbose=2
|
||||
)
|
||||
|
||||
tune.run(
|
||||
run_or_experiment=args.run,
|
||||
config={
|
||||
"env": args.env,
|
||||
"num_gpus": args.config["num_gpus"],
|
||||
"num_workers": args.config["num_workers"],
|
||||
"callbacks": {"on_train_result": callbacks.on_train_result},
|
||||
"sample_batch_size": 50,
|
||||
"train_batch_size": 1000,
|
||||
"num_sgd_iter": 2,
|
||||
"num_data_loader_buffers": 2,
|
||||
"model": {"dim": 42},
|
||||
},
|
||||
stop=args.stop,
|
||||
local_dir='./logs')
|
||||
else:
|
||||
print("in worker node")
|
||||
|
||||
@@ -3,15 +3,20 @@
|
||||
'''
|
||||
|
||||
from azureml.core import Run
|
||||
from ray import tune
|
||||
from ray.tune import Callback
|
||||
from ray.air import session
|
||||
|
||||
|
||||
def on_train_result(info):
|
||||
'''Callback on train result to record metrics returned by trainer.
|
||||
'''
|
||||
run = Run.get_context()
|
||||
run.log(
|
||||
name='episode_reward_mean',
|
||||
value=info["result"]["episode_reward_mean"])
|
||||
run.log(
|
||||
name='episodes_total',
|
||||
value=info["result"]["episodes_total"])
|
||||
class TrialCallback(Callback):
|
||||
|
||||
def on_trial_result(self, iteration, trials, trial, result, **info):
|
||||
'''Callback on train result to record metrics returned by trainer.
|
||||
'''
|
||||
run = Run.get_context()
|
||||
run.log(
|
||||
name='episode_reward_mean',
|
||||
value=result["episode_reward_mean"])
|
||||
run.log(
|
||||
name='episodes_total',
|
||||
value=result["episodes_total"])
|
||||
|
||||
@@ -22,7 +22,8 @@
|
||||
"source": [
|
||||
"# Reinforcement Learning in Azure Machine Learning - Pong problem\n",
|
||||
"Reinforcement Learning in Azure Machine Learning is a managed service for running distributed reinforcement learning training and simulation using the open source Ray framework.\n",
|
||||
"This example uses Ray RLlib to train a Pong playing agent on a multi-node cluster.\n",
|
||||
"This noteboook demonstrates how to use Ray to solve a more complex problem using a more complex setup including Ray RLLib running on multiple compute nodes and using a GPU.\n",
|
||||
"For this example we will train a Pong playing agent on cluster of two NC6 nodes (6 CPU, 1 GPU).\n",
|
||||
"\n",
|
||||
"## Pong problem\n",
|
||||
"[Pong](https://en.wikipedia.org/wiki/Pong) is a two-dimensional sports game that simulates table tennis. The player controls an in-game paddle by moving it vertically across the left or right side of the screen. They can compete against another player controlling a second paddle on the opposing side. Players use the paddles to hit a ball back and forth."
|
||||
@@ -46,7 +47,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The goal here is to train an agent to win an episode of Pong game against opponent with the score of at least 18 points. An episode in Pong runs until one of the players reaches a score of 21. Episodes are a terminology that is used across all the [OpenAI gym](https://www.gymlibrary.dev/environments/atari/pong/) environments that contains a strictly defined task.\n",
|
||||
"The goal here is to train an agent to win an episode of Pong game against opponent with the score of at least 10 points. An episode in Pong runs until one of the players reaches a score of 21. Episodes are a terminology that is used across all the [OpenAI gym](https://www.gymlibrary.dev/environments/atari/pong/) environments that contains a strictly defined task.\n",
|
||||
"\n",
|
||||
"Training a Pong agent is a compute-intensive task and this example demonstrates the use of Reinforcement Learning in Azure Machine Learning service to train an agent faster in a distributed, parallel environment. You'll learn more about using the head and the worker compute targets to train an agent in this notebook below."
|
||||
]
|
||||
@@ -60,19 +61,6 @@
|
||||
"It is highly recommended that the user should go through the [Reinforcement Learning in Azure Machine Learning - Cartpole Problem on Single Compute](../cartpole-on-single-compute/cartpole_sc.ipynb) to understand the basics of Reinforcement Learning in Azure Machine Learning and Ray RLlib used in this notebook."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Set up Development Environment\n",
|
||||
"The following subsections show typical steps to setup your development environment. Setup includes:\n",
|
||||
"\n",
|
||||
"* Connecting to a workspace to enable communication between your local machine and remote resources\n",
|
||||
"* Creating an experiment to track all your runs\n",
|
||||
"* Setting up a virtual network\n",
|
||||
"* Creating remote head and worker compute target on a virtual network to use for training"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -86,7 +74,7 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646081765827
|
||||
"logged": 1683263371795
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -113,7 +101,7 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646081772340
|
||||
"logged": 1683263375690
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -137,7 +125,7 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646081775643
|
||||
"logged": 1683263378789
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -165,7 +153,7 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646086081229
|
||||
"logged": 1683263385677
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -177,7 +165,7 @@
|
||||
"compute_min_nodes = 0\n",
|
||||
"compute_max_nodes = 2\n",
|
||||
"\n",
|
||||
"# This example uses GPU VM. For using CPU VM, set SKU to STANDARD_D2_V2\n",
|
||||
"# This example uses GPU VM.\n",
|
||||
"vm_size = 'STANDARD_NC6'\n",
|
||||
"\n",
|
||||
"if compute_name in ws.compute_targets:\n",
|
||||
@@ -207,15 +195,52 @@
|
||||
" print(compute_target.get_status().serialize())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"### Create Azure ML Environment\r\n",
|
||||
"\r\n",
|
||||
"This step creates and registers an Azure ML Environment that includes all of the dependencies needed to run this example, including CUDA drivers Pytorch, RLLib, and associated tools. This step can take a significant time (30 min) on the first run."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646160884910
|
||||
"logged": 1683263388781
|
||||
},
|
||||
"jupyter": {
|
||||
"outputs_hidden": true,
|
||||
"outputs_hidden": false,
|
||||
"source_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ray_environment_name = 'pong-gpu'"
|
||||
]
|
||||
},
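{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cells below build this environment image from the Dockerfile in the `docker` folder. A minimal sketch of that pattern, assuming the `Environment.from_dockerfile` and `Environment.build` APIs of the v1 SDK, looks like this:\n",
"\n",
"```python\n",
"# Illustrative sketch: build and register an environment from a Dockerfile.\n",
"import os\n",
"from azureml.core import Environment\n",
"\n",
"dockerfile_path = os.path.join(os.getcwd(), 'docker', 'Dockerfile-gpu')\n",
"ray_environment = Environment.from_dockerfile(name=ray_environment_name,\n",
"                                               dockerfile=dockerfile_path)\n",
"ray_environment.register(ws)\n",
"build = ray_environment.build(ws)\n",
"build.wait_for_completion(show_output=True)\n",
"```"
]
},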
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683056774047
|
||||
},
|
||||
"jupyter": {
|
||||
"outputs_hidden": false,
|
||||
"source_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
@@ -229,7 +254,6 @@
|
||||
"import os\n",
|
||||
"from azureml.core import Environment\n",
|
||||
"\n",
|
||||
"ray_environment_name = 'pong-gpu'\n",
|
||||
"ray_environment_dockerfile_path = os.path.join(os.getcwd(), 'docker', 'Dockerfile-gpu')\n",
|
||||
"\n",
|
||||
"# Build GPU image\n",
|
||||
@@ -249,15 +273,7 @@
|
||||
"\n",
|
||||
"The code below submits the training run using a `ScriptRunConfig`. By providing the\n",
|
||||
"command to run the training, and a `RunConfig` object configured with your\n",
|
||||
"compute target, number of nodes, and environment image to use.\n",
|
||||
"\n",
|
||||
"We specify `episode_reward_mean` to 18 as we want to stop the training as soon as the trained agent reaches an average win margin of at least 18 point over opponent over all episodes in the training epoch.\n",
|
||||
"Number of Ray worker processes are defined by parameter `num_workers`. We set it to 13 as we have 11 CPUs available in our compute targets. Multiple Ray worker processes parallelizes agent training and helps in achieving our goal faster. \n",
|
||||
"\n",
|
||||
"```\n",
|
||||
"Number of CPUs in the compute cluster = 6 * 2 = 12 CPUs over 2 nodes\n",
|
||||
"Number of CPUs available = (Number of CPUs in the compute cluster) - (1 CPU for head node) = 12 - 1 = 11\n",
|
||||
"```"
|
||||
"compute target, number of nodes, and environment image to use."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -265,7 +281,7 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646162435310
|
||||
"logged": 1683264835679
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -282,16 +298,12 @@
|
||||
"aml_run_config_ml.node_count = 2\n",
|
||||
"aml_run_config_ml.environment = ray_environment\n",
|
||||
"\n",
|
||||
"training_algorithm = \"IMPALA\"\n",
|
||||
"rl_environment = \"PongNoFrameskip-v4\"\n",
|
||||
"script_name='pong_rllib.py'\n",
|
||||
"config_name='pong-impala-vectorized.yaml'\n",
|
||||
"\n",
|
||||
"command=[\n",
|
||||
" 'python', script_name,\n",
|
||||
" '--run', training_algorithm,\n",
|
||||
" '--env', rl_environment,\n",
|
||||
" '--config', '\\'{\"num_gpus\": 1, \"num_workers\": 11}\\'',\n",
|
||||
" '--stop', '\\'{\"episode_reward_mean\": 18, \"time_total_s\": 3600}\\''\n",
|
||||
" '--config', config_name\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"config = ScriptRunConfig(source_directory='./files',\n",
|
||||
@@ -305,25 +317,34 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Training script\n",
|
||||
"As recommended in [RLlib](https://ray.readthedocs.io/en/latest/rllib.html) documentations, we use Ray [Tune](https://ray.readthedocs.io/en/latest/tune.html) API to run the training algorithm. All the RLlib built-in trainers are compatible with the Tune API. Here we use tune.run() to execute a built-in training algorithm. For convenience, down below you can see part of the entry script where we make this call.\n",
|
||||
"### Training configuration\n",
|
||||
"All training parameters (including the Reinforcement Learning algorithm) are set through a single configuration file. For this example we'll be using the IMPALA algorithm to train an agent to play Atari Pong.\n",
|
||||
"We set `num_workers` to 11 because we have 11 CPUs available for worker nodes (6 CPUs on each of 2 machines, with 1 CPU consumed as a head node).\n",
|
||||
"We set `episode_reward_mean` (under `stop`) to 10 so that we terminate the run once we achieve a reward score of 10.\n",
|
||||
"\n",
|
||||
"Here is the configuration we are using for this example:\n",
|
||||
"\n",
|
||||
"```yaml\n",
|
||||
"pong:\n",
|
||||
" env: ALE/Pong-v5\n",
|
||||
" run: IMPALA\n",
|
||||
" config:\n",
|
||||
" num_workers: 11\n",
|
||||
" num_gpus: 1\n",
|
||||
" rollout_fragment_length: 50\n",
|
||||
" train_batch_size: 1000\n",
|
||||
" num_sgd_iter: 2\n",
|
||||
" num_multi_gpu_tower_stacks: 2\n",
|
||||
" env_config:\n",
|
||||
" frameskip: 1\n",
|
||||
" full_action_space: false\n",
|
||||
" repeat_action_probability: 0.0\n",
|
||||
" stop:\n",
|
||||
" episode_reward_mean: 10\n",
|
||||
" total_time_s: 3600\n",
|
||||
" model:\n",
|
||||
" dim: 42\n",
|
||||
"\n",
|
||||
"```python\n",
|
||||
" tune.run(\n",
|
||||
" run_or_experiment=args.run,\n",
|
||||
" config={\n",
|
||||
" \"env\": args.env,\n",
|
||||
" \"num_gpus\": args.config[\"num_gpus\"],\n",
|
||||
" \"num_workers\": args.config[\"num_workers\"],\n",
|
||||
" \"callbacks\": {\"on_train_result\": callbacks.on_train_result},\n",
|
||||
" \"sample_batch_size\": 50,\n",
|
||||
" \"train_batch_size\": 1000,\n",
|
||||
" \"num_sgd_iter\": 2,\n",
|
||||
" \"num_data_loader_buffers\": 2,\n",
|
||||
" \"model\": {\"dim\": 42},\n",
|
||||
" },\n",
|
||||
" stop=args.stop,\n",
|
||||
" local_dir='./logs')\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
@@ -339,7 +360,11 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683056781459
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.widgets import RunDetails\n",
|
||||
@@ -359,7 +384,11 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683056781759
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Uncomment line below to cancel the run\n",
|
||||
@@ -379,7 +408,11 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1682525323059
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"training_run.wait_for_completion()"
|
||||
@@ -399,7 +432,11 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683064583273
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Get the reward metrics from training_run\n",
|
||||
@@ -416,7 +453,11 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1682445012908
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
@@ -431,7 +472,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We observe that during the training over multiple episodes, the agent learns to win the Pong game against opponent with our target of 18 points in each episode of 21 points.\n",
|
||||
"We observe that during the training over multiple episodes, the agent learns to win the Pong game against opponent with our target of 10 points in each episode of 21 points.\n",
|
||||
"**Congratulations!! You have trained your Pong agent to win a game.**"
|
||||
]
|
||||
},
|
||||
@@ -446,7 +487,11 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1682445012927
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# To archive the created experiment:\n",
|
||||
@@ -456,14 +501,6 @@
|
||||
"#head_compute_target.delete()\n",
|
||||
"#worker_compute_target.delete()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Next\n",
|
||||
"In this example, you learned how to solve distributed reinforcement learning training problems using head and worker compute targets. This was an introductory tutorial on Reinforement Learning in Azure Machine Learning service offering. We would love to hear your feedback to build the features you need!"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
@@ -494,7 +531,17 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.3"
|
||||
"version": "3.8.5"
|
||||
},
|
||||
"microsoft": {
|
||||
"host": {
|
||||
"AzureML": {
|
||||
"notebookHasBeenCompleted": true
|
||||
}
|
||||
},
|
||||
"ms_spell_check": {
|
||||
"ms_spell_check_language": "en"
|
||||
}
|
||||
},
|
||||
"notice": "Copyright (c) Microsoft Corporation. All rights reserved.\u00c3\u00a2\u00e2\u201a\u00ac\u00c2\u00afLicensed under the MIT License.\u00c3\u00a2\u00e2\u201a\u00ac\u00c2\u00af ",
|
||||
"nteract": {
|
||||
|
||||
@@ -84,7 +84,7 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646344676671
|
||||
"logged": 1683062935076
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -106,7 +106,7 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646344680982
|
||||
"logged": 1683062936280
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -133,7 +133,7 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646344684217
|
||||
"logged": 1683062936485
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -160,7 +160,7 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646344690768
|
||||
"logged": 1683062937126
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -212,14 +212,14 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646344835579
|
||||
"logged": 1683062937499
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.experiment import Experiment\n",
|
||||
"\n",
|
||||
"experiment_name = 'CartPole-v0-CI'\n",
|
||||
"experiment_name = 'CartPole-v1-CI'\n",
|
||||
"experiment = Experiment(workspace=ws, name=experiment_name)"
|
||||
]
|
||||
},
|
||||
@@ -228,7 +228,7 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646346293902
|
||||
"logged": 1683064044718
|
||||
},
|
||||
"jupyter": {
|
||||
"outputs_hidden": false,
|
||||
@@ -282,7 +282,7 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646347120585
|
||||
"logged": 1683064046594
|
||||
},
|
||||
"jupyter": {
|
||||
"outputs_hidden": false,
|
||||
@@ -300,18 +300,10 @@
|
||||
"from azureml.core import RunConfiguration, ScriptRunConfig, Experiment\n",
|
||||
"from azureml.core.runconfig import DockerConfiguration, RunConfiguration\n",
|
||||
"\n",
|
||||
"training_algorithm = 'PPO'\n",
|
||||
"rl_environment = 'CartPole-v0'\n",
|
||||
"\n",
|
||||
"config_name = 'cartpole-ppo.yaml'\n",
|
||||
"script_name = 'cartpole_training.py'\n",
|
||||
"script_arguments = [\n",
|
||||
" '--run', training_algorithm,\n",
|
||||
" '--env', rl_environment,\n",
|
||||
" '--config', '{\"num_gpus\": 0, \"num_workers\": 1}',\n",
|
||||
" '--stop', '{\"episode_reward_mean\": 200, \"time_total_s\": 300}',\n",
|
||||
" '--checkpoint-freq', '2',\n",
|
||||
" '--checkpoint-at-end',\n",
|
||||
" '--local-dir', './logs'\n",
|
||||
" '--config', config_name\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"aml_run_config_ml = RunConfiguration(communicator='OpenMpi')\n",
|
||||
@@ -331,43 +323,35 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Training script\n",
|
||||
"### Training configuration\n",
|
||||
"\n",
|
||||
"As recommended in RLlib documentations, we use Ray Tune API to run the training algorithm. All the RLlib built-in trainers are compatible with the Tune API. Here we use `tune.run()` to execute a built-in training algorithm. For convenience, down below you can see part of the entry script where we make this call.\n",
|
||||
"This is the training configuration (in yaml) that we use to train an agent to solve the CartPole problem using\n",
|
||||
"the PPO algorithm.\n",
|
||||
"\n",
|
||||
"This is the list of parameters we are passing into `tune.run()` via the `script_params` parameter:\n",
|
||||
"\n",
|
||||
"- `run_or_experiment`: name of the [built-in algorithm](https://ray.readthedocs.io/en/latest/rllib-algorithms.html#rllib-algorithms), 'PPO' in our example,\n",
|
||||
"- `config`: Algorithm-specific configuration. This includes specifying the environment, `env`, which in our example is the gym **[CartPole-v0](https://www.gymlibrary.dev/environments/classic_control/cart_pole/)** environment,\n",
|
||||
"- `stop`: stopping conditions, which could be any of the metrics returned by the trainer. Here we use \"mean of episode reward\", and \"total training time in seconds\" as stop conditions, and\n",
|
||||
"- `checkpoint_freq` and `checkpoint_at_end`: Frequency of taking checkpoints (number of training iterations between checkpoints), and if a checkpoint should be taken at the end.\n",
|
||||
"\n",
|
||||
"We also specify the `local_dir`, the directory in which the training logs, checkpoints and other training artificats will be recorded. \n",
|
||||
"\n",
|
||||
"See [RLlib Training APIs](https://ray.readthedocs.io/en/latest/rllib-training.html#rllib-training-apis) for more details, and also [Training (tune.run, tune.Experiment)](https://ray.readthedocs.io/en/latest/tune/api_docs/execution.html#training-tune-run-tune-experiment) for the complete list of parameters.\n",
|
||||
"\n",
|
||||
"```python\n",
|
||||
"import os\n",
|
||||
"import ray\n",
|
||||
"import ray.tune as tune\n",
|
||||
"\n",
|
||||
"if __name__ == \"__main__\":\n",
|
||||
"\n",
|
||||
" # parse arguments ...\n",
|
||||
" \n",
|
||||
" # Start ray head (single node)\n",
|
||||
" os.system('ray start --head')\n",
|
||||
" ray.init(address='auto')\n",
|
||||
"\n",
|
||||
" # Run training task using tune.run\n",
|
||||
" tune.run(\n",
|
||||
" run_or_experiment=args.run,\n",
|
||||
" config=dict(args.config, env=args.env),\n",
|
||||
" stop=args.stop,\n",
|
||||
" checkpoint_freq=args.checkpoint_freq,\n",
|
||||
" checkpoint_at_end=args.checkpoint_at_end,\n",
|
||||
" local_dir=args.local_dir\n",
|
||||
" )\n",
|
||||
"```yaml\n",
|
||||
"cartpole-ppo:\n",
|
||||
" env: CartPole-v1\n",
|
||||
" run: PPO\n",
|
||||
" stop:\n",
|
||||
" episode_reward_mean: 475\n",
|
||||
" time_total_s: 300\n",
|
||||
" checkpoint_config:\n",
|
||||
" checkpoint_frequency: 2\n",
|
||||
" checkpoint_at_end: true\n",
|
||||
" config:\n",
|
||||
" # Works for both torch and tf.\n",
|
||||
" framework: torch\n",
|
||||
" gamma: 0.99\n",
|
||||
" lr: 0.0003\n",
|
||||
" num_workers: 1\n",
|
||||
" observation_filter: MeanStdFilter\n",
|
||||
" num_sgd_iter: 6\n",
|
||||
" vf_loss_coeff: 0.01\n",
|
||||
" model:\n",
|
||||
" fcnet_hiddens: [32]\n",
|
||||
" fcnet_activation: linear\n",
|
||||
" vf_share_layers: true\n",
|
||||
" enable_connectors: true\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
@@ -386,7 +370,7 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646347127671
|
||||
"logged": 1683064049813
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -408,7 +392,11 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683064050024
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Uncomment line below to cancel the run\n",
|
||||
@@ -430,7 +418,7 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646347318682
|
||||
"logged": 1683064304728
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -463,7 +451,7 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646347328505
|
||||
"logged": 1683064305251
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -471,7 +459,7 @@
|
||||
"from os import path\n",
|
||||
"from distutils import dir_util\n",
|
||||
"\n",
|
||||
"training_artifacts_path = path.join(\"logs\", training_algorithm)\n",
|
||||
"training_artifacts_path = path.join(\"logs\", \"cartpole-ppo\")\n",
|
||||
"print(\"Training artifacts path:\", training_artifacts_path)\n",
|
||||
"\n",
|
||||
"if path.exists(training_artifacts_path):\n",
|
||||
@@ -493,19 +481,20 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646347334571
|
||||
"logged": 1683064305283
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A helper function to find checkpoint files in a directory\n",
|
||||
"# A helper function to find all of the checkpoint directories located within a larger directory tree\n",
|
||||
"def find_checkpoints(file_path):\n",
|
||||
" print(\"Looking in path:\", file_path)\n",
|
||||
" checkpoints = []\n",
|
||||
" for root, _, files in os.walk(file_path):\n",
|
||||
" for name in files:\n",
|
||||
" if os.path.basename(root).startswith('checkpoint_'):\n",
|
||||
" checkpoints.append(path.join(root, name))\n",
|
||||
" for root, dirs, files in os.walk(file_path):\n",
|
||||
" trimmed_root = root[len(file_path)+1:]\n",
|
||||
" for name in dirs:\n",
|
||||
" if name.startswith('checkpoint_'):\n",
|
||||
" checkpoints.append(path.join(trimmed_root, name))\n",
|
||||
" return checkpoints"
|
||||
]
|
||||
},
|
||||
@@ -514,7 +503,7 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646347337724
|
||||
"logged": 1683064305305
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -522,16 +511,16 @@
|
||||
"# Find checkpoints and last checkpoint number\n",
|
||||
"checkpoint_files = find_checkpoints(training_artifacts_path)\n",
|
||||
"\n",
|
||||
"checkpoint_numbers = []\n",
|
||||
"for file in checkpoint_files:\n",
|
||||
" file = os.path.basename(file)\n",
|
||||
" if file.startswith('checkpoint-') and not file.endswith('.tune_metadata'):\n",
|
||||
" checkpoint_numbers.append(int(file.split('-')[1]))\n",
|
||||
"last_checkpoint_path = None\n",
|
||||
"last_checkpoint_number = -1\n",
|
||||
"for checkpoint_file in checkpoint_files:\n",
|
||||
" checkpoint_number = int(os.path.basename(checkpoint_file).split('_')[1])\n",
|
||||
" if checkpoint_number > last_checkpoint_number:\n",
|
||||
" last_checkpoint_path = checkpoint_file\n",
|
||||
" last_checkpoint_number = checkpoint_number\n",
|
||||
"\n",
|
||||
"print(\"Checkpoints:\", checkpoint_numbers)\n",
|
||||
"\n",
|
||||
"last_checkpoint_number = max(checkpoint_numbers)\n",
|
||||
"print(\"Last checkpoint number:\", last_checkpoint_number)"
|
||||
"print(\"Last checkpoint number:\", last_checkpoint_number)\n",
|
||||
"print(\"Last checkpoint path:\", last_checkpoint_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -546,17 +535,16 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646347346085
|
||||
"logged": 1683064305331
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Upload the checkpoint files and create a DataSet\n",
|
||||
"from azureml.core import Dataset\n",
|
||||
"from azureml.data.dataset_factory import FileDatasetFactory\n",
|
||||
"\n",
|
||||
"datastore = ws.get_default_datastore()\n",
|
||||
"checkpoint_dataref = datastore.upload_files(checkpoint_files, target_path='cartpole_checkpoints_' + training_run.id, overwrite=True)\n",
|
||||
"checkpoint_ds = Dataset.File.from_files(checkpoint_dataref)"
|
||||
"checkpoint_ds = FileDatasetFactory.upload_directory(training_artifacts_path, (datastore, 'cartpole_checkpoints_' + training_run.id), overwrite=False, show_progress=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -571,7 +559,7 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646347354726
|
||||
"logged": 1683064305353
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -598,7 +586,7 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646347414835
|
||||
"logged": 1683064305371
|
||||
},
|
||||
"jupyter": {
|
||||
"outputs_hidden": false,
|
||||
@@ -614,23 +602,18 @@
|
||||
"source": [
|
||||
"ray_environment_name = 'cartpole-ray-ci'\n",
|
||||
"\n",
|
||||
"experiment_name = 'CartPole-v0-CI'\n",
|
||||
"training_algorithm = 'PPO'\n",
|
||||
"rl_environment = 'CartPole-v0'\n",
|
||||
"experiment_name = 'CartPole-v1-CI'\n",
|
||||
"\n",
|
||||
"experiment = Experiment(workspace=ws, name=experiment_name)\n",
|
||||
"ray_environment = Environment.get(workspace=ws, name=ray_environment_name)\n",
|
||||
"\n",
|
||||
"script_name = 'cartpole_rollout.py'\n",
|
||||
"script_arguments = [\n",
|
||||
" '--run', training_algorithm,\n",
|
||||
" '--env', rl_environment,\n",
|
||||
" '--config', '{}',\n",
|
||||
" '--steps', '2000',\n",
|
||||
" '--checkpoint-number', str(last_checkpoint_number),\n",
|
||||
" '--no-render',\n",
|
||||
" '--artifacts-dataset', checkpoint_ds.as_named_input('artifacts_dataset'),\n",
|
||||
" '--artifacts-path', checkpoint_ds.as_named_input('artifacts_path').as_mount()\n",
|
||||
" '--checkpoint', last_checkpoint_path,\n",
|
||||
" '--algo', 'PPO',\n",
|
||||
" '--render', 'false',\n",
|
||||
" '--dataset_path', checkpoint_ds.as_named_input('dataset_path').as_mount()\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"aml_run_config_ml = RunConfiguration(communicator='OpenMpi')\n",
|
||||
@@ -653,7 +636,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"And then, similar to the training section, we can monitor the real-time progress of the rollout run and its chid as follows. If you browse logs of the child run you can see the evaluation results recorded in driver_log.txt file. Note that you may need to wait several minutes before these results become available."
|
||||
"And then, similar to the training section, we can monitor the real-time progress of the rollout run and its chid as follows. If you browse logs of the child run you can see the evaluation results recorded in std_log_process_0.txt file. Note that you may need to wait several minutes before these results become available."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -661,7 +644,7 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646347429626
|
||||
"logged": 1683064305399
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -679,7 +662,11 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683064305419
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Uncomment line below to cancel the run\n",
|
||||
@@ -698,7 +685,11 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683064305437
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# To archive the created experiment:\n",
|
||||
@@ -750,13 +741,16 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.3"
|
||||
"version": "3.8.5"
|
||||
},
|
||||
"microsoft": {
|
||||
"host": {
|
||||
"AzureML": {
|
||||
"notebookHasBeenCompleted": true
|
||||
}
|
||||
},
|
||||
"ms_spell_check": {
|
||||
"ms_spell_check_language": "en"
|
||||
}
|
||||
},
|
||||
"notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License.",
|
||||
|
||||
@@ -0,0 +1,23 @@
|
||||
cartpole-ppo:
|
||||
env: CartPole-v1
|
||||
run: PPO
|
||||
stop:
|
||||
episode_reward_mean: 475
|
||||
time_total_s: 300
|
||||
checkpoint_config:
|
||||
checkpoint_frequency: 2
|
||||
checkpoint_at_end: true
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
framework: torch
|
||||
gamma: 0.99
|
||||
lr: 0.0003
|
||||
num_workers: 1
|
||||
observation_filter: MeanStdFilter
|
||||
num_sgd_iter: 6
|
||||
vf_loss_coeff: 0.01
|
||||
model:
|
||||
fcnet_hiddens: [32]
|
||||
fcnet_activation: linear
|
||||
vf_share_layers: true
|
||||
enable_connectors: true
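
For readers who prefer to launch this spec from Python rather than from the yaml file, a minimal sketch follows. It simply mirrors the yaml above and the pattern used by `cartpole_training.py` later in this change; the experiment keys follow that file's schema, the `storage_path` entry is the value the training script injects, and `ray[rllib]==2.4.0` (as pinned in the Dockerfiles here) is assumed.

```python
# Minimal sketch (assumptions noted above): the cartpole-ppo.yaml experiment expressed as
# the equivalent Python dict and launched with Ray Tune's run_experiments, the same entry
# point cartpole_training.py uses.
from ray.tune.tune import run_experiments

experiment_config = {
    "cartpole-ppo": {
        "env": "CartPole-v1",
        "run": "PPO",
        "stop": {"episode_reward_mean": 475, "time_total_s": 300},
        "checkpoint_config": {"checkpoint_frequency": 2, "checkpoint_at_end": True},
        "config": {
            "framework": "torch",
            "gamma": 0.99,
            "lr": 0.0003,
            "num_workers": 1,
            "observation_filter": "MeanStdFilter",
            "num_sgd_iter": 6,
            "vf_loss_coeff": 0.01,
            "model": {
                "fcnet_hiddens": [32],
                "fcnet_activation": "linear",
                "vf_share_layers": True,
            },
            "enable_connectors": True,
        },
        "storage_path": "./logs",  # where Tune writes logs and checkpoints
    }
}

run_experiments(experiment_config, verbose=2)
```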
|
||||
@@ -1,121 +1,108 @@
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
import ray
|
||||
from ray.rllib import rollout
|
||||
from ray.tune.registry import get_trainable_cls
|
||||
from ray.rllib.evaluate import RolloutSaver, rollout
|
||||
from ray_on_aml.core import Ray_On_AML
|
||||
import ray.cloudpickle as cloudpickle
|
||||
from ray.tune.utils import merge_dicts
|
||||
from ray.tune.registry import get_trainable_cls, _global_registry, ENV_CREATOR
|
||||
|
||||
from azureml.core import Run
|
||||
|
||||
from utils import callbacks
|
||||
|
||||
import collections
|
||||
import copy
|
||||
import gymnasium as gym
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
def run_rollout(args, parser):
|
||||
|
||||
config = args.config
|
||||
if not args.env:
|
||||
if not config.get("env"):
|
||||
parser.error("the following arguments are required: --env")
|
||||
args.env = config.get("env")
|
||||
def run_rollout(checkpoint, algo, render, steps, episodes):
|
||||
config_dir = os.path.dirname(checkpoint)
|
||||
config_path = os.path.join(config_dir, "params.pkl")
|
||||
config = None
|
||||
|
||||
# Create the Trainer from config.
|
||||
cls = get_trainable_cls(args.run)
|
||||
agent = cls(env=args.env, config=config)
|
||||
# Try parent directory.
|
||||
if not os.path.exists(config_path):
|
||||
config_path = os.path.join(config_dir, "../params.pkl")
|
||||
|
||||
# Load state from checkpoint.
|
||||
agent.restore(args.checkpoint)
|
||||
num_steps = int(args.steps)
|
||||
num_episodes = int(args.episodes)
|
||||
# Load the config from pickled.
|
||||
if os.path.exists(config_path):
|
||||
with open(config_path, "rb") as f:
|
||||
config = cloudpickle.load(f)
|
||||
# If no pkl file found, require command line `--config`.
|
||||
else:
|
||||
raise ValueError("Could not find params.pkl in either the checkpoint dir or its parent directory")
|
||||
|
||||
# Determine the video output directory.
|
||||
use_arg_monitor = False
|
||||
try:
|
||||
args.video_dir
|
||||
except AttributeError:
|
||||
print("There is no such attribute: args.video_dir")
|
||||
use_arg_monitor = True
|
||||
# Make sure worker 0 has an Env.
|
||||
config["create_env_on_driver"] = True
|
||||
|
||||
video_dir = None
|
||||
if not use_arg_monitor:
|
||||
if args.monitor:
|
||||
video_dir = os.path.join("./logs", "video")
|
||||
elif args.video_dir:
|
||||
video_dir = os.path.expanduser(args.video_dir)
|
||||
# Merge with `evaluation_config` (first try from command line, then from
|
||||
# pkl file).
|
||||
evaluation_config = copy.deepcopy(config.get("evaluation_config", {}))
|
||||
config = merge_dicts(config, evaluation_config)
|
||||
env = config.get("env")
|
||||
|
||||
# Make sure we have evaluation workers.
|
||||
if not config.get("evaluation_num_workers"):
|
||||
config["evaluation_num_workers"] = config.get("num_workers", 0)
|
||||
if not config.get("evaluation_duration"):
|
||||
config["evaluation_duration"] = 1
|
||||
|
||||
# Hard-override this as it raises a warning by Algorithm otherwise.
|
||||
# Makes no sense anyways, to have it set to None as we don't call
|
||||
# `Algorithm.train()` here.
|
||||
config["evaluation_interval"] = 1
|
||||
|
||||
# Rendering settings.
|
||||
config["render_env"] = render
|
||||
|
||||
# Create the Algorithm from config.
|
||||
cls = get_trainable_cls(algo)
|
||||
algorithm = cls(env=env, config=config)
|
||||
|
||||
# Load state from checkpoint, if provided.
|
||||
if checkpoint:
|
||||
algorithm.restore(checkpoint)
|
||||
|
||||
# Do the actual rollout.
|
||||
with rollout.RolloutSaver(
|
||||
args.out,
|
||||
args.use_shelve,
|
||||
write_update_file=args.track_progress,
|
||||
target_steps=num_steps,
|
||||
target_episodes=num_episodes,
|
||||
save_info=args.save_info) as saver:
|
||||
if use_arg_monitor:
|
||||
rollout.rollout(
|
||||
agent,
|
||||
args.env,
|
||||
num_steps,
|
||||
num_episodes,
|
||||
saver,
|
||||
args.no_render,
|
||||
args.monitor)
|
||||
else:
|
||||
rollout.rollout(
|
||||
agent, args.env,
|
||||
num_steps,
|
||||
num_episodes,
|
||||
saver,
|
||||
args.no_render, video_dir)
|
||||
with RolloutSaver(
|
||||
outfile=None,
|
||||
use_shelve=False,
|
||||
write_update_file=False,
|
||||
target_steps=steps,
|
||||
target_episodes=episodes,
|
||||
save_info=False,
|
||||
) as saver:
|
||||
rollout(algorithm, env, steps, episodes, saver, not render)
|
||||
algorithm.stop()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
# Start ray head (single node)
|
||||
os.system('ray start --head')
|
||||
ray.init(address='auto')
|
||||
ray_on_aml = Ray_On_AML()
|
||||
ray = ray_on_aml.getRay()
|
||||
if ray:
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--dataset_path', required=True, help='Path to artifacts dataset')
|
||||
parser.add_argument('--checkpoint', required=True, help='Name of checkpoint file directory')
|
||||
parser.add_argument('--algo', required=True, help='Name of RL algorithm')
|
||||
parser.add_argument('--render', default=False, required=False, help='True to render')
|
||||
parser.add_argument('--steps', required=False, type=int, help='Number of steps to run')
|
||||
parser.add_argument('--episodes', required=False, type=int, help='Number of episodes to run')
|
||||
args = parser.parse_args()
|
||||
|
||||
# Add positional argument - serves as placeholder for checkpoint
|
||||
argvc = sys.argv[1:]
|
||||
argvc.insert(0, 'checkpoint-placeholder')
|
||||
# Get a handle to run
|
||||
run = Run.get_context()
|
||||
|
||||
# Parse arguments
|
||||
rollout_parser = rollout.create_parser()
|
||||
# Get handles to the training artifacts dataset and mount path
|
||||
dataset_path = run.input_datasets['dataset_path']
|
||||
|
||||
rollout_parser.add_argument(
|
||||
'--checkpoint-number', required=False, type=int, default=1,
|
||||
help='Checkpoint number of the checkpoint from which to roll out')
|
||||
# Find checkpoint file to be evaluated
|
||||
checkpoint = os.path.join(dataset_path, args.checkpoint)
|
||||
print('Checkpoint:', checkpoint)
|
||||
|
||||
rollout_parser.add_argument(
|
||||
'--artifacts-dataset', required=True,
|
||||
help='The checkpoints artifacts dataset')
|
||||
|
||||
rollout_parser.add_argument(
|
||||
'--artifacts-path', required=True,
|
||||
help='The checkpoints artifacts path')
|
||||
|
||||
args = rollout_parser.parse_args(argvc)
|
||||
|
||||
# Get a handle to run
|
||||
run = Run.get_context()
|
||||
|
||||
# Get handles to the training artifacts dataset and mount path
|
||||
artifacts_dataset = run.input_datasets['artifacts_dataset']
|
||||
artifacts_path = run.input_datasets['artifacts_path']
|
||||
|
||||
# Find checkpoint file to be evaluated
|
||||
checkpoint_id = '-' + str(args.checkpoint_number)
|
||||
checkpoint_files = list(filter(
|
||||
lambda filename: filename.endswith(checkpoint_id),
|
||||
artifacts_dataset.to_path()))
|
||||
|
||||
checkpoint_file = checkpoint_files[0]
|
||||
if checkpoint_file[0] == '/':
|
||||
checkpoint_file = checkpoint_file[1:]
|
||||
checkpoint = os.path.join(artifacts_path, checkpoint_file)
|
||||
print('Checkpoint:', checkpoint)
|
||||
|
||||
# Set rollout checkpoint
|
||||
args.checkpoint = checkpoint
|
||||
|
||||
# Start rollout
|
||||
run_rollout(args, rollout_parser)
|
||||
# Start rollout
|
||||
ray.init(address='auto')
|
||||
run_rollout(checkpoint, args.algo, args.render, args.steps, args.episodes)
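
A hedged sketch of exercising `run_rollout` above outside Azure ML, for example while debugging locally. It assumes the packages pinned in the Dockerfile of this change are installed, that `cartpole_rollout.py` and `utils/` are importable from the working directory, and that the checkpoint path (illustrative only) was produced by the PPO training run with `params.pkl` in its parent trial directory.

```python
# Local debugging sketch; the checkpoint path and step/episode counts are assumptions.
import ray
from cartpole_rollout import run_rollout

ray.init()
run_rollout(
    checkpoint="./logs/cartpole-ppo/PPO_CartPole-v1_00000/checkpoint_000010",
    algo="PPO",
    render=False,   # no display available when debugging headless
    steps=2000,
    episodes=1,
)
ray.shutdown()
```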
|
||||
|
||||
@@ -1,32 +1,34 @@
|
||||
import ray
|
||||
from ray.rllib import train
|
||||
from ray import tune
|
||||
import os
|
||||
|
||||
from ray_on_aml.core import Ray_On_AML
|
||||
import yaml
|
||||
from ray.tune.tune import run_experiments
|
||||
from utils import callbacks
|
||||
import argparse
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--config', help='Path to yaml configuration file')
|
||||
args = parser.parse_args()
|
||||
|
||||
# Parse arguments and add callbacks to config
|
||||
train_parser = train.create_parser()
|
||||
ray_on_aml = Ray_On_AML()
|
||||
ray = ray_on_aml.getRay()
|
||||
if ray: # in the headnode
|
||||
ray.init(address="auto")
|
||||
print("Configuring run from file: ", args.config)
|
||||
experiment_config = None
|
||||
with open(args.config, "r") as file:
|
||||
experiment_config = yaml.safe_load(file)
|
||||
|
||||
args = train_parser.parse_args()
|
||||
args.config["callbacks"] = {"on_train_result": callbacks.on_train_result}
|
||||
# Set local_dir in each experiment configuration to ensure generated logs get picked up
|
||||
# Also set monitor to ensure videos are captured
|
||||
for experiment_name, experiment in experiment_config.items():
|
||||
experiment["storage_path"] = "./logs"
|
||||
experiment['config']['monitor'] = True
|
||||
print(f'Config: {experiment_config}')
|
||||
|
||||
# Trace if video capturing is on
|
||||
if 'monitor' in args.config and args.config['monitor']:
|
||||
print("Video capturing is ON!")
|
||||
|
||||
# Start ray head (single node)
|
||||
os.system('ray start --head')
|
||||
ray.init(address='auto')
|
||||
|
||||
# Run training task using tune.run
|
||||
tune.run(
|
||||
run_or_experiment=args.run,
|
||||
config=dict(args.config, env=args.env),
|
||||
stop=args.stop,
|
||||
checkpoint_freq=args.checkpoint_freq,
|
||||
checkpoint_at_end=args.checkpoint_at_end,
|
||||
local_dir=args.local_dir
|
||||
)
|
||||
trials = run_experiments(
|
||||
experiment_config,
|
||||
callbacks=[callbacks.TrialCallback()],
|
||||
verbose=2
|
||||
)
|
||||
else:
|
||||
print("in worker node")
|
||||
|
||||
@@ -1,19 +1,27 @@
|
||||
FROM mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04
|
||||
|
||||
USER root
|
||||
RUN conda install -c anaconda python=3.7
|
||||
RUN pip install ray-on-aml==0.2.4 \
|
||||
ray==2.4.0 \
|
||||
ray[rllib]==2.4.0 \
|
||||
mlflow==2.3.1 \
|
||||
azureml-defaults==1.50.0 \
|
||||
azureml-dataset-runtime[fuse,pandas]==1.50.0 \
|
||||
azureml-contrib-reinforcementlearning==1.50.0 \
|
||||
gputil==1.4.0 \
|
||||
scipy==1.9.1 \
|
||||
pyglet==2.0.6 \
|
||||
cloudpickle==2.2.1 \
|
||||
tensorflow==2.11.0 \
|
||||
tensorflow-probability==0.19.0 \
|
||||
torch \
|
||||
tabulate==0.9.0 \
|
||||
dm_tree==0.1.8 \
|
||||
lz4==4.3.2 \
|
||||
psutil==5.9.4 \
|
||||
setproctitle==1.3.2 \
|
||||
pygame==2.1.0 \
|
||||
gymnasium[classic_control]==0.26.3 \
|
||||
gym[classic_control]==0.26.2
|
||||
|
||||
RUN pip install ray-on-aml==0.1.6
|
||||
RUN pip install gym[atari]==0.19.0
|
||||
RUN pip install gym[accept-rom-license]==0.19.0
|
||||
RUN pip install ale-py==0.7.0
|
||||
RUN pip install azureml-core
|
||||
RUN pip install azureml-dataset-runtime
|
||||
RUN pip install ray==0.8.7
|
||||
RUN pip install ray[rllib,tune,serve]==0.8.7
|
||||
RUN pip install tensorflow==1.14.0
|
||||
RUN pip install 'msrest<0.7.0'
|
||||
|
||||
RUN apt-get update
|
||||
RUN apt-get install -y jq
|
||||
RUN apt-get install -y rsync
|
||||
# Display the exact versions we have installed
|
||||
RUN pip freeze
|
||||
|
||||
@@ -3,15 +3,20 @@
|
||||
'''
|
||||
|
||||
from azureml.core import Run
|
||||
from ray import tune
|
||||
from ray.tune import Callback
|
||||
from ray.air import session
|
||||
|
||||
|
||||
def on_train_result(info):
|
||||
'''Callback on train result to record metrics returned by trainer.
|
||||
'''
|
||||
run = Run.get_context()
|
||||
run.log(
|
||||
name='episode_reward_mean',
|
||||
value=info["result"]["episode_reward_mean"])
|
||||
run.log(
|
||||
name='episodes_total',
|
||||
value=info["result"]["episodes_total"])
|
||||
class TrialCallback(Callback):
|
||||
|
||||
def on_trial_result(self, iteration, trials, trial, result, **info):
|
||||
'''Callback on train result to record metrics returned by trainer.
|
||||
'''
|
||||
run = Run.get_context()
|
||||
run.log(
|
||||
name='episode_reward_mean',
|
||||
value=result["episode_reward_mean"])
|
||||
run.log(
|
||||
name='episodes_total',
|
||||
value=result["episodes_total"])
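
The callback above is what surfaces RLlib's per-iteration metrics in the Azure ML run history. A minimal sketch of wiring it into a Tune run, mirroring how `cartpole_training.py` in this change passes it (the tiny experiment spec here is illustrative only):

```python
# Sketch: attach TrialCallback so episode_reward_mean / episodes_total land in the AML run.
from ray.tune.tune import run_experiments

from utils import callbacks

run_experiments(
    {
        "cartpole-ppo-smoke": {
            "run": "PPO",
            "env": "CartPole-v1",
            "stop": {"time_total_s": 60},  # short illustrative budget
        }
    },
    callbacks=[callbacks.TrialCallback()],
    verbose=2,
)
```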
|
||||
|
||||
@@ -84,7 +84,7 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646347616697
|
||||
"logged": 1683056824182
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -107,7 +107,7 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646429058500
|
||||
"logged": 1683056825821
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -136,7 +136,7 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646359152101
|
||||
"logged": 1683056826903
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -181,14 +181,14 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646348040613
|
||||
"logged": 1683056827252
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.experiment import Experiment\n",
|
||||
"\n",
|
||||
"experiment_name = 'CartPole-v0-SC'\n",
|
||||
"experiment_name = 'CartPole-v1-SC'\n",
|
||||
"experiment = Experiment(workspace=ws, name=experiment_name)"
|
||||
]
|
||||
},
|
||||
@@ -250,7 +250,7 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646437786449
|
||||
"logged": 1683059658819
|
||||
},
|
||||
"jupyter": {
|
||||
"outputs_hidden": false,
|
||||
@@ -264,44 +264,31 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Environment\n",
|
||||
"from azureml.core import RunConfiguration, ScriptRunConfig, Experiment\n",
|
||||
"from azureml.core.runconfig import DockerConfiguration, RunConfiguration\n",
|
||||
"\n",
|
||||
"training_algorithm = \"PPO\"\n",
|
||||
"rl_environment = \"CartPole-v0\"\n",
|
||||
"video_capture = True\n",
|
||||
"if video_capture:\n",
|
||||
" algorithm_config = '\\'{\"num_gpus\": 0, \"num_workers\": 1, \"monitor\": true}\\''\n",
|
||||
"else:\n",
|
||||
" algorithm_config = '\\'{\"num_gpus\": 0, \"num_workers\": 1, \"monitor\": false}\\''\n",
|
||||
"\n",
|
||||
"config_name = 'cartpole-ppo.yaml'\n",
|
||||
"script_name = 'cartpole_training.py'\n",
|
||||
"video_capture = True\n",
|
||||
"script_arguments = [\n",
|
||||
" '--run', training_algorithm,\n",
|
||||
" '--env', rl_environment,\n",
|
||||
" '--stop', '\\'{\"episode_reward_mean\": 200, \"time_total_s\": 300}\\'',\n",
|
||||
" '--config', algorithm_config,\n",
|
||||
" '--checkpoint-freq', '2',\n",
|
||||
" '--checkpoint-at-end',\n",
|
||||
" '--local-dir', './logs'\n",
|
||||
" '--config', config_name\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"ray_environment = Environment.get(ws, name=ray_environment_name)\n",
|
||||
"run_config = RunConfiguration(communicator='OpenMpi')\n",
|
||||
"run_config.target = compute_target\n",
|
||||
"run_config.node_count = 1\n",
|
||||
"run_config.environment = ray_environment\n",
|
||||
"command=[\"python\", script_name, *script_arguments]\n",
|
||||
"\n",
|
||||
"aml_run_config_ml = RunConfiguration(communicator='OpenMpi')\n",
|
||||
"aml_run_config_ml.target = compute_target\n",
|
||||
"aml_run_config_ml.node_count = 1\n",
|
||||
"aml_run_config_ml.environment = ray_environment\n",
|
||||
"\n",
|
||||
"if video_capture:\n",
|
||||
" command = [\"xvfb-run -s '-screen 0 640x480x16 -ac +extension GLX +render' \"] + command\n",
|
||||
" run_config.environment_variables[\"SDL_VIDEODRIVER\"] = \"dummy\"\n",
|
||||
" aml_run_config_ml.environment_variables[\"SDL_VIDEODRIVER\"] = \"dummy\"\n",
|
||||
"\n",
|
||||
"training_config = ScriptRunConfig(source_directory='./files',\n",
|
||||
" command=command,\n",
|
||||
" run_config = run_config\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" command=command,\n",
|
||||
" run_config = aml_run_config_ml\n",
|
||||
" )\n",
|
||||
"training_run = experiment.submit(training_config)"
|
||||
]
|
||||
},
|
||||
@@ -309,42 +296,35 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Training script\n",
|
||||
"### Training configuration\n",
|
||||
"\n",
|
||||
"As recommended in RLlib documentations, we use Ray Tune API to run the training algorithm. All the RLlib built-in trainers are compatible with the Tune API. Here we use `tune.run()` to execute a built-in training algorithm. For convenience, down below you can see part of the entry script where we make this call.\n",
|
||||
"This is the training configuration (in yaml) that we use to train an agent to solve the CartPole problem using\n",
|
||||
"the PPO algorithm.\n",
|
||||
"\n",
|
||||
"This is the list of parameters we are passing into `tune.run()` via the `script_params` parameter:\n",
|
||||
"\n",
|
||||
"- `run_or_experiment`: name of the [built-in algorithm](https://ray.readthedocs.io/en/latest/rllib-algorithms.html#rllib-algorithms), 'PPO' in our example,\n",
|
||||
"- `config`: Algorithm-specific configuration. This includes specifying the environment, `env`, which in our example is the gym **[CartPole-v0](https://www.gymlibrary.dev/environments/classic_control/cart_pole/)** environment,\n",
|
||||
"- `stop`: stopping conditions, which could be any of the metrics returned by the trainer. Here we use \"mean of episode reward\", and \"total training time in seconds\" as stop conditions, and\n",
|
||||
"- `checkpoint_freq` and `checkpoint_at_end`: Frequency of taking checkpoints (number of training iterations between checkpoints), and if a checkpoint should be taken at the end.\n",
|
||||
"\n",
|
||||
"We also specify the `local_dir`, the directory in which the training logs, checkpoints and other training artificats will be recorded. \n",
|
||||
"\n",
|
||||
"See [RLlib Training APIs](https://ray.readthedocs.io/en/latest/rllib-training.html#rllib-training-apis) for more details, and also [Training (tune.run, tune.Experiment)](https://ray.readthedocs.io/en/latest/tune/api_docs/execution.html#training-tune-run-tune-experiment) for the complete list of parameters.\n",
|
||||
"\n",
|
||||
"```python\n",
|
||||
"import ray\n",
|
||||
"import ray.tune as tune\n",
|
||||
"\n",
|
||||
"if __name__ == \"__main__\":\n",
|
||||
"\n",
|
||||
" # parse arguments ...\n",
|
||||
" \n",
|
||||
" # Start ray head (single node)\n",
|
||||
" os.system('ray start --head')\n",
|
||||
" ray.init(address='auto')\n",
|
||||
"\n",
|
||||
" # Run training task using tune.run\n",
|
||||
" tune.run(\n",
|
||||
" run_or_experiment=args.run,\n",
|
||||
" config=dict(args.config, env=args.env),\n",
|
||||
" stop=args.stop,\n",
|
||||
" checkpoint_freq=args.checkpoint_freq,\n",
|
||||
" checkpoint_at_end=args.checkpoint_at_end,\n",
|
||||
" local_dir=args.local_dir\n",
|
||||
" )\n",
|
||||
"```yaml\n",
|
||||
"cartpole-ppo:\n",
|
||||
" env: CartPole-v1\n",
|
||||
" run: PPO\n",
|
||||
" stop:\n",
|
||||
" episode_reward_mean: 475\n",
|
||||
" time_total_s: 300\n",
|
||||
" checkpoint_config:\n",
|
||||
" checkpoint_frequency: 2\n",
|
||||
" checkpoint_at_end: true\n",
|
||||
" config:\n",
|
||||
" # Works for both torch and tf.\n",
|
||||
" framework: torch\n",
|
||||
" gamma: 0.99\n",
|
||||
" lr: 0.0003\n",
|
||||
" num_workers: 1\n",
|
||||
" observation_filter: MeanStdFilter\n",
|
||||
" num_sgd_iter: 6\n",
|
||||
" vf_loss_coeff: 0.01\n",
|
||||
" model:\n",
|
||||
" fcnet_hiddens: [32]\n",
|
||||
" fcnet_activation: linear\n",
|
||||
" vf_share_layers: true\n",
|
||||
" enable_connectors: true\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
@@ -362,7 +342,7 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646437627002
|
||||
"logged": 1683060289002
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -403,7 +383,11 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683060297005
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"training_run.wait_for_completion()"
|
||||
@@ -420,7 +404,11 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683060517858
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Run\n",
|
||||
@@ -441,7 +429,7 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646437652309
|
||||
"logged": 1683060521847
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -449,7 +437,7 @@
|
||||
"from os import path\n",
|
||||
"from distutils import dir_util\n",
|
||||
"\n",
|
||||
"training_artifacts_path = path.join(\"logs\", training_algorithm)\n",
|
||||
"training_artifacts_path = path.join(\"logs\", \"cartpole-ppo\")\n",
|
||||
"print(\"Training artifacts path:\", training_artifacts_path)\n",
|
||||
"\n",
|
||||
"if path.exists(training_artifacts_path):\n",
|
||||
@@ -475,7 +463,7 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646437657045
|
||||
"logged": 1683060867182
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -514,7 +502,7 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646437690241
|
||||
"logged": 1683060871682
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -535,7 +523,7 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646437692954
|
||||
"logged": 1683060900828
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -543,7 +531,8 @@
|
||||
"first_movie = mp4_files[0] if len(mp4_files) > 0 else None\n",
|
||||
"print(\"First movie:\", first_movie)\n",
|
||||
"\n",
|
||||
"display_movie(first_movie)"
|
||||
"if first_movie:\n",
|
||||
" display_movie(first_movie)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -558,7 +547,7 @@
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646437717147
|
||||
"logged": 1683060914790
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -566,7 +555,8 @@
|
||||
"last_movie = mp4_files[-1] if len(mp4_files) > 0 else None\n",
|
||||
"print(\"Last movie:\", last_movie)\n",
|
||||
"\n",
|
||||
"display_movie(last_movie)"
|
||||
"if last_movie:\n",
|
||||
" display_movie(last_movie)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -597,53 +587,65 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683061167899
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# A helper function to find checkpoint files in a directory\n",
|
||||
"# A helper function to find all of the checkpoint directories located within a larger directory tree\n",
|
||||
"def find_checkpoints(file_path):\n",
|
||||
" print(\"Looking in path:\", file_path)\n",
|
||||
" checkpoints = []\n",
|
||||
" for root, _, files in os.walk(file_path):\n",
|
||||
" for name in files:\n",
|
||||
" if os.path.basename(root).startswith('checkpoint_'):\n",
|
||||
" checkpoints.append(path.join(root, name))\n",
|
||||
" return checkpoints\n",
|
||||
"\n",
|
||||
"checkpoint_files = find_checkpoints(training_artifacts_path)"
|
||||
" for root, dirs, files in os.walk(file_path):\n",
|
||||
" trimmed_root = root[len(file_path)+1:]\n",
|
||||
" for name in dirs:\n",
|
||||
" if name.startswith('checkpoint_'):\n",
|
||||
" checkpoints.append(path.join(trimmed_root, name))\n",
|
||||
" return checkpoints"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683061170184
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Find checkpoints and last checkpoint number\n",
|
||||
"checkpoint_numbers = []\n",
|
||||
"for file in checkpoint_files:\n",
|
||||
" file = os.path.basename(file)\n",
|
||||
" if file.startswith('checkpoint-') and not file.endswith('.tune_metadata'):\n",
|
||||
" checkpoint_numbers.append(int(file.split('-')[-1]))\n",
|
||||
"checkpoint_files = find_checkpoints(training_artifacts_path)\n",
|
||||
"\n",
|
||||
"print(\"Checkpoints:\", checkpoint_numbers)\n",
|
||||
"last_checkpoint_path = None\n",
|
||||
"last_checkpoint_number = -1\n",
|
||||
"for checkpoint_file in checkpoint_files:\n",
|
||||
" checkpoint_number = int(os.path.basename(checkpoint_file).split('_')[1])\n",
|
||||
" if checkpoint_number > last_checkpoint_number:\n",
|
||||
" last_checkpoint_path = checkpoint_file\n",
|
||||
" last_checkpoint_number = checkpoint_number\n",
|
||||
"\n",
|
||||
"last_checkpoint_number = max(checkpoint_numbers)\n",
|
||||
"print(\"Last checkpoint number:\", last_checkpoint_number)"
|
||||
"print(\"Last checkpoint number:\", last_checkpoint_number)\n",
|
||||
"print(\"Last checkpoint path:\", last_checkpoint_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683061176740
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Upload the checkpoint files and create a DataSet\n",
|
||||
"from azureml.core import Dataset\n",
|
||||
"from azureml.data.dataset_factory import FileDatasetFactory\n",
|
||||
"\n",
|
||||
"datastore = ws.get_default_datastore()\n",
|
||||
"checkpoint_dataref = datastore.upload_files(checkpoint_files, target_path='cartpole_checkpoints_' + run_id, overwrite=True)\n",
|
||||
"checkpoint_ds = Dataset.File.from_files(checkpoint_dataref)"
|
||||
"checkpoint_ds = FileDatasetFactory.upload_directory(training_artifacts_path, (datastore, 'cartpole_checkpoints_' + training_run.id), overwrite=False, show_progress=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -657,54 +659,45 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683062377151
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ray_environment_name = 'cartpole-ray-sc'\n",
|
||||
"\n",
|
||||
"experiment_name = 'CartPole-v0-SC'\n",
|
||||
"experiment_name = 'CartPole-v1-SC'\n",
|
||||
"training_algorithm = 'PPO'\n",
|
||||
"rl_environment = 'CartPole-v0'\n",
|
||||
"rl_environment = 'CartPole-v1'\n",
|
||||
"\n",
|
||||
"experiment = Experiment(workspace=ws, name=experiment_name)\n",
|
||||
"ray_environment = Environment.get(workspace=ws, name=ray_environment_name)\n",
|
||||
"\n",
|
||||
"script_name = 'cartpole_rollout.py'\n",
|
||||
"video_capture = True\n",
|
||||
"if video_capture:\n",
|
||||
" script_arguments = ['--video-dir', './logs/video']\n",
|
||||
"else:\n",
|
||||
" script_arguments = ['--no-render']\n",
|
||||
"script_arguments = script_arguments + [\n",
|
||||
" '--run', training_algorithm,\n",
|
||||
" '--env', rl_environment,\n",
|
||||
" '--config', '{}',\n",
|
||||
"script_arguments = [\n",
|
||||
" '--steps', '2000',\n",
|
||||
" '--checkpoint-number', str(last_checkpoint_number),\n",
|
||||
" '--artifacts-dataset', checkpoint_ds.as_named_input('artifacts_dataset'),\n",
|
||||
" '--artifacts-path', checkpoint_ds.as_named_input('artifacts_path').as_mount()\n",
|
||||
" '--checkpoint', last_checkpoint_path,\n",
|
||||
" '--algo', 'PPO',\n",
|
||||
" '--render', 'true',\n",
|
||||
" '--dataset_path', checkpoint_ds.as_named_input('dataset_path').as_mount()\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"command = [\"python\", script_name, *script_arguments]\n",
|
||||
"\n",
|
||||
"if video_capture:\n",
|
||||
" command = [\"xvfb-run -s '-screen 0 640x480x16 -ac +extension GLX +render' \"] + command\n",
|
||||
" run_config.environment_variables[\"SDL_VIDEODRIVER\"] = \"dummy\"\n",
|
||||
"\n",
|
||||
"run_config = RunConfiguration(communicator='OpenMpi')\n",
|
||||
"run_config.target = compute_target\n",
|
||||
"run_config.docker = DockerConfiguration(use_docker=True)\n",
|
||||
"run_config.node_count = 1\n",
|
||||
"run_config.environment = ray_environment\n",
|
||||
"aml_run_config_ml = RunConfiguration(communicator='OpenMpi')\n",
|
||||
"aml_run_config_ml.target = compute_target\n",
|
||||
"aml_run_config_ml.node_count = 1\n",
|
||||
"aml_run_config_ml.environment = ray_environment\n",
|
||||
"aml_run_config_ml.data\n",
|
||||
"\n",
|
||||
"rollout_config = ScriptRunConfig(\n",
|
||||
" source_directory='./files',\n",
|
||||
" command=command,\n",
|
||||
" run_config=run_config\n",
|
||||
" script=script_name,\n",
|
||||
" arguments=script_arguments,\n",
|
||||
" run_config = aml_run_config_ml\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
"rollout_run = experiment.submit(rollout_config)\n",
|
||||
"rollout_run"
|
||||
" \n",
|
||||
"rollout_run = experiment.submit(rollout_config)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -717,7 +710,11 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683062379999
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"RunDetails(rollout_run).show()"
|
||||
@@ -733,7 +730,11 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683062451723
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Uncomment line below to cancel the run\n",
|
||||
@@ -753,7 +754,11 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683062747822
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Download rollout artifacts\n",
|
||||
@@ -777,7 +782,11 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683062752847
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Look for the downloaded movie in local directory\n",
|
||||
@@ -797,13 +806,18 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1683062763275
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"last_movie = mp4_files[-1] if len(mp4_files) > 0 else None\n",
|
||||
"print(\"Last movie:\", last_movie)\n",
|
||||
"\n",
|
||||
"display_movie(last_movie)"
|
||||
"if last_movie:\n",
|
||||
" display_movie(last_movie)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -876,7 +890,17 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.3"
|
||||
"version": "3.8.5"
|
||||
},
|
||||
"microsoft": {
|
||||
"host": {
|
||||
"AzureML": {
|
||||
"notebookHasBeenCompleted": true
|
||||
}
|
||||
},
|
||||
"ms_spell_check": {
|
||||
"ms_spell_check_language": "en"
|
||||
}
|
||||
},
|
||||
"notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License.",
|
||||
"nteract": {
|
||||
|
||||
@@ -0,0 +1,24 @@
|
||||
cartpole-ppo:
|
||||
env: CartPole-v1
|
||||
run: PPO
|
||||
stop:
|
||||
episode_reward_mean: 475
|
||||
time_total_s: 300
|
||||
checkpoint_config:
|
||||
checkpoint_frequency: 2
|
||||
checkpoint_at_end: true
|
||||
config:
|
||||
# Works for both torch and tf.
|
||||
framework: torch
|
||||
gamma: 0.99
|
||||
lr: 0.0003
|
||||
num_workers: 1
|
||||
observation_filter: MeanStdFilter
|
||||
num_sgd_iter: 6
|
||||
vf_loss_coeff: 0.01
|
||||
model:
|
||||
fcnet_hiddens: [32]
|
||||
fcnet_activation: linear
|
||||
vf_share_layers: true
|
||||
enable_connectors: true
|
||||
render_env: true
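
With `render_env: true` above (and the `monitor` flag the training script sets), the runs record .mp4 files among their artifacts, which the notebook then downloads and shows with its `display_movie` helper. The helper's definition is not part of this change; a possible implementation, offered purely as an assumption, is:

```python
# Hypothetical display_movie helper (not the notebook's actual code): embeds a local mp4
# in the notebook output.
from IPython.display import Video, display

def display_movie(movie_path):
    display(Video(movie_path, embed=True, width=480))
```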
|
||||
@@ -1,121 +1,108 @@
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
import ray
|
||||
from ray.rllib import rollout
|
||||
from ray.tune.registry import get_trainable_cls
|
||||
from ray.rllib.evaluate import RolloutSaver, rollout
|
||||
from ray_on_aml.core import Ray_On_AML
|
||||
import ray.cloudpickle as cloudpickle
|
||||
from ray.tune.utils import merge_dicts
|
||||
from ray.tune.registry import get_trainable_cls, _global_registry, ENV_CREATOR
|
||||
|
||||
from azureml.core import Run
|
||||
|
||||
from utils import callbacks
|
||||
|
||||
import collections
|
||||
import copy
|
||||
import gymnasium as gym
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
def run_rollout(args, parser):
|
||||
|
||||
config = args.config
|
||||
if not args.env:
|
||||
if not config.get("env"):
|
||||
parser.error("the following arguments are required: --env")
|
||||
args.env = config.get("env")
|
||||
def run_rollout(checkpoint, algo, render, steps, episodes):
|
||||
config_dir = os.path.dirname(checkpoint)
|
||||
config_path = os.path.join(config_dir, "params.pkl")
|
||||
config = None
|
||||
|
||||
# Create the Trainer from config.
|
||||
cls = get_trainable_cls(args.run)
|
||||
agent = cls(env=args.env, config=config)
|
||||
# Try parent directory.
|
||||
if not os.path.exists(config_path):
|
||||
config_path = os.path.join(config_dir, "../params.pkl")
|
||||
|
||||
# Load state from checkpoint.
|
||||
agent.restore(args.checkpoint)
|
||||
num_steps = int(args.steps)
|
||||
num_episodes = int(args.episodes)
|
||||
# Load the config from pickled.
|
||||
if os.path.exists(config_path):
|
||||
with open(config_path, "rb") as f:
|
||||
config = cloudpickle.load(f)
|
||||
# If no pkl file found, require command line `--config`.
|
||||
else:
|
||||
raise ValueError("Could not find params.pkl in either the checkpoint dir or its parent directory")
|
||||
|
||||
# Determine the video output directory.
|
||||
use_arg_monitor = False
|
||||
try:
|
||||
args.video_dir
|
||||
except AttributeError:
|
||||
print("There is no such attribute: args.video_dir")
|
||||
use_arg_monitor = True
|
||||
# Make sure worker 0 has an Env.
|
||||
config["create_env_on_driver"] = True
|
||||
|
||||
video_dir = None
|
||||
if not use_arg_monitor:
|
||||
if args.monitor:
|
||||
video_dir = os.path.join("./logs", "video")
|
||||
elif args.video_dir:
|
||||
video_dir = os.path.expanduser(args.video_dir)
|
||||
# Merge with `evaluation_config` (first try from command line, then from
|
||||
# pkl file).
|
||||
evaluation_config = copy.deepcopy(config.get("evaluation_config", {}))
|
||||
config = merge_dicts(config, evaluation_config)
|
||||
env = config.get("env")
|
||||
|
||||
# Make sure we have evaluation workers.
|
||||
if not config.get("evaluation_num_workers"):
|
||||
config["evaluation_num_workers"] = config.get("num_workers", 0)
|
||||
if not config.get("evaluation_duration"):
|
||||
config["evaluation_duration"] = 1
|
||||
|
||||
# Hard-override this as it raises a warning by Algorithm otherwise.
|
||||
# Makes no sense anyways, to have it set to None as we don't call
|
||||
# `Algorithm.train()` here.
|
||||
config["evaluation_interval"] = 1
|
||||
|
||||
# Rendering settings.
|
||||
config["render_env"] = render
|
||||
|
||||
# Create the Algorithm from config.
|
||||
cls = get_trainable_cls(algo)
|
||||
algorithm = cls(env=env, config=config)
|
||||
|
||||
# Load state from checkpoint, if provided.
|
||||
if checkpoint:
|
||||
algorithm.restore(checkpoint)
|
||||
|
||||
# Do the actual rollout.
|
||||
with rollout.RolloutSaver(
|
||||
args.out,
|
||||
args.use_shelve,
|
||||
write_update_file=args.track_progress,
|
||||
target_steps=num_steps,
|
||||
target_episodes=num_episodes,
|
||||
save_info=args.save_info) as saver:
|
||||
if use_arg_monitor:
|
||||
rollout.rollout(
|
||||
agent,
|
||||
args.env,
|
||||
num_steps,
|
||||
num_episodes,
|
||||
saver,
|
||||
args.no_render,
|
||||
args.monitor)
|
||||
else:
|
||||
rollout.rollout(
|
||||
agent, args.env,
|
||||
num_steps,
|
||||
num_episodes,
|
||||
saver,
|
||||
args.no_render, video_dir)
|
||||
with RolloutSaver(
|
||||
outfile=None,
|
||||
use_shelve=False,
|
||||
write_update_file=False,
|
||||
target_steps=steps,
|
||||
target_episodes=episodes,
|
||||
save_info=False,
|
||||
) as saver:
|
||||
rollout(algorithm, env, steps, episodes, saver, not render)
|
||||
algorithm.stop()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
# Start ray head (single node)
|
||||
os.system('ray start --head')
|
||||
ray.init(address='auto')
|
||||
ray_on_aml = Ray_On_AML()
|
||||
ray = ray_on_aml.getRay()
|
||||
if ray:
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--dataset_path', required=True, help='Path to artifacts dataset')
|
||||
parser.add_argument('--checkpoint', required=True, help='Name of checkpoint file directory')
|
||||
parser.add_argument('--algo', required=True, help='Name of RL algorithm')
|
||||
parser.add_argument('--render', default=False, required=False, help='True to render')
|
||||
parser.add_argument('--steps', required=False, type=int, help='Number of steps to run')
|
||||
parser.add_argument('--episodes', required=False, type=int, help='Number of episodes to run')
|
||||
args = parser.parse_args()
|
||||
|
||||
# Add positional argument - serves as placeholder for checkpoint
|
||||
argvc = sys.argv[1:]
|
||||
argvc.insert(0, 'checkpoint-placeholder')
|
||||
# Get a handle to run
|
||||
run = Run.get_context()
|
||||
|
||||
# Parse arguments
|
||||
rollout_parser = rollout.create_parser()
|
||||
# Get handles to the training artifacts dataset and mount path
|
||||
dataset_path = run.input_datasets['dataset_path']
|
||||
|
||||
rollout_parser.add_argument(
|
||||
'--checkpoint-number', required=False, type=int, default=1,
|
||||
help='Checkpoint number of the checkpoint from which to roll out')
|
||||
# Find checkpoint file to be evaluated
|
||||
checkpoint = os.path.join(dataset_path, args.checkpoint)
|
||||
print('Checkpoint:', checkpoint)
|
||||
|
||||
rollout_parser.add_argument(
|
||||
'--artifacts-dataset', required=True,
|
||||
help='The checkpoints artifacts dataset')
|
||||
|
||||
rollout_parser.add_argument(
|
||||
'--artifacts-path', required=True,
|
||||
help='The checkpoints artifacts path')
|
||||
|
||||
args = rollout_parser.parse_args(argvc)
|
||||
|
||||
# Get a handle to run
|
||||
run = Run.get_context()
|
||||
|
||||
# Get handles to the training artifacts dataset and mount path
|
||||
artifacts_dataset = run.input_datasets['artifacts_dataset']
|
||||
artifacts_path = run.input_datasets['artifacts_path']
|
||||
|
||||
# Find checkpoint file to be evaluated
|
||||
checkpoint_id = '-' + str(args.checkpoint_number)
|
||||
checkpoint_files = list(filter(
|
||||
lambda filename: filename.endswith(checkpoint_id),
|
||||
artifacts_dataset.to_path()))
|
||||
|
||||
checkpoint_file = checkpoint_files[0]
|
||||
if checkpoint_file[0] == '/':
|
||||
checkpoint_file = checkpoint_file[1:]
|
||||
checkpoint = os.path.join(artifacts_path, checkpoint_file)
|
||||
print('Checkpoint:', checkpoint)
|
||||
|
||||
# Set rollout checkpoint
|
||||
args.checkpoint = checkpoint
|
||||
|
||||
# Start rollout
|
||||
run_rollout(args, rollout_parser)
|
||||
# Start rollout
|
||||
ray.init(address='auto')
|
||||
run_rollout(checkpoint, args.algo, args.render, args.steps, args.episodes)
|
||||
|
||||
@@ -1,32 +1,34 @@
|
||||
import os
|
||||
import ray
|
||||
from ray.rllib import train
|
||||
from ray import tune
|
||||
|
||||
from ray_on_aml.core import Ray_On_AML
|
||||
import yaml
|
||||
from ray.tune.tune import run_experiments
|
||||
from utils import callbacks
|
||||
import argparse
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--config', help='Path to yaml configuration file')
|
||||
args = parser.parse_args()
|
||||
|
||||
# Parse arguments and add callbacks to config
|
||||
train_parser = train.create_parser()
|
||||
ray_on_aml = Ray_On_AML()
|
||||
ray = ray_on_aml.getRay()
|
||||
if ray: # in the headnode
|
||||
ray.init(address="auto")
|
||||
print("Configuring run from file: ", args.config)
|
||||
experiment_config = None
|
||||
with open(args.config, "r") as file:
|
||||
experiment_config = yaml.safe_load(file)
|
||||
|
||||
args = train_parser.parse_args()
|
||||
args.config["callbacks"] = {"on_train_result": callbacks.on_train_result}
|
||||
# Set local_dir in each experiment configuration to ensure generated logs get picked up
|
||||
# Also set monitor to ensure videos are captured
|
||||
for experiment_name, experiment in experiment_config.items():
|
||||
experiment["storage_path"] = "./logs"
|
||||
experiment['config']['monitor'] = True
|
||||
print(f'Config: {experiment_config}')
|
||||
|
||||
# Trace if video capturing is on
|
||||
if 'monitor' in args.config and args.config['monitor']:
|
||||
print("Video capturing is ON!")
|
||||
|
||||
# Start ray head (single node)
|
||||
os.system('ray start --head')
|
||||
ray.init(address='auto')
|
||||
|
||||
# Run training task using tune.run
|
||||
tune.run(
|
||||
run_or_experiment=args.run,
|
||||
config=dict(args.config, env=args.env),
|
||||
stop=args.stop,
|
||||
checkpoint_freq=args.checkpoint_freq,
|
||||
checkpoint_at_end=args.checkpoint_at_end,
|
||||
local_dir=args.local_dir
|
||||
)
|
||||
trials = run_experiments(
|
||||
experiment_config,
|
||||
callbacks=[callbacks.TrialCallback()],
|
||||
verbose=2
|
||||
)
|
||||
else:
|
||||
print("in worker node")
|
||||
|
||||
@@ -8,31 +8,28 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
rm -rf /var/lib/apt/lists/* && \
|
||||
rm -rf /usr/share/man/*
|
||||
|
||||
RUN conda install -y conda=4.13.0 python=3.7 && conda clean -ay
|
||||
RUN pip install ray-on-aml==0.2.1 & \
|
||||
pip install --no-cache-dir \
|
||||
azureml-defaults \
|
||||
azureml-dataset-runtime[fuse,pandas] \
|
||||
azureml-contrib-reinforcementlearning \
|
||||
gputil \
|
||||
scipy \
|
||||
pyglet==1.5.27 \
|
||||
cloudpickle==1.3.0 \
|
||||
tensorboardX \
|
||||
tensorflow==1.14.0 \
|
||||
tabulate \
|
||||
dm_tree \
|
||||
lz4 \
|
||||
psutil \
|
||||
setproctitle \
|
||||
pygame \
|
||||
gym[classic_control]==0.19.0 && \
|
||||
conda install -y -c conda-forge x264='1!152.20180717' ffmpeg=4.0.2 && \
|
||||
conda install -c anaconda opencv
|
||||
RUN pip install ray-on-aml==0.2.4 \
|
||||
ray==2.4.0 \
|
||||
ray[rllib]==2.4.0 \
|
||||
mlflow==2.3.1 \
|
||||
azureml-defaults==1.50.0 \
|
||||
azureml-dataset-runtime[fuse,pandas]==1.50.0 \
|
||||
azureml-contrib-reinforcementlearning==1.50.0 \
|
||||
gputil==1.4.0 \
|
||||
scipy==1.9.1 \
|
||||
pyglet==2.0.6 \
|
||||
cloudpickle==2.2.1 \
|
||||
tensorflow==2.11.0 \
|
||||
tensorflow-probability==0.19.0 \
|
||||
torch \
|
||||
tabulate==0.9.0 \
|
||||
dm_tree==0.1.8 \
|
||||
lz4==4.3.2 \
|
||||
psutil==5.9.4 \
|
||||
setproctitle==1.3.2 \
|
||||
pygame==2.1.0 \
|
||||
gymnasium[classic_control]==0.26.3 \
|
||||
gym[classic_control]==0.26.2
|
||||
|
||||
RUN pip install protobuf==3.20.0
|
||||
|
||||
RUN pip install --upgrade ray==0.8.3 \
|
||||
ray[rllib,dashboard,tune]==0.8.3
|
||||
|
||||
RUN pip install 'msrest<0.7.0'
|
||||
# Display the exact versions we have installed
|
||||
RUN pip freeze
|
||||
|
||||
@@ -3,21 +3,20 @@
|
||||
'''
|
||||
|
||||
from azureml.core import Run
|
||||
from ray import tune
|
||||
from ray.tune import Callback
|
||||
from ray.air import session
|
||||
|
||||
|
||||
def on_train_result(info):
|
||||
'''Callback on train result to record metrics returned by trainer.
|
||||
'''
|
||||
run = Run.get_context()
|
||||
run.log(
|
||||
name='episode_reward_mean',
|
||||
value=info["result"]["episode_reward_mean"])
|
||||
run.log(
|
||||
name='episodes_total',
|
||||
value=info["result"]["episodes_total"])
|
||||
run.log(
|
||||
name='perf_cpu_percent',
|
||||
value=info["result"]["perf"]["cpu_util_percent"])
|
||||
run.log(
|
||||
name='perf_memory_percent',
|
||||
value=info["result"]["perf"]["ram_util_percent"])
|
||||
class TrialCallback(Callback):
|
||||
|
||||
def on_trial_result(self, iteration, trials, trial, result, **info):
|
||||
'''Callback on train result to record metrics returned by trainer.
|
||||
'''
|
||||
run = Run.get_context()
|
||||
run.log(
|
||||
name='episode_reward_mean',
|
||||
value=result["episode_reward_mean"])
|
||||
run.log(
|
||||
name='episodes_total',
|
||||
value=result["episodes_total"])
|
||||
|
||||
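As a hedged illustration, the metrics logged by these callbacks can be read back from the submitting side once the run has finished; the workspace lookup and experiment name below are assumptions for demonstration.

```python
# Sketch only: read back the metrics that on_train_result / TrialCallback log.
from azureml.core import Workspace, Experiment

ws = Workspace.from_config()
experiment = Experiment(workspace=ws, name='particle-multiagent')  # assumed experiment name
latest_run = next(experiment.get_runs())    # most recent run in the experiment
metrics = latest_run.get_metrics()
print(metrics.get('episode_reward_mean'))
print(metrics.get('episodes_total'))
```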
@@ -1,39 +0,0 @@
|
||||
# DisableDockerDetector "Disabled to unblock PRs until the owner can fix the file. Not used in any prod deployments - only as a documentation for the customers"
|
||||
FROM akdmsft/particle-cpu
|
||||
|
||||
RUN conda install -c anaconda python=3.7
|
||||
|
||||
# Install required pip packages
|
||||
RUN pip3 install --upgrade pip setuptools && pip3 install --upgrade \
|
||||
pandas \
|
||||
matplotlib \
|
||||
psutil \
|
||||
numpy \
|
||||
scipy \
|
||||
gym \
|
||||
azureml-defaults \
|
||||
tensorboardX \
|
||||
tensorflow==1.15 \
|
||||
tensorflow-probability==0.8.0 \
|
||||
onnxruntime \
|
||||
tf2onnx \
|
||||
cloudpickle==1.1.1 \
|
||||
tabulate \
|
||||
dm_tree \
|
||||
lz4 \
|
||||
opencv-python
|
||||
|
||||
RUN cd multiagent-particle-envs && \
|
||||
pip3 install -e . && \
|
||||
pip3 install --upgrade pyglet==1.3.2
|
||||
|
||||
RUN pip3 install ray-on-aml==0.1.6
|
||||
|
||||
RUN pip install protobuf==3.20.0
|
||||
|
||||
RUN pip3 install --upgrade \
|
||||
ray==0.8.7 \
|
||||
ray[rllib]==0.8.7 \
|
||||
ray[tune]==0.8.7
|
||||
|
||||
RUN pip install 'msrest<0.7.0'
|
||||
@@ -1,70 +0,0 @@
|
||||
# MIT License
|
||||
|
||||
# Copyright (c) 2018 OpenAI
|
||||
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
import numpy as np
|
||||
import gym
|
||||
|
||||
|
||||
class MultiDiscrete(gym.Space):
|
||||
"""
|
||||
- The multi-discrete action space consists of a series of discrete action spaces with different
|
||||
parameters
|
||||
- It can be adapted to both a Discrete action space or a continuous (Box) action space
|
||||
- It is useful to represent game controllers or keyboards where each key can be represented as
|
||||
a discrete action space
|
||||
- It is parametrized by passing an array of arrays containing [min, max] for each discrete action
|
||||
space where the discrete action space can take any integers from `min` to `max` (both inclusive)
|
||||
Note: A value of 0 always needs to represent the NOOP action.
|
||||
e.g. Nintendo Game Controller
|
||||
- Can be conceptualized as 3 discrete action spaces:
|
||||
1) Arrow Keys: Discrete 5 - NOOP[0], UP[1], RIGHT[2], DOWN[3], LEFT[4] - params: min: 0, max: 4
|
||||
2) Button A: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1
|
||||
3) Button B: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1
|
||||
- Can be initialized as
|
||||
MultiDiscrete([ [0,4], [0,1], [0,1] ])
|
||||
"""
|
||||
def __init__(self, array_of_param_array):
|
||||
self.low = np.array([x[0] for x in array_of_param_array])
|
||||
self.high = np.array([x[1] for x in array_of_param_array])
|
||||
self.num_discrete_space = self.low.shape[0]
|
||||
|
||||
def sample(self):
|
||||
""" Returns a array with one sample from each discrete action space """
|
||||
# For each row: round(random .* (max - min) + min, 0)
|
||||
# random_array = prng.np_random.rand(self.num_discrete_space)
|
||||
random_array = np.random.RandomState().rand(self.num_discrete_space)
|
||||
return [int(x) for x in np.floor(np.multiply((self.high - self.low + 1.), random_array) + self.low)]
|
||||
|
||||
def contains(self, x):
|
||||
return len(x) == self.num_discrete_space \
|
||||
and (np.array(x) >= self.low).all() \
|
||||
and (np.array(x) <= self.high).all()
|
||||
|
||||
@property
|
||||
def shape(self):
|
||||
return self.num_discrete_space
|
||||
|
||||
def __repr__(self):
|
||||
return "MultiDiscrete" + str(self.num_discrete_space)
|
||||
|
||||
def __eq__(self, other):
|
||||
return np.array_equal(self.low, other.low) and np.array_equal(self.high, other.high)
|
||||
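A short usage sketch of the space defined above, following the game-controller example in its docstring:

```python
# Sketch: sample one joint action from three discrete sub-spaces.
space = MultiDiscrete([[0, 4], [0, 1], [0, 1]])
action = space.sample()        # e.g. [3, 0, 1]
assert space.contains(action)
print(space.shape)             # 3 sub-spaces
```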
@@ -1,413 +0,0 @@
|
||||
# MIT License
|
||||
|
||||
# Copyright (c) 2018 OpenAI
|
||||
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
"""
|
||||
2D rendering framework
|
||||
"""
|
||||
from __future__ import division
|
||||
import os
|
||||
import six
|
||||
import sys
|
||||
from gym import error
|
||||
import math
|
||||
import numpy as np
|
||||
import pyglet
|
||||
|
||||
from pyglet.gl import glEnable, glHint, glLineWidth, glBlendFunc, glClearColor, glPushMatrix, \
|
||||
glTranslatef, glRotatef, glScalef, glPopMatrix, glColor4f, glBegin, glVertex3f, glEnd, glLineStipple, \
|
||||
glDisable, glVertex2f, GL_BLEND, GL_LINE_SMOOTH, GL_LINE_SMOOTH_HINT, GL_NICEST, GL_SRC_ALPHA, \
|
||||
GL_ONE_MINUS_SRC_ALPHA, GL_LINE_STIPPLE, GL_POINTS, GL_QUADS, GL_TRIANGLES, GL_POLYGON, GL_LINE_LOOP, \
|
||||
GL_LINE_STRIP, GL_LINES
|
||||
|
||||
|
||||
if "Apple" in sys.version:
|
||||
if 'DYLD_FALLBACK_LIBRARY_PATH' in os.environ:
|
||||
os.environ['DYLD_FALLBACK_LIBRARY_PATH'] += ':/usr/lib'
|
||||
# (JDS 2016/04/15): avoid bug on Anaconda 2.3.0 / Yosemite
|
||||
|
||||
|
||||
RAD2DEG = 57.29577951308232
|
||||
|
||||
|
||||
def get_display(spec):
|
||||
"""Convert a display specification (such as :0) into an actual Display
|
||||
object.
|
||||
|
||||
Pyglet only supports multiple Displays on Linux.
|
||||
"""
|
||||
if spec is None:
|
||||
return None
|
||||
elif isinstance(spec, six.string_types):
|
||||
return pyglet.canvas.Display(spec)
|
||||
else:
|
||||
raise error.Error('Invalid display specification: {}. (Must be a string like :0 or None.)'.format(spec))
|
||||
|
||||
|
||||
class Viewer(object):
|
||||
def __init__(self, width, height, display=None):
|
||||
display = get_display(display)
|
||||
|
||||
self.width = width
|
||||
self.height = height
|
||||
|
||||
self.window = pyglet.window.Window(width=width, height=height, display=display)
|
||||
self.window.on_close = self.window_closed_by_user
|
||||
self.geoms = []
|
||||
self.onetime_geoms = []
|
||||
self.transform = Transform()
|
||||
|
||||
glEnable(GL_BLEND)
|
||||
# glEnable(GL_MULTISAMPLE)
|
||||
glEnable(GL_LINE_SMOOTH)
|
||||
# glHint(GL_LINE_SMOOTH_HINT, GL_DONT_CARE)
|
||||
glHint(GL_LINE_SMOOTH_HINT, GL_NICEST)
|
||||
glLineWidth(2.0)
|
||||
glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA)
|
||||
|
||||
def close(self):
|
||||
self.window.close()
|
||||
|
||||
def window_closed_by_user(self):
|
||||
self.close()
|
||||
|
||||
def set_bounds(self, left, right, bottom, top):
|
||||
assert right > left and top > bottom
|
||||
scalex = self.width / (right - left)
|
||||
scaley = self.height / (top - bottom)
|
||||
self.transform = Transform(
|
||||
translation=(-left * scalex, -bottom * scaley),
|
||||
scale=(scalex, scaley))
|
||||
|
||||
def add_geom(self, geom):
|
||||
self.geoms.append(geom)
|
||||
|
||||
def add_onetime(self, geom):
|
||||
self.onetime_geoms.append(geom)
|
||||
|
||||
def render(self, return_rgb_array=False):
|
||||
glClearColor(1, 1, 1, 1)
|
||||
self.window.clear()
|
||||
self.window.switch_to()
|
||||
self.window.dispatch_events()
|
||||
self.transform.enable()
|
||||
for geom in self.geoms:
|
||||
geom.render()
|
||||
for geom in self.onetime_geoms:
|
||||
geom.render()
|
||||
self.transform.disable()
|
||||
arr = None
|
||||
if return_rgb_array:
|
||||
buffer = pyglet.image.get_buffer_manager().get_color_buffer()
|
||||
image_data = buffer.get_image_data()
|
||||
arr = np.fromstring(image_data.data, dtype=np.uint8, sep='')
|
||||
# In https://github.com/openai/gym-http-api/issues/2, we
|
||||
# discovered that someone using Xmonad on Arch was having
|
||||
# a window of size 598 x 398, though a 600 x 400 window
|
||||
# was requested. (Guess Xmonad was preserving a pixel for
|
||||
# the boundary.) So we use the buffer height/width rather
|
||||
# than the requested one.
|
||||
arr = arr.reshape(buffer.height, buffer.width, 4)
|
||||
arr = arr[::-1, :, 0:3]
|
||||
self.window.flip()
|
||||
self.onetime_geoms = []
|
||||
return arr
|
||||
|
||||
# Convenience
|
||||
def draw_circle(self, radius=10, res=30, filled=True, **attrs):
|
||||
geom = make_circle(radius=radius, res=res, filled=filled)
|
||||
_add_attrs(geom, attrs)
|
||||
self.add_onetime(geom)
|
||||
return geom
|
||||
|
||||
def draw_polygon(self, v, filled=True, **attrs):
|
||||
geom = make_polygon(v=v, filled=filled)
|
||||
_add_attrs(geom, attrs)
|
||||
self.add_onetime(geom)
|
||||
return geom
|
||||
|
||||
def draw_polyline(self, v, **attrs):
|
||||
geom = make_polyline(v=v)
|
||||
_add_attrs(geom, attrs)
|
||||
self.add_onetime(geom)
|
||||
return geom
|
||||
|
||||
def draw_line(self, start, end, **attrs):
|
||||
geom = Line(start, end)
|
||||
_add_attrs(geom, attrs)
|
||||
self.add_onetime(geom)
|
||||
return geom
|
||||
|
||||
def get_array(self):
|
||||
self.window.flip()
|
||||
image_data = pyglet.image.get_buffer_manager().get_color_buffer().get_image_data()
|
||||
self.window.flip()
|
||||
arr = np.fromstring(image_data.data, dtype=np.uint8, sep='')
|
||||
arr = arr.reshape(self.height, self.width, 4)
|
||||
return arr[::-1, :, 0:3]
|
||||
|
||||
|
||||
def _add_attrs(geom, attrs):
|
||||
if "color" in attrs:
|
||||
geom.set_color(*attrs["color"])
|
||||
if "linewidth" in attrs:
|
||||
geom.set_linewidth(attrs["linewidth"])
|
||||
|
||||
|
||||
class Geom(object):
|
||||
def __init__(self):
|
||||
self._color = Color((0, 0, 0, 1.0))
|
||||
self.attrs = [self._color]
|
||||
|
||||
def render(self):
|
||||
for attr in reversed(self.attrs):
|
||||
attr.enable()
|
||||
self.render1()
|
||||
for attr in self.attrs:
|
||||
attr.disable()
|
||||
|
||||
def render1(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def add_attr(self, attr):
|
||||
self.attrs.append(attr)
|
||||
|
||||
def set_color(self, r, g, b, alpha=1):
|
||||
self._color.vec4 = (r, g, b, alpha)
|
||||
|
||||
|
||||
class Attr(object):
|
||||
def enable(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def disable(self):
|
||||
pass
|
||||
|
||||
|
||||
class Transform(Attr):
|
||||
def __init__(self, translation=(0.0, 0.0), rotation=0.0, scale=(1, 1)):
|
||||
self.set_translation(*translation)
|
||||
self.set_rotation(rotation)
|
||||
self.set_scale(*scale)
|
||||
|
||||
def enable(self):
|
||||
glPushMatrix()
|
||||
glTranslatef(self.translation[0], self.translation[1], 0)  # translate to GL location point
|
||||
glRotatef(RAD2DEG * self.rotation, 0, 0, 1.0)
|
||||
glScalef(self.scale[0], self.scale[1], 1)
|
||||
|
||||
def disable(self):
|
||||
glPopMatrix()
|
||||
|
||||
def set_translation(self, newx, newy):
|
||||
self.translation = (float(newx), float(newy))
|
||||
|
||||
def set_rotation(self, new):
|
||||
self.rotation = float(new)
|
||||
|
||||
def set_scale(self, newx, newy):
|
||||
self.scale = (float(newx), float(newy))
|
||||
|
||||
|
||||
class Color(Attr):
|
||||
def __init__(self, vec4):
|
||||
self.vec4 = vec4
|
||||
|
||||
def enable(self):
|
||||
glColor4f(*self.vec4)
|
||||
|
||||
|
||||
class LineStyle(Attr):
|
||||
def __init__(self, style):
|
||||
self.style = style
|
||||
|
||||
def enable(self):
|
||||
glEnable(GL_LINE_STIPPLE)
|
||||
glLineStipple(1, self.style)
|
||||
|
||||
def disable(self):
|
||||
glDisable(GL_LINE_STIPPLE)
|
||||
|
||||
|
||||
class LineWidth(Attr):
|
||||
def __init__(self, stroke):
|
||||
self.stroke = stroke
|
||||
|
||||
def enable(self):
|
||||
glLineWidth(self.stroke)
|
||||
|
||||
|
||||
class Point(Geom):
|
||||
def __init__(self):
|
||||
Geom.__init__(self)
|
||||
|
||||
def render1(self):
|
||||
glBegin(GL_POINTS) # draw point
|
||||
glVertex3f(0.0, 0.0, 0.0)
|
||||
glEnd()
|
||||
|
||||
|
||||
class FilledPolygon(Geom):
|
||||
def __init__(self, v):
|
||||
Geom.__init__(self)
|
||||
self.v = v
|
||||
|
||||
def render1(self):
|
||||
if len(self.v) == 4:
|
||||
glBegin(GL_QUADS)
|
||||
elif len(self.v) > 4:
|
||||
glBegin(GL_POLYGON)
|
||||
else:
|
||||
glBegin(GL_TRIANGLES)
|
||||
for p in self.v:
|
||||
glVertex3f(p[0], p[1], 0) # draw each vertex
|
||||
glEnd()
|
||||
|
||||
color = (
|
||||
self._color.vec4[0] * 0.5,
|
||||
self._color.vec4[1] * 0.5,
|
||||
self._color.vec4[2] * 0.5,
|
||||
self._color.vec4[3] * 0.5)
|
||||
glColor4f(*color)
|
||||
glBegin(GL_LINE_LOOP)
|
||||
for p in self.v:
|
||||
glVertex3f(p[0], p[1], 0) # draw each vertex
|
||||
glEnd()
|
||||
|
||||
|
||||
def make_circle(radius=10, res=30, filled=True):
|
||||
points = []
|
||||
for i in range(res):
|
||||
ang = 2 * math.pi * i / res
|
||||
points.append((math.cos(ang) * radius, math.sin(ang) * radius))
|
||||
if filled:
|
||||
return FilledPolygon(points)
|
||||
else:
|
||||
return PolyLine(points, True)
|
||||
|
||||
|
||||
def make_polygon(v, filled=True):
|
||||
if filled:
|
||||
return FilledPolygon(v)
|
||||
else:
|
||||
return PolyLine(v, True)
|
||||
|
||||
|
||||
def make_polyline(v):
|
||||
return PolyLine(v, False)
|
||||
|
||||
|
||||
def make_capsule(length, width):
|
||||
l, r, t, b = 0, length, width / 2, -width / 2
|
||||
box = make_polygon([(l, b), (l, t), (r, t), (r, b)])
|
||||
circ0 = make_circle(width / 2)
|
||||
circ1 = make_circle(width / 2)
|
||||
circ1.add_attr(Transform(translation=(length, 0)))
|
||||
geom = Compound([box, circ0, circ1])
|
||||
return geom
|
||||
|
||||
|
||||
class Compound(Geom):
|
||||
def __init__(self, gs):
|
||||
Geom.__init__(self)
|
||||
self.gs = gs
|
||||
for g in self.gs:
|
||||
g.attrs = [a for a in g.attrs if not isinstance(a, Color)]
|
||||
|
||||
def render1(self):
|
||||
for g in self.gs:
|
||||
g.render()
|
||||
|
||||
|
||||
class PolyLine(Geom):
|
||||
def __init__(self, v, close):
|
||||
Geom.__init__(self)
|
||||
self.v = v
|
||||
self.close = close
|
||||
self.linewidth = LineWidth(1)
|
||||
self.add_attr(self.linewidth)
|
||||
|
||||
def render1(self):
|
||||
glBegin(GL_LINE_LOOP if self.close else GL_LINE_STRIP)
|
||||
for p in self.v:
|
||||
glVertex3f(p[0], p[1], 0) # draw each vertex
|
||||
glEnd()
|
||||
|
||||
def set_linewidth(self, x):
|
||||
self.linewidth.stroke = x
|
||||
|
||||
|
||||
class Line(Geom):
|
||||
def __init__(self, start=(0.0, 0.0), end=(0.0, 0.0)):
|
||||
Geom.__init__(self)
|
||||
self.start = start
|
||||
self.end = end
|
||||
self.linewidth = LineWidth(1)
|
||||
self.add_attr(self.linewidth)
|
||||
|
||||
def render1(self):
|
||||
glBegin(GL_LINES)
|
||||
glVertex2f(*self.start)
|
||||
glVertex2f(*self.end)
|
||||
glEnd()
|
||||
|
||||
|
||||
class Image(Geom):
|
||||
def __init__(self, fname, width, height):
|
||||
Geom.__init__(self)
|
||||
self.width = width
|
||||
self.height = height
|
||||
img = pyglet.image.load(fname)
|
||||
self.img = img
|
||||
self.flip = False
|
||||
|
||||
def render1(self):
|
||||
self.img.blit(-self.width / 2, -self.height / 2, width=self.width, height=self.height)
|
||||
|
||||
|
||||
class SimpleImageViewer(object):
|
||||
def __init__(self, display=None):
|
||||
self.window = None
|
||||
self.isopen = False
|
||||
self.display = display
|
||||
|
||||
def imshow(self, arr):
|
||||
if self.window is None:
|
||||
height, width, channels = arr.shape
|
||||
self.window = pyglet.window.Window(width=width, height=height, display=self.display)
|
||||
self.width = width
|
||||
self.height = height
|
||||
self.isopen = True
|
||||
assert arr.shape == (self.height, self.width, 3), "You passed in an image with the wrong shape"
|
||||
image = pyglet.image.ImageData(self.width, self.height, 'RGB', arr.tobytes(), pitch=self.width * -3)
|
||||
self.window.clear()
|
||||
self.window.switch_to()
|
||||
self.window.dispatch_events()
|
||||
image.blit(0, 0)
|
||||
self.window.flip()
|
||||
|
||||
def close(self):
|
||||
if self.isopen:
|
||||
self.window.close()
|
||||
self.isopen = False
|
||||
|
||||
def __del__(self):
|
||||
self.close()
|
||||
@@ -1,123 +0,0 @@
|
||||
import os
|
||||
|
||||
from ray_on_aml.core import Ray_On_AML
|
||||
|
||||
from ray.tune import run_experiments
|
||||
from ray.tune.registry import register_trainable, register_env, get_trainable_cls
|
||||
import ray.rllib.contrib.maddpg.maddpg as maddpg
|
||||
|
||||
from rllib_multiagent_particle_env import env_creator
|
||||
from util import parse_args
|
||||
|
||||
|
||||
def setup_ray():
|
||||
ray_on_aml = Ray_On_AML()
|
||||
ray_on_aml.getRay()
|
||||
|
||||
register_env('particle', env_creator)
|
||||
|
||||
|
||||
def gen_policy(args, env, id):
|
||||
use_local_critic = [
|
||||
args.adv_policy == 'ddpg' if id < args.num_adversaries else
|
||||
args.good_policy == 'ddpg' for id in range(env.num_agents)
|
||||
]
|
||||
return (
|
||||
None,
|
||||
env.observation_space_dict[id],
|
||||
env.action_space_dict[id],
|
||||
{
|
||||
'agent_id': id,
|
||||
'use_local_critic': use_local_critic[id],
|
||||
'obs_space_dict': env.observation_space_dict,
|
||||
'act_space_dict': env.action_space_dict,
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
def gen_policies(args, env_config):
|
||||
env = env_creator(env_config)
|
||||
return {'policy_%d' % i: gen_policy(args, env, i) for i in range(len(env.observation_space_dict))}
|
||||
|
||||
|
||||
def to_multiagent_config(policies):
|
||||
policy_ids = list(policies.keys())
|
||||
return {
|
||||
'policies': policies,
|
||||
'policy_mapping_fn': lambda index: policy_ids[index]
|
||||
}
|
||||
|
||||
|
||||
def train(args, env_config):
|
||||
def stop(trial_id, result):
|
||||
max_train_time = int(os.environ.get('AML_MAX_TRAIN_TIME_SECONDS', 2 * 60 * 60))
|
||||
|
||||
return result['episode_reward_mean'] >= args.final_reward \
|
||||
or result['time_total_s'] >= max_train_time
|
||||
|
||||
run_experiments({
|
||||
'MADDPG_RLLib': {
|
||||
'run': 'contrib/MADDPG',
|
||||
'env': 'particle',
|
||||
'stop': stop,
|
||||
# Uncomment to enable more frequent checkpoints:
|
||||
# 'checkpoint_freq': args.checkpoint_freq,
|
||||
'checkpoint_at_end': True,
|
||||
'local_dir': args.local_dir,
|
||||
'restore': args.restore,
|
||||
'config': {
|
||||
# === Log ===
|
||||
'log_level': 'ERROR',
|
||||
|
||||
# === Environment ===
|
||||
'env_config': env_config,
|
||||
'num_envs_per_worker': args.num_envs_per_worker,
|
||||
'horizon': args.max_episode_len,
|
||||
|
||||
# === Policy Config ===
|
||||
# --- Model ---
|
||||
'good_policy': args.good_policy,
|
||||
'adv_policy': args.adv_policy,
|
||||
'actor_hiddens': [args.num_units] * 2,
|
||||
'actor_hidden_activation': 'relu',
|
||||
'critic_hiddens': [args.num_units] * 2,
|
||||
'critic_hidden_activation': 'relu',
|
||||
'n_step': args.n_step,
|
||||
'gamma': args.gamma,
|
||||
|
||||
# --- Exploration ---
|
||||
'tau': 0.01,
|
||||
|
||||
# --- Replay buffer ---
|
||||
'buffer_size': int(1e6),
|
||||
|
||||
# --- Optimization ---
|
||||
'actor_lr': args.lr,
|
||||
'critic_lr': args.lr,
|
||||
'learning_starts': args.train_batch_size * args.max_episode_len,
|
||||
'sample_batch_size': args.sample_batch_size,
|
||||
'train_batch_size': args.train_batch_size,
|
||||
'batch_mode': 'truncate_episodes',
|
||||
|
||||
# --- Parallelism ---
|
||||
'num_workers': args.num_workers,
|
||||
'num_gpus': args.num_gpus,
|
||||
'num_gpus_per_worker': 0,
|
||||
|
||||
# === Multi-agent setting ===
|
||||
'multiagent': to_multiagent_config(gen_policies(args, env_config)),
|
||||
},
|
||||
},
|
||||
}, verbose=1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = parse_args()
|
||||
setup_ray()
|
||||
|
||||
env_config = {
|
||||
'scenario_name': args.scenario,
|
||||
'horizon': args.max_episode_len,
|
||||
'video_frequency': args.checkpoint_freq,
|
||||
}
|
||||
train(args, env_config)
|
||||
@@ -1,113 +0,0 @@
|
||||
# Some code taken from: https://github.com/wsjeon/maddpg-rllib/
|
||||
|
||||
import imp
|
||||
import os
|
||||
|
||||
import gym
|
||||
from gym import wrappers
|
||||
from ray import rllib
|
||||
|
||||
from multiagent.environment import MultiAgentEnv
|
||||
import multiagent.scenarios as scenarios
|
||||
|
||||
|
||||
CUSTOM_SCENARIOS = ['simple_switch']
|
||||
|
||||
|
||||
class ParticleEnvRenderWrapper(gym.Wrapper):
|
||||
def __init__(self, env, horizon):
|
||||
super().__init__(env)
|
||||
self.horizon = horizon
|
||||
|
||||
def reset(self):
|
||||
self._num_steps = 0
|
||||
|
||||
return self.env.reset()
|
||||
|
||||
def render(self, mode):
|
||||
if mode == 'human':
|
||||
self.env.render(mode=mode)
|
||||
else:
|
||||
return self.env.render(mode=mode)[0]
|
||||
|
||||
def step(self, actions):
|
||||
obs_list, rew_list, done_list, info_list = self.env.step(actions)
|
||||
|
||||
self._num_steps += 1
|
||||
done = (all(done_list) or self._num_steps >= self.horizon)
|
||||
|
||||
# Gym monitor expects reward to be an int. This is only used for its
|
||||
# stats reporter, which we're not interested in. To make video recording
|
||||
# work, we package the rewards in the info object and extract it below.
|
||||
return obs_list, 0, done, [rew_list, done_list, info_list]
|
||||
|
||||
|
||||
class RLlibMultiAgentParticleEnv(rllib.MultiAgentEnv):
|
||||
def __init__(self, scenario_name, horizon, monitor_enabled=False, video_frequency=500):
|
||||
self._env = _make_env(scenario_name, horizon, monitor_enabled, video_frequency)
|
||||
self.num_agents = self._env.n
|
||||
self.agent_ids = list(range(self.num_agents))
|
||||
|
||||
self.observation_space_dict = self._make_dict(self._env.observation_space)
|
||||
self.action_space_dict = self._make_dict(self._env.action_space)
|
||||
|
||||
def reset(self):
|
||||
obs_dict = self._make_dict(self._env.reset())
|
||||
return obs_dict
|
||||
|
||||
def step(self, action_dict):
|
||||
actions = list(action_dict.values())
|
||||
obs_list, _, _, infos = self._env.step(actions)
|
||||
rew_list, done_list, _ = infos
|
||||
|
||||
obs_dict = self._make_dict(obs_list)
|
||||
rew_dict = self._make_dict(rew_list)
|
||||
done_dict = self._make_dict(done_list)
|
||||
done_dict['__all__'] = all(done_list)
|
||||
info_dict = self._make_dict([{'done': done} for done in done_list])
|
||||
|
||||
return obs_dict, rew_dict, done_dict, info_dict
|
||||
|
||||
def render(self, mode='human'):
|
||||
self._env.render(mode=mode)
|
||||
|
||||
def _make_dict(self, values):
|
||||
return dict(zip(self.agent_ids, values))
|
||||
|
||||
|
||||
def _video_callable(video_frequency):
|
||||
def should_record_video(episode_id):
|
||||
if episode_id % video_frequency == 0:
|
||||
return True
|
||||
return False
|
||||
|
||||
return should_record_video
|
||||
|
||||
|
||||
def _make_env(scenario_name, horizon, monitor_enabled, video_frequency):
|
||||
if scenario_name in CUSTOM_SCENARIOS:
|
||||
# Scenario file must exist locally
|
||||
file_path = os.path.join(os.path.dirname(__file__), scenario_name + '.py')
|
||||
scenario = imp.load_source('', file_path).Scenario()
|
||||
else:
|
||||
scenario = scenarios.load(scenario_name + '.py').Scenario()
|
||||
|
||||
world = scenario.make_world()
|
||||
|
||||
env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation)
|
||||
env.metadata['video.frames_per_second'] = 8
|
||||
|
||||
env = ParticleEnvRenderWrapper(env, horizon)
|
||||
|
||||
if not monitor_enabled:
|
||||
return env
|
||||
|
||||
return wrappers.Monitor(env, './logs/videos', resume=True, video_callable=_video_callable(video_frequency))
|
||||
|
||||
|
||||
def env_creator(config):
|
||||
monitor_enabled = False
|
||||
if hasattr(config, 'worker_index') and hasattr(config, 'vector_index'):
|
||||
monitor_enabled = (config.worker_index == 1 and config.vector_index == 0)
|
||||
|
||||
return RLlibMultiAgentParticleEnv(**config, monitor_enabled=monitor_enabled)
|
||||
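An illustrative smoke test of the wrapper defined above; the scenario name and horizon are assumptions, and the multiagent particle package must be installed for it to run.

```python
# Sketch only: exercise RLlibMultiAgentParticleEnv directly with hypothetical settings.
env = RLlibMultiAgentParticleEnv(scenario_name='simple_spread', horizon=25)
obs = env.reset()                                    # dict keyed by agent id
actions = {agent_id: space.sample()                  # one random action per agent
           for agent_id, space in env.action_space_dict.items()}
obs, rewards, dones, infos = env.step(actions)
print(dones['__all__'])
```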
@@ -1,358 +0,0 @@
|
||||
import numpy as np
|
||||
import random
|
||||
|
||||
from multiagent.core import World, Agent, Landmark
|
||||
from multiagent.scenario import BaseScenario
|
||||
|
||||
|
||||
class SwitchWorld(World):
|
||||
""" Extended World with hills and switches """
|
||||
def __init__(self, hills, switches):
|
||||
super().__init__()
|
||||
# add hills and switches
|
||||
self.hills = hills
|
||||
self.switches = switches
|
||||
self.landmarks.extend(self.hills)
|
||||
self.landmarks.extend(self.switches)
|
||||
|
||||
def step(self):
|
||||
|
||||
super().step()
|
||||
|
||||
# if all hills are activated, reset the switches and hills
|
||||
if all([hill.active for hill in self.hills]):
|
||||
self.reset_hills()
|
||||
self.reset_switches()
|
||||
else:
|
||||
# Update switches
|
||||
for switch in self.switches:
|
||||
switch.step(self)
|
||||
# Update hills
|
||||
for hill in self.hills:
|
||||
hill.step(self)
|
||||
|
||||
def reset_hills(self):
|
||||
possible_hill_positions = [np.array([-0.8, 0]), np.array([0, 0.8]), np.array([0.8, 0]), np.array([0, -0.8])]
|
||||
hill_positions = random.sample(possible_hill_positions, k=len(self.hills))
|
||||
for i, hill in enumerate(self.hills):
|
||||
hill.state.p_pos = hill_positions[i]
|
||||
hill.deactivate()
|
||||
|
||||
def reset_switches(self):
|
||||
possible_switch_positions = [
|
||||
np.array([-0.8, -0.8]),
|
||||
np.array([-0.8, 0.8]),
|
||||
np.array([0.8, -0.8]),
|
||||
np.array([0.8, 0.8])]
|
||||
switch_positions = random.sample(possible_switch_positions, k=len(self.switches))
|
||||
for i, switch in enumerate(self.switches):
|
||||
switch.state.p_pos = switch_positions[i]
|
||||
switch.deactivate()
|
||||
|
||||
|
||||
class Scenario(BaseScenario):
|
||||
def make_world(self):
|
||||
|
||||
# main configurations
|
||||
num_agents = 2
|
||||
num_hills = 2
|
||||
num_switches = 1
|
||||
self.max_episode_length = 100
|
||||
|
||||
# create hills (on edges)
|
||||
possible_hill_positions = [np.array([-0.8, 0]), np.array([0, 0.8]), np.array([0.8, 0]), np.array([0, -0.8])]
|
||||
hill_positions = random.sample(possible_hill_positions, k=num_hills)
|
||||
hills = [Hill(hill_positions[i]) for i in range(num_hills)]
|
||||
# create switches (in corners)
|
||||
possible_switch_positions = [
|
||||
np.array([-0.8, -0.8]),
|
||||
np.array([-0.8, 0.8]),
|
||||
np.array([0.8, -0.8]),
|
||||
np.array([0.8, 0.8])]
|
||||
switch_positions = random.sample(possible_switch_positions, k=num_switches)
|
||||
switches = [Switch(switch_positions[i]) for i in range(num_switches)]
|
||||
|
||||
# make world and set basic properties
|
||||
world = SwitchWorld(hills, switches)
|
||||
world.dim_c = 2
|
||||
world.collaborative = True
|
||||
|
||||
# add agents
|
||||
world.agents = [Agent() for i in range(num_agents)]
|
||||
for i, agent in enumerate(world.agents):
|
||||
agent.name = 'agent %d' % i
|
||||
agent.collide = True
|
||||
agent.silent = True
|
||||
agent.size = 0.1
|
||||
agent.accel = 5.0
|
||||
agent.max_speed = 5.0
|
||||
if i == 0:
|
||||
agent.color = np.array([0.35, 0.35, 0.85])
|
||||
else:
|
||||
agent.color = np.array([0.35, 0.85, 0.85])
|
||||
|
||||
# make initial conditions
|
||||
self.reset_world(world)
|
||||
|
||||
return world
|
||||
|
||||
def reset_world(self, world):
|
||||
# set random initial states
|
||||
for agent in world.agents:
|
||||
agent.state.p_pos = np.array([random.uniform(-1, +1) for _ in range(world.dim_p)])
|
||||
agent.state.p_vel = np.zeros(world.dim_p)
|
||||
agent.state.c = np.zeros(world.dim_c)
|
||||
# set hills randomly
|
||||
world.reset_hills()
|
||||
# set switches randomly
|
||||
world.reset_switches()
|
||||
|
||||
def is_collision(self, agent1, agent2):
|
||||
delta_pos = agent1.state.p_pos - agent2.state.p_pos
|
||||
dist = np.sqrt(np.sum(np.square(delta_pos)))
|
||||
dist_min = agent1.size + agent2.size
|
||||
return True if dist < dist_min else False
|
||||
|
||||
def reward(self, agent, world):
|
||||
# Agents are rewarded based on number of landmarks activated
|
||||
rew = 0
|
||||
if all([h.active for h in world.hills]):
|
||||
rew += 100
|
||||
else:
|
||||
# give bonus each time a hill is activated
|
||||
for hill in world.hills:
|
||||
if hill.activated_just_now:
|
||||
rew += 50
|
||||
# penalise timesteps where nothing is happening
|
||||
if rew == 0:
|
||||
rew -= 0.1
|
||||
# add collision penalty
|
||||
if agent.collide:
|
||||
for a in world.agents:
|
||||
# note: this also counts collision with "itself", so gives -1 at every timestep
|
||||
# would be good to tune the reward function and use (not a == agent) here
|
||||
if self.is_collision(a, agent):
|
||||
rew -= 1
|
||||
return rew
|
||||
|
||||
def observation(self, agent, world):
|
||||
# get positions of all entities in this agent's reference frame
|
||||
entity_pos = []
|
||||
for entity in world.landmarks: # world.entities:
|
||||
entity_pos.append(entity.state.p_pos - agent.state.p_pos)
|
||||
# entity colors
|
||||
entity_color = []
|
||||
for entity in world.landmarks: # world.entities:
|
||||
entity_color.append(entity.color)
|
||||
# communication of all other agents
|
||||
comm = []
|
||||
other_pos = []
|
||||
for other in world.agents:
|
||||
if other is agent:
|
||||
continue
|
||||
comm.append(other.state.c)
|
||||
other_pos.append(other.state.p_pos - agent.state.p_pos)
|
||||
return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + comm)
|
||||
|
||||
|
||||
class Hill(Landmark):
|
||||
"""
|
||||
A hill that can be captured by an agent.
|
||||
To be captured, a team must occupy a hill for a fixed amount of time.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
pos=None,
|
||||
size=0.08,
|
||||
capture_time=2
|
||||
):
|
||||
|
||||
# Initialize Landmark super class
|
||||
super().__init__()
|
||||
self.movable = False
|
||||
self.collide = False
|
||||
self.state.p_pos = pos
|
||||
self.size = size
|
||||
|
||||
# Set static configurations
|
||||
self.capture_time = capture_time
|
||||
|
||||
# Initialize all hills to be inactive
|
||||
self.active = False
|
||||
self.color = np.array([0.5, 0.5, 0.5])
|
||||
self.capture_timer = 0
|
||||
|
||||
self.activated_just_now = False
|
||||
|
||||
def activate(self):
|
||||
self.active = True
|
||||
self.color = np.array([0.1, 0.1, 0.9])
|
||||
|
||||
def deactivate(self):
|
||||
self.active = False
|
||||
self.color = np.array([0.5, 0.5, 0.5])
|
||||
|
||||
def _is_occupied(self, agents):
|
||||
# a hill is occupied if an agent stands on it
|
||||
for agent in agents:
|
||||
dist = np.sqrt(np.sum(np.square(agent.state.p_pos - self.state.p_pos)))
|
||||
if dist < agent.size + self.size:
|
||||
return True
|
||||
return False
|
||||
|
||||
def step(self, world):
|
||||
|
||||
self.activated_just_now = False
|
||||
|
||||
# If hill isn't activated yet, check if an agent activates it
|
||||
# if (not self.active) and (world.switch.is_active()):
|
||||
if (not self.active):
|
||||
|
||||
# Check if an agent is on the hill and all switches are active
|
||||
if (self._is_occupied(world.agents)) and all([switch.active for switch in world.switches]):
|
||||
self.capture_timer += 1
|
||||
|
||||
# activate hill (this is irreversible)
|
||||
if self.capture_timer > self.capture_time:
|
||||
self.activate()
|
||||
self.activated_just_now = True
|
||||
|
||||
# Reset capture timer if hill is not occupied
|
||||
else:
|
||||
self.capture_timer = 0
|
||||
|
||||
|
||||
class Switch(Landmark):
|
||||
"""
|
||||
A switch that can be activated by an agent.
|
||||
The agent has to stay on the switch for it to be active.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
pos=None,
|
||||
size=0.03,
|
||||
):
|
||||
|
||||
# Initialize Landmark super class
|
||||
super().__init__()
|
||||
self.movable = False
|
||||
self.collide = False
|
||||
self.state.p_pos = pos
|
||||
self.size = size
|
||||
|
||||
# Initialize all hills to be inactive
|
||||
self.active = False
|
||||
self.color = np.array([0.8, 0.05, 0.3])
|
||||
self.capture_timer = 0
|
||||
|
||||
def activate(self):
|
||||
self.active = True
|
||||
self.color = np.array([0.1, 0.9, 0.4])
|
||||
|
||||
def deactivate(self):
|
||||
self.active = False
|
||||
self.color = np.array([0.8, 0.05, 0.3])
|
||||
|
||||
def _is_occupied(self, agents):
|
||||
# a switch is active if an agent stands on it
|
||||
for agent in agents:
|
||||
dist = np.sqrt(np.sum(np.square(agent.state.p_pos - self.state.p_pos)))
|
||||
if dist < agent.size + self.size:
|
||||
return True
|
||||
return False
|
||||
|
||||
def step(self, world):
|
||||
# check if an agent is on the switch and activate/deactivate accordingly
|
||||
if self._is_occupied(world.agents):
|
||||
self.activate()
|
||||
else:
|
||||
self.deactivate()
|
||||
|
||||
|
||||
class SwitchExpertPolicy():
|
||||
"""
|
||||
Hand-coded expert policy for the simple switch environment.
|
||||
Types of possible experts:
|
||||
- always go to the switch
|
||||
- always go to the hills
|
||||
"""
|
||||
def __init__(self, dim_c, agent, world, expert_type=None, discrete_action_input=True):
|
||||
|
||||
self.dim_c = dim_c
|
||||
self.discrete_action_input = discrete_action_input
# Burn-in settings used by the continuous-action branch; 0 disables burn-in
self.burn_in = 0
self.burn_step = 0
|
||||
# the agent we control and world we're in
|
||||
self.agent = agent
|
||||
self.world = world
|
||||
|
||||
if expert_type is None:
|
||||
self.expert_type = random.choice(['switch', 'hill'])
|
||||
else:
|
||||
self.expert_type = expert_type
|
||||
if self.expert_type == 'switch':
|
||||
self.target_switch = self.select_inital_target_switch()
|
||||
elif self.expert_type == 'hill':
|
||||
self.target_hill = self.select_inital_target_hill()
|
||||
else:
|
||||
raise NotImplementedError
|
||||
|
||||
self.step_count = 0
|
||||
|
||||
def select_inital_target_switch(self):
|
||||
return random.choice(self.world.switches)
|
||||
|
||||
def select_inital_target_hill(self):
|
||||
return random.choice(self.world.hills)
|
||||
|
||||
def action(self):
|
||||
|
||||
# select a target!
|
||||
if self.expert_type == 'switch':
|
||||
# if agent is not already on a switch, choose target switch
|
||||
if not any([switch._is_occupied([self.agent]) for switch in self.world.switches]):
|
||||
# select a target switch if there's an inactive one
|
||||
inactive_switches = [switch for switch in self.world.switches if not switch.active]
|
||||
if len(inactive_switches) > 0 and (self.target_switch not in inactive_switches):
|
||||
self.target_switch = random.choice(inactive_switches)
|
||||
target = self.target_switch.state.p_pos
|
||||
elif self.expert_type == 'hill':
|
||||
# select a new target hill if we haven't done so yet, or the current target hill is already active
|
||||
inactive_hills = [hill for hill in self.world.hills if not hill.active]
|
||||
if len(inactive_hills) > 0 and (self.target_hill not in inactive_hills):
|
||||
self.target_hill = random.choice(inactive_hills)
|
||||
target = self.target_hill.state.p_pos
|
||||
|
||||
self.step_count += 1
|
||||
|
||||
impulse = np.clip(target - self.agent.state.p_pos, -self.agent.u_range, self.agent.u_range)
|
||||
|
||||
if self.discrete_action_input:
|
||||
u_idx = np.argmax(np.abs(impulse))
|
||||
if u_idx == 0 and impulse[u_idx] < 0:
|
||||
u = 1
|
||||
elif u_idx == 0 and impulse[u_idx] > 0:
|
||||
u = 2
|
||||
elif u_idx == 1 and impulse[u_idx] < 0:
|
||||
u = 3
|
||||
elif u_idx == 1 and impulse[u_idx] > 0:
|
||||
u = 4
|
||||
else:
|
||||
u = 0
|
||||
else:
|
||||
u = np.zeros(5)
|
||||
if (impulse[0] == impulse[1] == 0) \
|
||||
or (self.step_count < self.burn_in) \
|
||||
or (self.burn_step != 0 and self.step_count % self.burn_step != 0):
|
||||
u[0] = 0.1
|
||||
else:
|
||||
pass
|
||||
# u: noop (?), right, left, down, up
|
||||
if impulse[0] > 0: # x-direction (- left/right + )
|
||||
u[1] = impulse[0] # right
|
||||
elif impulse[0] < 0:
|
||||
u[2] = -impulse[0]
|
||||
if impulse[1] > 0: # y-direction (- up/down + )
|
||||
u[3] = impulse[1]
|
||||
elif impulse[1] < 0:
|
||||
u[4] = -impulse[1]
|
||||
|
||||
return u
|
||||
@@ -1,82 +0,0 @@
|
||||
import argparse
|
||||
import os
|
||||
import re
|
||||
|
||||
from rllib_multiagent_particle_env import CUSTOM_SCENARIOS
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser('MADDPG with OpenAI MPE')
|
||||
|
||||
# Environment
|
||||
parser.add_argument('--scenario', type=str, default='simple',
|
||||
choices=['simple', 'simple_speaker_listener',
|
||||
'simple_crypto', 'simple_push',
|
||||
'simple_tag', 'simple_spread', 'simple_adversary'
|
||||
] + CUSTOM_SCENARIOS,
|
||||
help='name of the scenario script')
|
||||
parser.add_argument('--max-episode-len', type=int, default=25,
|
||||
help='maximum episode length')
|
||||
parser.add_argument('--num-episodes', type=int, default=60000,
|
||||
help='number of episodes')
|
||||
parser.add_argument('--num-adversaries', type=int, default=0,
|
||||
help='number of adversaries')
|
||||
parser.add_argument('--good-policy', type=str, default='maddpg',
|
||||
help='policy for good agents')
|
||||
parser.add_argument('--adv-policy', type=str, default='maddpg',
|
||||
help='policy of adversaries')
|
||||
|
||||
# Core training parameters
|
||||
parser.add_argument('--lr', type=float, default=1e-2,
|
||||
help='learning rate for Adam optimizer')
|
||||
parser.add_argument('--gamma', type=float, default=0.95,
|
||||
help='discount factor')
|
||||
# NOTE: 1 iteration = sample_batch_size * num_workers timesteps * num_envs_per_worker
|
||||
parser.add_argument('--sample-batch-size', type=int, default=25,
|
||||
help='number of data points sampled /update /worker')
|
||||
parser.add_argument('--train-batch-size', type=int, default=1024,
|
||||
help='number of data points /update')
|
||||
parser.add_argument('--n-step', type=int, default=1,
|
||||
help='length of multistep value backup')
|
||||
parser.add_argument('--num-units', type=int, default=64,
|
||||
help='number of units in the mlp')
|
||||
parser.add_argument('--final-reward', type=int, default=-400,
|
||||
help='final reward after which to stop training')
|
||||
|
||||
# Checkpoint
|
||||
parser.add_argument('--checkpoint-freq', type=int, default=200,
|
||||
help='save model once every time this many iterations are completed')
|
||||
parser.add_argument('--local-dir', type=str, default='./logs',
|
||||
help='path to save checkpoints')
|
||||
parser.add_argument('--restore', type=str, default=None,
|
||||
help='directory in which training state and model are loaded')
|
||||
|
||||
# Parallelism
|
||||
parser.add_argument('--num-workers', type=int, default=1)
|
||||
parser.add_argument('--num-envs-per-worker', type=int, default=4)
|
||||
parser.add_argument('--num-gpus', type=int, default=0)
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def find_final_checkpoint(start_dir):
|
||||
def find(pattern, path):
|
||||
result = []
|
||||
for root, _, files in os.walk(path):
|
||||
for name in files:
|
||||
if pattern.match(name):
|
||||
result.append(os.path.join(root, name))
|
||||
return result
|
||||
|
||||
cp_pattern = re.compile('.*checkpoint-\\d+$')
|
||||
checkpoint_files = find(cp_pattern, start_dir)
|
||||
|
||||
checkpoint_numbers = []
|
||||
for file in checkpoint_files:
|
||||
checkpoint_numbers.append(int(file.split('-')[-1]))
|
||||
|
||||
final_checkpoint_number = max(checkpoint_numbers)
|
||||
|
||||
return next(
|
||||
checkpoint_file for checkpoint_file in checkpoint_files
|
||||
if checkpoint_file.endswith(str(final_checkpoint_number)))
|
||||
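A brief sketch of how the helper above might be used to locate the newest checkpoint before evaluation; the log directory is an assumption.

```python
# Sketch: pick the most recent RLlib checkpoint written under an assumed ./logs directory.
final_checkpoint = find_final_checkpoint('./logs')
print('Restoring policy from:', final_checkpoint)
```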
Binary image file not shown (350 KiB).
@@ -1,566 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Reinforcement Learning in Azure Machine Learning - Training multiple agents on collaborative ParticleEnv tasks\n",
|
||||
"\n",
|
||||
"This tutorial will show you how to train policies in a multi-agent scenario.\n",
|
||||
"We use OpenAI Gym's [Particle environments](https://github.com/openai/multiagent-particle-envs),\n",
|
||||
"which model agents and landmarks in a two-dimensional world. Particle comes with\n",
|
||||
"several predefined scenarios, both competitive and collaborative, and with or without communication.\n",
|
||||
"\n",
|
||||
"For this tutorial, we pick a cooperative navigation scenario where N agents are in a world with N\n",
|
||||
"landmarks. The agents' goal is to cover all the landmarks without collisions,\n",
|
||||
"so agents must learn to avoid each other (social distancing!). The video below shows training\n",
|
||||
"results for N=3 agents/landmarks:\n",
|
||||
"\n",
|
||||
"<table style=\"width:50%\">\n",
|
||||
" <tr>\n",
|
||||
" <th style=\"text-align: center;\">\n",
|
||||
" <img src=\"./images/particle_simple_spread.gif\" alt=\"Particle video\" align=\"middle\" margin-left=\"auto\" margin-right=\"auto\"/>\n",
|
||||
" </th>\n",
|
||||
" </tr>\n",
|
||||
" <tr style=\"text-align: center;\">\n",
|
||||
" <th>Fig 1. Video of 3 agents covering 3 landmarks in a multiagent Particle scenario.</th>\n",
|
||||
" </tr>\n",
|
||||
"</table>\n",
|
||||
"\n",
|
||||
"The tutorial will cover the following steps:\n",
|
||||
"- Initializing Azure Machine Learning resources for training\n",
|
||||
"- Training policies in a multi-agent environment with Azure Machine Learning service\n",
|
||||
"- Monitoring training progress\n",
|
||||
"\n",
|
||||
"## Prerequisites\n",
|
||||
"\n",
|
||||
"The user should have completed the Azure Machine Learning introductory tutorial. You will need to make sure that you have a valid subscription id, a resource group and a workspace. For detailed instructions see [Tutorial: Get started creating your first ML experiment](https://docs.microsoft.com/en-us/azure/machine-learning/tutorial-1st-experiment-sdk-setup).\n",
|
||||
"\n",
|
||||
"Please ensure that you have a current version of IPython (>= 7.15) installed.\n",
|
||||
"\n",
|
||||
"While this is a standalone notebook, we highly recommend going over the introductory notebooks for RL first.\n",
|
||||
"- Getting started:\n",
|
||||
" - [RL using a compute instance with Azure Machine Learning](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/cartpole_ci.ipynb)\n",
|
||||
" - [RL using Azure Machine Learning compute](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/cartpole_sc.ipynb)\n",
|
||||
"- [Scaling RL training runs with Azure Machine Learning](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/pong_rllib.ipynb)\n",
|
||||
"\n",
|
||||
"## Initialize resources\n",
|
||||
"\n",
|
||||
"All required Azure Machine Learning service resources for this tutorial can be set up from Jupyter. This includes:\n",
|
||||
"\n",
|
||||
"- Connecting to your existing Azure Machine Learning workspace.\n",
|
||||
"- Creating an experiment to track runs.\n",
|
||||
"- Creating remote compute targets for [Ray](https://docs.ray.io/en/latest/index.html).\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"### Azure Machine Learning SDK\n",
|
||||
"\n",
|
||||
"Display the Azure Machine Learning SDK version."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646249589452
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import azureml.core\n",
|
||||
"print('Azure Machine Learning SDK version: ', azureml.core.VERSION)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Connect to workspace\n",
|
||||
"\n",
|
||||
"Get a reference to an existing Azure Machine Learning workspace."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646250284486
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Workspace\n",
|
||||
"\n",
|
||||
"ws = Workspace.from_config()\n",
|
||||
"print(ws.name, ws.location, ws.resource_group, sep=' | ')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create an experiment\n",
|
||||
"\n",
|
||||
"Create an experiment to track the runs in your workspace. A\n",
|
||||
"workspace can have multiple experiments and each experiment\n",
|
||||
"can be used to track multiple runs (see [documentation](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.experiment.experiment?view=azure-ml-py)\n",
|
||||
"for details)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646250342411
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Experiment\n",
|
||||
"\n",
|
||||
"exp = Experiment(workspace=ws, name='particle-multiagent')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create or attach an existing compute resource\n",
|
||||
"\n",
|
||||
"A compute target is a designated compute resource where you run your training script. For more information, see [What are compute targets in Azure Machine Learning service?](https://docs.microsoft.com/en-us/azure/machine-learning/concept-compute-target).\n",
|
||||
"\n",
|
||||
"> Note that if you have an AzureML Data Scientist role, you will not have permission to create compute resources. Talk to your workspace or IT admin to create the compute targets described in this section, if they do not already exist.\n",
|
||||
"\n",
|
||||
"#### CPU target for Ray head\n",
|
||||
"\n",
|
||||
"In the experiment setup for this tutorial, the Ray head node will\n",
|
||||
"run on a CPU node (D3 type). A maximum cluster size of 1 node is\n",
|
||||
"therefore sufficient. If you wish to run multiple experiments in\n",
|
||||
"parallel using the same CPU cluster, you may elect to increase this\n",
|
||||
"number. The cluster will automatically scale down to 0 nodes when\n",
|
||||
"no training jobs are scheduled (see min_nodes).\n",
|
||||
"\n",
|
||||
"The code below creates a compute cluster of D3 type nodes.\n",
|
||||
"If the cluster with the specified name is already in your workspace\n",
|
||||
"the code will skip the creation process.\n",
|
||||
"\n",
|
||||
"**Note: Creation of a compute resource can take several minutes**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646250346756
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.compute import AmlCompute, ComputeTarget\n",
|
||||
"\n",
|
||||
"cpu_cluster_name = 'cpu-cl-d3'\n",
|
||||
"\n",
|
||||
"if cpu_cluster_name in ws.compute_targets:\n",
|
||||
" cpu_cluster = ws.compute_targets[cpu_cluster_name]\n",
|
||||
" if cpu_cluster and type(cpu_cluster) is AmlCompute:\n",
|
||||
" if cpu_cluster.provisioning_state == 'Succeeded':\n",
|
||||
" print('Found existing compute target for {}. Using it.'.format(cpu_cluster_name))\n",
|
||||
" else: \n",
|
||||
" raise Exception('Found existing compute target for {} '.format(cpu_cluster_name)\n",
|
||||
" + 'but it is in state {}'.format(cpu_cluster.provisioning_state))\n",
|
||||
"else:\n",
|
||||
" print('Creating a new compute target for {}...'.format(cpu_cluster_name))\n",
|
||||
" provisioning_config = AmlCompute.provisioning_configuration(\n",
|
||||
" vm_size='STANDARD_D3',\n",
|
||||
" min_nodes=0, \n",
|
||||
" max_nodes=1)\n",
|
||||
"\n",
|
||||
" cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, provisioning_config)\n",
|
||||
" cpu_cluster.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)\n",
|
||||
" \n",
|
||||
" print('Cluster created.')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Training the policies\n",
|
||||
"\n",
|
||||
"### Training environment\n",
|
||||
"\n",
|
||||
"This tutorial uses a custom docker image\n",
|
||||
"with the necessary software installed. The [Environment](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-use-environments)\n",
|
||||
"class stores the configuration for the training environment. The\n",
|
||||
"docker image is set via `env.docker.base_image`.\n",
|
||||
"`user_managed_dependencies` is set so that\n",
|
||||
"the preinstalled Python packages in the image are preserved.\n",
|
||||
"\n",
|
||||
"Note that since we want to capture videos of the training runs requiring a display, we set the interpreter_path such that the Python process is started via **xvfb-run**."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"gather": {
|
||||
"logged": 1646257481631
|
||||
},
|
||||
"jupyter": {
|
||||
"outputs_hidden": false,
|
||||
"source_hidden": false
|
||||
},
|
||||
"nteract": {
|
||||
"transient": {
|
||||
"deleting": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Environment\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"ray_environment_name = 'particle-cpu'\n",
|
||||
"ray_environment_dockerfile_path = os.path.join(os.getcwd(), 'docker', 'cpu', 'Dockerfile')\n",
|
||||
"ray_environment = Environment. \\\n",
|
||||
" from_dockerfile(name=ray_environment_name, dockerfile=ray_environment_dockerfile_path). \\\n",
|
||||
" register(workspace=ws)\n",
|
||||
"ray_cpu_build_details = ray_environment.build(workspace=ws)\n",
|
||||
"\n",
|
||||
"ray_cpu_build_details.wait_for_completion(show_output=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Training script\n",
|
||||
"\n",
|
||||
"This tutorial uses the multiagent algorithm [Multi-Agent Deep Deterministic Policy Gradient (MADDPG)](https://docs.ray.io/en/latest/rllib-algorithms.html?highlight=maddpg#multi-agent-deep-deterministic-policy-gradient-contrib-maddpg).\n",
|
||||
"For training policies in a multiagent scenario, Ray's RLlib also\n",
|
||||
"requires the `multiagent` configuration section to be specified. You\n",
|
||||
"can find more information in the [common parameters](https://docs.ray.io/en/latest/rllib-training.html?highlight=multiagent#common-parameters)\n",
|
||||
"documentation.\n",
|
||||
"\n",
|
||||
"The stopping criteria are set such that the training run is\n",
|
||||
"terminated after either a mean reward of -450 is observed, or\n",
|
||||
"training has run for over 2 hours.\n",
|
||||
"\n",
|
||||
"### Submitting a training run\n",
|
||||
"\n",
|
||||
"You can submit the training run using a `ScriptRunConfig`. By providing the\n",
|
||||
"command to run the training, and a `RunConfig` object configured with your\n",
|
||||
"compute target, number of nodes, and environment image to use.\n",
|
||||
"\n",
|
||||
"Note that you can use the same notebook and scripts to experiment with\n",
|
||||
"different Particle environments. You can find a list of supported\n",
|
||||
"environments [here](https://github.com/openai/multiagent-particle-envs/tree/master#list-of-environments).\n",
|
||||
"Simply change the `--scenario` parameter to a supported scenario.\n",
|
||||
"\n",
|
||||
"In order to get the best training results, you can also adjust the\n",
|
||||
"`--final-reward` parameter to determine when to stop training. A greater\n",
|
||||
"reward means longer running time, but improved results. By default,\n",
|
||||
"the final reward will be -450, which should show good progress after\n",
|
||||
"about one hour of run time.\n",
|
||||
"\n",
|
||||
"For this notebook, we use a single D3 nodes, giving us a total of 4 CPUs and\n",
|
||||
"0 GPUs. One CPU is used by the MADDPG trainer, and an additional CPU is\n",
|
||||
"consumed by the RLlib rollout worker. The other 2 CPUs are not used, though\n",
|
||||
"smaller node types will run out of memory for this task."
|
||||
]
|
||||
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"gather": {
"logged": 1646275371701
},
"jupyter": {
"outputs_hidden": false,
"source_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
}
},
"outputs": [],
"source": [
"from azureml.core import ScriptRunConfig, Experiment\n",
"from azureml.core.runconfig import DockerConfiguration, RunConfiguration\n",
"from azureml.widgets import RunDetails\n",
"\n",
"experiment_name = 'particle-multiagent'\n",
"\n",
"experiment = Experiment(workspace=ws, name=experiment_name)\n",
"\n",
"aml_run_config_ml = RunConfiguration(communicator='OpenMpi')\n",
"aml_run_config_ml.target = cpu_cluster\n",
"aml_run_config_ml.node_count = 1\n",
"aml_run_config_ml.environment = ray_environment\n",
"\n",
"config = ScriptRunConfig(source_directory='./files',\n",
"                         command=[\n",
"                             'xvfb-run -s \"-screen 0 640x480x16 -ac +extension GLX +render\" python',\n",
"                             'particle_train.py',\n",
"                             '--scenario', 'simple_spread',\n",
"                             '--final-reward', '-450'\n",
"                         ],\n",
"                         run_config=aml_run_config_ml)\n",
"train_run = experiment.submit(config)\n",
"\n",
"RunDetails(train_run).show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Job cancellation\n",
"\n",
"You may cancel the job by uncommenting and running the cell below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# If you wish to cancel the run before it completes, uncomment and execute:\n",
"# train_run.cancel()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Monitoring training progress\n",
"\n",
"### View the Tensorboard\n",
"\n",
"The Tensorboard can be displayed via the Azure Machine Learning\n",
"service's [Tensorboard API](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-monitor-tensorboard).\n",
"When running locally, please make sure to follow the instructions\n",
"in the link and install the required packages. Running the cell below will output a URL for the Tensorboard.\n",
"\n",
"Note that the training script sets the log directory via the `local_dir` parameter when\n",
"starting RLlib. `./logs` will automatically\n",
"appear in the downloadable files for a run. Since this script is\n",
"executed on the Ray head node, we need to get a reference to the run\n",
"as shown below.\n",
"\n",
"The Tensorboard API will continuously stream logs from the run.\n",
"\n",
"**Note: It may take a couple of minutes after the run enters the \"Running\"\n",
"state before Tensorboard files are available; the board will refresh automatically.**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# from azureml.tensorboard import Tensorboard\n",
"\n",
"# tb = Tensorboard([train_run])\n",
"# tb.start()"
]
},
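{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you prefer to inspect the raw RLlib logs rather than stream them to Tensorboard, you can also download the run's `./logs` folder once files have been uploaded. The snippet below is a minimal sketch using the `train_run` object from above; the local output directory name is an arbitrary choice.\n",
"\n",
"```python\n",
"# Minimal sketch: download the logs uploaded so far to a local folder.\n",
"train_run.download_files(prefix='logs', output_directory='./downloaded_logs')\n",
"```"
]
},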
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### View training videos\n",
"\n",
"As mentioned above, we record videos of the agents interacting with the\n",
"Particle world. These videos are often a crucial indicator of training\n",
"success. The code below downloads the latest video as it becomes available\n",
"and displays it in-line.\n",
"\n",
"Over time, the agents learn to cooperate and avoid collisions while\n",
"traveling to all landmarks.\n",
"\n",
"**Note: It can take several minutes for a video to appear after the run\n",
"has started.**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from azureml.core import Dataset\n",
"from azureml.data.dataset_error_handling import DatasetValidationError\n",
"\n",
"from IPython.display import clear_output\n",
"from IPython.core.display import display, Video\n",
"\n",
"# Videos are uploaded to the workspace artifact store under the run's logs/videos folder.\n",
"datastore = ws.datastores['workspaceartifactstore']\n",
"path_prefix = './tmp_videos'\n",
"\n",
"def download_latest_training_video(run, video_checkpoint_counter):\n",
"    run_artifacts_path = os.path.join('ExperimentRun', f'dcid.{run.id}', 'logs', 'videos')\n",
"\n",
"    try:\n",
"        run_artifacts_ds = Dataset.File.from_files(datastore.path(os.path.join(run_artifacts_path, '**')))\n",
"    except DatasetValidationError:\n",
"        # This happens at the start of the run when there is no data available\n",
"        # in the run's artifacts yet.\n",
"        return None, video_checkpoint_counter\n",
"\n",
"    video_files = [file for file in run_artifacts_ds.to_path() if file.endswith('.mp4')]\n",
"    if len(video_files) == video_checkpoint_counter:\n",
"        # No new video since the last check.\n",
"        return None, video_checkpoint_counter\n",
"\n",
"    # Pick the video with the highest training iteration number in its file name.\n",
"    iteration_numbers = [int(vf[vf.rindex('video') + len('video') : vf.index('.mp4')]) for vf in video_files]\n",
"    latest_video = next(vf for vf in video_files if vf.endswith('{num}.mp4'.format(num=max(iteration_numbers))))\n",
"    latest_video = os.path.join(run_artifacts_path, os.path.normpath(latest_video[1:]))\n",
"\n",
"    datastore.download(\n",
"        target_path=path_prefix,\n",
"        prefix=latest_video.replace('\\\\', '/'),\n",
"        show_progress=False)\n",
"\n",
"    return os.path.join(path_prefix, latest_video), len(video_files)\n",
"\n",
"\n",
"def render_video(vf):\n",
"    clear_output(wait=True)\n",
"    display(Video(data=vf, embed=True, html_attributes='loop autoplay controls width=50%'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import shutil, time\n",
"\n",
"terminal_statuses = ['Canceled', 'Completed', 'Failed']\n",
"video_checkpoint_counter = 0\n",
"\n",
"while train_run.get_status() not in terminal_statuses:\n",
"    video_file, video_checkpoint_counter = download_latest_training_video(train_run, video_checkpoint_counter)\n",
"    if video_file is not None:\n",
"        render_video(video_file)\n",
"\n",
"        print('Displaying video number {}'.format(video_checkpoint_counter))\n",
"        shutil.rmtree(path_prefix)\n",
"\n",
"    # Interrupting the kernel can take up to 15 seconds\n",
"    # depending on when time.sleep started\n",
"    time.sleep(15)\n",
"\n",
"train_run.wait_for_completion()\n",
"print('The training run has reached a terminal status.')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cleaning up\n",
"\n",
"Below you can find code snippets to clean up any resources created as part of this tutorial that you don't wish to retain."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# to stop the Tensorboard, uncomment and run\n",
"# tb.stop()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# to delete the cpu compute target, uncomment and run\n",
"# cpu_cluster.delete()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Next steps\n",
"\n",
"We would love to hear your feedback! Please let us know what you think of Reinforcement Learning in Azure Machine Learning and what features you are looking forward to."
]
}
],
"metadata": {
"authors": [
{
"name": "andress"
}
],
"categories": [
"how-to-use-azureml",
"reinforcement-learning"
],
"interpreter": {
"hash": "13382f70c1d0595120591d2e358c8d446daf961bf951d1fba9a32631e205d5ab"
},
"kernel_info": {
"name": "python38-azureml"
},
"kernelspec": {
"display_name": "Python 3.8 - AzureML",
"language": "python",
"name": "python38-azureml"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.9"
},
"notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License.",
"nteract": {
"version": "nteract-front-end@1.0.0"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
@@ -1,9 +0,0 @@
name: particle
dependencies:
- pip:
  - azureml-sdk
  - azureml-contrib-reinforcementlearning
  - azureml-widgets
  - tensorboard
  - azureml-tensorboard
  - ipython
@@ -8,7 +8,7 @@ dependencies:
- matplotlib
- azureml-dataset-runtime
- ipywidgets
- raiwidgets~=0.23.0
- raiwidgets~=0.26.0
- liac-arff
- packaging>=20.9
- itsdangerous==2.0.1
@@ -101,7 +101,7 @@
"\n",
"# Check core SDK version number\n",
"\n",
"print(\"This notebook was created using SDK version 1.48.0, you are currently running version\", azureml.core.VERSION)"
"print(\"This notebook was created using SDK version 1.51.0, you are currently running version\", azureml.core.VERSION)"
]
},
{
@@ -6,5 +6,5 @@ dependencies:
- tensorflow
- tqdm
- scipy
- sklearn
- scikit-learn
- setuptools>=41.0.0
@@ -3,5 +3,6 @@ dependencies:
- pip:
  - azureml-sdk
  - azureml-tensorboard
  - tensorboard
  - tensorflow
  - setuptools>=41.0.0
@@ -277,7 +277,7 @@
" - azureml-dataset-runtime\n",
" - keras==2.6\n",
" - tensorflow-gpu==2.6\n",
" - numpy\n",
" - numpy==1.23\n",
" - scikit-learn\n",
" - pandas\n",
" - matplotlib\n",
index.md
@@ -9,76 +9,74 @@ Machine Learning notebook samples and encourage efficient retrieval of topics an
|
||||
|
||||
|Title| Task | Dataset | Training Compute | Deployment Target | ML Framework | Tags |
|
||||
|:----|:-----|:-------:|:----------------:|:-----------------:|:------------:|:------------:|
|
||||
| [Using Azure ML environments](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/training/using-environments/using-environments.ipynb) | Creating and registering environments | None | Local | None | None | None |
|
||||
| [Using Azure ML environments](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/training/using-environments/using-environments.ipynb) | Creating and registering environments | None | Local | None | None | None |
|
||||
|
||||
## Tutorials
|
||||
|
||||
|Title| Task | Dataset | Training Compute | Deployment Target | ML Framework | Tags |
|
||||
|:----|:-----|:-------:|:----------------:|:-----------------:|:------------:|:------------:|
|
||||
| [Forecasting BikeShare Demand](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/forecasting-bike-share/auto-ml-forecasting-bike-share.ipynb) | Forecasting | BikeShare | Remote | None | Azure ML AutoML | Forecasting |
|
||||
| [Forecasting orange juice sales with deployment](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/forecasting-orange-juice-sales/auto-ml-forecasting-orange-juice-sales.ipynb) | Forecasting | Orange Juice Sales | Remote | Azure Container Instance | Azure ML AutoML | None |
|
||||
| [Forecasting orange juice sales with deployment](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/forecasting-pipelines/auto-ml-forecasting-pipelines.ipynb) | Forecasting | Orange Juice Sales | Remote | Azure Container Instance | Azure ML AutoML | None |
|
||||
| [Register a model and deploy locally](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/deploy-to-local/register-model-deploy-local.ipynb) | Deployment | None | Local | Local | None | None |
|
||||
| :star:[Data drift quickdemo](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/datadrift-tutorial/datadrift-tutorial.ipynb) | Filtering | NOAA | Remote | None | Azure ML | Dataset, Timeseries, Drift |
|
||||
| :star:[Datasets with ML Pipeline](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/datasets-tutorial/pipeline-with-datasets/pipeline-for-image-classification.ipynb) | Train | Fashion MNIST | Remote | None | Azure ML | Dataset, Pipeline, Estimator, ScriptRun |
|
||||
| :star:[Filtering data using Tabular Timeseiries Dataset related API](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/datasets-tutorial/timeseries-datasets/tabular-timeseries-dataset-filtering.ipynb) | Filtering | NOAA | Local | None | Azure ML | Dataset, Tabular Timeseries |
|
||||
| :star:[Train with Datasets (Tabular and File)](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/datasets-tutorial/train-with-datasets/train-with-datasets.ipynb) | Train | Iris, Diabetes | Remote | None | Azure ML | Dataset, Estimator, ScriptRun |
|
||||
| [Forecasting away from training data](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/forecasting-forecast-function/auto-ml-forecasting-function.ipynb) | Forecasting | None | Remote | None | Azure ML AutoML | Forecasting, Confidence Intervals |
|
||||
| [Automated ML run with basic edition features.](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/classification-bank-marketing-all-features/auto-ml-classification-bank-marketing-all-features.ipynb) | Classification | Bankmarketing | AML | ACI | None | featurization, explainability, remote_run, AutomatedML |
|
||||
| [Classification of credit card fraudulent transactions using Automated ML](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/classification-credit-card-fraud/auto-ml-classification-credit-card-fraud.ipynb) | Classification | Creditcard | AML Compute | None | None | remote_run, AutomatedML |
|
||||
| [Classification of credit card fraudulent transactions using Automated ML](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/experimental/classification-credit-card-fraud-local-managed/auto-ml-classification-credit-card-fraud-local-managed.ipynb) | Classification | Creditcard | AML Compute | None | None | AutomatedML |
|
||||
| [Automated ML run with featurization and model explainability.](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/regression-explanation-featurization/auto-ml-regression-explanation-featurization.ipynb) | Regression | MachineData | AML | ACI | None | featurization, explainability, remote_run, AutomatedML |
|
||||
| [auto-ml-forecasting-backtest-single-model](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/forecasting-backtest-single-model/auto-ml-forecasting-backtest-single-model.ipynb) | | None | Remote | None | Azure ML AutoML | |
|
||||
| :star:[Azure Machine Learning Pipeline with DataTranferStep](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-data-transfer.ipynb) | Demonstrates the use of DataTranferStep | Custom | ADF | None | Azure ML | None |
|
||||
| [Getting Started with Azure Machine Learning Pipelines](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-getting-started.ipynb) | Getting Started notebook for AML Pipelines | Custom | AML Compute | None | Azure ML | None |
|
||||
| [Azure Machine Learning Pipeline with AzureBatchStep](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-how-to-use-azurebatch-to-run-a-windows-executable.ipynb) | Demonstrates the use of AzureBatchStep | Custom | Azure Batch | None | Azure ML | None |
|
||||
| :star:[How to use ModuleStep with AML Pipelines](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-how-to-use-modulestep.ipynb) | Demonstrates the use of ModuleStep | Custom | AML Compute | None | Azure ML | None |
|
||||
| :star:[How to use Pipeline Drafts to create a Published Pipeline](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-how-to-use-pipeline-drafts.ipynb) | Demonstrates the use of Pipeline Drafts | Custom | AML Compute | None | Azure ML | None |
|
||||
| :star:[Azure Machine Learning Pipeline with HyperDriveStep](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-parameter-tuning-with-hyperdrive.ipynb) | Demonstrates the use of HyperDriveStep | Custom | AML Compute | None | Azure ML | None |
|
||||
| :star:[How to Publish a Pipeline and Invoke the REST endpoint](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-publish-and-run-using-rest-endpoint.ipynb) | Demonstrates the use of Published Pipelines | Custom | AML Compute | None | Azure ML | None |
|
||||
| :star:[How to Setup a Schedule for a Published Pipeline or Pipeline Endpoint](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-setup-schedule-for-a-published-pipeline.ipynb) | Demonstrates the use of Schedules for Published Pipelines and Pipeline endpoints | Custom | AML Compute | None | Azure ML | None |
|
||||
| [How to setup a versioned Pipeline Endpoint](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-setup-versioned-pipeline-endpoints.ipynb) | Demonstrates the use of PipelineEndpoint to run a specific version of the Published Pipeline | Custom | AML Compute | None | Azure ML | None |
|
||||
| :star:[How to use DataPath as a PipelineParameter](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-showcasing-datapath-and-pipelineparameter.ipynb) | Demonstrates the use of DataPath as a PipelineParameter | Custom | AML Compute | None | Azure ML | None |
|
||||
| :star:[How to use Dataset as a PipelineParameter](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-showcasing-dataset-and-pipelineparameter.ipynb) | Demonstrates the use of Dataset as a PipelineParameter | Custom | AML Compute | None | Azure ML | None |
|
||||
| [How to use AdlaStep with AML Pipelines](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-use-adla-as-compute-target.ipynb) | Demonstrates the use of AdlaStep | Custom | Azure Data Lake Analytics | None | Azure ML | None |
|
||||
| :star:[How to use DatabricksStep with AML Pipelines](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-use-databricks-as-compute-target.ipynb) | Demonstrates the use of DatabricksStep | Custom | Azure Databricks | None | Azure ML, Azure Databricks | None |
|
||||
| :star:[How to use KustoStep with AML Pipelines](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-use-kusto-as-compute-target.ipynb) | Demonstrates the use of KustoStep | Custom | Kusto | None | Azure ML, Kusto | None |
|
||||
| :star:[How to use AutoMLStep with AML Pipelines](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-with-automated-machine-learning-step.ipynb) | Demonstrates the use of AutoMLStep | Custom | AML Compute | None | Automated Machine Learning | None |
|
||||
| [Azure Machine Learning Pipeline with CommandStep for R](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-with-commandstep-r.ipynb) | Demonstrates the use of CommandStep for running R scripts | Custom | AML Compute | None | Azure ML | None |
|
||||
| [Azure Machine Learning Pipeline with CommandStep](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-with-commandstep.ipynb) | Demonstrates the use of CommandStep | Custom | AML Compute | None | Azure ML | None |
|
||||
| :star:[Azure Machine Learning Pipelines with Data Dependency](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-with-data-dependency-steps.ipynb) | Demonstrates how to construct a Pipeline with data dependency between steps | Custom | AML Compute | None | Azure ML | None |
|
||||
| [How to use run a notebook as a step in AML Pipelines](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-with-notebook-runner-step.ipynb) | Demonstrates the use of NotebookRunnerStep | Custom | AML Compute | None | Azure ML | None |
|
||||
| [Use MLflow with Azure Machine Learning to Train and Deploy Keras Image Classifier](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/using-mlflow/train-and-deploy-keras-auto-logging/train-and-deploy-keras-auto-logging.ipynb) | Use MLflow with Azure Machine Learning to Train and Deploy Keras Image Classifier, leveraging MLflow auto logging | MNIST | Local, AML Compute | Azure Container Instance | Keras | mlflow, keras |
|
||||
| [Use MLflow with Azure Machine Learning to Train and Deploy PyTorch Image Classifier](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/using-mlflow/train-and-deploy-pytorch/train-and-deploy-pytorch.ipynb) | Use MLflow with Azure Machine Learning to train and deploy PyTorch image classifier model | MNIST | Local, AML Compute | Azure Container Instance | PyTorch | mlflow, pytorch |
|
||||
| [Use MLflow projects with Azure Machine Learning to train a model with local compute](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/track-and-monitor-experiments/using-mlflow/train-projects-local/train-projects-local.ipynb) | Use MLflow projects with Azure Machine Learning to train a model using local compute | | Local | | ScikitLearn | mlflow, scikit |
|
||||
| [Use MLflow projects with Azure Machine Learning to train a model](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/track-and-monitor-experiments/using-mlflow/train-projects-remote/train-projects-remote.ipynb) | Use MLflow projects with Azure Machine Learning to train a model using azureml compute | | AML Compute | | Scikit | mlflow, scikit |
|
||||
| [How to use ScriptRun with data input and output](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/datasets-tutorial/scriptrun-with-data-input-output/how-to-use-scriptrun.ipynb) | Demonstrates the use of Scriptrun with datasets | Custom | AML Compute | None | Azure ML | Dataset, ScriptRun |
|
||||
| [Forecasting BikeShare Demand](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/automated-machine-learning/forecasting-bike-share/auto-ml-forecasting-bike-share.ipynb) | Forecasting | BikeShare | Remote | None | Azure ML AutoML | Forecasting |
|
||||
| [Forecasting orange juice sales with deployment](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/automated-machine-learning/forecasting-orange-juice-sales/auto-ml-forecasting-orange-juice-sales.ipynb) | Forecasting | Orange Juice Sales | Remote | Azure Container Instance | Azure ML AutoML | None |
|
||||
| [Forecasting orange juice sales with deployment](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/automated-machine-learning/forecasting-pipelines/auto-ml-forecasting-pipelines.ipynb) | Forecasting | Orange Juice Sales | Remote | Azure Container Instance | Azure ML AutoML | None |
|
||||
| [Register a model and deploy locally](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/deployment/deploy-to-local/register-model-deploy-local.ipynb) | Deployment | None | Local | Local | None | None |
|
||||
| :star:[Data drift quickdemo](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/work-with-data/datadrift-tutorial/datadrift-tutorial.ipynb) | Filtering | NOAA | Remote | None | Azure ML | Dataset, Timeseries, Drift |
|
||||
| :star:[Datasets with ML Pipeline](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/work-with-data/datasets-tutorial/pipeline-with-datasets/pipeline-for-image-classification.ipynb) | Train | Fashion MNIST | Remote | None | Azure ML | Dataset, Pipeline, Estimator, ScriptRun |
|
||||
| :star:[Filtering data using Tabular Timeseiries Dataset related API](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/work-with-data/datasets-tutorial/timeseries-datasets/tabular-timeseries-dataset-filtering.ipynb) | Filtering | NOAA | Local | None | Azure ML | Dataset, Tabular Timeseries |
|
||||
| :star:[Train with Datasets (Tabular and File)](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/work-with-data/datasets-tutorial/train-with-datasets/train-with-datasets.ipynb) | Train | Iris, Diabetes | Remote | None | Azure ML | Dataset, Estimator, ScriptRun |
|
||||
| [Forecasting away from training data](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/automated-machine-learning/forecasting-forecast-function/auto-ml-forecasting-function.ipynb) | Forecasting | None | Remote | None | Azure ML AutoML | Forecasting, Confidence Intervals |
|
||||
| [Automated ML run with basic edition features.](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/automated-machine-learning/classification-bank-marketing-all-features/auto-ml-classification-bank-marketing-all-features.ipynb) | Classification | Bankmarketing | AML | ACI | None | featurization, explainability, remote_run, AutomatedML |
|
||||
| [Classification of credit card fraudulent transactions using Automated ML](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/automated-machine-learning/classification-credit-card-fraud/auto-ml-classification-credit-card-fraud.ipynb) | Classification | Creditcard | AML Compute | None | None | remote_run, AutomatedML |
|
||||
| [Classification of credit card fraudulent transactions using Automated ML](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/automated-machine-learning/experimental/classification-credit-card-fraud-local-managed/auto-ml-classification-credit-card-fraud-local-managed.ipynb) | Classification | Creditcard | AML Compute | None | None | AutomatedML |
|
||||
| [Automated ML run with featurization and model explainability.](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/automated-machine-learning/regression-explanation-featurization/auto-ml-regression-explanation-featurization.ipynb) | Regression | MachineData | AML | ACI | None | featurization, explainability, remote_run, AutomatedML |
|
||||
| [auto-ml-forecasting-backtest-single-model](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/automated-machine-learning/forecasting-backtest-single-model/auto-ml-forecasting-backtest-single-model.ipynb) | | None | Remote | None | Azure ML AutoML | |
|
||||
| :star:[Azure Machine Learning Pipeline with DataTranferStep](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-data-transfer.ipynb) | Demonstrates the use of DataTranferStep | Custom | ADF | None | Azure ML | None |
|
||||
| [Getting Started with Azure Machine Learning Pipelines](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-getting-started.ipynb) | Getting Started notebook for AML Pipelines | Custom | AML Compute | None | Azure ML | None |
|
||||
| [Azure Machine Learning Pipeline with AzureBatchStep](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-how-to-use-azurebatch-to-run-a-windows-executable.ipynb) | Demonstrates the use of AzureBatchStep | Custom | Azure Batch | None | Azure ML | None |
|
||||
| :star:[How to use ModuleStep with AML Pipelines](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-how-to-use-modulestep.ipynb) | Demonstrates the use of ModuleStep | Custom | AML Compute | None | Azure ML | None |
|
||||
| :star:[How to use Pipeline Drafts to create a Published Pipeline](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-how-to-use-pipeline-drafts.ipynb) | Demonstrates the use of Pipeline Drafts | Custom | AML Compute | None | Azure ML | None |
|
||||
| :star:[Azure Machine Learning Pipeline with HyperDriveStep](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-parameter-tuning-with-hyperdrive.ipynb) | Demonstrates the use of HyperDriveStep | Custom | AML Compute | None | Azure ML | None |
|
||||
| :star:[How to Publish a Pipeline and Invoke the REST endpoint](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-publish-and-run-using-rest-endpoint.ipynb) | Demonstrates the use of Published Pipelines | Custom | AML Compute | None | Azure ML | None |
|
||||
| :star:[How to Setup a Schedule for a Published Pipeline or Pipeline Endpoint](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-setup-schedule-for-a-published-pipeline.ipynb) | Demonstrates the use of Schedules for Published Pipelines and Pipeline endpoints | Custom | AML Compute | None | Azure ML | None |
|
||||
| [How to setup a versioned Pipeline Endpoint](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-setup-versioned-pipeline-endpoints.ipynb) | Demonstrates the use of PipelineEndpoint to run a specific version of the Published Pipeline | Custom | AML Compute | None | Azure ML | None |
|
||||
| :star:[How to use DataPath as a PipelineParameter](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-showcasing-datapath-and-pipelineparameter.ipynb) | Demonstrates the use of DataPath as a PipelineParameter | Custom | AML Compute | None | Azure ML | None |
|
||||
| :star:[How to use Dataset as a PipelineParameter](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-showcasing-dataset-and-pipelineparameter.ipynb) | Demonstrates the use of Dataset as a PipelineParameter | Custom | AML Compute | None | Azure ML | None |
|
||||
| [How to use AdlaStep with AML Pipelines](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-use-adla-as-compute-target.ipynb) | Demonstrates the use of AdlaStep | Custom | Azure Data Lake Analytics | None | Azure ML | None |
|
||||
| :star:[How to use DatabricksStep with AML Pipelines](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-use-databricks-as-compute-target.ipynb) | Demonstrates the use of DatabricksStep | Custom | Azure Databricks | None | Azure ML, Azure Databricks | None |
|
||||
| :star:[How to use KustoStep with AML Pipelines](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-use-kusto-as-compute-target.ipynb) | Demonstrates the use of KustoStep | Custom | Kusto | None | Azure ML, Kusto | None |
|
||||
| :star:[How to use AutoMLStep with AML Pipelines](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-with-automated-machine-learning-step.ipynb) | Demonstrates the use of AutoMLStep | Custom | AML Compute | None | Automated Machine Learning | None |
|
||||
| [Azure Machine Learning Pipeline with CommandStep for R](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-with-commandstep-r.ipynb) | Demonstrates the use of CommandStep for running R scripts | Custom | AML Compute | None | Azure ML | None |
|
||||
| [Azure Machine Learning Pipeline with CommandStep](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-with-commandstep.ipynb) | Demonstrates the use of CommandStep | Custom | AML Compute | None | Azure ML | None |
|
||||
| :star:[Azure Machine Learning Pipelines with Data Dependency](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-with-data-dependency-steps.ipynb) | Demonstrates how to construct a Pipeline with data dependency between steps | Custom | AML Compute | None | Azure ML | None |
|
||||
| [How to use run a notebook as a step in AML Pipelines](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-with-notebook-runner-step.ipynb) | Demonstrates the use of NotebookRunnerStep | Custom | AML Compute | None | Azure ML | None |
|
||||
| [Use MLflow with Azure Machine Learning to Train and Deploy Keras Image Classifier](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/ml-frameworks/using-mlflow/train-and-deploy-keras-auto-logging/train-and-deploy-keras-auto-logging.ipynb) | Use MLflow with Azure Machine Learning to Train and Deploy Keras Image Classifier, leveraging MLflow auto logging | MNIST | Local, AML Compute | Azure Container Instance | Keras | mlflow, keras |
|
||||
| [Use MLflow with Azure Machine Learning to Train and Deploy PyTorch Image Classifier](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/ml-frameworks/using-mlflow/train-and-deploy-pytorch/train-and-deploy-pytorch.ipynb) | Use MLflow with Azure Machine Learning to train and deploy PyTorch image classifier model | MNIST | Local, AML Compute | Azure Container Instance | PyTorch | mlflow, pytorch |
|
||||
| [Use MLflow projects with Azure Machine Learning to train a model with local compute](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/track-and-monitor-experiments/using-mlflow/train-projects-local/train-projects-local.ipynb) | Use MLflow projects with Azure Machine Learning to train a model using local compute | | Local | | ScikitLearn | mlflow, scikit |
|
||||
| [Use MLflow projects with Azure Machine Learning to train a model](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/track-and-monitor-experiments/using-mlflow/train-projects-remote/train-projects-remote.ipynb) | Use MLflow projects with Azure Machine Learning to train a model using azureml compute | | AML Compute | | Scikit | mlflow, scikit |
|
||||
| [How to use ScriptRun with data input and output](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/work-with-data/datasets-tutorial/scriptrun-with-data-input-output/how-to-use-scriptrun.ipynb) | Demonstrates the use of Scriptrun with datasets | Custom | AML Compute | None | Azure ML | Dataset, ScriptRun |
|
||||
|
||||
## Training
|
||||
|
||||
|Title| Task | Dataset | Training Compute | Deployment Target | ML Framework | Tags |
|
||||
|:----|:-----|:-------:|:----------------:|:-----------------:|:------------:|:------------:|
|
||||
| [Distributed Training with Chainer](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/chainer/distributed-chainer/distributed-chainer.ipynb) | Use the Chainer estimator to perform distributed training | MNIST | AML Compute | None | Chainer | None |
|
||||
| [Train a model with hyperparameter tuning](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/chainer/train-hyperparameter-tune-deploy-with-chainer/train-hyperparameter-tune-deploy-with-chainer.ipynb) | Train a Convolutional Neural Network (CNN) | MNIST | AML Compute | Azure Container Instance | Chainer | None |
|
||||
| [Train a model with a custom Docker image](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/fastai/fastai-with-custom-docker/fastai-with-custom-docker.ipynb) | Train with custom Docker image | Oxford IIIT Pet | AML Compute | None | Pytorch | None |
|
||||
| [Train a DNN using hyperparameter tuning and deploying with Keras](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/keras/train-hyperparameter-tune-deploy-with-keras/train-hyperparameter-tune-deploy-with-keras.ipynb) | Create a multi-class classifier | MNIST | AML Compute | Azure Container Instance | TensorFlow | None |
|
||||
| [Distributed training with PyTorch](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/pytorch/distributed-pytorch-with-distributeddataparallel/distributed-pytorch-with-distributeddataparallel.ipynb) | Train a model using distributed training via PyTorch DistributedDataParallel | CIFAR-10 | AML Compute | None | PyTorch | None |
|
||||
| [Distributed PyTorch](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/pytorch/distributed-pytorch-with-horovod/distributed-pytorch-with-horovod.ipynb) | Train a model using the distributed training via Horovod | MNIST | AML Compute | None | PyTorch | None |
|
||||
| [Training with hyperparameter tuning using PyTorch](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/pytorch/train-hyperparameter-tune-deploy-with-pytorch/train-hyperparameter-tune-deploy-with-pytorch.ipynb) | Train an image classification model using transfer learning with the PyTorch estimator | ImageNet | AML Compute | Azure Container Instance | PyTorch | None |
|
||||
| [Training and hyperparameter tuning with Scikit-learn](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/scikit-learn/train-hyperparameter-tune-deploy-with-sklearn/train-hyperparameter-tune-deploy-with-sklearn.ipynb) | Train a support vector machine (SVM) to perform classification | Iris | AML Compute | None | Scikit-learn | None |
|
||||
| [Distributed training using TensorFlow with Horovod](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/tensorflow/distributed-tensorflow-with-horovod/distributed-tensorflow-with-horovod.ipynb) | Use the TensorFlow estimator to train a word2vec model | None | AML Compute | None | TensorFlow | None |
|
||||
| [Hyperparameter tuning and warm start using the TensorFlow estimator](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/tensorflow/hyperparameter-tune-and-warm-start-with-tensorflow/hyperparameter-tune-and-warm-start-with-tensorflow.ipynb) | Train a deep neural network | MNIST | AML Compute | Azure Container Instance | TensorFlow | None |
|
||||
| [Training and hyperparameter tuning using the TensorFlow estimator](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/tensorflow/train-hyperparameter-tune-deploy-with-tensorflow/train-hyperparameter-tune-deploy-with-tensorflow.ipynb) | Train a deep neural network | MNIST | AML Compute | Azure Container Instance | TensorFlow | None |
|
||||
| [Resuming a model](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/tensorflow/train-tensorflow-resume-training/train-tensorflow-resume-training.ipynb) | Resume a model in TensorFlow from a previously submitted run | MNIST | AML Compute | None | TensorFlow | None |
|
||||
| [Using Tensorboard](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/track-and-monitor-experiments/tensorboard/export-run-history-to-tensorboard/export-run-history-to-tensorboard.ipynb) | Export the run history as Tensorboard logs | None | None | None | TensorFlow | None |
|
||||
| [Training in Spark](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/training/train-in-spark/train-in-spark.ipynb) | Submitting a run on a spark cluster | None | HDI cluster | None | PySpark | None |
|
||||
| [Train on Azure Machine Learning Compute](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/training/train-on-amlcompute/train-on-amlcompute.ipynb) | Submit a run on Azure Machine Learning Compute. | Diabetes | AML Compute | None | None | None |
|
||||
| [Train on local compute](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/training/train-on-local/train-on-local.ipynb) | Train a model locally | Diabetes | Local | None | None | None |
|
||||
| [Train in a remote Linux virtual machine](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/training/train-on-remote-vm/train-on-remote-vm.ipynb) | Configure and execute a run | Diabetes | Data Science Virtual Machine | None | None | None |
|
||||
| [Managing your training runs](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/track-and-monitor-experiments/manage-runs/manage-runs.ipynb) | Monitor and complete runs | None | Local | None | None | None |
|
||||
| [Tensorboard integration with run history](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/track-and-monitor-experiments/tensorboard/tensorboard/tensorboard.ipynb) | Run a TensorFlow job and view its Tensorboard output live | None | Local, DSVM, AML Compute | None | TensorFlow | None |
|
||||
| [Use MLflow with AML for a local training run](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/track-and-monitor-experiments/using-mlflow/train-local/train-local.ipynb) | Use MLflow tracking APIs together with Azure Machine Learning for storing your metrics and artifacts | Diabetes | Local | None | None | None |
|
||||
| [Use MLflow with AML for a remote training run](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/track-and-monitor-experiments/using-mlflow/train-remote/train-remote.ipynb) | Use MLflow tracking APIs together with AML for storing your metrics and artifacts | Diabetes | AML Compute | None | None | None |
|
||||
| [Train a model with a custom Docker image](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/ml-frameworks/fastai/fastai-with-custom-docker/fastai-with-custom-docker.ipynb) | Train with custom Docker image | Oxford IIIT Pet | AML Compute | None | Pytorch | None |
|
||||
| [Train a DNN using hyperparameter tuning and deploying with Keras](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/ml-frameworks/keras/train-hyperparameter-tune-deploy-with-keras/train-hyperparameter-tune-deploy-with-keras.ipynb) | Create a multi-class classifier | MNIST | AML Compute | Azure Container Instance | TensorFlow | None |
|
||||
| [Distributed training with PyTorch](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/ml-frameworks/pytorch/distributed-pytorch-with-distributeddataparallel/distributed-pytorch-with-distributeddataparallel.ipynb) | Train a model using distributed training via PyTorch DistributedDataParallel | CIFAR-10 | AML Compute | None | PyTorch | None |
|
||||
| [Distributed PyTorch](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/ml-frameworks/pytorch/distributed-pytorch-with-horovod/distributed-pytorch-with-horovod.ipynb) | Train a model using the distributed training via Horovod | MNIST | AML Compute | None | PyTorch | None |
|
||||
| [Training with hyperparameter tuning using PyTorch](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/ml-frameworks/pytorch/train-hyperparameter-tune-deploy-with-pytorch/train-hyperparameter-tune-deploy-with-pytorch.ipynb) | Train an image classification model using transfer learning with the PyTorch estimator | ImageNet | AML Compute | Azure Container Instance | PyTorch | None |
|
||||
| [Training and hyperparameter tuning with Scikit-learn](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/ml-frameworks/scikit-learn/train-hyperparameter-tune-deploy-with-sklearn/train-hyperparameter-tune-deploy-with-sklearn.ipynb) | Train a support vector machine (SVM) to perform classification | Iris | AML Compute | None | Scikit-learn | None |
|
||||
| [Distributed training using TensorFlow with Horovod](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/ml-frameworks/tensorflow/distributed-tensorflow-with-horovod/distributed-tensorflow-with-horovod.ipynb) | Use the TensorFlow estimator to train a word2vec model | None | AML Compute | None | TensorFlow | None |
|
||||
| [Hyperparameter tuning and warm start using the TensorFlow estimator](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/ml-frameworks/tensorflow/hyperparameter-tune-and-warm-start-with-tensorflow/hyperparameter-tune-and-warm-start-with-tensorflow.ipynb) | Train a deep neural network | MNIST | AML Compute | Azure Container Instance | TensorFlow | None |
|
||||
| [Training and hyperparameter tuning using the TensorFlow estimator](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/ml-frameworks/tensorflow/train-hyperparameter-tune-deploy-with-tensorflow/train-hyperparameter-tune-deploy-with-tensorflow.ipynb) | Train a deep neural network | MNIST | AML Compute | Azure Container Instance | TensorFlow | None |
|
||||
| [Resuming a model](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/ml-frameworks/tensorflow/train-tensorflow-resume-training/train-tensorflow-resume-training.ipynb) | Resume a model in TensorFlow from a previously submitted run | MNIST | AML Compute | None | TensorFlow | None |
|
||||
| [Using Tensorboard](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/track-and-monitor-experiments/tensorboard/export-run-history-to-tensorboard/export-run-history-to-tensorboard.ipynb) | Export the run history as Tensorboard logs | None | None | None | TensorFlow | None |
|
||||
| [Training in Spark](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/training/train-in-spark/train-in-spark.ipynb) | Submitting a run on a spark cluster | None | HDI cluster | None | PySpark | None |
|
||||
| [Train on Azure Machine Learning Compute](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/training/train-on-amlcompute/train-on-amlcompute.ipynb) | Submit a run on Azure Machine Learning Compute. | Diabetes | AML Compute | None | None | None |
|
||||
| [Train on local compute](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/training/train-on-local/train-on-local.ipynb) | Train a model locally | Diabetes | Local | None | None | None |
|
||||
| [Train in a remote Linux virtual machine](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/training/train-on-remote-vm/train-on-remote-vm.ipynb) | Configure and execute a run | Diabetes | Data Science Virtual Machine | None | None | None |
|
||||
| [Managing your training runs](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/track-and-monitor-experiments/manage-runs/manage-runs.ipynb) | Monitor and complete runs | None | Local | None | None | None |
|
||||
| [Tensorboard integration with run history](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/track-and-monitor-experiments/tensorboard/tensorboard/tensorboard.ipynb) | Run a TensorFlow job and view its Tensorboard output live | None | Local, DSVM, AML Compute | None | TensorFlow | None |
|
||||
| [Use MLflow with AML for a local training run](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/track-and-monitor-experiments/using-mlflow/train-local/train-local.ipynb) | Use MLflow tracking APIs together with Azure Machine Learning for storing your metrics and artifacts | Diabetes | Local | None | None | None |
|
||||
| [Use MLflow with AML for a remote training run](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/track-and-monitor-experiments/using-mlflow/train-remote/train-remote.ipynb) | Use MLflow tracking APIs together with AML for storing your metrics and artifacts | Diabetes | AML Compute | None | None | None |
|
||||
|
||||
|
||||
## Deployment
|
||||
@@ -86,70 +84,68 @@ Machine Learning notebook samples and encourage efficient retrieval of topics an
|
||||
|
||||
|Title| Task | Dataset | Training Compute | Deployment Target | ML Framework | Tags |
|
||||
|:----|:-----|:-------:|:----------------:|:-----------------:|:------------:|:------------:|
|
||||
| [Deploy MNIST digit recognition with ONNX Runtime](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/onnx/onnx-inference-mnist-deploy.ipynb) | Image Classification | MNIST | Local | Azure Container Instance | ONNX | ONNX Model Zoo |
|
||||
| [Deploy Facial Expression Recognition (FER+) with ONNX Runtime](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/onnx/onnx-inference-facial-expression-recognition-deploy.ipynb) | Facial Expression Recognition | Emotion FER | Local | Azure Container Instance | ONNX | ONNX Model Zoo |
|
||||
| :star:[Register model and deploy as webservice](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/deploy-to-cloud/model-register-and-deploy.ipynb) | Deploy a model with Azure Machine Learning | Diabetes | None | Azure Container Instance | Scikit-learn | None |
|
||||
| :star:[Deploy models to AKS using controlled roll out](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/deploy-with-controlled-rollout/deploy-aks-with-controlled-rollout.ipynb) | Deploy a model with Azure Machine Learning | Diabetes | None | Azure Kubernetes Service | Scikit-learn | None |
|
||||
| [Train MNIST in PyTorch, convert, and deploy with ONNX Runtime](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/onnx/onnx-train-pytorch-aml-deploy-mnist.ipynb) | Image Classification | MNIST | AML Compute | Azure Container Instance | ONNX | ONNX Converter |
|
||||
| [Deploy ResNet50 with ONNX Runtime](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/onnx/onnx-modelzoo-aml-deploy-resnet50.ipynb) | Image Classification | ImageNet | Local | Azure Container Instance | ONNX | ONNX Model Zoo |
|
||||
| :star:[Convert and deploy TinyYolo with ONNX Runtime](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/onnx/onnx-convert-aml-deploy-tinyyolo.ipynb) | Object Detection | PASCAL VOC | local | Azure Container Instance | ONNX | ONNX Converter |
|
||||
| [Register Spark model and deploy as webservice](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/spark/model-register-and-deploy-spark.ipynb) | | Iris | None | Azure Container Instance | PySpark | |
|
||||
| [Deploy MNIST digit recognition with ONNX Runtime](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/deployment/onnx/onnx-inference-mnist-deploy.ipynb) | Image Classification | MNIST | Local | Azure Container Instance | ONNX | ONNX Model Zoo |
|
||||
| [Deploy Facial Expression Recognition (FER+) with ONNX Runtime](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/deployment/onnx/onnx-inference-facial-expression-recognition-deploy.ipynb) | Facial Expression Recognition | Emotion FER | Local | Azure Container Instance | ONNX | ONNX Model Zoo |
|
||||
| :star:[Register model and deploy as webservice](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/deployment/deploy-to-cloud/model-register-and-deploy.ipynb) | Deploy a model with Azure Machine Learning | Diabetes | None | Azure Container Instance | Scikit-learn | None |
|
||||
| [Train MNIST in PyTorch, convert, and deploy with ONNX Runtime](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/deployment/onnx/onnx-train-pytorch-aml-deploy-mnist.ipynb) | Image Classification | MNIST | AML Compute | Azure Container Instance | ONNX | ONNX Converter |
|
||||
| [Deploy ResNet50 with ONNX Runtime](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/deployment/onnx/onnx-modelzoo-aml-deploy-resnet50.ipynb) | Image Classification | ImageNet | Local | Azure Container Instance | ONNX | ONNX Model Zoo |
|
||||
| :star:[Convert and deploy TinyYolo with ONNX Runtime](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/deployment/onnx/onnx-convert-aml-deploy-tinyyolo.ipynb) | Object Detection | PASCAL VOC | local | Azure Container Instance | ONNX | ONNX Converter |
|
||||
| [Register Spark model and deploy as webservice](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/deployment/spark/model-register-and-deploy-spark.ipynb) | | Iris | None | Azure Container Instance | PySpark | |
|
||||
|
||||
|
||||
## Other Notebooks
|
||||
|Title| Task | Dataset | Training Compute | Deployment Target | ML Framework | Tags |
|
||||
|:----|:-----|:-------:|:----------------:|:-----------------:|:------------:|:------------:|
|
||||
| [DNN Text Featurization](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/classification-text-dnn/auto-ml-classification-text-dnn.ipynb) | Text featurization using DNNs for classification | None | AML Compute | None | None | None |
|
||||
| [DNN Text Featurization](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/automated-machine-learning/classification-text-dnn/auto-ml-classification-text-dnn.ipynb) | Text featurization using DNNs for classification | None | AML Compute | None | None | None |
|
||||
| [configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) | | | | | | |
|
||||
| [fairlearn-azureml-mitigation](https://github.com/Azure/MachineLearningNotebooks/blob/master//contrib/fairness/fairlearn-azureml-mitigation.ipynb) | | | | | | |
|
||||
| [upload-fairness-dashboard](https://github.com/Azure/MachineLearningNotebooks/blob/master//contrib/fairness/upload-fairness-dashboard.ipynb) | | | | | | |
|
||||
| [azure-ml-with-nvidia-rapids](https://github.com/Azure/MachineLearningNotebooks/blob/master//contrib/RAPIDS/azure-ml-with-nvidia-rapids.ipynb) | | | | | | |
| [auto-ml-continuous-retraining](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/continuous-retraining/auto-ml-continuous-retraining.ipynb) | | | | | | |
| [codegen-for-autofeaturization](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/experimental/autofeaturization-codegen/codegen-for-autofeaturization.ipynb) | | | | | | |
| [custom-model-training-from-autofeaturization-run](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/experimental/autofeaturization-custom-model-training/custom-model-training-from-autofeaturization-run.ipynb) | | | | | | |
| [auto-ml-regression-model-proxy](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/experimental/regression-model-proxy/auto-ml-regression-model-proxy.ipynb) | | | | | | |
| [auto-ml-forecasting-backtest-many-models](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/forecasting-backtest-many-models/auto-ml-forecasting-backtest-many-models.ipynb) | | | | | | |
| [auto-ml-forecasting-energy-demand](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/forecasting-energy-demand/auto-ml-forecasting-energy-demand.ipynb) | | | | | | |
| [auto-ml-forecasting-github-dau](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/forecasting-github-dau/auto-ml-forecasting-github-dau.ipynb) | | | | | | |
| [auto-ml-forecasting-hierarchical-timeseries](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/forecasting-hierarchical-timeseries/auto-ml-forecasting-hierarchical-timeseries.ipynb) | | | | | | |
| [auto-ml-forecasting-many-models](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/forecasting-many-models/auto-ml-forecasting-many-models.ipynb) | | | | | | |
| [auto-ml-forecasting-univariate-recipe-experiment-settings](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/forecasting-recipes-univariate/auto-ml-forecasting-univariate-recipe-experiment-settings.ipynb) | | | | | | |
| [auto-ml-forecasting-univariate-recipe-run-experiment](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/forecasting-recipes-univariate/auto-ml-forecasting-univariate-recipe-run-experiment.ipynb) | | | | | | |
| [auto-ml-regression](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/regression/auto-ml-regression.ipynb) | | | | | | |
| [automl-databricks-local-01](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/azure-databricks/automl/automl-databricks-local-01.ipynb) | | | | | | |
| [automl-databricks-local-with-deployment](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/azure-databricks/automl/automl-databricks-local-with-deployment.ipynb) | | | | | | |
| [spark_job_on_synapse_spark_pool](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/azure-synapse/spark_job_on_synapse_spark_pool.ipynb) | | | | | | |
| [spark_session_on_synapse_spark_pool](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/azure-synapse/spark_session_on_synapse_spark_pool.ipynb) | | | | | | |
| [Synapse_Job_Scala_Support](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/azure-synapse/Synapse_Job_Scala_Support.ipynb) | | | | | | |
| [Synapse_Session_Scala_Support](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/azure-synapse/Synapse_Session_Scala_Support.ipynb) | | | | | | |
| [multi-model-register-and-deploy](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/deploy-multi-model/multi-model-register-and-deploy.ipynb) | | | | | | |
| [register-model-deploy-local-advanced](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/deploy-to-local/register-model-deploy-local-advanced.ipynb) | | | | | | |
| [enable-app-insights-in-production-service](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/enable-app-insights-in-production-service/enable-app-insights-in-production-service.ipynb) | | | | | | |
| [onnx-model-register-and-deploy](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/onnx/onnx-model-register-and-deploy.ipynb) | | | | | | |
| [production-deploy-to-aks-ssl](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/production-deploy-to-aks/production-deploy-to-aks-ssl.ipynb) | | | | | | |
| [production-deploy-to-aks](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/production-deploy-to-aks/production-deploy-to-aks.ipynb) | | | | | | |
| [production-deploy-to-aks-gpu](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/production-deploy-to-aks-gpu/production-deploy-to-aks-gpu.ipynb) | | | | | | |
| [train-explain-model-gpu-tree-explainer](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/explain-model/azure-integration/gpu-explanation/train-explain-model-gpu-tree-explainer.ipynb) | | | | | | |
| [explain-model-on-amlcompute](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/explain-model/azure-integration/remote-explanation/explain-model-on-amlcompute.ipynb) | | | | | | |
| [save-retrieve-explanations-run-history](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/explain-model/azure-integration/run-history/save-retrieve-explanations-run-history.ipynb) | | | | | | |
| [train-explain-model-locally-and-deploy](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-locally-and-deploy.ipynb) | | | | | | |
| [train-explain-model-on-amlcompute-and-deploy](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-on-amlcompute-and-deploy.ipynb) | | | | | | |
| [training_notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/notebook_runner/training_notebook.ipynb) | | | | | | |
| [nyc-taxi-data-regression-model-building](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/nyc-taxi-data-regression-model-building.ipynb) | | | | | | |
| [authentication-in-azureml](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/manage-azureml-service/authentication-in-azureml/authentication-in-azureml.ipynb) | | | | | | |
| [pong_rllib](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/pong_rllib.ipynb) | | | | | | |
| [cartpole_ci](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/cartpole_ci.ipynb) | | | | | | |
| [cartpole_sc](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/cartpole_sc.ipynb) | | | | | | |
| [particle](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/reinforcement-learning/multiagent-particle-envs/particle.ipynb) | | | | | | |
| [rai-loan-decision](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/responsible-ai/visualize-upload-loan-decision/rai-loan-decision.ipynb) | | | | | | |
| [Logging APIs](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/track-and-monitor-experiments/logging-api/logging-api.ipynb) | Logging APIs and analyzing results | None | None | None | None | None |
| [configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master//setup-environment/configuration.ipynb) | | | | | | |
| [quickstart-azureml-automl](https://github.com/Azure/MachineLearningNotebooks/blob/master//tutorials/compute-instance-quickstarts/quickstart-azureml-automl/quickstart-azureml-automl.ipynb) | | | | | | |
| [quickstart-azureml-in-10mins](https://github.com/Azure/MachineLearningNotebooks/blob/master//tutorials/compute-instance-quickstarts/quickstart-azureml-in-10mins/quickstart-azureml-in-10mins.ipynb) | | | | | | |
| [quickstart-azureml-python-sdk](https://github.com/Azure/MachineLearningNotebooks/blob/master//tutorials/compute-instance-quickstarts/quickstart-azureml-python-sdk/quickstart-azureml-python-sdk.ipynb) | | | | | | |
| [tutorial-1st-experiment-sdk-train](https://github.com/Azure/MachineLearningNotebooks/blob/master//tutorials/create-first-ml-experiment/tutorial-1st-experiment-sdk-train.ipynb) | | | | | | |
| [img-classification-part1-training](https://github.com/Azure/MachineLearningNotebooks/blob/master//tutorials/image-classification-mnist-data/img-classification-part1-training.ipynb) | | | | | | |
| [img-classification-part2-deploy](https://github.com/Azure/MachineLearningNotebooks/blob/master//tutorials/image-classification-mnist-data/img-classification-part2-deploy.ipynb) | | | | | | |
| [img-classification-part3-deploy-encrypted](https://github.com/Azure/MachineLearningNotebooks/blob/master//tutorials/image-classification-mnist-data/img-classification-part3-deploy-encrypted.ipynb) | | | | | | |
| [tutorial-pipeline-batch-scoring-classification](https://github.com/Azure/MachineLearningNotebooks/blob/master//tutorials/machine-learning-pipelines-advanced/tutorial-pipeline-batch-scoring-classification.ipynb) | | | | | | |
| [regression-automated-ml](https://github.com/Azure/MachineLearningNotebooks/blob/master//tutorials/regression-automl-nyc-taxi-data/regression-automated-ml.ipynb) | | | | | | |
| [fairlearn-azureml-mitigation](https://github.com/Azure/MachineLearningNotebooks/blob/master/contrib/fairness/fairlearn-azureml-mitigation.ipynb) | | | | | | |
| [upload-fairness-dashboard](https://github.com/Azure/MachineLearningNotebooks/blob/master/contrib/fairness/upload-fairness-dashboard.ipynb) | | | | | | |
| [azure-ml-with-nvidia-rapids](https://github.com/Azure/MachineLearningNotebooks/blob/master/contrib/RAPIDS/azure-ml-with-nvidia-rapids.ipynb) | | | | | | |
| [auto-ml-continuous-retraining](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/automated-machine-learning/continuous-retraining/auto-ml-continuous-retraining.ipynb) | | | | | | |
| [codegen-for-autofeaturization](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/automated-machine-learning/experimental/autofeaturization-codegen/codegen-for-autofeaturization.ipynb) | | | | | | |
| [custom-model-training-from-autofeaturization-run](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/automated-machine-learning/experimental/autofeaturization-custom-model-training/custom-model-training-from-autofeaturization-run.ipynb) | | | | | | |
| [auto-ml-regression-model-proxy](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/automated-machine-learning/experimental/regression-model-proxy/auto-ml-regression-model-proxy.ipynb) | | | | | | |
| [auto-ml-forecasting-backtest-many-models](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/automated-machine-learning/forecasting-backtest-many-models/auto-ml-forecasting-backtest-many-models.ipynb) | | | | | | |
| [auto-ml-forecasting-energy-demand](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand/auto-ml-forecasting-energy-demand.ipynb) | | | | | | |
| [auto-ml-forecasting-github-dau](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/automated-machine-learning/forecasting-github-dau/auto-ml-forecasting-github-dau.ipynb) | | | | | | |
| [auto-ml-forecasting-hierarchical-timeseries](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/automated-machine-learning/forecasting-hierarchical-timeseries/auto-ml-forecasting-hierarchical-timeseries.ipynb) | | | | | | |
| [auto-ml-forecasting-many-models](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/automated-machine-learning/forecasting-many-models/auto-ml-forecasting-many-models.ipynb) | | | | | | |
| [auto-ml-forecasting-univariate-recipe-experiment-settings](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/automated-machine-learning/forecasting-recipes-univariate/auto-ml-forecasting-univariate-recipe-experiment-settings.ipynb) | | | | | | |
| [auto-ml-forecasting-univariate-recipe-run-experiment](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/automated-machine-learning/forecasting-recipes-univariate/auto-ml-forecasting-univariate-recipe-run-experiment.ipynb) | | | | | | |
| [auto-ml-regression](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/automated-machine-learning/regression/auto-ml-regression.ipynb) | | | | | | |
| [automl-databricks-local-01](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/azure-databricks/automl/automl-databricks-local-01.ipynb) | | | | | | |
| [automl-databricks-local-with-deployment](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/azure-databricks/automl/automl-databricks-local-with-deployment.ipynb) | | | | | | |
| [spark_job_on_synapse_spark_pool](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/azure-synapse/spark_job_on_synapse_spark_pool.ipynb) | | | | | | |
| [spark_session_on_synapse_spark_pool](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/azure-synapse/spark_session_on_synapse_spark_pool.ipynb) | | | | | | |
| [Synapse_Job_Scala_Support](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/azure-synapse/Synapse_Job_Scala_Support.ipynb) | | | | | | |
| [Synapse_Session_Scala_Support](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/azure-synapse/Synapse_Session_Scala_Support.ipynb) | | | | | | |
| [multi-model-register-and-deploy](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/deployment/deploy-multi-model/multi-model-register-and-deploy.ipynb) | | | | | | |
| [register-model-deploy-local-advanced](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/deployment/deploy-to-local/register-model-deploy-local-advanced.ipynb) | | | | | | |
| [enable-app-insights-in-production-service](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/deployment/enable-app-insights-in-production-service/enable-app-insights-in-production-service.ipynb) | | | | | | |
| [onnx-model-register-and-deploy](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/deployment/onnx/onnx-model-register-and-deploy.ipynb) | | | | | | |
| [production-deploy-to-aks-ssl](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/deployment/production-deploy-to-aks/production-deploy-to-aks-ssl.ipynb) | | | | | | |
| [production-deploy-to-aks](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/deployment/production-deploy-to-aks/production-deploy-to-aks.ipynb) | | | | | | |
| [production-deploy-to-aks-gpu](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/deployment/production-deploy-to-aks-gpu/production-deploy-to-aks-gpu.ipynb) | | | | | | |
| [train-explain-model-gpu-tree-explainer](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/explain-model/azure-integration/gpu-explanation/train-explain-model-gpu-tree-explainer.ipynb) | | | | | | |
| [explain-model-on-amlcompute](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/explain-model/azure-integration/remote-explanation/explain-model-on-amlcompute.ipynb) | | | | | | |
| [save-retrieve-explanations-run-history](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/explain-model/azure-integration/run-history/save-retrieve-explanations-run-history.ipynb) | | | | | | |
| [train-explain-model-locally-and-deploy](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-locally-and-deploy.ipynb) | | | | | | |
| [train-explain-model-on-amlcompute-and-deploy](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-on-amlcompute-and-deploy.ipynb) | | | | | | |
| [training_notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/notebook_runner/training_notebook.ipynb) | | | | | | |
| [nyc-taxi-data-regression-model-building](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/nyc-taxi-data-regression-model-building.ipynb) | | | | | | |
| [authentication-in-azureml](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/manage-azureml-service/authentication-in-azureml/authentication-in-azureml.ipynb) | | | | | | |
| [pong_rllib](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/pong_rllib.ipynb) | | | | | | |
| [cartpole_ci](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/cartpole_ci.ipynb) | | | | | | |
| [cartpole_sc](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/cartpole_sc.ipynb) | | | | | | |
| [rai-loan-decision](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/responsible-ai/visualize-upload-loan-decision/rai-loan-decision.ipynb) | | | | | | |
| [Logging APIs](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/track-and-monitor-experiments/logging-api/logging-api.ipynb) | Logging APIs and analyzing results | None | None | None | None | None |
| [configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/setup-environment/configuration.ipynb) | | | | | | |
| [quickstart-azureml-automl](https://github.com/Azure/MachineLearningNotebooks/blob/master/tutorials/compute-instance-quickstarts/quickstart-azureml-automl/quickstart-azureml-automl.ipynb) | | | | | | |
| [quickstart-azureml-in-10mins](https://github.com/Azure/MachineLearningNotebooks/blob/master/tutorials/compute-instance-quickstarts/quickstart-azureml-in-10mins/quickstart-azureml-in-10mins.ipynb) | | | | | | |
| [quickstart-azureml-python-sdk](https://github.com/Azure/MachineLearningNotebooks/blob/master/tutorials/compute-instance-quickstarts/quickstart-azureml-python-sdk/quickstart-azureml-python-sdk.ipynb) | | | | | | |
| [tutorial-1st-experiment-sdk-train](https://github.com/Azure/MachineLearningNotebooks/blob/master/tutorials/create-first-ml-experiment/tutorial-1st-experiment-sdk-train.ipynb) | | | | | | |
| [img-classification-part1-training](https://github.com/Azure/MachineLearningNotebooks/blob/master/tutorials/image-classification-mnist-data/img-classification-part1-training.ipynb) | | | | | | |
| [img-classification-part2-deploy](https://github.com/Azure/MachineLearningNotebooks/blob/master/tutorials/image-classification-mnist-data/img-classification-part2-deploy.ipynb) | | | | | | |
| [img-classification-part3-deploy-encrypted](https://github.com/Azure/MachineLearningNotebooks/blob/master/tutorials/image-classification-mnist-data/img-classification-part3-deploy-encrypted.ipynb) | | | | | | |
| [tutorial-pipeline-batch-scoring-classification](https://github.com/Azure/MachineLearningNotebooks/blob/master/tutorials/machine-learning-pipelines-advanced/tutorial-pipeline-batch-scoring-classification.ipynb) | | | | | | |
| [regression-automated-ml](https://github.com/Azure/MachineLearningNotebooks/blob/master/tutorials/regression-automl-nyc-taxi-data/regression-automated-ml.ipynb) | | | | | | |
@@ -102,7 +102,7 @@
"source": [
"import azureml.core\n",
"\n",
"print(\"This notebook was created using version 1.48.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.51.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},
@@ -2,7 +2,7 @@ name: quickstart-azureml-in-10mins
dependencies:
- pip:
- azureml-sdk
- sklearn
- scikit-learn
- numpy
- matplotlib
- joblib
@@ -2,7 +2,7 @@ name: quickstart-azureml-python-sdk
dependencies:
- pip:
- azureml-sdk
- sklearn
- scikit-learn
- numpy
- matplotlib
- joblib
@@ -2,5 +2,5 @@ name: tutorial-1st-experiment-sdk-train
dependencies:
- pip:
- azureml-sdk
- sklearn
- scikit-learn
- azureml-opendatasets
@@ -431,7 +431,7 @@
"\n",
"# to install required packages\n",
"env = Environment('tutorial-env')\n",
"cd = CondaDependencies.create(pip_packages=['azureml-dataset-runtime[pandas,fuse]', 'azureml-defaults'], conda_packages = ['scikit-learn==0.22.1'])\n",
"cd = CondaDependencies.create(pip_packages=['azureml-dataset-runtime[pandas,fuse]', 'azureml-defaults'], conda_packages = ['scikit-learn==0.22.1', 'numpy==1.23'])\n",
"\n",
"env.python.conda_dependencies = cd\n",
"\n",
@@ -4,7 +4,7 @@ dependencies:
- azureml-sdk
- azureml-widgets
- matplotlib
- sklearn
- scikit-learn
- pandas
- azureml-opendatasets
- azureml-widgets
@@ -3,6 +3,6 @@ dependencies:
- pip:
- azureml-sdk
- matplotlib
- sklearn
- scikit-learn
- pandas
- azureml-opendatasets
@@ -82,7 +82,7 @@
"\n",
"# to install required packages\n",
"env = Environment('tutorial-encryption-env')\n",
"cd = CondaDependencies.create(pip_packages=['azureml-dataset-runtime[pandas,fuse]', 'azureml-defaults', 'azure-storage-blob', 'encrypted-inference==0.9'], conda_packages = ['scikit-learn==0.22.1'])\n",
"cd = CondaDependencies.create(pip_packages=['azureml-dataset-runtime[pandas,fuse]', 'azureml-defaults', 'azure-storage-blob', 'encrypted-inference==0.9'], conda_packages = ['scikit-learn==0.22.1', 'numpy==1.23'])\n",
"\n",
"env.python.conda_dependencies = cd\n",
"\n",
@@ -3,7 +3,7 @@ dependencies:
- pip:
- azureml-sdk
- matplotlib
- sklearn
- scikit-learn
- pandas
- azureml-opendatasets
- encrypted-inference==0.9
Some files were not shown because too many files have changed in this diff.