Compare commits

...

22 Commits

Author SHA1 Message Date
amlrelsa-ms
598a293dfa update samples from Release-175 as a part of SDK release 2023-03-28 01:02:26 +00:00
Harneet Virk
40b3068462 Merge pull request #1884 from Azure/release_update_stablev2/Release-166
update samples from Release-166 as a part of 1.49.0 SDK stable release
2023-02-13 21:22:05 -08:00
amlrelsa-ms
0ecbbbce75 update samples from Release-166 as a part of 1.49.0 SDK stable release 2023-02-14 02:46:24 +00:00
Harneet Virk
9b1e130d18 Merge pull request #1867 from Azure/release_update/Release-173
update samples from Release-173 as a part of  SDK release
2022-12-19 19:37:41 -08:00
amlrelsa-ms
0e17b33d2a update samples from Release-173 as a part of SDK release 2022-12-20 03:35:58 +00:00
Harneet Virk
34d80abd26 Merge pull request #1864 from Azure/release_update/Release-172
update samples from Release-172 as a part of  SDK release
2022-12-16 09:28:16 -08:00
amlrelsa-ms
249278ab77 update samples from Release-172 as a part of SDK release 2022-12-15 17:32:05 +00:00
Harneet Virk
25fdb17f80 Merge pull request #1862 from Azure/release_update/Release-170
update samples from Release-170 as a part of  SDK release
2022-12-06 10:06:06 -08:00
amlrelsa-ms
3a02a27f1e update samples from Release-170 as a part of SDK release 2022-12-06 03:22:18 +00:00
Harneet Virk
4eed9d529f Merge pull request #1861 from Azure/release_update/Release-169
update samples from Release-169 as a part of  SDK release
2022-12-05 12:33:52 -08:00
amlrelsa-ms
f344d410a2 update samples from Release-169 as a part of SDK release 2022-12-05 20:12:47 +00:00
Harneet Virk
9dc1228063 Merge pull request #1860 from Azure/release_update/Release-168
update samples from Release-168 as a part of  SDK release
2022-12-05 09:54:01 -08:00
amlrelsa-ms
4404e62f58 update samples from Release-168 as a part of SDK release 2022-12-05 17:52:07 +00:00
Harneet Virk
38d5743bbb Merge pull request #1852 from Azure/release_update/Release-167
update samples from Release-167 as a part of  SDK release
2022-11-08 11:01:10 -08:00
amlrelsa-ms
0814eee151 update samples from Release-167 as a part of SDK release 2022-11-08 01:17:48 +00:00
Harneet Virk
f45b815221 Merge pull request #1848 from Azure/release_update/Release-166
update samples from Release-166 as a part of  SDK release
2022-10-26 12:04:10 -07:00
amlrelsa-ms
bd629ae454 update samples from Release-166 as a part of SDK release 2022-10-26 18:46:34 +00:00
Harneet Virk
41de75a584 Merge pull request #1846 from Azure/release_update_stablev2/Release-156
update samples from Release-156 as a part of 1.47.0 SDK stable release
2022-10-25 21:01:03 -07:00
amlrelsa-ms
96a426dc36 update samples from Release-156 as a part of 1.47.0 SDK stable release 2022-10-25 21:28:24 +00:00
Harneet Virk
824dd40f7e Merge pull request #1836 from Azure/release_update/Release-165
update samples from Release-165 as a part of  SDK release
2022-10-11 13:07:26 -07:00
amlrelsa-ms
fa2e649fe8 update samples from Release-165 as a part of SDK release 2022-10-11 19:33:50 +00:00
Harneet Virk
e25e8e3a41 Merge pull request #1832 from Azure/release_update/Release-164
update samples from Release-164 as a part of  SDK release
2022-10-05 11:29:47 -07:00
102 changed files with 718 additions and 4953 deletions

View File

@@ -103,7 +103,7 @@
"source": [
"import azureml.core\n",
"\n",
"print(\"This notebook was created using version 1.46.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.49.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},

View File

@@ -398,7 +398,7 @@
"# run_config.target = gpu_cluster_name\n",
"# run_config.environment.docker.enabled = True\n",
"# run_config.environment.docker.gpu_support = True\n",
"# run_config.environment.docker.base_image = \"rapidsai/rapidsai:cuda9.2-runtime-ubuntu18.04\"\n",
"# run_config.environment.docker.base_image = \"rapidsai/rapidsai:cuda9.2-runtime-ubuntu20.04\"\n",
"# # run_config.environment.docker.base_image_registry.address = '<registry_url>' # not required if the base_image is in Docker hub\n",
"# # run_config.environment.docker.base_image_registry.username = '<user_name>' # needed only for private images\n",
"# # run_config.environment.docker.base_image_registry.password = '<password>' # needed only for private images\n",

View File

@@ -6,7 +6,8 @@ dependencies:
- fairlearn>=0.6.2
- joblib
- liac-arff
- raiwidgets~=0.22.0
- raiwidgets~=0.24.0
- itsdangerous==2.0.1
- markupsafe<2.1.0
- protobuf==3.20.0
- numpy<1.24.0

View File

@@ -6,7 +6,8 @@ dependencies:
- fairlearn>=0.6.2
- joblib
- liac-arff
- raiwidgets~=0.22.0
- raiwidgets~=0.24.0
- itsdangerous==2.0.1
- markupsafe<2.1.0
- protobuf==3.20.0
- numpy<1.24.0

View File

@@ -5,32 +5,21 @@ channels:
- main
dependencies:
# The python interpreter version.
# Currently Azure ML only supports 3.6.0 and later.
- pip==20.2.4
- python>=3.6,<3.9
- matplotlib==3.2.1
- py-xgboost==1.3.3
- pytorch::pytorch=1.4.0
# Azure ML only supports 3.7.0 and later.
- pip==22.3.1
- python>=3.7,<3.9
- conda-forge::fbprophet==0.7.1
- cudatoolkit=10.1.243
- pandas==1.1.5
- scipy==1.5.3
- notebook
- pywin32==227
- PySocks==1.7.1
- conda-forge::pyqt==5.12.3
- jinja2<=2.11.2
- markupsafe<2.1.0
- Cython==0.29.14
- tqdm==4.64.1
- jsonschema==4.16.0
- pip:
# Required packages for AzureML execution, history, and data preparation.
- azureml-widgets~=1.46.0
- azureml-defaults~=1.46.0
- pytorch-transformers==1.0.0
- spacy==2.2.4
- pystan==2.19.1.1
- https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.46.0/validated_win32_requirements.txt [--no-deps]
- arch==4.14
- wasabi==0.9.1
- azureml-widgets~=1.49.0
- azureml-defaults~=1.49.0
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.49.0/validated_win32_requirements.txt [--no-deps]
- matplotlib==3.6.2
- xgboost==1.3.3
- cmdstanpy==0.9.5
- setuptools-git==1.2

View File

@@ -5,11 +5,9 @@ channels:
- main
dependencies:
# The python interpreter version.
# Currently Azure ML only supports 3.6.0 and later.
- pip==20.2.4
- python>=3.6,<3.9
- boto3==1.20.19
- botocore<=1.23.19
# Azure ML only supports 3.7 and later.
- pip==22.3.1
- python>=3.7,<3.9
- matplotlib==3.2.1
- numpy>=1.21.6,<=1.22.3
- cython==0.29.14
@@ -19,19 +17,16 @@ dependencies:
- py-xgboost<=1.3.3
- holidays==0.10.3
- conda-forge::fbprophet==0.7.1
- pytorch::pytorch=1.4.0
- pytorch::pytorch=1.11.0
- cudatoolkit=10.1.243
- jinja2<=2.11.2
- markupsafe<2.1.0
- jsonschema==4.15.0
- notebook
- pip:
# Required packages for AzureML execution, history, and data preparation.
- azureml-widgets~=1.46.0
- azureml-defaults~=1.46.0
- azureml-widgets~=1.49.0
- azureml-defaults~=1.49.0
- pytorch-transformers==1.0.0
- spacy==2.2.4
- pystan==2.19.1.1
- https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.46.0/validated_linux_requirements.txt [--no-deps]
- arch==4.14
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.49.0/validated_linux_requirements.txt [--no-deps]

View File

@@ -5,12 +5,9 @@ channels:
- main
dependencies:
# The python interpreter version.
# Currently Azure ML only supports 3.6.0 and later.
- pip==20.2.4
- nomkl
- python>=3.6,<3.9
- boto3==1.20.19
- botocore<=1.23.19
# Currently Azure ML only supports 3.7 and later.
- pip==22.3.1
- python>=3.7,<3.9
- matplotlib==3.2.1
- numpy>=1.21.6,<=1.22.3
- cython==0.29.14
@@ -20,19 +17,16 @@ dependencies:
- py-xgboost<=1.3.3
- holidays==0.10.3
- conda-forge::fbprophet==0.7.1
- pytorch::pytorch=1.4.0
- pytorch::pytorch=1.11.0
- cudatoolkit=9.0
- jinja2<=2.11.2
- markupsafe<2.1.0
- jsonschema==4.15.0
- notebook
- pip:
# Required packages for AzureML execution, history, and data preparation.
- azureml-widgets~=1.46.0
- azureml-defaults~=1.46.0
- azureml-widgets~=1.49.0
- azureml-defaults~=1.49.0
- pytorch-transformers==1.0.0
- spacy==2.2.4
- pystan==2.19.1.1
- https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.46.0/validated_darwin_requirements.txt [--no-deps]
- arch==4.14
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.49.0/validated_darwin_requirements.txt [--no-deps]

View File

@@ -33,6 +33,8 @@ if not errorlevel 1 (
call conda env create -f %automl_env_file% -n %conda_env_name%
)
python "%conda_prefix%\scripts\pywin32_postinstall.py" -install
call conda activate %conda_env_name% 2>nul:
if errorlevel 1 goto ErrorExit

View File

@@ -1,4 +1,4 @@
from distutils.version import LooseVersion
from setuptools._vendor.packaging import version
import platform
try:
@@ -17,7 +17,7 @@ if architecture != "64bit":
minimumVersion = "4.7.8"
versionInvalid = (LooseVersion(conda.__version__) < LooseVersion(minimumVersion))
versionInvalid = (version.parse(conda.__version__) < version.parse(minimumVersion))
if versionInvalid:
print('Setup requires conda version ' + minimumVersion + ' or higher.')

View File

@@ -97,7 +97,7 @@
"metadata": {},
"outputs": [],
"source": [
"print(\"This notebook was created using version 1.46.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.49.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},

View File

@@ -97,7 +97,7 @@
"metadata": {},
"outputs": [],
"source": [
"print(\"This notebook was created using version 1.46.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.49.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},

View File

@@ -1,24 +1,15 @@
name: azure_automl_experimental
dependencies:
# The python interpreter version.
# Currently Azure ML only supports 3.6.0 and later.
- pip<=20.2.4
- python>=3.6.0,<3.10
- cython==0.29.14
- urllib3==1.26.7
- PyJWT < 2.0.0
- numpy==1.22.3
- pywin32==227
- cryptography<37.0.0
# Currently Azure ML only supports 3.7.0 and later.
- pip<=22.3.1
- python>=3.7.0,<3.11
- pip:
# Required packages for AzureML execution, history, and data preparation.
- azure-core==1.24.1
- azure-identity==1.7.0
- azureml-defaults
- azureml-sdk
- azureml-widgets
- azureml-mlflow
- pandas
- mlflow
- docker<6.0.0

View File

@@ -4,14 +4,13 @@ channels:
- main
dependencies:
# The python interpreter version.
# Currently Azure ML only supports 3.6.0 and later.
# Currently Azure ML only supports 3.7.0 and later.
- pip<=20.2.4
- nomkl
- python>=3.6.0,<3.10
- python>=3.7.0,<3.11
- urllib3==1.26.7
- PyJWT < 2.0.0
- numpy>=1.21.6,<=1.22.3
- cryptography<37.0.0
- pip:
# Required packages for AzureML execution, history, and data preparation.

View File

@@ -92,7 +92,7 @@
"metadata": {},
"outputs": [],
"source": [
"print(\"This notebook was created using version 1.46.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.49.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},

View File

@@ -91,7 +91,7 @@
"metadata": {},
"outputs": [],
"source": [
"print(\"This notebook was created using version 1.46.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.49.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},

View File

@@ -122,7 +122,10 @@ def calculate_scores_and_build_plots(
input_dir: str, output_dir: str, automl_settings: Dict[str, Any]
):
os.makedirs(output_dir, exist_ok=True)
grains = automl_settings.get(constants.TimeSeries.TIME_SERIES_ID_COLUMN_NAMES)
grains = automl_settings.get(
constants.TimeSeries.TIME_SERIES_ID_COLUMN_NAMES,
automl_settings.get(constants.TimeSeries.GRAIN_COLUMN_NAMES, None),
)
time_column_name = automl_settings.get(constants.TimeSeries.TIME_COLUMN_NAME)
if grains is None:
grains = []

View File

@@ -33,6 +33,7 @@
"For this notebook we are using a synthetic dataset to demonstrate the back testing in many model scenario. This allows us to check historical performance of AutoML on a historical data. To do that we step back on the backtesting period by the data set several times and split the data to train and test sets. Then these data sets are used for training and evaluation of model.<br>\n",
"\n",
"Thus, it is a quick way of evaluating AutoML as if it was in production. Here, we do not test historical performance of a particular model, for this see the [notebook](../forecasting-backtest-single-model/auto-ml-forecasting-backtest-single-model.ipynb). Instead, the best model for every backtest iteration can be different since AutoML chooses the best model for a given training set.\n",
"\n",
"![Backtesting](Backtesting.png)\n",
"\n",
"**NOTE: There are limits on how many runs we can do in parallel per workspace, and we currently recommend to set the parallelism to maximum of 320 runs per experiment per workspace. If users want to have more parallelism and increase this limit they might encounter Too Many Requests errors (HTTP 429).**"
@@ -43,7 +44,7 @@
"metadata": {},
"source": [
"### Prerequisites\n",
"You'll need to create a compute Instance by following the instructions in the [EnvironmentSetup.md](../Setup_Resources/EnvironmentSetup.md)."
"You'll need to create a compute Instance by following [these](https://learn.microsoft.com/en-us/azure/machine-learning/v1/how-to-create-manage-compute-instance?tabs=python) instructions."
]
},
{
@@ -313,22 +314,37 @@
"source": [
"### Set up training parameters\n",
"\n",
"This dictionary defines the AutoML and many models settings. For this forecasting task we need to define several settings including the name of the time column, the maximum forecast horizon, and the partition column name definition. Please note, that in this case we are setting grain_column_names to be the time series ID column plus iteration, because we want to train a separate model for each time series and iteration.\n",
"We need to provide ``ForecastingParameters``, ``AutoMLConfig`` and ``ManyModelsTrainParameters`` objects. For the forecasting task we also need to define several settings including the name of the time column, the maximum forecast horizon, and the partition column name(s) definition.\n",
"\n",
"#### ``ForecastingParameters`` arguments\n",
"| Property | Description|\n",
"| :--------------- | :------------------- |\n",
"| **forecast_horizon** | The forecast horizon is how many periods forward you would like to forecast. This integer horizon is in units of the timeseries frequency (e.g. daily, weekly). Periods are inferred from your data. |\n",
"| **time_column_name** | The name of your time column. |\n",
"| **time_series_id_column_names** | The column names used to uniquely identify timeseries in data that has multiple rows with the same timestamp. |\n",
"| **cv_step_size** | Number of periods between two consecutive cross-validation folds. The default value is \\\"auto\\\", in which case AutoMl determines the cross-validation step size automatically, if a validation set is not provided. Or users could specify an integer value. |\n",
"\n",
"#### ``AutoMLConfig`` arguments\n",
"| Property | Description|\n",
"| :--------------- | :------------------- |\n",
"| **task** | forecasting |\n",
"| **primary_metric** | This is the metric that you want to optimize.<br> Forecasting supports the following primary metrics <br><i>normalized_root_mean_squared_error</i><br><i>normalized_mean_absolute_error</i> |\n",
"| **primary_metric** | This is the metric that you want to optimize.<br> Forecasting supports the following primary metrics <br><i>spearman_correlation</i><br><i>normalized_root_mean_squared_error</i><br><i>r2_score</i><br><i>normalized_mean_absolute_error</i> |\n",
"| **blocked_models** | Blocked models won't be used by AutoML. |\n",
"| **iteration_timeout_minutes** | Maximum amount of time in minutes that the model can train. This is optional but provides customers with greater control on exit criteria. |\n",
"| **iterations** | Number of models to train. This is optional but provides customers with greater control on exit criteria. |\n",
"| **experiment_timeout_hours** | Maximum amount of time in hours that the experiment can take before it terminates. This is optional but provides customers with greater control on exit criteria. |\n",
"| **experiment_timeout_hours** | Maximum amount of time in hours that each experiment can take before it terminates. This is optional but provides customers with greater control on exit criteria. **It does not control the overall timeout for the pipeline run, instead controls the timeout for each training run per partitioned time series.** |\n",
"| **label_column_name** | The name of the label column. |\n",
"| **forecast_horizon** | The forecast horizon is how many periods forward you would like to forecast. This integer horizon is in units of the timeseries frequency (e.g. daily, weekly). Periods are inferred from your data. |\n",
"| **n_cross_validations** | Number of cross validation splits. The default value is \"auto\", in which case AutoMl determines the number of cross-validations automatically, if a validation set is not provided. Or users could specify an integer value. Rolling Origin Validation is used to split time-series in a temporally consistent way. |\n",
"|**cv_step_size**|Number of periods between two consecutive cross-validation folds. The default value is \"auto\", in which case AutoMl determines the cross-validation step size automatically, if a validation set is not provided. Or users could specify an integer value.\n",
"| **time_column_name** | The name of your time column. |\n",
"| **time_series_id_column_names** | The column names used to uniquely identify timeseries in data that has multiple rows with the same timestamp. |\n",
"| **n_cross_validations** | Number of cross validation splits. The default value is \\\"auto\\\", in which case AutoMl determines the number of cross-validations automatically, if a validation set is not provided. Or users could specify an integer value. Rolling Origin Validation is used to split time-series in a temporally consistent way. |\n",
"| **enable_early_stopping** | Flag to enable early termination if the primary metric is no longer improving. |\n",
"| **enable_engineered_explanations** | Engineered feature explanations will be downloaded if enable_engineered_explanations flag is set to True. By default it is set to False to save storage space. |\n",
"| **track_child_runs** | Flag to disable tracking of child runs. Only best run is tracked if the flag is set to False (this includes the model and metrics of the run). |\n",
"| **pipeline_fetch_max_batch_size** | Determines how many pipelines (training algorithms) to fetch at a time for training, this helps reduce throttling when training at large scale. |\n",
"\n",
"\n",
"#### ``ManyModelsTrainParameters`` arguments\n",
"| Property | Description|\n",
"| :--------------- | :------------------- |\n",
"| **automl_settings** | The ``AutoMLConfig`` object defined above. |\n",
"| **partition_column_names** | The names of columns used to group your models. For timeseries, the groups must not split up individual time-series. That is, each group must contain one or more whole time-series. |"
]
},
@@ -345,22 +361,30 @@
"from azureml.train.automl.runtime._many_models.many_models_parameters import (\n",
" ManyModelsTrainParameters,\n",
")\n",
"from azureml.automl.core.forecasting_parameters import ForecastingParameters\n",
"from azureml.train.automl.automlconfig import AutoMLConfig\n",
"\n",
"partition_column_names = [TIME_SERIES_ID_COLNAME, \"backtest_iteration\"]\n",
"automl_settings = {\n",
" \"task\": \"forecasting\",\n",
" \"primary_metric\": \"normalized_root_mean_squared_error\",\n",
" \"iteration_timeout_minutes\": 10, # This needs to be changed based on the dataset. We ask customer to explore how long training is taking before settings this value\n",
" \"iterations\": 15,\n",
" \"experiment_timeout_hours\": 0.25, # This also needs to be changed based on the dataset. For larger data set this number needs to be bigger.\n",
" \"label_column_name\": TARGET_COLNAME,\n",
" \"n_cross_validations\": \"auto\", # Feel free to set to a small integer (>=2) if runtime is an issue.\n",
" \"cv_step_size\": \"auto\",\n",
" \"time_column_name\": TIME_COLNAME,\n",
" \"forecast_horizon\": 6,\n",
" \"time_series_id_column_names\": partition_column_names,\n",
" \"track_child_runs\": False,\n",
"}\n",
"\n",
"forecasting_parameters = ForecastingParameters(\n",
" time_column_name=TIME_COLNAME,\n",
" forecast_horizon=6,\n",
" time_series_id_column_names=partition_column_names,\n",
" cv_step_size=\"auto\",\n",
")\n",
"\n",
"automl_settings = AutoMLConfig(\n",
" task=\"forecasting\",\n",
" primary_metric=\"normalized_root_mean_squared_error\",\n",
" iteration_timeout_minutes=10,\n",
" iterations=15,\n",
" experiment_timeout_hours=0.25,\n",
" label_column_name=TARGET_COLNAME,\n",
" n_cross_validations=\"auto\", # Feel free to set to a small integer (>=2) if runtime is an issue.\n",
" track_child_runs=False,\n",
" forecasting_parameters=forecasting_parameters,\n",
")\n",
"\n",
"\n",
"mm_paramters = ManyModelsTrainParameters(\n",
" automl_settings=automl_settings, partition_column_names=partition_column_names\n",
@@ -387,8 +411,16 @@
"| **node_count** | The number of compute nodes to be used for running the user script. We recommend to start with 3 and increase the node_count if the training time is taking too long. |\n",
"| **process_count_per_node** | Process count per node, we recommend 2:1 ratio for number of cores: number of processes per node. eg. If node has 16 cores then configure 8 or less process count per node or optimal performance. |\n",
"| **train_pipeline_parameters** | The set of configuration parameters defined in the previous section. |\n",
"| **run_invocation_timeout** | Maximum amount of time in seconds that the ``ParallelRunStep`` class is allowed. This is optional but provides customers with greater control on exit criteria. This must be greater than ``experiment_timeout_hours`` by at least 300 seconds. |\n",
"\n",
"Calling this method will create a new aggregated dataset which is generated dynamically on pipeline execution."
"Calling this method will create a new aggregated dataset which is generated dynamically on pipeline execution.\n",
"\n",
"**Note**: Total time taken for the **training step** in the pipeline to complete = $ \\frac{t}{ p \\times n } \\times ts $\n",
"where,\n",
"- $ t $ is time taken for training one partition (can be viewed in the training logs)\n",
"- $ p $ is ``process_count_per_node``\n",
"- $ n $ is ``node_count``\n",
"- $ ts $ is total number of partitions in time series based on ``partition_column_names``"
]
},
{
@@ -406,7 +438,7 @@
" compute_target=compute_target,\n",
" node_count=2,\n",
" process_count_per_node=2,\n",
" run_invocation_timeout=920,\n",
" run_invocation_timeout=1200,\n",
" train_pipeline_parameters=mm_paramters,\n",
")"
]
@@ -491,25 +523,31 @@
"source": [
"For many models we need to provide the ManyModelsInferenceParameters object.\n",
"\n",
"#### ManyModelsInferenceParameters arguments\n",
"#### ``ManyModelsInferenceParameters`` arguments\n",
"| Property | Description|\n",
"| :--------------- | :------------------- |\n",
"| **partition_column_names** | List of column names that identifies groups. |\n",
"| **target_column_name** | \\[Optional\\] Column name only if the inference dataset has the target. |\n",
"| **time_column_name** | Column name only if it is timeseries. |\n",
"| **many_models_run_id** | \\[Optional\\] Many models pipeline run id where models were trained. |\n",
"| **partition_column_names** | List of column names that identifies groups. |\n",
"| **target_column_name** | \\[Optional] Column name only if the inference dataset has the target. |\n",
"| **time_column_name** | \\[Optional] Time column name only if it is timeseries. |\n",
"| **inference_type** | \\[Optional] Which inference method to use on the model. Possible values are 'forecast', 'predict_proba', and 'predict'. |\n",
"| **forecast_mode** | \\[Optional] The type of forecast to be used, either 'rolling' or 'recursive'; defaults to 'recursive'. |\n",
"| **step** | \\[Optional] Number of periods to advance the forecasting window in each iteration **(for rolling forecast only)**; defaults to 1. |\n",
"\n",
"#### get_many_models_batch_inference_steps arguments\n",
"#### ``get_many_models_batch_inference_steps`` arguments\n",
"| Property | Description|\n",
"| :--------------- | :------------------- |\n",
"| **experiment** | The experiment used for inference run. |\n",
"| **inference_data** | The data to use for inferencing. It should be the same schema as used for training.\n",
"| **compute_target** | The compute target that runs the inference pipeline.|\n",
"| **compute_target** | The compute target that runs the inference pipeline. |\n",
"| **node_count** | The number of compute nodes to be used for running the user script. We recommend to start with the number of cores per node (varies by compute sku). |\n",
"| **process_count_per_node** | The number of processes per node.\n",
"| **train_run_id** | \\[Optional\\] The run id of the hierarchy training, by default it is the latest successful training many model run in the experiment. |\n",
"| **train_experiment_name** | \\[Optional\\] The train experiment that contains the train pipeline. This one is only needed when the train pipeline is not in the same experiement as the inference pipeline. |\n",
"| **process_count_per_node** | \\[Optional\\] The number of processes per node, by default it's 4. |"
"| **process_count_per_node** | \\[Optional] The number of processes per node. By default it's 2 (should be at most half of the number of cores in a single node of the compute cluster that will be used for the experiment).\n",
"| **inference_pipeline_parameters** | \\[Optional] The ``ManyModelsInferenceParameters`` object defined above. |\n",
"| **append_row_file_name** | \\[Optional] The name of the output file (optional, default value is 'parallel_run_step.txt'). Supports 'txt' and 'csv' file extension. A 'txt' file extension generates the output in 'txt' format with space as separator without column names. A 'csv' file extension generates the output in 'csv' format with comma as separator and with column names. |\n",
"| **train_run_id** | \\[Optional] The run id of the **training pipeline**. By default it is the latest successful training pipeline run in the experiment. |\n",
"| **train_experiment_name** | \\[Optional] The train experiment that contains the train pipeline. This one is only needed when the train pipeline is not in the same experiement as the inference pipeline. |\n",
"| **run_invocation_timeout** | \\[Optional] Maximum amount of time in seconds that the ``ParallelRunStep`` class is allowed. This is optional but provides customers with greater control on exit criteria. |\n",
"| **output_datastore** | \\[Optional] The ``Datastore`` or ``OutputDatasetConfig`` to be used for output. If specified any pipeline output will be written to that location. If unspecified the default datastore will be used. |\n",
"| **arguments** | \\[Optional] Arguments to be passed to inference script. Possible argument is '--forecast_quantiles' followed by quantile values. |"
]
},
{
@@ -529,6 +567,8 @@
" target_column_name=TARGET_COLNAME,\n",
")\n",
"\n",
"output_file_name = \"parallel_run_step.csv\"\n",
"\n",
"inference_steps = AutoMLPipelineBuilder.get_many_models_batch_inference_steps(\n",
" experiment=experiment,\n",
" inference_data=test_data,\n",
@@ -540,6 +580,7 @@
" train_run_id=training_run.id,\n",
" train_experiment_name=training_run.experiment.name,\n",
" inference_pipeline_parameters=mm_parameters,\n",
" append_row_file_name=output_file_name,\n",
")"
]
},
@@ -587,18 +628,21 @@
"source": [
"from azureml.contrib.automl.pipeline.steps.utilities import get_output_from_mm_pipeline\n",
"\n",
"PREDICTION_COLNAME = \"Predictions\"\n",
"forecasting_results_name = \"forecasting_results\"\n",
"forecasting_output_name = \"many_models_inference_output\"\n",
"forecast_file = get_output_from_mm_pipeline(\n",
" inference_run, forecasting_results_name, forecasting_output_name\n",
" inference_run, forecasting_results_name, forecasting_output_name, output_file_name\n",
")\n",
"df = pd.read_csv(forecast_file, delimiter=\" \", header=None, parse_dates=[0])\n",
"df.columns = list(X_train.columns) + [\"predicted_level\"]\n",
"df = pd.read_csv(forecast_file, parse_dates=[0])\n",
"print(\n",
" \"Prediction has \", df.shape[0], \" rows. Here the first 10 rows are being displayed.\"\n",
")\n",
"# Save the scv file with header to read it in the next step.\n",
"df.rename(columns={TARGET_COLNAME: \"actual_level\"}, inplace=True)\n",
"# Save the csv file to read it in the next step.\n",
"df.rename(\n",
" columns={TARGET_COLNAME: \"actual_level\", PREDICTION_COLNAME: \"predicted_level\"},\n",
" inplace=True,\n",
")\n",
"df.to_csv(os.path.join(forecasting_results_name, \"forecast.csv\"), index=False)\n",
"df.head(10)"
]
@@ -622,7 +666,9 @@
"backtesting_results = \"backtesting_mm_results\"\n",
"os.makedirs(backtesting_results, exist_ok=True)\n",
"calculate_scores_and_build_plots(\n",
" forecasting_results_name, backtesting_results, automl_settings\n",
" forecasting_results_name,\n",
" backtesting_results,\n",
" automl_settings.as_serializable_dict(),\n",
")\n",
"pd.DataFrame({\"File\": os.listdir(backtesting_results)})"
]

View File

@@ -43,11 +43,20 @@ def init():
global output_dir
global automl_settings
global model_uid
global forecast_quantiles
logger.info("Initialization of the run.")
parser = argparse.ArgumentParser("Parsing input arguments.")
parser.add_argument("--output-dir", dest="out", required=True)
parser.add_argument("--model-name", dest="model", default=None)
parser.add_argument("--model-uid", dest="model_uid", default=None)
parser.add_argument(
"--forecast_quantiles",
nargs="*",
type=float,
help="forecast quantiles list",
default=None,
)
parsed_args, _ = parser.parse_known_args()
model_name = parsed_args.model
@@ -55,6 +64,7 @@ def init():
target_column_name = automl_settings.get("label_column_name")
output_dir = parsed_args.out
model_uid = parsed_args.model_uid
forecast_quantiles = parsed_args.forecast_quantiles
os.makedirs(output_dir, exist_ok=True)
os.environ["AUTOML_IGNORE_PACKAGE_VERSION_INCOMPATIBILITIES".lower()] = "True"
@@ -126,23 +136,18 @@ def run_backtest(data_input_name: str, file_name: str, experiment: Experiment):
)
print(f"The model {best_run.properties['model_name']} was registered.")
_, x_pred = fitted_model.forecast(X_test)
x_pred.reset_index(inplace=True, drop=False)
columns = [automl_settings[constants.TimeSeries.TIME_COLUMN_NAME]]
if automl_settings.get(constants.TimeSeries.GRAIN_COLUMN_NAMES):
# We know that fitted_model.grain_column_names is a list.
columns.extend(fitted_model.grain_column_names)
columns.append(constants.TimeSeriesInternal.DUMMY_TARGET_COLUMN)
# Remove featurized columns.
x_pred = x_pred[columns]
x_pred.rename(
{constants.TimeSeriesInternal.DUMMY_TARGET_COLUMN: "predicted_level"},
axis=1,
inplace=True,
)
# By default we will have forecast quantiles of 0.5, which is our target
if forecast_quantiles:
if 0.5 not in forecast_quantiles:
forecast_quantiles.append(0.5)
fitted_model.quantiles = forecast_quantiles
x_pred = fitted_model.forecast_quantiles(X_test)
x_pred["actual_level"] = y_test
x_pred["backtest_iteration"] = f"iteration_{last_training_date}"
x_pred.rename({0.5: "predicted_level"}, axis=1, inplace=True)
date_safe = RE_INVALID_SYMBOLS.sub("_", last_training_date)
x_pred.to_csv(os.path.join(output_dir, f"iteration_{date_safe}.csv"), index=False)
return x_pred

View File

@@ -365,6 +365,7 @@
" step_size=BACKTESTING_PERIOD,\n",
" step_number=NUMBER_OF_BACKTESTS,\n",
" model_uid=model_uid,\n",
" forecast_quantiles=[0.025, 0.975], # Optional\n",
")"
]
},
@@ -590,6 +591,7 @@
" step_size=BACKTESTING_PERIOD,\n",
" step_number=NUMBER_OF_BACKTESTS,\n",
" model_name=model_name,\n",
" forecast_quantiles=[0.025, 0.975],\n",
")"
]
},

View File

@@ -31,6 +31,7 @@ def get_backtest_pipeline(
step_number: int,
model_name: Optional[str] = None,
model_uid: Optional[str] = None,
forecast_quantiles: Optional[list] = None,
) -> Pipeline:
"""
:param experiment: The experiment used to run the pipeline.
@@ -44,6 +45,7 @@ def get_backtest_pipeline(
:param step_size: The number of periods to step back in backtesting.
:param step_number: The number of backtesting iterations.
:param model_uid: The uid to mark models from this run of the experiment.
:param forecast_quantiles: The forecast quantiles that are required in the inference.
:return: The pipeline to be used for model retraining.
**Note:** The output will be uploaded in the pipeline output
called 'score'.
@@ -135,6 +137,9 @@ def get_backtest_pipeline(
if model_uid is not None:
prs_args.append("--model-uid")
prs_args.append(model_uid)
if forecast_quantiles:
prs_args.append("--forecast_quantiles")
prs_args.extend(forecast_quantiles)
backtest_prs = ParallelRunStep(
name=parallel_step_name,
parallel_run_config=back_test_config,

View File

@@ -42,7 +42,7 @@
"\n",
"AutoML highlights here include built-in holiday featurization, accessing engineered feature names, and working with the `forecast` function. Please also look at the additional forecasting notebooks, which document lagging, rolling windows, forecast quantiles, other ways to use the forecast function, and forecaster deployment.\n",
"\n",
"Make sure you have executed the [configuration notebook](../../../configuration.ipynb) before running this notebook.\n",
"Make sure you have executed the [configuration notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) before running this notebook.\n",
"\n",
"Notebook synopsis:\n",
"1. Creating an Experiment in an existing Workspace\n",
@@ -575,7 +575,32 @@
"outputs": [],
"source": [
"remote_run.download_file(\"outputs/predictions.csv\", \"predictions.csv\")\n",
"df_all = pd.read_csv(\"predictions.csv\")"
"fcst_df = pd.read_csv(\"predictions.csv\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that the rolling forecast can contain multiple predictions for each date, each from a different forecast origin. For example, consider 2012-09-05:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fcst_df[fcst_df.date == \"2012-09-05\"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here, the forecast origin refers to the latest date of actuals available for a given forecast. The earliest origin in the rolling forecast, 2012-08-31, is the last day in the training data. For origin date 2012-09-01, the forecasts use actual recorded counts from the training data *and* the actual count recorded on 2012-09-01. Note that the model is not retrained for origin dates later than 2012-08-31, but the values for model features, such as lagged values of daily count, are updated.\n",
"\n",
"Let's calculate the metrics over all rolling forecasts:"
]
},
{
@@ -587,29 +612,17 @@
"from azureml.automl.core.shared import constants\n",
"from azureml.automl.runtime.shared.score import scoring\n",
"from sklearn.metrics import mean_absolute_error, mean_squared_error\n",
"from matplotlib import pyplot as plt\n",
"\n",
"# use automl metrics module\n",
"scores = scoring.score_regression(\n",
" y_test=df_all[target_column_name],\n",
" y_pred=df_all[\"predicted\"],\n",
" y_test=fcst_df[target_column_name],\n",
" y_pred=fcst_df[\"predicted\"],\n",
" metrics=list(constants.Metric.SCALAR_REGRESSION_SET),\n",
")\n",
"\n",
"print(\"[Test data scores]\\n\")\n",
"for key, value in scores.items():\n",
" print(\"{}: {:.3f}\".format(key, value))\n",
"\n",
"# Plot outputs\n",
"%matplotlib inline\n",
"test_pred = plt.scatter(df_all[target_column_name], df_all[\"predicted\"], color=\"b\")\n",
"test_test = plt.scatter(\n",
" df_all[target_column_name], df_all[target_column_name], color=\"g\"\n",
")\n",
"plt.legend(\n",
" (test_pred, test_test), (\"prediction\", \"truth\"), loc=\"upper left\", fontsize=8\n",
")\n",
"plt.show()"
" print(\"{}: {:.3f}\".format(key, value))"
]
},
{
@@ -618,36 +631,15 @@
"source": [
"For more details on what metrics are included and how they are calculated, please refer to [supported metrics](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-understand-automated-ml#regressionforecasting-metrics). You could also calculate residuals, like described [here](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-understand-automated-ml#residuals).\n",
"\n",
"\n",
"Since we did a rolling evaluation on the test set, we can analyze the predictions by their forecast horizon relative to the rolling origin. The model was initially trained at a forecast horizon of 14, so each prediction from the model is associated with a horizon value from 1 to 14. The horizon values are in a column named, \"horizon_origin,\" in the prediction set. For example, we can calculate some of the error metrics grouped by the horizon:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from metrics_helper import MAPE, APE\n",
"\n",
"df_all.groupby(\"horizon_origin\").apply(\n",
" lambda df: pd.Series(\n",
" {\n",
" \"MAPE\": MAPE(df[target_column_name], df[\"predicted\"]),\n",
" \"RMSE\": np.sqrt(\n",
" mean_squared_error(df[target_column_name], df[\"predicted\"])\n",
" ),\n",
" \"MAE\": mean_absolute_error(df[target_column_name], df[\"predicted\"]),\n",
" }\n",
" )\n",
")"
"The rolling forecast metric values are very high in comparison to the validation metrics reported by the AutoML job. What's going on here? We will investigate in the following cells!"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To drill down more, we can look at the distributions of APE (absolute percentage error) by horizon. From the chart, it is clear that the overall MAPE is being skewed by one particular point where the actual value is of small absolute value."
"### Forecast versus actuals plot\n",
"We will plot predictions and actuals on a time series plot. Since there are many forecasts for each date, we select the 14-day-ahead forecast from each forecast origin for our comparison."
]
},
{
@@ -656,21 +648,55 @@
"metadata": {},
"outputs": [],
"source": [
"df_all_APE = df_all.assign(APE=APE(df_all[target_column_name], df_all[\"predicted\"]))\n",
"APEs = [\n",
" df_all_APE[df_all[\"horizon_origin\"] == h].APE.values\n",
" for h in range(1, forecast_horizon + 1)\n",
"]\n",
"from matplotlib import pyplot as plt\n",
"\n",
"%matplotlib inline\n",
"plt.boxplot(APEs)\n",
"plt.yscale(\"log\")\n",
"plt.xlabel(\"horizon\")\n",
"plt.ylabel(\"APE (%)\")\n",
"plt.title(\"Absolute Percentage Errors by Forecast Horizon\")\n",
"\n",
"fcst_df_h14 = (\n",
" fcst_df.groupby(\"forecast_origin\", as_index=False)\n",
" .last()\n",
" .drop(columns=[\"forecast_origin\"])\n",
")\n",
"fcst_df_h14.set_index(time_column_name, inplace=True)\n",
"plt.plot(fcst_df_h14[[target_column_name, \"predicted\"]])\n",
"plt.xticks(rotation=45)\n",
"plt.title(f\"Predicted vs. Actuals\")\n",
"plt.legend([\"actual\", \"14-day-ahead forecast\"])\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Looking at the plot, there are two clear issues:\n",
"1. An anomalously low count value on October 29th, 2012.\n",
"2. End-of-year holidays (Thanksgiving and Christmas) in late November and late December.\n",
"\n",
"What happened on Oct. 29th, 2012? That day, Hurricane Sandy brought severe storm surge flooding to the east coast of the United States, particularly around New York City. This is certainly an anomalous event that the model did not account for!\n",
"\n",
"As for the late year holidays, the model apparently did not learn to account for the full reduction of bike share rentals on these major holidays. The training data covers 2011 and early 2012, so the model fit only had access to a single occurrence of these holidays. This makes it challenging to resolve holiday effects; however, a larger AutoML model search may result in a better model that is more holiday-aware.\n",
"\n",
"If we filter the predictions prior to the Thanksgiving holiday and remove the anomalous day of 2012-10-29, the metrics are closer to validation levels:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"date_filter = (fcst_df.date != \"2012-10-29\") & (fcst_df.date < \"2012-11-22\")\n",
"scores = scoring.score_regression(\n",
" y_test=fcst_df[date_filter][target_column_name],\n",
" y_pred=fcst_df[date_filter][\"predicted\"],\n",
" metrics=list(constants.Metric.SCALAR_REGRESSION_SET),\n",
")\n",
"\n",
"print(\"[Test data scores (filtered)]\\n\")\n",
"for key, value in scores.items():\n",
" print(\"{}: {:.3f}\".format(key, value))"
]
}
],
"metadata": {
@@ -711,7 +737,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
"version": "3.7.13"
},
"mimetype": "text/x-python",
"name": "python",

View File

@@ -36,18 +36,18 @@ y_test_df = (
fitted_model = joblib.load("model.pkl")
y_pred, X_trans = fitted_model.rolling_evaluation(X_test_df, y_test_df.values)
X_rf = fitted_model.rolling_forecast(X_test_df, y_test_df.values, step=1)
# Add predictions, actuals, and horizon relative to rolling origin to the test feature data
assign_dict = {
"horizon_origin": X_trans["horizon_origin"].values,
"predicted": y_pred,
target_column_name: y_test_df[target_column_name].values,
fitted_model.forecast_origin_column_name: "forecast_origin",
fitted_model.forecast_column_name: "predicted",
fitted_model.actual_column_name: target_column_name,
}
df_all = X_test_df.assign(**assign_dict)
X_rf.rename(columns=assign_dict, inplace=True)
file_name = "outputs/predictions.csv"
export_csv = df_all.to_csv(file_name, header=True)
export_csv = X_rf.to_csv(file_name, header=True)
# Upload the predictions into artifacts
run.upload_file(name=file_name, path_or_stream=file_name)

View File

@@ -43,7 +43,7 @@
"\n",
"In this example we use the associated New York City energy demand dataset to showcase how you can use AutoML for a simple forecasting problem and explore the results. The goal is predict the energy demand for the next 48 hours based on historic time-series data.\n",
"\n",
"If you are using an Azure Machine Learning Compute Instance, you are all set. Otherwise, go through the [configuration notebook](../../../configuration.ipynb) first, if you haven't already, to establish your connection to the AzureML Workspace.\n",
"If you are using an Azure Machine Learning Compute Instance, you are all set. Otherwise, go through the [configuration notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) first, if you haven't already, to establish your connection to the AzureML Workspace.\n",
"\n",
"In this notebook you will learn how to:\n",
"1. Creating an Experiment using an existing Workspace\n",

View File

@@ -52,7 +52,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Please make sure you have followed the `configuration.ipynb` notebook so that your ML workspace information is saved in the config file."
"Please make sure you have followed the [configuration notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) so that your ML workspace information is saved in the config file."
]
},
{
@@ -758,7 +758,15 @@
"metadata": {},
"source": [
"## Forecasting farther than the forecast horizon <a id=\"recursive forecasting\"></a>\n",
"When the forecast destination, or the latest date in the prediction data frame, is farther into the future than the specified forecast horizon, the `forecast()` function will still make point predictions out to the later date using a recursive operation mode. Internally, the method recursively applies the regular forecaster to generate context so that we can forecast further into the future. \n",
"When the forecast destination, or the latest date in the prediction data frame, is farther into the future than the specified forecast horizon, the forecaster must be iteratively applied. Here, we advance the forecast origin on each iteration over the prediction window, predicting `max_horizon` periods ahead on each iteration. There are two choices for the context data to use as the forecaster advances into the prediction window:\n",
"\n",
"1. We can use forecasted values from previous iterations (recursive forecast),\n",
"2. We can use known, actual values of the target if they are available (rolling forecast).\n",
"\n",
"The first method is useful in a true forecasting scenario when we do not yet know the actual target values while the second is useful in an evaluation scenario where we want to compute accuracy metrics for the `max_horizon`-period-ahead forecaster over a long test set. We refer to the first as a **recursive forecast** since we apply the forecaster recursively over the prediction window and the second as a **rolling forecast** since we roll forward over known actuals.\n",
"\n",
"### Recursive forecasting\n",
"By default, the `forecast()` function will make point predictions out to the later date using a recursive operation mode. Internally, the method recursively applies the regular forecaster to generate context so that we can forecast further into the future. \n",
"\n",
"To illustrate the use-case and operation of recursive forecasting, we'll consider an example with a single time-series where the forecasting period directly follows the training period and is twice as long as the forecasting horizon given at training time.\n",
"\n",
@@ -818,6 +826,35 @@
"np.array_equal(y_pred_all, y_pred_long)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Rolling forecasts\n",
"A rolling forecast is a similar concept to the recursive forecasts described above except that we use known actual values of the target for our context data. We have provided a different, public method for this called `rolling_forecast`. In addition to test data and actuals (`X_test` and `y_test`), `rolling_forecast` also accepts an optional `step` parameter that controls how far the origin advances on each iteration. The recursive forecast mode uses a fixed step of `max_horizon` while `rolling_forecast` defaults to a step size of 1, but can be set to any integer from 1 to `max_horizon`, inclusive.\n",
"\n",
"Let's see what the rolling forecast looks like on the long test set with the step set to 1:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X_rf = fitted_model.rolling_forecast(X_test_long, y_test_long, step=1)\n",
"X_rf.head(n=12)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Notice that `rolling_forecast` has returned a single DataFrame containing all results and has generated some new columns: `_automl_forecast_origin`, `_automl_forecast_y`, and `_automl_actual_y`. These are the origin date for each forecast, the forecasted value and the actual value, respectively. Note that \"y\" in the forecast and actual column names will generally be replaced by the target column name supplied to AutoML.\n",
"\n",
"The output above shows forecasts for two prediction windows, the first with origin at the end of the training set and the second including the first observation in the test set (2000-01-01 06:00:00). Since the forecast windows overlap, there are multiple forecasts for most dates which are associated with different origin dates."
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -880,7 +917,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
"version": "3.7.13"
},
"tags": [
"Forecasting",
@@ -894,5 +931,5 @@
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}

View File

@@ -52,7 +52,7 @@
"\n",
"AutoML highlights here include using Deep Learning forecasts, Arima, Prophet, Remote Execution and Remote Inferencing, and working with the `forecast` function. Please also look at the additional forecasting notebooks, which document lagging, rolling windows, forecast quantiles, other ways to use the forecast function, and forecaster deployment.\n",
"\n",
"Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n",
"Make sure you have executed the [configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) before running this notebook.\n",
"\n",
"Notebook synopsis:\n",
"\n",
@@ -325,7 +325,7 @@
"source": [
"### Setting forecaster maximum horizon \n",
"\n",
"The forecast horizon is the number of periods into the future that the model should predict. Here, we set the horizon to 12 periods (i.e. 12 months). Notice that this is much shorter than the number of months in the test set; we will need to use a rolling test to evaluate the performance on the whole test set. For more discussion of forecast horizons and guiding principles for setting them, please see the [energy demand notebook](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand). "
"The forecast horizon is the number of periods into the future that the model should predict. Here, we set the horizon to 14 periods (i.e. 14 days). Notice that this is much shorter than the number of months in the test set; we will need to use a rolling test to evaluate the performance on the whole test set. For more discussion of forecast horizons and guiding principles for setting them, please see the [energy demand notebook](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand). "
]
},
{
@@ -337,7 +337,7 @@
},
"outputs": [],
"source": [
"forecast_horizon = 12"
"forecast_horizon = 14"
]
},
{
@@ -699,5 +699,5 @@
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}

View File

@@ -4,7 +4,6 @@ import os
import numpy as np
import pandas as pd
from pandas.tseries.frequencies import to_offset
from sklearn.externals import joblib
from sklearn.metrics import mean_absolute_error, mean_squared_error
@@ -19,219 +18,8 @@ except ImportError:
_torch_present = False
def align_outputs(
y_predicted,
X_trans,
X_test,
y_test,
predicted_column_name="predicted",
horizon_colname="horizon_origin",
):
"""
Demonstrates how to get the output aligned to the inputs
using pandas indexes. Helps understand what happened if
the output's shape differs from the input shape, or if
the data got re-sorted by time and grain during forecasting.
Typical causes of misalignment are:
* we predicted some periods that were missing in actuals -> drop from eval
* model was asked to predict past max_horizon -> increase max horizon
* data at start of X_test was needed for lags -> provide previous periods
"""
if horizon_colname in X_trans:
df_fcst = pd.DataFrame(
{
predicted_column_name: y_predicted,
horizon_colname: X_trans[horizon_colname],
}
)
else:
df_fcst = pd.DataFrame({predicted_column_name: y_predicted})
# y and X outputs are aligned by forecast() function contract
df_fcst.index = X_trans.index
# align original X_test to y_test
X_test_full = X_test.copy()
X_test_full[target_column_name] = y_test
# X_test_full's index does not include origin, so reset for merge
df_fcst.reset_index(inplace=True)
X_test_full = X_test_full.reset_index().drop(columns="index")
together = df_fcst.merge(X_test_full, how="right")
# drop rows where prediction or actuals are nan
# happens because of missing actuals
# or at edges of time due to lags/rolling windows
clean = together[
together[[target_column_name, predicted_column_name]].notnull().all(axis=1)
]
return clean
def do_rolling_forecast_with_lookback(
fitted_model, X_test, y_test, max_horizon, X_lookback, y_lookback, freq="D"
):
"""
Produce forecasts on a rolling origin over the given test set.
Each iteration makes a forecast for the next 'max_horizon' periods
with respect to the current origin, then advances the origin by the
horizon time duration. The prediction context for each forecast is set so
that the forecaster uses the actual target values prior to the current
origin time for constructing lag features.
This function returns a concatenated DataFrame of rolling forecasts.
"""
print("Using lookback of size: ", y_lookback.size)
df_list = []
origin_time = X_test[time_column_name].min()
X = X_lookback.append(X_test)
y = np.concatenate((y_lookback, y_test), axis=0)
while origin_time <= X_test[time_column_name].max():
# Set the horizon time - end date of the forecast
horizon_time = origin_time + max_horizon * to_offset(freq)
# Extract test data from an expanding window up-to the horizon
expand_wind = X[time_column_name] < horizon_time
X_test_expand = X[expand_wind]
y_query_expand = np.zeros(len(X_test_expand)).astype(float)
y_query_expand.fill(np.NaN)
if origin_time != X[time_column_name].min():
# Set the context by including actuals up-to the origin time
test_context_expand_wind = X[time_column_name] < origin_time
context_expand_wind = X_test_expand[time_column_name] < origin_time
y_query_expand[context_expand_wind] = y[test_context_expand_wind]
# Print some debug info
print(
"Horizon_time:",
horizon_time,
" origin_time: ",
origin_time,
" max_horizon: ",
max_horizon,
" freq: ",
freq,
)
print("expand_wind: ", expand_wind)
print("y_query_expand")
print(y_query_expand)
print("X_test")
print(X)
print("X_test_expand")
print(X_test_expand)
print("Type of X_test_expand: ", type(X_test_expand))
print("Type of y_query_expand: ", type(y_query_expand))
print("y_query_expand")
print(y_query_expand)
# Make a forecast out to the maximum horizon
# y_fcst, X_trans = y_query_expand, X_test_expand
y_fcst, X_trans = fitted_model.forecast(X_test_expand, y_query_expand)
print("y_fcst")
print(y_fcst)
# Align forecast with test set for dates within
# the current rolling window
trans_tindex = X_trans.index.get_level_values(time_column_name)
trans_roll_wind = (trans_tindex >= origin_time) & (trans_tindex < horizon_time)
test_roll_wind = expand_wind & (X[time_column_name] >= origin_time)
df_list.append(
align_outputs(
y_fcst[trans_roll_wind],
X_trans[trans_roll_wind],
X[test_roll_wind],
y[test_roll_wind],
)
)
# Advance the origin time
origin_time = horizon_time
return pd.concat(df_list, ignore_index=True)
def do_rolling_forecast(fitted_model, X_test, y_test, max_horizon, freq="D"):
"""
Produce forecasts on a rolling origin over the given test set.
Each iteration makes a forecast for the next 'max_horizon' periods
with respect to the current origin, then advances the origin by the
horizon time duration. The prediction context for each forecast is set so
that the forecaster uses the actual target values prior to the current
origin time for constructing lag features.
This function returns a concatenated DataFrame of rolling forecasts.
"""
df_list = []
origin_time = X_test[time_column_name].min()
while origin_time <= X_test[time_column_name].max():
# Set the horizon time - end date of the forecast
horizon_time = origin_time + max_horizon * to_offset(freq)
# Extract test data from an expanding window up-to the horizon
expand_wind = X_test[time_column_name] < horizon_time
X_test_expand = X_test[expand_wind]
y_query_expand = np.zeros(len(X_test_expand)).astype(float)
y_query_expand.fill(np.NaN)
if origin_time != X_test[time_column_name].min():
# Set the context by including actuals up-to the origin time
test_context_expand_wind = X_test[time_column_name] < origin_time
context_expand_wind = X_test_expand[time_column_name] < origin_time
y_query_expand[context_expand_wind] = y_test[test_context_expand_wind]
# Print some debug info
print(
"Horizon_time:",
horizon_time,
" origin_time: ",
origin_time,
" max_horizon: ",
max_horizon,
" freq: ",
freq,
)
print("expand_wind: ", expand_wind)
print("y_query_expand")
print(y_query_expand)
print("X_test")
print(X_test)
print("X_test_expand")
print(X_test_expand)
print("Type of X_test_expand: ", type(X_test_expand))
print("Type of y_query_expand: ", type(y_query_expand))
print("y_query_expand")
print(y_query_expand)
# Make a forecast out to the maximum horizon
y_fcst, X_trans = fitted_model.forecast(X_test_expand, y_query_expand)
print("y_fcst")
print(y_fcst)
# Align forecast with test set for dates within the
# current rolling window
trans_tindex = X_trans.index.get_level_values(time_column_name)
trans_roll_wind = (trans_tindex >= origin_time) & (trans_tindex < horizon_time)
test_roll_wind = expand_wind & (X_test[time_column_name] >= origin_time)
df_list.append(
align_outputs(
y_fcst[trans_roll_wind],
X_trans[trans_roll_wind],
X_test[test_roll_wind],
y_test[test_roll_wind],
)
)
# Advance the origin time
origin_time = horizon_time
return pd.concat(df_list, ignore_index=True)
def map_location_cuda(storage, loc):
return storage.cuda()
def APE(actual, pred):
@@ -254,10 +42,6 @@ def MAPE(actual, pred):
return np.mean(APE(actual_safe, pred_safe))
def map_location_cuda(storage, loc):
return storage.cuda()
parser = argparse.ArgumentParser()
parser.add_argument(
"--max_horizon",
@@ -303,7 +87,6 @@ print(model_path)
run = Run.get_context()
# get input dataset by name
test_dataset = run.input_datasets["test_data"]
lookback_dataset = run.input_datasets["lookback_data"]
grain_column_names = []
@@ -312,15 +95,8 @@ df = test_dataset.to_pandas_dataframe()
print("Read df")
print(df)
X_test_df = test_dataset.drop_columns(columns=[target_column_name])
y_test_df = test_dataset.with_timestamp_columns(None).keep_columns(
columns=[target_column_name]
)
X_lookback_df = lookback_dataset.drop_columns(columns=[target_column_name])
y_lookback_df = lookback_dataset.with_timestamp_columns(None).keep_columns(
columns=[target_column_name]
)
X_test_df = df
y_test = df.pop(target_column_name).to_numpy()
_, ext = os.path.splitext(model_path)
if ext == ".pt":
@@ -336,37 +112,20 @@ else:
# Load the sklearn pipeline.
fitted_model = joblib.load(model_path)
if hasattr(fitted_model, "get_lookback"):
lookback = fitted_model.get_lookback()
df_all = do_rolling_forecast_with_lookback(
fitted_model,
X_test_df.to_pandas_dataframe(),
y_test_df.to_pandas_dataframe().values.T[0],
max_horizon,
X_lookback_df.to_pandas_dataframe()[-lookback:],
y_lookback_df.to_pandas_dataframe().values.T[0][-lookback:],
freq,
)
else:
df_all = do_rolling_forecast(
fitted_model,
X_test_df.to_pandas_dataframe(),
y_test_df.to_pandas_dataframe().values.T[0],
max_horizon,
freq,
)
X_rf = fitted_model.rolling_forecast(X_test_df, y_test, step=1)
assign_dict = {
fitted_model.forecast_origin_column_name: "forecast_origin",
fitted_model.forecast_column_name: "predicted",
fitted_model.actual_column_name: target_column_name,
}
X_rf.rename(columns=assign_dict, inplace=True)
print(df_all)
print("target values:::")
print(df_all[target_column_name])
print("predicted values:::")
print(df_all["predicted"])
print(X_rf.head())
# Use the AutoML scoring module
regression_metrics = list(constants.REGRESSION_SCALAR_SET)
y_test = np.array(df_all[target_column_name])
y_pred = np.array(df_all["predicted"])
y_test = np.array(X_rf[target_column_name])
y_pred = np.array(X_rf["predicted"])
scores = scoring.score_regression(y_test, y_pred, regression_metrics)
print("scores:")
@@ -376,11 +135,11 @@ for key, value in scores.items():
run.log(key, value)
print("Simple forecasting model")
rmse = np.sqrt(mean_squared_error(df_all[target_column_name], df_all["predicted"]))
rmse = np.sqrt(mean_squared_error(X_rf[target_column_name], X_rf["predicted"]))
print("[Test Data] \nRoot Mean squared error: %.2f" % rmse)
mae = mean_absolute_error(df_all[target_column_name], df_all["predicted"])
mae = mean_absolute_error(X_rf[target_column_name], X_rf["predicted"])
print("mean_absolute_error score: %.2f" % mae)
print("MAPE: %.2f" % MAPE(df_all[target_column_name], df_all["predicted"]))
print("MAPE: %.2f" % MAPE(X_rf[target_column_name], X_rf["predicted"]))
run.log("rmse", rmse)
run.log("mae", mae)

View File

@@ -40,7 +40,7 @@
"metadata": {},
"source": [
"### Prerequisites\n",
"You'll need to create a compute Instance by following the instructions in the [EnvironmentSetup.md](../Setup_Resources/EnvironmentSetup.md)."
"You'll need to create a compute Instance by following [these](https://learn.microsoft.com/en-us/azure/machine-learning/v1/how-to-create-manage-compute-instance?tabs=python) instructions."
]
},
{
@@ -251,8 +251,17 @@
"source": [
"### Set up training parameters\n",
"\n",
"This dictionary defines the AutoML and hierarchy settings. For this forecasting task we need to define several settings inncluding the name of the time column, the maximum forecast horizon, the hierarchy definition, and the level of the hierarchy at which to train.\n",
"We need to provide ``ForecastingParameters``, ``AutoMLConfig`` and ``HTSTrainParameters`` objects. For the forecasting task we need to define several settings including the name of the time column, the maximum forecast horizon, the hierarchy definition, and the level of the hierarchy at which to train.\n",
"\n",
"#### ``ForecastingParameters`` arguments\n",
"| Property | Description|\n",
"| :--------------- | :------------------- |\n",
"| **forecast_horizon** | The forecast horizon is how many periods forward you would like to forecast. This integer horizon is in units of the timeseries frequency (e.g. daily, weekly). Periods are inferred from your data. |\n",
"| **time_column_name** | The name of your time column. |\n",
"| **time_series_id_column_names** | The column names used to uniquely identify timeseries in data that has multiple rows with the same timestamp. |\n",
"| **cv_step_size** | Number of periods between two consecutive cross-validation folds. The default value is \\\"auto\\\", in which case AutoMl determines the cross-validation step size automatically, if a validation set is not provided. Or users could specify an integer value. |\n",
"\n",
"#### ``AutoMLConfig`` arguments\n",
"| Property | Description|\n",
"| :--------------- | :------------------- |\n",
"| **task** | forecasting |\n",
@@ -260,20 +269,22 @@
"| **blocked_models** | Blocked models won't be used by AutoML. |\n",
"| **iteration_timeout_minutes** | Maximum amount of time in minutes that the model can train. This is optional but provides customers with greater control on exit criteria. |\n",
"| **iterations** | Number of models to train. This is optional but provides customers with greater control on exit criteria. |\n",
"| **experiment_timeout_hours** | Maximum amount of time in hours that the experiment can take before it terminates. This is optional but provides customers with greater control on exit criteria. |\n",
"| **experiment_timeout_hours** | Maximum amount of time in hours that each experiment can take before it terminates. This is optional but provides customers with greater control on exit criteria. **It does not control the overall timeout for the pipeline run, instead controls the timeout for each training run per partitioned time series.** |\n",
"| **label_column_name** | The name of the label column. |\n",
"| **forecast_horizon** | The forecast horizon is how many periods forward you would like to forecast. This integer horizon is in units of the timeseries frequency (e.g. daily, weekly). Periods are inferred from your data. |\n",
"|**n_cross_validations**|Number of cross-validation folds to use for model/pipeline selection. The default value is \"auto\", in which case AutoMl determines the number of cross-validations automatically, if a validation set is not provided. Or users could specify an integer value.\n",
"|**cv_step_size**|Number of periods between two consecutive cross-validation folds. The default value is \"auto\", in which case AutoMl determines the cross-validation step size automatically, if a validation set is not provided. Or users could specify an integer value.\n",
"| **enable_early_stopping** | Flag to enable early termination if the score is not improving in the short term. |\n",
"| **time_column_name** | The name of your time column. |\n",
"| **hierarchy_column_names** | The names of columns that define the hierarchical structure of the data from highest level to most granular. |\n",
"| **training_level** | The level of the hierarchy to be used for training models. |\n",
"| **n_cross_validations** | Number of cross validation splits. The default value is \\\"auto\\\", in which case AutoMl determines the number of cross-validations automatically, if a validation set is not provided. Or users could specify an integer value. Rolling Origin Validation is used to split time-series in a temporally consistent way. |\n",
"| **enable_early_stopping** | Flag to enable early termination if the primary metric is no longer improving. |\n",
"| **enable_engineered_explanations** | Engineered feature explanations will be downloaded if enable_engineered_explanations flag is set to True. By default it is set to False to save storage space. |\n",
"| **time_series_id_column_name** | The column names used to uniquely identify timeseries in data that has multiple rows with the same timestamp. |\n",
"| **track_child_runs** | Flag to disable tracking of child runs. Only best run is tracked if the flag is set to False (this includes the model and metrics of the run). |\n",
"| **pipeline_fetch_max_batch_size** | Determines how many pipelines (training algorithms) to fetch at a time for training, this helps reduce throttling when training at large scale. |\n",
"| **model_explainability** | Flag to disable explaining the best automated ML model at the end of all training iterations. The default is True and will block non-explainable models which may impact the forecast accuracy. For more information, see [Interpretability: model explanations in automated machine learning](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-machine-learning-interpretability-automl). |"
"| **model_explainability** | Flag to disable explaining the best automated ML model at the end of all training iterations. The default is True and will block non-explainable models which may impact the forecast accuracy. For more information, see [Interpretability: model explanations in automated machine learning](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-machine-learning-interpretability-automl). |\n",
"\n",
"#### ``HTSTrainParameters`` arguments\n",
"| Property | Description|\n",
"| :--------------- | :------------------- |\n",
"| **automl_settings** | The ``AutoMLConfig`` object defined above. |\n",
"| **hierarchy_column_names** | The names of columns that define the hierarchical structure of the data from highest level to most granular. |\n",
"| **training_level** | The level of the hierarchy to be used for training models. |\n",
"| **enable_engineered_explanations** | The switch controls engineered explanations. |"
]
},
{
@@ -287,6 +298,9 @@
"outputs": [],
"source": [
"from azureml.train.automl.runtime._hts.hts_parameters import HTSTrainParameters\n",
"from azureml.automl.core.forecasting_parameters import ForecastingParameters\n",
"from azureml.train.automl.automlconfig import AutoMLConfig\n",
"\n",
"\n",
"model_explainability = True\n",
"\n",
@@ -300,24 +314,26 @@
"label_column_name = \"quantity\"\n",
"forecast_horizon = 7\n",
"\n",
"forecasting_parameters = ForecastingParameters(\n",
" time_column_name=time_column_name,\n",
" forecast_horizon=forecast_horizon,\n",
")\n",
"\n",
"automl_settings = {\n",
" \"task\": \"forecasting\",\n",
" \"primary_metric\": \"normalized_root_mean_squared_error\",\n",
" \"label_column_name\": label_column_name,\n",
" \"time_column_name\": time_column_name,\n",
" \"forecast_horizon\": forecast_horizon,\n",
" \"hierarchy_column_names\": hierarchy,\n",
" \"hierarchy_training_level\": training_level,\n",
" \"track_child_runs\": False,\n",
" \"pipeline_fetch_max_batch_size\": 15,\n",
" \"model_explainability\": model_explainability,\n",
" \"n_cross_validations\": \"auto\", # Feel free to set to a small integer (>=2) if runtime is an issue.\n",
" \"cv_step_size\": \"auto\",\n",
"automl_settings = AutoMLConfig(\n",
" task=\"forecasting\",\n",
" primary_metric=\"normalized_root_mean_squared_error\",\n",
" experiment_timeout_hours=1,\n",
" label_column_name=label_column_name,\n",
" track_child_runs=False,\n",
" forecasting_parameters=forecasting_parameters,\n",
" pipeline_fetch_max_batch_size=15,\n",
" model_explainability=model_explainability,\n",
" n_cross_validations=\"auto\", # Feel free to set to a small integer (>=2) if runtime is an issue.\n",
" cv_step_size=\"auto\",\n",
" # The following settings are specific to this sample and should be adjusted according to your own needs.\n",
" \"iteration_timeout_minutes\": 10,\n",
" \"iterations\": 10,\n",
"}\n",
" iteration_timeout_minutes=10,\n",
" iterations=15,\n",
")\n",
"\n",
"hts_parameters = HTSTrainParameters(\n",
" automl_settings=automl_settings,\n",
@@ -338,15 +354,25 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Parallel run step is leveraged to train the hierarchy. To configure the ParallelRunConfig you will need to determine the appropriate number of workers and nodes for your use case. The `process_count_per_node` is based off the number of cores of the compute VM. The node_count will determine the number of master nodes to use, increasing the node count will speed up the training process.\n",
"Parallel run step is leveraged to train multiple models at once. To configure the ParallelRunConfig you will need to determine the appropriate number of workers and nodes for your use case. The ``process_count_per_node`` is based off the number of cores of the compute VM. The node_count will determine the number of master nodes to use, increasing the node count will speed up the training process.\n",
"\n",
"* **experiment:** The experiment used for training.\n",
"* **train_data:** The tabular dataset to be used as input to the training run.\n",
"* **node_count:** The number of compute nodes to be used for running the user script. We recommend to start with 3 and increase the node_count if the training time is taking too long.\n",
"* **process_count_per_node:** Process count per node, we recommend 2:1 ratio for number of cores: number of processes per node. eg. If node has 16 cores then configure 8 or less process count per node or optimal performance.\n",
"* **train_pipeline_parameters:** The set of configuration parameters defined in the previous section. \n",
"| Property | Description|\n",
"| :--------------- | :------------------- |\n",
"| **experiment** | The experiment used for training. |\n",
"| **train_data** | The file dataset to be used as input to the training run. |\n",
"| **node_count** | The number of compute nodes to be used for running the user script. We recommend to start with 3 and increase the node_count if the training time is taking too long. |\n",
"| **process_count_per_node** | Process count per node, we recommend 2:1 ratio for number of cores: number of processes per node. eg. If node has 16 cores then configure 8 or less process count per node for optimal performance. |\n",
"| **train_pipeline_parameters** | The set of configuration parameters defined in the previous section. |\n",
"| **run_invocation_timeout** | Maximum amount of time in seconds that the ``ParallelRunStep`` class is allowed. This is optional but provides customers with greater control on exit criteria. This must be greater than ``experiment_timeout_hours`` by at least 300 seconds. |\n",
"\n",
"Calling this method will create a new aggregated dataset which is generated dynamically on pipeline execution."
"Calling this method will create a new aggregated dataset which is generated dynamically on pipeline execution.\n",
"\n",
"**Note**: Total time taken for the **training step** in the pipeline to complete = $ \\frac{t}{ p \\times n } \\times ts $\n",
"where,\n",
"- $ t $ is time taken for training one partition (can be viewed in the training logs)\n",
"- $ p $ is ``process_count_per_node``\n",
"- $ n $ is ``node_count``\n",
"- $ ts $ is total number of partitions in time series based on ``partition_column_names``"
]
},
{
@@ -365,6 +391,7 @@
" node_count=2,\n",
" process_count_per_node=8,\n",
" train_pipeline_parameters=hts_parameters,\n",
" run_invocation_timeout=3900,\n",
")"
]
},
@@ -509,19 +536,24 @@
"source": [
"## 5.0 Forecasting\n",
"For hierarchical forecasting we need to provide the HTSInferenceParameters object.\n",
"#### HTSInferenceParameters arguments\n",
"* **hierarchy_forecast_level:** The default level of the hierarchy to produce prediction/forecast on.\n",
"* **allocation_method:** \\[Optional] The disaggregation method to use if the hierarchy forecast level specified is below the define hierarchy training level. <br><i>(average historical proportions) 'average_historical_proportions'</i><br><i>(proportions of the historical averages) 'proportions_of_historical_average'</i>\n",
"#### ``HTSInferenceParameters`` arguments\n",
"| Property | Description|\n",
"| :--------------- | :------------------- |\n",
"| **hierarchy_forecast_level:** | The default level of the hierarchy to produce prediction/forecast on. |\n",
"| **allocation_method:** | \\[Optional] The disaggregation method to use if the hierarchy forecast level specified is below the define hierarchy training level. <br><i>(average historical proportions) 'average_historical_proportions'</i><br><i>(proportions of the historical averages) 'proportions_of_historical_average'</i> |\n",
"\n",
"#### get_many_models_batch_inference_steps arguments\n",
"* **experiment:** The experiment used for inference run.\n",
"* **inference_data:** The data to use for inferencing. It should be the same schema as used for training.\n",
"* **compute_target:** The compute target that runs the inference pipeline.\n",
"* **node_count:** The number of compute nodes to be used for running the user script. We recommend to start with the number of cores per node (varies by compute sku).\n",
"* **process_count_per_node:** The number of processes per node.\n",
"* **train_run_id:** \\[Optional] The run id of the hierarchy training, by default it is the latest successful training hts run in the experiment.\n",
"* **train_experiment_name:** \\[Optional] The train experiment that contains the train pipeline. This one is only needed when the train pipeline is not in the same experiement as the inference pipeline.\n",
"* **process_count_per_node:** \\[Optional] The number of processes per node, by default it's 4."
"#### ``get_many_models_batch_inference_steps`` arguments\n",
"| Property | Description|\n",
"| :--------------- | :------------------- |\n",
"| **experiment** | The experiment used for inference run. |\n",
"| **inference_data** | The data to use for inferencing. It should be the same schema as used for training.\n",
"| **compute_target** | The compute target that runs the inference pipeline. |\n",
"| **node_count** | The number of compute nodes to be used for running the user script. We recommend to start with the number of cores per node (varies by compute sku). |\n",
"| **process_count_per_node** | \\[Optional] The number of processes per node. By default it's 2 (should be at most half of the number of cores in a single node of the compute cluster that will be used for the experiment).\n",
"| **inference_pipeline_parameters** | \\[Optional] The ``HTSInferenceParameters`` object defined above. |\n",
"| **train_run_id** | \\[Optional] The run id of the **training pipeline**. By default it is the latest successful training pipeline run in the experiment. |\n",
"| **train_experiment_name** | \\[Optional] The train experiment that contains the train pipeline. This one is only needed when the train pipeline is not in the same experiement as the inference pipeline. |\n",
"| **run_invocation_timeout** | \\[Optional] Maximum amount of time in seconds that the ``ParallelRunStep`` class is allowed. This is optional but provides customers with greater control on exit criteria. |"
]
},
{

View File

@@ -40,7 +40,7 @@
"metadata": {},
"source": [
"### Prerequisites\n",
"You'll need to create a compute Instance by following the instructions in the [EnvironmentSetup.md](../Setup_Resources/EnvironmentSetup.md)."
"You'll need to create a compute Instance by following [these](https://learn.microsoft.com/en-us/azure/machine-learning/v1/how-to-create-manage-compute-instance?tabs=python) instructions."
]
},
{
@@ -379,8 +379,17 @@
"source": [
"### Set up training parameters\n",
"\n",
"This dictionary defines the AutoML and many models settings. For this forecasting task we need to define several settings inncluding the name of the time column, the maximum forecast horizon, and the partition column name definition.\n",
"We need to provide ``ForecastingParameters``, ``AutoMLConfig`` and ``ManyModelsTrainParameters`` objects. For the forecasting task we also need to define several settings including the name of the time column, the maximum forecast horizon, and the partition column name(s) definition.\n",
"\n",
"#### ``ForecastingParameters`` arguments\n",
"| Property | Description|\n",
"| :--------------- | :------------------- |\n",
"| **forecast_horizon** | The forecast horizon is how many periods forward you would like to forecast. This integer horizon is in units of the timeseries frequency (e.g. daily, weekly). Periods are inferred from your data. |\n",
"| **time_column_name** | The name of your time column. |\n",
"| **time_series_id_column_names** | The column names used to uniquely identify timeseries in data that has multiple rows with the same timestamp. |\n",
"| **cv_step_size** | Number of periods between two consecutive cross-validation folds. The default value is \\\"auto\\\", in which case AutoMl determines the cross-validation step size automatically, if a validation set is not provided. Or users could specify an integer value. |\n",
"\n",
"#### ``AutoMLConfig`` arguments\n",
"| Property | Description|\n",
"| :--------------- | :------------------- |\n",
"| **task** | forecasting |\n",
@@ -388,17 +397,19 @@
"| **blocked_models** | Blocked models won't be used by AutoML. |\n",
"| **iteration_timeout_minutes** | Maximum amount of time in minutes that the model can train. This is optional but provides customers with greater control on exit criteria. |\n",
"| **iterations** | Number of models to train. This is optional but provides customers with greater control on exit criteria. |\n",
"| **experiment_timeout_hours** | Maximum amount of time in hours that the experiment can take before it terminates. This is optional but provides customers with greater control on exit criteria. |\n",
"| **experiment_timeout_hours** | Maximum amount of time in hours that each experiment can take before it terminates. This is optional but provides customers with greater control on exit criteria. **It does not control the overall timeout for the pipeline run, instead controls the timeout for each training run per partitioned time series.** |\n",
"| **label_column_name** | The name of the label column. |\n",
"| **forecast_horizon** | The forecast horizon is how many periods forward you would like to forecast. This integer horizon is in units of the timeseries frequency (e.g. daily, weekly). Periods are inferred from your data. |\n",
"| **n_cross_validations** | Number of cross validation splits. The default value is \"auto\", in which case AutoMl determines the number of cross-validations automatically, if a validation set is not provided. Or users could specify an integer value. Rolling Origin Validation is used to split time-series in a temporally consistent way. |\n",
"|**cv_step_size**|Number of periods between two consecutive cross-validation folds. The default value is \"auto\", in which case AutoMl determines the cross-validation step size automatically, if a validation set is not provided. Or users could specify an integer value.\n",
"| **enable_early_stopping** | Flag to enable early termination if the score is not improving in the short term. |\n",
"| **time_column_name** | The name of your time column. |\n",
"| **n_cross_validations** | Number of cross validation splits. The default value is \\\"auto\\\", in which case AutoMl determines the number of cross-validations automatically, if a validation set is not provided. Or users could specify an integer value. Rolling Origin Validation is used to split time-series in a temporally consistent way. |\n",
"| **enable_early_stopping** | Flag to enable early termination if the primary metric is no longer improving. |\n",
"| **enable_engineered_explanations** | Engineered feature explanations will be downloaded if enable_engineered_explanations flag is set to True. By default it is set to False to save storage space. |\n",
"| **time_series_id_column_names** | The column names used to uniquely identify timeseries in data that has multiple rows with the same timestamp. |\n",
"| **track_child_runs** | Flag to disable tracking of child runs. Only best run is tracked if the flag is set to False (this includes the model and metrics of the run). |\n",
"| **pipeline_fetch_max_batch_size** | Determines how many pipelines (training algorithms) to fetch at a time for training, this helps reduce throttling when training at large scale. |\n",
"\n",
"\n",
"#### ``ManyModelsTrainParameters`` arguments\n",
"| Property | Description|\n",
"| :--------------- | :------------------- |\n",
"| **automl_settings** | The ``AutoMLConfig`` object defined above. |\n",
"| **partition_column_names** | The names of columns used to group your models. For timeseries, the groups must not split up individual time-series. That is, each group must contain one or more whole time-series. |"
]
},
@@ -415,23 +426,29 @@
"from azureml.train.automl.runtime._many_models.many_models_parameters import (\n",
" ManyModelsTrainParameters,\n",
")\n",
"from azureml.automl.core.forecasting_parameters import ForecastingParameters\n",
"from azureml.train.automl.automlconfig import AutoMLConfig\n",
"\n",
"partition_column_names = [\"Store\", \"Brand\"]\n",
"automl_settings = {\n",
" \"task\": \"forecasting\",\n",
" \"primary_metric\": \"normalized_root_mean_squared_error\",\n",
" \"iteration_timeout_minutes\": 10, # This needs to be changed based on the dataset. We ask customer to explore how long training is taking before settings this value\n",
" \"iterations\": 15,\n",
" \"experiment_timeout_hours\": 0.25,\n",
" \"label_column_name\": \"Quantity\",\n",
" \"n_cross_validations\": \"auto\", # Feel free to set to a small integer (>=2) if runtime is an issue.\n",
" \"cv_step_size\": \"auto\",\n",
" \"time_column_name\": \"WeekStarting\",\n",
" \"drop_column_names\": \"Revenue\",\n",
" \"forecast_horizon\": 6,\n",
" \"time_series_id_column_names\": partition_column_names,\n",
" \"track_child_runs\": False,\n",
"}\n",
"\n",
"forecasting_parameters = ForecastingParameters(\n",
" time_column_name=\"WeekStarting\",\n",
" forecast_horizon=6,\n",
" time_series_id_column_names=partition_column_names,\n",
" cv_step_size=\"auto\",\n",
")\n",
"\n",
"automl_settings = AutoMLConfig(\n",
" task=\"forecasting\",\n",
" primary_metric=\"normalized_root_mean_squared_error\",\n",
" iteration_timeout_minutes=10,\n",
" iterations=15,\n",
" experiment_timeout_hours=0.25,\n",
" label_column_name=\"Quantity\",\n",
" n_cross_validations=\"auto\", # Feel free to set to a small integer (>=2) if runtime is an issue.\n",
" track_child_runs=False,\n",
" forecasting_parameters=forecasting_parameters,\n",
")\n",
"\n",
"mm_paramters = ManyModelsTrainParameters(\n",
" automl_settings=automl_settings, partition_column_names=partition_column_names\n",
@@ -451,7 +468,9 @@
"\n",
"Reuse of previous results (``allow_reuse``) is key when using pipelines in a collaborative environment since eliminating unnecessary reruns offers agility. Reuse is the default behavior when the ``script_name``, ``inputs``, and the parameters of a step remain the same. When reuse is allowed, results from the previous run are immediately sent to the next step. If ``allow_reuse`` is set to False, a new run will always be generated for this step during pipeline execution.\n",
"\n",
"> Note that we only support partitioned FileDataset and TabularDataset without partition when using such output as input."
"> Note that we only support partitioned FileDataset and TabularDataset without partition when using such output as input.\n",
"\n",
"> Note that we **drop column** \"Revenue\" from the dataset in this step to avoid information leak as \"Quantity\" = \"Revenue\" / \"Price\". **Please modify the logic based on your data**."
]
},
{
@@ -489,17 +508,25 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Parallel run step is leveraged to train multiple models at once. To configure the ParallelRunConfig you will need to determine the appropriate number of workers and nodes for your use case. The process_count_per_node is based off the number of cores of the compute VM. The node_count will determine the number of master nodes to use, increasing the node count will speed up the training process.\n",
"Parallel run step is leveraged to train multiple models at once. To configure the ParallelRunConfig you will need to determine the appropriate number of workers and nodes for your use case. The ``process_count_per_node`` is based off the number of cores of the compute VM. The node_count will determine the number of master nodes to use, increasing the node count will speed up the training process.\n",
"\n",
"| Property | Description|\n",
"| :--------------- | :------------------- |\n",
"| **experiment** | The experiment used for training. |\n",
"| **train_data** | The file dataset to be used as input to the training run. |\n",
"| **node_count** | The number of compute nodes to be used for running the user script. We recommend to start with 3 and increase the node_count if the training time is taking too long. |\n",
"| **process_count_per_node** | Process count per node, we recommend 2:1 ratio for number of cores: number of processes per node. eg. If node has 16 cores then configure 8 or less process count per node or optimal performance. |\n",
"| **process_count_per_node** | Process count per node, we recommend 2:1 ratio for number of cores: number of processes per node. eg. If node has 16 cores then configure 8 or less process count per node for optimal performance. |\n",
"| **train_pipeline_parameters** | The set of configuration parameters defined in the previous section. |\n",
"| **run_invocation_timeout** | Maximum amount of time in seconds that the ``ParallelRunStep`` class is allowed. This is optional but provides customers with greater control on exit criteria. This must be greater than ``experiment_timeout_hours`` by at least 300 seconds. |\n",
"\n",
"Calling this method will create a new aggregated dataset which is generated dynamically on pipeline execution."
"Calling this method will create a new aggregated dataset which is generated dynamically on pipeline execution.\n",
"\n",
"**Note**: Total time taken for the **training step** in the pipeline to complete = $ \\frac{t}{ p \\times n } \\times ts $\n",
"where,\n",
"- $ t $ is time taken for training one partition (can be viewed in the training logs)\n",
"- $ p $ is ``process_count_per_node``\n",
"- $ n $ is ``node_count``\n",
"- $ ts $ is total number of partitions in time series based on ``partition_column_names``"
]
},
{
@@ -517,7 +544,7 @@
" compute_target=compute_target,\n",
" node_count=2,\n",
" process_count_per_node=8,\n",
" run_invocation_timeout=920,\n",
" run_invocation_timeout=1200,\n",
" train_pipeline_parameters=mm_paramters,\n",
")"
]
@@ -598,7 +625,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### 7.2 Schedule the pipeline\n",
"### 5.2 Schedule the pipeline\n",
"You can also [schedule the pipeline](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-schedule-pipelines) to run on a time-based or change-based schedule. This could be used to automatically retrain models every month or based on another trigger such as data drift."
]
},
@@ -654,25 +681,31 @@
"source": [
"For many models we need to provide the ManyModelsInferenceParameters object.\n",
"\n",
"#### ManyModelsInferenceParameters arguments\n",
"#### ``ManyModelsInferenceParameters`` arguments\n",
"| Property | Description|\n",
"| :--------------- | :------------------- |\n",
"| **partition_column_names** | List of column names that identifies groups. |\n",
"| **partition_column_names** | List of column names that identifies groups. |\n",
"| **target_column_name** | \\[Optional] Column name only if the inference dataset has the target. |\n",
"| **time_column_name** | \\[Optional] Column name only if it is timeseries. |\n",
"| **many_models_run_id** | \\[Optional] Many models run id where models were trained. |\n",
"| **time_column_name** | \\[Optional] Time column name only if it is timeseries. |\n",
"| **inference_type** | \\[Optional] Which inference method to use on the model. Possible values are 'forecast', 'predict_proba', and 'predict'. |\n",
"| **forecast_mode** | \\[Optional] The type of forecast to be used, either 'rolling' or 'recursive'; defaults to 'recursive'. |\n",
"| **step** | \\[Optional] Number of periods to advance the forecasting window in each iteration **(for rolling forecast only)**; defaults to 1. |\n",
"\n",
"#### get_many_models_batch_inference_steps arguments\n",
"#### ``get_many_models_batch_inference_steps`` arguments\n",
"| Property | Description|\n",
"| :--------------- | :------------------- |\n",
"| **experiment** | The experiment used for inference run. |\n",
"| **inference_data** | The data to use for inferencing. It should be the same schema as used for training.\n",
"| **compute_target** The compute target that runs the inference pipeline.|\n",
"| **compute_target** | The compute target that runs the inference pipeline. |\n",
"| **node_count** | The number of compute nodes to be used for running the user script. We recommend to start with the number of cores per node (varies by compute sku). |\n",
"| **process_count_per_node** The number of processes per node.\n",
"| **train_run_id** | \\[Optional] The run id of the hierarchy training, by default it is the latest successful training many model run in the experiment. |\n",
"| **process_count_per_node** | \\[Optional] The number of processes per node. By default it's 2 (should be at most half of the number of cores in a single node of the compute cluster that will be used for the experiment).\n",
"| **inference_pipeline_parameters** | \\[Optional] The ``ManyModelsInferenceParameters`` object defined above. |\n",
"| **append_row_file_name** | \\[Optional] The name of the output file (optional, default value is 'parallel_run_step.txt'). Supports 'txt' and 'csv' file extension. A 'txt' file extension generates the output in 'txt' format with space as separator without column names. A 'csv' file extension generates the output in 'csv' format with comma as separator and with column names. |\n",
"| **train_run_id** | \\[Optional] The run id of the **training pipeline**. By default it is the latest successful training pipeline run in the experiment. |\n",
"| **train_experiment_name** | \\[Optional] The train experiment that contains the train pipeline. This one is only needed when the train pipeline is not in the same experiement as the inference pipeline. |\n",
"| **process_count_per_node** | \\[Optional] The number of processes per node, by default it's 4. |"
"| **run_invocation_timeout** | \\[Optional] Maximum amount of time in seconds that the ``ParallelRunStep`` class is allowed. This is optional but provides customers with greater control on exit criteria. |\n",
"| **output_datastore** | \\[Optional] The ``Datastore`` or ``OutputDatasetConfig`` to be used for output. If specified any pipeline output will be written to that location. If unspecified the default datastore will be used. |\n",
"| **arguments** | \\[Optional] Arguments to be passed to inference script. Possible argument is '--forecast_quantiles' followed by quantile values. |"
]
},
{
@@ -692,6 +725,8 @@
" target_column_name=\"Quantity\",\n",
")\n",
"\n",
"output_file_name = \"parallel_run_step.csv\"\n",
"\n",
"inference_steps = AutoMLPipelineBuilder.get_many_models_batch_inference_steps(\n",
" experiment=experiment,\n",
" inference_data=inference_ds_small,\n",
@@ -703,6 +738,8 @@
" train_run_id=training_run.id,\n",
" train_experiment_name=training_run.experiment.name,\n",
" inference_pipeline_parameters=mm_parameters,\n",
" append_row_file_name=output_file_name,\n",
" arguments=[\"--forecast_quantiles\", 0.1, 0.9],\n",
")"
]
},
@@ -737,7 +774,7 @@
"\n",
"The following code snippet:\n",
"1. Downloads the contents of the output folder that is passed in the parallel run step \n",
"2. Reads the parallel_run_step.txt file that has the predictions as pandas dataframe and \n",
"2. Reads the output file that has the predictions as pandas dataframe and \n",
"3. Displays the top 10 rows of the predictions"
]
},
@@ -752,19 +789,9 @@
"forecasting_results_name = \"forecasting_results\"\n",
"forecasting_output_name = \"many_models_inference_output\"\n",
"forecast_file = get_output_from_mm_pipeline(\n",
" inference_run, forecasting_results_name, forecasting_output_name\n",
" inference_run, forecasting_results_name, forecasting_output_name, output_file_name\n",
")\n",
"df = pd.read_csv(forecast_file, delimiter=\" \", header=None)\n",
"df.columns = [\n",
" \"Week Starting\",\n",
" \"Store\",\n",
" \"Brand\",\n",
" \"Quantity\",\n",
" \"Advert\",\n",
" \"Price\",\n",
" \"Revenue\",\n",
" \"Predicted\",\n",
"]\n",
"df = pd.read_csv(forecast_file)\n",
"print(\n",
" \"Prediction has \", df.shape[0], \" rows. Here the first 10 rows are being displayed.\"\n",
")\n",

View File

@@ -11,6 +11,12 @@ def main(args):
dataset = run_context.input_datasets["train_10_models"]
df = dataset.to_pandas_dataframe()
# Drop the column "Revenue" from the dataset to avoid information leak as
# "Quantity" = "Revenue" / "Price". Please modify the logic based on your data.
drop_column_name = "Revenue"
if drop_column_name in df.columns:
df.drop(drop_column_name, axis=1, inplace=True)
# Apply any data pre-processing techniques here
df.to_parquet(output / "data_prepared_result.parquet", compression=None)

View File

@@ -40,7 +40,7 @@
"## Introduction<a id=\"introduction\"></a>\n",
"In this example, we use AutoML to train, select, and operationalize a time-series forecasting model for multiple time-series.\n",
"\n",
"Make sure you have executed the [configuration notebook](../../../configuration.ipynb) before running this notebook.\n",
"Make sure you have executed the [configuration notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) before running this notebook.\n",
"\n",
"The examples in the follow code samples use the University of Chicago's Dominick's Finer Foods dataset to forecast orange juice sales. Dominick's was a grocery chain in the Chicago metropolitan area."
]

View File

@@ -13,7 +13,7 @@
"source": [
"## Introduction\n",
"\n",
"In this notebook, we demonstrate how to use piplines to train and inference on AutoML Forecasting model. Two pipelines will be created: one for training AutoML model, and the other is for inference on AutoML model. We'll also demonstrate how to schedule the inference pipeline so you can get inference results periodically (with refreshed test dataset). Make sure you have executed the configuration notebook before running this notebook. In this notebook you will learn how to:\n",
"In this notebook, we demonstrate how to use piplines to train and inference on AutoML Forecasting model. Two pipelines will be created: one for training AutoML model, and the other is for inference on AutoML model. We'll also demonstrate how to schedule the inference pipeline so you can get inference results periodically (with refreshed test dataset). Make sure you have executed the [configuration notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) before running this notebook. In this notebook you will learn how to:\n",
"\n",
"- Configure AutoML using AutoMLConfig for forecasting tasks using pipeline AutoMLSteps.\n",
"- Create and register an AutoML model using AzureML pipeline.\n",

View File

@@ -859,8 +859,8 @@
"outputs": [],
"source": [
"%matplotlib inline\n",
"test_pred = plt.scatter(y_test, y_pred_test, color=\"\")\n",
"test_test = plt.scatter(y_test, y_test, color=\"g\")\n",
"test_pred = plt.scatter(y_test, y_pred_test, c=[\"b\"])\n",
"test_test = plt.scatter(y_test, y_test, c=[\"g\"])\n",
"plt.legend(\n",
" (test_pred, test_test), (\"prediction\", \"truth\"), loc=\"upper left\", fontsize=8\n",
")\n",

View File

@@ -422,8 +422,8 @@
"outputs": [],
"source": [
"%matplotlib inline\n",
"test_pred = plt.scatter(y_test, y_pred_test, color=\"\")\n",
"test_test = plt.scatter(y_test, y_test, color=\"g\")\n",
"test_pred = plt.scatter(y_test, y_pred_test, c=[\"b\"])\n",
"test_test = plt.scatter(y_test, y_test, c=[\"g\"])\n",
"plt.legend(\n",
" (test_pred, test_test), (\"prediction\", \"truth\"), loc=\"upper left\", fontsize=8\n",
")\n",

View File

@@ -239,7 +239,7 @@
"\n",
"env = Environment(\"deploytocloudenv\")\n",
"env.python.conda_dependencies.add_pip_package(\"joblib\")\n",
"env.python.conda_dependencies.add_pip_package(\"numpy\")\n",
"env.python.conda_dependencies.add_pip_package(\"numpy==1.23\")\n",
"env.python.conda_dependencies.add_pip_package(\"scikit-learn=={}\".format(sklearn.__version__))"
]
},

View File

@@ -285,7 +285,7 @@
" 'azureml-defaults',\n",
" 'inference-schema[numpy-support]',\n",
" 'joblib',\n",
" 'numpy',\n",
" 'numpy==1.23',\n",
" 'scikit-learn=={}'.format(sklearn.__version__)\n",
"])"
]
@@ -486,7 +486,7 @@
" 'azureml-defaults',\n",
" 'inference-schema[numpy-support]',\n",
" 'joblib',\n",
" 'numpy',\n",
" 'numpy==1.23',\n",
" 'scikit-learn=={}'.format(sklearn.__version__)\n",
"])\n",
"inference_config = InferenceConfig(entry_script='score.py', environment=environment)\n",

View File

@@ -110,7 +110,7 @@
" pip_packages=[\n",
" 'azureml-defaults',\n",
" 'inference-schema[numpy-support]',\n",
" 'numpy',\n",
" 'numpy==1.23',\n",
" 'scikit-learn==0.22.1',\n",
" 'scipy'\n",
"])"

View File

@@ -240,9 +240,9 @@
"# Please see [Azure ML Containers repository](https://github.com/Azure/AzureML-Containers#featured-tags)\n",
"# for open-sourced GPU base images.\n",
"env.docker.base_image = DEFAULT_GPU_IMAGE\n",
"env.python.conda_dependencies = CondaDependencies.create(python_version=\"3.6.2\", \n",
"env.python.conda_dependencies = CondaDependencies.create(python_version=\"3.6.2\", pin_sdk_version=False,\n",
" conda_packages=['tensorflow-gpu==1.12.0','numpy'],\n",
" pip_packages=['azureml-contrib-services', 'azureml-defaults'])\n",
" pip_packages=['azureml-contrib-services==1.47.0', 'azureml-defaults==1.47.0'])\n",
"\n",
"inference_config = InferenceConfig(entry_script=\"score.py\", environment=env)\n",
"aks_config = AksWebservice.deploy_configuration()\n",
@@ -343,7 +343,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
"version": "3.7.0"
}
},
"nbformat": 4,

View File

@@ -5,4 +5,4 @@ dependencies:
- matplotlib
- tqdm
- scipy
- sklearn
- scikit-learn

View File

@@ -5,4 +5,4 @@ dependencies:
- matplotlib
- tqdm
- scipy
- sklearn
- scikit-learn

View File

@@ -137,7 +137,7 @@
"myenv = Environment('my-pyspark-environment')\r\n",
"myenv.docker.base_image = \"mcr.microsoft.com/mmlspark/release:0.15\"\r\n",
"myenv.inferencing_stack_version = \"latest\"\r\n",
"myenv.python.conda_dependencies = CondaDependencies.create(pip_packages=[\"azureml-core\",\"azureml-defaults\",\"azureml-telemetry\",\"azureml-train-restclients-hyperdrive\",\"azureml-train-core\"], python_version=\"3.6.2\")\r\n",
"myenv.python.conda_dependencies = CondaDependencies.create(pip_packages=[\"azureml-core\",\"azureml-defaults\",\"azureml-telemetry\",\"azureml-train-restclients-hyperdrive\",\"azureml-train-core\"], python_version=\"3.7.0\")\r\n",
"myenv.python.conda_dependencies.add_channel(\"conda-forge\")\r\n",
"myenv.spark.packages = [SparkPackage(\"com.microsoft.ml.spark\", \"mmlspark_2.11\", \"0.15\"), SparkPackage(\"com.microsoft.azure\", \"azure-storage\", \"2.0.0\"), SparkPackage(\"org.apache.hadoop\", \"hadoop-azure\", \"2.7.0\")]\r\n",
"myenv.spark.repositories = [\"https://mmlspark.azureedge.net/maven\"]\r\n"
@@ -341,7 +341,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
"version": "3.7.0"
}
},
"nbformat": 4,

View File

@@ -106,7 +106,7 @@
"metadata": {},
"outputs": [],
"source": [
"print(\"This notebook was created using version 1.46.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.49.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},
@@ -235,18 +235,30 @@
"# Note: this is to pin the pandas and xgboost versions to be same as notebook.\n",
"# In production scenario user would choose their dependencies\n",
"import pkg_resources\n",
"from distutils.version import LooseVersion\n",
"available_packages = pkg_resources.working_set\n",
"pandas_ver = None\n",
"numpy_ver = None\n",
"sklearn_ver = None\n",
"for dist in list(available_packages):\n",
" if dist.key == 'pandas':\n",
" pandas_ver = dist.version\n",
" pandas_ver = dist.version\n",
" if dist.key == 'numpy':\n",
" if LooseVersion(dist.version) >= LooseVersion('1.20.0'):\n",
" numpy_ver = dist.version\n",
" else:\n",
" numpy_ver = '1.21.6'\n",
" if dist.key == 'scikit-learn':\n",
" sklearn_ver = dist.version\n",
"pandas_dep = 'pandas'\n",
"numpy_dep = 'numpy'\n",
"sklearn_dep = 'scikit-learn'\n",
"if pandas_ver:\n",
" pandas_dep = 'pandas=={}'.format(pandas_ver)\n",
"if numpy_ver:\n",
" numpy_dep = 'numpy=={}'.format(numpy_ver)\n",
"if sklearn_ver:\n",
" sklearn_dep = 'scikit-learn=={}'.format(sklearn_ver)\n",
"\n",
"# Note: we build shap at commit 690245 for Tesla K80 GPUs\n",
"env.docker.base_dockerfile = f\"\"\"\n",
@@ -286,7 +298,9 @@
"pip uninstall -y xgboost && \\\n",
"conda install py-xgboost==1.3.3 && \\\n",
"pip uninstall -y numpy && \\\n",
"conda install {numpy_dep} \\\n",
"pip install {numpy_dep} && \\\n",
"pip install {sklearn_dep} && \\\n",
"pip install chardet \\\n",
"\"\"\"\n",
"\n",
"env.python.user_managed_dependencies = True\n",

View File

@@ -10,7 +10,7 @@ dependencies:
- ipython
- matplotlib
- ipywidgets
- raiwidgets~=0.22.0
- raiwidgets~=0.24.0
- itsdangerous==2.0.1
- markupsafe<2.1.0
- scipy>=1.5.3

View File

@@ -10,7 +10,7 @@ dependencies:
- matplotlib
- azureml-dataset-runtime
- ipywidgets
- raiwidgets~=0.22.0
- raiwidgets~=0.24.0
- itsdangerous==2.0.1
- markupsafe<2.1.0
- scipy>=1.5.3

View File

@@ -9,7 +9,7 @@ dependencies:
- ipython
- matplotlib
- ipywidgets
- raiwidgets~=0.22.0
- raiwidgets~=0.24.0
- packaging>=20.9
- itsdangerous==2.0.1
- markupsafe<2.1.0

View File

@@ -9,7 +9,7 @@ dependencies:
- ipython
- matplotlib
- ipywidgets
- raiwidgets~=0.22.0
- raiwidgets~=0.24.0
- packaging>=20.9
- itsdangerous==2.0.1
- markupsafe<2.1.0

View File

@@ -11,7 +11,7 @@ dependencies:
- azureml-dataset-runtime
- azureml-core
- ipywidgets
- raiwidgets~=0.22.0
- raiwidgets~=0.24.0
- itsdangerous==2.0.1
- markupsafe<2.1.0
- scipy>=1.5.3

View File

@@ -330,7 +330,7 @@
"- **inputs:** List of input connections for data consumed by this step. Fetch this inside the notebook using dbutils.widgets.get(\"input\")\n",
"- **outputs:** List of output port definitions for outputs produced by this step. Fetch this inside the notebook using dbutils.widgets.get(\"output\")\n",
"- **existing_cluster_id:** Cluster ID of an existing Interactive cluster on the Databricks workspace. If you are providing this, do not provide any of the parameters below that are used to create a new cluster such as spark_version, node_type, etc.\n",
"- **spark_version:** Version of spark for the databricks run cluster. default value: 4.0.x-scala2.11\n",
"- **spark_version:** Version of spark for the databricks run cluster. You can refer to [DataBricks runtime version](https://learn.microsoft.com/azure/databricks/dev-tools/api/#--runtime-version-strings) to specify the spark version. default value: 10.4.x-scala2.12\n",
"- **node_type:** Azure vm node types for the databricks run cluster. default value: Standard_D3_v2\n",
"- **num_workers:** Specifies a static number of workers for the databricks run cluster\n",
"- **min_workers:** Specifies a min number of workers to use for auto-scaling the databricks run cluster\n",

View File

@@ -252,7 +252,7 @@
"# is_directory=None)\n",
"\n",
"# Naming the intermediate data as processed_data1 and assigning it to the variable processed_data1.\n",
"processed_data1 = PipelineData(\"processed_data1\",datastore=def_blob_store)\n",
"processed_data1 = PipelineData(\"processed_data1\",datastore=def_blob_store, is_directory=True)\n",
"print(\"PipelineData object created\")"
]
},
@@ -347,7 +347,7 @@
"source": [
"# step5 to use the intermediate data produced by step4\n",
"# This step also produces an output processed_data2\n",
"processed_data2 = PipelineData(\"processed_data2\", datastore=def_blob_store)\n",
"processed_data2 = PipelineData(\"processed_data2\", datastore=def_blob_store, is_directory=True)\n",
"source_directory = \"data_dependency_run_extract\"\n",
"\n",
"extractStep = PythonScriptStep(\n",
@@ -394,7 +394,7 @@
"outputs": [],
"source": [
"# Now define the compare step which takes two inputs and produces an output\n",
"processed_data3 = PipelineData(\"processed_data3\", datastore=def_blob_store)\n",
"processed_data3 = PipelineData(\"processed_data3\", datastore=def_blob_store, is_directory=True)\n",
"source_directory = \"data_dependency_run_compare\"\n",
"\n",
"compareStep = PythonScriptStep(\n",

View File

@@ -235,7 +235,8 @@
" path_on_datastore=\"titanic/Titanic.csv\")\n",
"\n",
"output_data = PipelineData(name=\"processed_data\",\n",
" datastore=Datastore.get(ws, \"workspaceblobstore\"))"
" datastore=Datastore.get(ws, \"workspaceblobstore\"),\n",
" is_directory=True)"
]
},
{
@@ -306,7 +307,8 @@
"from azureml.pipeline.core import PipelineParameter\n",
"\n",
"output_from_notebook = PipelineData(name=\"notebook_processed_data\",\n",
" datastore=Datastore.get(ws, \"workspaceblobstore\"))\n",
" datastore=Datastore.get(ws, \"workspaceblobstore\"),\n",
" is_directory=True)\n",
"\n",
"my_pipeline_param = PipelineParameter(name=\"pipeline_param\", default_value=\"my_param\")\n",
"\n",

View File

@@ -1,5 +1,5 @@
# DisableDockerDetector "Disabled to unblock PRs until the owner can fix the file. Not used in any prod deployments - only as a documentation for the customers"
FROM rocker/tidyverse:4.0.0-ubuntu18.04
FROM rocker/tidyverse:4.0.0-ubuntu20.04
# Install python
RUN apt-get update -qq && \

View File

@@ -363,7 +363,7 @@
"}).replace(\",\", \";\")\n",
"\n",
"# Define output after cleansing step\n",
"cleansed_green_data = PipelineData(\"cleansed_green_data\", datastore=default_store).as_dataset()\n",
"cleansed_green_data = PipelineData(\"cleansed_green_data\", datastore=default_store, is_directory=True).as_dataset()\n",
"\n",
"print('Cleanse script is in {}.'.format(os.path.realpath(prepare_data_folder)))\n",
"\n",
@@ -414,7 +414,7 @@
"}).replace(\",\", \";\")\n",
"\n",
"# Define output after cleansing step\n",
"cleansed_yellow_data = PipelineData(\"cleansed_yellow_data\", datastore=default_store).as_dataset()\n",
"cleansed_yellow_data = PipelineData(\"cleansed_yellow_data\", datastore=default_store, is_directory=True).as_dataset()\n",
"\n",
"print('Cleanse script is in {}.'.format(os.path.realpath(prepare_data_folder)))\n",
"\n",
@@ -452,7 +452,7 @@
"outputs": [],
"source": [
"# Define output after merging step\n",
"merged_data = PipelineData(\"merged_data\", datastore=default_store).as_dataset()\n",
"merged_data = PipelineData(\"merged_data\", datastore=default_store, is_directory=True).as_dataset()\n",
"\n",
"print('Merge script is in {}.'.format(os.path.realpath(prepare_data_folder)))\n",
"\n",
@@ -489,7 +489,7 @@
"outputs": [],
"source": [
"# Define output after merging step\n",
"filtered_data = PipelineData(\"filtered_data\", datastore=default_store).as_dataset()\n",
"filtered_data = PipelineData(\"filtered_data\", datastore=default_store, is_directory=True).as_dataset()\n",
"\n",
"print('Filter script is in {}.'.format(os.path.realpath(prepare_data_folder)))\n",
"\n",
@@ -525,7 +525,7 @@
"outputs": [],
"source": [
"# Define output after normalize step\n",
"normalized_data = PipelineData(\"normalized_data\", datastore=default_store).as_dataset()\n",
"normalized_data = PipelineData(\"normalized_data\", datastore=default_store, is_directory=True).as_dataset()\n",
"\n",
"print('Normalize script is in {}.'.format(os.path.realpath(prepare_data_folder)))\n",
"\n",
@@ -566,7 +566,7 @@
"outputs": [],
"source": [
"# Define output after transform step\n",
"transformed_data = PipelineData(\"transformed_data\", datastore=default_store).as_dataset()\n",
"transformed_data = PipelineData(\"transformed_data\", datastore=default_store, is_directory=True).as_dataset()\n",
"\n",
"print('Transform script is in {}.'.format(os.path.realpath(prepare_data_folder)))\n",
"\n",
@@ -604,8 +604,8 @@
"train_model_folder = './scripts/trainmodel'\n",
"\n",
"# train and test splits output\n",
"output_split_train = PipelineData(\"output_split_train\", datastore=default_store).as_dataset()\n",
"output_split_test = PipelineData(\"output_split_test\", datastore=default_store).as_dataset()\n",
"output_split_train = PipelineData(\"output_split_train\", datastore=default_store, is_directory=True).as_dataset()\n",
"output_split_test = PipelineData(\"output_split_test\", datastore=default_store, is_directory=True).as_dataset()\n",
"\n",
"print('Data spilt script is in {}.'.format(os.path.realpath(train_model_folder)))\n",
"\n",

View File

@@ -86,7 +86,7 @@
"import requests\n",
"\n",
"oj_sales_path = \"./oj.csv\"\n",
"r = requests.get(\"http://www.cs.unitn.it/~taufer/Data/oj.csv\")\n",
"r = requests.get(\"https://raw.githubusercontent.com/Azure/azureml-examples/main/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-orange-juice-sales/data/dominicks_OJ.csv\")\n",
"open(oj_sales_path, \"wb\").write(r.content)"
]
},
@@ -140,7 +140,7 @@
"metadata": {},
"outputs": [],
"source": [
"partitioned_dataset = dataset.partition_by(partition_keys=['store', 'brand'], target=(datastore, \"partition_by_key_res\"), name=\"partitioned_oj_data\")\n",
"partitioned_dataset = dataset.partition_by(partition_keys=['Store', 'Brand'], target=(datastore, \"partition_by_key_res\"), name=\"partitioned_oj_data\")\n",
"partitioned_dataset.partition_keys"
]
},
@@ -274,7 +274,7 @@
"parallel_run_config = ParallelRunConfig(\n",
" source_directory=scripts_folder,\n",
" entry_script=script_file, # the user script to run against each input\n",
" partition_keys=['store', 'brand'],\n",
" partition_keys=['Store', 'Brand'],\n",
" error_threshold=5,\n",
" output_action='append_row',\n",
" append_row_file_name=\"revenue_outputs.txt\",\n",
@@ -362,8 +362,8 @@
"result_file = os.path.join(target_dir, batch_output.path_on_datastore, parallel_run_config.append_row_file_name)\n",
"\n",
"df = pd.read_csv(result_file, delimiter=\" \", header=None)\n",
"df.columns=[\"WeekStarting\", \"Quantity\", \"logQuantity\", \"Advert\", \"Price\", \"Age60\", \"COLLEGE\", \"INCOME\", \"Hincome150\", \"Large HH\", \"Minorities\", \"WorkingWoman\", \"SSTRDIST\", \"SSTRVOL\", \"CPDIST5\", \"CPWVOL5\", \"Store\", \"Brand\", \"total_income\"]\n",
"\n",
"df.columns = [\"week\", \"logmove\", \"feat\", \"price\", \"AGE60\", \"EDUC\", \"ETHNIC\", \"INCOME\", \"HHLARGE\", \"WORKWOM\", \"HVAL150\", \"SSTRDIST\", \"SSTRVOL\", \"CPDIST5\", \"CPWVOL5\", \"store\", \"brand\", \"total_income\"]\n",
"print(\"Prediction has \", df.shape[0], \" rows\")\n",
"df.head(10)"
]
@@ -413,7 +413,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
"version": "3.8.13"
}
},
"nbformat": 4,

View File

@@ -1,358 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/ml-frameworks/chainer/distributed-chainer/distributed-chainer.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Distributed Chainer\n",
"In this tutorial, you will run a Chainer training example on the [MNIST](http://yann.lecun.com/exdb/mnist/) dataset using ChainerMN distributed training across a GPU cluster."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prerequisites\n",
"* If you are using an Azure Machine Learning compute instance, you are all set. Otherwise, go through the [Configuration](../../../../configuration.ipynb) notebook to install the Azure Machine Learning Python SDK and create an Azure ML `Workspace`"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check core SDK version number\n",
"import azureml.core\n",
"\n",
"print(\"SDK version:\", azureml.core.VERSION)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Diagnostics\n",
"Opt-in diagnostics for better experience, quality, and security of future releases."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"Diagnostics"
]
},
"outputs": [],
"source": [
"from azureml.telemetry import set_diagnostics_collection\n",
"\n",
"set_diagnostics_collection(send_diagnostics=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize workspace\n",
"\n",
"Initialize a [Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace) object from the existing workspace you created in the Prerequisites step. `Workspace.from_config()` creates a workspace object from the details stored in `config.json`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.workspace import Workspace\n",
"\n",
"ws = Workspace.from_config()\n",
"print('Workspace name: ' + ws.name, \n",
" 'Azure region: ' + ws.location, \n",
" 'Subscription id: ' + ws.subscription_id, \n",
" 'Resource group: ' + ws.resource_group, sep = '\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create or attach existing AmlCompute\n",
"You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for training your model. In this tutorial, we use Azure ML managed compute ([AmlCompute](https://docs.microsoft.com/azure/machine-learning/service/how-to-set-up-training-targets#amlcompute)) for our remote training compute resource. Specifically, the below code creates an `STANDARD_NC6` GPU cluster that autoscales from `0` to `4` nodes.\n",
"\n",
"> Note that if you have an AzureML Data Scientist role, you will not have permission to create compute resources. Talk to your workspace or IT admin to create the compute targets described in this section, if they do not already exist.\n",
"\n",
"**Creation of AmlCompute takes approximately 5 minutes.** If the AmlCompute with that name is already in your workspace, this code will skip the creation process.\n",
"\n",
"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
"from azureml.core.compute_target import ComputeTargetException\n",
"\n",
"# choose a name for your cluster\n",
"cluster_name = \"gpu-cluster\"\n",
"\n",
"try:\n",
" compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n",
" print('Found existing compute target.')\n",
"except ComputeTargetException:\n",
" print('Creating a new compute target...')\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6',\n",
" max_nodes=4)\n",
"\n",
" # create the cluster\n",
" compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n",
"\n",
" compute_target.wait_for_completion(show_output=True)\n",
"\n",
"# use get_status() to get a detailed status for the current AmlCompute. \n",
"print(compute_target.get_status().serialize())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The above code creates GPU compute. If you instead want to create CPU compute, provide a different VM size to the `vm_size` parameter, such as `STANDARD_D2_V2`."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train model on the remote compute\n",
"Now that we have the AmlCompute ready to go, let's run our distributed training job."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a project directory\n",
"Create a directory that will contain all the necessary code from your local machine that you will need access to on the remote resource. This includes the training script and any additional files your training script depends on."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"project_folder = './chainer-distr'\n",
"os.makedirs(project_folder, exist_ok=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Prepare training script\n",
"Now you will need to create your training script. In this tutorial, the script for distributed training of MNIST is already provided for you at `train_mnist.py`. In practice, you should be able to take any custom Chainer training script as is and run it with Azure ML without having to modify your code."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once your script is ready, copy the training script `train_mnist.py` into the project directory."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import shutil\n",
"\n",
"shutil.copy('train_mnist.py', project_folder)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create an experiment\n",
"Create an [Experiment](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#experiment) to track all the runs in your workspace for this distributed Chainer tutorial. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Experiment\n",
"\n",
"experiment_name = 'chainer-distr'\n",
"experiment = Experiment(ws, name=experiment_name)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create an environment\n",
"\n",
"In this tutorial, we will use one of the Azure ML Chainer curated environments for training."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Environment\n",
"\n",
"chainer_env = Environment.get(ws, name='AzureML-Chainer-5.1.0-GPU')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Configure your training job\n",
"\n",
"Create a ScriptRunConfig object to specify the configuration details of your training job, including your training script, environment to use, and the compute target to run on.\n",
"\n",
"In order to execute a distributed run using MPI, you must create an `MpiConfiguration` object and specify it to the `distributed_job_config` parameter. The below code will configure a 2-node distributed job. If you would also like to run multiple processes per node (i.e. if your cluster SKU has multiple GPUs), additionally specify the `process_count_per_node` parameter in MpiConfiguration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import ScriptRunConfig\n",
"from azureml.core.runconfig import MpiConfiguration\n",
"\n",
"src = ScriptRunConfig(source_directory=project_folder,\n",
" script='train_mnist.py',\n",
" compute_target=compute_target,\n",
" environment=chainer_env,\n",
" distributed_job_config=MpiConfiguration(node_count=2))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Submit job\n",
"Run your experiment by submitting your ScriptRunConfig object. Note that this call is asynchronous."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run = experiment.submit(src)\n",
"print(run)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Monitor your run\n",
"You can monitor the progress of the run with a Jupyter widget. Like the run submission, the widget is asynchronous and provides live updates every 10-15 seconds until the job completes. You can see that the widget automatically plots and visualizes the loss metric that we logged to the Azure ML run."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.widgets import RunDetails\n",
"\n",
"RunDetails(run).show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run.wait_for_completion(show_output=True)"
]
}
],
"metadata": {
"authors": [
{
"name": "ninhu"
}
],
"category": "training",
"compute": [
"AML Compute"
],
"datasets": [
"MNIST"
],
"deployment": [
"None"
],
"exclude_from_index": false,
"framework": [
"Chainer"
],
"friendly_name": "Distributed Training with Chainer",
"index_order": 1,
"kernelspec": {
"display_name": "Python 3.8 - AzureML",
"language": "python",
"name": "python38-azureml"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
},
"tags": [
"None"
],
"task": "Use the Chainer estimator to perform distributed training"
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,5 +0,0 @@
name: distributed-chainer
dependencies:
- pip:
- azureml-sdk
- azureml-widgets

View File

@@ -1,125 +0,0 @@
# Official ChainerMN example taken from
# https://github.com/chainer/chainer/blob/master/examples/chainermn/mnist/train_mnist.py
from __future__ import print_function
import argparse
import chainer
import chainer.functions as F
import chainer.links as L
from chainer import training
from chainer.training import extensions
import chainermn
class MLP(chainer.Chain):
def __init__(self, n_units, n_out):
super(MLP, self).__init__(
# the size of the inputs to each layer will be inferred
l1=L.Linear(784, n_units), # n_in -> n_units
l2=L.Linear(n_units, n_units), # n_units -> n_units
l3=L.Linear(n_units, n_out), # n_units -> n_out
)
def __call__(self, x):
h1 = F.relu(self.l1(x))
h2 = F.relu(self.l2(h1))
return self.l3(h2)
def main():
parser = argparse.ArgumentParser(description='ChainerMN example: MNIST')
parser.add_argument('--batchsize', '-b', type=int, default=100,
help='Number of images in each mini-batch')
parser.add_argument('--communicator', type=str,
default='non_cuda_aware', help='Type of communicator')
parser.add_argument('--epoch', '-e', type=int, default=20,
help='Number of sweeps over the dataset to train')
parser.add_argument('--gpu', '-g', default=True,
help='Use GPU')
parser.add_argument('--out', '-o', default='result',
help='Directory to output the result')
parser.add_argument('--resume', '-r', default='',
help='Resume the training from snapshot')
parser.add_argument('--unit', '-u', type=int, default=1000,
help='Number of units')
args = parser.parse_args()
# Prepare ChainerMN communicator.
if args.gpu:
if args.communicator == 'naive':
print("Error: 'naive' communicator does not support GPU.\n")
exit(-1)
comm = chainermn.create_communicator(args.communicator)
device = comm.intra_rank
else:
if args.communicator != 'naive':
print('Warning: using naive communicator '
'because only naive supports CPU-only execution')
comm = chainermn.create_communicator('naive')
device = -1
if comm.rank == 0:
print('==========================================')
print('Num process (COMM_WORLD): {}'.format(comm.size))
if args.gpu:
print('Using GPUs')
print('Using {} communicator'.format(args.communicator))
print('Num unit: {}'.format(args.unit))
print('Num Minibatch-size: {}'.format(args.batchsize))
print('Num epoch: {}'.format(args.epoch))
print('==========================================')
model = L.Classifier(MLP(args.unit, 10))
if device >= 0:
chainer.cuda.get_device_from_id(device).use()
model.to_gpu()
# Create a multi node optimizer from a standard Chainer optimizer.
optimizer = chainermn.create_multi_node_optimizer(
chainer.optimizers.Adam(), comm)
optimizer.setup(model)
# Split and distribute the dataset. Only worker 0 loads the whole dataset.
# Datasets of worker 0 are evenly split and distributed to all workers.
if comm.rank == 0:
train, test = chainer.datasets.get_mnist()
else:
train, test = None, None
train = chainermn.scatter_dataset(train, comm, shuffle=True)
test = chainermn.scatter_dataset(test, comm, shuffle=True)
train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
test_iter = chainer.iterators.SerialIterator(test, args.batchsize,
repeat=False, shuffle=False)
updater = training.StandardUpdater(train_iter, optimizer, device=device)
trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)
# Create a multi node evaluator from a standard Chainer evaluator.
evaluator = extensions.Evaluator(test_iter, model, device=device)
evaluator = chainermn.create_multi_node_evaluator(evaluator, comm)
trainer.extend(evaluator)
# Some display and output extensions are necessary only for one worker.
# (Otherwise, there would just be repeated outputs.)
if comm.rank == 0:
trainer.extend(extensions.dump_graph('main/loss'))
trainer.extend(extensions.LogReport())
trainer.extend(extensions.PrintReport(
['epoch', 'main/loss', 'validation/main/loss',
'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))
trainer.extend(extensions.ProgressBar())
if args.resume:
chainer.serializers.load_npz(args.resume, trainer)
trainer.run()
if __name__ == '__main__':
main()

View File

@@ -1,142 +0,0 @@
import argparse
import os
import numpy as np
from datautils import download_mnist
import chainer
from chainer import backend
from chainer import backends
from chainer.backends import cuda
from chainer import Function, gradient_check, report, training, utils, Variable
from chainer import datasets, iterators, optimizers, serializers
from chainer import Link, Chain, ChainList
import chainer.functions as F
import chainer.links as L
from chainer.training import extensions
from chainer.dataset import concat_examples
from chainer.backends.cuda import to_cpu
from azureml.core.run import Run
run = Run.get_context()
class MyNetwork(Chain):
def __init__(self, n_mid_units=100, n_out=10):
super(MyNetwork, self).__init__()
with self.init_scope():
self.l1 = L.Linear(None, n_mid_units)
self.l2 = L.Linear(n_mid_units, n_mid_units)
self.l3 = L.Linear(n_mid_units, n_out)
def forward(self, x):
h = F.relu(self.l1(x))
h = F.relu(self.l2(h))
return self.l3(h)
def main():
parser = argparse.ArgumentParser(description='Chainer example: MNIST')
parser.add_argument('--batchsize', '-b', type=int, default=100,
help='Number of images in each mini-batch')
parser.add_argument('--epochs', '-e', type=int, default=20,
help='Number of sweeps over the dataset to train')
parser.add_argument('--output_dir', '-o', default='./outputs',
help='Directory to output the result')
parser.add_argument('--gpu_id', '-g', default=0,
help='ID of the GPU to be used. Set to -1 if you use CPU')
args = parser.parse_args()
# Download the MNIST data if you haven't downloaded it yet
train, test = download_mnist()
gpu_id = args.gpu_id
batchsize = args.batchsize
epochs = args.epochs
run.log('Batch size', np.int(batchsize))
run.log('Epochs', np.int(epochs))
train_iter = iterators.SerialIterator(train, batchsize)
test_iter = iterators.SerialIterator(test, batchsize,
repeat=False, shuffle=False)
model = MyNetwork()
if gpu_id >= 0:
# Make a specified GPU current
chainer.backends.cuda.get_device_from_id(0).use()
model.to_gpu() # Copy the model to the GPU
# Choose an optimizer algorithm
optimizer = optimizers.MomentumSGD(lr=0.01, momentum=0.9)
# Give the optimizer a reference to the model so that it
# can locate the model's parameters.
optimizer.setup(model)
while train_iter.epoch < epochs:
# ---------- One iteration of the training loop ----------
train_batch = train_iter.next()
image_train, target_train = concat_examples(train_batch, gpu_id)
# Calculate the prediction of the network
prediction_train = model(image_train)
# Calculate the loss with softmax_cross_entropy
loss = F.softmax_cross_entropy(prediction_train, target_train)
# Calculate the gradients in the network
model.cleargrads()
loss.backward()
# Update all the trainable parameters
optimizer.update()
# --------------------- until here ---------------------
# Check the validation accuracy of prediction after every epoch
if train_iter.is_new_epoch: # If this iteration is the final iteration of the current epoch
# Display the training loss
print('epoch:{:02d} train_loss:{:.04f} '.format(
train_iter.epoch, float(to_cpu(loss.array))), end='')
test_losses = []
test_accuracies = []
while True:
test_batch = test_iter.next()
image_test, target_test = concat_examples(test_batch, gpu_id)
# Forward the test data
prediction_test = model(image_test)
# Calculate the loss
loss_test = F.softmax_cross_entropy(prediction_test, target_test)
test_losses.append(to_cpu(loss_test.array))
# Calculate the accuracy
accuracy = F.accuracy(prediction_test, target_test)
accuracy.to_cpu()
test_accuracies.append(accuracy.array)
if test_iter.is_new_epoch:
test_iter.epoch = 0
test_iter.current_position = 0
test_iter.is_new_epoch = False
test_iter._pushed_position = None
break
val_accuracy = np.mean(test_accuracies)
print('val_loss:{:.04f} val_accuracy:{:.04f}'.format(
np.mean(test_losses), val_accuracy))
run.log("Accuracy", np.float(val_accuracy))
serializers.save_npz(os.path.join(args.output_dir, 'model.npz'), model)
if __name__ == '__main__':
main()

View File

@@ -1,50 +0,0 @@
import numpy as np
import os
import json
from datautils import download_mnist
from chainer import serializers, using_config, Variable, datasets
import chainer.functions as F
import chainer.links as L
from chainer import Chain
from azureml.core.model import Model
class MyNetwork(Chain):
def __init__(self, n_mid_units=100, n_out=10):
super(MyNetwork, self).__init__()
with self.init_scope():
self.l1 = L.Linear(None, n_mid_units)
self.l2 = L.Linear(n_mid_units, n_mid_units)
self.l3 = L.Linear(n_mid_units, n_out)
def forward(self, x):
h = F.relu(self.l1(x))
h = F.relu(self.l2(h))
return self.l3(h)
def init():
global model
# AZUREML_MODEL_DIR is an environment variable created during deployment.
# It is the path to the model folder (./azureml-models/$MODEL_NAME/$VERSION)
# For multiple models, it points to the folder containing all deployed models (./azureml-models)
model_root = os.path.join(os.getenv('AZUREML_MODEL_DIR'), 'model.npz')
# Load our saved artifacts
model = MyNetwork()
serializers.load_npz(model_root, model)
def run(input_data):
i = np.array(json.loads(input_data)['data'])
_, test = download_mnist()
x = Variable(np.asarray([test[i][0]]))
y = model(x)
return np.ndarray.tolist(y.data.argmax(axis=1))

View File

@@ -1,50 +0,0 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import glob
import gzip
import numpy as np
import os
import struct
from azureml.core import Dataset
from azureml.opendatasets import MNIST
from chainer.datasets import tuple_dataset
# load compressed MNIST gz files and return numpy arrays
def load_data(filename, label=False):
with gzip.open(filename) as gz:
struct.unpack('I', gz.read(4))
n_items = struct.unpack('>I', gz.read(4))
if not label:
n_rows = struct.unpack('>I', gz.read(4))[0]
n_cols = struct.unpack('>I', gz.read(4))[0]
res = np.frombuffer(gz.read(n_items[0] * n_rows * n_cols), dtype=np.uint8)
res = res.reshape(n_items[0], n_rows * n_cols)
else:
res = np.frombuffer(gz.read(n_items[0]), dtype=np.uint8)
res = res.reshape(n_items[0], 1)
return res
def download_mnist():
data_folder = os.path.join(os.getcwd(), 'data/mnist')
os.makedirs(data_folder, exist_ok=True)
mnist_file_dataset = MNIST.get_file_dataset()
mnist_file_dataset.download(data_folder, overwrite=True)
X_train = load_data(glob.glob(os.path.join(data_folder, "**/train-images-idx3-ubyte.gz"),
recursive=True)[0], False) / 255.0
X_test = load_data(glob.glob(os.path.join(data_folder, "**/t10k-images-idx3-ubyte.gz"),
recursive=True)[0], False) / 255.0
y_train = load_data(glob.glob(os.path.join(data_folder, "**/train-labels-idx1-ubyte.gz"),
recursive=True)[0], True).reshape(-1)
y_test = load_data(glob.glob(os.path.join(data_folder, "**/t10k-labels-idx1-ubyte.gz"),
recursive=True)[0], True).reshape(-1)
train = tuple_dataset.TupleDataset(X_train.astype(np.float32), y_train.astype(np.int32))
test = tuple_dataset.TupleDataset(X_test.astype(np.float32), y_test.astype(np.int32))
return train, test

View File

@@ -1,809 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved. \n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/ml-frameworks/chainer/train-hyperparameter-tune-deploy-with-chainer/train-hyperparameter-tune-deploy-with-chainer.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Train and hyperparameter tune with Chainer\n",
"\n",
"In this tutorial, we demonstrate how to use the Azure ML Python SDK to train a Convolutional Neural Network (CNN) on a single-node GPU with Chainer to perform handwritten digit recognition on the popular MNIST dataset. We will also demonstrate how to perform hyperparameter tuning of the model using Azure ML's HyperDrive service."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prerequisites\n",
"* If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, go through the [Configuration](../../../../configuration.ipynb) notebook to install the Azure Machine Learning Python SDK and create an Azure ML `Workspace`"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check core SDK version number\n",
"import azureml.core\n",
"\n",
"print(\"SDK version:\", azureml.core.VERSION)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Diagnostics\n",
"Opt-in diagnostics for better experience, quality, and security of future releases."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"Diagnostics"
]
},
"outputs": [],
"source": [
"from azureml.telemetry import set_diagnostics_collection\n",
"\n",
"set_diagnostics_collection(send_diagnostics=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize workspace\n",
"Initialize a [Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace) object from the existing workspace you created in the Prerequisites step. `Workspace.from_config()` creates a workspace object from the details stored in `config.json`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.workspace import Workspace\n",
"\n",
"ws = Workspace.from_config()\n",
"print('Workspace name: ' + ws.name, \n",
" 'Azure region: ' + ws.location, \n",
" 'Subscription id: ' + ws.subscription_id, \n",
" 'Resource group: ' + ws.resource_group, sep = '\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create or Attach existing AmlCompute\n",
"You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for training your model. In this tutorial, we use Azure ML managed compute ([AmlCompute](https://docs.microsoft.com/azure/machine-learning/service/how-to-set-up-training-targets#amlcompute)) for our remote training compute resource.\n",
"\n",
"> Note that if you have an AzureML Data Scientist role, you will not have permission to create compute resources. Talk to your workspace or IT admin to create the compute targets described in this section, if they do not already exist.\n",
"\n",
"**Creation of AmlCompute takes approximately 5 minutes.** If the AmlCompute with that name is already in your workspace, this code will skip the creation process.\n",
"\n",
"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
"from azureml.core.compute_target import ComputeTargetException\n",
"\n",
"# choose a name for your cluster\n",
"cluster_name = \"gpu-cluster\"\n",
"\n",
"try:\n",
" compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n",
" print('Found existing compute target.')\n",
"except ComputeTargetException:\n",
" print('Creating a new compute target...')\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6',\n",
" max_nodes=4)\n",
"\n",
" # create the cluster\n",
" compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n",
"\n",
"compute_target.wait_for_completion(show_output=True)\n",
"\n",
"# use get_status() to get a detailed status for the current cluster. \n",
"print(compute_target.get_status().serialize())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The above code creates a GPU cluster. If you instead want to create a CPU cluster, provide a different VM size to the `vm_size` parameter, such as `STANDARD_D2_V2`."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train model on the remote compute\n",
"Now that you have your data and training script prepared, you are ready to train on your remote compute cluster. You can take advantage of Azure compute to leverage GPUs to cut down your training time. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a project directory\n",
"Create a directory that will contain all the necessary code from your local machine that you will need access to on the remote resource. This includes the training script and any additional files your training script depends on."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"project_folder = './chainer-mnist'\n",
"os.makedirs(project_folder, exist_ok=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Prepare training script\n",
"Now you will need to create your training script. In this tutorial, the training script is already provided for you at `chainer_mnist.py`. In practice, you should be able to take any custom training script as is and run it with Azure ML without having to modify your code.\n",
"\n",
"However, if you would like to use Azure ML's [tracking and metrics](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#metrics) capabilities, you will have to add a small amount of Azure ML code inside your training script. \n",
"\n",
"In `chainer_mnist.py`, we will log some metrics to our Azure ML run. To do so, we will access the Azure ML `Run` object within the script:\n",
"```Python\n",
"from azureml.core.run import Run\n",
"run = Run.get_context()\n",
"```\n",
"Further within `chainer_mnist.py`, we log the batchsize and epochs parameters, and the highest accuracy the model achieves:\n",
"```Python\n",
"run.log('Batch size', np.int(args.batchsize))\n",
"run.log('Epochs', np.int(args.epochs))\n",
"\n",
"run.log('Accuracy', np.float(val_accuracy))\n",
"```\n",
"These run metrics will become particularly important when we begin hyperparameter tuning our model in the \"Tune model hyperparameters\" section."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once your script is ready, copy the training script `chainer_mnist.py` into your project directory."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import shutil\n",
"\n",
"shutil.copy('chainer_mnist.py', project_folder)\n",
"shutil.copy('chainer_score.py', project_folder)\n",
"shutil.copy('datautils.py', project_folder)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create an experiment\n",
"Create an [Experiment](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#experiment) to track all the runs in your workspace for this Chainer tutorial. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Experiment\n",
"\n",
"experiment_name = 'chainer-mnist'\n",
"experiment = Experiment(ws, name=experiment_name)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create an environment\n",
"\n",
"Define a conda environment YAML file with your training script dependencies and create an Azure ML environment."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile conda_dependencies.yml\n",
"\n",
"channels:\n",
"- conda-forge\n",
"dependencies:\n",
"- python=3.6.2\n",
"- pip=21.3.1\n",
"- pip:\n",
" - azureml-defaults\n",
" - azureml-opendatasets\n",
" - chainer==5.1.0\n",
" - cupy-cuda100==5.1.0\n",
" - mpi4py==3.0.0\n",
" - pytest"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Environment\n",
"from azureml.core.runconfig import DockerConfiguration\n",
"\n",
"chainer_env = Environment.from_conda_specification(name = 'chainer-5.1.0-gpu', file_path = './conda_dependencies.yml')\n",
"\n",
"# Specify a GPU base image\n",
"chainer_env.docker.base_image = 'mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.0-cudnn7-ubuntu18.04'\n",
"\n",
"docker_config = DockerConfiguration(use_docker=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Configure your training job\n",
"\n",
"Create a ScriptRunConfig object to specify the configuration details of your training job, including your training script, environment to use, and the compute target to run on."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import ScriptRunConfig\n",
"\n",
"src = ScriptRunConfig(source_directory=project_folder,\n",
" script='chainer_mnist.py',\n",
" arguments=['--epochs', 10, '--batchsize', 128, '--output_dir', './outputs'],\n",
" compute_target=compute_target,\n",
" environment=chainer_env,\n",
" docker_runtime_config=docker_config)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Submit job\n",
"Run your experiment by submitting your ScriptRunConfig object. Note that this call is asynchronous."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run = experiment.submit(src)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Monitor your run\n",
"You can monitor the progress of the run with a Jupyter widget. Like the run submission, the widget is asynchronous and provides live updates every 10-15 seconds until the job completes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.widgets import RunDetails\n",
"\n",
"RunDetails(run).show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# to get more details of your run\n",
"print(run.get_details())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Tune model hyperparameters\n",
"Now that we've seen how to do a simple Chainer training run using the SDK, let's see if we can further improve the accuracy of our model. We can optimize our model's hyperparameters using Azure Machine Learning's hyperparameter tuning capabilities."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Start a hyperparameter sweep\n",
"First, we will define the hyperparameter space to sweep over. Let's tune the batch size and epochs parameters. In this example we will use random sampling to try different configuration sets of hyperparameters to maximize our primary metric, accuracy.\n",
"\n",
"Then, we specify the early termination policy to use to early terminate poorly performing runs. Here we use the `BanditPolicy`, which will terminate any run that doesn't fall within the slack factor of our primary evaluation metric. In this tutorial, we will apply this policy every epoch (since we report our `Accuracy` metric every epoch and `evaluation_interval=1`). Notice we will delay the first policy evaluation until after the first `3` epochs (`delay_evaluation=3`).\n",
"Refer [here](https://docs.microsoft.com/azure/machine-learning/service/how-to-tune-hyperparameters#specify-an-early-termination-policy) for more information on the BanditPolicy and other policies available."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.train.hyperdrive.runconfig import HyperDriveConfig\n",
"from azureml.train.hyperdrive.sampling import RandomParameterSampling\n",
"from azureml.train.hyperdrive.policy import BanditPolicy\n",
"from azureml.train.hyperdrive.run import PrimaryMetricGoal\n",
"from azureml.train.hyperdrive.parameter_expressions import choice\n",
" \n",
"\n",
"param_sampling = RandomParameterSampling( {\n",
" \"--batchsize\": choice(128, 256),\n",
" \"--epochs\": choice(5, 10, 20, 40)\n",
" }\n",
")\n",
"\n",
"hyperdrive_config = HyperDriveConfig(run_config=src,\n",
" hyperparameter_sampling=param_sampling, \n",
" primary_metric_name='Accuracy',\n",
" policy=BanditPolicy(evaluation_interval=1, slack_factor=0.1, delay_evaluation=3),\n",
" primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,\n",
" max_total_runs=8,\n",
" max_concurrent_runs=4)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally, lauch the hyperparameter tuning job."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# start the HyperDrive run\n",
"hyperdrive_run = experiment.submit(hyperdrive_config)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Monitor HyperDrive runs\n",
"You can monitor the progress of the runs with the following Jupyter widget. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"RunDetails(hyperdrive_run).show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"hyperdrive_run.wait_for_completion(show_output=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"assert(hyperdrive_run.get_status() == \"Completed\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Warm start a Hyperparameter Tuning experiment and resuming child runs\n",
"Often times, finding the best hyperparameter values for your model can be an iterative process, needing multiple tuning runs that learn from previous hyperparameter tuning runs. Reusing knowledge from these previous runs will accelerate the hyperparameter tuning process, thereby reducing the cost of tuning the model and will potentially improve the primary metric of the resulting model. When warm starting a hyperparameter tuning experiment with Bayesian sampling, trials from the previous run will be used as prior knowledge to intelligently pick new samples, so as to improve the primary metric. Additionally, when using Random or Grid sampling, any early termination decisions will leverage metrics from the previous runs to determine poorly performing training runs. \n",
"\n",
"Azure Machine Learning allows you to warm start your hyperparameter tuning run by leveraging knowledge from up to 5 previously completed hyperparameter tuning parent runs. \n",
"\n",
"Additionally, there might be occasions when individual training runs of a hyperparameter tuning experiment are cancelled due to budget constraints or fail due to other reasons. It is now possible to resume such individual training runs from the last checkpoint (assuming your training script handles checkpoints). Resuming an individual training run will use the same hyperparameter configuration and mount the storage used for that run. The training script should accept the \"--resume-from\" argument, which contains the checkpoint or model files from which to resume the training run. You can also resume individual runs as part of an experiment that spends additional budget on hyperparameter tuning. Any additional budget, after resuming the specified training runs is used for exploring additional configurations.\n",
"\n",
"For more information on warm starting and resuming hyperparameter tuning runs, please refer to the [Hyperparameter Tuning for Azure Machine Learning documentation](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-tune-hyperparameters) \n",
"\n",
"### Find and register best model\n",
"When all jobs finish, we can find out the one that has the highest accuracy."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"best_run = hyperdrive_run.get_best_run_by_primary_metric()\n",
"print(best_run.get_details()['runDefinition']['arguments'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now, let's list the model files uploaded during the run."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(best_run.get_file_names())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can then register the folder (and all files in it) as a model named `chainer-dnn-mnist` under the workspace for deployment"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = best_run.register_model(model_name='chainer-dnn-mnist', model_path='outputs/model.npz')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Deploy the model in ACI\n",
"Now, we are ready to deploy the model as a web service running in Azure Container Instance, [ACI](https://azure.microsoft.com/en-us/services/container-instances/). Azure Machine Learning accomplishes this by constructing a Docker image with the scoring logic and model baked in.\n",
"\n",
"### Create scoring script\n",
"First, we will create a scoring script that will be invoked by the web service call.\n",
"+ Now that the scoring script must have two required functions, `init()` and `run(input_data)`.\n",
" + In `init()`, you typically load the model into a global object. This function is executed only once when the Docker contianer is started.\n",
" + In `run(input_data)`, the model is used to predict a value based on the input data. The input and output to `run` uses NPZ as the serialization and de-serialization format because it is the preferred format for Chainer, but you are not limited to it.\n",
" \n",
"Refer to the scoring script `chainer_score.py` for this tutorial. Our web service will use this file to predict. When writing your own scoring script, don't forget to test it locally first before you go and deploy the web service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"shutil.copy('chainer_score.py', project_folder)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create myenv.yml\n",
"We also need to create an environment file so that Azure Machine Learning can install the necessary packages in the Docker image which are required by your scoring script. In this case, we need to specify conda package `numpy` and pip install `chainer`. Please note that you must indicate azureml-defaults with verion >= 1.0.45 as a pip dependency, because it contains the functionality needed to host the model as a web service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import CondaDependencies\n",
"\n",
"cd = CondaDependencies.create()\n",
"cd.add_conda_package('numpy')\n",
"cd.add_pip_package('chainer==5.1.0')\n",
"cd.add_pip_package(\"azureml-defaults==1.43.0\")\n",
"cd.add_pip_package(\"azureml-opendatasets\")\n",
"cd.save_to_file(base_directory='./', conda_file_path='myenv.yml')\n",
"\n",
"print(cd.serialize_to_string())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Deploy to ACI\n",
"We are almost ready to deploy. Create the inference configuration and deployment configuration and deploy to ACI. This cell will run for about 7-8 minutes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.webservice import AciWebservice\n",
"from azureml.core.model import InferenceConfig\n",
"from azureml.core.webservice import Webservice\n",
"from azureml.core.model import Model\n",
"from azureml.core.environment import Environment\n",
"\n",
"\n",
"myenv = Environment.from_conda_specification(name=\"myenv\", file_path=\"myenv.yml\")\n",
"inference_config = InferenceConfig(entry_script=\"chainer_score.py\", environment=myenv,\n",
" source_directory=project_folder)\n",
"\n",
"aciconfig = AciWebservice.deploy_configuration(cpu_cores=1,\n",
" auth_enabled=True, # this flag generates API keys to secure access\n",
" memory_gb=2,\n",
" tags={'name': 'mnist', 'framework': 'Chainer'},\n",
" description='Chainer DNN with MNIST')\n",
"\n",
"service = Model.deploy(workspace=ws,\n",
" name='chainer-mnist-1',\n",
" models=[model],\n",
" inference_config=inference_config,\n",
" deployment_config=aciconfig)\n",
"service.wait_for_deployment(True)\n",
"print(service.state)\n",
"print(service.scoring_uri)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Tip: If something goes wrong with the deployment, the first thing to look at is the logs from the service by running the following command:** "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(service.get_logs())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This is the scoring web service endpoint:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(service.scoring_uri)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Test the deployed model\n",
"Let's test the deployed model. Pick a random sample from the test set, and send it to the web service hosted in ACI for a prediction. Note, here we are using the an HTTP request to invoke the service.\n",
"\n",
"We can retrieve the API keys used for accessing the HTTP endpoint and construct a raw HTTP request to send to the service. Don't forget to add key to the HTTP header."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# retreive the API keys. two keys were generated.\n",
"key1, Key2 = service.get_keys()\n",
"print(key1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
"import urllib\n",
"import gzip\n",
"import numpy as np\n",
"import struct\n",
"import requests\n",
"\n",
"\n",
"# load compressed MNIST gz files and return numpy arrays\n",
"def load_data(filename, label=False):\n",
" with gzip.open(filename) as gz:\n",
" struct.unpack('I', gz.read(4))\n",
" n_items = struct.unpack('>I', gz.read(4))\n",
" if not label:\n",
" n_rows = struct.unpack('>I', gz.read(4))[0]\n",
" n_cols = struct.unpack('>I', gz.read(4))[0]\n",
" res = np.frombuffer(gz.read(n_items[0] * n_rows * n_cols), dtype=np.uint8)\n",
" res = res.reshape(n_items[0], n_rows * n_cols)\n",
" else:\n",
" res = np.frombuffer(gz.read(n_items[0]), dtype=np.uint8)\n",
" res = res.reshape(n_items[0], 1)\n",
" return res\n",
"\n",
"data_folder = os.path.join(os.getcwd(), 'data/mnist')\n",
"os.makedirs(data_folder, exist_ok=True)\n",
"\n",
"urllib.request.urlretrieve('https://azureopendatastorage.blob.core.windows.net/mnist/t10k-images-idx3-ubyte.gz',\n",
" filename=os.path.join(data_folder, 't10k-images-idx3-ubyte.gz'))\n",
"urllib.request.urlretrieve('https://azureopendatastorage.blob.core.windows.net/mnist/t10k-labels-idx1-ubyte.gz',\n",
" filename=os.path.join(data_folder, 't10k-labels-idx1-ubyte.gz'))\n",
"\n",
"X_test = load_data(os.path.join(data_folder, 't10k-images-idx3-ubyte.gz'), False) / np.float32(255.0)\n",
"y_test = load_data(os.path.join(data_folder, 't10k-labels-idx1-ubyte.gz'), True).reshape(-1)\n",
"\n",
"# send a random row from the test set to score\n",
"random_index = np.random.randint(0, len(X_test)-1)\n",
"input_data = \"{\\\"data\\\": [\" + str(random_index) + \"]}\"\n",
"\n",
"headers = {'Content-Type':'application/json', 'Authorization': 'Bearer ' + key1}\n",
"\n",
"# send sample to service for scoring\n",
"resp = requests.post(service.scoring_uri, input_data, headers=headers)\n",
"\n",
"print(\"label:\", y_test[random_index])\n",
"print(\"prediction:\", resp.text[1])\n",
"\n",
"plt.imshow(X_test[random_index].reshape((28,28)), cmap='gray')\n",
"plt.axis('off')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's look at the workspace after the web service was deployed. You should see\n",
"\n",
" + a registered model named 'chainer-dnn-mnist' and with the id 'chainer-dnn-mnist:1'\n",
" + a webservice called 'chainer-mnist-svc' with some scoring URL"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = ws.models['chainer-dnn-mnist']\n",
"print(\"Model: {}, ID: {}\".format('chainer-dnn-mnist', model.id))\n",
" \n",
"webservice = ws.webservices['chainer-mnist-1']\n",
"print(\"Webservice: {}, scoring URI: {}\".format('chainer-mnist-1', webservice.scoring_uri))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Clean up"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can delete the ACI deployment with a simple delete API call."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"service.delete()"
]
}
],
"metadata": {
"authors": [
{
"name": "nagaur"
}
],
"category": "training",
"compute": [
"AML Compute"
],
"datasets": [
"MNIST"
],
"deployment": [
"Azure Container Instance"
],
"exclude_from_index": false,
"framework": [
"Chainer"
],
"friendly_name": "Train a model with hyperparameter tuning",
"index_order": 1,
"kernelspec": {
"display_name": "Python 3.8 - AzureML",
"language": "python",
"name": "python38-azureml"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
},
"tags": [
"None"
],
"task": "Train a Convolutional Neural Network (CNN)"
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,13 +0,0 @@
name: train-hyperparameter-tune-deploy-with-chainer
dependencies:
- pip:
- azureml-sdk
- azureml-widgets
- numpy
- matplotlib
- json
- urllib
- gzip
- struct
- requests
- azureml-opendatasets

View File

@@ -166,7 +166,7 @@ def download_data():
from zipfile import ZipFile
# download data
data_file = './fowl_data.zip'
download_url = 'https://azureopendatastorage.blob.core.windows.net/testpublic/temp/fowl_data.zip'
download_url = 'https://azuremlexamples.blob.core.windows.net/datasets/fowl_data.zip'
urllib.request.urlretrieve(download_url, filename=data_file)
# extract files

View File

@@ -176,7 +176,7 @@
"metadata": {},
"source": [
"### Download training data\n",
"The dataset we will use (located on a public blob [here](https://azureopendatastorage.blob.core.windows.net/testpublic/temp/fowl_data.zip) as a zip file) consists of about 120 training images each for turkeys and chickens, with 100 validation images for each class. The images are a subset of the [Open Images v5 Dataset](https://storage.googleapis.com/openimages/web/index.html). We will download and extract the dataset as part of our training script `pytorch_train.py`"
"The dataset we will use (located on a public blob [here](https://azuremlexamples.blob.core.windows.net/datasets/fowl_data.zip) as a zip file) consists of about 120 training images each for turkeys and chickens, with 100 validation images for each class. The images are a subset of the [Open Images v5 Dataset](https://storage.googleapis.com/openimages/web/index.html). We will download and extract the dataset as part of our training script `pytorch_train.py`"
]
},
{
@@ -260,15 +260,14 @@
"\n",
"channels:\n",
"- conda-forge\n",
"- pytorch\n",
"dependencies:\n",
"- python=3.6.2\n",
"- python=3.8.12\n",
"- pip=21.3.1\n",
"- pytorch::pytorch==1.8.1\n",
"- pytorch::torchvision==0.9.1\n",
"- pip:\n",
" - azureml-defaults==1.43.0\n",
" - torch==1.6.0\n",
" - torchvision==0.7.0\n",
" - future==0.17.1\n",
" - pillow"
" - azureml-defaults"
]
},
{

View File

@@ -21,7 +21,7 @@
"metadata": {},
"source": [
"# Distributed TensorFlow with Horovod\n",
"In this tutorial, you will train a word2vec model in TensorFlow using distributed training via [Horovod](https://github.com/uber/horovod)."
"In this tutorial, you will train a model in TensorFlow using distributed training via [Horovod](https://github.com/uber/horovod)."
]
},
{
@@ -144,26 +144,6 @@
"The above code creates a GPU cluster. If you instead want to create a CPU cluster, provide a different VM size to the `vm_size` parameter, such as `STANDARD_D2_V2`."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create a Dataset for Files\n",
"A Dataset can reference single or multiple files in your datastores or public urls. The files can be of any format. FileDataset provides you with the ability to download or mount the files to your compute. By creating a dataset, you create a reference to the data source location. The data remains in its existing location, so no extra storage cost is incurred. [Learn More](https://aka.ms/azureml/howto/createdatasets)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Dataset\n",
"\n",
"web_paths = ['https://azureopendatastorage.blob.core.windows.net/testpublic/text8.zip']\n",
"dataset = Dataset.File.from_files(path=web_paths)"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -171,28 +151,6 @@
"You may want to register datasets using the register() method to your workspace so that the dataset can be shared with others, reused across various experiments, and referred to by name in your training script."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataset = dataset.register(workspace=ws,\n",
" name='wikipedia-text',\n",
" description='Wikipedia text training and test dataset',\n",
" create_new_version=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# list the files referenced by the dataset\n",
"dataset.to_path()"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -200,43 +158,6 @@
"## Train model on the remote compute"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a project directory\n",
"Create a directory that will contain all the necessary code from your local machine that you will need access to on the remote resource. This includes the training script, and any additional files your training script depends on."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"project_folder = './tf-distr-hvd'\n",
"os.makedirs(project_folder, exist_ok=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copy the training script `tf_horovod_word2vec.py` into this project directory."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import shutil\n",
"\n",
"shutil.copy('tf_horovod_word2vec.py', project_folder)"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -274,7 +195,7 @@
"source": [
"from azureml.core import Environment\n",
"\n",
"tf_env = Environment.get(ws, name='AzureML-TensorFlow-1.13-GPU')"
"tf_env = Environment.get(ws, name='AzureML-tensorflow-2.7-ubuntu20.04-py38-cuda11-gpu')"
]
},
{
@@ -297,9 +218,8 @@
"from azureml.core import ScriptRunConfig\n",
"from azureml.core.runconfig import MpiConfiguration\n",
"\n",
"src = ScriptRunConfig(source_directory=project_folder,\n",
" script='tf_horovod_word2vec.py',\n",
" arguments=['--input_data', dataset.as_mount()],\n",
"src = ScriptRunConfig(source_directory=\"src\",\n",
" script='train.py',\n",
" compute_target=compute_target,\n",
" environment=tf_env,\n",
" distributed_job_config=MpiConfiguration(node_count=2))"

View File

@@ -2,10 +2,3 @@ name: distributed-tensorflow-with-horovod
dependencies:
- pip:
- azureml-sdk
- azureml-widgets
- keras
- tensorflow-gpu==1.13.2
- horovod==0.19.1
- matplotlib
- pandas
- fuse

View File

@@ -0,0 +1,120 @@
# Copyright 2019 Uber Technologies, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Script adapted from: https://github.com/horovod/horovod/blob/master/examples/tensorflow2_keras_mnist.py
# ==============================================================================
import tensorflow as tf
import horovod.tensorflow.keras as hvd
import os
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--learning-rate", "-lr", type=float, default=0.001)
parser.add_argument("--epochs", type=int, default=24)
parser.add_argument("--steps-per-epoch", type=int, default=500)
args = parser.parse_args()
# Horovod: initialize Horovod.
hvd.init()
# Horovod: pin GPU to be used to process local rank (one GPU per process)
gpus = tf.config.experimental.list_physical_devices("GPU")
for gpu in gpus:
tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], "GPU")
(mnist_images, mnist_labels), _ = tf.keras.datasets.mnist.load_data(
path="mnist-%d.npz" % hvd.rank()
)
dataset = tf.data.Dataset.from_tensor_slices(
(
tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32),
tf.cast(mnist_labels, tf.int64),
)
)
dataset = dataset.repeat().shuffle(10000).batch(128)
mnist_model = tf.keras.Sequential(
[
tf.keras.layers.Conv2D(32, [3, 3], activation="relu"),
tf.keras.layers.Conv2D(64, [3, 3], activation="relu"),
tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
tf.keras.layers.Dropout(0.25),
tf.keras.layers.Flatten(),
tf.keras.layers.Dense(128, activation="relu"),
tf.keras.layers.Dropout(0.5),
tf.keras.layers.Dense(10, activation="softmax"),
]
)
# Horovod: adjust learning rate based on number of GPUs.
scaled_lr = args.learning_rate * hvd.size()
opt = tf.optimizers.Adam(scaled_lr)
# Horovod: add Horovod DistributedOptimizer.
opt = hvd.DistributedOptimizer(opt)
# Horovod: Specify `experimental_run_tf_function=False` to ensure TensorFlow
# uses hvd.DistributedOptimizer() to compute gradients.
mnist_model.compile(
loss=tf.losses.SparseCategoricalCrossentropy(),
optimizer=opt,
metrics=["accuracy"],
experimental_run_tf_function=False,
)
callbacks = [
# Horovod: broadcast initial variable states from rank 0 to all other processes.
# This is necessary to ensure consistent initialization of all workers when
# training is started with random weights or restored from a checkpoint.
hvd.callbacks.BroadcastGlobalVariablesCallback(0),
# Horovod: average metrics among workers at the end of every epoch.
#
# Note: This callback must be in the list before the ReduceLROnPlateau,
# TensorBoard or other metrics-based callbacks.
hvd.callbacks.MetricAverageCallback(),
# Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
# accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
# the first three epochs. See https://arxiv.org/abs/1706.02677 for details.
hvd.callbacks.LearningRateWarmupCallback(
warmup_epochs=3, initial_lr=scaled_lr, verbose=1
),
]
# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
if hvd.rank() == 0:
output_dir = "./outputs"
os.makedirs(output_dir, exist_ok=True)
callbacks.append(
tf.keras.callbacks.ModelCheckpoint(
os.path.join(output_dir, "checkpoint-{epoch}.h5")
)
)
# Horovod: write logs on worker 0.
verbose = 1 if hvd.rank() == 0 else 0
# Train the model.
# Horovod: adjust number of steps based on number of GPUs.
mnist_model.fit(
dataset,
steps_per_epoch=args.steps_per_epoch // hvd.size(),
callbacks=callbacks,
epochs=args.epochs,
verbose=verbose,
)

View File

@@ -1,238 +0,0 @@
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
# Modifications copyright (C) 2017 Uber Technologies, Inc.
# Additional modifications copyright (C) Microsoft Corporation
# Licensed under the Apache License, Version 2.0
# Script adapted from: https://github.com/uber/horovod/blob/master/examples/tensorflow_word2vec.py
# ======================================
"""Basic word2vec example."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import math
import os
import random
import zipfile
import argparse
import glob
import numpy as np
from six.moves import urllib
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
import horovod.tensorflow as hvd
from azureml.core.run import Run
# Horovod: initialize Horovod.
hvd.init()
parser = argparse.ArgumentParser()
parser.add_argument('--input_data', type=str, help='training data')
args = parser.parse_args()
input_data = args.input_data
print("the input data is at %s" % input_data)
# Step 1: Read data.
filename = input_data
# Read the data into a list of strings.
def read_data(filename):
"""Extract the first file enclosed in a zip file as a list of words."""
with zipfile.ZipFile(filename) as f:
data = tf.compat.as_str(f.read(f.namelist()[0])).split()
return data
vocabulary = read_data(filename)
print('Data size', len(vocabulary))
# Step 2: Build the dictionary and replace rare words with UNK token.
vocabulary_size = 50000
def build_dataset(words, n_words):
"""Process raw inputs into a dataset."""
count = [['UNK', -1]]
count.extend(collections.Counter(words).most_common(n_words - 1))
dictionary = dict()
for word, _ in count:
dictionary[word] = len(dictionary)
data = list()
unk_count = 0
for word in words:
if word in dictionary:
index = dictionary[word]
else:
index = 0 # dictionary['UNK']
unk_count += 1
data.append(index)
count[0][1] = unk_count
reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
return data, count, dictionary, reversed_dictionary
data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,
vocabulary_size)
del vocabulary # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
# Step 3: Function to generate a training batch for the skip-gram model.
def generate_batch(batch_size, num_skips, skip_window):
assert num_skips <= 2 * skip_window
# Adjust batch_size to match num_skips
batch_size = batch_size // num_skips * num_skips
span = 2 * skip_window + 1 # [ skip_window target skip_window ]
# Backtrack a little bit to avoid skipping words in the end of a batch
data_index = random.randint(0, len(data) - span - 1)
batch = np.ndarray(shape=(batch_size), dtype=np.int32)
labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
buffer = collections.deque(maxlen=span)
for _ in range(span):
buffer.append(data[data_index])
data_index = (data_index + 1) % len(data)
for i in range(batch_size // num_skips):
target = skip_window # target label at the center of the buffer
targets_to_avoid = [skip_window]
for j in range(num_skips):
while target in targets_to_avoid:
target = random.randint(0, span - 1)
targets_to_avoid.append(target)
batch[i * num_skips + j] = buffer[skip_window]
labels[i * num_skips + j, 0] = buffer[target]
buffer.append(data[data_index])
data_index = (data_index + 1) % len(data)
return batch, labels
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
print(batch[i], reverse_dictionary[batch[i]],
'->', labels[i, 0], reverse_dictionary[labels[i, 0]])
# Step 4: Build and train a skip-gram model.
max_batch_size = 128
embedding_size = 128 # Dimension of the embedding vector.
skip_window = 1 # How many words to consider left and right.
num_skips = 2 # How many times to reuse an input to generate a label.
# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16 # Random set of words to evaluate similarity on.
valid_window = 100 # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64 # Number of negative examples to sample.
graph = tf.Graph()
with graph.as_default():
# Input data.
train_inputs = tf.placeholder(tf.int32, shape=[None])
train_labels = tf.placeholder(tf.int32, shape=[None, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
# Look up embeddings for inputs.
embeddings = tf.Variable(
tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
embed = tf.nn.embedding_lookup(embeddings, train_inputs)
# Construct the variables for the NCE loss
nce_weights = tf.Variable(
tf.truncated_normal([vocabulary_size, embedding_size],
stddev=1.0 / math.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
# Compute the average NCE loss for the batch.
# tf.nce_loss automatically draws a new sample of the negative labels each
# time we evaluate the loss.
loss = tf.reduce_mean(
tf.nn.nce_loss(weights=nce_weights,
biases=nce_biases,
labels=train_labels,
inputs=embed,
num_sampled=num_sampled,
num_classes=vocabulary_size))
# Horovod: adjust learning rate based on number of GPUs.
optimizer = tf.train.GradientDescentOptimizer(1.0 * hvd.size())
# Horovod: add Horovod Distributed Optimizer.
optimizer = hvd.DistributedOptimizer(optimizer)
train_op = optimizer.minimize(loss)
# Compute the cosine similarity between minibatch examples and all embeddings.
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
normalized_embeddings = embeddings / norm
valid_embeddings = tf.nn.embedding_lookup(
normalized_embeddings, valid_dataset)
similarity = tf.matmul(
valid_embeddings, normalized_embeddings, transpose_b=True)
# Add variable initializer.
init = tf.global_variables_initializer()
# Horovod: broadcast initial variable states from rank 0 to all other processes.
# This is necessary to ensure consistent initialization of all workers when
# training is started with random weights or restored from a checkpoint.
bcast = hvd.broadcast_global_variables(0)
# Step 5: Begin training.
# Horovod: adjust number of steps based on number of GPUs.
num_steps = 4000 // hvd.size() + 1
# Horovod: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())
with tf.Session(graph=graph, config=config) as session:
# We must initialize all variables before we use them.
init.run()
bcast.run()
print('Initialized')
run = Run.get_context()
average_loss = 0
for step in xrange(num_steps):
# simulate various sentence length by randomization
batch_size = random.randint(max_batch_size // 2, max_batch_size)
batch_inputs, batch_labels = generate_batch(
batch_size, num_skips, skip_window)
feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
# We perform one update step by evaluating the optimizer op (including it
# in the list of returned values for session.run()
_, loss_val = session.run([train_op, loss], feed_dict=feed_dict)
average_loss += loss_val
if step % 2000 == 0:
if step > 0:
average_loss /= 2000
# The average loss is an estimate of the loss over the last 2000 batches.
print('Average loss at step ', step, ': ', average_loss)
run.log("Loss", average_loss)
average_loss = 0
final_embeddings = normalized_embeddings.eval()
# Evaluate similarity in the end on worker 0.
if hvd.rank() == 0:
sim = similarity.eval()
for i in xrange(valid_size):
valid_word = reverse_dictionary[valid_examples[i]]
top_k = 8 # number of nearest neighbors
nearest = (-sim[i, :]).argsort()[1:top_k + 1]
log_str = 'Nearest to %s:' % valid_word
for k in xrange(top_k):
close_word = reverse_dictionary[nearest[k]]
log_str = '%s %s,' % (log_str, close_word)
print(log_str)

View File

@@ -1,354 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/ml-frameworks/tensorflow/distributed-tensorflow-with-parameter-server/distributed-tensorflow-with-parameter-server.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Distributed TensorFlow with parameter server\n",
"In this tutorial, you will train a TensorFlow model on the [MNIST](http://yann.lecun.com/exdb/mnist/) dataset using native [distributed TensorFlow](https://www.tensorflow.org/guide/distributed_training)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prerequisites\n",
"* Understand the [architecture and terms](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture) introduced by Azure Machine Learning (AML)\n",
"* If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, go through the [configuration notebook](../../../../configuration.ipynb) to:\n",
" * install the AML SDK\n",
" * create a workspace and its configuration file (`config.json`)\n",
"* Review the [tutorial](../train-hyperparameter-tune-deploy-with-tensorflow/train-hyperparameter-tune-deploy-with-tensorflow.ipynb) on single-node TensorFlow training using the SDK"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check core SDK version number\n",
"import azureml.core\n",
"\n",
"print(\"SDK version:\", azureml.core.VERSION)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Diagnostics\n",
"Opt-in diagnostics for better experience, quality, and security of future releases."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"Diagnostics"
]
},
"outputs": [],
"source": [
"from azureml.telemetry import set_diagnostics_collection\n",
"\n",
"set_diagnostics_collection(send_diagnostics=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize workspace\n",
"Initialize a [Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace) object from the existing workspace you created in the Prerequisites step. `Workspace.from_config()` creates a workspace object from the details stored in `config.json`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.workspace import Workspace\n",
"\n",
"ws = Workspace.from_config()\n",
"print('Workspace name: ' + ws.name, \n",
" 'Azure region: ' + ws.location, \n",
" 'Subscription id: ' + ws.subscription_id, \n",
" 'Resource group: ' + ws.resource_group, sep = '\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create or Attach existing AmlCompute\n",
"You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for training your model. In this tutorial, you create `AmlCompute` as your training compute resource.\n",
"\n",
"> Note that if you have an AzureML Data Scientist role, you will not have permission to create compute resources. Talk to your workspace or IT admin to create the compute targets described in this section, if they do not already exist.\n",
"\n",
"**Creation of AmlCompute takes approximately 5 minutes.** If the AmlCompute with that name is already in your workspace this code will skip the creation process.\n",
"\n",
"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
"from azureml.core.compute_target import ComputeTargetException\n",
"\n",
"# choose a name for your cluster\n",
"cluster_name = \"gpu-cluster\"\n",
"\n",
"try:\n",
" compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n",
" print('Found existing compute target.')\n",
"except ComputeTargetException:\n",
" print('Creating a new compute target...')\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6', \n",
" max_nodes=4)\n",
"\n",
" # create the cluster\n",
" compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n",
"\n",
" compute_target.wait_for_completion(show_output=True)\n",
"\n",
"# use get_status() to get a detailed status for the current cluster. \n",
"print(compute_target.get_status().serialize())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train model on the remote compute\n",
"Now that we have the cluster ready to go, let's run our distributed training job."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a project directory\n",
"Create a directory that will contain all the necessary code from your local machine that you will need access to on the remote resource. This includes the training script, and any additional files your training script depends on."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"project_folder = './tf-distr-ps'\n",
"os.makedirs(project_folder, exist_ok=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copy the training script `tf_mnist_replica.py` into this project directory."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import shutil\n",
"\n",
"shutil.copy('tf_mnist_replica.py', project_folder)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create an experiment\n",
"Create an [Experiment](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#experiment) to track all the runs in your workspace for this distributed TensorFlow tutorial. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Experiment\n",
"\n",
"experiment_name = 'tf-distr-ps'\n",
"experiment = Experiment(ws, name=experiment_name)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create an environment\n",
"\n",
"In this tutorial, we will use one of Azure ML's curated TensorFlow environments for training. [Curated environments](https://docs.microsoft.com/azure/machine-learning/how-to-use-environments#use-a-curated-environment) are available in your workspace by default. Specifically, we will use the TensorFlow 1.13 GPU curated environment."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Environment\n",
"\n",
"tf_env = Environment.get(ws, name='AzureML-TensorFlow-1.13-GPU')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Configure the training job\n",
"\n",
"Create a ScriptRunConfig object to specify the configuration details of your training job, including your training script, environment to use, and the compute target to run on.\n",
"\n",
"In order to execute a distributed TensorFlow run with the parameter server strategy, you must create a `TensorflowConfiguration` object and pass it to the `distributed_job_config` parameter of the ScriptRunConfig constructor. The below code configures a distributed TensorFlow run with `2` workers and `1` parameter server."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import ScriptRunConfig\n",
"from azureml.core.runconfig import TensorflowConfiguration\n",
"\n",
"src = ScriptRunConfig(source_directory=project_folder,\n",
" script='tf_mnist_replica.py',\n",
" arguments=['--num_gpus', 1, '--train_steps', 500],\n",
" compute_target=compute_target,\n",
" environment=tf_env,\n",
" distributed_job_config=TensorflowConfiguration(worker_count=2, parameter_server_count=1))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Submit job\n",
"Run your experiment by submitting your ScriptRunConfig object. Note that this call is asynchronous."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run = experiment.submit(src)\n",
"print(run)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Monitor your run\n",
"You can monitor the progress of the run with a Jupyter widget. Like the run submission, the widget is asynchronous and provides live updates every 10-15 seconds until the job completes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.widgets import RunDetails\n",
"\n",
"RunDetails(run).show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Alternatively, you can block until the script has completed training before running more code."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run.wait_for_completion(show_output=True) # this provides a verbose log"
]
}
],
"metadata": {
"authors": [
{
"name": "minxia"
}
],
"category": "training",
"compute": [
"AML Compute"
],
"datasets": [
"MNIST"
],
"deployment": [
"None"
],
"exclude_from_index": false,
"framework": [
"TensorFlow"
],
"friendly_name": "Distributed TensorFlow with parameter server",
"index_order": 1,
"kernelspec": {
"display_name": "Python 3.8 - AzureML",
"language": "python",
"name": "python38-azureml"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
},
"tags": [
"None"
],
"task": "Use the TensorFlow estimator to train a model using distributed training"
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,5 +0,0 @@
name: distributed-tensorflow-with-parameter-server
dependencies:
- pip:
- azureml-sdk
- azureml-widgets

View File

@@ -1,271 +0,0 @@
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
# Licensed under the Apache License, Version 2.0
# Script adapted from:
# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/dist_test/python/mnist_replica.py
# ==============================================================================
"""Distributed MNIST training and validation, with model replicas.
A simple softmax model with one hidden layer is defined. The parameters
(weights and biases) are located on one parameter server (ps), while the ops
are executed on two worker nodes by default. The TF sessions also run on the
worker node.
Multiple invocations of this script can be done in parallel, with different
values for --task_index. There should be exactly one invocation with
--task_index, which will create a master session that carries out variable
initialization. The other, non-master, sessions will wait for the master
session to finish the initialization before proceeding to the training stage.
The coordination between the multiple worker invocations occurs due to
the definition of the parameters on the same ps devices. The parameter updates
from one worker is visible to all other workers. As such, the workers can
perform forward computation and gradient calculation in parallel, which
should lead to increased training speed for the simple model.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import math
import sys
import tempfile
import time
import json
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
from azureml.core.run import Run
flags = tf.app.flags
flags.DEFINE_string("data_dir", "/tmp/mnist-data",
"Directory for storing mnist data")
flags.DEFINE_boolean("download_only", False,
"Only perform downloading of data; Do not proceed to "
"session preparation, model definition or training")
flags.DEFINE_integer("num_gpus", 0, "Total number of gpus for each machine."
"If you don't use GPU, please set it to '0'")
flags.DEFINE_integer("replicas_to_aggregate", None,
"Number of replicas to aggregate before parameter update "
"is applied (For sync_replicas mode only; default: "
"num_workers)")
flags.DEFINE_integer("hidden_units", 100,
"Number of units in the hidden layer of the NN")
flags.DEFINE_integer("train_steps", 200,
"Number of (global) training steps to perform")
flags.DEFINE_integer("batch_size", 100, "Training batch size")
flags.DEFINE_float("learning_rate", 0.01, "Learning rate")
flags.DEFINE_boolean(
"sync_replicas", False,
"Use the sync_replicas (synchronized replicas) mode, "
"wherein the parameter updates from workers are aggregated "
"before applied to avoid stale gradients")
flags.DEFINE_boolean(
"existing_servers", False, "Whether servers already exists. If True, "
"will use the worker hosts via their GRPC URLs (one client process "
"per worker host). Otherwise, will create an in-process TensorFlow "
"server.")
FLAGS = flags.FLAGS
IMAGE_PIXELS = 28
def main(unused_argv):
data_root = os.path.join("outputs", "MNIST")
mnist = None
tf_config = os.environ.get("TF_CONFIG")
if not tf_config or tf_config == "":
raise ValueError("TF_CONFIG not found.")
tf_config_json = json.loads(tf_config)
cluster = tf_config_json.get('cluster')
job_name = tf_config_json.get('task', {}).get('type')
task_index = tf_config_json.get('task', {}).get('index')
job_name = "worker" if job_name == "master" else job_name
sentinel_path = os.path.join(data_root, "complete.txt")
if job_name == "worker" and task_index == 0:
mnist = input_data.read_data_sets(data_root, one_hot=True)
with open(sentinel_path, 'w+') as f:
f.write("download complete")
else:
while not os.path.exists(sentinel_path):
time.sleep(0.01)
mnist = input_data.read_data_sets(data_root, one_hot=True)
if FLAGS.download_only:
sys.exit(0)
print("job name = %s" % job_name)
print("task index = %d" % task_index)
print("number of GPUs = %d" % FLAGS.num_gpus)
# Construct the cluster and start the server
cluster_spec = tf.train.ClusterSpec(cluster)
# Get the number of workers.
num_workers = len(cluster_spec.task_indices("worker"))
if not FLAGS.existing_servers:
# Not using existing servers. Create an in-process server.
server = tf.train.Server(
cluster_spec, job_name=job_name, task_index=task_index)
if job_name == "ps":
server.join()
is_chief = (task_index == 0)
if FLAGS.num_gpus > 0:
# Avoid gpu allocation conflict: now allocate task_num -> #gpu
# for each worker in the corresponding machine
gpu = (task_index % FLAGS.num_gpus)
worker_device = "/job:worker/task:%d/gpu:%d" % (task_index, gpu)
elif FLAGS.num_gpus == 0:
# Just allocate the CPU to worker server
cpu = 0
worker_device = "/job:worker/task:%d/cpu:%d" % (task_index, cpu)
# The device setter will automatically place Variables ops on separate
# parameter servers (ps). The non-Variable ops will be placed on the workers.
# The ps use CPU and workers use corresponding GPU
with tf.device(
tf.train.replica_device_setter(
worker_device=worker_device,
ps_device="/job:ps/cpu:0",
cluster=cluster)):
global_step = tf.Variable(0, name="global_step", trainable=False)
# Variables of the hidden layer
hid_w = tf.Variable(
tf.truncated_normal(
[IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units],
stddev=1.0 / IMAGE_PIXELS),
name="hid_w")
hid_b = tf.Variable(tf.zeros([FLAGS.hidden_units]), name="hid_b")
# Variables of the softmax layer
sm_w = tf.Variable(
tf.truncated_normal(
[FLAGS.hidden_units, 10],
stddev=1.0 / math.sqrt(FLAGS.hidden_units)),
name="sm_w")
sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
# Ops: located on the worker specified with task_index
x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS])
y_ = tf.placeholder(tf.float32, [None, 10])
hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
hid = tf.nn.relu(hid_lin)
y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))
cross_entropy = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
opt = tf.train.AdamOptimizer(FLAGS.learning_rate)
if FLAGS.sync_replicas:
if FLAGS.replicas_to_aggregate is None:
replicas_to_aggregate = num_workers
else:
replicas_to_aggregate = FLAGS.replicas_to_aggregate
opt = tf.train.SyncReplicasOptimizer(
opt,
replicas_to_aggregate=replicas_to_aggregate,
total_num_replicas=num_workers,
name="mnist_sync_replicas")
train_step = opt.minimize(cross_entropy, global_step=global_step)
if FLAGS.sync_replicas:
local_init_op = opt.local_step_init_op
if is_chief:
local_init_op = opt.chief_init_op
ready_for_local_init_op = opt.ready_for_local_init_op
# Initial token and chief queue runners required by the sync_replicas mode
chief_queue_runner = opt.get_chief_queue_runner()
sync_init_op = opt.get_init_tokens_op()
init_op = tf.global_variables_initializer()
train_dir = tempfile.mkdtemp()
if FLAGS.sync_replicas:
sv = tf.train.Supervisor(
is_chief=is_chief,
logdir=train_dir,
init_op=init_op,
local_init_op=local_init_op,
ready_for_local_init_op=ready_for_local_init_op,
recovery_wait_secs=1,
global_step=global_step)
else:
sv = tf.train.Supervisor(
is_chief=is_chief,
logdir=train_dir,
init_op=init_op,
recovery_wait_secs=1,
global_step=global_step)
sess_config = tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=False,
device_filters=["/job:ps",
"/job:worker/task:%d" % task_index])
# The chief worker (task_index==0) session will prepare the session,
# while the remaining workers will wait for the preparation to complete.
if is_chief:
print("Worker %d: Initializing session..." % task_index)
else:
print("Worker %d: Waiting for session to be initialized..." %
task_index)
if FLAGS.existing_servers:
server_grpc_url = "grpc://" + task_index
print("Using existing server at: %s" % server_grpc_url)
sess = sv.prepare_or_wait_for_session(server_grpc_url, config=sess_config)
else:
sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)
print("Worker %d: Session initialization complete." % task_index)
if FLAGS.sync_replicas and is_chief:
# Chief worker will start the chief queue runner and call the init op.
sess.run(sync_init_op)
sv.start_queue_runners(sess, [chief_queue_runner])
# Perform training
time_begin = time.time()
print("Training begins @ %f" % time_begin)
local_step = 0
while True:
# Training feed
batch_xs, batch_ys = mnist.train.next_batch(FLAGS.batch_size)
train_feed = {x: batch_xs, y_: batch_ys}
_, step = sess.run([train_step, global_step], feed_dict=train_feed)
local_step += 1
now = time.time()
print("%f: Worker %d: training step %d done (global step: %d)" %
(now, task_index, local_step, step))
if step >= FLAGS.train_steps:
break
time_end = time.time()
print("Training ends @ %f" % time_end)
training_time = time_end - time_begin
print("Training elapsed time: %f s" % training_time)
# Validation feed
val_feed = {x: mnist.validation.images, y_: mnist.validation.labels}
val_xent = sess.run(cross_entropy, feed_dict=val_feed)
print("After %d training step(s), validation cross entropy = %g" %
(FLAGS.train_steps, val_xent))
if job_name == "worker" and task_index == 0:
run = Run.get_context()
run.log("CrossEntropy", val_xent)
if __name__ == "__main__":
tf.app.run()

View File

@@ -36,8 +36,6 @@ Using these samples, you will learn how to do the following.
| [cartpole_ci.ipynb](cartpole-on-compute-instance/cartpole_ci.ipynb) | Notebook to train a Cartpole playing agent on an Azure Machine Learning Compute Instance |
| [cartpole_sc.ipynb](cartpole-on-single-compute/cartpole_sc.ipynb) | Notebook to train a Cartpole playing agent on an Azure Machine Learning Compute Cluster (single node) |
| [pong_rllib.ipynb](atari-on-distributed-compute/pong_rllib.ipynb) | Notebook for distributed training of Pong agent using RLlib on multiple compute targets |
| [minecraft.ipynb](minecraft-on-distributed-compute/minecraft.ipynb) | Notebook to train an agent to navigate through a lava maze in the Minecraft game |
| [particle.ipynb](multiagent-particle-envs/particle.ipynb) | Notebook to train policies in a multiagent cooperative navigation scenario based on OpenAI's Particle environments |
## Prerequisites

View File

@@ -1,14 +1,16 @@
FROM mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.0.3-cudnn8-ubuntu18.04
FROM mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.2-cudnn8-ubuntu20.04:20221010.v1
USER root
RUN conda install -c anaconda python=3.7
# CUDA repository key rotation: https://forums.developer.nvidia.com/t/notice-cuda-linux-repository-key-rotation/212771
RUN apt-key del 7fa2af80
ENV distro ubuntu1804
ENV arch x86_64
RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/$distro/$arch/3bf863cc.pub
ENV AZUREML_CONDA_ENVIRONMENT_PATH /azureml-envs/tensorflow-2.7
# Create conda environment
RUN conda create -p $AZUREML_CONDA_ENVIRONMENT_PATH \
python=3.8 pip=20.2.4
# Prepend path to AzureML conda environment
ENV PATH $AZUREML_CONDA_ENVIRONMENT_PATH/bin:$PATH
# Install necessary packages to support videos in rllib/gym
RUN apt-get update && apt-get install -y --no-install-recommends \
python-opengl \
rsync \
@@ -17,15 +19,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
rm -rf /var/lib/apt/lists/* && \
rm -rf /usr/share/man/*
ENV AZUREML_CONDA_ENVIRONMENT_PATH /azureml-envs/tensorflow-2.4
# Create conda environment
RUN conda create -p $AZUREML_CONDA_ENVIRONMENT_PATH \
python=3.7 pip=20.2.4
# Prepend path to AzureML conda environment
ENV PATH $AZUREML_CONDA_ENVIRONMENT_PATH/bin:$PATH
RUN pip --version
RUN python --version
@@ -36,23 +29,25 @@ RUN pip install ray==0.8.7
RUN pip install gym[atari]==0.19.0
RUN pip install gym[accept-rom-license]==0.19.0
# Install pip dependencies
RUN pip install 'matplotlib>=3.3,<3.4' \
'psutil>=5.8,<5.9' \
'tqdm>=4.59,<4.60' \
'pandas>=1.1,<1.2' \
'scipy>=1.5,<1.6' \
'numpy>=1.10,<1.20' \
# Install pip dependencies for Tensorflow
RUN pip install 'matplotlib~=3.5.0' \
'psutil~=5.8.0' \
'tqdm~=4.62.0' \
'pandas~=1.3.0' \
'scipy~=1.7.0' \
'numpy~=1.21.0' \
'ipykernel~=6.0' \
'azureml-core==1.36.0.post2' \
'azureml-defaults==1.36.0' \
'azureml-mlflow==1.36.0' \
'azureml-telemetry==1.36.0' \
'tensorboard==2.4.0' \
'tensorflow-gpu==2.4.1' \
'tensorflow-datasets==4.3.0' \
'onnxruntime-gpu>=1.7,<1.8' \
'horovod[tensorflow-gpu]==0.21.3'
'azureml-core==1.47.0' \
'azureml-defaults==1.47.0' \
'azureml-mlflow==1.47.0' \
'azureml-telemetry==1.47.0' \
'tensorboard~=2.7.0' \
'tensorflow-gpu~=2.7.0' \
'tensorflow-datasets~=4.5.0' \
'onnxruntime-gpu~=1.9.0' \
'protobuf~=3.20' \
'horovod[tensorflow-gpu]~=0.23.0' \
'debugpy~=1.6.3'
RUN pip install --no-cache-dir \
azureml-defaults \

View File

@@ -1,4 +1,4 @@
FROM mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04
FROM mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04
USER root
RUN conda install -c anaconda python=3.7

View File

@@ -1,4 +1,4 @@
FROM mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20200423.v1
FROM mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04
RUN apt-get update && apt-get install -y --no-install-recommends \
python-opengl \
@@ -8,7 +8,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
rm -rf /var/lib/apt/lists/* && \
rm -rf /usr/share/man/*
RUN conda install -y conda=4.13.0 python=3.7 && conda clean -ay
RUN conda install -y conda=23.1.0 python=3.7 && conda clean -ay
RUN pip install ray-on-aml==0.2.1 & \
pip install --no-cache-dir \
azureml-defaults \
@@ -16,7 +16,7 @@ RUN pip install ray-on-aml==0.2.1 & \
azureml-contrib-reinforcementlearning \
gputil \
scipy \
pyglet \
pyglet==1.5.27 \
cloudpickle==1.3.0 \
tensorboardX \
tensorflow==1.14.0 \

View File

@@ -1,39 +0,0 @@
# DisableDockerDetector "Disabled to unblock PRs until the owner can fix the file. Not used in any prod deployments - only as a documentation for the customers"
FROM akdmsft/particle-cpu
RUN conda install -c anaconda python=3.7
# Install required pip packages
RUN pip3 install --upgrade pip setuptools && pip3 install --upgrade \
pandas \
matplotlib \
psutil \
numpy \
scipy \
gym \
azureml-defaults \
tensorboardX \
tensorflow==1.15 \
tensorflow-probability==0.8.0 \
onnxruntime \
tf2onnx \
cloudpickle==1.1.1 \
tabulate \
dm_tree \
lz4 \
opencv-python
RUN cd multiagent-particle-envs && \
pip3 install -e . && \
pip3 install --upgrade pyglet==1.3.2
RUN pip3 install ray-on-aml==0.1.6
RUN pip install protobuf==3.20.0
RUN pip3 install --upgrade \
ray==0.8.7 \
ray[rllib]==0.8.7 \
ray[tune]==0.8.7
RUN pip install 'msrest<0.7.0'

View File

@@ -1,70 +0,0 @@
# MIT License
# Copyright (c) 2018 OpenAI
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import numpy as np
import gym
class MultiDiscrete(gym.Space):
"""
- The multi-discrete action space consists of a series of discrete action spaces with different
parameters
- It can be adapted to both a Discrete action space or a continuous (Box) action space
- It is useful to represent game controllers or keyboards where each key can be represented as
a discrete action space
- It is parametrized by passing an array of arrays containing [min, max] for each discrete action
space where the discrete action space can take any integers from `min` to `max` (both inclusive)
Note: A value of 0 always need to represent the NOOP action.
e.g. Nintendo Game Controller
- Can be conceptualized as 3 discrete action spaces:
1) Arrow Keys: Discrete 5 - NOOP[0], UP[1], RIGHT[2], DOWN[3], LEFT[4] - params: min: 0, max: 4
2) Button A: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1
3) Button B: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1
- Can be initialized as
MultiDiscrete([ [0,4], [0,1], [0,1] ])
"""
def __init__(self, array_of_param_array):
self.low = np.array([x[0] for x in array_of_param_array])
self.high = np.array([x[1] for x in array_of_param_array])
self.num_discrete_space = self.low.shape[0]
def sample(self):
""" Returns a array with one sample from each discrete action space """
# For each row: round(random .* (max - min) + min, 0)
# random_array = prng.np_random.rand(self.num_discrete_space)
random_array = np.random.RandomState().rand(self.num_discrete_space)
return [int(x) for x in np.floor(np.multiply((self.high - self.low + 1.), random_array) + self.low)]
def contains(self, x):
return len(x) == self.num_discrete_space \
and (np.array(x) >= self.low).all() \
and (np.array(x) <= self.high).all()
@property
def shape(self):
return self.num_discrete_space
def __repr__(self):
return "MultiDiscrete" + str(self.num_discrete_space)
def __eq__(self, other):
return np.array_equal(self.low, other.low) and np.array_equal(self.high, other.high)

View File

@@ -1,413 +0,0 @@
# MIT License
# Copyright (c) 2018 OpenAI
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""
2D rendering framework
"""
from __future__ import division
import os
import six
import sys
from gym import error
import math
import numpy as np
import pyglet
from pyglet.gl import glEnable, glHint, glLineWidth, glBlendFunc, glClearColor, glPushMatrix, \
glTranslatef, glRotatef, glScalef, glPopMatrix, glColor4f, glBegin, glVertex3f, glEnd, glLineStipple, \
glDisable, glVertex2f, GL_BLEND, GL_LINE_SMOOTH, GL_LINE_SMOOTH_HINT, GL_NICEST, GL_SRC_ALPHA, \
GL_ONE_MINUS_SRC_ALPHA, GL_LINE_STIPPLE, GL_POINTS, GL_QUADS, GL_TRIANGLES, GL_POLYGON, GL_LINE_LOOP, \
GL_LINE_STRIP, GL_LINES
if "Apple" in sys.version:
if 'DYLD_FALLBACK_LIBRARY_PATH' in os.environ:
os.environ['DYLD_FALLBACK_LIBRARY_PATH'] += ':/usr/lib'
# (JDS 2016/04/15): avoid bug on Anaconda 2.3.0 / Yosemite
RAD2DEG = 57.29577951308232
def get_display(spec):
"""Convert a display specification (such as :0) into an actual Display
object.
Pyglet only supports multiple Displays on Linux.
"""
if spec is None:
return None
elif isinstance(spec, six.string_types):
return pyglet.canvas.Display(spec)
else:
raise error.Error('Invalid display specification: {}. (Must be a string like :0 or None.)'.format(spec))
class Viewer(object):
def __init__(self, width, height, display=None):
display = get_display(display)
self.width = width
self.height = height
self.window = pyglet.window.Window(width=width, height=height, display=display)
self.window.on_close = self.window_closed_by_user
self.geoms = []
self.onetime_geoms = []
self.transform = Transform()
glEnable(GL_BLEND)
# glEnable(GL_MULTISAMPLE)
glEnable(GL_LINE_SMOOTH)
# glHint(GL_LINE_SMOOTH_HINT, GL_DONT_CARE)
glHint(GL_LINE_SMOOTH_HINT, GL_NICEST)
glLineWidth(2.0)
glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA)
def close(self):
self.window.close()
def window_closed_by_user(self):
self.close()
def set_bounds(self, left, right, bottom, top):
assert right > left and top > bottom
scalex = self.width / (right - left)
scaley = self.height / (top - bottom)
self.transform = Transform(
translation=(-left * scalex, -bottom * scaley),
scale=(scalex, scaley))
def add_geom(self, geom):
self.geoms.append(geom)
def add_onetime(self, geom):
self.onetime_geoms.append(geom)
def render(self, return_rgb_array=False):
glClearColor(1, 1, 1, 1)
self.window.clear()
self.window.switch_to()
self.window.dispatch_events()
self.transform.enable()
for geom in self.geoms:
geom.render()
for geom in self.onetime_geoms:
geom.render()
self.transform.disable()
arr = None
if return_rgb_array:
buffer = pyglet.image.get_buffer_manager().get_color_buffer()
image_data = buffer.get_image_data()
arr = np.fromstring(image_data.data, dtype=np.uint8, sep='')
# In https://github.com/openai/gym-http-api/issues/2, we
# discovered that someone using Xmonad on Arch was having
# a window of size 598 x 398, though a 600 x 400 window
# was requested. (Guess Xmonad was preserving a pixel for
# the boundary.) So we use the buffer height/width rather
# than the requested one.
arr = arr.reshape(buffer.height, buffer.width, 4)
arr = arr[::-1, :, 0:3]
self.window.flip()
self.onetime_geoms = []
return arr
# Convenience
def draw_circle(self, radius=10, res=30, filled=True, **attrs):
geom = make_circle(radius=radius, res=res, filled=filled)
_add_attrs(geom, attrs)
self.add_onetime(geom)
return geom
def draw_polygon(self, v, filled=True, **attrs):
geom = make_polygon(v=v, filled=filled)
_add_attrs(geom, attrs)
self.add_onetime(geom)
return geom
def draw_polyline(self, v, **attrs):
geom = make_polyline(v=v)
_add_attrs(geom, attrs)
self.add_onetime(geom)
return geom
def draw_line(self, start, end, **attrs):
geom = Line(start, end)
_add_attrs(geom, attrs)
self.add_onetime(geom)
return geom
def get_array(self):
self.window.flip()
image_data = pyglet.image.get_buffer_manager().get_color_buffer().get_image_data()
self.window.flip()
arr = np.fromstring(image_data.data, dtype=np.uint8, sep='')
arr = arr.reshape(self.height, self.width, 4)
return arr[::-1, :, 0:3]
def _add_attrs(geom, attrs):
if "color" in attrs:
geom.set_color(*attrs["color"])
if "linewidth" in attrs:
geom.set_linewidth(attrs["linewidth"])
class Geom(object):
def __init__(self):
self._color = Color((0, 0, 0, 1.0))
self.attrs = [self._color]
def render(self):
for attr in reversed(self.attrs):
attr.enable()
self.render1()
for attr in self.attrs:
attr.disable()
def render1(self):
raise NotImplementedError
def add_attr(self, attr):
self.attrs.append(attr)
def set_color(self, r, g, b, alpha=1):
self._color.vec4 = (r, g, b, alpha)
class Attr(object):
def enable(self):
raise NotImplementedError
def disable(self):
pass
class Transform(Attr):
def __init__(self, translation=(0.0, 0.0), rotation=0.0, scale=(1, 1)):
self.set_translation(*translation)
self.set_rotation(rotation)
self.set_scale(*scale)
def enable(self):
glPushMatrix()
glTranslatef(self.translation[0], self.translation[1], 0) # translate to GL loc ppint
glRotatef(RAD2DEG * self.rotation, 0, 0, 1.0)
glScalef(self.scale[0], self.scale[1], 1)
def disable(self):
glPopMatrix()
def set_translation(self, newx, newy):
self.translation = (float(newx), float(newy))
def set_rotation(self, new):
self.rotation = float(new)
def set_scale(self, newx, newy):
self.scale = (float(newx), float(newy))
class Color(Attr):
def __init__(self, vec4):
self.vec4 = vec4
def enable(self):
glColor4f(*self.vec4)
class LineStyle(Attr):
def __init__(self, style):
self.style = style
def enable(self):
glEnable(GL_LINE_STIPPLE)
glLineStipple(1, self.style)
def disable(self):
glDisable(GL_LINE_STIPPLE)
class LineWidth(Attr):
def __init__(self, stroke):
self.stroke = stroke
def enable(self):
glLineWidth(self.stroke)
class Point(Geom):
def __init__(self):
Geom.__init__(self)
def render1(self):
glBegin(GL_POINTS) # draw point
glVertex3f(0.0, 0.0, 0.0)
glEnd()
class FilledPolygon(Geom):
def __init__(self, v):
Geom.__init__(self)
self.v = v
def render1(self):
if len(self.v) == 4:
glBegin(GL_QUADS)
elif len(self.v) > 4:
glBegin(GL_POLYGON)
else:
glBegin(GL_TRIANGLES)
for p in self.v:
glVertex3f(p[0], p[1], 0) # draw each vertex
glEnd()
color = (
self._color.vec4[0] * 0.5,
self._color.vec4[1] * 0.5,
self._color.vec4[2] * 0.5,
self._color.vec4[3] * 0.5)
glColor4f(*color)
glBegin(GL_LINE_LOOP)
for p in self.v:
glVertex3f(p[0], p[1], 0) # draw each vertex
glEnd()
def make_circle(radius=10, res=30, filled=True):
points = []
for i in range(res):
ang = 2 * math.pi * i / res
points.append((math.cos(ang) * radius, math.sin(ang) * radius))
if filled:
return FilledPolygon(points)
else:
return PolyLine(points, True)
def make_polygon(v, filled=True):
if filled:
return FilledPolygon(v)
else:
return PolyLine(v, True)
def make_polyline(v):
return PolyLine(v, False)
def make_capsule(length, width):
l, r, t, b = 0, length, width / 2, -width / 2
box = make_polygon([(l, b), (l, t), (r, t), (r, b)])
circ0 = make_circle(width / 2)
circ1 = make_circle(width / 2)
circ1.add_attr(Transform(translation=(length, 0)))
geom = Compound([box, circ0, circ1])
return geom
class Compound(Geom):
def __init__(self, gs):
Geom.__init__(self)
self.gs = gs
for g in self.gs:
g.attrs = [a for a in g.attrs if not isinstance(a, Color)]
def render1(self):
for g in self.gs:
g.render()
class PolyLine(Geom):
def __init__(self, v, close):
Geom.__init__(self)
self.v = v
self.close = close
self.linewidth = LineWidth(1)
self.add_attr(self.linewidth)
def render1(self):
glBegin(GL_LINE_LOOP if self.close else GL_LINE_STRIP)
for p in self.v:
glVertex3f(p[0], p[1], 0) # draw each vertex
glEnd()
def set_linewidth(self, x):
self.linewidth.stroke = x
class Line(Geom):
def __init__(self, start=(0.0, 0.0), end=(0.0, 0.0)):
Geom.__init__(self)
self.start = start
self.end = end
self.linewidth = LineWidth(1)
self.add_attr(self.linewidth)
def render1(self):
glBegin(GL_LINES)
glVertex2f(*self.start)
glVertex2f(*self.end)
glEnd()
class Image(Geom):
def __init__(self, fname, width, height):
Geom.__init__(self)
self.width = width
self.height = height
img = pyglet.image.load(fname)
self.img = img
self.flip = False
def render1(self):
self.img.blit(-self.width / 2, -self.height / 2, width=self.width, height=self.height)
class SimpleImageViewer(object):
def __init__(self, display=None):
self.window = None
self.isopen = False
self.display = display
def imshow(self, arr):
if self.window is None:
height, width, channels = arr.shape
self.window = pyglet.window.Window(width=width, height=height, display=self.display)
self.width = width
self.height = height
self.isopen = True
assert arr.shape == (self.height, self.width, 3), "You passed in an image with the wrong number shape"
image = pyglet.image.ImageData(self.width, self.height, 'RGB', arr.tobytes(), pitch=self.width * -3)
self.window.clear()
self.window.switch_to()
self.window.dispatch_events()
image.blit(0, 0)
self.window.flip()
def close(self):
if self.isopen:
self.window.close()
self.isopen = False
def __del__(self):
self.close()

View File

@@ -1,123 +0,0 @@
import os
from ray_on_aml.core import Ray_On_AML
from ray.tune import run_experiments
from ray.tune.registry import register_trainable, register_env, get_trainable_cls
import ray.rllib.contrib.maddpg.maddpg as maddpg
from rllib_multiagent_particle_env import env_creator
from util import parse_args
def setup_ray():
ray_on_aml = Ray_On_AML()
ray_on_aml.getRay()
register_env('particle', env_creator)
def gen_policy(args, env, id):
use_local_critic = [
args.adv_policy == 'ddpg' if id < args.num_adversaries else
args.good_policy == 'ddpg' for id in range(env.num_agents)
]
return (
None,
env.observation_space_dict[id],
env.action_space_dict[id],
{
'agent_id': id,
'use_local_critic': use_local_critic[id],
'obs_space_dict': env.observation_space_dict,
'act_space_dict': env.action_space_dict,
}
)
def gen_policies(args, env_config):
env = env_creator(env_config)
return {'policy_%d' % i: gen_policy(args, env, i) for i in range(len(env.observation_space_dict))}
def to_multiagent_config(policies):
policy_ids = list(policies.keys())
return {
'policies': policies,
'policy_mapping_fn': lambda index: policy_ids[index]
}
def train(args, env_config):
def stop(trial_id, result):
max_train_time = int(os.environ.get('AML_MAX_TRAIN_TIME_SECONDS', 2 * 60 * 60))
return result['episode_reward_mean'] >= args.final_reward \
or result['time_total_s'] >= max_train_time
run_experiments({
'MADDPG_RLLib': {
'run': 'contrib/MADDPG',
'env': 'particle',
'stop': stop,
# Uncomment to enable more frequent checkpoints:
# 'checkpoint_freq': args.checkpoint_freq,
'checkpoint_at_end': True,
'local_dir': args.local_dir,
'restore': args.restore,
'config': {
# === Log ===
'log_level': 'ERROR',
# === Environment ===
'env_config': env_config,
'num_envs_per_worker': args.num_envs_per_worker,
'horizon': args.max_episode_len,
# === Policy Config ===
# --- Model ---
'good_policy': args.good_policy,
'adv_policy': args.adv_policy,
'actor_hiddens': [args.num_units] * 2,
'actor_hidden_activation': 'relu',
'critic_hiddens': [args.num_units] * 2,
'critic_hidden_activation': 'relu',
'n_step': args.n_step,
'gamma': args.gamma,
# --- Exploration ---
'tau': 0.01,
# --- Replay buffer ---
'buffer_size': int(1e6),
# --- Optimization ---
'actor_lr': args.lr,
'critic_lr': args.lr,
'learning_starts': args.train_batch_size * args.max_episode_len,
'sample_batch_size': args.sample_batch_size,
'train_batch_size': args.train_batch_size,
'batch_mode': 'truncate_episodes',
# --- Parallelism ---
'num_workers': args.num_workers,
'num_gpus': args.num_gpus,
'num_gpus_per_worker': 0,
# === Multi-agent setting ===
'multiagent': to_multiagent_config(gen_policies(args, env_config)),
},
},
}, verbose=1)
if __name__ == '__main__':
args = parse_args()
setup_ray()
env_config = {
'scenario_name': args.scenario,
'horizon': args.max_episode_len,
'video_frequency': args.checkpoint_freq,
}
train(args, env_config)

View File

@@ -1,113 +0,0 @@
# Some code taken from: https://github.com/wsjeon/maddpg-rllib/
import imp
import os
import gym
from gym import wrappers
from ray import rllib
from multiagent.environment import MultiAgentEnv
import multiagent.scenarios as scenarios
CUSTOM_SCENARIOS = ['simple_switch']
class ParticleEnvRenderWrapper(gym.Wrapper):
def __init__(self, env, horizon):
super().__init__(env)
self.horizon = horizon
def reset(self):
self._num_steps = 0
return self.env.reset()
def render(self, mode):
if mode == 'human':
self.env.render(mode=mode)
else:
return self.env.render(mode=mode)[0]
def step(self, actions):
obs_list, rew_list, done_list, info_list = self.env.step(actions)
self._num_steps += 1
done = (all(done_list) or self._num_steps >= self.horizon)
# Gym monitor expects reward to be an int. This is only used for its
# stats reporter, which we're not interested in. To make video recording
# work, we package the rewards in the info object and extract it below.
return obs_list, 0, done, [rew_list, done_list, info_list]
class RLlibMultiAgentParticleEnv(rllib.MultiAgentEnv):
def __init__(self, scenario_name, horizon, monitor_enabled=False, video_frequency=500):
self._env = _make_env(scenario_name, horizon, monitor_enabled, video_frequency)
self.num_agents = self._env.n
self.agent_ids = list(range(self.num_agents))
self.observation_space_dict = self._make_dict(self._env.observation_space)
self.action_space_dict = self._make_dict(self._env.action_space)
def reset(self):
obs_dict = self._make_dict(self._env.reset())
return obs_dict
def step(self, action_dict):
actions = list(action_dict.values())
obs_list, _, _, infos = self._env.step(actions)
rew_list, done_list, _ = infos
obs_dict = self._make_dict(obs_list)
rew_dict = self._make_dict(rew_list)
done_dict = self._make_dict(done_list)
done_dict['__all__'] = all(done_list)
info_dict = self._make_dict([{'done': done} for done in done_list])
return obs_dict, rew_dict, done_dict, info_dict
def render(self, mode='human'):
self._env.render(mode=mode)
def _make_dict(self, values):
return dict(zip(self.agent_ids, values))
def _video_callable(video_frequency):
def should_record_video(episode_id):
if episode_id % video_frequency == 0:
return True
return False
return should_record_video
def _make_env(scenario_name, horizon, monitor_enabled, video_frequency):
if scenario_name in CUSTOM_SCENARIOS:
# Scenario file must exist locally
file_path = os.path.join(os.path.dirname(__file__), scenario_name + '.py')
scenario = imp.load_source('', file_path).Scenario()
else:
scenario = scenarios.load(scenario_name + '.py').Scenario()
world = scenario.make_world()
env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation)
env.metadata['video.frames_per_second'] = 8
env = ParticleEnvRenderWrapper(env, horizon)
if not monitor_enabled:
return env
return wrappers.Monitor(env, './logs/videos', resume=True, video_callable=_video_callable(video_frequency))
def env_creator(config):
monitor_enabled = False
if hasattr(config, 'worker_index') and hasattr(config, 'vector_index'):
monitor_enabled = (config.worker_index == 1 and config.vector_index == 0)
return RLlibMultiAgentParticleEnv(**config, monitor_enabled=monitor_enabled)

View File

@@ -1,358 +0,0 @@
import numpy as np
import random
from multiagent.core import World, Agent, Landmark
from multiagent.scenario import BaseScenario
class SwitchWorld(World):
""" Extended World with hills and switches """
def __init__(self, hills, switches):
super().__init__()
# add hills and switches
self.hills = hills
self.switches = switches
self.landmarks.extend(self.hills)
self.landmarks.extend(self.switches)
def step(self):
super().step()
# if all hills are activated, reset the switches and hills
if all([hill.active for hill in self.hills]):
self.reset_hills()
self.reset_switches()
else:
# Update switches
for switch in self.switches:
switch.step(self)
# Update hills
for hill in self.hills:
hill.step(self)
def reset_hills(self):
possible_hill_positions = [np.array([-0.8, 0]), np.array([0, 0.8]), np.array([0.8, 0]), np.array([0, -0.8])]
hill_positions = random.sample(possible_hill_positions, k=len(self.hills))
for i, hill in enumerate(self.hills):
hill.state.p_pos = hill_positions[i]
hill.deactivate()
def reset_switches(self):
possible_switch_positions = [
np.array([-0.8, -0.8]),
np.array([-0.8, 0.8]),
np.array([0.8, -0.8]),
np.array([0.8, 0.8])]
switch_positions = random.sample(possible_switch_positions, k=len(self.switches))
for i, switch in enumerate(self.switches):
switch.state.p_pos = switch_positions[i]
switch.deactivate()
class Scenario(BaseScenario):
def make_world(self):
# main configurations
num_agents = 2
num_hills = 2
num_switches = 1
self.max_episode_length = 100
# create hills (on edges)
possible_hill_positions = [np.array([-0.8, 0]), np.array([0, 0.8]), np.array([0.8, 0]), np.array([0, -0.8])]
hill_positions = random.sample(possible_hill_positions, k=num_hills)
hills = [Hill(hill_positions[i]) for i in range(num_hills)]
# create switches (in corners)
possible_switch_positions = [
np.array([-0.8, -0.8]),
np.array([-0.8, 0.8]),
np.array([0.8, -0.8]),
np.array([0.8, 0.8])]
switch_positions = random.sample(possible_switch_positions, k=num_switches)
switches = [Switch(switch_positions[i]) for i in range(num_switches)]
# make world and set basic properties
world = SwitchWorld(hills, switches)
world.dim_c = 2
world.collaborative = True
# add agents
world.agents = [Agent() for i in range(num_agents)]
for i, agent in enumerate(world.agents):
agent.name = 'agent %d' % i
agent.collide = True
agent.silent = True
agent.size = 0.1
agent.accel = 5.0
agent.max_speed = 5.0
if i == 0:
agent.color = np.array([0.35, 0.35, 0.85])
else:
agent.color = np.array([0.35, 0.85, 0.85])
# make initial conditions
self.reset_world(world)
return world
def reset_world(self, world):
# set random initial states
for agent in world.agents:
agent.state.p_pos = np.array([random.uniform(-1, +1) for _ in range(world.dim_p)])
agent.state.p_vel = np.zeros(world.dim_p)
agent.state.c = np.zeros(world.dim_c)
# set hills randomly
world.reset_hills()
# set switches randomly
world.reset_switches()
def is_collision(self, agent1, agent2):
delta_pos = agent1.state.p_pos - agent2.state.p_pos
dist = np.sqrt(np.sum(np.square(delta_pos)))
dist_min = agent1.size + agent2.size
return True if dist < dist_min else False
def reward(self, agent, world):
# Agents are rewarded based on number of landmarks activated
rew = 0
if all([h.active for h in world.hills]):
rew += 100
else:
# give bonus each time a hill is activated
for hill in world.hills:
if hill.activated_just_now:
rew += 50
# penalise timesteps where nothing is happening
if rew == 0:
rew -= 0.1
# add collision penalty
if agent.collide:
for a in world.agents:
# note: this also counts collision with "itself", so gives -1 at every timestep
# would be good to tune the reward function and use (not a == agent) here
if self.is_collision(a, agent):
rew -= 1
return rew
def observation(self, agent, world):
# get positions of all entities in this agent's reference frame
entity_pos = []
for entity in world.landmarks: # world.entities:
entity_pos.append(entity.state.p_pos - agent.state.p_pos)
# entity colors
entity_color = []
for entity in world.landmarks: # world.entities:
entity_color.append(entity.color)
# communication of all other agents
comm = []
other_pos = []
for other in world.agents:
if other is agent:
continue
comm.append(other.state.c)
other_pos.append(other.state.p_pos - agent.state.p_pos)
return np.concatenate([agent.state.p_vel] + [agent.state.p_pos] + entity_pos + other_pos + comm)
class Hill(Landmark):
"""
A hill that can be captured by an agent.
To be captured, a team must occupy a hill for a fixed amount of time.
"""
def __init__(self,
pos=None,
size=0.08,
capture_time=2
):
# Initialize Landmark super class
super().__init__()
self.movable = False
self.collide = False
self.state.p_pos = pos
self.size = size
# Set static configurations
self.capture_time = capture_time
# Initialize all hills to be inactive
self.active = False
self.color = np.array([0.5, 0.5, 0.5])
self.capture_timer = 0
self.activated_just_now = False
def activate(self):
self.active = True
self.color = np.array([0.1, 0.1, 0.9])
def deactivate(self):
self.active = False
self.color = np.array([0.5, 0.5, 0.5])
def _is_occupied(self, agents):
# a hill is occupied if an agent stands on it
for agent in agents:
dist = np.sqrt(np.sum(np.square(agent.state.p_pos - self.state.p_pos)))
if dist < agent.size + self.size:
return True
return False
def step(self, world):
self.activated_just_now = False
# If hill isn't activated yet, check if an agent activates it
# if (not self.active) and (world.switch.is_active()):
if (not self.active):
# Check if an agent is on the hill and all switches are active
if (self._is_occupied(world.agents)) and all([switch.active for switch in world.switches]):
self.capture_timer += 1
# activate hill (this is irreversible)
if self.capture_timer > self.capture_time:
self.activate()
self.activated_just_now = True
# Reset capture timer if hill is not occupied
else:
self.capture_timer = 0
class Switch(Landmark):
"""
A switch that can be activated by an agent.
The agent has to stay on the switch for it to be active.
"""
def __init__(self,
pos=None,
size=0.03,
):
# Initialize Landmark super class
super().__init__()
self.movable = False
self.collide = False
self.state.p_pos = pos
self.size = size
# Initialize all hills to be inactive
self.active = False
self.color = np.array([0.8, 0.05, 0.3])
self.capture_timer = 0
def activate(self):
self.active = True
self.color = np.array([0.1, 0.9, 0.4])
def deactivate(self):
self.active = False
self.color = np.array([0.8, 0.05, 0.3])
def _is_occupied(self, agents):
# a switch is active if an agent stands on it
for agent in agents:
dist = np.sqrt(np.sum(np.square(agent.state.p_pos - self.state.p_pos)))
if dist < agent.size + self.size:
return True
return False
def step(self, world):
# check if an agent is on the switch and activate/deactive accordingly
if self._is_occupied(world.agents):
self.activate()
else:
self.deactivate()
class SwitchExpertPolicy():
"""
Hand-coded expert policy for the simple switch environment.
Types of possible experts:
- always go to the switch
- always go to the hills
"""
def __init__(self, dim_c, agent, world, expert_type=None, discrete_action_input=True):
self.dim_c = dim_c
self.discrete_action_input = discrete_action_input
# the agent we control and world we're in
self.agent = agent
self.world = world
if expert_type is None:
self.expert_type = random.choice(['switch', 'hill'])
else:
self.expert_type = expert_type
if self.expert_type == 'switch':
self.target_switch = self.select_inital_target_switch()
elif self.expert_type == 'hill':
self.target_hill = self.select_inital_target_hill()
else:
raise NotImplementedError
self.step_count = 0
def select_inital_target_switch(self):
return random.choice(self.world.switches)
def select_inital_target_hill(self):
return random.choice(self.world.hills)
def action(self):
# select a target!
if self.expert_type == 'switch':
# if agent is not already on a switch, choose target switch
if not any([switch._is_occupied([self.agent]) for switch in self.world.switches]):
# select a target switch if there's an inactive one
inactive_switches = [switch for switch in self.world.switches if not switch.active]
if len(inactive_switches) > 0 and (self.target_switch not in inactive_switches):
self.target_switch = random.choice(inactive_switches)
target = self.target_switch.state.p_pos
elif self.expert_type == 'hill':
# select a target hill if we haven't done so yet, or the current target switch is inactive
inactive_hills = [hill for hill in self.world.hills if not hill.active]
if len(inactive_hills) > 0 and (self.target_hill not in inactive_hills):
self.target_hill = random.choice(inactive_hills)
target = self.target_hill.state.p_pos
self.step_count += 1
impulse = np.clip(target - self.agent.state.p_pos, -self.agent.u_range, self.agent.u_range)
if self.discrete_action_input:
u_idx = np.argmax(np.abs(impulse))
if u_idx == 0 and impulse[u_idx] < 0:
u = 1
elif u_idx == 0 and impulse[u_idx] > 0:
u = 2
elif u_idx == 1 and impulse[u_idx] < 0:
u = 3
elif u_idx == 1 and impulse[u_idx] > 0:
u = 4
else:
u = 0
else:
u = np.zeros(5)
if (impulse[0] == impulse[1] == 0) \
or (self.step_count < self.burn_in) \
or (self.burn_step != 0 and self.step_count % self.burn_step != 0):
u[0] = 0.1
else:
pass
# u: noop (?), right, left, down, up
if impulse[0] > 0: # x-direction (- left/right + )
u[1] = impulse[0] # right
elif impulse[0] < 0:
u[2] = -impulse[0]
if impulse[1] > 0: # y-direction (- up/down + )
u[3] = impulse[1]
elif impulse[1] < 0:
u[4] = -impulse[1]
return u

View File

@@ -1,82 +0,0 @@
import argparse
import os
import re
from rllib_multiagent_particle_env import CUSTOM_SCENARIOS
def parse_args():
parser = argparse.ArgumentParser('MADDPG with OpenAI MPE')
# Environment
parser.add_argument('--scenario', type=str, default='simple',
choices=['simple', 'simple_speaker_listener',
'simple_crypto', 'simple_push',
'simple_tag', 'simple_spread', 'simple_adversary'
] + CUSTOM_SCENARIOS,
help='name of the scenario script')
parser.add_argument('--max-episode-len', type=int, default=25,
help='maximum episode length')
parser.add_argument('--num-episodes', type=int, default=60000,
help='number of episodes')
parser.add_argument('--num-adversaries', type=int, default=0,
help='number of adversaries')
parser.add_argument('--good-policy', type=str, default='maddpg',
help='policy for good agents')
parser.add_argument('--adv-policy', type=str, default='maddpg',
help='policy of adversaries')
# Core training parameters
parser.add_argument('--lr', type=float, default=1e-2,
help='learning rate for Adam optimizer')
parser.add_argument('--gamma', type=float, default=0.95,
help='discount factor')
# NOTE: 1 iteration = sample_batch_size * num_workers timesteps * num_envs_per_worker
parser.add_argument('--sample-batch-size', type=int, default=25,
help='number of data points sampled /update /worker')
parser.add_argument('--train-batch-size', type=int, default=1024,
help='number of data points /update')
parser.add_argument('--n-step', type=int, default=1,
help='length of multistep value backup')
parser.add_argument('--num-units', type=int, default=64,
help='number of units in the mlp')
parser.add_argument('--final-reward', type=int, default=-400,
help='final reward after which to stop training')
# Checkpoint
parser.add_argument('--checkpoint-freq', type=int, default=200,
help='save model once every time this many iterations are completed')
parser.add_argument('--local-dir', type=str, default='./logs',
help='path to save checkpoints')
parser.add_argument('--restore', type=str, default=None,
help='directory in which training state and model are loaded')
# Parallelism
parser.add_argument('--num-workers', type=int, default=1)
parser.add_argument('--num-envs-per-worker', type=int, default=4)
parser.add_argument('--num-gpus', type=int, default=0)
return parser.parse_args()
def find_final_checkpoint(start_dir):
def find(pattern, path):
result = []
for root, _, files in os.walk(path):
for name in files:
if pattern.match(name):
result.append(os.path.join(root, name))
return result
cp_pattern = re.compile('.*checkpoint-\\d+$')
checkpoint_files = find(cp_pattern, start_dir)
checkpoint_numbers = []
for file in checkpoint_files:
checkpoint_numbers.append(int(file.split('-')[-1]))
final_checkpoint_number = max(checkpoint_numbers)
return next(
checkpoint_file for checkpoint_file in checkpoint_files
if checkpoint_file.endswith(str(final_checkpoint_number)))

View File

@@ -1,566 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/reinforcement-learning/multiagent-particle-envs/particle.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Reinforcement Learning in Azure Machine Learning - Training multiple agents on collaborative ParticleEnv tasks\n",
"\n",
"This tutorial will show you how to train policies in a multi-agent scenario.\n",
"We use OpenAI Gym's [Particle environments](https://github.com/openai/multiagent-particle-envs),\n",
"which model agents and landmarks in a two-dimensional world. Particle comes with\n",
"several predefined scenarios, both competitive and collaborative, and with or without communication.\n",
"\n",
"For this tutorial, we pick a cooperative navigation scenario where N agents are in a world with N\n",
"landmarks. The agents' goal is to cover all the landmarks without collisions,\n",
"so agents must learn to avoid each other (social distancing!). The video below shows training\n",
"results for N=3 agents/landmarks:\n",
"\n",
"<table style=\"width:50%\">\n",
" <tr>\n",
" <th style=\"text-align: center;\">\n",
" <img src=\"./images/particle_simple_spread.gif\" alt=\"Particle video\" align=\"middle\" margin-left=\"auto\" margin-right=\"auto\"/>\n",
" </th>\n",
" </tr>\n",
" <tr style=\"text-align: center;\">\n",
" <th>Fig 1. Video of 3 agents covering 3 landmarks in a multiagent Particle scenario.</th>\n",
" </tr>\n",
"</table>\n",
"\n",
"The tutorial will cover the following steps:\n",
"- Initializing Azure Machine Learning resources for training\n",
"- Training policies in a multi-agent environment with Azure Machine Learning service\n",
"- Monitoring training progress\n",
"\n",
"## Prerequisites\n",
"\n",
"The user should have completed the Azure Machine Learning introductory tutorial. You will need to make sure that you have a valid subscription id, a resource group and a workspace. For detailed instructions see [Tutorial: Get started creating your first ML experiment](https://docs.microsoft.com/en-us/azure/machine-learning/tutorial-1st-experiment-sdk-setup).\n",
"\n",
"Please ensure that you have a current version of IPython (>= 7.15) installed.\n",
"\n",
"While this is a standalone notebook, we highly recommend going over the introductory notebooks for RL first.\n",
"- Getting started:\n",
" - [RL using a compute instance with Azure Machine Learning](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/cartpole_ci.ipynb)\n",
" - [RL using Azure Machine Learning compute](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/cartpole_sc.ipynb)\n",
"- [Scaling RL training runs with Azure Machine Learning](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/pong_rllib.ipynb)\n",
"\n",
"## Initialize resources\n",
"\n",
"All required Azure Machine Learning service resources for this tutorial can be set up from Jupyter. This includes:\n",
"\n",
"- Connecting to your existing Azure Machine Learning workspace.\n",
"- Creating an experiment to track runs.\n",
"- Creating remote compute targets for [Ray](https://docs.ray.io/en/latest/index.html).\n",
"\n",
"\n",
"### Azure Machine Learning SDK\n",
"\n",
"Display the Azure Machine Learning SDK version."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"gather": {
"logged": 1646249589452
}
},
"outputs": [],
"source": [
"import azureml.core\n",
"print('Azure Machine Learning SDK version: ', azureml.core.VERSION)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Connect to workspace\n",
"\n",
"Get a reference to an existing Azure Machine Learning workspace."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"gather": {
"logged": 1646250284486
}
},
"outputs": [],
"source": [
"from azureml.core import Workspace\n",
"\n",
"ws = Workspace.from_config()\n",
"print(ws.name, ws.location, ws.resource_group, sep=' | ')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create an experiment\n",
"\n",
"Create an experiment to track the runs in your workspace. A\n",
"workspace can have multiple experiments and each experiment\n",
"can be used to track multiple runs (see [documentation](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.experiment.experiment?view=azure-ml-py)\n",
"for details)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"gather": {
"logged": 1646250342411
}
},
"outputs": [],
"source": [
"from azureml.core import Experiment\n",
"\n",
"exp = Experiment(workspace=ws, name='particle-multiagent')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create or attach an existing compute resource\n",
"\n",
"A compute target is a designated compute resource where you run your training script. For more information, see [What are compute targets in Azure Machine Learning service?](https://docs.microsoft.com/en-us/azure/machine-learning/concept-compute-target).\n",
"\n",
"> Note that if you have an AzureML Data Scientist role, you will not have permission to create compute resources. Talk to your workspace or IT admin to create the compute targets described in this section, if they do not already exist.\n",
"\n",
"#### CPU target for Ray head\n",
"\n",
"In the experiment setup for this tutorial, the Ray head node will\n",
"run on a CPU node (D3 type). A maximum cluster size of 1 node is\n",
"therefore sufficient. If you wish to run multiple experiments in\n",
"parallel using the same CPU cluster, you may elect to increase this\n",
"number. The cluster will automatically scale down to 0 nodes when\n",
"no training jobs are scheduled (see min_nodes).\n",
"\n",
"The code below creates a compute cluster of D3 type nodes.\n",
"If the cluster with the specified name is already in your workspace\n",
"the code will skip the creation process.\n",
"\n",
"**Note: Creation of a compute resource can take several minutes**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"gather": {
"logged": 1646250346756
}
},
"outputs": [],
"source": [
"from azureml.core.compute import AmlCompute, ComputeTarget\n",
"\n",
"cpu_cluster_name = 'cpu-cl-d3'\n",
"\n",
"if cpu_cluster_name in ws.compute_targets:\n",
" cpu_cluster = ws.compute_targets[cpu_cluster_name]\n",
" if cpu_cluster and type(cpu_cluster) is AmlCompute:\n",
" if cpu_cluster.provisioning_state == 'Succeeded':\n",
" print('Found existing compute target for {}. Using it.'.format(cpu_cluster_name))\n",
" else: \n",
" raise Exception('Found existing compute target for {} '.format(cpu_cluster_name)\n",
" + 'but it is in state {}'.format(cpu_cluster.provisioning_state))\n",
"else:\n",
" print('Creating a new compute target for {}...'.format(cpu_cluster_name))\n",
" provisioning_config = AmlCompute.provisioning_configuration(\n",
" vm_size='STANDARD_D3',\n",
" min_nodes=0, \n",
" max_nodes=1)\n",
"\n",
" cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, provisioning_config)\n",
" cpu_cluster.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)\n",
" \n",
" print('Cluster created.')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Training the policies\n",
"\n",
"### Training environment\n",
"\n",
"This tutorial uses a custom docker image\n",
"with the necessary software installed. The [Environment](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-use-environments)\n",
"class stores the configuration for the training environment. The\n",
"docker image is set via `env.docker.base_image`.\n",
"`user_managed_dependencies` is set so that\n",
"the preinstalled Python packages in the image are preserved.\n",
"\n",
"Note that since we want to capture videos of the training runs requiring a display, we set the interpreter_path such that the Python process is started via **xvfb-run**."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"gather": {
"logged": 1646257481631
},
"jupyter": {
"outputs_hidden": false,
"source_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
}
},
"outputs": [],
"source": [
"from azureml.core import Environment\n",
"import os\n",
"\n",
"ray_environment_name = 'particle-cpu'\n",
"ray_environment_dockerfile_path = os.path.join(os.getcwd(), 'docker', 'cpu', 'Dockerfile')\n",
"ray_environment = Environment. \\\n",
" from_dockerfile(name=ray_environment_name, dockerfile=ray_environment_dockerfile_path). \\\n",
" register(workspace=ws)\n",
"ray_cpu_build_details = ray_environment.build(workspace=ws)\n",
"\n",
"ray_cpu_build_details.wait_for_completion(show_output=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Training script\n",
"\n",
"This tutorial uses the multiagent algorithm [Multi-Agent Deep Deterministic Policy Gradient (MADDPG)](https://docs.ray.io/en/latest/rllib-algorithms.html?highlight=maddpg#multi-agent-deep-deterministic-policy-gradient-contrib-maddpg).\n",
"For training policies in a multiagent scenario, Ray's RLlib also\n",
"requires the `multiagent` configuration section to be specified. You\n",
"can find more information in the [common parameters](https://docs.ray.io/en/latest/rllib-training.html?highlight=multiagent#common-parameters)\n",
"documentation.\n",
"\n",
"The stopping criteria are set such that the training run is\n",
"terminated after either a mean reward of -450 is observed, or\n",
"training has run for over 2 hours.\n",
"\n",
"### Submitting a training run\n",
"\n",
"You can submit the training run using a `ScriptRunConfig`. By providing the\n",
"command to run the training, and a `RunConfig` object configured with your\n",
"compute target, number of nodes, and environment image to use.\n",
"\n",
"Note that you can use the same notebook and scripts to experiment with\n",
"different Particle environments. You can find a list of supported\n",
"environments [here](https://github.com/openai/multiagent-particle-envs/tree/master#list-of-environments).\n",
"Simply change the `--scenario` parameter to a supported scenario.\n",
"\n",
"In order to get the best training results, you can also adjust the\n",
"`--final-reward` parameter to determine when to stop training. A greater\n",
"reward means longer running time, but improved results. By default,\n",
"the final reward will be -450, which should show good progress after\n",
"about one hour of run time.\n",
"\n",
"For this notebook, we use a single D3 nodes, giving us a total of 4 CPUs and\n",
"0 GPUs. One CPU is used by the MADDPG trainer, and an additional CPU is\n",
"consumed by the RLlib rollout worker. The other 2 CPUs are not used, though\n",
"smaller node types will run out of memory for this task."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"gather": {
"logged": 1646275371701
},
"jupyter": {
"outputs_hidden": false,
"source_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
}
},
"outputs": [],
"source": [
"from azureml.core import RunConfiguration, ScriptRunConfig, Experiment\n",
"from azureml.core.runconfig import DockerConfiguration, RunConfiguration\n",
"from azureml.widgets import RunDetails\n",
"\n",
"experiment_name = 'particle-multiagent'\n",
"\n",
"experiment = Experiment(workspace=ws, name=experiment_name)\n",
"\n",
"aml_run_config_ml = RunConfiguration(communicator='OpenMpi')\n",
"aml_run_config_ml.target = cpu_cluster\n",
"aml_run_config_ml.node_count = 1\n",
"aml_run_config_ml.environment = ray_environment\n",
"\n",
"config = ScriptRunConfig(source_directory='./files',\n",
" command=[\n",
" 'xvfb-run -s \"-screen 0 640x480x16 -ac +extension GLX +render\" python',\n",
" 'particle_train.py',\n",
" '--scenario', 'simple_spread',\n",
" '--final-reward', '-450'\n",
" ],\n",
" run_config = aml_run_config_ml\n",
" )\n",
"train_run = experiment.submit(config)\n",
"\n",
"RunDetails(train_run).show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Job cancellation\n",
"\n",
"You may cancel the job by uncommenting and running the cell below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# If you wish to cancel the run before it completes, uncomment and execute:\n",
"# train_run.cancel()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Monitoring training progress\n",
"\n",
"### View the Tensorboard\n",
"\n",
"The Tensorboard can be displayed via the Azure Machine Learning\n",
"service's [Tensorboard API](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-monitor-tensorboard).\n",
"When running locally, please make sure to follow the instructions\n",
"in the link and install required packages. Running this cell will output a URL for the Tensorboard.\n",
"\n",
"Note that the training script sets the log directory when\n",
"starting RLlib via the local_dir parameter. ./logs will automatically\n",
"appear in the downloadable files for a run. Since this script is\n",
"executed on the Ray head node run, we need to get a reference to it\n",
"as shown below.\n",
"\n",
"The Tensorboard API will continuously stream logs from the run.\n",
"\n",
"**Note: It may take a couple of minutes after the run is in \"Running\"\n",
"state before Tensorboard files are available and the board will refresh automatically**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# from azureml.tensorboard import Tensorboard\n",
"\n",
"# tb = Tensorboard([train_run])\n",
"# tb.start()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### View training videos\n",
"\n",
"As mentioned above, we record videos of the agents interacting with the\n",
"Particle world. These videos are often a crucial indicator for training\n",
"success. The code below downloads the latest video as it becomes available\n",
"and displays it in-line.\n",
"\n",
"Over time, the agents learn to cooperate and avoid collisions while\n",
"traveling to all landmarks.\n",
"\n",
"**Note: It can take several minutes for a video to appear after the run\n",
"was started.**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from azureml.core import Dataset\n",
"from azureml.data.dataset_error_handling import DatasetValidationError\n",
"\n",
"from IPython.display import clear_output\n",
"from IPython.core.display import display, Video\n",
"\n",
"datastore = ws.datastores['workspaceartifactstore']\n",
"path_prefix = './tmp_videos'\n",
"\n",
"def download_latest_training_video(run, video_checkpoint_counter):\n",
" run_artifacts_path = os.path.join('ExperimentRun', f'dcid.{run.id}', 'logs', 'videos')\n",
" \n",
" try:\n",
" run_artifacts_ds = Dataset.File.from_files(datastore.path(os.path.join(run_artifacts_path, '**')))\n",
" except DatasetValidationError as e:\n",
" # This happens at the start of the run when there is no data available\n",
" # in the run's artifacts\n",
" return None, video_checkpoint_counter\n",
" \n",
" video_files = [file for file in run_artifacts_ds.to_path() if file.endswith('.mp4')]\n",
" if len(video_files) == video_checkpoint_counter:\n",
" return None, video_checkpoint_counter\n",
" \n",
" iteration_numbers = [int(vf[vf.rindex('video') + len('video') : vf.index('.mp4')]) for vf in video_files]\n",
" latest_video = next(vf for vf in video_files if vf.endswith('{num}.mp4'.format(num=max(iteration_numbers))))\n",
" latest_video = os.path.join(run_artifacts_path, os.path.normpath(latest_video[1:]))\n",
" \n",
" datastore.download(\n",
" target_path=path_prefix,\n",
" prefix=latest_video.replace('\\\\', '/'),\n",
" show_progress=False)\n",
" \n",
" return os.path.join(path_prefix, latest_video), len(video_files)\n",
"\n",
"\n",
"def render_video(vf):\n",
" clear_output(wait=True)\n",
" display(Video(data=vf, embed=True, html_attributes='loop autoplay controls width=50%'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import shutil, time\n",
"\n",
"terminal_statuses = ['Canceled', 'Completed', 'Failed']\n",
"video_checkpoint_counter = 0\n",
"\n",
"while train_run.get_status() not in terminal_statuses:\n",
" video_file, video_checkpoint_counter = download_latest_training_video(train_run, video_checkpoint_counter)\n",
" if video_file is not None:\n",
" render_video(video_file)\n",
" \n",
" print('Displaying video number {}'.format(video_checkpoint_counter))\n",
" shutil.rmtree(path_prefix)\n",
" \n",
" # Interrupting the kernel can take up to 15 seconds\n",
" # depending on when time.sleep started\n",
" time.sleep(15)\n",
" \n",
"train_run.wait_for_completion()\n",
"print('The training run has reached a terminal status.')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cleaning up\n",
"\n",
"Below, you can find code snippets for your convenience to clean up any resources created as part of this tutorial you don't wish to retain."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# to stop the Tensorboard, uncomment and run\n",
"# tb.stop()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# to delete the cpu compute target, uncomment and run\n",
"# cpu_cluster.delete()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Next steps\n",
"\n",
"We would love to hear your feedback! Please let us know what you think of Reinforcement Learning in Azure Machine Learning and what features you are looking forward to."
]
}
],
"metadata": {
"authors": [
{
"name": "andress"
}
],
"categories": [
"how-to-use-azureml",
"reinforcement-learning"
],
"interpreter": {
"hash": "13382f70c1d0595120591d2e358c8d446daf961bf951d1fba9a32631e205d5ab"
},
"kernel_info": {
"name": "python38-azureml"
},
"kernelspec": {
"display_name": "Python 3.8 - AzureML",
"language": "python",
"name": "python38-azureml"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.9"
},
"notice": "Copyright (c) Microsoft Corporation. All rights reserved.\u00c3\u0192\u00c2\u00a2\u00c3\u00a2\u00e2\u20ac\u0161\u00c2\u00ac\u00c3\u201a\u00c2\u00afLicensed under the MIT License.\u00c3\u0192\u00c2\u00a2\u00c3\u00a2\u00e2\u20ac\u0161\u00c2\u00ac\u00c3\u201a\u00c2\u00af ",
"nteract": {
"version": "nteract-front-end@1.0.0"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

View File

@@ -1,9 +0,0 @@
name: particle
dependencies:
- pip:
- azureml-sdk
- azureml-contrib-reinforcementlearning
- azureml-widgets
- tensorboard
- azureml-tensorboard
- ipython

View File

@@ -8,7 +8,7 @@ dependencies:
- matplotlib
- azureml-dataset-runtime
- ipywidgets
- raiwidgets~=0.22.0
- raiwidgets~=0.24.0
- liac-arff
- packaging>=20.9
- itsdangerous==2.0.1

View File

@@ -101,7 +101,7 @@
"\n",
"# Check core SDK version number\n",
"\n",
"print(\"This notebook was created using SDK version 1.46.0, you are currently running version\", azureml.core.VERSION)"
"print(\"This notebook was created using SDK version 1.49.0, you are currently running version\", azureml.core.VERSION)"
]
},
{

View File

@@ -6,5 +6,5 @@ dependencies:
- tensorflow
- tqdm
- scipy
- sklearn
- scikit-learn
- setuptools>=41.0.0

View File

@@ -3,5 +3,6 @@ dependencies:
- pip:
- azureml-sdk
- azureml-tensorboard
- tensorboard
- tensorflow
- setuptools>=41.0.0

View File

@@ -139,7 +139,7 @@
"metadata": {},
"outputs": [],
"source": [
"with open('./train.py', 'r') as f:\n",
"with open('./scripts/train.py', 'r') as f:\n",
" print(f.read())"
]
},
@@ -149,7 +149,7 @@
"metadata": {},
"outputs": [],
"source": [
"with open('./mylib.py', 'r') as f:\n",
"with open('./scripts/mylib.py', 'r') as f:\n",
" print(f.read())"
]
},
@@ -203,7 +203,7 @@
"source": [
"from azureml.core import ScriptRunConfig\n",
"\n",
"src = ScriptRunConfig(source_directory='./', script='train.py', environment=user_managed_env)"
"src = ScriptRunConfig(source_directory='./scripts', script='train.py', environment=user_managed_env)"
]
},
{

View File

@@ -176,7 +176,7 @@
"metadata": {},
"outputs": [],
"source": [
"data_urls = ['https://data4mldemo6150520719.blob.core.windows.net/demo/mnist-fashion']\n",
"data_urls = ['https://azuremldownloads.blob.core.windows.net/demo/mnist-fashion']\n",
"fashion_ds = Dataset.File.from_files(data_urls)\n",
"\n",
"# list the files referenced by fashion_ds\n",
@@ -277,7 +277,7 @@
" - azureml-dataset-runtime\n",
" - keras==2.6\n",
" - tensorflow-gpu==2.6\n",
" - numpy\n",
" - numpy==1.23\n",
" - scikit-learn\n",
" - pandas\n",
" - matplotlib\n",

View File

@@ -58,8 +58,6 @@ Machine Learning notebook samples and encourage efficient retrieval of topics an
|Title| Task | Dataset | Training Compute | Deployment Target | ML Framework | Tags |
|:----|:-----|:-------:|:----------------:|:-----------------:|:------------:|:------------:|
| [Distributed Training with Chainer](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/chainer/distributed-chainer/distributed-chainer.ipynb) | Use the Chainer estimator to perform distributed training | MNIST | AML Compute | None | Chainer | None |
| [Train a model with hyperparameter tuning](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/chainer/train-hyperparameter-tune-deploy-with-chainer/train-hyperparameter-tune-deploy-with-chainer.ipynb) | Train a Convolutional Neural Network (CNN) | MNIST | AML Compute | Azure Container Instance | Chainer | None |
| [Train a model with a custom Docker image](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/fastai/fastai-with-custom-docker/fastai-with-custom-docker.ipynb) | Train with custom Docker image | Oxford IIIT Pet | AML Compute | None | Pytorch | None |
| [Train a DNN using hyperparameter tuning and deploying with Keras](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/keras/train-hyperparameter-tune-deploy-with-keras/train-hyperparameter-tune-deploy-with-keras.ipynb) | Create a multi-class classifier | MNIST | AML Compute | Azure Container Instance | TensorFlow | None |
| [Distributed training with PyTorch](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/pytorch/distributed-pytorch-with-distributeddataparallel/distributed-pytorch-with-distributeddataparallel.ipynb) | Train a model using distributed training via PyTorch DistributedDataParallel | CIFAR-10 | AML Compute | None | PyTorch | None |
@@ -67,7 +65,6 @@ Machine Learning notebook samples and encourage efficient retrieval of topics an
| [Training with hyperparameter tuning using PyTorch](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/pytorch/train-hyperparameter-tune-deploy-with-pytorch/train-hyperparameter-tune-deploy-with-pytorch.ipynb) | Train an image classification model using transfer learning with the PyTorch estimator | ImageNet | AML Compute | Azure Container Instance | PyTorch | None |
| [Training and hyperparameter tuning with Scikit-learn](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/scikit-learn/train-hyperparameter-tune-deploy-with-sklearn/train-hyperparameter-tune-deploy-with-sklearn.ipynb) | Train a support vector machine (SVM) to perform classification | Iris | AML Compute | None | Scikit-learn | None |
| [Distributed training using TensorFlow with Horovod](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/tensorflow/distributed-tensorflow-with-horovod/distributed-tensorflow-with-horovod.ipynb) | Use the TensorFlow estimator to train a word2vec model | None | AML Compute | None | TensorFlow | None |
| [Distributed TensorFlow with parameter server](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/tensorflow/distributed-tensorflow-with-parameter-server/distributed-tensorflow-with-parameter-server.ipynb) | Use the TensorFlow estimator to train a model using distributed training | MNIST | AML Compute | None | TensorFlow | None |
| [Hyperparameter tuning and warm start using the TensorFlow estimator](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/tensorflow/hyperparameter-tune-and-warm-start-with-tensorflow/hyperparameter-tune-and-warm-start-with-tensorflow.ipynb) | Train a deep neural network | MNIST | AML Compute | Azure Container Instance | TensorFlow | None |
| [Training and hyperparameter tuning using the TensorFlow estimator](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/tensorflow/train-hyperparameter-tune-deploy-with-tensorflow/train-hyperparameter-tune-deploy-with-tensorflow.ipynb) | Train a deep neural network | MNIST | AML Compute | Azure Container Instance | TensorFlow | None |
| [Resuming a model](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/tensorflow/train-tensorflow-resume-training/train-tensorflow-resume-training.ipynb) | Resume a model in TensorFlow from a previously submitted run | MNIST | AML Compute | None | TensorFlow | None |
@@ -141,7 +138,6 @@ Machine Learning notebook samples and encourage efficient retrieval of topics an
| [pong_rllib](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/reinforcement-learning/atari-on-distributed-compute/pong_rllib.ipynb) | | | | | | |
| [cartpole_ci](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/reinforcement-learning/cartpole-on-compute-instance/cartpole_ci.ipynb) | | | | | | |
| [cartpole_sc](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/reinforcement-learning/cartpole-on-single-compute/cartpole_sc.ipynb) | | | | | | |
| [particle](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/reinforcement-learning/multiagent-particle-envs/particle.ipynb) | | | | | | |
| [rai-loan-decision](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/responsible-ai/visualize-upload-loan-decision/rai-loan-decision.ipynb) | | | | | | |
| [Logging APIs](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/track-and-monitor-experiments/logging-api/logging-api.ipynb) | Logging APIs and analyzing results | None | None | None | None | None |
| [configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master//setup-environment/configuration.ipynb) | | | | | | |

View File

@@ -102,7 +102,7 @@
"source": [
"import azureml.core\n",
"\n",
"print(\"This notebook was created using version 1.46.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.49.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},

View File

@@ -2,7 +2,7 @@ name: quickstart-azureml-in-10mins
dependencies:
- pip:
- azureml-sdk
- sklearn
- scikit-learn
- numpy
- matplotlib
- joblib

View File

@@ -151,7 +151,7 @@
"# use a curated environment that has already been built for you\n",
"\n",
"env = Environment.get(workspace=ws, \n",
" name=\"AzureML-sklearn-0.24-ubuntu18.04-py37-cpu\")"
" name=\"AzureML-sklearn-1.0-ubuntu20.04-py38-cpu\")"
]
},
{

View File

@@ -2,7 +2,7 @@ name: quickstart-azureml-python-sdk
dependencies:
- pip:
- azureml-sdk
- sklearn
- scikit-learn
- numpy
- matplotlib
- joblib

View File

@@ -2,5 +2,5 @@ name: tutorial-1st-experiment-sdk-train
dependencies:
- pip:
- azureml-sdk
- sklearn
- scikit-learn
- azureml-opendatasets

View File

@@ -431,7 +431,7 @@
"\n",
"# to install required packages\n",
"env = Environment('tutorial-env')\n",
"cd = CondaDependencies.create(pip_packages=['azureml-dataset-runtime[pandas,fuse]', 'azureml-defaults'], conda_packages = ['scikit-learn==0.22.1'])\n",
"cd = CondaDependencies.create(pip_packages=['azureml-dataset-runtime[pandas,fuse]', 'azureml-defaults'], conda_packages = ['scikit-learn==0.22.1', 'numpy==1.23'])\n",
"\n",
"env.python.conda_dependencies = cd\n",
"\n",

View File

@@ -4,7 +4,7 @@ dependencies:
- azureml-sdk
- azureml-widgets
- matplotlib
- sklearn
- scikit-learn
- pandas
- azureml-opendatasets
- azureml-widgets

View File

@@ -3,6 +3,6 @@ dependencies:
- pip:
- azureml-sdk
- matplotlib
- sklearn
- scikit-learn
- pandas
- azureml-opendatasets

Some files were not shown because too many files have changed in this diff Show More