update samples from Release-162 as a part of 1.48.0 SDK stable release

This commit is contained in:
amlrelsa-ms
2022-12-05 15:58:23 +00:00
parent 38d5743bbb
commit 123a38bc3a
42 changed files with 182 additions and 809 deletions

View File

@@ -103,7 +103,7 @@
"source": [
"import azureml.core\n",
"\n",
"print(\"This notebook was created using version 1.47.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.48.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},
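Where the notebooks print the SDK version, a programmatic guard can catch a stale environment before any later cell fails. A minimal sketch, not part of this diff, assuming the `packaging` package is installed; the 1.48.0 pin mirrors this release:

import azureml.core
from packaging.version import Version

EXPECTED = "1.48.0"  # the SDK version these notebooks were authored against
if Version(azureml.core.VERSION) < Version(EXPECTED):
    print(f"SDK {azureml.core.VERSION} is older than {EXPECTED}; "
          f"consider: pip install --upgrade azureml-sdk=={EXPECTED}")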

View File

@@ -398,7 +398,7 @@
"# run_config.target = gpu_cluster_name\n",
"# run_config.environment.docker.enabled = True\n",
"# run_config.environment.docker.gpu_support = True\n",
"# run_config.environment.docker.base_image = \"rapidsai/rapidsai:cuda9.2-runtime-ubuntu18.04\"\n",
"# run_config.environment.docker.base_image = \"rapidsai/rapidsai:cuda9.2-runtime-ubuntu20.04\"\n",
"# # run_config.environment.docker.base_image_registry.address = '<registry_url>' # not required if the base_image is in Docker hub\n",
"# # run_config.environment.docker.base_image_registry.username = '<user_name>' # needed only for private images\n",
"# # run_config.environment.docker.base_image_registry.password = '<password>' # needed only for private images\n",

View File

@@ -6,7 +6,7 @@ dependencies:
- fairlearn>=0.6.2
- joblib
- liac-arff
- raiwidgets~=0.22.0
- raiwidgets~=0.23.0
- itsdangerous==2.0.1
- markupsafe<2.1.0
- protobuf==3.20.0

View File

@@ -6,7 +6,7 @@ dependencies:
- fairlearn>=0.6.2
- joblib
- liac-arff
- raiwidgets~=0.22.0
- raiwidgets~=0.23.0
- itsdangerous==2.0.1
- markupsafe<2.1.0
- protobuf==3.20.0

View File

@@ -5,32 +5,17 @@ channels:
- main
dependencies:
# The python interpreter version.
# Currently Azure ML only supports 3.6.0 and later.
- pip==20.2.4
- python>=3.6,<3.9
- matplotlib==3.2.1
- py-xgboost==1.3.3
- pytorch::pytorch=1.11.0
- conda-forge::fbprophet==0.7.1
- cudatoolkit=10.1.243
- scipy==1.5.3
- notebook
- PySocks==1.7.1
- conda-forge::pyqt==5.12.3
- jinja2<=2.11.2
- markupsafe<2.1.0
- tqdm==4.64.1
- jsonschema==4.16.0
- websocket-client==1.4.1
# Azure ML only supports 3.7.0 and later.
- pip==22.3.1
- python>=3.7,<3.9
- pip:
# Required packages for AzureML execution, history, and data preparation.
- azureml-widgets~=1.47.0
- azureml-defaults~=1.47.0
- pytorch-transformers==1.0.0
- spacy==2.2.4
- pystan==2.19.1.1
- https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.47.0/validated_win32_requirements.txt [--no-deps]
- azureml-widgets~=latest
- azureml-defaults~=latest
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/latest/validated_win32_requirements.txt [--no-deps]
- matplotlib==3.6.2
- xgboost==1.3.3
- arch==4.14
- wasabi==0.9.1
- mlflow-skinny==1.30.0
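If you maintain a copy of this environment file, one way to consume the updated spec from the SDK is `Environment.from_conda_specification`; a sketch, assuming the file above is saved as automl_env.yml:

from azureml.core import Environment

automl_env = Environment.from_conda_specification(
    name="azure_automl",         # hypothetical environment name
    file_path="automl_env.yml",  # assumes the spec above is saved under this name
)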

View File

@@ -5,9 +5,9 @@ channels:
- main
dependencies:
# The python interpreter version.
# Currently Azure ML only supports 3.6.0 and later.
# Azure ML only supports 3.7 and later.
- pip==20.1.1
- python>=3.6,<3.9
- python>=3.7,<3.9
- matplotlib==3.2.1
- numpy>=1.21.6,<=1.22.3
- cython==0.29.14
@@ -25,11 +25,11 @@ dependencies:
- pip:
# Required packages for AzureML execution, history, and data preparation.
- azureml-widgets~=1.47.0
- azureml-defaults~=1.47.0
- azureml-widgets~=1.48.0
- azureml-defaults~=1.48.0
- pytorch-transformers==1.0.0
- spacy==2.2.4
- pystan==2.19.1.1
- https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.47.0/validated_linux_requirements.txt [--no-deps]
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.48.0/validated_linux_requirements.txt [--no-deps]
- arch==4.14

View File

@@ -5,9 +5,9 @@ channels:
- main
dependencies:
# The python interpreter version.
# Currently Azure ML only supports 3.6.0 and later.
# Currently Azure ML only supports 3.7 and later.
- pip==20.1.1
- python>=3.6,<3.9
- python>=3.7,<3.9
- matplotlib==3.2.1
- numpy>=1.21.6,<=1.22.3
- cython==0.29.14
@@ -25,11 +25,11 @@ dependencies:
- pip:
# Required packages for AzureML execution, history, and data preparation.
- azureml-widgets~=1.47.0
- azureml-defaults~=1.47.0
- azureml-widgets~=1.48.0
- azureml-defaults~=1.48.0
- pytorch-transformers==1.0.0
- spacy==2.2.4
- pystan==2.19.1.1
- https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.47.0/validated_darwin_requirements.txt [--no-deps]
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.48.0/validated_darwin_requirements.txt [--no-deps]
- arch==4.14

View File

@@ -97,7 +97,7 @@
"metadata": {},
"outputs": [],
"source": [
"print(\"This notebook was created using version 1.47.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.48.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},

View File

@@ -97,7 +97,7 @@
"metadata": {},
"outputs": [],
"source": [
"print(\"This notebook was created using version 1.47.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.48.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},

View File

@@ -1,23 +1,15 @@
name: azure_automl_experimental
dependencies:
# The python interpreter version.
# Currently Azure ML only supports 3.6.0 and later.
- pip<=20.2.4
- python>=3.6.0,<3.10
- cython==0.29.14
- urllib3==1.26.7
- PyJWT < 2.0.0
- numpy==1.22.3
- pywin32==227
# Currently Azure ML only supports 3.7.0 and later.
- pip<=22.3.1
- python>=3.7.0,<3.10
- pip:
# Required packages for AzureML execution, history, and data preparation.
- azure-core==1.24.1
- azure-identity==1.7.0
- azureml-defaults
- azureml-sdk
- azureml-widgets
- azureml-mlflow
- pandas
- mlflow
- docker<6.0.0

View File

@@ -92,7 +92,7 @@
"metadata": {},
"outputs": [],
"source": [
"print(\"This notebook was created using version 1.47.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.48.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},

View File

@@ -91,7 +91,7 @@
"metadata": {},
"outputs": [],
"source": [
"print(\"This notebook was created using version 1.47.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.48.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},

View File

@@ -387,6 +387,7 @@
"| **node_count** | The number of compute nodes to be used for running the user script. We recommend to start with 3 and increase the node_count if the training time is taking too long. |\n",
"| **process_count_per_node** | Process count per node, we recommend 2:1 ratio for number of cores: number of processes per node. eg. If node has 16 cores then configure 8 or less process count per node or optimal performance. |\n",
"| **train_pipeline_parameters** | The set of configuration parameters defined in the previous section. |\n",
"| **run_invocation_timeout** | Maximum amount of time in seconds that the ``ParallelRunStep`` class is allowed. This is optional but provides customers with greater control on exit criteria. This must be greater than ``experiment_timeout_hours`` by at least 300 seconds. |\n",
"\n",
"Calling this method will create a new aggregated dataset which is generated dynamically on pipeline execution."
]
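For context, these parameters are the ones passed to the many models train-step builder earlier in the notebook. A hedged sketch under assumed names (`train_data` and `mm_parameters` come from the surrounding notebook):

from azureml.contrib.automl.pipeline.steps import AutoMLPipelineBuilder

train_steps = AutoMLPipelineBuilder.get_many_models_train_steps(
    experiment=experiment,
    train_data=train_data,                 # assumed partitioned training dataset
    compute_target=compute_target,
    node_count=3,
    process_count_per_node=8,
    run_invocation_timeout=3900,           # ~300 s more than a 1 h experiment_timeout_hours
    train_pipeline_parameters=mm_parameters,
)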
@@ -529,6 +530,8 @@
" target_column_name=TARGET_COLNAME,\n",
")\n",
"\n",
"output_file_name = \"parallel_run_step.csv\"\n",
"\n",
"inference_steps = AutoMLPipelineBuilder.get_many_models_batch_inference_steps(\n",
" experiment=experiment,\n",
" inference_data=test_data,\n",
@@ -540,6 +543,7 @@
" train_run_id=training_run.id,\n",
" train_experiment_name=training_run.experiment.name,\n",
" inference_pipeline_parameters=mm_parameters,\n",
" append_row_file_name=output_file_name,\n",
")"
]
},
@@ -587,18 +591,21 @@
"source": [
"from azureml.contrib.automl.pipeline.steps.utilities import get_output_from_mm_pipeline\n",
"\n",
"PREDICTION_COLNAME = \"Predictions\"\n",
"forecasting_results_name = \"forecasting_results\"\n",
"forecasting_output_name = \"many_models_inference_output\"\n",
"forecast_file = get_output_from_mm_pipeline(\n",
" inference_run, forecasting_results_name, forecasting_output_name\n",
" inference_run, forecasting_results_name, forecasting_output_name, output_file_name\n",
")\n",
"df = pd.read_csv(forecast_file, delimiter=\" \", header=None, parse_dates=[0])\n",
"df.columns = list(X_train.columns) + [\"predicted_level\"]\n",
"df = pd.read_csv(forecast_file, parse_dates=[0])\n",
"print(\n",
" \"Prediction has \", df.shape[0], \" rows. Here the first 10 rows are being displayed.\"\n",
")\n",
"# Save the scv file with header to read it in the next step.\n",
"df.rename(columns={TARGET_COLNAME: \"actual_level\"}, inplace=True)\n",
"# Save the csv file to read it in the next step.\n",
"df.rename(\n",
" columns={TARGET_COLNAME: \"actual_level\", PREDICTION_COLNAME: \"predicted_level\"},\n",
" inplace=True,\n",
")\n",
"df.to_csv(os.path.join(forecasting_results_name, \"forecast.csv\"), index=False)\n",
"df.head(10)"
]

View File

@@ -251,8 +251,16 @@
"source": [
"### Set up training parameters\n",
"\n",
"This dictionary defines the AutoML and hierarchy settings. For this forecasting task we need to define several settings inncluding the name of the time column, the maximum forecast horizon, the hierarchy definition, and the level of the hierarchy at which to train.\n",
"We need to provide ``ForecastingParameters``, ``AutoMLConfig`` and ``HTSTrainParameters`` objects. For the forecasting task we need to define several settings including the name of the time column, the maximum forecast horizon, the hierarchy definition, and the level of the hierarchy at which to train.\n",
"\n",
"#### ``ForecastingParameters`` arguments\n",
"| Property | Description|\n",
"| :--------------- | :------------------- |\n",
"| **forecast_horizon** | The forecast horizon is how many periods forward you would like to forecast. This integer horizon is in units of the timeseries frequency (e.g. daily, weekly). Periods are inferred from your data. |\n",
"| **time_column_name** | The name of your time column. |\n",
"| **time_series_id_column_names** | The column names used to uniquely identify timeseries in data that has multiple rows with the same timestamp. |\n",
"\n",
"#### ``AutoMLConfig`` arguments\n",
"| Property | Description|\n",
"| :--------------- | :------------------- |\n",
"| **task** | forecasting |\n",
@@ -262,18 +270,21 @@
"| **iterations** | Number of models to train. This is optional but provides customers with greater control on exit criteria. |\n",
"| **experiment_timeout_hours** | Maximum amount of time in hours that the experiment can take before it terminates. This is optional but provides customers with greater control on exit criteria. |\n",
"| **label_column_name** | The name of the label column. |\n",
"| **forecast_horizon** | The forecast horizon is how many periods forward you would like to forecast. This integer horizon is in units of the timeseries frequency (e.g. daily, weekly). Periods are inferred from your data. |\n",
"|**n_cross_validations**|Number of cross-validation folds to use for model/pipeline selection. The default value is \"auto\", in which case AutoMl determines the number of cross-validations automatically, if a validation set is not provided. Or users could specify an integer value.\n",
"|**cv_step_size**|Number of periods between two consecutive cross-validation folds. The default value is \"auto\", in which case AutoMl determines the cross-validation step size automatically, if a validation set is not provided. Or users could specify an integer value.\n",
"| **n_cross_validations** | Number of cross-validation folds to use for model/pipeline selection. The default value is \\\"auto\\\", in which case AutoMl determines the number of cross-validations automatically, if a validation set is not provided. Or users could specify an integer value. |\n",
"| **cv_step_size** | Number of periods between two consecutive cross-validation folds. The default value is \\\"auto\\\", in which case AutoMl determines the cross-validation step size automatically, if a validation set is not provided. Or users could specify an integer value. |\n",
"| **enable_early_stopping** | Flag to enable early termination if the score is not improving in the short term. |\n",
"| **time_column_name** | The name of your time column. |\n",
"| **hierarchy_column_names** | The names of columns that define the hierarchical structure of the data from highest level to most granular. |\n",
"| **training_level** | The level of the hierarchy to be used for training models. |\n",
"| **enable_engineered_explanations** | Engineered feature explanations will be downloaded if enable_engineered_explanations flag is set to True. By default it is set to False to save storage space. |\n",
"| **time_series_id_column_name** | The column names used to uniquely identify timeseries in data that has multiple rows with the same timestamp. |\n",
"| **track_child_runs** | Flag to disable tracking of child runs. Only best run is tracked if the flag is set to False (this includes the model and metrics of the run). |\n",
"| **pipeline_fetch_max_batch_size** | Determines how many pipelines (training algorithms) to fetch at a time for training, this helps reduce throttling when training at large scale. |\n",
"| **model_explainability** | Flag to disable explaining the best automated ML model at the end of all training iterations. The default is True and will block non-explainable models which may impact the forecast accuracy. For more information, see [Interpretability: model explanations in automated machine learning](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-machine-learning-interpretability-automl). |"
"| **model_explainability** | Flag to disable explaining the best automated ML model at the end of all training iterations. The default is True and will block non-explainable models which may impact the forecast accuracy. For more information, see [Interpretability: model explanations in automated machine learning](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-machine-learning-interpretability-automl). |\n",
"\n",
"#### ``HTSTrainParameters`` arguments\n",
"| Property | Description|\n",
"| :--------------- | :------------------- |\n",
"| **automl_settings** | ``AutoMLConfig`` object.\n",
"| **hierarchy_column_names** | The names of columns that define the hierarchical structure of the data from highest level to most granular. |\n",
"| **training_level** | The level of the hierarchy to be used for training models. |\n",
"| **enable_engineered_explanations** | The switch controls engineered explanations. |"
]
},
{
@@ -287,6 +298,9 @@
"outputs": [],
"source": [
"from azureml.train.automl.runtime._hts.hts_parameters import HTSTrainParameters\n",
"from azureml.automl.core.forecasting_parameters import ForecastingParameters\n",
"from azureml.train.automl.automlconfig import AutoMLConfig\n",
"\n",
"\n",
"model_explainability = True\n",
"\n",
@@ -300,24 +314,26 @@
"label_column_name = \"quantity\"\n",
"forecast_horizon = 7\n",
"\n",
"forecasting_parameters = ForecastingParameters(\n",
" time_column_name=time_column_name,\n",
" forecast_horizon=forecast_horizon,\n",
")\n",
"\n",
"automl_settings = {\n",
" \"task\": \"forecasting\",\n",
" \"primary_metric\": \"normalized_root_mean_squared_error\",\n",
" \"label_column_name\": label_column_name,\n",
" \"time_column_name\": time_column_name,\n",
" \"forecast_horizon\": forecast_horizon,\n",
" \"hierarchy_column_names\": hierarchy,\n",
" \"hierarchy_training_level\": training_level,\n",
" \"track_child_runs\": False,\n",
" \"pipeline_fetch_max_batch_size\": 15,\n",
" \"model_explainability\": model_explainability,\n",
" \"n_cross_validations\": \"auto\", # Feel free to set to a small integer (>=2) if runtime is an issue.\n",
" \"cv_step_size\": \"auto\",\n",
"automl_settings = AutoMLConfig(\n",
" task=\"forecasting\",\n",
" primary_metric=\"normalized_root_mean_squared_error\",\n",
" experiment_timeout_hours=1,\n",
" label_column_name=label_column_name,\n",
" track_child_runs=False,\n",
" forecasting_parameters=forecasting_parameters,\n",
" pipeline_fetch_max_batch_size=15,\n",
" model_explainability=model_explainability,\n",
" n_cross_validations=\"auto\", # Feel free to set to a small integer (>=2) if runtime is an issue.\n",
" cv_step_size=\"auto\",\n",
" # The following settings are specific to this sample and should be adjusted according to your own needs.\n",
" \"iteration_timeout_minutes\": 10,\n",
" \"iterations\": 10,\n",
"}\n",
" iteration_timeout_minutes=10,\n",
" iterations=15,\n",
")\n",
"\n",
"hts_parameters = HTSTrainParameters(\n",
" automl_settings=automl_settings,\n",
@@ -345,6 +361,7 @@
"* **node_count:** The number of compute nodes to be used for running the user script. We recommend to start with 3 and increase the node_count if the training time is taking too long.\n",
"* **process_count_per_node:** Process count per node, we recommend 2:1 ratio for number of cores: number of processes per node. eg. If node has 16 cores then configure 8 or less process count per node or optimal performance.\n",
"* **train_pipeline_parameters:** The set of configuration parameters defined in the previous section. \n",
"* **run_invocation_timeout:** Maximum amount of time in seconds that the ``ParallelRunStep`` class is allowed. This is optional but provides customers with greater control on exit criteria. This must be greater than ``experiment_timeout_hours`` by at least 300 seconds.\n",
"\n",
"Calling this method will create a new aggregated dataset which is generated dynamically on pipeline execution."
]
@@ -635,12 +652,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.13"
},
"vscode": {
"interpreter": {
"hash": "6db9c8d9f0cce2d9127e384e15560d42c3b661994c9f717d0553d1d8985ab1ea"
}
"version": "3.6.8"
}
},
"nbformat": 4,

View File

@@ -379,8 +379,16 @@
"source": [
"### Set up training parameters\n",
"\n",
"This dictionary defines the AutoML and many models settings. For this forecasting task we need to define several settings inncluding the name of the time column, the maximum forecast horizon, and the partition column name definition.\n",
"We need to provide ``ForecastingParameters``, ``AutoMLConfig`` and ``ManyModelsTrainParameters`` objects. For the forecasting task we also need to define several settings including the name of the time column, the maximum forecast horizon, and the partition column name definition.\n",
"\n",
"#### ``ForecastingParameters`` arguments\n",
"| Property | Description|\n",
"| :--------------- | :------------------- |\n",
"| **forecast_horizon** | The forecast horizon is how many periods forward you would like to forecast. This integer horizon is in units of the timeseries frequency (e.g. daily, weekly). Periods are inferred from your data. |\n",
"| **time_column_name** | The name of your time column. |\n",
"| **time_series_id_column_names** | The column names used to uniquely identify timeseries in data that has multiple rows with the same timestamp. |\n",
"\n",
"#### ``AutoMLConfig`` arguments\n",
"| Property | Description|\n",
"| :--------------- | :------------------- |\n",
"| **task** | forecasting |\n",
@@ -390,13 +398,10 @@
"| **iterations** | Number of models to train. This is optional but provides customers with greater control on exit criteria. |\n",
"| **experiment_timeout_hours** | Maximum amount of time in hours that the experiment can take before it terminates. This is optional but provides customers with greater control on exit criteria. |\n",
"| **label_column_name** | The name of the label column. |\n",
"| **forecast_horizon** | The forecast horizon is how many periods forward you would like to forecast. This integer horizon is in units of the timeseries frequency (e.g. daily, weekly). Periods are inferred from your data. |\n",
"| **n_cross_validations** | Number of cross validation splits. The default value is \"auto\", in which case AutoMl determines the number of cross-validations automatically, if a validation set is not provided. Or users could specify an integer value. Rolling Origin Validation is used to split time-series in a temporally consistent way. |\n",
"|**cv_step_size**|Number of periods between two consecutive cross-validation folds. The default value is \"auto\", in which case AutoMl determines the cross-validation step size automatically, if a validation set is not provided. Or users could specify an integer value.\n",
"| **n_cross_validations** | Number of cross validation splits. The default value is \\\"auto\\\", in which case AutoMl determines the number of cross-validations automatically, if a validation set is not provided. Or users could specify an integer value. Rolling Origin Validation is used to split time-series in a temporally consistent way. |\n",
"| **cv_step_size** |Number of periods between two consecutive cross-validation folds. The default value is \\\"auto\\\", in which case AutoMl determines the cross-validation step size automatically, if a validation set is not provided. Or users could specify an integer value. |\n",
"| **enable_early_stopping** | Flag to enable early termination if the score is not improving in the short term. |\n",
"| **time_column_name** | The name of your time column. |\n",
"| **enable_engineered_explanations** | Engineered feature explanations will be downloaded if enable_engineered_explanations flag is set to True. By default it is set to False to save storage space. |\n",
"| **time_series_id_column_names** | The column names used to uniquely identify timeseries in data that has multiple rows with the same timestamp. |\n",
"| **track_child_runs** | Flag to disable tracking of child runs. Only best run is tracked if the flag is set to False (this includes the model and metrics of the run). |\n",
"| **pipeline_fetch_max_batch_size** | Determines how many pipelines (training algorithms) to fetch at a time for training, this helps reduce throttling when training at large scale. |\n",
"| **partition_column_names** | The names of columns used to group your models. For timeseries, the groups must not split up individual time-series. That is, each group must contain one or more whole time-series. |"
@@ -415,23 +420,30 @@
"from azureml.train.automl.runtime._many_models.many_models_parameters import (\n",
" ManyModelsTrainParameters,\n",
")\n",
"from azureml.automl.core.forecasting_parameters import ForecastingParameters\n",
"from azureml.train.automl.automlconfig import AutoMLConfig\n",
"\n",
"partition_column_names = [\"Store\", \"Brand\"]\n",
"automl_settings = {\n",
" \"task\": \"forecasting\",\n",
" \"primary_metric\": \"normalized_root_mean_squared_error\",\n",
" \"iteration_timeout_minutes\": 10, # This needs to be changed based on the dataset. We ask customer to explore how long training is taking before settings this value\n",
" \"iterations\": 15,\n",
" \"experiment_timeout_hours\": 0.25,\n",
" \"label_column_name\": \"Quantity\",\n",
" \"n_cross_validations\": \"auto\", # Feel free to set to a small integer (>=2) if runtime is an issue.\n",
" \"cv_step_size\": \"auto\",\n",
" \"time_column_name\": \"WeekStarting\",\n",
" \"drop_column_names\": \"Revenue\",\n",
" \"forecast_horizon\": 6,\n",
" \"time_series_id_column_names\": partition_column_names,\n",
" \"track_child_runs\": False,\n",
"}\n",
"\n",
"forecasting_parameters = ForecastingParameters(\n",
" time_column_name=\"WeekStarting\",\n",
" drop_column_names=\"Revenue\",\n",
" forecast_horizon=6,\n",
" time_series_id_column_names=partition_column_names,\n",
")\n",
"\n",
"automl_settings = AutoMLConfig(\n",
" task=\"forecasting\",\n",
" primary_metric=\"normalized_root_mean_squared_error\",\n",
" iteration_timeout_minutes=10,\n",
" iterations=15,\n",
" experiment_timeout_hours=0.25,\n",
" label_column_name=\"Quantity\",\n",
" n_cross_validations=\"auto\", # Feel free to set to a small integer (>=2) if runtime is an issue.\n",
" cv_step_size=\"auto\",\n",
" track_child_runs=False,\n",
" forecasting_parameters=forecasting_parameters,\n",
")\n",
"\n",
"mm_paramters = ManyModelsTrainParameters(\n",
" automl_settings=automl_settings, partition_column_names=partition_column_names\n",
@@ -498,6 +510,7 @@
"| **node_count** | The number of compute nodes to be used for running the user script. We recommend to start with 3 and increase the node_count if the training time is taking too long. |\n",
"| **process_count_per_node** | Process count per node, we recommend 2:1 ratio for number of cores: number of processes per node. eg. If node has 16 cores then configure 8 or less process count per node or optimal performance. |\n",
"| **train_pipeline_parameters** | The set of configuration parameters defined in the previous section. |\n",
"| **run_invocation_timeout** | Maximum amount of time in seconds that the ``ParallelRunStep`` class is allowed. This is optional but provides customers with greater control on exit criteria. This must be greater than ``experiment_timeout_hours`` by at least 300 seconds. |\n",
"\n",
"Calling this method will create a new aggregated dataset which is generated dynamically on pipeline execution."
]
@@ -667,9 +680,9 @@
"| :--------------- | :------------------- |\n",
"| **experiment** | The experiment used for inference run. |\n",
"| **inference_data** | The data to use for inferencing. It should be the same schema as used for training.\n",
"| **compute_target** The compute target that runs the inference pipeline.|\n",
"| **compute_target** | The compute target that runs the inference pipeline. |\n",
"| **node_count** | The number of compute nodes to be used for running the user script. We recommend to start with the number of cores per node (varies by compute sku). |\n",
"| **process_count_per_node** The number of processes per node.\n",
"| **process_count_per_node** | The number of processes per node (should be at most half of the number of cores of the compute cluster that will be used for the experiment).\n",
"| **train_run_id** | \\[Optional] The run id of the hierarchy training, by default it is the latest successful training many model run in the experiment. |\n",
"| **train_experiment_name** | \\[Optional] The train experiment that contains the train pipeline. This one is only needed when the train pipeline is not in the same experiement as the inference pipeline. |\n",
"| **process_count_per_node** | \\[Optional] The number of processes per node, by default it's 4. |"
@@ -692,6 +705,8 @@
" target_column_name=\"Quantity\",\n",
")\n",
"\n",
"output_file_name = \"parallel_run_step.csv\"\n",
"\n",
"inference_steps = AutoMLPipelineBuilder.get_many_models_batch_inference_steps(\n",
" experiment=experiment,\n",
" inference_data=inference_ds_small,\n",
@@ -703,6 +718,8 @@
" train_run_id=training_run.id,\n",
" train_experiment_name=training_run.experiment.name,\n",
" inference_pipeline_parameters=mm_parameters,\n",
" append_row_file_name=output_file_name,\n",
" arguments=[\"--forecast_quantiles\", 0.1, 0.9],\n",
")"
]
},
@@ -737,7 +754,7 @@
"\n",
"The following code snippet:\n",
"1. Downloads the contents of the output folder that is passed in the parallel run step \n",
"2. Reads the parallel_run_step.txt file that has the predictions as pandas dataframe and \n",
"2. Reads the output file that has the predictions as pandas dataframe and \n",
"3. Displays the top 10 rows of the predictions"
]
},
@@ -752,19 +769,9 @@
"forecasting_results_name = \"forecasting_results\"\n",
"forecasting_output_name = \"many_models_inference_output\"\n",
"forecast_file = get_output_from_mm_pipeline(\n",
" inference_run, forecasting_results_name, forecasting_output_name\n",
" inference_run, forecasting_results_name, forecasting_output_name, output_file_name\n",
")\n",
"df = pd.read_csv(forecast_file, delimiter=\" \", header=None)\n",
"df.columns = [\n",
" \"Week Starting\",\n",
" \"Store\",\n",
" \"Brand\",\n",
" \"Quantity\",\n",
" \"Advert\",\n",
" \"Price\",\n",
" \"Revenue\",\n",
" \"Predicted\",\n",
"]\n",
"df = pd.read_csv(forecast_file)\n",
"print(\n",
" \"Prediction has \", df.shape[0], \" rows. Here the first 10 rows are being displayed.\"\n",
")\n",

View File

@@ -859,8 +859,8 @@
"outputs": [],
"source": [
"%matplotlib inline\n",
"test_pred = plt.scatter(y_test, y_pred_test, color=\"\")\n",
"test_test = plt.scatter(y_test, y_test, color=\"g\")\n",
"test_pred = plt.scatter(y_test, y_pred_test, c=[\"b\"])\n",
"test_test = plt.scatter(y_test, y_test, c=[\"g\"])\n",
"plt.legend(\n",
" (test_pred, test_test), (\"prediction\", \"truth\"), loc=\"upper left\", fontsize=8\n",
")\n",

View File

@@ -422,8 +422,8 @@
"outputs": [],
"source": [
"%matplotlib inline\n",
"test_pred = plt.scatter(y_test, y_pred_test, color=\"\")\n",
"test_test = plt.scatter(y_test, y_test, color=\"g\")\n",
"test_pred = plt.scatter(y_test, y_pred_test, c=[\"b\"])\n",
"test_test = plt.scatter(y_test, y_test, c=[\"g\"])\n",
"plt.legend(\n",
" (test_pred, test_test), (\"prediction\", \"truth\"), loc=\"upper left\", fontsize=8\n",
")\n",

View File

@@ -106,7 +106,7 @@
"metadata": {},
"outputs": [],
"source": [
"print(\"This notebook was created using version 1.47.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.48.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},
@@ -235,6 +235,7 @@
"# Note: this is to pin the pandas and xgboost versions to be same as notebook.\n",
"# In production scenario user would choose their dependencies\n",
"import pkg_resources\n",
"from distutils.version import LooseVersion\n",
"available_packages = pkg_resources.working_set\n",
"pandas_ver = None\n",
"numpy_ver = None\n",
@@ -242,7 +243,10 @@
" if dist.key == 'pandas':\n",
" pandas_ver = dist.version\n",
" if dist.key == 'numpy':\n",
" if LooseVersion(dist.version) >= LooseVersion('1.20.0'):\n",
" numpy_ver = dist.version\n",
" else:\n",
" numpy_ver = '1.21.6'\n",
"pandas_dep = 'pandas'\n",
"numpy_dep = 'numpy'\n",
"if pandas_ver:\n",
@@ -252,7 +256,7 @@
"\n",
"# Note: we build shap at commit 690245 for Tesla K80 GPUs\n",
"env.docker.base_dockerfile = f\"\"\"\n",
"FROM nvidia/cuda:10.2-devel-ubuntu18.04\n",
"FROM nvidia/cuda:10.2-devel-ubuntu20.04\n",
"ENV PATH=\"/root/miniconda3/bin:${{PATH}}\"\n",
"ARG PATH=\"/root/miniconda3/bin:${{PATH}}\"\n",
"RUN apt-get update && \\\n",

View File

@@ -10,7 +10,7 @@ dependencies:
- ipython
- matplotlib
- ipywidgets
- raiwidgets~=0.22.0
- raiwidgets~=0.23.0
- itsdangerous==2.0.1
- markupsafe<2.1.0
- scipy>=1.5.3

View File

@@ -10,7 +10,7 @@ dependencies:
- matplotlib
- azureml-dataset-runtime
- ipywidgets
- raiwidgets~=0.22.0
- raiwidgets~=0.23.0
- itsdangerous==2.0.1
- markupsafe<2.1.0
- scipy>=1.5.3

View File

@@ -9,7 +9,7 @@ dependencies:
- ipython
- matplotlib
- ipywidgets
- raiwidgets~=0.22.0
- raiwidgets~=0.23.0
- packaging>=20.9
- itsdangerous==2.0.1
- markupsafe<2.1.0

View File

@@ -9,7 +9,7 @@ dependencies:
- ipython
- matplotlib
- ipywidgets
- raiwidgets~=0.22.0
- raiwidgets~=0.23.0
- packaging>=20.9
- itsdangerous==2.0.1
- markupsafe<2.1.0

View File

@@ -11,7 +11,7 @@ dependencies:
- azureml-dataset-runtime
- azureml-core
- ipywidgets
- raiwidgets~=0.22.0
- raiwidgets~=0.23.0
- itsdangerous==2.0.1
- markupsafe<2.1.0
- scipy>=1.5.3

View File

@@ -330,7 +330,7 @@
"- **inputs:** List of input connections for data consumed by this step. Fetch this inside the notebook using dbutils.widgets.get(\"input\")\n",
"- **outputs:** List of output port definitions for outputs produced by this step. Fetch this inside the notebook using dbutils.widgets.get(\"output\")\n",
"- **existing_cluster_id:** Cluster ID of an existing Interactive cluster on the Databricks workspace. If you are providing this, do not provide any of the parameters below that are used to create a new cluster such as spark_version, node_type, etc.\n",
"- **spark_version:** Version of spark for the databricks run cluster. default value: 4.0.x-scala2.11\n",
"- **spark_version:** Version of spark for the databricks run cluster. You can refer to [DataBricks runtime version](https://learn.microsoft.com/azure/databricks/dev-tools/api/#--runtime-version-strings) to specify the spark version. default value: 4.0.x-scala2.11\n",
"- **node_type:** Azure vm node types for the databricks run cluster. default value: Standard_D3_v2\n",
"- **num_workers:** Specifies a static number of workers for the databricks run cluster\n",
"- **min_workers:** Specifies a min number of workers to use for auto-scaling the databricks run cluster\n",

View File

@@ -252,7 +252,7 @@
"# is_directory=None)\n",
"\n",
"# Naming the intermediate data as processed_data1 and assigning it to the variable processed_data1.\n",
"processed_data1 = PipelineData(\"processed_data1\",datastore=def_blob_store)\n",
"processed_data1 = PipelineData(\"processed_data1\",datastore=def_blob_store, is_directory=True)\n",
"print(\"PipelineData object created\")"
]
},
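For readers following along, a hedged sketch (script and folder names assumed) of how such a `PipelineData` output is wired into the step that produces it; the consuming step then lists the same object under `inputs`:

from azureml.pipeline.steps import PythonScriptStep

trainStep = PythonScriptStep(
    name="train_step",
    script_name="train.py",                        # hypothetical script
    arguments=["--output_path", processed_data1],
    outputs=[processed_data1],
    compute_target=compute_target,                 # assumed from the notebook
    source_directory="data_dependency_run_train",  # hypothetical folder, mirroring the extract/compare folders in this diff
)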
@@ -347,7 +347,7 @@
"source": [
"# step5 to use the intermediate data produced by step4\n",
"# This step also produces an output processed_data2\n",
"processed_data2 = PipelineData(\"processed_data2\", datastore=def_blob_store)\n",
"processed_data2 = PipelineData(\"processed_data2\", datastore=def_blob_store, is_directory=True)\n",
"source_directory = \"data_dependency_run_extract\"\n",
"\n",
"extractStep = PythonScriptStep(\n",
@@ -394,7 +394,7 @@
"outputs": [],
"source": [
"# Now define the compare step which takes two inputs and produces an output\n",
"processed_data3 = PipelineData(\"processed_data3\", datastore=def_blob_store)\n",
"processed_data3 = PipelineData(\"processed_data3\", datastore=def_blob_store, is_directory=True)\n",
"source_directory = \"data_dependency_run_compare\"\n",
"\n",
"compareStep = PythonScriptStep(\n",

View File

@@ -235,7 +235,8 @@
" path_on_datastore=\"titanic/Titanic.csv\")\n",
"\n",
"output_data = PipelineData(name=\"processed_data\",\n",
" datastore=Datastore.get(ws, \"workspaceblobstore\"))"
" datastore=Datastore.get(ws, \"workspaceblobstore\"),\n",
" is_directory=True)"
]
},
{
@@ -306,7 +307,8 @@
"from azureml.pipeline.core import PipelineParameter\n",
"\n",
"output_from_notebook = PipelineData(name=\"notebook_processed_data\",\n",
" datastore=Datastore.get(ws, \"workspaceblobstore\"))\n",
" datastore=Datastore.get(ws, \"workspaceblobstore\"),\n",
" is_directory=True)\n",
"\n",
"my_pipeline_param = PipelineParameter(name=\"pipeline_param\", default_value=\"my_param\")\n",
"\n",

View File

@@ -1,5 +1,5 @@
# DisableDockerDetector "Disabled to unblock PRs until the owner can fix the file. Not used in any prod deployments - only as a documentation for the customers"
FROM rocker/tidyverse:4.0.0-ubuntu18.04
FROM rocker/tidyverse:4.0.0-ubuntu20.04
# Install python
RUN apt-get update -qq && \

View File

@@ -363,7 +363,7 @@
"}).replace(\",\", \";\")\n",
"\n",
"# Define output after cleansing step\n",
"cleansed_green_data = PipelineData(\"cleansed_green_data\", datastore=default_store).as_dataset()\n",
"cleansed_green_data = PipelineData(\"cleansed_green_data\", datastore=default_store, is_directory=True).as_dataset()\n",
"\n",
"print('Cleanse script is in {}.'.format(os.path.realpath(prepare_data_folder)))\n",
"\n",
@@ -414,7 +414,7 @@
"}).replace(\",\", \";\")\n",
"\n",
"# Define output after cleansing step\n",
"cleansed_yellow_data = PipelineData(\"cleansed_yellow_data\", datastore=default_store).as_dataset()\n",
"cleansed_yellow_data = PipelineData(\"cleansed_yellow_data\", datastore=default_store, is_directory=True).as_dataset()\n",
"\n",
"print('Cleanse script is in {}.'.format(os.path.realpath(prepare_data_folder)))\n",
"\n",
@@ -452,7 +452,7 @@
"outputs": [],
"source": [
"# Define output after merging step\n",
"merged_data = PipelineData(\"merged_data\", datastore=default_store).as_dataset()\n",
"merged_data = PipelineData(\"merged_data\", datastore=default_store, is_directory=True).as_dataset()\n",
"\n",
"print('Merge script is in {}.'.format(os.path.realpath(prepare_data_folder)))\n",
"\n",
@@ -489,7 +489,7 @@
"outputs": [],
"source": [
"# Define output after merging step\n",
"filtered_data = PipelineData(\"filtered_data\", datastore=default_store).as_dataset()\n",
"filtered_data = PipelineData(\"filtered_data\", datastore=default_store, is_directory=True).as_dataset()\n",
"\n",
"print('Filter script is in {}.'.format(os.path.realpath(prepare_data_folder)))\n",
"\n",
@@ -525,7 +525,7 @@
"outputs": [],
"source": [
"# Define output after normalize step\n",
"normalized_data = PipelineData(\"normalized_data\", datastore=default_store).as_dataset()\n",
"normalized_data = PipelineData(\"normalized_data\", datastore=default_store, is_directory=True).as_dataset()\n",
"\n",
"print('Normalize script is in {}.'.format(os.path.realpath(prepare_data_folder)))\n",
"\n",
@@ -566,7 +566,7 @@
"outputs": [],
"source": [
"# Define output after transform step\n",
"transformed_data = PipelineData(\"transformed_data\", datastore=default_store).as_dataset()\n",
"transformed_data = PipelineData(\"transformed_data\", datastore=default_store, is_directory=True).as_dataset()\n",
"\n",
"print('Transform script is in {}.'.format(os.path.realpath(prepare_data_folder)))\n",
"\n",
@@ -604,8 +604,8 @@
"train_model_folder = './scripts/trainmodel'\n",
"\n",
"# train and test splits output\n",
"output_split_train = PipelineData(\"output_split_train\", datastore=default_store).as_dataset()\n",
"output_split_test = PipelineData(\"output_split_test\", datastore=default_store).as_dataset()\n",
"output_split_train = PipelineData(\"output_split_train\", datastore=default_store, is_directory=True).as_dataset()\n",
"output_split_test = PipelineData(\"output_split_test\", datastore=default_store, is_directory=True).as_dataset()\n",
"\n",
"print('Data spilt script is in {}.'.format(os.path.realpath(train_model_folder)))\n",
"\n",

View File

@@ -61,8 +61,6 @@ def main():
run.log('Epochs', np.int(epochs))
train_iter = iterators.SerialIterator(train, batchsize)
test_iter = iterators.SerialIterator(test, batchsize,
repeat=False, shuffle=False)
model = MyNetwork()
@@ -106,6 +104,8 @@ def main():
test_losses = []
test_accuracies = []
test_iter = iterators.SerialIterator(test, batchsize,
repeat=False, shuffle=False)
while True:
test_batch = test_iter.next()
image_test, target_test = concat_examples(test_batch, gpu_id)
@@ -123,10 +123,6 @@ def main():
test_accuracies.append(accuracy.array)
if test_iter.is_new_epoch:
test_iter.epoch = 0
test_iter.current_position = 0
test_iter.is_new_epoch = False
test_iter._pushed_position = None
break
val_accuracy = np.mean(test_accuracies)

View File

@@ -253,14 +253,13 @@
"channels:\n",
"- conda-forge\n",
"dependencies:\n",
"- python=3.6.2\n",
"- pip=21.3.1\n",
"- python=3.8.12\n",
"- pip=22.3.1\n",
"- pip:\n",
" - azureml-defaults\n",
" - azureml-opendatasets\n",
" - chainer==5.1.0\n",
" - cupy-cuda100==5.1.0\n",
" - mpi4py==3.0.0\n",
" - chainer\n",
" - cupy-cuda111\n",
" - pytest"
]
},
@@ -273,10 +272,10 @@
"from azureml.core import Environment\n",
"from azureml.core.runconfig import DockerConfiguration\n",
"\n",
"chainer_env = Environment.from_conda_specification(name = 'chainer-5.1.0-gpu', file_path = './conda_dependencies.yml')\n",
"chainer_env = Environment.from_conda_specification(name = 'chainer-gpu', file_path = './conda_dependencies.yml')\n",
"\n",
"# Specify a GPU base image\n",
"chainer_env.docker.base_image = 'mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.0-cudnn7-ubuntu18.04'\n",
"chainer_env.docker.base_image = 'mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.1-cudnn8-ubuntu20.04'\n",
"\n",
"docker_config = DockerConfiguration(use_docker=True)"
]
@@ -553,8 +552,8 @@
"\n",
"cd = CondaDependencies.create()\n",
"cd.add_conda_package('numpy')\n",
"cd.add_pip_package('chainer==5.1.0')\n",
"cd.add_pip_package(\"azureml-defaults==1.43.0\")\n",
"cd.add_pip_package('chainer')\n",
"cd.add_pip_package(\"azureml-defaults\")\n",
"cd.add_pip_package(\"azureml-opendatasets\")\n",
"cd.save_to_file(base_directory='./', conda_file_path='myenv.yml')\n",
"\n",

View File

@@ -1,354 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/ml-frameworks/tensorflow/distributed-tensorflow-with-parameter-server/distributed-tensorflow-with-parameter-server.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Distributed TensorFlow with parameter server\n",
"In this tutorial, you will train a TensorFlow model on the [MNIST](http://yann.lecun.com/exdb/mnist/) dataset using native [distributed TensorFlow](https://www.tensorflow.org/guide/distributed_training)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prerequisites\n",
"* Understand the [architecture and terms](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture) introduced by Azure Machine Learning (AML)\n",
"* If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, go through the [configuration notebook](../../../../configuration.ipynb) to:\n",
" * install the AML SDK\n",
" * create a workspace and its configuration file (`config.json`)\n",
"* Review the [tutorial](../train-hyperparameter-tune-deploy-with-tensorflow/train-hyperparameter-tune-deploy-with-tensorflow.ipynb) on single-node TensorFlow training using the SDK"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check core SDK version number\n",
"import azureml.core\n",
"\n",
"print(\"SDK version:\", azureml.core.VERSION)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Diagnostics\n",
"Opt-in diagnostics for better experience, quality, and security of future releases."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"Diagnostics"
]
},
"outputs": [],
"source": [
"from azureml.telemetry import set_diagnostics_collection\n",
"\n",
"set_diagnostics_collection(send_diagnostics=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize workspace\n",
"Initialize a [Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace) object from the existing workspace you created in the Prerequisites step. `Workspace.from_config()` creates a workspace object from the details stored in `config.json`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.workspace import Workspace\n",
"\n",
"ws = Workspace.from_config()\n",
"print('Workspace name: ' + ws.name, \n",
" 'Azure region: ' + ws.location, \n",
" 'Subscription id: ' + ws.subscription_id, \n",
" 'Resource group: ' + ws.resource_group, sep = '\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create or Attach existing AmlCompute\n",
"You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for training your model. In this tutorial, you create `AmlCompute` as your training compute resource.\n",
"\n",
"> Note that if you have an AzureML Data Scientist role, you will not have permission to create compute resources. Talk to your workspace or IT admin to create the compute targets described in this section, if they do not already exist.\n",
"\n",
"**Creation of AmlCompute takes approximately 5 minutes.** If the AmlCompute with that name is already in your workspace this code will skip the creation process.\n",
"\n",
"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
"from azureml.core.compute_target import ComputeTargetException\n",
"\n",
"# choose a name for your cluster\n",
"cluster_name = \"gpu-cluster\"\n",
"\n",
"try:\n",
" compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n",
" print('Found existing compute target.')\n",
"except ComputeTargetException:\n",
" print('Creating a new compute target...')\n",
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6', \n",
" max_nodes=4)\n",
"\n",
" # create the cluster\n",
" compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n",
"\n",
" compute_target.wait_for_completion(show_output=True)\n",
"\n",
"# use get_status() to get a detailed status for the current cluster. \n",
"print(compute_target.get_status().serialize())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train model on the remote compute\n",
"Now that we have the cluster ready to go, let's run our distributed training job."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a project directory\n",
"Create a directory that will contain all the necessary code from your local machine that you will need access to on the remote resource. This includes the training script, and any additional files your training script depends on."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"project_folder = './tf-distr-ps'\n",
"os.makedirs(project_folder, exist_ok=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copy the training script `tf_mnist_replica.py` into this project directory."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import shutil\n",
"\n",
"shutil.copy('tf_mnist_replica.py', project_folder)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create an experiment\n",
"Create an [Experiment](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#experiment) to track all the runs in your workspace for this distributed TensorFlow tutorial. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Experiment\n",
"\n",
"experiment_name = 'tf-distr-ps'\n",
"experiment = Experiment(ws, name=experiment_name)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create an environment\n",
"\n",
"In this tutorial, we will use one of Azure ML's curated TensorFlow environments for training. [Curated environments](https://docs.microsoft.com/azure/machine-learning/how-to-use-environments#use-a-curated-environment) are available in your workspace by default. Specifically, we will use the TensorFlow 1.13 GPU curated environment."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Environment\n",
"\n",
"tf_env = Environment.get(ws, name='AzureML-TensorFlow-1.13-GPU')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Configure the training job\n",
"\n",
"Create a ScriptRunConfig object to specify the configuration details of your training job, including your training script, environment to use, and the compute target to run on.\n",
"\n",
"In order to execute a distributed TensorFlow run with the parameter server strategy, you must create a `TensorflowConfiguration` object and pass it to the `distributed_job_config` parameter of the ScriptRunConfig constructor. The below code configures a distributed TensorFlow run with `2` workers and `1` parameter server."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import ScriptRunConfig\n",
"from azureml.core.runconfig import TensorflowConfiguration\n",
"\n",
"src = ScriptRunConfig(source_directory=project_folder,\n",
" script='tf_mnist_replica.py',\n",
" arguments=['--num_gpus', 1, '--train_steps', 500],\n",
" compute_target=compute_target,\n",
" environment=tf_env,\n",
" distributed_job_config=TensorflowConfiguration(worker_count=2, parameter_server_count=1))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Submit job\n",
"Run your experiment by submitting your ScriptRunConfig object. Note that this call is asynchronous."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run = experiment.submit(src)\n",
"print(run)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Monitor your run\n",
"You can monitor the progress of the run with a Jupyter widget. Like the run submission, the widget is asynchronous and provides live updates every 10-15 seconds until the job completes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.widgets import RunDetails\n",
"\n",
"RunDetails(run).show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Alternatively, you can block until the script has completed training before running more code."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run.wait_for_completion(show_output=True) # this provides a verbose log"
]
}
],
"metadata": {
"authors": [
{
"name": "minxia"
}
],
"category": "training",
"compute": [
"AML Compute"
],
"datasets": [
"MNIST"
],
"deployment": [
"None"
],
"exclude_from_index": false,
"framework": [
"TensorFlow"
],
"friendly_name": "Distributed TensorFlow with parameter server",
"index_order": 1,
"kernelspec": {
"display_name": "Python 3.8 - AzureML",
"language": "python",
"name": "python38-azureml"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
},
"tags": [
"None"
],
"task": "Use the TensorFlow estimator to train a model using distributed training"
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,5 +0,0 @@
name: distributed-tensorflow-with-parameter-server
dependencies:
- pip:
- azureml-sdk
- azureml-widgets

View File

@@ -1,271 +0,0 @@
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
# Licensed under the Apache License, Version 2.0
# Script adapted from:
# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/dist_test/python/mnist_replica.py
# ==============================================================================
"""Distributed MNIST training and validation, with model replicas.
A simple softmax model with one hidden layer is defined. The parameters
(weights and biases) are located on one parameter server (ps), while the ops
are executed on two worker nodes by default. The TF sessions also run on the
worker node.
Multiple invocations of this script can be done in parallel, with different
values for --task_index. There should be exactly one invocation with
--task_index, which will create a master session that carries out variable
initialization. The other, non-master, sessions will wait for the master
session to finish the initialization before proceeding to the training stage.
The coordination between the multiple worker invocations occurs due to
the definition of the parameters on the same ps devices. The parameter updates
from one worker is visible to all other workers. As such, the workers can
perform forward computation and gradient calculation in parallel, which
should lead to increased training speed for the simple model.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import math
import sys
import tempfile
import time
import json
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
from azureml.core.run import Run
flags = tf.app.flags
flags.DEFINE_string("data_dir", "/tmp/mnist-data",
"Directory for storing mnist data")
flags.DEFINE_boolean("download_only", False,
"Only perform downloading of data; Do not proceed to "
"session preparation, model definition or training")
flags.DEFINE_integer("num_gpus", 0, "Total number of gpus for each machine."
"If you don't use GPU, please set it to '0'")
flags.DEFINE_integer("replicas_to_aggregate", None,
"Number of replicas to aggregate before parameter update "
"is applied (For sync_replicas mode only; default: "
"num_workers)")
flags.DEFINE_integer("hidden_units", 100,
"Number of units in the hidden layer of the NN")
flags.DEFINE_integer("train_steps", 200,
"Number of (global) training steps to perform")
flags.DEFINE_integer("batch_size", 100, "Training batch size")
flags.DEFINE_float("learning_rate", 0.01, "Learning rate")
flags.DEFINE_boolean(
"sync_replicas", False,
"Use the sync_replicas (synchronized replicas) mode, "
"wherein the parameter updates from workers are aggregated "
"before applied to avoid stale gradients")
flags.DEFINE_boolean(
"existing_servers", False, "Whether servers already exists. If True, "
"will use the worker hosts via their GRPC URLs (one client process "
"per worker host). Otherwise, will create an in-process TensorFlow "
"server.")
FLAGS = flags.FLAGS
IMAGE_PIXELS = 28
def main(unused_argv):
data_root = os.path.join("outputs", "MNIST")
mnist = None
tf_config = os.environ.get("TF_CONFIG")
if not tf_config or tf_config == "":
raise ValueError("TF_CONFIG not found.")
tf_config_json = json.loads(tf_config)
cluster = tf_config_json.get('cluster')
job_name = tf_config_json.get('task', {}).get('type')
task_index = tf_config_json.get('task', {}).get('index')
job_name = "worker" if job_name == "master" else job_name
sentinel_path = os.path.join(data_root, "complete.txt")
if job_name == "worker" and task_index == 0:
mnist = input_data.read_data_sets(data_root, one_hot=True)
with open(sentinel_path, 'w+') as f:
f.write("download complete")
else:
while not os.path.exists(sentinel_path):
time.sleep(0.01)
mnist = input_data.read_data_sets(data_root, one_hot=True)
if FLAGS.download_only:
sys.exit(0)
print("job name = %s" % job_name)
print("task index = %d" % task_index)
print("number of GPUs = %d" % FLAGS.num_gpus)
# Construct the cluster and start the server
cluster_spec = tf.train.ClusterSpec(cluster)
# Get the number of workers.
num_workers = len(cluster_spec.task_indices("worker"))
if not FLAGS.existing_servers:
# Not using existing servers. Create an in-process server.
server = tf.train.Server(
cluster_spec, job_name=job_name, task_index=task_index)
if job_name == "ps":
server.join()
is_chief = (task_index == 0)
if FLAGS.num_gpus > 0:
# Avoid gpu allocation conflict: now allocate task_num -> #gpu
# for each worker in the corresponding machine
gpu = (task_index % FLAGS.num_gpus)
worker_device = "/job:worker/task:%d/gpu:%d" % (task_index, gpu)
elif FLAGS.num_gpus == 0:
# Just allocate the CPU to worker server
cpu = 0
worker_device = "/job:worker/task:%d/cpu:%d" % (task_index, cpu)
# The device setter will automatically place Variables ops on separate
# parameter servers (ps). The non-Variable ops will be placed on the workers.
# The ps use CPU and workers use corresponding GPU
with tf.device(
tf.train.replica_device_setter(
worker_device=worker_device,
ps_device="/job:ps/cpu:0",
cluster=cluster)):
global_step = tf.Variable(0, name="global_step", trainable=False)
# Variables of the hidden layer
hid_w = tf.Variable(
tf.truncated_normal(
[IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units],
stddev=1.0 / IMAGE_PIXELS),
name="hid_w")
hid_b = tf.Variable(tf.zeros([FLAGS.hidden_units]), name="hid_b")
# Variables of the softmax layer
sm_w = tf.Variable(
tf.truncated_normal(
[FLAGS.hidden_units, 10],
stddev=1.0 / math.sqrt(FLAGS.hidden_units)),
name="sm_w")
sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
# Ops: located on the worker specified with task_index
x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS])
y_ = tf.placeholder(tf.float32, [None, 10])
hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
hid = tf.nn.relu(hid_lin)
y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))
cross_entropy = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
        opt = tf.train.AdamOptimizer(FLAGS.learning_rate)
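        # If --sync_replicas is set, the block below wraps the optimizer so
        # that gradients from replicas_to_aggregate workers are accumulated
        # and applied as a single update, avoiding stale asynchronous
        # gradients.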
        if FLAGS.sync_replicas:
            if FLAGS.replicas_to_aggregate is None:
                replicas_to_aggregate = num_workers
            else:
                replicas_to_aggregate = FLAGS.replicas_to_aggregate
            opt = tf.train.SyncReplicasOptimizer(
                opt,
                replicas_to_aggregate=replicas_to_aggregate,
                total_num_replicas=num_workers,
                name="mnist_sync_replicas")

        train_step = opt.minimize(cross_entropy, global_step=global_step)

        if FLAGS.sync_replicas:
            local_init_op = opt.local_step_init_op
            if is_chief:
                local_init_op = opt.chief_init_op
            ready_for_local_init_op = opt.ready_for_local_init_op

            # Initial token and chief queue runners required by the
            # sync_replicas mode
            chief_queue_runner = opt.get_chief_queue_runner()
            sync_init_op = opt.get_init_tokens_op()
        init_op = tf.global_variables_initializer()
        train_dir = tempfile.mkdtemp()

        if FLAGS.sync_replicas:
            sv = tf.train.Supervisor(
                is_chief=is_chief,
                logdir=train_dir,
                init_op=init_op,
                local_init_op=local_init_op,
                ready_for_local_init_op=ready_for_local_init_op,
                recovery_wait_secs=1,
                global_step=global_step)
        else:
            sv = tf.train.Supervisor(
                is_chief=is_chief,
                logdir=train_dir,
                init_op=init_op,
                recovery_wait_secs=1,
                global_step=global_step)
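        # The Supervisor coordinates start-up across tasks: the chief runs
        # init_op and writes checkpoints to logdir, while non-chief workers
        # wait for and recover from the chief's initialization instead of
        # running their own.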
        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False,
            device_filters=["/job:ps",
                            "/job:worker/task:%d" % task_index])
        # The chief worker (task_index==0) will prepare the session, while
        # the remaining workers wait for the preparation to complete.
        if is_chief:
            print("Worker %d: Initializing session..." % task_index)
        else:
            print("Worker %d: Waiting for session to be initialized..." %
                  task_index)
        if FLAGS.existing_servers:
            # Connect to the existing worker server via its gRPC endpoint;
            # task_index indexes the worker host list from TF_CONFIG.
            server_grpc_url = "grpc://" + cluster["worker"][task_index]
            print("Using existing server at: %s" % server_grpc_url)
            sess = sv.prepare_or_wait_for_session(server_grpc_url,
                                                  config=sess_config)
        else:
            sess = sv.prepare_or_wait_for_session(server.target,
                                                  config=sess_config)

        print("Worker %d: Session initialization complete." % task_index)
        if FLAGS.sync_replicas and is_chief:
            # Chief worker will start the chief queue runner and call the
            # init op.
            sess.run(sync_init_op)
            sv.start_queue_runners(sess, [chief_queue_runner])

        # Perform training
        time_begin = time.time()
        print("Training begins @ %f" % time_begin)

        local_step = 0
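        # local_step counts this worker's own iterations; global_step is
        # shared across all workers, so the loop exits once the cluster as a
        # whole has taken FLAGS.train_steps updates.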
        while True:
            # Training feed
            batch_xs, batch_ys = mnist.train.next_batch(FLAGS.batch_size)
            train_feed = {x: batch_xs, y_: batch_ys}

            _, step = sess.run([train_step, global_step],
                               feed_dict=train_feed)
            local_step += 1

            now = time.time()
            print("%f: Worker %d: training step %d done (global step: %d)" %
                  (now, task_index, local_step, step))

            if step >= FLAGS.train_steps:
                break

        time_end = time.time()
        print("Training ends @ %f" % time_end)
        training_time = time_end - time_begin
        print("Training elapsed time: %f s" % training_time)
        # Validation feed
        val_feed = {x: mnist.validation.images, y_: mnist.validation.labels}
        val_xent = sess.run(cross_entropy, feed_dict=val_feed)
        print("After %d training step(s), validation cross entropy = %g" %
              (FLAGS.train_steps, val_xent))
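        # Record the final metric in the Azure ML run history;
        # Run.get_context() returns the run this script was submitted under.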
if job_name == "worker" and task_index == 0:
run = Run.get_context()
run.log("CrossEntropy", val_xent)
if __name__ == "__main__":
tf.app.run()

View File

@@ -1,4 +1,4 @@
FROM mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.0.3-cudnn8-ubuntu18.04
FROM mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.1-cudnn8-ubuntu20.04
USER root
RUN conda install -c anaconda python=3.7

View File

@@ -1,4 +1,4 @@
FROM mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04
FROM mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04
USER root
RUN conda install -c anaconda python=3.7

View File

@@ -1,4 +1,4 @@
FROM mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20200423.v1
FROM mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04
RUN apt-get update && apt-get install -y --no-install-recommends \
python-opengl \
@@ -16,7 +16,7 @@ RUN pip install ray-on-aml==0.2.1 & \
azureml-contrib-reinforcementlearning \
gputil \
scipy \
pyglet \
pyglet==1.5.27 \
cloudpickle==1.3.0 \
tensorboardX \
tensorflow==1.14.0 \

View File

@@ -8,7 +8,7 @@ dependencies:
- matplotlib
- azureml-dataset-runtime
- ipywidgets
- raiwidgets~=0.22.0
- raiwidgets~=0.23.0
- liac-arff
- packaging>=20.9
- itsdangerous==2.0.1

View File

@@ -101,7 +101,7 @@
"\n",
"# Check core SDK version number\n",
"\n",
"print(\"This notebook was created using SDK version 1.47.0, you are currently running version\", azureml.core.VERSION)"
"print(\"This notebook was created using SDK version 1.48.0, you are currently running version\", azureml.core.VERSION)"
]
},
{

View File

@@ -176,7 +176,7 @@
"metadata": {},
"outputs": [],
"source": [
"data_urls = ['https://data4mldemo6150520719.blob.core.windows.net/demo/mnist-fashion']\n",
"data_urls = ['https://azuremldownloads.blob.core.windows.net/demo/mnist-fashion']\n",
"fashion_ds = Dataset.File.from_files(data_urls)\n",
"\n",
"# list the files referenced by fashion_ds\n",

View File

@@ -67,7 +67,6 @@ Machine Learning notebook samples and encourage efficient retrieval of topics an
| [Training with hyperparameter tuning using PyTorch](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/pytorch/train-hyperparameter-tune-deploy-with-pytorch/train-hyperparameter-tune-deploy-with-pytorch.ipynb) | Train an image classification model using transfer learning with the PyTorch estimator | ImageNet | AML Compute | Azure Container Instance | PyTorch | None |
| [Training and hyperparameter tuning with Scikit-learn](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/scikit-learn/train-hyperparameter-tune-deploy-with-sklearn/train-hyperparameter-tune-deploy-with-sklearn.ipynb) | Train a support vector machine (SVM) to perform classification | Iris | AML Compute | None | Scikit-learn | None |
| [Distributed training using TensorFlow with Horovod](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/tensorflow/distributed-tensorflow-with-horovod/distributed-tensorflow-with-horovod.ipynb) | Use the TensorFlow estimator to train a word2vec model | None | AML Compute | None | TensorFlow | None |
| [Distributed TensorFlow with parameter server](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/tensorflow/distributed-tensorflow-with-parameter-server/distributed-tensorflow-with-parameter-server.ipynb) | Use the TensorFlow estimator to train a model using distributed training | MNIST | AML Compute | None | TensorFlow | None |
| [Hyperparameter tuning and warm start using the TensorFlow estimator](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/tensorflow/hyperparameter-tune-and-warm-start-with-tensorflow/hyperparameter-tune-and-warm-start-with-tensorflow.ipynb) | Train a deep neural network | MNIST | AML Compute | Azure Container Instance | TensorFlow | None |
| [Training and hyperparameter tuning using the TensorFlow estimator](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/tensorflow/train-hyperparameter-tune-deploy-with-tensorflow/train-hyperparameter-tune-deploy-with-tensorflow.ipynb) | Train a deep neural network | MNIST | AML Compute | Azure Container Instance | TensorFlow | None |
| [Resuming a model](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/tensorflow/train-tensorflow-resume-training/train-tensorflow-resume-training.ipynb) | Resume a model in TensorFlow from a previously submitted run | MNIST | AML Compute | None | TensorFlow | None |

View File

@@ -102,7 +102,7 @@
"source": [
"import azureml.core\n",
"\n",
"print(\"This notebook was created using version 1.47.0 of the Azure ML SDK\")\n",
"print(\"This notebook was created using version 1.48.0 of the Azure ML SDK\")\n",
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
]
},

View File

@@ -151,7 +151,7 @@
"# use a curated environment that has already been built for you\n",
"\n",
"env = Environment.get(workspace=ws, \n",
" name=\"AzureML-sklearn-0.24-ubuntu18.04-py37-cpu\")"
" name=\"AzureML-sklearn-1.0-ubuntu20.04-py38-cpu\")"
]
},
{