mirror of
https://github.com/Azure/MachineLearningNotebooks.git
synced 2025-12-23 20:00:06 -05:00
Compare commits
8 Commits
azureml-sd
...
release_up
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4404e62f58 | ||
|
|
38d5743bbb | ||
|
|
0814eee151 | ||
|
|
f45b815221 | ||
|
|
bd629ae454 | ||
|
|
41de75a584 | ||
|
|
96a426dc36 | ||
|
|
824dd40f7e |
@@ -103,7 +103,7 @@
|
||||
"source": [
|
||||
"import azureml.core\n",
|
||||
"\n",
|
||||
"print(\"This notebook was created using version 1.46.0 of the Azure ML SDK\")\n",
|
||||
"print(\"This notebook was created using version 1.48.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -398,7 +398,7 @@
|
||||
"# run_config.target = gpu_cluster_name\n",
|
||||
"# run_config.environment.docker.enabled = True\n",
|
||||
"# run_config.environment.docker.gpu_support = True\n",
|
||||
"# run_config.environment.docker.base_image = \"rapidsai/rapidsai:cuda9.2-runtime-ubuntu18.04\"\n",
|
||||
"# run_config.environment.docker.base_image = \"rapidsai/rapidsai:cuda9.2-runtime-ubuntu20.04\"\n",
|
||||
"# # run_config.environment.docker.base_image_registry.address = '<registry_url>' # not required if the base_image is in Docker hub\n",
|
||||
"# # run_config.environment.docker.base_image_registry.username = '<user_name>' # needed only for private images\n",
|
||||
"# # run_config.environment.docker.base_image_registry.password = '<password>' # needed only for private images\n",
|
||||
|
||||
@@ -6,7 +6,7 @@ dependencies:
|
||||
- fairlearn>=0.6.2
|
||||
- joblib
|
||||
- liac-arff
|
||||
- raiwidgets~=0.22.0
|
||||
- raiwidgets~=0.23.0
|
||||
- itsdangerous==2.0.1
|
||||
- markupsafe<2.1.0
|
||||
- protobuf==3.20.0
|
||||
|
||||
@@ -6,7 +6,7 @@ dependencies:
|
||||
- fairlearn>=0.6.2
|
||||
- joblib
|
||||
- liac-arff
|
||||
- raiwidgets~=0.22.0
|
||||
- raiwidgets~=0.23.0
|
||||
- itsdangerous==2.0.1
|
||||
- markupsafe<2.1.0
|
||||
- protobuf==3.20.0
|
||||
|
||||
@@ -5,32 +5,17 @@ channels:
|
||||
- main
|
||||
dependencies:
|
||||
# The python interpreter version.
|
||||
# Currently Azure ML only supports 3.6.0 and later.
|
||||
- pip==20.2.4
|
||||
- python>=3.6,<3.9
|
||||
- matplotlib==3.2.1
|
||||
- py-xgboost==1.3.3
|
||||
- pytorch::pytorch=1.4.0
|
||||
- conda-forge::fbprophet==0.7.1
|
||||
- cudatoolkit=10.1.243
|
||||
- scipy==1.5.3
|
||||
- notebook
|
||||
- pywin32==227
|
||||
- PySocks==1.7.1
|
||||
- conda-forge::pyqt==5.12.3
|
||||
- jinja2<=2.11.2
|
||||
- markupsafe<2.1.0
|
||||
- tqdm==4.64.1
|
||||
- jsonschema==4.16.0
|
||||
# Azure ML only supports 3.7.0 and later.
|
||||
- pip==22.3.1
|
||||
- python>=3.7,<3.9
|
||||
|
||||
- pip:
|
||||
# Required packages for AzureML execution, history, and data preparation.
|
||||
- azureml-widgets~=1.46.0
|
||||
- azureml-defaults~=1.46.0
|
||||
- pytorch-transformers==1.0.0
|
||||
- spacy==2.2.4
|
||||
- pystan==2.19.1.1
|
||||
- https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz
|
||||
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.46.0/validated_win32_requirements.txt [--no-deps]
|
||||
- azureml-widgets~=latest
|
||||
- azureml-defaults~=latest
|
||||
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/latest/validated_win32_requirements.txt [--no-deps]
|
||||
- matplotlib==3.6.2
|
||||
- xgboost==1.3.3
|
||||
- arch==4.14
|
||||
- wasabi==0.9.1
|
||||
- mlflow-skinny==1.30.0
|
||||
|
||||
|
||||
@@ -5,11 +5,9 @@ channels:
|
||||
- main
|
||||
dependencies:
|
||||
# The python interpreter version.
|
||||
# Currently Azure ML only supports 3.6.0 and later.
|
||||
- pip==20.2.4
|
||||
- python>=3.6,<3.9
|
||||
- boto3==1.20.19
|
||||
- botocore<=1.23.19
|
||||
# Azure ML only supports 3.7 and later.
|
||||
- pip==20.1.1
|
||||
- python>=3.7,<3.9
|
||||
- matplotlib==3.2.1
|
||||
- numpy>=1.21.6,<=1.22.3
|
||||
- cython==0.29.14
|
||||
@@ -19,19 +17,19 @@ dependencies:
|
||||
- py-xgboost<=1.3.3
|
||||
- holidays==0.10.3
|
||||
- conda-forge::fbprophet==0.7.1
|
||||
- pytorch::pytorch=1.4.0
|
||||
- pytorch::pytorch=1.11.0
|
||||
- cudatoolkit=10.1.243
|
||||
- notebook
|
||||
- jinja2<=2.11.2
|
||||
- markupsafe<2.1.0
|
||||
- jsonschema==4.15.0
|
||||
|
||||
- pip:
|
||||
# Required packages for AzureML execution, history, and data preparation.
|
||||
- azureml-widgets~=1.46.0
|
||||
- azureml-defaults~=1.46.0
|
||||
- azureml-widgets~=1.48.0
|
||||
- azureml-defaults~=1.48.0
|
||||
- pytorch-transformers==1.0.0
|
||||
- spacy==2.2.4
|
||||
- pystan==2.19.1.1
|
||||
- https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz
|
||||
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.46.0/validated_linux_requirements.txt [--no-deps]
|
||||
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.48.0/validated_linux_requirements.txt [--no-deps]
|
||||
- arch==4.14
|
||||
|
||||
@@ -5,12 +5,9 @@ channels:
|
||||
- main
|
||||
dependencies:
|
||||
# The python interpreter version.
|
||||
# Currently Azure ML only supports 3.6.0 and later.
|
||||
- pip==20.2.4
|
||||
- nomkl
|
||||
- python>=3.6,<3.9
|
||||
- boto3==1.20.19
|
||||
- botocore<=1.23.19
|
||||
# Currently Azure ML only supports 3.7 and later.
|
||||
- pip==20.1.1
|
||||
- python>=3.7,<3.9
|
||||
- matplotlib==3.2.1
|
||||
- numpy>=1.21.6,<=1.22.3
|
||||
- cython==0.29.14
|
||||
@@ -20,19 +17,19 @@ dependencies:
|
||||
- py-xgboost<=1.3.3
|
||||
- holidays==0.10.3
|
||||
- conda-forge::fbprophet==0.7.1
|
||||
- pytorch::pytorch=1.4.0
|
||||
- pytorch::pytorch=1.11.0
|
||||
- cudatoolkit=9.0
|
||||
- notebook
|
||||
- jinja2<=2.11.2
|
||||
- markupsafe<2.1.0
|
||||
- jsonschema==4.15.0
|
||||
|
||||
- pip:
|
||||
# Required packages for AzureML execution, history, and data preparation.
|
||||
- azureml-widgets~=1.46.0
|
||||
- azureml-defaults~=1.46.0
|
||||
- azureml-widgets~=1.48.0
|
||||
- azureml-defaults~=1.48.0
|
||||
- pytorch-transformers==1.0.0
|
||||
- spacy==2.2.4
|
||||
- pystan==2.19.1.1
|
||||
- https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz
|
||||
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.46.0/validated_darwin_requirements.txt [--no-deps]
|
||||
- -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.48.0/validated_darwin_requirements.txt [--no-deps]
|
||||
- arch==4.14
|
||||
|
||||
@@ -33,6 +33,8 @@ if not errorlevel 1 (
|
||||
call conda env create -f %automl_env_file% -n %conda_env_name%
|
||||
)
|
||||
|
||||
python "%conda_prefix%\scripts\pywin32_postinstall.py" -install
|
||||
|
||||
call conda activate %conda_env_name% 2>nul:
|
||||
if errorlevel 1 goto ErrorExit
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from distutils.version import LooseVersion
|
||||
from setuptools._vendor.packaging import version
|
||||
import platform
|
||||
|
||||
try:
|
||||
@@ -17,7 +17,7 @@ if architecture != "64bit":
|
||||
|
||||
minimumVersion = "4.7.8"
|
||||
|
||||
versionInvalid = (LooseVersion(conda.__version__) < LooseVersion(minimumVersion))
|
||||
versionInvalid = (version.parse(conda.__version__) < version.parse(minimumVersion))
|
||||
|
||||
if versionInvalid:
|
||||
print('Setup requires conda version ' + minimumVersion + ' or higher.')
|
||||
|
||||
@@ -97,7 +97,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"This notebook was created using version 1.46.0 of the Azure ML SDK\")\n",
|
||||
"print(\"This notebook was created using version 1.48.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -97,7 +97,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"This notebook was created using version 1.46.0 of the Azure ML SDK\")\n",
|
||||
"print(\"This notebook was created using version 1.48.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -1,24 +1,15 @@
|
||||
name: azure_automl_experimental
|
||||
dependencies:
|
||||
# The python interpreter version.
|
||||
# Currently Azure ML only supports 3.6.0 and later.
|
||||
- pip<=20.2.4
|
||||
- python>=3.6.0,<3.10
|
||||
- cython==0.29.14
|
||||
- urllib3==1.26.7
|
||||
- PyJWT < 2.0.0
|
||||
- numpy==1.22.3
|
||||
- pywin32==227
|
||||
- cryptography<37.0.0
|
||||
# Currently Azure ML only supports 3.7.0 and later.
|
||||
- pip<=22.3.1
|
||||
- python>=3.7.0,<3.10
|
||||
|
||||
- pip:
|
||||
# Required packages for AzureML execution, history, and data preparation.
|
||||
- azure-core==1.24.1
|
||||
- azure-identity==1.7.0
|
||||
- azureml-defaults
|
||||
- azureml-sdk
|
||||
- azureml-widgets
|
||||
- azureml-mlflow
|
||||
- pandas
|
||||
- mlflow
|
||||
- docker<6.0.0
|
||||
|
||||
@@ -11,7 +11,6 @@ dependencies:
|
||||
- urllib3==1.26.7
|
||||
- PyJWT < 2.0.0
|
||||
- numpy>=1.21.6,<=1.22.3
|
||||
- cryptography<37.0.0
|
||||
|
||||
- pip:
|
||||
# Required packages for AzureML execution, history, and data preparation.
|
||||
|
||||
@@ -92,7 +92,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"This notebook was created using version 1.46.0 of the Azure ML SDK\")\n",
|
||||
"print(\"This notebook was created using version 1.48.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -91,7 +91,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"This notebook was created using version 1.46.0 of the Azure ML SDK\")\n",
|
||||
"print(\"This notebook was created using version 1.48.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -387,6 +387,7 @@
|
||||
"| **node_count** | The number of compute nodes to be used for running the user script. We recommend to start with 3 and increase the node_count if the training time is taking too long. |\n",
|
||||
"| **process_count_per_node** | Process count per node, we recommend 2:1 ratio for number of cores: number of processes per node. eg. If node has 16 cores then configure 8 or less process count per node or optimal performance. |\n",
|
||||
"| **train_pipeline_parameters** | The set of configuration parameters defined in the previous section. |\n",
|
||||
"| **run_invocation_timeout** | Maximum amount of time in seconds that the ``ParallelRunStep`` class is allowed. This is optional but provides customers with greater control on exit criteria. This must be greater than ``experiment_timeout_hours`` by at least 300 seconds. |\n",
|
||||
"\n",
|
||||
"Calling this method will create a new aggregated dataset which is generated dynamically on pipeline execution."
|
||||
]
|
||||
@@ -406,7 +407,7 @@
|
||||
" compute_target=compute_target,\n",
|
||||
" node_count=2,\n",
|
||||
" process_count_per_node=2,\n",
|
||||
" run_invocation_timeout=920,\n",
|
||||
" run_invocation_timeout=1200,\n",
|
||||
" train_pipeline_parameters=mm_paramters,\n",
|
||||
")"
|
||||
]
|
||||
@@ -529,6 +530,8 @@
|
||||
" target_column_name=TARGET_COLNAME,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"output_file_name = \"parallel_run_step.csv\"\n",
|
||||
"\n",
|
||||
"inference_steps = AutoMLPipelineBuilder.get_many_models_batch_inference_steps(\n",
|
||||
" experiment=experiment,\n",
|
||||
" inference_data=test_data,\n",
|
||||
@@ -540,6 +543,7 @@
|
||||
" train_run_id=training_run.id,\n",
|
||||
" train_experiment_name=training_run.experiment.name,\n",
|
||||
" inference_pipeline_parameters=mm_parameters,\n",
|
||||
" append_row_file_name=output_file_name,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
@@ -587,18 +591,21 @@
|
||||
"source": [
|
||||
"from azureml.contrib.automl.pipeline.steps.utilities import get_output_from_mm_pipeline\n",
|
||||
"\n",
|
||||
"PREDICTION_COLNAME = \"Predictions\"\n",
|
||||
"forecasting_results_name = \"forecasting_results\"\n",
|
||||
"forecasting_output_name = \"many_models_inference_output\"\n",
|
||||
"forecast_file = get_output_from_mm_pipeline(\n",
|
||||
" inference_run, forecasting_results_name, forecasting_output_name\n",
|
||||
" inference_run, forecasting_results_name, forecasting_output_name, output_file_name\n",
|
||||
")\n",
|
||||
"df = pd.read_csv(forecast_file, delimiter=\" \", header=None, parse_dates=[0])\n",
|
||||
"df.columns = list(X_train.columns) + [\"predicted_level\"]\n",
|
||||
"df = pd.read_csv(forecast_file, parse_dates=[0])\n",
|
||||
"print(\n",
|
||||
" \"Prediction has \", df.shape[0], \" rows. Here the first 10 rows are being displayed.\"\n",
|
||||
")\n",
|
||||
"# Save the scv file with header to read it in the next step.\n",
|
||||
"df.rename(columns={TARGET_COLNAME: \"actual_level\"}, inplace=True)\n",
|
||||
"# Save the csv file to read it in the next step.\n",
|
||||
"df.rename(\n",
|
||||
" columns={TARGET_COLNAME: \"actual_level\", PREDICTION_COLNAME: \"predicted_level\"},\n",
|
||||
" inplace=True,\n",
|
||||
")\n",
|
||||
"df.to_csv(os.path.join(forecasting_results_name, \"forecast.csv\"), index=False)\n",
|
||||
"df.head(10)"
|
||||
]
|
||||
|
||||
@@ -43,11 +43,20 @@ def init():
|
||||
global output_dir
|
||||
global automl_settings
|
||||
global model_uid
|
||||
global forecast_quantiles
|
||||
|
||||
logger.info("Initialization of the run.")
|
||||
parser = argparse.ArgumentParser("Parsing input arguments.")
|
||||
parser.add_argument("--output-dir", dest="out", required=True)
|
||||
parser.add_argument("--model-name", dest="model", default=None)
|
||||
parser.add_argument("--model-uid", dest="model_uid", default=None)
|
||||
parser.add_argument(
|
||||
"--forecast_quantiles",
|
||||
nargs="*",
|
||||
type=float,
|
||||
help="forecast quantiles list",
|
||||
default=None,
|
||||
)
|
||||
|
||||
parsed_args, _ = parser.parse_known_args()
|
||||
model_name = parsed_args.model
|
||||
@@ -55,6 +64,7 @@ def init():
|
||||
target_column_name = automl_settings.get("label_column_name")
|
||||
output_dir = parsed_args.out
|
||||
model_uid = parsed_args.model_uid
|
||||
forecast_quantiles = parsed_args.forecast_quantiles
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
os.environ["AUTOML_IGNORE_PACKAGE_VERSION_INCOMPATIBILITIES".lower()] = "True"
|
||||
|
||||
@@ -126,23 +136,18 @@ def run_backtest(data_input_name: str, file_name: str, experiment: Experiment):
|
||||
)
|
||||
print(f"The model {best_run.properties['model_name']} was registered.")
|
||||
|
||||
_, x_pred = fitted_model.forecast(X_test)
|
||||
x_pred.reset_index(inplace=True, drop=False)
|
||||
columns = [automl_settings[constants.TimeSeries.TIME_COLUMN_NAME]]
|
||||
if automl_settings.get(constants.TimeSeries.GRAIN_COLUMN_NAMES):
|
||||
# We know that fitted_model.grain_column_names is a list.
|
||||
columns.extend(fitted_model.grain_column_names)
|
||||
columns.append(constants.TimeSeriesInternal.DUMMY_TARGET_COLUMN)
|
||||
# Remove featurized columns.
|
||||
x_pred = x_pred[columns]
|
||||
x_pred.rename(
|
||||
{constants.TimeSeriesInternal.DUMMY_TARGET_COLUMN: "predicted_level"},
|
||||
axis=1,
|
||||
inplace=True,
|
||||
)
|
||||
# By default we will have forecast quantiles of 0.5, which is our target
|
||||
if forecast_quantiles:
|
||||
if 0.5 not in forecast_quantiles:
|
||||
forecast_quantiles.append(0.5)
|
||||
fitted_model.quantiles = forecast_quantiles
|
||||
|
||||
x_pred = fitted_model.forecast_quantiles(X_test)
|
||||
x_pred["actual_level"] = y_test
|
||||
x_pred["backtest_iteration"] = f"iteration_{last_training_date}"
|
||||
x_pred.rename({0.5: "predicted_level"}, axis=1, inplace=True)
|
||||
date_safe = RE_INVALID_SYMBOLS.sub("_", last_training_date)
|
||||
|
||||
x_pred.to_csv(os.path.join(output_dir, f"iteration_{date_safe}.csv"), index=False)
|
||||
return x_pred
|
||||
|
||||
|
||||
@@ -365,6 +365,7 @@
|
||||
" step_size=BACKTESTING_PERIOD,\n",
|
||||
" step_number=NUMBER_OF_BACKTESTS,\n",
|
||||
" model_uid=model_uid,\n",
|
||||
" forecast_quantiles=[0.025, 0.975], # Optional\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
@@ -590,6 +591,7 @@
|
||||
" step_size=BACKTESTING_PERIOD,\n",
|
||||
" step_number=NUMBER_OF_BACKTESTS,\n",
|
||||
" model_name=model_name,\n",
|
||||
" forecast_quantiles=[0.025, 0.975],\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -31,6 +31,7 @@ def get_backtest_pipeline(
|
||||
step_number: int,
|
||||
model_name: Optional[str] = None,
|
||||
model_uid: Optional[str] = None,
|
||||
forecast_quantiles: Optional[list] = None,
|
||||
) -> Pipeline:
|
||||
"""
|
||||
:param experiment: The experiment used to run the pipeline.
|
||||
@@ -44,6 +45,7 @@ def get_backtest_pipeline(
|
||||
:param step_size: The number of periods to step back in backtesting.
|
||||
:param step_number: The number of backtesting iterations.
|
||||
:param model_uid: The uid to mark models from this run of the experiment.
|
||||
:param forecast_quantiles: The forecast quantiles that are required in the inference.
|
||||
:return: The pipeline to be used for model retraining.
|
||||
**Note:** The output will be uploaded in the pipeline output
|
||||
called 'score'.
|
||||
@@ -135,6 +137,9 @@ def get_backtest_pipeline(
|
||||
if model_uid is not None:
|
||||
prs_args.append("--model-uid")
|
||||
prs_args.append(model_uid)
|
||||
if forecast_quantiles:
|
||||
prs_args.append("--forecast_quantiles")
|
||||
prs_args.extend(forecast_quantiles)
|
||||
backtest_prs = ParallelRunStep(
|
||||
name=parallel_step_name,
|
||||
parallel_run_config=back_test_config,
|
||||
|
||||
@@ -575,7 +575,32 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"remote_run.download_file(\"outputs/predictions.csv\", \"predictions.csv\")\n",
|
||||
"df_all = pd.read_csv(\"predictions.csv\")"
|
||||
"fcst_df = pd.read_csv(\"predictions.csv\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Note that the rolling forecast can contain multiple predictions for each date, each from a different forecast origin. For example, consider 2012-09-05:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"fcst_df[fcst_df.date == \"2012-09-05\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Here, the forecast origin refers to the latest date of actuals available for a given forecast. The earliest origin in the rolling forecast, 2012-08-31, is the last day in the training data. For origin date 2012-09-01, the forecasts use actual recorded counts from the training data *and* the actual count recorded on 2012-09-01. Note that the model is not retrained for origin dates later than 2012-08-31, but the values for model features, such as lagged values of daily count, are updated.\n",
|
||||
"\n",
|
||||
"Let's calculate the metrics over all rolling forecasts:"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -587,29 +612,17 @@
|
||||
"from azureml.automl.core.shared import constants\n",
|
||||
"from azureml.automl.runtime.shared.score import scoring\n",
|
||||
"from sklearn.metrics import mean_absolute_error, mean_squared_error\n",
|
||||
"from matplotlib import pyplot as plt\n",
|
||||
"\n",
|
||||
"# use automl metrics module\n",
|
||||
"scores = scoring.score_regression(\n",
|
||||
" y_test=df_all[target_column_name],\n",
|
||||
" y_pred=df_all[\"predicted\"],\n",
|
||||
" y_test=fcst_df[target_column_name],\n",
|
||||
" y_pred=fcst_df[\"predicted\"],\n",
|
||||
" metrics=list(constants.Metric.SCALAR_REGRESSION_SET),\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(\"[Test data scores]\\n\")\n",
|
||||
"for key, value in scores.items():\n",
|
||||
" print(\"{}: {:.3f}\".format(key, value))\n",
|
||||
"\n",
|
||||
"# Plot outputs\n",
|
||||
"%matplotlib inline\n",
|
||||
"test_pred = plt.scatter(df_all[target_column_name], df_all[\"predicted\"], color=\"b\")\n",
|
||||
"test_test = plt.scatter(\n",
|
||||
" df_all[target_column_name], df_all[target_column_name], color=\"g\"\n",
|
||||
")\n",
|
||||
"plt.legend(\n",
|
||||
" (test_pred, test_test), (\"prediction\", \"truth\"), loc=\"upper left\", fontsize=8\n",
|
||||
")\n",
|
||||
"plt.show()"
|
||||
" print(\"{}: {:.3f}\".format(key, value))"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -618,36 +631,15 @@
|
||||
"source": [
|
||||
"For more details on what metrics are included and how they are calculated, please refer to [supported metrics](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-understand-automated-ml#regressionforecasting-metrics). You could also calculate residuals, like described [here](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-understand-automated-ml#residuals).\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"Since we did a rolling evaluation on the test set, we can analyze the predictions by their forecast horizon relative to the rolling origin. The model was initially trained at a forecast horizon of 14, so each prediction from the model is associated with a horizon value from 1 to 14. The horizon values are in a column named, \"horizon_origin,\" in the prediction set. For example, we can calculate some of the error metrics grouped by the horizon:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from metrics_helper import MAPE, APE\n",
|
||||
"\n",
|
||||
"df_all.groupby(\"horizon_origin\").apply(\n",
|
||||
" lambda df: pd.Series(\n",
|
||||
" {\n",
|
||||
" \"MAPE\": MAPE(df[target_column_name], df[\"predicted\"]),\n",
|
||||
" \"RMSE\": np.sqrt(\n",
|
||||
" mean_squared_error(df[target_column_name], df[\"predicted\"])\n",
|
||||
" ),\n",
|
||||
" \"MAE\": mean_absolute_error(df[target_column_name], df[\"predicted\"]),\n",
|
||||
" }\n",
|
||||
" )\n",
|
||||
")"
|
||||
"The rolling forecast metric values are very high in comparison to the validation metrics reported by the AutoML job. What's going on here? We will investigate in the following cells!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"To drill down more, we can look at the distributions of APE (absolute percentage error) by horizon. From the chart, it is clear that the overall MAPE is being skewed by one particular point where the actual value is of small absolute value."
|
||||
"### Forecast versus actuals plot\n",
|
||||
"We will plot predictions and actuals on a time series plot. Since there are many forecasts for each date, we select the 14-day-ahead forecast from each forecast origin for our comparison."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -656,21 +648,55 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_all_APE = df_all.assign(APE=APE(df_all[target_column_name], df_all[\"predicted\"]))\n",
|
||||
"APEs = [\n",
|
||||
" df_all_APE[df_all[\"horizon_origin\"] == h].APE.values\n",
|
||||
" for h in range(1, forecast_horizon + 1)\n",
|
||||
"]\n",
|
||||
"from matplotlib import pyplot as plt\n",
|
||||
"\n",
|
||||
"%matplotlib inline\n",
|
||||
"plt.boxplot(APEs)\n",
|
||||
"plt.yscale(\"log\")\n",
|
||||
"plt.xlabel(\"horizon\")\n",
|
||||
"plt.ylabel(\"APE (%)\")\n",
|
||||
"plt.title(\"Absolute Percentage Errors by Forecast Horizon\")\n",
|
||||
"\n",
|
||||
"fcst_df_h14 = (\n",
|
||||
" fcst_df.groupby(\"forecast_origin\", as_index=False)\n",
|
||||
" .last()\n",
|
||||
" .drop(columns=[\"forecast_origin\"])\n",
|
||||
")\n",
|
||||
"fcst_df_h14.set_index(time_column_name, inplace=True)\n",
|
||||
"plt.plot(fcst_df_h14[[target_column_name, \"predicted\"]])\n",
|
||||
"plt.xticks(rotation=45)\n",
|
||||
"plt.title(f\"Predicted vs. Actuals\")\n",
|
||||
"plt.legend([\"actual\", \"14-day-ahead forecast\"])\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Looking at the plot, there are two clear issues:\n",
|
||||
"1. An anomalously low count value on October 29th, 2012.\n",
|
||||
"2. End-of-year holidays (Thanksgiving and Christmas) in late November and late December.\n",
|
||||
"\n",
|
||||
"What happened on Oct. 29th, 2012? That day, Hurricane Sandy brought severe storm surge flooding to the east coast of the United States, particularly around New York City. This is certainly an anomalous event that the model did not account for!\n",
|
||||
"\n",
|
||||
"As for the late year holidays, the model apparently did not learn to account for the full reduction of bike share rentals on these major holidays. The training data covers 2011 and early 2012, so the model fit only had access to a single occurrence of these holidays. This makes it challenging to resolve holiday effects; however, a larger AutoML model search may result in a better model that is more holiday-aware.\n",
|
||||
"\n",
|
||||
"If we filter the predictions prior to the Thanksgiving holiday and remove the anomalous day of 2012-10-29, the metrics are closer to validation levels:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"date_filter = (fcst_df.date != \"2012-10-29\") & (fcst_df.date < \"2012-11-22\")\n",
|
||||
"scores = scoring.score_regression(\n",
|
||||
" y_test=fcst_df[date_filter][target_column_name],\n",
|
||||
" y_pred=fcst_df[date_filter][\"predicted\"],\n",
|
||||
" metrics=list(constants.Metric.SCALAR_REGRESSION_SET),\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(\"[Test data scores (filtered)]\\n\")\n",
|
||||
"for key, value in scores.items():\n",
|
||||
" print(\"{}: {:.3f}\".format(key, value))"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
@@ -711,7 +737,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.5"
|
||||
"version": "3.7.13"
|
||||
},
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
|
||||
@@ -36,18 +36,18 @@ y_test_df = (
|
||||
|
||||
fitted_model = joblib.load("model.pkl")
|
||||
|
||||
y_pred, X_trans = fitted_model.rolling_evaluation(X_test_df, y_test_df.values)
|
||||
X_rf = fitted_model.rolling_forecast(X_test_df, y_test_df.values, step=1)
|
||||
|
||||
# Add predictions, actuals, and horizon relative to rolling origin to the test feature data
|
||||
assign_dict = {
|
||||
"horizon_origin": X_trans["horizon_origin"].values,
|
||||
"predicted": y_pred,
|
||||
target_column_name: y_test_df[target_column_name].values,
|
||||
fitted_model.forecast_origin_column_name: "forecast_origin",
|
||||
fitted_model.forecast_column_name: "predicted",
|
||||
fitted_model.actual_column_name: target_column_name,
|
||||
}
|
||||
df_all = X_test_df.assign(**assign_dict)
|
||||
X_rf.rename(columns=assign_dict, inplace=True)
|
||||
|
||||
file_name = "outputs/predictions.csv"
|
||||
export_csv = df_all.to_csv(file_name, header=True)
|
||||
export_csv = X_rf.to_csv(file_name, header=True)
|
||||
|
||||
# Upload the predictions into artifacts
|
||||
run.upload_file(name=file_name, path_or_stream=file_name)
|
||||
|
||||
@@ -758,7 +758,15 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Forecasting farther than the forecast horizon <a id=\"recursive forecasting\"></a>\n",
|
||||
"When the forecast destination, or the latest date in the prediction data frame, is farther into the future than the specified forecast horizon, the `forecast()` function will still make point predictions out to the later date using a recursive operation mode. Internally, the method recursively applies the regular forecaster to generate context so that we can forecast further into the future. \n",
|
||||
"When the forecast destination, or the latest date in the prediction data frame, is farther into the future than the specified forecast horizon, the forecaster must be iteratively applied. Here, we advance the forecast origin on each iteration over the prediction window, predicting `max_horizon` periods ahead on each iteration. There are two choices for the context data to use as the forecaster advances into the prediction window:\n",
|
||||
"\n",
|
||||
"1. We can use forecasted values from previous iterations (recursive forecast),\n",
|
||||
"2. We can use known, actual values of the target if they are available (rolling forecast).\n",
|
||||
"\n",
|
||||
"The first method is useful in a true forecasting scenario when we do not yet know the actual target values while the second is useful in an evaluation scenario where we want to compute accuracy metrics for the `max_horizon`-period-ahead forecaster over a long test set. We refer to the first as a **recursive forecast** since we apply the forecaster recursively over the prediction window and the second as a **rolling forecast** since we roll forward over known actuals.\n",
|
||||
"\n",
|
||||
"### Recursive forecasting\n",
|
||||
"By default, the `forecast()` function will make point predictions out to the later date using a recursive operation mode. Internally, the method recursively applies the regular forecaster to generate context so that we can forecast further into the future. \n",
|
||||
"\n",
|
||||
"To illustrate the use-case and operation of recursive forecasting, we'll consider an example with a single time-series where the forecasting period directly follows the training period and is twice as long as the forecasting horizon given at training time.\n",
|
||||
"\n",
|
||||
@@ -818,6 +826,35 @@
|
||||
"np.array_equal(y_pred_all, y_pred_long)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Rolling forecasts\n",
|
||||
"A rolling forecast is a similar concept to the recursive forecasts described above except that we use known actual values of the target for our context data. We have provided a different, public method for this called `rolling_forecast`. In addition to test data and actuals (`X_test` and `y_test`), `rolling_forecast` also accepts an optional `step` parameter that controls how far the origin advances on each iteration. The recursive forecast mode uses a fixed step of `max_horizon` while `rolling_forecast` defaults to a step size of 1, but can be set to any integer from 1 to `max_horizon`, inclusive.\n",
|
||||
"\n",
|
||||
"Let's see what the rolling forecast looks like on the long test set with the step set to 1:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"X_rf = fitted_model.rolling_forecast(X_test_long, y_test_long, step=1)\n",
|
||||
"X_rf.head(n=12)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Notice that `rolling_forecast` has returned a single DataFrame containing all results and has generated some new columns: `_automl_forecast_origin`, `_automl_forecast_y`, and `_automl_actual_y`. These are the origin date for each forecast, the forecasted value and the actual value, respectively. Note that \"y\" in the forecast and actual column names will generally be replaced by the target column name supplied to AutoML.\n",
|
||||
"\n",
|
||||
"The output above shows forecasts for two prediction windows, the first with origin at the end of the training set and the second including the first observation in the test set (2000-01-01 06:00:00). Since the forecast windows overlap, there are multiple forecasts for most dates which are associated with different origin dates."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -880,7 +917,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.5"
|
||||
"version": "3.7.13"
|
||||
},
|
||||
"tags": [
|
||||
"Forecasting",
|
||||
@@ -894,5 +931,5 @@
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
@@ -325,7 +325,7 @@
|
||||
"source": [
|
||||
"### Setting forecaster maximum horizon \n",
|
||||
"\n",
|
||||
"The forecast horizon is the number of periods into the future that the model should predict. Here, we set the horizon to 12 periods (i.e. 12 months). Notice that this is much shorter than the number of months in the test set; we will need to use a rolling test to evaluate the performance on the whole test set. For more discussion of forecast horizons and guiding principles for setting them, please see the [energy demand notebook](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand). "
|
||||
"The forecast horizon is the number of periods into the future that the model should predict. Here, we set the horizon to 14 periods (i.e. 14 days). Notice that this is much shorter than the number of months in the test set; we will need to use a rolling test to evaluate the performance on the whole test set. For more discussion of forecast horizons and guiding principles for setting them, please see the [energy demand notebook](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand). "
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -337,7 +337,7 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"forecast_horizon = 12"
|
||||
"forecast_horizon = 14"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -699,5 +699,5 @@
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
@@ -4,7 +4,6 @@ import os
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from pandas.tseries.frequencies import to_offset
|
||||
from sklearn.externals import joblib
|
||||
from sklearn.metrics import mean_absolute_error, mean_squared_error
|
||||
|
||||
@@ -19,219 +18,8 @@ except ImportError:
|
||||
_torch_present = False
|
||||
|
||||
|
||||
def align_outputs(
|
||||
y_predicted,
|
||||
X_trans,
|
||||
X_test,
|
||||
y_test,
|
||||
predicted_column_name="predicted",
|
||||
horizon_colname="horizon_origin",
|
||||
):
|
||||
"""
|
||||
Demonstrates how to get the output aligned to the inputs
|
||||
using pandas indexes. Helps understand what happened if
|
||||
the output's shape differs from the input shape, or if
|
||||
the data got re-sorted by time and grain during forecasting.
|
||||
|
||||
Typical causes of misalignment are:
|
||||
* we predicted some periods that were missing in actuals -> drop from eval
|
||||
* model was asked to predict past max_horizon -> increase max horizon
|
||||
* data at start of X_test was needed for lags -> provide previous periods
|
||||
"""
|
||||
if horizon_colname in X_trans:
|
||||
df_fcst = pd.DataFrame(
|
||||
{
|
||||
predicted_column_name: y_predicted,
|
||||
horizon_colname: X_trans[horizon_colname],
|
||||
}
|
||||
)
|
||||
else:
|
||||
df_fcst = pd.DataFrame({predicted_column_name: y_predicted})
|
||||
|
||||
# y and X outputs are aligned by forecast() function contract
|
||||
df_fcst.index = X_trans.index
|
||||
|
||||
# align original X_test to y_test
|
||||
X_test_full = X_test.copy()
|
||||
X_test_full[target_column_name] = y_test
|
||||
|
||||
# X_test_full's index does not include origin, so reset for merge
|
||||
df_fcst.reset_index(inplace=True)
|
||||
X_test_full = X_test_full.reset_index().drop(columns="index")
|
||||
together = df_fcst.merge(X_test_full, how="right")
|
||||
|
||||
# drop rows where prediction or actuals are nan
|
||||
# happens because of missing actuals
|
||||
# or at edges of time due to lags/rolling windows
|
||||
clean = together[
|
||||
together[[target_column_name, predicted_column_name]].notnull().all(axis=1)
|
||||
]
|
||||
return clean
|
||||
|
||||
|
||||
def do_rolling_forecast_with_lookback(
|
||||
fitted_model, X_test, y_test, max_horizon, X_lookback, y_lookback, freq="D"
|
||||
):
|
||||
"""
|
||||
Produce forecasts on a rolling origin over the given test set.
|
||||
|
||||
Each iteration makes a forecast for the next 'max_horizon' periods
|
||||
with respect to the current origin, then advances the origin by the
|
||||
horizon time duration. The prediction context for each forecast is set so
|
||||
that the forecaster uses the actual target values prior to the current
|
||||
origin time for constructing lag features.
|
||||
|
||||
This function returns a concatenated DataFrame of rolling forecasts.
|
||||
"""
|
||||
print("Using lookback of size: ", y_lookback.size)
|
||||
df_list = []
|
||||
origin_time = X_test[time_column_name].min()
|
||||
X = X_lookback.append(X_test)
|
||||
y = np.concatenate((y_lookback, y_test), axis=0)
|
||||
while origin_time <= X_test[time_column_name].max():
|
||||
# Set the horizon time - end date of the forecast
|
||||
horizon_time = origin_time + max_horizon * to_offset(freq)
|
||||
|
||||
# Extract test data from an expanding window up-to the horizon
|
||||
expand_wind = X[time_column_name] < horizon_time
|
||||
X_test_expand = X[expand_wind]
|
||||
y_query_expand = np.zeros(len(X_test_expand)).astype(float)
|
||||
y_query_expand.fill(np.NaN)
|
||||
|
||||
if origin_time != X[time_column_name].min():
|
||||
# Set the context by including actuals up-to the origin time
|
||||
test_context_expand_wind = X[time_column_name] < origin_time
|
||||
context_expand_wind = X_test_expand[time_column_name] < origin_time
|
||||
y_query_expand[context_expand_wind] = y[test_context_expand_wind]
|
||||
|
||||
# Print some debug info
|
||||
print(
|
||||
"Horizon_time:",
|
||||
horizon_time,
|
||||
" origin_time: ",
|
||||
origin_time,
|
||||
" max_horizon: ",
|
||||
max_horizon,
|
||||
" freq: ",
|
||||
freq,
|
||||
)
|
||||
print("expand_wind: ", expand_wind)
|
||||
print("y_query_expand")
|
||||
print(y_query_expand)
|
||||
print("X_test")
|
||||
print(X)
|
||||
print("X_test_expand")
|
||||
print(X_test_expand)
|
||||
print("Type of X_test_expand: ", type(X_test_expand))
|
||||
print("Type of y_query_expand: ", type(y_query_expand))
|
||||
|
||||
print("y_query_expand")
|
||||
print(y_query_expand)
|
||||
|
||||
# Make a forecast out to the maximum horizon
|
||||
# y_fcst, X_trans = y_query_expand, X_test_expand
|
||||
y_fcst, X_trans = fitted_model.forecast(X_test_expand, y_query_expand)
|
||||
|
||||
print("y_fcst")
|
||||
print(y_fcst)
|
||||
|
||||
# Align forecast with test set for dates within
|
||||
# the current rolling window
|
||||
trans_tindex = X_trans.index.get_level_values(time_column_name)
|
||||
trans_roll_wind = (trans_tindex >= origin_time) & (trans_tindex < horizon_time)
|
||||
test_roll_wind = expand_wind & (X[time_column_name] >= origin_time)
|
||||
df_list.append(
|
||||
align_outputs(
|
||||
y_fcst[trans_roll_wind],
|
||||
X_trans[trans_roll_wind],
|
||||
X[test_roll_wind],
|
||||
y[test_roll_wind],
|
||||
)
|
||||
)
|
||||
|
||||
# Advance the origin time
|
||||
origin_time = horizon_time
|
||||
|
||||
return pd.concat(df_list, ignore_index=True)
|
||||
|
||||
|
||||
def do_rolling_forecast(fitted_model, X_test, y_test, max_horizon, freq="D"):
|
||||
"""
|
||||
Produce forecasts on a rolling origin over the given test set.
|
||||
|
||||
Each iteration makes a forecast for the next 'max_horizon' periods
|
||||
with respect to the current origin, then advances the origin by the
|
||||
horizon time duration. The prediction context for each forecast is set so
|
||||
that the forecaster uses the actual target values prior to the current
|
||||
origin time for constructing lag features.
|
||||
|
||||
This function returns a concatenated DataFrame of rolling forecasts.
|
||||
"""
|
||||
df_list = []
|
||||
origin_time = X_test[time_column_name].min()
|
||||
while origin_time <= X_test[time_column_name].max():
|
||||
# Set the horizon time - end date of the forecast
|
||||
horizon_time = origin_time + max_horizon * to_offset(freq)
|
||||
|
||||
# Extract test data from an expanding window up-to the horizon
|
||||
expand_wind = X_test[time_column_name] < horizon_time
|
||||
X_test_expand = X_test[expand_wind]
|
||||
y_query_expand = np.zeros(len(X_test_expand)).astype(float)
|
||||
y_query_expand.fill(np.NaN)
|
||||
|
||||
if origin_time != X_test[time_column_name].min():
|
||||
# Set the context by including actuals up-to the origin time
|
||||
test_context_expand_wind = X_test[time_column_name] < origin_time
|
||||
context_expand_wind = X_test_expand[time_column_name] < origin_time
|
||||
y_query_expand[context_expand_wind] = y_test[test_context_expand_wind]
|
||||
|
||||
# Print some debug info
|
||||
print(
|
||||
"Horizon_time:",
|
||||
horizon_time,
|
||||
" origin_time: ",
|
||||
origin_time,
|
||||
" max_horizon: ",
|
||||
max_horizon,
|
||||
" freq: ",
|
||||
freq,
|
||||
)
|
||||
print("expand_wind: ", expand_wind)
|
||||
print("y_query_expand")
|
||||
print(y_query_expand)
|
||||
print("X_test")
|
||||
print(X_test)
|
||||
print("X_test_expand")
|
||||
print(X_test_expand)
|
||||
print("Type of X_test_expand: ", type(X_test_expand))
|
||||
print("Type of y_query_expand: ", type(y_query_expand))
|
||||
print("y_query_expand")
|
||||
print(y_query_expand)
|
||||
|
||||
# Make a forecast out to the maximum horizon
|
||||
y_fcst, X_trans = fitted_model.forecast(X_test_expand, y_query_expand)
|
||||
|
||||
print("y_fcst")
|
||||
print(y_fcst)
|
||||
|
||||
# Align forecast with test set for dates within the
|
||||
# current rolling window
|
||||
trans_tindex = X_trans.index.get_level_values(time_column_name)
|
||||
trans_roll_wind = (trans_tindex >= origin_time) & (trans_tindex < horizon_time)
|
||||
test_roll_wind = expand_wind & (X_test[time_column_name] >= origin_time)
|
||||
df_list.append(
|
||||
align_outputs(
|
||||
y_fcst[trans_roll_wind],
|
||||
X_trans[trans_roll_wind],
|
||||
X_test[test_roll_wind],
|
||||
y_test[test_roll_wind],
|
||||
)
|
||||
)
|
||||
|
||||
# Advance the origin time
|
||||
origin_time = horizon_time
|
||||
|
||||
return pd.concat(df_list, ignore_index=True)
|
||||
def map_location_cuda(storage, loc):
|
||||
return storage.cuda()
|
||||
|
||||
|
||||
def APE(actual, pred):
|
||||
@@ -254,10 +42,6 @@ def MAPE(actual, pred):
|
||||
return np.mean(APE(actual_safe, pred_safe))
|
||||
|
||||
|
||||
def map_location_cuda(storage, loc):
|
||||
return storage.cuda()
|
||||
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--max_horizon",
|
||||
@@ -303,7 +87,6 @@ print(model_path)
|
||||
run = Run.get_context()
|
||||
# get input dataset by name
|
||||
test_dataset = run.input_datasets["test_data"]
|
||||
lookback_dataset = run.input_datasets["lookback_data"]
|
||||
|
||||
grain_column_names = []
|
||||
|
||||
@@ -312,15 +95,8 @@ df = test_dataset.to_pandas_dataframe()
|
||||
print("Read df")
|
||||
print(df)
|
||||
|
||||
X_test_df = test_dataset.drop_columns(columns=[target_column_name])
|
||||
y_test_df = test_dataset.with_timestamp_columns(None).keep_columns(
|
||||
columns=[target_column_name]
|
||||
)
|
||||
|
||||
X_lookback_df = lookback_dataset.drop_columns(columns=[target_column_name])
|
||||
y_lookback_df = lookback_dataset.with_timestamp_columns(None).keep_columns(
|
||||
columns=[target_column_name]
|
||||
)
|
||||
X_test_df = df
|
||||
y_test = df.pop(target_column_name).to_numpy()
|
||||
|
||||
_, ext = os.path.splitext(model_path)
|
||||
if ext == ".pt":
|
||||
@@ -336,37 +112,20 @@ else:
|
||||
# Load the sklearn pipeline.
|
||||
fitted_model = joblib.load(model_path)
|
||||
|
||||
if hasattr(fitted_model, "get_lookback"):
|
||||
lookback = fitted_model.get_lookback()
|
||||
df_all = do_rolling_forecast_with_lookback(
|
||||
fitted_model,
|
||||
X_test_df.to_pandas_dataframe(),
|
||||
y_test_df.to_pandas_dataframe().values.T[0],
|
||||
max_horizon,
|
||||
X_lookback_df.to_pandas_dataframe()[-lookback:],
|
||||
y_lookback_df.to_pandas_dataframe().values.T[0][-lookback:],
|
||||
freq,
|
||||
)
|
||||
else:
|
||||
df_all = do_rolling_forecast(
|
||||
fitted_model,
|
||||
X_test_df.to_pandas_dataframe(),
|
||||
y_test_df.to_pandas_dataframe().values.T[0],
|
||||
max_horizon,
|
||||
freq,
|
||||
)
|
||||
X_rf = fitted_model.rolling_forecast(X_test_df, y_test, step=1)
|
||||
assign_dict = {
|
||||
fitted_model.forecast_origin_column_name: "forecast_origin",
|
||||
fitted_model.forecast_column_name: "predicted",
|
||||
fitted_model.actual_column_name: target_column_name,
|
||||
}
|
||||
X_rf.rename(columns=assign_dict, inplace=True)
|
||||
|
||||
print(df_all)
|
||||
|
||||
print("target values:::")
|
||||
print(df_all[target_column_name])
|
||||
print("predicted values:::")
|
||||
print(df_all["predicted"])
|
||||
print(X_rf.head())
|
||||
|
||||
# Use the AutoML scoring module
|
||||
regression_metrics = list(constants.REGRESSION_SCALAR_SET)
|
||||
y_test = np.array(df_all[target_column_name])
|
||||
y_pred = np.array(df_all["predicted"])
|
||||
y_test = np.array(X_rf[target_column_name])
|
||||
y_pred = np.array(X_rf["predicted"])
|
||||
scores = scoring.score_regression(y_test, y_pred, regression_metrics)
|
||||
|
||||
print("scores:")
|
||||
@@ -376,11 +135,11 @@ for key, value in scores.items():
|
||||
run.log(key, value)
|
||||
|
||||
print("Simple forecasting model")
|
||||
rmse = np.sqrt(mean_squared_error(df_all[target_column_name], df_all["predicted"]))
|
||||
rmse = np.sqrt(mean_squared_error(X_rf[target_column_name], X_rf["predicted"]))
|
||||
print("[Test Data] \nRoot Mean squared error: %.2f" % rmse)
|
||||
mae = mean_absolute_error(df_all[target_column_name], df_all["predicted"])
|
||||
mae = mean_absolute_error(X_rf[target_column_name], X_rf["predicted"])
|
||||
print("mean_absolute_error score: %.2f" % mae)
|
||||
print("MAPE: %.2f" % MAPE(df_all[target_column_name], df_all["predicted"]))
|
||||
print("MAPE: %.2f" % MAPE(X_rf[target_column_name], X_rf["predicted"]))
|
||||
|
||||
run.log("rmse", rmse)
|
||||
run.log("mae", mae)
|
||||
|
||||
@@ -251,8 +251,16 @@
|
||||
"source": [
|
||||
"### Set up training parameters\n",
|
||||
"\n",
|
||||
"This dictionary defines the AutoML and hierarchy settings. For this forecasting task we need to define several settings inncluding the name of the time column, the maximum forecast horizon, the hierarchy definition, and the level of the hierarchy at which to train.\n",
|
||||
"We need to provide ``ForecastingParameters``, ``AutoMLConfig`` and ``HTSTrainParameters`` objects. For the forecasting task we need to define several settings including the name of the time column, the maximum forecast horizon, the hierarchy definition, and the level of the hierarchy at which to train.\n",
|
||||
"\n",
|
||||
"#### ``ForecastingParameters`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **forecast_horizon** | The forecast horizon is how many periods forward you would like to forecast. This integer horizon is in units of the timeseries frequency (e.g. daily, weekly). Periods are inferred from your data. |\n",
|
||||
"| **time_column_name** | The name of your time column. |\n",
|
||||
"| **time_series_id_column_names** | The column names used to uniquely identify timeseries in data that has multiple rows with the same timestamp. |\n",
|
||||
"\n",
|
||||
"#### ``AutoMLConfig`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **task** | forecasting |\n",
|
||||
@@ -262,18 +270,21 @@
|
||||
"| **iterations** | Number of models to train. This is optional but provides customers with greater control on exit criteria. |\n",
|
||||
"| **experiment_timeout_hours** | Maximum amount of time in hours that the experiment can take before it terminates. This is optional but provides customers with greater control on exit criteria. |\n",
|
||||
"| **label_column_name** | The name of the label column. |\n",
|
||||
"| **forecast_horizon** | The forecast horizon is how many periods forward you would like to forecast. This integer horizon is in units of the timeseries frequency (e.g. daily, weekly). Periods are inferred from your data. |\n",
|
||||
"|**n_cross_validations**|Number of cross-validation folds to use for model/pipeline selection. The default value is \"auto\", in which case AutoMl determines the number of cross-validations automatically, if a validation set is not provided. Or users could specify an integer value.\n",
|
||||
"|**cv_step_size**|Number of periods between two consecutive cross-validation folds. The default value is \"auto\", in which case AutoMl determines the cross-validation step size automatically, if a validation set is not provided. Or users could specify an integer value.\n",
|
||||
"| **n_cross_validations** | Number of cross-validation folds to use for model/pipeline selection. The default value is \\\"auto\\\", in which case AutoMl determines the number of cross-validations automatically, if a validation set is not provided. Or users could specify an integer value. |\n",
|
||||
"| **cv_step_size** | Number of periods between two consecutive cross-validation folds. The default value is \\\"auto\\\", in which case AutoMl determines the cross-validation step size automatically, if a validation set is not provided. Or users could specify an integer value. |\n",
|
||||
"| **enable_early_stopping** | Flag to enable early termination if the score is not improving in the short term. |\n",
|
||||
"| **time_column_name** | The name of your time column. |\n",
|
||||
"| **hierarchy_column_names** | The names of columns that define the hierarchical structure of the data from highest level to most granular. |\n",
|
||||
"| **training_level** | The level of the hierarchy to be used for training models. |\n",
|
||||
"| **enable_engineered_explanations** | Engineered feature explanations will be downloaded if enable_engineered_explanations flag is set to True. By default it is set to False to save storage space. |\n",
|
||||
"| **time_series_id_column_name** | The column names used to uniquely identify timeseries in data that has multiple rows with the same timestamp. |\n",
|
||||
"| **track_child_runs** | Flag to disable tracking of child runs. Only best run is tracked if the flag is set to False (this includes the model and metrics of the run). |\n",
|
||||
"| **pipeline_fetch_max_batch_size** | Determines how many pipelines (training algorithms) to fetch at a time for training, this helps reduce throttling when training at large scale. |\n",
|
||||
"| **model_explainability** | Flag to disable explaining the best automated ML model at the end of all training iterations. The default is True and will block non-explainable models which may impact the forecast accuracy. For more information, see [Interpretability: model explanations in automated machine learning](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-machine-learning-interpretability-automl). |"
|
||||
"| **model_explainability** | Flag to disable explaining the best automated ML model at the end of all training iterations. The default is True and will block non-explainable models which may impact the forecast accuracy. For more information, see [Interpretability: model explanations in automated machine learning](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-machine-learning-interpretability-automl). |\n",
|
||||
"\n",
|
||||
"#### ``HTSTrainParameters`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **automl_settings** | ``AutoMLConfig`` object.\n",
|
||||
"| **hierarchy_column_names** | The names of columns that define the hierarchical structure of the data from highest level to most granular. |\n",
|
||||
"| **training_level** | The level of the hierarchy to be used for training models. |\n",
|
||||
"| **enable_engineered_explanations** | The switch controls engineered explanations. |"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -287,6 +298,9 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.train.automl.runtime._hts.hts_parameters import HTSTrainParameters\n",
|
||||
"from azureml.automl.core.forecasting_parameters import ForecastingParameters\n",
|
||||
"from azureml.train.automl.automlconfig import AutoMLConfig\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"model_explainability = True\n",
|
||||
"\n",
|
||||
@@ -300,24 +314,26 @@
|
||||
"label_column_name = \"quantity\"\n",
|
||||
"forecast_horizon = 7\n",
|
||||
"\n",
|
||||
"forecasting_parameters = ForecastingParameters(\n",
|
||||
" time_column_name=time_column_name,\n",
|
||||
" forecast_horizon=forecast_horizon,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"automl_settings = {\n",
|
||||
" \"task\": \"forecasting\",\n",
|
||||
" \"primary_metric\": \"normalized_root_mean_squared_error\",\n",
|
||||
" \"label_column_name\": label_column_name,\n",
|
||||
" \"time_column_name\": time_column_name,\n",
|
||||
" \"forecast_horizon\": forecast_horizon,\n",
|
||||
" \"hierarchy_column_names\": hierarchy,\n",
|
||||
" \"hierarchy_training_level\": training_level,\n",
|
||||
" \"track_child_runs\": False,\n",
|
||||
" \"pipeline_fetch_max_batch_size\": 15,\n",
|
||||
" \"model_explainability\": model_explainability,\n",
|
||||
" \"n_cross_validations\": \"auto\", # Feel free to set to a small integer (>=2) if runtime is an issue.\n",
|
||||
" \"cv_step_size\": \"auto\",\n",
|
||||
"automl_settings = AutoMLConfig(\n",
|
||||
" task=\"forecasting\",\n",
|
||||
" primary_metric=\"normalized_root_mean_squared_error\",\n",
|
||||
" experiment_timeout_hours=1,\n",
|
||||
" label_column_name=label_column_name,\n",
|
||||
" track_child_runs=False,\n",
|
||||
" forecasting_parameters=forecasting_parameters,\n",
|
||||
" pipeline_fetch_max_batch_size=15,\n",
|
||||
" model_explainability=model_explainability,\n",
|
||||
" n_cross_validations=\"auto\", # Feel free to set to a small integer (>=2) if runtime is an issue.\n",
|
||||
" cv_step_size=\"auto\",\n",
|
||||
" # The following settings are specific to this sample and should be adjusted according to your own needs.\n",
|
||||
" \"iteration_timeout_minutes\": 10,\n",
|
||||
" \"iterations\": 10,\n",
|
||||
"}\n",
|
||||
" iteration_timeout_minutes=10,\n",
|
||||
" iterations=15,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"hts_parameters = HTSTrainParameters(\n",
|
||||
" automl_settings=automl_settings,\n",
|
||||
@@ -345,6 +361,7 @@
|
||||
"* **node_count:** The number of compute nodes to be used for running the user script. We recommend to start with 3 and increase the node_count if the training time is taking too long.\n",
|
||||
"* **process_count_per_node:** Process count per node, we recommend 2:1 ratio for number of cores: number of processes per node. eg. If node has 16 cores then configure 8 or less process count per node or optimal performance.\n",
|
||||
"* **train_pipeline_parameters:** The set of configuration parameters defined in the previous section. \n",
|
||||
"* **run_invocation_timeout:** Maximum amount of time in seconds that the ``ParallelRunStep`` class is allowed. This is optional but provides customers with greater control on exit criteria. This must be greater than ``experiment_timeout_hours`` by at least 300 seconds.\n",
|
||||
"\n",
|
||||
"Calling this method will create a new aggregated dataset which is generated dynamically on pipeline execution."
|
||||
]
|
||||
@@ -365,6 +382,7 @@
|
||||
" node_count=2,\n",
|
||||
" process_count_per_node=8,\n",
|
||||
" train_pipeline_parameters=hts_parameters,\n",
|
||||
" run_invocation_timeout=3900,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -379,8 +379,16 @@
|
||||
"source": [
|
||||
"### Set up training parameters\n",
|
||||
"\n",
|
||||
"This dictionary defines the AutoML and many models settings. For this forecasting task we need to define several settings inncluding the name of the time column, the maximum forecast horizon, and the partition column name definition.\n",
|
||||
"We need to provide ``ForecastingParameters``, ``AutoMLConfig`` and ``ManyModelsTrainParameters`` objects. For the forecasting task we also need to define several settings including the name of the time column, the maximum forecast horizon, and the partition column name definition.\n",
|
||||
"\n",
|
||||
"#### ``ForecastingParameters`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **forecast_horizon** | The forecast horizon is how many periods forward you would like to forecast. This integer horizon is in units of the timeseries frequency (e.g. daily, weekly). Periods are inferred from your data. |\n",
|
||||
"| **time_column_name** | The name of your time column. |\n",
|
||||
"| **time_series_id_column_names** | The column names used to uniquely identify timeseries in data that has multiple rows with the same timestamp. |\n",
|
||||
"\n",
|
||||
"#### ``AutoMLConfig`` arguments\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **task** | forecasting |\n",
|
||||
@@ -390,13 +398,10 @@
|
||||
"| **iterations** | Number of models to train. This is optional but provides customers with greater control on exit criteria. |\n",
|
||||
"| **experiment_timeout_hours** | Maximum amount of time in hours that the experiment can take before it terminates. This is optional but provides customers with greater control on exit criteria. |\n",
|
||||
"| **label_column_name** | The name of the label column. |\n",
|
||||
"| **forecast_horizon** | The forecast horizon is how many periods forward you would like to forecast. This integer horizon is in units of the timeseries frequency (e.g. daily, weekly). Periods are inferred from your data. |\n",
|
||||
"| **n_cross_validations** | Number of cross validation splits. The default value is \"auto\", in which case AutoMl determines the number of cross-validations automatically, if a validation set is not provided. Or users could specify an integer value. Rolling Origin Validation is used to split time-series in a temporally consistent way. |\n",
|
||||
"|**cv_step_size**|Number of periods between two consecutive cross-validation folds. The default value is \"auto\", in which case AutoMl determines the cross-validation step size automatically, if a validation set is not provided. Or users could specify an integer value.\n",
|
||||
"| **n_cross_validations** | Number of cross validation splits. The default value is \\\"auto\\\", in which case AutoMl determines the number of cross-validations automatically, if a validation set is not provided. Or users could specify an integer value. Rolling Origin Validation is used to split time-series in a temporally consistent way. |\n",
|
||||
"| **cv_step_size** |Number of periods between two consecutive cross-validation folds. The default value is \\\"auto\\\", in which case AutoMl determines the cross-validation step size automatically, if a validation set is not provided. Or users could specify an integer value. |\n",
|
||||
"| **enable_early_stopping** | Flag to enable early termination if the score is not improving in the short term. |\n",
|
||||
"| **time_column_name** | The name of your time column. |\n",
|
||||
"| **enable_engineered_explanations** | Engineered feature explanations will be downloaded if enable_engineered_explanations flag is set to True. By default it is set to False to save storage space. |\n",
|
||||
"| **time_series_id_column_names** | The column names used to uniquely identify timeseries in data that has multiple rows with the same timestamp. |\n",
|
||||
"| **track_child_runs** | Flag to disable tracking of child runs. Only best run is tracked if the flag is set to False (this includes the model and metrics of the run). |\n",
|
||||
"| **pipeline_fetch_max_batch_size** | Determines how many pipelines (training algorithms) to fetch at a time for training, this helps reduce throttling when training at large scale. |\n",
|
||||
"| **partition_column_names** | The names of columns used to group your models. For timeseries, the groups must not split up individual time-series. That is, each group must contain one or more whole time-series. |"
|
||||
@@ -415,23 +420,30 @@
|
||||
"from azureml.train.automl.runtime._many_models.many_models_parameters import (\n",
|
||||
" ManyModelsTrainParameters,\n",
|
||||
")\n",
|
||||
"from azureml.automl.core.forecasting_parameters import ForecastingParameters\n",
|
||||
"from azureml.train.automl.automlconfig import AutoMLConfig\n",
|
||||
"\n",
|
||||
"partition_column_names = [\"Store\", \"Brand\"]\n",
|
||||
"automl_settings = {\n",
|
||||
" \"task\": \"forecasting\",\n",
|
||||
" \"primary_metric\": \"normalized_root_mean_squared_error\",\n",
|
||||
" \"iteration_timeout_minutes\": 10, # This needs to be changed based on the dataset. We ask customer to explore how long training is taking before settings this value\n",
|
||||
" \"iterations\": 15,\n",
|
||||
" \"experiment_timeout_hours\": 0.25,\n",
|
||||
" \"label_column_name\": \"Quantity\",\n",
|
||||
" \"n_cross_validations\": \"auto\", # Feel free to set to a small integer (>=2) if runtime is an issue.\n",
|
||||
" \"cv_step_size\": \"auto\",\n",
|
||||
" \"time_column_name\": \"WeekStarting\",\n",
|
||||
" \"drop_column_names\": \"Revenue\",\n",
|
||||
" \"forecast_horizon\": 6,\n",
|
||||
" \"time_series_id_column_names\": partition_column_names,\n",
|
||||
" \"track_child_runs\": False,\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"forecasting_parameters = ForecastingParameters(\n",
|
||||
" time_column_name=\"WeekStarting\",\n",
|
||||
" drop_column_names=\"Revenue\",\n",
|
||||
" forecast_horizon=6,\n",
|
||||
" time_series_id_column_names=partition_column_names,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"automl_settings = AutoMLConfig(\n",
|
||||
" task=\"forecasting\",\n",
|
||||
" primary_metric=\"normalized_root_mean_squared_error\",\n",
|
||||
" iteration_timeout_minutes=10,\n",
|
||||
" iterations=15,\n",
|
||||
" experiment_timeout_hours=0.25,\n",
|
||||
" label_column_name=\"Quantity\",\n",
|
||||
" n_cross_validations=\"auto\", # Feel free to set to a small integer (>=2) if runtime is an issue.\n",
|
||||
" cv_step_size=\"auto\",\n",
|
||||
" track_child_runs=False,\n",
|
||||
" forecasting_parameters=forecasting_parameters,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"mm_paramters = ManyModelsTrainParameters(\n",
|
||||
" automl_settings=automl_settings, partition_column_names=partition_column_names\n",
|
||||
@@ -498,6 +510,7 @@
|
||||
"| **node_count** | The number of compute nodes to be used for running the user script. We recommend to start with 3 and increase the node_count if the training time is taking too long. |\n",
|
||||
"| **process_count_per_node** | Process count per node, we recommend 2:1 ratio for number of cores: number of processes per node. eg. If node has 16 cores then configure 8 or less process count per node or optimal performance. |\n",
|
||||
"| **train_pipeline_parameters** | The set of configuration parameters defined in the previous section. |\n",
|
||||
"| **run_invocation_timeout** | Maximum amount of time in seconds that the ``ParallelRunStep`` class is allowed. This is optional but provides customers with greater control on exit criteria. This must be greater than ``experiment_timeout_hours`` by at least 300 seconds. |\n",
|
||||
"\n",
|
||||
"Calling this method will create a new aggregated dataset which is generated dynamically on pipeline execution."
|
||||
]
|
||||
@@ -517,7 +530,7 @@
|
||||
" compute_target=compute_target,\n",
|
||||
" node_count=2,\n",
|
||||
" process_count_per_node=8,\n",
|
||||
" run_invocation_timeout=920,\n",
|
||||
" run_invocation_timeout=1200,\n",
|
||||
" train_pipeline_parameters=mm_paramters,\n",
|
||||
")"
|
||||
]
|
||||
@@ -667,9 +680,9 @@
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **experiment** | The experiment used for inference run. |\n",
|
||||
"| **inference_data** | The data to use for inferencing. It should be the same schema as used for training.\n",
|
||||
"| **compute_target** The compute target that runs the inference pipeline.|\n",
|
||||
"| **compute_target** | The compute target that runs the inference pipeline. |\n",
|
||||
"| **node_count** | The number of compute nodes to be used for running the user script. We recommend to start with the number of cores per node (varies by compute sku). |\n",
|
||||
"| **process_count_per_node** The number of processes per node.\n",
|
||||
"| **process_count_per_node** | The number of processes per node (should be at most half of the number of cores of the compute cluster that will be used for the experiment).\n",
|
||||
"| **train_run_id** | \\[Optional] The run id of the hierarchy training, by default it is the latest successful training many model run in the experiment. |\n",
|
||||
"| **train_experiment_name** | \\[Optional] The train experiment that contains the train pipeline. This one is only needed when the train pipeline is not in the same experiement as the inference pipeline. |\n",
|
||||
"| **process_count_per_node** | \\[Optional] The number of processes per node, by default it's 4. |"
|
||||
@@ -692,6 +705,8 @@
|
||||
" target_column_name=\"Quantity\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"output_file_name = \"parallel_run_step.csv\"\n",
|
||||
"\n",
|
||||
"inference_steps = AutoMLPipelineBuilder.get_many_models_batch_inference_steps(\n",
|
||||
" experiment=experiment,\n",
|
||||
" inference_data=inference_ds_small,\n",
|
||||
@@ -703,6 +718,8 @@
|
||||
" train_run_id=training_run.id,\n",
|
||||
" train_experiment_name=training_run.experiment.name,\n",
|
||||
" inference_pipeline_parameters=mm_parameters,\n",
|
||||
" append_row_file_name=output_file_name,\n",
|
||||
" arguments=[\"--forecast_quantiles\", 0.1, 0.9],\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
@@ -737,7 +754,7 @@
|
||||
"\n",
|
||||
"The following code snippet:\n",
|
||||
"1. Downloads the contents of the output folder that is passed in the parallel run step \n",
|
||||
"2. Reads the parallel_run_step.txt file that has the predictions as pandas dataframe and \n",
|
||||
"2. Reads the output file that has the predictions as pandas dataframe and \n",
|
||||
"3. Displays the top 10 rows of the predictions"
|
||||
]
|
||||
},
|
||||
@@ -752,19 +769,9 @@
|
||||
"forecasting_results_name = \"forecasting_results\"\n",
|
||||
"forecasting_output_name = \"many_models_inference_output\"\n",
|
||||
"forecast_file = get_output_from_mm_pipeline(\n",
|
||||
" inference_run, forecasting_results_name, forecasting_output_name\n",
|
||||
" inference_run, forecasting_results_name, forecasting_output_name, output_file_name\n",
|
||||
")\n",
|
||||
"df = pd.read_csv(forecast_file, delimiter=\" \", header=None)\n",
|
||||
"df.columns = [\n",
|
||||
" \"Week Starting\",\n",
|
||||
" \"Store\",\n",
|
||||
" \"Brand\",\n",
|
||||
" \"Quantity\",\n",
|
||||
" \"Advert\",\n",
|
||||
" \"Price\",\n",
|
||||
" \"Revenue\",\n",
|
||||
" \"Predicted\",\n",
|
||||
"]\n",
|
||||
"df = pd.read_csv(forecast_file)\n",
|
||||
"print(\n",
|
||||
" \"Prediction has \", df.shape[0], \" rows. Here the first 10 rows are being displayed.\"\n",
|
||||
")\n",
|
||||
|
||||
@@ -859,8 +859,8 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%matplotlib inline\n",
|
||||
"test_pred = plt.scatter(y_test, y_pred_test, color=\"\")\n",
|
||||
"test_test = plt.scatter(y_test, y_test, color=\"g\")\n",
|
||||
"test_pred = plt.scatter(y_test, y_pred_test, c=[\"b\"])\n",
|
||||
"test_test = plt.scatter(y_test, y_test, c=[\"g\"])\n",
|
||||
"plt.legend(\n",
|
||||
" (test_pred, test_test), (\"prediction\", \"truth\"), loc=\"upper left\", fontsize=8\n",
|
||||
")\n",
|
||||
|
||||
@@ -422,8 +422,8 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%matplotlib inline\n",
|
||||
"test_pred = plt.scatter(y_test, y_pred_test, color=\"\")\n",
|
||||
"test_test = plt.scatter(y_test, y_test, color=\"g\")\n",
|
||||
"test_pred = plt.scatter(y_test, y_pred_test, c=[\"b\"])\n",
|
||||
"test_test = plt.scatter(y_test, y_test, c=[\"g\"])\n",
|
||||
"plt.legend(\n",
|
||||
" (test_pred, test_test), (\"prediction\", \"truth\"), loc=\"upper left\", fontsize=8\n",
|
||||
")\n",
|
||||
|
||||
@@ -240,9 +240,9 @@
|
||||
"# Please see [Azure ML Containers repository](https://github.com/Azure/AzureML-Containers#featured-tags)\n",
|
||||
"# for open-sourced GPU base images.\n",
|
||||
"env.docker.base_image = DEFAULT_GPU_IMAGE\n",
|
||||
"env.python.conda_dependencies = CondaDependencies.create(python_version=\"3.6.2\", \n",
|
||||
"env.python.conda_dependencies = CondaDependencies.create(python_version=\"3.6.2\", pin_sdk_version=False,\n",
|
||||
" conda_packages=['tensorflow-gpu==1.12.0','numpy'],\n",
|
||||
" pip_packages=['azureml-contrib-services', 'azureml-defaults'])\n",
|
||||
" pip_packages=['azureml-contrib-services==1.47.0', 'azureml-defaults==1.47.0'])\n",
|
||||
"\n",
|
||||
"inference_config = InferenceConfig(entry_script=\"score.py\", environment=env)\n",
|
||||
"aks_config = AksWebservice.deploy_configuration()\n",
|
||||
@@ -343,7 +343,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.6"
|
||||
"version": "3.7.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -137,7 +137,7 @@
|
||||
"myenv = Environment('my-pyspark-environment')\r\n",
|
||||
"myenv.docker.base_image = \"mcr.microsoft.com/mmlspark/release:0.15\"\r\n",
|
||||
"myenv.inferencing_stack_version = \"latest\"\r\n",
|
||||
"myenv.python.conda_dependencies = CondaDependencies.create(pip_packages=[\"azureml-core\",\"azureml-defaults\",\"azureml-telemetry\",\"azureml-train-restclients-hyperdrive\",\"azureml-train-core\"], python_version=\"3.6.2\")\r\n",
|
||||
"myenv.python.conda_dependencies = CondaDependencies.create(pip_packages=[\"azureml-core\",\"azureml-defaults\",\"azureml-telemetry\",\"azureml-train-restclients-hyperdrive\",\"azureml-train-core\"], python_version=\"3.7.0\")\r\n",
|
||||
"myenv.python.conda_dependencies.add_channel(\"conda-forge\")\r\n",
|
||||
"myenv.spark.packages = [SparkPackage(\"com.microsoft.ml.spark\", \"mmlspark_2.11\", \"0.15\"), SparkPackage(\"com.microsoft.azure\", \"azure-storage\", \"2.0.0\"), SparkPackage(\"org.apache.hadoop\", \"hadoop-azure\", \"2.7.0\")]\r\n",
|
||||
"myenv.spark.repositories = [\"https://mmlspark.azureedge.net/maven\"]\r\n"
|
||||
@@ -341,7 +341,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.2"
|
||||
"version": "3.7.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -106,7 +106,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"This notebook was created using version 1.46.0 of the Azure ML SDK\")\n",
|
||||
"print(\"This notebook was created using version 1.48.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
@@ -235,14 +235,18 @@
|
||||
"# Note: this is to pin the pandas and xgboost versions to be same as notebook.\n",
|
||||
"# In production scenario user would choose their dependencies\n",
|
||||
"import pkg_resources\n",
|
||||
"from distutils.version import LooseVersion\n",
|
||||
"available_packages = pkg_resources.working_set\n",
|
||||
"pandas_ver = None\n",
|
||||
"numpy_ver = None\n",
|
||||
"for dist in list(available_packages):\n",
|
||||
" if dist.key == 'pandas':\n",
|
||||
" pandas_ver = dist.version\n",
|
||||
" pandas_ver = dist.version\n",
|
||||
" if dist.key == 'numpy':\n",
|
||||
" numpy_ver = dist.version\n",
|
||||
" if LooseVersion(dist.version) >= LooseVersion('1.20.0'):\n",
|
||||
" numpy_ver = dist.version\n",
|
||||
" else:\n",
|
||||
" numpy_ver = '1.21.6'\n",
|
||||
"pandas_dep = 'pandas'\n",
|
||||
"numpy_dep = 'numpy'\n",
|
||||
"if pandas_ver:\n",
|
||||
@@ -252,7 +256,7 @@
|
||||
"\n",
|
||||
"# Note: we build shap at commit 690245 for Tesla K80 GPUs\n",
|
||||
"env.docker.base_dockerfile = f\"\"\"\n",
|
||||
"FROM nvidia/cuda:10.2-devel-ubuntu18.04\n",
|
||||
"FROM nvidia/cuda:10.2-devel-ubuntu20.04\n",
|
||||
"ENV PATH=\"/root/miniconda3/bin:${{PATH}}\"\n",
|
||||
"ARG PATH=\"/root/miniconda3/bin:${{PATH}}\"\n",
|
||||
"RUN apt-get update && \\\n",
|
||||
|
||||
@@ -10,7 +10,7 @@ dependencies:
|
||||
- ipython
|
||||
- matplotlib
|
||||
- ipywidgets
|
||||
- raiwidgets~=0.22.0
|
||||
- raiwidgets~=0.23.0
|
||||
- itsdangerous==2.0.1
|
||||
- markupsafe<2.1.0
|
||||
- scipy>=1.5.3
|
||||
|
||||
@@ -10,7 +10,7 @@ dependencies:
|
||||
- matplotlib
|
||||
- azureml-dataset-runtime
|
||||
- ipywidgets
|
||||
- raiwidgets~=0.22.0
|
||||
- raiwidgets~=0.23.0
|
||||
- itsdangerous==2.0.1
|
||||
- markupsafe<2.1.0
|
||||
- scipy>=1.5.3
|
||||
|
||||
@@ -9,7 +9,7 @@ dependencies:
|
||||
- ipython
|
||||
- matplotlib
|
||||
- ipywidgets
|
||||
- raiwidgets~=0.22.0
|
||||
- raiwidgets~=0.23.0
|
||||
- packaging>=20.9
|
||||
- itsdangerous==2.0.1
|
||||
- markupsafe<2.1.0
|
||||
|
||||
@@ -9,7 +9,7 @@ dependencies:
|
||||
- ipython
|
||||
- matplotlib
|
||||
- ipywidgets
|
||||
- raiwidgets~=0.22.0
|
||||
- raiwidgets~=0.23.0
|
||||
- packaging>=20.9
|
||||
- itsdangerous==2.0.1
|
||||
- markupsafe<2.1.0
|
||||
|
||||
@@ -11,7 +11,7 @@ dependencies:
|
||||
- azureml-dataset-runtime
|
||||
- azureml-core
|
||||
- ipywidgets
|
||||
- raiwidgets~=0.22.0
|
||||
- raiwidgets~=0.23.0
|
||||
- itsdangerous==2.0.1
|
||||
- markupsafe<2.1.0
|
||||
- scipy>=1.5.3
|
||||
|
||||
@@ -330,7 +330,7 @@
|
||||
"- **inputs:** List of input connections for data consumed by this step. Fetch this inside the notebook using dbutils.widgets.get(\"input\")\n",
|
||||
"- **outputs:** List of output port definitions for outputs produced by this step. Fetch this inside the notebook using dbutils.widgets.get(\"output\")\n",
|
||||
"- **existing_cluster_id:** Cluster ID of an existing Interactive cluster on the Databricks workspace. If you are providing this, do not provide any of the parameters below that are used to create a new cluster such as spark_version, node_type, etc.\n",
|
||||
"- **spark_version:** Version of spark for the databricks run cluster. default value: 4.0.x-scala2.11\n",
|
||||
"- **spark_version:** Version of spark for the databricks run cluster. You can refer to [DataBricks runtime version](https://learn.microsoft.com/azure/databricks/dev-tools/api/#--runtime-version-strings) to specify the spark version. default value: 4.0.x-scala2.11\n",
|
||||
"- **node_type:** Azure vm node types for the databricks run cluster. default value: Standard_D3_v2\n",
|
||||
"- **num_workers:** Specifies a static number of workers for the databricks run cluster\n",
|
||||
"- **min_workers:** Specifies a min number of workers to use for auto-scaling the databricks run cluster\n",
|
||||
|
||||
@@ -252,7 +252,7 @@
|
||||
"# is_directory=None)\n",
|
||||
"\n",
|
||||
"# Naming the intermediate data as processed_data1 and assigning it to the variable processed_data1.\n",
|
||||
"processed_data1 = PipelineData(\"processed_data1\",datastore=def_blob_store)\n",
|
||||
"processed_data1 = PipelineData(\"processed_data1\",datastore=def_blob_store, is_directory=True)\n",
|
||||
"print(\"PipelineData object created\")"
|
||||
]
|
||||
},
|
||||
@@ -347,7 +347,7 @@
|
||||
"source": [
|
||||
"# step5 to use the intermediate data produced by step4\n",
|
||||
"# This step also produces an output processed_data2\n",
|
||||
"processed_data2 = PipelineData(\"processed_data2\", datastore=def_blob_store)\n",
|
||||
"processed_data2 = PipelineData(\"processed_data2\", datastore=def_blob_store, is_directory=True)\n",
|
||||
"source_directory = \"data_dependency_run_extract\"\n",
|
||||
"\n",
|
||||
"extractStep = PythonScriptStep(\n",
|
||||
@@ -394,7 +394,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Now define the compare step which takes two inputs and produces an output\n",
|
||||
"processed_data3 = PipelineData(\"processed_data3\", datastore=def_blob_store)\n",
|
||||
"processed_data3 = PipelineData(\"processed_data3\", datastore=def_blob_store, is_directory=True)\n",
|
||||
"source_directory = \"data_dependency_run_compare\"\n",
|
||||
"\n",
|
||||
"compareStep = PythonScriptStep(\n",
|
||||
|
||||
@@ -235,7 +235,8 @@
|
||||
" path_on_datastore=\"titanic/Titanic.csv\")\n",
|
||||
"\n",
|
||||
"output_data = PipelineData(name=\"processed_data\",\n",
|
||||
" datastore=Datastore.get(ws, \"workspaceblobstore\"))"
|
||||
" datastore=Datastore.get(ws, \"workspaceblobstore\"),\n",
|
||||
" is_directory=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -306,7 +307,8 @@
|
||||
"from azureml.pipeline.core import PipelineParameter\n",
|
||||
"\n",
|
||||
"output_from_notebook = PipelineData(name=\"notebook_processed_data\",\n",
|
||||
" datastore=Datastore.get(ws, \"workspaceblobstore\"))\n",
|
||||
" datastore=Datastore.get(ws, \"workspaceblobstore\"),\n",
|
||||
" is_directory=True)\n",
|
||||
"\n",
|
||||
"my_pipeline_param = PipelineParameter(name=\"pipeline_param\", default_value=\"my_param\")\n",
|
||||
"\n",
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# DisableDockerDetector "Disabled to unblock PRs until the owner can fix the file. Not used in any prod deployments - only as a documentation for the customers"
|
||||
FROM rocker/tidyverse:4.0.0-ubuntu18.04
|
||||
FROM rocker/tidyverse:4.0.0-ubuntu20.04
|
||||
|
||||
# Install python
|
||||
RUN apt-get update -qq && \
|
||||
|
||||
@@ -363,7 +363,7 @@
|
||||
"}).replace(\",\", \";\")\n",
|
||||
"\n",
|
||||
"# Define output after cleansing step\n",
|
||||
"cleansed_green_data = PipelineData(\"cleansed_green_data\", datastore=default_store).as_dataset()\n",
|
||||
"cleansed_green_data = PipelineData(\"cleansed_green_data\", datastore=default_store, is_directory=True).as_dataset()\n",
|
||||
"\n",
|
||||
"print('Cleanse script is in {}.'.format(os.path.realpath(prepare_data_folder)))\n",
|
||||
"\n",
|
||||
@@ -414,7 +414,7 @@
|
||||
"}).replace(\",\", \";\")\n",
|
||||
"\n",
|
||||
"# Define output after cleansing step\n",
|
||||
"cleansed_yellow_data = PipelineData(\"cleansed_yellow_data\", datastore=default_store).as_dataset()\n",
|
||||
"cleansed_yellow_data = PipelineData(\"cleansed_yellow_data\", datastore=default_store, is_directory=True).as_dataset()\n",
|
||||
"\n",
|
||||
"print('Cleanse script is in {}.'.format(os.path.realpath(prepare_data_folder)))\n",
|
||||
"\n",
|
||||
@@ -452,7 +452,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Define output after merging step\n",
|
||||
"merged_data = PipelineData(\"merged_data\", datastore=default_store).as_dataset()\n",
|
||||
"merged_data = PipelineData(\"merged_data\", datastore=default_store, is_directory=True).as_dataset()\n",
|
||||
"\n",
|
||||
"print('Merge script is in {}.'.format(os.path.realpath(prepare_data_folder)))\n",
|
||||
"\n",
|
||||
@@ -489,7 +489,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Define output after merging step\n",
|
||||
"filtered_data = PipelineData(\"filtered_data\", datastore=default_store).as_dataset()\n",
|
||||
"filtered_data = PipelineData(\"filtered_data\", datastore=default_store, is_directory=True).as_dataset()\n",
|
||||
"\n",
|
||||
"print('Filter script is in {}.'.format(os.path.realpath(prepare_data_folder)))\n",
|
||||
"\n",
|
||||
@@ -525,7 +525,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Define output after normalize step\n",
|
||||
"normalized_data = PipelineData(\"normalized_data\", datastore=default_store).as_dataset()\n",
|
||||
"normalized_data = PipelineData(\"normalized_data\", datastore=default_store, is_directory=True).as_dataset()\n",
|
||||
"\n",
|
||||
"print('Normalize script is in {}.'.format(os.path.realpath(prepare_data_folder)))\n",
|
||||
"\n",
|
||||
@@ -566,7 +566,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Define output after transform step\n",
|
||||
"transformed_data = PipelineData(\"transformed_data\", datastore=default_store).as_dataset()\n",
|
||||
"transformed_data = PipelineData(\"transformed_data\", datastore=default_store, is_directory=True).as_dataset()\n",
|
||||
"\n",
|
||||
"print('Transform script is in {}.'.format(os.path.realpath(prepare_data_folder)))\n",
|
||||
"\n",
|
||||
@@ -604,8 +604,8 @@
|
||||
"train_model_folder = './scripts/trainmodel'\n",
|
||||
"\n",
|
||||
"# train and test splits output\n",
|
||||
"output_split_train = PipelineData(\"output_split_train\", datastore=default_store).as_dataset()\n",
|
||||
"output_split_test = PipelineData(\"output_split_test\", datastore=default_store).as_dataset()\n",
|
||||
"output_split_train = PipelineData(\"output_split_train\", datastore=default_store, is_directory=True).as_dataset()\n",
|
||||
"output_split_test = PipelineData(\"output_split_test\", datastore=default_store, is_directory=True).as_dataset()\n",
|
||||
"\n",
|
||||
"print('Data spilt script is in {}.'.format(os.path.realpath(train_model_folder)))\n",
|
||||
"\n",
|
||||
|
||||
@@ -61,8 +61,6 @@ def main():
|
||||
run.log('Epochs', np.int(epochs))
|
||||
|
||||
train_iter = iterators.SerialIterator(train, batchsize)
|
||||
test_iter = iterators.SerialIterator(test, batchsize,
|
||||
repeat=False, shuffle=False)
|
||||
|
||||
model = MyNetwork()
|
||||
|
||||
@@ -106,6 +104,8 @@ def main():
|
||||
|
||||
test_losses = []
|
||||
test_accuracies = []
|
||||
test_iter = iterators.SerialIterator(test, batchsize,
|
||||
repeat=False, shuffle=False)
|
||||
while True:
|
||||
test_batch = test_iter.next()
|
||||
image_test, target_test = concat_examples(test_batch, gpu_id)
|
||||
@@ -123,10 +123,6 @@ def main():
|
||||
test_accuracies.append(accuracy.array)
|
||||
|
||||
if test_iter.is_new_epoch:
|
||||
test_iter.epoch = 0
|
||||
test_iter.current_position = 0
|
||||
test_iter.is_new_epoch = False
|
||||
test_iter._pushed_position = None
|
||||
break
|
||||
|
||||
val_accuracy = np.mean(test_accuracies)
|
||||
|
||||
@@ -253,14 +253,13 @@
|
||||
"channels:\n",
|
||||
"- conda-forge\n",
|
||||
"dependencies:\n",
|
||||
"- python=3.6.2\n",
|
||||
"- pip=21.3.1\n",
|
||||
"- python=3.8.12\n",
|
||||
"- pip=22.3.1\n",
|
||||
"- pip:\n",
|
||||
" - azureml-defaults\n",
|
||||
" - azureml-opendatasets\n",
|
||||
" - chainer==5.1.0\n",
|
||||
" - cupy-cuda100==5.1.0\n",
|
||||
" - mpi4py==3.0.0\n",
|
||||
" - chainer\n",
|
||||
" - cupy-cuda111\n",
|
||||
" - pytest"
|
||||
]
|
||||
},
|
||||
@@ -273,10 +272,10 @@
|
||||
"from azureml.core import Environment\n",
|
||||
"from azureml.core.runconfig import DockerConfiguration\n",
|
||||
"\n",
|
||||
"chainer_env = Environment.from_conda_specification(name = 'chainer-5.1.0-gpu', file_path = './conda_dependencies.yml')\n",
|
||||
"chainer_env = Environment.from_conda_specification(name = 'chainer-gpu', file_path = './conda_dependencies.yml')\n",
|
||||
"\n",
|
||||
"# Specify a GPU base image\n",
|
||||
"chainer_env.docker.base_image = 'mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.0-cudnn7-ubuntu18.04'\n",
|
||||
"chainer_env.docker.base_image = 'mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.1-cudnn8-ubuntu20.04'\n",
|
||||
"\n",
|
||||
"docker_config = DockerConfiguration(use_docker=True)"
|
||||
]
|
||||
@@ -553,8 +552,8 @@
|
||||
"\n",
|
||||
"cd = CondaDependencies.create()\n",
|
||||
"cd.add_conda_package('numpy')\n",
|
||||
"cd.add_pip_package('chainer==5.1.0')\n",
|
||||
"cd.add_pip_package(\"azureml-defaults==1.43.0\")\n",
|
||||
"cd.add_pip_package('chainer')\n",
|
||||
"cd.add_pip_package(\"azureml-defaults\")\n",
|
||||
"cd.add_pip_package(\"azureml-opendatasets\")\n",
|
||||
"cd.save_to_file(base_directory='./', conda_file_path='myenv.yml')\n",
|
||||
"\n",
|
||||
|
||||
@@ -166,7 +166,7 @@ def download_data():
|
||||
from zipfile import ZipFile
|
||||
# download data
|
||||
data_file = './fowl_data.zip'
|
||||
download_url = 'https://azureopendatastorage.blob.core.windows.net/testpublic/temp/fowl_data.zip'
|
||||
download_url = 'https://azuremlexamples.blob.core.windows.net/datasets/fowl_data.zip'
|
||||
urllib.request.urlretrieve(download_url, filename=data_file)
|
||||
|
||||
# extract files
|
||||
|
||||
@@ -176,7 +176,7 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Download training data\n",
|
||||
"The dataset we will use (located on a public blob [here](https://azureopendatastorage.blob.core.windows.net/testpublic/temp/fowl_data.zip) as a zip file) consists of about 120 training images each for turkeys and chickens, with 100 validation images for each class. The images are a subset of the [Open Images v5 Dataset](https://storage.googleapis.com/openimages/web/index.html). We will download and extract the dataset as part of our training script `pytorch_train.py`"
|
||||
"The dataset we will use (located on a public blob [here](https://azuremlexamples.blob.core.windows.net/datasets/fowl_data.zip) as a zip file) consists of about 120 training images each for turkeys and chickens, with 100 validation images for each class. The images are a subset of the [Open Images v5 Dataset](https://storage.googleapis.com/openimages/web/index.html). We will download and extract the dataset as part of our training script `pytorch_train.py`"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -260,15 +260,14 @@
|
||||
"\n",
|
||||
"channels:\n",
|
||||
"- conda-forge\n",
|
||||
"- pytorch\n",
|
||||
"dependencies:\n",
|
||||
"- python=3.6.2\n",
|
||||
"- python=3.8.12\n",
|
||||
"- pip=21.3.1\n",
|
||||
"- pytorch::pytorch==1.8.1\n",
|
||||
"- pytorch::torchvision==0.9.1\n",
|
||||
"- pip:\n",
|
||||
" - azureml-defaults==1.43.0\n",
|
||||
" - torch==1.6.0\n",
|
||||
" - torchvision==0.7.0\n",
|
||||
" - future==0.17.1\n",
|
||||
" - pillow"
|
||||
" - azureml-defaults"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -160,7 +160,7 @@
|
||||
"source": [
|
||||
"from azureml.core import Dataset\n",
|
||||
"\n",
|
||||
"web_paths = ['https://azureopendatastorage.blob.core.windows.net/testpublic/text8.zip']\n",
|
||||
"web_paths = ['https://azuremlexamples.blob.core.windows.net/datasets/text8.zip']\n",
|
||||
"dataset = Dataset.File.from_files(path=web_paths)"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -1,354 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Distributed TensorFlow with parameter server\n",
|
||||
"In this tutorial, you will train a TensorFlow model on the [MNIST](http://yann.lecun.com/exdb/mnist/) dataset using native [distributed TensorFlow](https://www.tensorflow.org/guide/distributed_training)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Prerequisites\n",
|
||||
"* Understand the [architecture and terms](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture) introduced by Azure Machine Learning (AML)\n",
|
||||
"* If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, go through the [configuration notebook](../../../../configuration.ipynb) to:\n",
|
||||
" * install the AML SDK\n",
|
||||
" * create a workspace and its configuration file (`config.json`)\n",
|
||||
"* Review the [tutorial](../train-hyperparameter-tune-deploy-with-tensorflow/train-hyperparameter-tune-deploy-with-tensorflow.ipynb) on single-node TensorFlow training using the SDK"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Check core SDK version number\n",
|
||||
"import azureml.core\n",
|
||||
"\n",
|
||||
"print(\"SDK version:\", azureml.core.VERSION)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Diagnostics\n",
|
||||
"Opt-in diagnostics for better experience, quality, and security of future releases."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"Diagnostics"
|
||||
]
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.telemetry import set_diagnostics_collection\n",
|
||||
"\n",
|
||||
"set_diagnostics_collection(send_diagnostics=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialize workspace\n",
|
||||
"Initialize a [Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace) object from the existing workspace you created in the Prerequisites step. `Workspace.from_config()` creates a workspace object from the details stored in `config.json`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.workspace import Workspace\n",
|
||||
"\n",
|
||||
"ws = Workspace.from_config()\n",
|
||||
"print('Workspace name: ' + ws.name, \n",
|
||||
" 'Azure region: ' + ws.location, \n",
|
||||
" 'Subscription id: ' + ws.subscription_id, \n",
|
||||
" 'Resource group: ' + ws.resource_group, sep = '\\n')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create or Attach existing AmlCompute\n",
|
||||
"You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for training your model. In this tutorial, you create `AmlCompute` as your training compute resource.\n",
|
||||
"\n",
|
||||
"> Note that if you have an AzureML Data Scientist role, you will not have permission to create compute resources. Talk to your workspace or IT admin to create the compute targets described in this section, if they do not already exist.\n",
|
||||
"\n",
|
||||
"**Creation of AmlCompute takes approximately 5 minutes.** If the AmlCompute with that name is already in your workspace this code will skip the creation process.\n",
|
||||
"\n",
|
||||
"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
|
||||
"from azureml.core.compute_target import ComputeTargetException\n",
|
||||
"\n",
|
||||
"# choose a name for your cluster\n",
|
||||
"cluster_name = \"gpu-cluster\"\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n",
|
||||
" print('Found existing compute target.')\n",
|
||||
"except ComputeTargetException:\n",
|
||||
" print('Creating a new compute target...')\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6', \n",
|
||||
" max_nodes=4)\n",
|
||||
"\n",
|
||||
" # create the cluster\n",
|
||||
" compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n",
|
||||
"\n",
|
||||
" compute_target.wait_for_completion(show_output=True)\n",
|
||||
"\n",
|
||||
"# use get_status() to get a detailed status for the current cluster. \n",
|
||||
"print(compute_target.get_status().serialize())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Train model on the remote compute\n",
|
||||
"Now that we have the cluster ready to go, let's run our distributed training job."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create a project directory\n",
|
||||
"Create a directory that will contain all the necessary code from your local machine that you will need access to on the remote resource. This includes the training script, and any additional files your training script depends on."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"\n",
|
||||
"project_folder = './tf-distr-ps'\n",
|
||||
"os.makedirs(project_folder, exist_ok=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copy the training script `tf_mnist_replica.py` into this project directory."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import shutil\n",
|
||||
"\n",
|
||||
"shutil.copy('tf_mnist_replica.py', project_folder)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create an experiment\n",
|
||||
"Create an [Experiment](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#experiment) to track all the runs in your workspace for this distributed TensorFlow tutorial. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Experiment\n",
|
||||
"\n",
|
||||
"experiment_name = 'tf-distr-ps'\n",
|
||||
"experiment = Experiment(ws, name=experiment_name)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create an environment\n",
|
||||
"\n",
|
||||
"In this tutorial, we will use one of Azure ML's curated TensorFlow environments for training. [Curated environments](https://docs.microsoft.com/azure/machine-learning/how-to-use-environments#use-a-curated-environment) are available in your workspace by default. Specifically, we will use the TensorFlow 1.13 GPU curated environment."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Environment\n",
|
||||
"\n",
|
||||
"tf_env = Environment.get(ws, name='AzureML-TensorFlow-1.13-GPU')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Configure the training job\n",
|
||||
"\n",
|
||||
"Create a ScriptRunConfig object to specify the configuration details of your training job, including your training script, environment to use, and the compute target to run on.\n",
|
||||
"\n",
|
||||
"In order to execute a distributed TensorFlow run with the parameter server strategy, you must create a `TensorflowConfiguration` object and pass it to the `distributed_job_config` parameter of the ScriptRunConfig constructor. The below code configures a distributed TensorFlow run with `2` workers and `1` parameter server."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import ScriptRunConfig\n",
|
||||
"from azureml.core.runconfig import TensorflowConfiguration\n",
|
||||
"\n",
|
||||
"src = ScriptRunConfig(source_directory=project_folder,\n",
|
||||
" script='tf_mnist_replica.py',\n",
|
||||
" arguments=['--num_gpus', 1, '--train_steps', 500],\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" environment=tf_env,\n",
|
||||
" distributed_job_config=TensorflowConfiguration(worker_count=2, parameter_server_count=1))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Submit job\n",
|
||||
"Run your experiment by submitting your ScriptRunConfig object. Note that this call is asynchronous."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"run = experiment.submit(src)\n",
|
||||
"print(run)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Monitor your run\n",
|
||||
"You can monitor the progress of the run with a Jupyter widget. Like the run submission, the widget is asynchronous and provides live updates every 10-15 seconds until the job completes."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.widgets import RunDetails\n",
|
||||
"\n",
|
||||
"RunDetails(run).show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Alternatively, you can block until the script has completed training before running more code."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"run.wait_for_completion(show_output=True) # this provides a verbose log"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"authors": [
|
||||
{
|
||||
"name": "minxia"
|
||||
}
|
||||
],
|
||||
"category": "training",
|
||||
"compute": [
|
||||
"AML Compute"
|
||||
],
|
||||
"datasets": [
|
||||
"MNIST"
|
||||
],
|
||||
"deployment": [
|
||||
"None"
|
||||
],
|
||||
"exclude_from_index": false,
|
||||
"framework": [
|
||||
"TensorFlow"
|
||||
],
|
||||
"friendly_name": "Distributed TensorFlow with parameter server",
|
||||
"index_order": 1,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.8 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python38-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.9"
|
||||
},
|
||||
"tags": [
|
||||
"None"
|
||||
],
|
||||
"task": "Use the TensorFlow estimator to train a model using distributed training"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -1,5 +0,0 @@
|
||||
name: distributed-tensorflow-with-parameter-server
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-widgets
|
||||
@@ -1,271 +0,0 @@
|
||||
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
|
||||
# Licensed under the Apache License, Version 2.0
|
||||
# Script adapted from:
|
||||
# https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/dist_test/python/mnist_replica.py
|
||||
# ==============================================================================
|
||||
"""Distributed MNIST training and validation, with model replicas.
|
||||
A simple softmax model with one hidden layer is defined. The parameters
|
||||
(weights and biases) are located on one parameter server (ps), while the ops
|
||||
are executed on two worker nodes by default. The TF sessions also run on the
|
||||
worker node.
|
||||
Multiple invocations of this script can be done in parallel, with different
|
||||
values for --task_index. There should be exactly one invocation with
|
||||
--task_index, which will create a master session that carries out variable
|
||||
initialization. The other, non-master, sessions will wait for the master
|
||||
session to finish the initialization before proceeding to the training stage.
|
||||
The coordination between the multiple worker invocations occurs due to
|
||||
the definition of the parameters on the same ps devices. The parameter updates
|
||||
from one worker is visible to all other workers. As such, the workers can
|
||||
perform forward computation and gradient calculation in parallel, which
|
||||
should lead to increased training speed for the simple model.
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import os
|
||||
import math
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
import json
|
||||
|
||||
import tensorflow as tf
|
||||
from tensorflow.examples.tutorials.mnist import input_data
|
||||
from azureml.core.run import Run
|
||||
|
||||
flags = tf.app.flags
|
||||
flags.DEFINE_string("data_dir", "/tmp/mnist-data",
|
||||
"Directory for storing mnist data")
|
||||
flags.DEFINE_boolean("download_only", False,
|
||||
"Only perform downloading of data; Do not proceed to "
|
||||
"session preparation, model definition or training")
|
||||
flags.DEFINE_integer("num_gpus", 0, "Total number of gpus for each machine."
|
||||
"If you don't use GPU, please set it to '0'")
|
||||
flags.DEFINE_integer("replicas_to_aggregate", None,
|
||||
"Number of replicas to aggregate before parameter update "
|
||||
"is applied (For sync_replicas mode only; default: "
|
||||
"num_workers)")
|
||||
flags.DEFINE_integer("hidden_units", 100,
|
||||
"Number of units in the hidden layer of the NN")
|
||||
flags.DEFINE_integer("train_steps", 200,
|
||||
"Number of (global) training steps to perform")
|
||||
flags.DEFINE_integer("batch_size", 100, "Training batch size")
|
||||
flags.DEFINE_float("learning_rate", 0.01, "Learning rate")
|
||||
flags.DEFINE_boolean(
|
||||
"sync_replicas", False,
|
||||
"Use the sync_replicas (synchronized replicas) mode, "
|
||||
"wherein the parameter updates from workers are aggregated "
|
||||
"before applied to avoid stale gradients")
|
||||
flags.DEFINE_boolean(
|
||||
"existing_servers", False, "Whether servers already exists. If True, "
|
||||
"will use the worker hosts via their GRPC URLs (one client process "
|
||||
"per worker host). Otherwise, will create an in-process TensorFlow "
|
||||
"server.")
|
||||
|
||||
FLAGS = flags.FLAGS
|
||||
|
||||
IMAGE_PIXELS = 28
|
||||
|
||||
|
||||
def main(unused_argv):
|
||||
data_root = os.path.join("outputs", "MNIST")
|
||||
mnist = None
|
||||
tf_config = os.environ.get("TF_CONFIG")
|
||||
if not tf_config or tf_config == "":
|
||||
raise ValueError("TF_CONFIG not found.")
|
||||
tf_config_json = json.loads(tf_config)
|
||||
cluster = tf_config_json.get('cluster')
|
||||
job_name = tf_config_json.get('task', {}).get('type')
|
||||
task_index = tf_config_json.get('task', {}).get('index')
|
||||
job_name = "worker" if job_name == "master" else job_name
|
||||
sentinel_path = os.path.join(data_root, "complete.txt")
|
||||
if job_name == "worker" and task_index == 0:
|
||||
mnist = input_data.read_data_sets(data_root, one_hot=True)
|
||||
with open(sentinel_path, 'w+') as f:
|
||||
f.write("download complete")
|
||||
else:
|
||||
while not os.path.exists(sentinel_path):
|
||||
time.sleep(0.01)
|
||||
mnist = input_data.read_data_sets(data_root, one_hot=True)
|
||||
|
||||
if FLAGS.download_only:
|
||||
sys.exit(0)
|
||||
|
||||
print("job name = %s" % job_name)
|
||||
print("task index = %d" % task_index)
|
||||
print("number of GPUs = %d" % FLAGS.num_gpus)
|
||||
|
||||
# Construct the cluster and start the server
|
||||
cluster_spec = tf.train.ClusterSpec(cluster)
|
||||
|
||||
# Get the number of workers.
|
||||
num_workers = len(cluster_spec.task_indices("worker"))
|
||||
|
||||
if not FLAGS.existing_servers:
|
||||
# Not using existing servers. Create an in-process server.
|
||||
server = tf.train.Server(
|
||||
cluster_spec, job_name=job_name, task_index=task_index)
|
||||
if job_name == "ps":
|
||||
server.join()
|
||||
|
||||
is_chief = (task_index == 0)
|
||||
if FLAGS.num_gpus > 0:
|
||||
# Avoid gpu allocation conflict: now allocate task_num -> #gpu
|
||||
# for each worker in the corresponding machine
|
||||
gpu = (task_index % FLAGS.num_gpus)
|
||||
worker_device = "/job:worker/task:%d/gpu:%d" % (task_index, gpu)
|
||||
elif FLAGS.num_gpus == 0:
|
||||
# Just allocate the CPU to worker server
|
||||
cpu = 0
|
||||
worker_device = "/job:worker/task:%d/cpu:%d" % (task_index, cpu)
|
||||
# The device setter will automatically place Variables ops on separate
|
||||
# parameter servers (ps). The non-Variable ops will be placed on the workers.
|
||||
# The ps use CPU and workers use corresponding GPU
|
||||
with tf.device(
|
||||
tf.train.replica_device_setter(
|
||||
worker_device=worker_device,
|
||||
ps_device="/job:ps/cpu:0",
|
||||
cluster=cluster)):
|
||||
global_step = tf.Variable(0, name="global_step", trainable=False)
|
||||
|
||||
# Variables of the hidden layer
|
||||
hid_w = tf.Variable(
|
||||
tf.truncated_normal(
|
||||
[IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units],
|
||||
stddev=1.0 / IMAGE_PIXELS),
|
||||
name="hid_w")
|
||||
hid_b = tf.Variable(tf.zeros([FLAGS.hidden_units]), name="hid_b")
|
||||
|
||||
# Variables of the softmax layer
|
||||
sm_w = tf.Variable(
|
||||
tf.truncated_normal(
|
||||
[FLAGS.hidden_units, 10],
|
||||
stddev=1.0 / math.sqrt(FLAGS.hidden_units)),
|
||||
name="sm_w")
|
||||
sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
|
||||
|
||||
# Ops: located on the worker specified with task_index
|
||||
x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS])
|
||||
y_ = tf.placeholder(tf.float32, [None, 10])
|
||||
|
||||
hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
|
||||
hid = tf.nn.relu(hid_lin)
|
||||
|
||||
y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))
|
||||
cross_entropy = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
|
||||
|
||||
opt = tf.train.AdamOptimizer(FLAGS.learning_rate)
|
||||
|
||||
if FLAGS.sync_replicas:
|
||||
if FLAGS.replicas_to_aggregate is None:
|
||||
replicas_to_aggregate = num_workers
|
||||
else:
|
||||
replicas_to_aggregate = FLAGS.replicas_to_aggregate
|
||||
|
||||
opt = tf.train.SyncReplicasOptimizer(
|
||||
opt,
|
||||
replicas_to_aggregate=replicas_to_aggregate,
|
||||
total_num_replicas=num_workers,
|
||||
name="mnist_sync_replicas")
|
||||
|
||||
train_step = opt.minimize(cross_entropy, global_step=global_step)
|
||||
|
||||
if FLAGS.sync_replicas:
|
||||
local_init_op = opt.local_step_init_op
|
||||
if is_chief:
|
||||
local_init_op = opt.chief_init_op
|
||||
|
||||
ready_for_local_init_op = opt.ready_for_local_init_op
|
||||
|
||||
# Initial token and chief queue runners required by the sync_replicas mode
|
||||
chief_queue_runner = opt.get_chief_queue_runner()
|
||||
sync_init_op = opt.get_init_tokens_op()
|
||||
|
||||
init_op = tf.global_variables_initializer()
|
||||
train_dir = tempfile.mkdtemp()
|
||||
|
||||
if FLAGS.sync_replicas:
|
||||
sv = tf.train.Supervisor(
|
||||
is_chief=is_chief,
|
||||
logdir=train_dir,
|
||||
init_op=init_op,
|
||||
local_init_op=local_init_op,
|
||||
ready_for_local_init_op=ready_for_local_init_op,
|
||||
recovery_wait_secs=1,
|
||||
global_step=global_step)
|
||||
else:
|
||||
sv = tf.train.Supervisor(
|
||||
is_chief=is_chief,
|
||||
logdir=train_dir,
|
||||
init_op=init_op,
|
||||
recovery_wait_secs=1,
|
||||
global_step=global_step)
|
||||
|
||||
sess_config = tf.ConfigProto(
|
||||
allow_soft_placement=True,
|
||||
log_device_placement=False,
|
||||
device_filters=["/job:ps",
|
||||
"/job:worker/task:%d" % task_index])
|
||||
|
||||
# The chief worker (task_index==0) session will prepare the session,
|
||||
# while the remaining workers will wait for the preparation to complete.
|
||||
if is_chief:
|
||||
print("Worker %d: Initializing session..." % task_index)
|
||||
else:
|
||||
print("Worker %d: Waiting for session to be initialized..." %
|
||||
task_index)
|
||||
|
||||
if FLAGS.existing_servers:
|
||||
server_grpc_url = "grpc://" + task_index
|
||||
print("Using existing server at: %s" % server_grpc_url)
|
||||
|
||||
sess = sv.prepare_or_wait_for_session(server_grpc_url, config=sess_config)
|
||||
else:
|
||||
sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)
|
||||
|
||||
print("Worker %d: Session initialization complete." % task_index)
|
||||
|
||||
if FLAGS.sync_replicas and is_chief:
|
||||
# Chief worker will start the chief queue runner and call the init op.
|
||||
sess.run(sync_init_op)
|
||||
sv.start_queue_runners(sess, [chief_queue_runner])
|
||||
|
||||
# Perform training
|
||||
time_begin = time.time()
|
||||
print("Training begins @ %f" % time_begin)
|
||||
|
||||
local_step = 0
|
||||
while True:
|
||||
# Training feed
|
||||
batch_xs, batch_ys = mnist.train.next_batch(FLAGS.batch_size)
|
||||
train_feed = {x: batch_xs, y_: batch_ys}
|
||||
|
||||
_, step = sess.run([train_step, global_step], feed_dict=train_feed)
|
||||
local_step += 1
|
||||
|
||||
now = time.time()
|
||||
print("%f: Worker %d: training step %d done (global step: %d)" %
|
||||
(now, task_index, local_step, step))
|
||||
|
||||
if step >= FLAGS.train_steps:
|
||||
break
|
||||
|
||||
time_end = time.time()
|
||||
print("Training ends @ %f" % time_end)
|
||||
training_time = time_end - time_begin
|
||||
print("Training elapsed time: %f s" % training_time)
|
||||
|
||||
# Validation feed
|
||||
val_feed = {x: mnist.validation.images, y_: mnist.validation.labels}
|
||||
val_xent = sess.run(cross_entropy, feed_dict=val_feed)
|
||||
print("After %d training step(s), validation cross entropy = %g" %
|
||||
(FLAGS.train_steps, val_xent))
|
||||
if job_name == "worker" and task_index == 0:
|
||||
run = Run.get_context()
|
||||
run.log("CrossEntropy", val_xent)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
tf.app.run()
|
||||
@@ -1,4 +1,4 @@
|
||||
FROM mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.0.3-cudnn8-ubuntu18.04
|
||||
FROM mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.1-cudnn8-ubuntu20.04
|
||||
|
||||
USER root
|
||||
RUN conda install -c anaconda python=3.7
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
FROM mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04
|
||||
FROM mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04
|
||||
|
||||
USER root
|
||||
RUN conda install -c anaconda python=3.7
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
FROM mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20200423.v1
|
||||
FROM mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
python-opengl \
|
||||
@@ -16,7 +16,7 @@ RUN pip install ray-on-aml==0.2.1 & \
|
||||
azureml-contrib-reinforcementlearning \
|
||||
gputil \
|
||||
scipy \
|
||||
pyglet \
|
||||
pyglet==1.5.27 \
|
||||
cloudpickle==1.3.0 \
|
||||
tensorboardX \
|
||||
tensorflow==1.14.0 \
|
||||
|
||||
@@ -8,7 +8,7 @@ dependencies:
|
||||
- matplotlib
|
||||
- azureml-dataset-runtime
|
||||
- ipywidgets
|
||||
- raiwidgets~=0.22.0
|
||||
- raiwidgets~=0.23.0
|
||||
- liac-arff
|
||||
- packaging>=20.9
|
||||
- itsdangerous==2.0.1
|
||||
|
||||
@@ -101,7 +101,7 @@
|
||||
"\n",
|
||||
"# Check core SDK version number\n",
|
||||
"\n",
|
||||
"print(\"This notebook was created using SDK version 1.46.0, you are currently running version\", azureml.core.VERSION)"
|
||||
"print(\"This notebook was created using SDK version 1.48.0, you are currently running version\", azureml.core.VERSION)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -139,7 +139,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open('./train.py', 'r') as f:\n",
|
||||
"with open('./scripts/train.py', 'r') as f:\n",
|
||||
" print(f.read())"
|
||||
]
|
||||
},
|
||||
@@ -149,7 +149,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open('./mylib.py', 'r') as f:\n",
|
||||
"with open('./scripts/mylib.py', 'r') as f:\n",
|
||||
" print(f.read())"
|
||||
]
|
||||
},
|
||||
@@ -203,7 +203,7 @@
|
||||
"source": [
|
||||
"from azureml.core import ScriptRunConfig\n",
|
||||
"\n",
|
||||
"src = ScriptRunConfig(source_directory='./', script='train.py', environment=user_managed_env)"
|
||||
"src = ScriptRunConfig(source_directory='./scripts', script='train.py', environment=user_managed_env)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -176,7 +176,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data_urls = ['https://data4mldemo6150520719.blob.core.windows.net/demo/mnist-fashion']\n",
|
||||
"data_urls = ['https://azuremldownloads.blob.core.windows.net/demo/mnist-fashion']\n",
|
||||
"fashion_ds = Dataset.File.from_files(data_urls)\n",
|
||||
"\n",
|
||||
"# list the files referenced by fashion_ds\n",
|
||||
|
||||
1
index.md
1
index.md
@@ -67,7 +67,6 @@ Machine Learning notebook samples and encourage efficient retrieval of topics an
|
||||
| [Training with hyperparameter tuning using PyTorch](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/pytorch/train-hyperparameter-tune-deploy-with-pytorch/train-hyperparameter-tune-deploy-with-pytorch.ipynb) | Train an image classification model using transfer learning with the PyTorch estimator | ImageNet | AML Compute | Azure Container Instance | PyTorch | None |
|
||||
| [Training and hyperparameter tuning with Scikit-learn](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/scikit-learn/train-hyperparameter-tune-deploy-with-sklearn/train-hyperparameter-tune-deploy-with-sklearn.ipynb) | Train a support vector machine (SVM) to perform classification | Iris | AML Compute | None | Scikit-learn | None |
|
||||
| [Distributed training using TensorFlow with Horovod](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/tensorflow/distributed-tensorflow-with-horovod/distributed-tensorflow-with-horovod.ipynb) | Use the TensorFlow estimator to train a word2vec model | None | AML Compute | None | TensorFlow | None |
|
||||
| [Distributed TensorFlow with parameter server](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/tensorflow/distributed-tensorflow-with-parameter-server/distributed-tensorflow-with-parameter-server.ipynb) | Use the TensorFlow estimator to train a model using distributed training | MNIST | AML Compute | None | TensorFlow | None |
|
||||
| [Hyperparameter tuning and warm start using the TensorFlow estimator](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/tensorflow/hyperparameter-tune-and-warm-start-with-tensorflow/hyperparameter-tune-and-warm-start-with-tensorflow.ipynb) | Train a deep neural network | MNIST | AML Compute | Azure Container Instance | TensorFlow | None |
|
||||
| [Training and hyperparameter tuning using the TensorFlow estimator](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/tensorflow/train-hyperparameter-tune-deploy-with-tensorflow/train-hyperparameter-tune-deploy-with-tensorflow.ipynb) | Train a deep neural network | MNIST | AML Compute | Azure Container Instance | TensorFlow | None |
|
||||
| [Resuming a model](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/tensorflow/train-tensorflow-resume-training/train-tensorflow-resume-training.ipynb) | Resume a model in TensorFlow from a previously submitted run | MNIST | AML Compute | None | TensorFlow | None |
|
||||
|
||||
@@ -102,7 +102,7 @@
|
||||
"source": [
|
||||
"import azureml.core\n",
|
||||
"\n",
|
||||
"print(\"This notebook was created using version 1.46.0 of the Azure ML SDK\")\n",
|
||||
"print(\"This notebook was created using version 1.48.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -151,7 +151,7 @@
|
||||
"# use a curated environment that has already been built for you\n",
|
||||
"\n",
|
||||
"env = Environment.get(workspace=ws, \n",
|
||||
" name=\"AzureML-sklearn-0.24-ubuntu18.04-py37-cpu\")"
|
||||
" name=\"AzureML-sklearn-1.0-ubuntu20.04-py38-cpu\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user