Mirror of https://github.com/Azure/MachineLearningNotebooks.git (synced 2025-12-20 01:27:06 -05:00)

Compare commits: azureml-sd ... azureml-sd (8 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | b0aa91acce |  |
|  | 5928ba83bb |  |
|  | ffa3a43979 |  |
|  | 7ce79a43f1 |  |
|  | edcc50ab0c |  |
|  | 4a391522d0 |  |
|  | 1903f78285 |  |
|  | a4dfcc4693 |  |
@@ -103,7 +103,7 @@
 "source": [
 "import azureml.core\n",
 "\n",
-"print(\"This notebook was created using version 1.40.0 of the Azure ML SDK\")\n",
+"print(\"This notebook was created using version 1.41.0 of the Azure ML SDK\")\n",
 "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
 ]
 },
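The version bump above recurs across the notebooks in this compare. A hedged sketch (not part of the diff) of making the check programmatic instead of a hard-coded string, assuming the `packaging` library is available:

```python
# Sketch only: compare the running SDK against the release the notebooks
# target. EXPECTED mirrors the string printed in the cell above.
import azureml.core
from packaging.version import Version

EXPECTED = "1.41.0"
if Version(azureml.core.VERSION) < Version(EXPECTED):
    print(f"SDK {azureml.core.VERSION} predates {EXPECTED}; "
          "consider: pip install --upgrade azureml-sdk")
```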
@@ -13,18 +13,19 @@ dependencies:
   - pytorch::pytorch=1.4.0
   - conda-forge::fbprophet==0.7.1
   - cudatoolkit=10.1.243
   - tqdm==4.63.1
   - scipy==1.5.2
   - notebook
-  - pywin32==225
+  - pywin32==227
   - PySocks==1.7.1
   - Pygments==2.11.2
   - conda-forge::pyqt==5.12.3

   - pip:
     # Required packages for AzureML execution, history, and data preparation.
-    - azureml-widgets~=1.40.0
+    - azureml-widgets~=1.41.0
     - pytorch-transformers==1.0.0
     - spacy==2.2.4
     - pystan==2.19.1.1
     - https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz
-    - -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.40.0/validated_win32_requirements.txt [--no-deps]
+    - -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.41.0/validated_win32_requirements.txt [--no-deps]
     - arch==4.14
@@ -24,10 +24,10 @@ dependencies:

   - pip:
     # Required packages for AzureML execution, history, and data preparation.
-    - azureml-widgets~=1.40.0
+    - azureml-widgets~=1.41.0
     - pytorch-transformers==1.0.0
     - spacy==2.2.4
     - pystan==2.19.1.1
     - https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz
-    - -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.40.0/validated_linux_requirements.txt [--no-deps]
+    - -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.41.0/validated_linux_requirements.txt [--no-deps]
     - arch==4.14
@@ -25,10 +25,10 @@ dependencies:

   - pip:
     # Required packages for AzureML execution, history, and data preparation.
-    - azureml-widgets~=1.40.0
+    - azureml-widgets~=1.41.0
     - pytorch-transformers==1.0.0
     - spacy==2.2.4
     - pystan==2.19.1.1
     - https://aka.ms/automl-resources/packages/en_core_web_sm-2.1.0.tar.gz
-    - -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.40.0/validated_darwin_requirements.txt [--no-deps]
+    - -r https://automlsdkdataresources.blob.core.windows.net/validated-requirements/1.41.0/validated_darwin_requirements.txt [--no-deps]
     - arch==4.14
@@ -134,6 +134,7 @@
 "output[\"Resource Group\"] = ws.resource_group\n",
 "output[\"Location\"] = ws.location\n",
 "output[\"Experiment Name\"] = experiment.name\n",
 "output[\"SDK Version\"] = azureml.core.VERSION\n",
+"pd.set_option(\"display.max_colwidth\", None)\n",
 "outputDf = pd.DataFrame(data=output, index=[\"\"])\n",
 "outputDf.T"
@@ -90,6 +90,7 @@
 "output[\"Resource Group\"] = ws.resource_group\n",
 "output[\"Location\"] = ws.location\n",
 "output[\"Experiment Name\"] = experiment.name\n",
 "output[\"SDK Version\"] = azureml.core.VERSION\n",
+"pd.set_option(\"display.max_colwidth\", None)\n",
 "outputDf = pd.DataFrame(data=output, index=[\"\"])\n",
 "outputDf.T"
@@ -101,6 +101,7 @@
 "output[\"Resource Group\"] = ws.resource_group\n",
 "output[\"Location\"] = ws.location\n",
 "output[\"Experiment Name\"] = experiment.name\n",
 "output[\"SDK Version\"] = azureml.core.VERSION\n",
+"pd.set_option(\"display.max_colwidth\", None)\n",
 "outputDf = pd.DataFrame(data=output, index=[\"\"])\n",
 "outputDf.T"
@@ -102,6 +102,7 @@
 "output[\"Resource Group\"] = ws.resource_group\n",
 "output[\"Location\"] = ws.location\n",
 "output[\"Run History Name\"] = experiment_name\n",
 "output[\"SDK Version\"] = azureml.core.VERSION\n",
+"pd.set_option(\"display.max_colwidth\", None)\n",
 "outputDf = pd.DataFrame(data=output, index=[\"\"])\n",
 "outputDf.T"
@@ -8,9 +8,12 @@ dependencies:
   - urllib3==1.26.7
   - PyJWT < 2.0.0
   - numpy==1.18.5
+  - pywin32==227

   - pip:
     # Required packages for AzureML execution, history, and data preparation.
+    - azure-core==1.21.1
+    - azure-identity==1.7.0
     - azureml-defaults
     - azureml-sdk
     - azureml-widgets
@@ -14,6 +14,8 @@ dependencies:

   - pip:
     # Required packages for AzureML execution, history, and data preparation.
+    - azure-core==1.21.1
+    - azure-identity==1.7.0
     - azureml-defaults
     - azureml-sdk
     - azureml-widgets
@@ -92,7 +92,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"print(\"This notebook was created using version 1.40.0 of the Azure ML SDK\")\n",
+"print(\"This notebook was created using version 1.41.0 of the Azure ML SDK\")\n",
 "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
 ]
 },
@@ -75,7 +75,6 @@
 "from azureml.core.experiment import Experiment\n",
 "from azureml.core.workspace import Workspace\n",
 "from azureml.core.dataset import Dataset\n",
-"from azureml.data.dataset_factory import TabularDatasetFactory\n",
 "from azureml.train.automl import AutoMLConfig"
 ]
 },
@@ -92,7 +91,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"print(\"This notebook was created using version 1.40.0 of the Azure ML SDK\")\n",
+"print(\"This notebook was created using version 1.41.0 of the Azure ML SDK\")\n",
 "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
 ]
 },
@@ -197,10 +196,10 @@
 "source": [
 "ds = ws.get_default_datastore()\n",
 "\n",
-"train_data = TabularDatasetFactory.register_pandas_dataframe(\n",
+"train_data = Dataset.Tabular.register_pandas_dataframe(\n",
 "    train_data.to_pandas_dataframe(), target=(ds, \"machineTrainData\"), name=\"train_data\")\n",
 "\n",
-"test_data = TabularDatasetFactory.register_pandas_dataframe(\n",
+"test_data = Dataset.Tabular.register_pandas_dataframe(\n",
 "    test_data.to_pandas_dataframe(), target=(ds, \"machineTestData\"), name=\"test_data\")"
 ]
 },
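The hunk above swaps the `TabularDatasetFactory` spelling for the equivalent `Dataset.Tabular` entry point. A minimal sketch of the call, assuming a workspace config file and a toy dataframe (neither is part of the diff):

```python
# Uploads a pandas DataFrame to the default datastore and registers it as a
# TabularDataset in one call; the toy data is an assumption for illustration.
import pandas as pd
from azureml.core import Dataset, Workspace

ws = Workspace.from_config()  # assumes config.json alongside the script
datastore = ws.get_default_datastore()

df = pd.DataFrame({"age": [5, 7], "failures": [0, 1]})  # toy data
train_data = Dataset.Tabular.register_pandas_dataframe(
    df, target=(datastore, "machineTrainData"), name="train_data")
```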
@@ -328,7 +327,8 @@
 "metadata": {},
 "source": [
 "#### Show hyperparameters\n",
-"Show the model pipeline used for the best run with its hyperparameters."
+"Show the model pipeline used for the best run with its hyperparameters.\n",
+"For ensemble pipelines it shows the iterations and algorithms that are ensembled."
 ]
 },
 {
@@ -337,8 +337,19 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"run_properties = json.loads(best_run.get_details()['properties']['pipeline_script'])\n",
-"print(json.dumps(run_properties, indent = 1)) "
+"run_properties = best_run.get_details()['properties']\n",
+"pipeline_script = json.loads(run_properties['pipeline_script'])\n",
+"print(json.dumps(pipeline_script, indent = 1)) \n",
+"\n",
+"if 'ensembled_iterations' in run_properties:\n",
+"    print(\"\")\n",
+"    print(\"Ensembled Iterations\")\n",
+"    print(run_properties['ensembled_iterations'])\n",
+"    \n",
+"if 'ensembled_algorithms' in run_properties:\n",
+"    print(\"\")\n",
+"    print(\"Ensembled Algorithms\")\n",
+"    print(run_properties['ensembled_algorithms'])"
 ]
 },
 {
@@ -5,6 +5,7 @@ import json
 import os
 import re

+import numpy as np
 import pandas as pd

 from matplotlib import pyplot as plt
@@ -121,7 +122,7 @@ def calculate_scores_and_build_plots(
     input_dir: str, output_dir: str, automl_settings: Dict[str, Any]
 ):
     os.makedirs(output_dir, exist_ok=True)
-    grains = automl_settings.get(constants.TimeSeries.GRAIN_COLUMN_NAMES)
+    grains = automl_settings.get(constants.TimeSeries.TIME_SERIES_ID_COLUMN_NAMES)
     time_column_name = automl_settings.get(constants.TimeSeries.TIME_COLUMN_NAME)
     if grains is None:
         grains = []
@@ -146,6 +147,9 @@ def calculate_scores_and_build_plots(
         _draw_one_plot(one_forecast, time_column_name, grains, pdf)
     pdf.close()
     forecast_df.to_csv(os.path.join(output_dir, FORECASTS_FILE), index=False)
+    # Remove np.NaN and np.inf from the prediction and actuals data.
+    forecast_df.replace([np.inf, -np.inf], np.nan, inplace=True)
+    forecast_df.dropna(subset=[ACTUALS, PREDICTIONS], inplace=True)
     metrics = compute_all_metrics(forecast_df, grains + [BACKTEST_ITER])
     metrics.to_csv(os.path.join(output_dir, SCORES_FILE), index=False)
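The three added lines guard the metric computation against non-finite values. A standalone illustration with made-up column names:

```python
# +/-inf become NaN, then any row with NaN in the actuals or predictions
# columns is dropped; only fully finite rows reach the metric computation.
import numpy as np
import pandas as pd

forecast_df = pd.DataFrame({
    "actual": [1.0, np.inf, 3.0, np.nan],
    "predicted": [1.1, 2.0, -np.inf, 4.0],
})
forecast_df.replace([np.inf, -np.inf], np.nan, inplace=True)
forecast_df.dropna(subset=["actual", "predicted"], inplace=True)
print(forecast_df)  # only the first row survives
```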
@@ -86,6 +86,7 @@
 "output[\"Resource Group\"] = ws.resource_group\n",
 "output[\"Location\"] = ws.location\n",
 "output[\"Default datastore name\"] = dstore.name\n",
 "output[\"SDK Version\"] = azureml.core.VERSION\n",
+"pd.set_option(\"display.max_colwidth\", None)\n",
 "outputDf = pd.DataFrame(data=output, index=[\"\"])\n",
 "outputDf.T"
@@ -322,10 +323,10 @@
 "| **iterations** | Number of models to train. This is optional but provides customers with greater control on exit criteria. |\n",
 "| **experiment_timeout_hours** | Maximum amount of time in hours that the experiment can take before it terminates. This is optional but provides customers with greater control on exit criteria. |\n",
 "| **label_column_name** | The name of the label column. |\n",
-"| **max_horizon** | The forecast horizon is how many periods forward you would like to forecast. This integer horizon is in units of the timeseries frequency (e.g. daily, weekly). Periods are inferred from your data. |\n",
+"| **forecast_horizon** | The forecast horizon is how many periods forward you would like to forecast. This integer horizon is in units of the timeseries frequency (e.g. daily, weekly). Periods are inferred from your data. |\n",
 "| **n_cross_validations** | Number of cross validation splits. Rolling Origin Validation is used to split time-series in a temporally consistent way. |\n",
 "| **time_column_name** | The name of your time column. |\n",
-"| **grain_column_names** | The column names used to uniquely identify timeseries in data that has multiple rows with the same timestamp. |\n",
+"| **time_series_id_column_names** | The column names used to uniquely identify timeseries in data that has multiple rows with the same timestamp. |\n",
 "| **track_child_runs** | Flag to disable tracking of child runs. Only best run is tracked if the flag is set to False (this includes the model and metrics of the run). |\n",
 "| **partition_column_names** | The names of columns used to group your models. For timeseries, the groups must not split up individual time-series. That is, each group must contain one or more whole time-series. |"
 ]
@@ -354,8 +355,8 @@
 "    \"label_column_name\": TARGET_COLNAME,\n",
 "    \"n_cross_validations\": 3,\n",
 "    \"time_column_name\": TIME_COLNAME,\n",
-"    \"max_horizon\": 6,\n",
-"    \"grain_column_names\": partition_column_names,\n",
+"    \"forecast_horizon\": 6,\n",
+"    \"time_series_id_column_names\": partition_column_names,\n",
 "    \"track_child_runs\": False,\n",
 "}\n",
 "\n",
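The renames are mechanical; a sketch of the resulting settings dictionary, with placeholder column names standing in for the notebook's constants:

```python
# Only the two renamed keys come from the diff; every value here is a placeholder.
automl_settings = {
    "label_column_name": "Quantity",
    "n_cross_validations": 3,
    "time_column_name": "WeekStarting",
    "forecast_horizon": 6,                              # was "max_horizon"
    "time_series_id_column_names": ["Store", "Brand"],  # was "grain_column_names"
    "track_child_runs": False,
}
```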
@@ -5,6 +5,7 @@ import json
 import os
 import re

+import numpy as np
 import pandas as pd

 from matplotlib import pyplot as plt
@@ -146,6 +147,9 @@ def calculate_scores_and_build_plots(
         _draw_one_plot(one_forecast, time_column_name, grains, pdf)
     pdf.close()
     forecast_df.to_csv(os.path.join(output_dir, FORECASTS_FILE), index=False)
+    # Remove np.NaN and np.inf from the prediction and actuals data.
+    forecast_df.replace([np.inf, -np.inf], np.nan, inplace=True)
+    forecast_df.dropna(subset=[ACTUALS, PREDICTIONS], inplace=True)
     metrics = compute_all_metrics(forecast_df, grains + [BACKTEST_ITER])
     metrics.to_csv(os.path.join(output_dir, SCORES_FILE), index=False)
@@ -100,6 +100,7 @@
 "output[\"SKU\"] = ws.sku\n",
 "output[\"Resource Group\"] = ws.resource_group\n",
 "output[\"Location\"] = ws.location\n",
 "output[\"SDK Version\"] = azureml.core.VERSION\n",
+"pd.set_option(\"display.max_colwidth\", None)\n",
 "outputDf = pd.DataFrame(data=output, index=[\"\"])\n",
 "outputDf.T"
@@ -119,6 +119,7 @@
 "output[\"Resource Group\"] = ws.resource_group\n",
 "output[\"Location\"] = ws.location\n",
 "output[\"Run History Name\"] = experiment_name\n",
 "output[\"SDK Version\"] = azureml.core.VERSION\n",
+"pd.set_option(\"display.max_colwidth\", None)\n",
 "outputDf = pd.DataFrame(data=output, index=[\"\"])\n",
 "outputDf.T"
@@ -132,6 +132,7 @@
 "output[\"Resource Group\"] = ws.resource_group\n",
 "output[\"Location\"] = ws.location\n",
 "output[\"Run History Name\"] = experiment_name\n",
 "output[\"SDK Version\"] = azureml.core.VERSION\n",
+"pd.set_option(\"display.max_colwidth\", None)\n",
 "outputDf = pd.DataFrame(data=output, index=[\"\"])\n",
 "outputDf.T"
@@ -121,6 +121,7 @@
 "output[\"Resource Group\"] = ws.resource_group\n",
 "output[\"Location\"] = ws.location\n",
 "output[\"Run History Name\"] = experiment_name\n",
 "output[\"SDK Version\"] = azureml.core.VERSION\n",
+"pd.set_option(\"display.max_colwidth\", None)\n",
 "outputDf = pd.DataFrame(data=output, index=[\"\"])\n",
 "outputDf.T"
@@ -57,7 +57,7 @@
 "Notebook synopsis:\n",
 "\n",
 "1. Creating an Experiment in an existing Workspace\n",
-"2. Configuration and remote run of AutoML for a time-series model exploring Regression learners, Arima, Prophet and DNNs\n",
+"2. Configuration and remote run of AutoML for a time-series model exploring DNNs\n",
 "4. Evaluating the fitted model using a rolling test "
 ]
 },
@@ -92,8 +92,7 @@
 "# Squash warning messages for cleaner output in the notebook\n",
 "warnings.showwarning = lambda *args, **kwargs: None\n",
 "\n",
-"from azureml.core.workspace import Workspace\n",
-"from azureml.core.experiment import Experiment\n",
+"from azureml.core import Workspace, Experiment, Dataset\n",
 "from azureml.train.automl import AutoMLConfig\n",
 "from matplotlib import pyplot as plt\n",
 "from sklearn.metrics import mean_absolute_error, mean_squared_error\n",
@@ -148,6 +147,7 @@
 "output[\"Resource Group\"] = ws.resource_group\n",
 "output[\"Location\"] = ws.location\n",
 "output[\"Run History Name\"] = experiment_name\n",
 "output[\"SDK Version\"] = azureml.core.VERSION\n",
+"pd.set_option(\"display.max_colwidth\", None)\n",
 "outputDf = pd.DataFrame(data=output, index=[\"\"])\n",
 "outputDf.T"
@@ -298,40 +298,21 @@
 "from helper import split_full_for_forecasting\n",
 "\n",
 "train, valid = split_full_for_forecasting(df, time_column_name)\n",
-"train.to_csv(\"train.csv\")\n",
-"valid.to_csv(\"valid.csv\")\n",
-"test_df.to_csv(\"test.csv\")\n",
 "\n",
+"# Reset index to create a Tabular Dataset.\n",
+"train.reset_index(inplace=True)\n",
+"valid.reset_index(inplace=True)\n",
+"test_df.reset_index(inplace=True)\n",
+"\n",
 "datastore = ws.get_default_datastore()\n",
-"datastore.upload_files(\n",
-"    files=[\"./train.csv\"],\n",
-"    target_path=\"github-dataset/tabular/\",\n",
-"    overwrite=True,\n",
-"    show_progress=True,\n",
-")\n",
+"train_dataset = Dataset.Tabular.register_pandas_dataframe(\n",
+"    train, target=(datastore, \"dataset/\"), name=\"Github_DAU_train\"\n",
+")\n",
-"datastore.upload_files(\n",
-"    files=[\"./valid.csv\"],\n",
-"    target_path=\"github-dataset/tabular/\",\n",
-"    overwrite=True,\n",
-"    show_progress=True,\n",
-")\n",
+"valid_dataset = Dataset.Tabular.register_pandas_dataframe(\n",
+"    valid, target=(datastore, \"dataset/\"), name=\"Github_DAU_valid\"\n",
+")\n",
-"datastore.upload_files(\n",
-"    files=[\"./test.csv\"],\n",
-"    target_path=\"github-dataset/tabular/\",\n",
-"    overwrite=True,\n",
-"    show_progress=True,\n",
-")\n",
-"\n",
-"from azureml.core import Dataset\n",
-"\n",
-"train_dataset = Dataset.Tabular.from_delimited_files(\n",
-"    path=[(datastore, \"github-dataset/tabular/train.csv\")]\n",
-")\n",
-"valid_dataset = Dataset.Tabular.from_delimited_files(\n",
-"    path=[(datastore, \"github-dataset/tabular/valid.csv\")]\n",
-")\n",
-"test_dataset = Dataset.Tabular.from_delimited_files(\n",
-"    path=[(datastore, \"github-dataset/tabular/test.csv\")]\n",
-")\n",
+"test_dataset = Dataset.Tabular.register_pandas_dataframe(\n",
+"    test_df, target=(datastore, \"dataset/\"), name=\"Github_DAU_test\"\n",
+")"
 ]
 },
@@ -397,7 +378,7 @@
 "    freq=\"D\",  # Set the forecast frequency to be daily\n",
 ")\n",
 "\n",
-"# We will disable the enable_early_stopping flag to ensure the DNN model is recommended for demonstration purpose.\n",
+"# To only allow the TCNForecaster we set the allowed_models parameter to reflect this.\n",
 "automl_config = AutoMLConfig(\n",
 "    task=\"forecasting\",\n",
 "    primary_metric=\"normalized_root_mean_squared_error\",\n",
@@ -410,7 +391,7 @@
 "    max_concurrent_iterations=4,\n",
 "    max_cores_per_iteration=-1,\n",
 "    enable_dnn=True,\n",
-"    enable_early_stopping=False,\n",
+"    allowed_models=[\"TCNForecaster\"],\n",
 "    forecasting_parameters=forecasting_parameters,\n",
 ")"
 ]
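Taken together, the two hunks above switch the demonstration from disabling early stopping to constraining the model search space. A hedged sketch of the resulting configuration; the workspace lookup, dataset, compute, and column names are assumptions, not part of the diff:

```python
# Sketch under assumptions: dataset/compute names and the label column are
# illustrative placeholders.
from azureml.automl.core.forecasting_parameters import ForecastingParameters
from azureml.core import Dataset, Workspace
from azureml.train.automl import AutoMLConfig

ws = Workspace.from_config()
train_dataset = Dataset.get_by_name(ws, "Github_DAU_train")  # assumed name
compute_target = ws.compute_targets["gpu-cluster"]           # assumed name

forecasting_parameters = ForecastingParameters(
    time_column_name="date", forecast_horizon=14, freq="D")

automl_config = AutoMLConfig(
    task="forecasting",
    primary_metric="normalized_root_mean_squared_error",
    training_data=train_dataset,
    label_column_name="count",         # assumed label column
    compute_target=compute_target,
    enable_dnn=True,
    allowed_models=["TCNForecaster"],  # restricts the search space
    forecasting_parameters=forecasting_parameters,
)
```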
@@ -503,7 +484,9 @@
 "if not forecast_model in summary_df[\"run_id\"]:\n",
 "    forecast_model = \"ForecastTCN\"\n",
 "\n",
-"best_dnn_run_id = summary_df[\"run_id\"][forecast_model]\n",
+"best_dnn_run_id = summary_df[summary_df[\"Score\"] == summary_df[\"Score\"].min()][\n",
+"    \"run_id\"\n",
+"][forecast_model]\n",
 "best_dnn_run = Run(experiment, best_dnn_run_id)"
 ]
 },
@@ -564,11 +547,6 @@
 },
 "outputs": [],
 "source": [
-"from azureml.core import Dataset\n",
-"\n",
-"test_dataset = Dataset.Tabular.from_delimited_files(\n",
-"    path=[(datastore, \"github-dataset/tabular/test.csv\")]\n",
-")\n",
 "# preview the first 3 rows of the dataset\n",
 "test_dataset.take(5).to_pandas_dataframe()"
 ]
@@ -79,9 +79,7 @@ def get_result_df(remote_run):
         if "goal" in run.properties:
             goal_minimize = run.properties["goal"].split("_")[-1] == "min"

-    summary_df = summary_df.T.sort_values(
-        "Score", ascending=goal_minimize
-    ).drop_duplicates(["run_algorithm"])
+    summary_df = summary_df.T.sort_values("Score", ascending=goal_minimize)
     summary_df = summary_df.set_index("run_algorithm")
     return summary_df
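A synthetic reproduction of the helper change: the summary is now sorted by score without a deduplication pass, then indexed by algorithm name:

```python
# Toy data; in the helper, summary_df is built from child-run metrics.
import pandas as pd

summary_df = pd.DataFrame({
    "run_algorithm": ["TCNForecaster", "Prophet", "Arima"],
    "Score": [0.12, 0.27, 0.31],
    "run_id": ["run_3", "run_2", "run_1"],
})
goal_minimize = True  # parsed from the run's "goal" property upstream
summary_df = summary_df.sort_values("Score", ascending=goal_minimize)
summary_df = summary_df.set_index("run_algorithm")
print(summary_df)
```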
@@ -78,6 +78,7 @@
 "output[\"Resource Group\"] = ws.resource_group\n",
 "output[\"Location\"] = ws.location\n",
 "output[\"Default datastore name\"] = dstore.name\n",
 "output[\"SDK Version\"] = azureml.core.VERSION\n",
+"pd.set_option(\"display.max_colwidth\", None)\n",
 "outputDf = pd.DataFrame(data=output, index=[\"\"])\n",
 "outputDf.T"
@@ -78,6 +78,7 @@
 "output[\"Resource Group\"] = ws.resource_group\n",
 "output[\"Location\"] = ws.location\n",
 "output[\"Default datastore name\"] = dstore.name\n",
 "output[\"SDK Version\"] = azureml.core.VERSION\n",
+"pd.set_option(\"display.max_colwidth\", None)\n",
 "outputDf = pd.DataFrame(data=output, index=[\"\"])\n",
 "outputDf.T"
@@ -324,7 +325,7 @@
 "| **enable_early_stopping** | Flag to enable early termination if the score is not improving in the short term. |\n",
 "| **time_column_name** | The name of your time column. |\n",
 "| **enable_engineered_explanations** | Engineered feature explanations will be downloaded if enable_engineered_explanations flag is set to True. By default it is set to False to save storage space. |\n",
-"| **time_series_id_column_name** | The column names used to uniquely identify timeseries in data that has multiple rows with the same timestamp. |\n",
+"| **time_series_id_column_names** | The column names used to uniquely identify timeseries in data that has multiple rows with the same timestamp. |\n",
 "| **track_child_runs** | Flag to disable tracking of child runs. Only best run is tracked if the flag is set to False (this includes the model and metrics of the run). |\n",
 "| **pipeline_fetch_max_batch_size** | Determines how many pipelines (training algorithms) to fetch at a time for training, this helps reduce throttling when training at large scale. |\n",
 "| **partition_column_names** | The names of columns used to group your models. For timeseries, the groups must not split up individual time-series. That is, each group must contain one or more whole time-series. |"
@@ -355,8 +356,8 @@
 "    \"n_cross_validations\": 3,\n",
 "    \"time_column_name\": \"WeekStarting\",\n",
 "    \"drop_column_names\": \"Revenue\",\n",
-"    \"max_horizon\": 6,\n",
-"    \"grain_column_names\": partition_column_names,\n",
+"    \"forecast_horizon\": 6,\n",
+"    \"time_series_id_column_names\": partition_column_names,\n",
 "    \"track_child_runs\": False,\n",
 "}\n",
 "\n",
@@ -112,6 +112,7 @@
 "output[\"Resource Group\"] = ws.resource_group\n",
 "output[\"Location\"] = ws.location\n",
 "output[\"Run History Name\"] = experiment_name\n",
 "output[\"SDK Version\"] = azureml.core.VERSION\n",
+"pd.set_option(\"display.max_colwidth\", None)\n",
 "outputDf = pd.DataFrame(data=output, index=[\"\"])\n",
 "outputDf.T"
@@ -93,6 +93,7 @@
 "output[\"Resource Group\"] = ws.resource_group\n",
 "output[\"Location\"] = ws.location\n",
 "output[\"Experiment Name\"] = experiment.name\n",
 "output[\"SDK Version\"] = azureml.core.VERSION\n",
+"pd.set_option(\"display.max_colwidth\", None)\n",
 "outputDf = pd.DataFrame(data=output, index=[\"\"])\n",
 "outputDf.T"
@@ -93,6 +93,7 @@
 "output[\"Resource Group\"] = ws.resource_group\n",
 "output[\"Location\"] = ws.location\n",
 "output[\"Experiment Name\"] = experiment.name\n",
 "output[\"SDK Version\"] = azureml.core.VERSION\n",
+"pd.set_option(\"display.max_colwidth\", None)\n",
 "outputDf = pd.DataFrame(data=output, index=[\"\"])\n",
 "outputDf.T"
@@ -89,6 +89,7 @@
 "output[\"Resource Group\"] = ws.resource_group\n",
 "output[\"Location\"] = ws.location\n",
 "output[\"Run History Name\"] = experiment_name\n",
 "output[\"SDK Version\"] = azureml.core.VERSION\n",
+"pd.set_option(\"display.max_colwidth\", None)\n",
 "outputDf = pd.DataFrame(data=output, index=[\"\"])\n",
 "outputDf.T"
@@ -82,7 +82,7 @@
 "source": [
 "## Create trained model\n",
 "\n",
-"For this example, we will train a small model on scikit-learn's [diabetes dataset](https://scikit-learn.org/stable/datasets/index.html#diabetes-dataset). "
+"For this example, we will train a small model on scikit-learn's [diabetes dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html). "
 ]
 },
 {
@@ -279,7 +279,9 @@
 "\n",
 "\n",
 "environment = Environment('my-sklearn-environment')\n",
-"environment.python.conda_dependencies = CondaDependencies.create(pip_packages=[\n",
+"environment.python.conda_dependencies = CondaDependencies.create(conda_packages=[\n",
+"    'pip==20.2.4'],\n",
+"    pip_packages=[\n",
 "    'azureml-defaults',\n",
 "    'inference-schema[numpy-support]',\n",
 "    'joblib',\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"environment = Environment('my-sklearn-environment')\n",
|
||||
"environment.python.conda_dependencies = CondaDependencies.create(pip_packages=[\n",
|
||||
"environment.python.conda_dependencies = CondaDependencies.create(conda_packages=[\n",
|
||||
" 'pip==20.2.4'],\n",
|
||||
" pip_packages=[\n",
|
||||
" 'azureml-defaults',\n",
|
||||
" 'inference-schema[numpy-support]',\n",
|
||||
" 'joblib',\n",
|
||||
|
||||
@@ -105,7 +105,9 @@
|
||||
"from azureml.core.conda_dependencies import CondaDependencies\n",
|
||||
"\n",
|
||||
"environment=Environment('my-sklearn-environment')\n",
|
||||
"environment.python.conda_dependencies = CondaDependencies.create(pip_packages=[\n",
|
||||
"environment.python.conda_dependencies = CondaDependencies.create(conda_packages=[\n",
|
||||
" 'pip==20.2.4'],\n",
|
||||
" pip_packages=[\n",
|
||||
" 'azureml-defaults',\n",
|
||||
" 'inference-schema[numpy-support]',\n",
|
||||
" 'numpy',\n",
|
||||
|
||||
@@ -106,7 +106,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"print(\"This notebook was created using version 1.40.0 of the Azure ML SDK\")\n",
+"print(\"This notebook was created using version 1.41.0 of the Azure ML SDK\")\n",
 "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
 ]
 },
@@ -358,6 +358,7 @@
 "# cause errors. Please take extra care when specifying your dependencies in a production environment.\n",
 "myenv = CondaDependencies.create(\n",
 "    python_version=python_version,\n",
+"    conda_packages=['pip==20.2.4'],\n",
 "    pip_packages=['pyyaml', sklearn_dep, pandas_dep] + azureml_pip_packages)\n",
 "\n",
 "with open(\"myenv.yml\",\"w\") as f:\n",
@@ -1,5 +1,11 @@
 FROM mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.0.3-cudnn8-ubuntu18.04:20211111.v1

+# CUDA repository key rotation: https://forums.developer.nvidia.com/t/notice-cuda-linux-repository-key-rotation/212771
+RUN apt-key del 7fa2af80
+ENV distro ubuntu1804
+ENV arch x86_64
+RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/$distro/$arch/3bf863cc.pub
+
 RUN apt-get update && apt-get install -y --no-install-recommends \
     python-opengl \
     rsync \
@@ -57,6 +63,10 @@ RUN pip install --no-cache-dir \
     lz4 \
     psutil \
     setproctitle
+
+# This is required for ray 0.8.7
+RUN pip install -U aiohttp==3.7.4
+
 # This is needed for mpi to locate libpython
 ENV LD_LIBRARY_PATH $AZUREML_CONDA_ENVIRONMENT_PATH/lib:$LD_LIBRARY_PATH
@@ -97,7 +97,7 @@
 "import azureml.core\n",
 "\n",
 "# Check core SDK version number\n",
-"print(\"Azure Machine Learning SDK Version: \", azureml.core.VERSION)"
+"print(\"Azure Machine Learning SDK version: \", azureml.core.VERSION)"
 ]
 },
 {
@@ -242,11 +242,7 @@
 "    register(workspace=ws)\n",
 "ray_cpu_build_details = ray_cpu_env.build(workspace=ws)\n",
 "\n",
-"import time\n",
-"while ray_cpu_build_details.status not in ['Succeeded', 'Failed']:\n",
-"    print(f'Awaiting completion of ray CPU environment build. Current status is: {ray_cpu_build_details.status}')\n",
-"    time.sleep(30)\n",
-"print(f'status={ray_cpu_build_details.status}')"
+"ray_cpu_build_details.wait_for_completion(show_output=True)"
 ]
 },
 {
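A hedged sketch of the simplified wait; the environment name and Dockerfile path are assumptions standing in for the notebook's variables:

```python
# wait_for_completion blocks until the image build succeeds or fails and
# streams build logs, replacing the hand-rolled sleep-and-poll loop.
from azureml.core import Environment, Workspace

ws = Workspace.from_config()
ray_cpu_env = Environment.from_dockerfile(
    name="ray-cpu", dockerfile="./Dockerfile-cpu").register(workspace=ws)
ray_cpu_build_details = ray_cpu_env.build(workspace=ws)
ray_cpu_build_details.wait_for_completion(show_output=True)
```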
@@ -279,11 +275,7 @@
 "    register(workspace=ws)\n",
 "ray_gpu_build_details = ray_gpu_env.build(workspace=ws)\n",
 "\n",
-"import time\n",
-"while ray_gpu_build_details.status not in ['Succeeded', 'Failed']:\n",
-"    print(f'Awaiting completion of ray GPU environment build. Current status is: {ray_gpu_build_details.status}')\n",
-"    time.sleep(30)\n",
-"print(f'status={ray_gpu_build_details.status}')"
+"ray_gpu_build_details.wait_for_completion(show_output=True)"
 ]
 },
 {
@@ -255,11 +255,7 @@
 "    register(workspace=ws)\n",
 "ray_env_build_details = ray_environment.build(workspace=ws)\n",
 "\n",
-"# import time\n",
-"while ray_env_build_details.status not in ['Succeeded', 'Failed']:\n",
-"    print(f'Awaiting completion of environment build. Current status is: {ray_env_build_details.status}')\n",
-"    time.sleep(30)\n",
-"print(f'status={ray_env_build_details.status}')"
+"ray_env_build_details.wait_for_completion(show_output=True)"
 ]
 },
 {
@@ -223,11 +223,7 @@
 "    register(workspace=ws)\n",
 "ray_env_build_details = ray_environment.build(workspace=ws)\n",
 "\n",
-"import time\n",
-"while ray_env_build_details.status not in ['Succeeded', 'Failed']:\n",
-"    print(f'Awaiting completion of environment build. Current status is: {ray_env_build_details.status}')\n",
-"    time.sleep(30)\n",
-"print(f'status={ray_env_build_details.status}')"
+"ray_env_build_details.wait_for_completion(show_output=True)"
 ]
 },
 {
@@ -8,10 +8,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     rm -rf /var/lib/apt/lists/* && \
     rm -rf /usr/share/man/*

-RUN conda install -y conda=4.7.12 python=3.7 && conda clean -ay && \
-    pip install ray-on-aml==0.1.6 & \
-    pip install --upgrade ray==0.8.3 \
-    ray[rllib,dashboard,tune]==0.8.3 & \
+RUN conda install -y conda=4.12.0 python=3.7 && conda clean -ay
+RUN pip install ray-on-aml==0.1.6 & \
     pip install --no-cache-dir \
     azureml-defaults \
     azureml-dataset-runtime[fuse,pandas] \
@@ -28,7 +26,9 @@ RUN conda install -y conda=4.7.12 python=3.7 && conda clean -ay && \
     psutil \
     setproctitle \
     pygame \
-    gym[atari]==0.17.3 && \
+    gym[classic_control]==0.19.0 && \
     conda install -y -c conda-forge x264='1!152.20180717' ffmpeg=4.0.2 && \
     conda install -c anaconda opencv

+RUN pip install --upgrade ray==0.8.3 \
+    ray[rllib,dashboard,tune]==0.8.3
@@ -246,7 +246,9 @@
 "ray_environment = Environment. \\\n",
 "    from_dockerfile(name=ray_environment_name, dockerfile=ray_environment_dockerfile_path). \\\n",
 "    register(workspace=ws)\n",
-"ray_gpu_build_details = ray_environment.build(workspace=ws)"
+"ray_cpu_build_details = ray_environment.build(workspace=ws)\n",
+"\n",
+"ray_cpu_build_details.wait_for_completion(show_output=True)"
 ]
 },
 {
@@ -95,7 +95,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"print(\"This notebook was created using version 1.40.0 of the Azure ML SDK\")\n",
+"print(\"This notebook was created using version 1.41.0 of the Azure ML SDK\")\n",
 "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
 ]
 },
Binary file not shown.
@@ -30,7 +30,7 @@ _categorical_columns = [


 def fetch_census_dataset():
-    """Fetch the Adult Census Dataset.
+    """Fetch the Adult Census Dataset

     This uses a particular URL for the Adult Census dataset. The code
     is a simplified version of fetch_openml() in sklearn.
@@ -39,45 +39,25 @@ def fetch_census_dataset():
     https://openml.org/data/v1/download/1595261.gz
     (as of 2021-03-31)
     """

+    dataset_path = "1595261.gz"
+
     try:
-        from urllib import urlretrieve
-    except ImportError:
-        from urllib.request import urlretrieve
+        file_stream = gzip.GzipFile(filename=dataset_path, mode='rb')

-    filename = "1595261.gz"
-    data_url = "https://rainotebookscdn.blob.core.windows.net/datasets/"
-
-    remaining_attempts = 5
-    sleep_duration = 10
-    while remaining_attempts > 0:
-        try:
-            urlretrieve(data_url + filename, filename)
-
-            http_stream = gzip.GzipFile(filename=filename, mode='rb')
-
-            with closing(http_stream):
+        with closing(file_stream):
             def _stream_generator(response):
                 for line in response:
                     yield line.decode('utf-8')

-                stream = _stream_generator(http_stream)
+            stream = _stream_generator(file_stream)
             data = arff.load(stream)
-        except Exception as exc:  # noqa: B902
-            remaining_attempts -= 1
-            print("Error downloading dataset from {} ({} attempt(s) remaining)"
-                  .format(data_url, remaining_attempts))
-            print(exc)
-            sleep(sleep_duration)
-            sleep_duration *= 2
-            continue
-        else:
-            # dataset successfully downloaded
-            break
-    else:
-        raise Exception("Could not retrieve dataset from {}.".format(data_url))
+    except Exception as exc:
+        raise Exception("Could not load dataset from {} with exception {}".format(dataset_path, exc))

     attributes = OrderedDict(data['attributes'])
     arff_columns = list(attributes)

     raw_df = pd.DataFrame(data=data['data'], columns=arff_columns)

     target_column_name = 'class'
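A standalone sketch of the new load path: the gzipped ARFF file is read from disk rather than downloaded, with lines decoded for liac-arff exactly as the helper's generator does:

```python
# Assumes 1595261.gz ships alongside the script, as the rewritten helper does.
import gzip
from contextlib import closing

import arff  # liac-arff

dataset_path = "1595261.gz"
with closing(gzip.GzipFile(filename=dataset_path, mode='rb')) as file_stream:
    data = arff.load(line.decode('utf-8') for line in file_stream)
print(len(data['data']), "rows loaded")
```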
@@ -100,7 +100,7 @@
 "\n",
 "# Check core SDK version number\n",
 "\n",
-"print(\"This notebook was created using SDK version 1.40.0, you are currently running version\", azureml.core.VERSION)"
+"print(\"This notebook was created using SDK version 1.41.0, you are currently running version\", azureml.core.VERSION)"
 ]
 },
 {
@@ -363,6 +363,43 @@
 "run.log_image(name='Hyperbolic Tangent', plot=plt)"
 ]
 },
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"### Logging for when more Metric Names are required\n",
+"\n",
+"Limits on logging are internally enforced to ensure a smooth experience, however these can sometimes be limiting, particularly in terms of the limit on metric names.\n",
+"\n",
+"The \"Logging Vectors\" or \"Logging Tables\" examples previously can be expanded upon to use up to 15 columns to increase this limit, with the information still being presented in Run Details as a chart, and being directly comparable in experiment reports.\n",
+"\n",
+"**Note:** see [Azure Machine Learning Limits Documentation](https://aka.ms/azure-machine-learning-limits) for more information on service limits.\n",
+"**Note:** tables logged into the run are expected to be relatively small. Logging very large tables into Azure ML can result in reduced performance. If you need to store large amounts of data associated with the run, you can write the data to file that will be uploaded."
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"import random\n",
+"metricNames = [ \"Accuracy\", \"Precision\", \"Recall\" ]\n",
+"columnNames = [ \"expected\", \"actual\", \"calculated\", \"inferred\", \"determined\", \"predicted\", \"forecast\", \"speculated\", \"assumed\", \"required\", \"intended\", \"deduced\", \"theorized\", \"hoped\", \"hypothesized\" ]\n",
+"\n",
+"for step in range(1000):\n",
+"    for metricName in metricNames:\n",
+"\n",
+"        metricKeyValueDictionary={}\n",
+"        for column in columnNames:\n",
+"            metricKeyValueDictionary[column] = random.randrange(0, step + 1)\n",
+"\n",
+"        run.log_row(\n",
+"            metricName,\n",
+"            \"Example row for metric \" + metricName,\n",
+"            **metricKeyValueDictionary)"
+]
+},
 {
 "cell_type": "markdown",
 "metadata": {},
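For completeness, a one-liner sketch of reading the wide metrics back; `run` is assumed to be the same Run object the new cell logs to:

```python
# Each log_row call appends a row to its metric; get_metrics returns the
# accumulated columns keyed by metric name.
metrics = run.get_metrics()
print(sorted(metrics.keys()))  # e.g. includes "Accuracy", "Precision", "Recall"
```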
@@ -498,7 +535,6 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"import os\n",
 "os.makedirs('files', exist_ok=True)\n",
 "\n",
 "for f in run.get_file_names():\n",
@@ -102,7 +102,7 @@
 "source": [
 "import azureml.core\n",
 "\n",
-"print(\"This notebook was created using version 1.40.0 of the Azure ML SDK\")\n",
+"print(\"This notebook was created using version 1.41.0 of the Azure ML SDK\")\n",
 "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
 ]
 },