Compare commits
1 Commits
master
...
azureml-sd
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
71e061b193 |
@@ -1,21 +1,5 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -99,16 +83,6 @@
|
||||
"This sample notebook may use features that are not available in previous versions of the Azure ML SDK."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"This notebook was created using version 1.38.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -144,18 +118,18 @@
|
||||
"ws = Workspace.from_config()\n",
|
||||
"\n",
|
||||
"# choose a name for experiment\n",
|
||||
"experiment_name = 'automl-classification-bmarketing-all'\n",
|
||||
"experiment_name = \"automl-classification-bmarketing-all\"\n",
|
||||
"\n",
|
||||
"experiment = Experiment(ws, experiment_name)\n",
|
||||
"\n",
|
||||
"output = {}\n",
|
||||
"output['Subscription ID'] = ws.subscription_id\n",
|
||||
"output['Workspace'] = ws.name\n",
|
||||
"output['Resource Group'] = ws.resource_group\n",
|
||||
"output['Location'] = ws.location\n",
|
||||
"output['Experiment Name'] = experiment.name\n",
|
||||
"pd.set_option('display.max_colwidth', -1)\n",
|
||||
"outputDf = pd.DataFrame(data = output, index = [''])\n",
|
||||
"output[\"Subscription ID\"] = ws.subscription_id\n",
|
||||
"output[\"Workspace\"] = ws.name\n",
|
||||
"output[\"Resource Group\"] = ws.resource_group\n",
|
||||
"output[\"Location\"] = ws.location\n",
|
||||
"output[\"Experiment Name\"] = experiment.name\n",
|
||||
"pd.set_option(\"display.max_colwidth\", -1)\n",
|
||||
"outputDf = pd.DataFrame(data=output, index=[\"\"])\n",
|
||||
"outputDf.T"
|
||||
]
|
||||
},
|
||||
@@ -176,7 +150,9 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
|
||||
@@ -188,12 +164,12 @@
|
||||
"# Verify that cluster does not exist already\n",
|
||||
"try:\n",
|
||||
" compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)\n",
|
||||
" print('Found existing cluster, use it.')\n",
|
||||
" print(\"Found existing cluster, use it.\")\n",
|
||||
"except ComputeTargetException:\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS12_V2',\n",
|
||||
" max_nodes=6)\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(\n",
|
||||
" vm_size=\"STANDARD_DS12_V2\", max_nodes=6\n",
|
||||
" )\n",
|
||||
" compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)\n",
|
||||
"\n",
|
||||
"compute_target.wait_for_completion(show_output=True)"
|
||||
]
|
||||
},
|
||||
@@ -226,7 +202,9 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = pd.read_csv(\"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv\")\n",
|
||||
"data = pd.read_csv(\n",
|
||||
" \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv\"\n",
|
||||
")\n",
|
||||
"data.head()"
|
||||
]
|
||||
},
|
||||
@@ -241,7 +219,12 @@
|
||||
"\n",
|
||||
"missing_rate = 0.75\n",
|
||||
"n_missing_samples = int(np.floor(data.shape[0] * missing_rate))\n",
|
||||
"missing_samples = np.hstack((np.zeros(data.shape[0] - n_missing_samples, dtype=np.bool), np.ones(n_missing_samples, dtype=np.bool)))\n",
|
||||
"missing_samples = np.hstack(\n",
|
||||
" (\n",
|
||||
" np.zeros(data.shape[0] - n_missing_samples, dtype=np.bool),\n",
|
||||
" np.ones(n_missing_samples, dtype=np.bool),\n",
|
||||
" )\n",
|
||||
")\n",
|
||||
"rng = np.random.RandomState(0)\n",
|
||||
"rng.shuffle(missing_samples)\n",
|
||||
"missing_features = rng.randint(0, data.shape[1], n_missing_samples)\n",
|
||||
@@ -254,19 +237,21 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if not os.path.isdir('data'):\n",
|
||||
" os.mkdir('data')\n",
|
||||
" \n",
|
||||
"if not os.path.isdir(\"data\"):\n",
|
||||
" os.mkdir(\"data\")\n",
|
||||
"# Save the train data to a csv to be uploaded to the datastore\n",
|
||||
"pd.DataFrame(data).to_csv(\"data/train_data.csv\", index=False)\n",
|
||||
"\n",
|
||||
"ds = ws.get_default_datastore()\n",
|
||||
"ds.upload(src_dir='./data', target_path='bankmarketing', overwrite=True, show_progress=True)\n",
|
||||
"\n",
|
||||
"ds.upload(\n",
|
||||
" src_dir=\"./data\", target_path=\"bankmarketing\", overwrite=True, show_progress=True\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Upload the training data as a tabular dataset for access during training on remote compute\n",
|
||||
"train_data = Dataset.Tabular.from_delimited_files(path=ds.path('bankmarketing/train_data.csv'))\n",
|
||||
"train_data = Dataset.Tabular.from_delimited_files(\n",
|
||||
" path=ds.path(\"bankmarketing/train_data.csv\")\n",
|
||||
")\n",
|
||||
"label = \"y\""
|
||||
]
|
||||
},
|
||||
@@ -343,27 +328,30 @@
|
||||
" \"max_concurrent_iterations\": 4,\n",
|
||||
" \"max_cores_per_iteration\": -1,\n",
|
||||
" # \"n_cross_validations\": 2,\n",
|
||||
" \"primary_metric\": 'AUC_weighted',\n",
|
||||
" \"featurization\": 'auto',\n",
|
||||
" \"primary_metric\": \"AUC_weighted\",\n",
|
||||
" \"featurization\": \"auto\",\n",
|
||||
" \"verbosity\": logging.INFO,\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"automl_config = AutoMLConfig(task = 'classification',\n",
|
||||
" debug_log = 'automl_errors.log',\n",
|
||||
"automl_config = AutoMLConfig(\n",
|
||||
" task=\"classification\",\n",
|
||||
" debug_log=\"automl_errors.log\",\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" experiment_exit_score=0.9984,\n",
|
||||
" blocked_models = ['KNN','LinearSVM'],\n",
|
||||
" blocked_models=[\"KNN\", \"LinearSVM\"],\n",
|
||||
" enable_onnx_compatible_models=True,\n",
|
||||
" training_data=train_data,\n",
|
||||
" label_column_name=label,\n",
|
||||
" validation_data=validation_dataset,\n",
|
||||
" **automl_settings\n",
|
||||
" **automl_settings,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"Call the `submit` method on the experiment object and pass the run configuration. Execution of local runs is synchronous. Depending on the data and the number of iterations this can run for a while. Validation errors and current status will be shown when setting `show_output=True` and the execution will be synchronous."
|
||||
]
|
||||
@@ -379,7 +367,9 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"Run the following cell to access previous runs. Uncomment the cell below and update the run_id."
|
||||
]
|
||||
@@ -431,7 +421,9 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Download the featurization summary JSON file locally\n",
|
||||
"best_run.download_file(\"outputs/featurization_summary.json\", \"featurization_summary.json\")\n",
|
||||
"best_run.download_file(\n",
|
||||
" \"outputs/featurization_summary.json\", \"featurization_summary.json\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Render the JSON as a pandas DataFrame\n",
|
||||
"with open(\"featurization_summary.json\", \"r\") as f:\n",
|
||||
@@ -454,6 +446,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.widgets import RunDetails\n",
|
||||
"\n",
|
||||
"RunDetails(remote_run).show()"
|
||||
]
|
||||
},
|
||||
@@ -473,9 +466,12 @@
|
||||
"source": [
|
||||
"# Wait for the best model explanation run to complete\n",
|
||||
"from azureml.core.run import Run\n",
|
||||
"\n",
|
||||
"model_explainability_run_id = remote_run.id + \"_\" + \"ModelExplain\"\n",
|
||||
"print(model_explainability_run_id)\n",
|
||||
"model_explainability_run = Run(experiment=experiment, run_id=model_explainability_run_id)\n",
|
||||
"model_explainability_run = Run(\n",
|
||||
" experiment=experiment, run_id=model_explainability_run_id\n",
|
||||
")\n",
|
||||
"model_explainability_run.wait_for_completion()\n",
|
||||
"\n",
|
||||
"# Get the best run object\n",
|
||||
@@ -556,6 +552,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.automl.runtime.onnx_convert import OnnxConverter\n",
|
||||
"\n",
|
||||
"onnx_fl_path = \"./best_model.onnx\"\n",
|
||||
"OnnxConverter.save_onnx_model(onnx_mdl, onnx_fl_path)"
|
||||
]
|
||||
@@ -580,13 +577,17 @@
|
||||
"\n",
|
||||
"from azureml.automl.runtime.onnx_convert import OnnxInferenceHelper\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_onnx_res(run):\n",
|
||||
" res_path = 'onnx_resource.json'\n",
|
||||
" run.download_file(name=constants.MODEL_RESOURCE_PATH_ONNX, output_file_path=res_path)\n",
|
||||
" res_path = \"onnx_resource.json\"\n",
|
||||
" run.download_file(\n",
|
||||
" name=constants.MODEL_RESOURCE_PATH_ONNX, output_file_path=res_path\n",
|
||||
" )\n",
|
||||
" with open(res_path) as f:\n",
|
||||
" result = json.load(f)\n",
|
||||
" return result\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"if sys.version_info < OnnxConvertConstants.OnnxIncompatiblePythonVersion:\n",
|
||||
" test_df = test_dataset.to_pandas_dataframe()\n",
|
||||
" mdl_bytes = onnx_mdl.SerializeToString()\n",
|
||||
@@ -598,7 +599,7 @@
|
||||
" print(pred_onnx)\n",
|
||||
" print(pred_prob_onnx)\n",
|
||||
"else:\n",
|
||||
" print('Please use Python version 3.6 or 3.7 to run the inference helper.')"
|
||||
" print(\"Please use Python version 3.6 or 3.7 to run the inference helper.\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -609,7 +610,7 @@
|
||||
"\n",
|
||||
"### Retrieve the Best Model\n",
|
||||
"\n",
|
||||
"Below we select the best pipeline from our iterations. The `get_best_child` method returns the Run object for the best model based on the default primary metric. There are additional flags that can be passed to the method if we want to retrieve the best Run based on any of the other supported metrics, or if we are just interested in the best run among the ONNX compatible runs. As always, you can execute `remote_run.get_best_child??` in a new cell to view the source or docs for the function."
|
||||
"Below we select the best pipeline from our iterations. The `get_best_child` method returns the Run object for the best model based on the default primary metric. There are additional flags that can be passed to the method if we want to retrieve the best Run based on any of the other supported metrics, or if we are just interested in the best run among the ONNX compatible runs. As always, you can execute `??remote_run.get_best_child` in a new cell to view the source or docs for the function."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -618,7 +619,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"remote_run.get_best_child??"
|
||||
"??remote_run.get_best_child"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -647,11 +648,11 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model_name = best_run.properties['model_name']\n",
|
||||
"model_name = best_run.properties[\"model_name\"]\n",
|
||||
"\n",
|
||||
"script_file_name = 'inference/score.py'\n",
|
||||
"script_file_name = \"inference/score.py\"\n",
|
||||
"\n",
|
||||
"best_run.download_file('outputs/scoring_file_v_1_0_0.py', 'inference/score.py')"
|
||||
"best_run.download_file(\"outputs/scoring_file_v_1_0_0.py\", \"inference/score.py\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -668,11 +669,15 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"description = 'AutoML Model trained on bank marketing data to predict if a client will subscribe to a term deposit'\n",
|
||||
"description = \"AutoML Model trained on bank marketing data to predict if a client will subscribe to a term deposit\"\n",
|
||||
"tags = None\n",
|
||||
"model = remote_run.register_model(model_name = model_name, description = description, tags = tags)\n",
|
||||
"model = remote_run.register_model(\n",
|
||||
" model_name=model_name, description=description, tags=tags\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(remote_run.model_id) # This will be written to the script file later in the notebook."
|
||||
"print(\n",
|
||||
" remote_run.model_id\n",
|
||||
") # This will be written to the script file later in the notebook."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -690,16 +695,20 @@
|
||||
"source": [
|
||||
"from azureml.core.model import InferenceConfig\n",
|
||||
"from azureml.core.webservice import AciWebservice\n",
|
||||
"from azureml.core.webservice import Webservice\n",
|
||||
"from azureml.core.model import Model\n",
|
||||
"from azureml.core.environment import Environment\n",
|
||||
"\n",
|
||||
"inference_config = InferenceConfig(environment = best_run.get_environment(), entry_script=script_file_name)\n",
|
||||
"inference_config = InferenceConfig(entry_script=script_file_name)\n",
|
||||
"\n",
|
||||
"aciconfig = AciWebservice.deploy_configuration(cpu_cores = 2, \n",
|
||||
"aciconfig = AciWebservice.deploy_configuration(\n",
|
||||
" cpu_cores=2,\n",
|
||||
" memory_gb=2,\n",
|
||||
" tags = {'area': \"bmData\", 'type': \"automl_classification\"}, \n",
|
||||
" description = 'sample service for Automl Classification')\n",
|
||||
" tags={\"area\": \"bmData\", \"type\": \"automl_classification\"},\n",
|
||||
" description=\"sample service for Automl Classification\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"aci_service_name = 'automl-sample-bankmarketing-all'\n",
|
||||
"aci_service_name = model_name.lower()\n",
|
||||
"print(aci_service_name)\n",
|
||||
"aci_service = Model.deploy(ws, aci_service_name, [model], inference_config, aciconfig)\n",
|
||||
"aci_service.wait_for_deployment(True)\n",
|
||||
@@ -751,8 +760,8 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"X_test = test_dataset.drop_columns(columns=['y'])\n",
|
||||
"y_test = test_dataset.keep_columns(columns=['y'], validate=True)\n",
|
||||
"X_test = test_dataset.drop_columns(columns=[\"y\"])\n",
|
||||
"y_test = test_dataset.keep_columns(columns=[\"y\"], validate=True)\n",
|
||||
"test_dataset.take(5).to_pandas_dataframe()"
|
||||
]
|
||||
},
|
||||
@@ -774,13 +783,13 @@
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"X_test_json = X_test.to_json(orient='records')\n",
|
||||
"data = \"{\\\"data\\\": \" + X_test_json +\"}\"\n",
|
||||
"headers = {'Content-Type': 'application/json'}\n",
|
||||
"X_test_json = X_test.to_json(orient=\"records\")\n",
|
||||
"data = '{\"data\": ' + X_test_json + \"}\"\n",
|
||||
"headers = {\"Content-Type\": \"application/json\"}\n",
|
||||
"\n",
|
||||
"resp = requests.post(aci_service.scoring_uri, data, headers=headers)\n",
|
||||
"\n",
|
||||
"y_pred = json.loads(json.loads(resp.text))['result']"
|
||||
"y_pred = json.loads(json.loads(resp.text))[\"result\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -814,19 +823,25 @@
|
||||
"import itertools\n",
|
||||
"\n",
|
||||
"cf = confusion_matrix(actual, y_pred)\n",
|
||||
"plt.imshow(cf,cmap=plt.cm.Blues,interpolation='nearest')\n",
|
||||
"plt.imshow(cf, cmap=plt.cm.Blues, interpolation=\"nearest\")\n",
|
||||
"plt.colorbar()\n",
|
||||
"plt.title('Confusion Matrix')\n",
|
||||
"plt.xlabel('Predicted')\n",
|
||||
"plt.ylabel('Actual')\n",
|
||||
"class_labels = ['no','yes']\n",
|
||||
"plt.title(\"Confusion Matrix\")\n",
|
||||
"plt.xlabel(\"Predicted\")\n",
|
||||
"plt.ylabel(\"Actual\")\n",
|
||||
"class_labels = [\"no\", \"yes\"]\n",
|
||||
"tick_marks = np.arange(len(class_labels))\n",
|
||||
"plt.xticks(tick_marks, class_labels)\n",
|
||||
"plt.yticks([-0.5,0,1,1.5],['','no','yes',''])\n",
|
||||
"plt.yticks([-0.5, 0, 1, 1.5], [\"\", \"no\", \"yes\", \"\"])\n",
|
||||
"# plotting text value inside cells\n",
|
||||
"thresh = cf.max() / 2.\n",
|
||||
"thresh = cf.max() / 2.0\n",
|
||||
"for i, j in itertools.product(range(cf.shape[0]), range(cf.shape[1])):\n",
|
||||
" plt.text(j,i,format(cf[i,j],'d'),horizontalalignment='center',color='white' if cf[i,j] >thresh else 'black')\n",
|
||||
" plt.text(\n",
|
||||
" j,\n",
|
||||
" i,\n",
|
||||
" format(cf[i, j], \"d\"),\n",
|
||||
" horizontalalignment=\"center\",\n",
|
||||
" color=\"white\" if cf[i, j] > thresh else \"black\",\n",
|
||||
" )\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
@@ -891,9 +906,9 @@
|
||||
"friendly_name": "Automated ML run with basic edition features.",
|
||||
"index_order": 5,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.6 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python3-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -1,21 +1,5 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -87,16 +71,6 @@
|
||||
"This sample notebook may use features that are not available in previous versions of the Azure ML SDK."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"This notebook was created using version 1.38.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
@@ -106,18 +80,18 @@
|
||||
"ws = Workspace.from_config()\n",
|
||||
"\n",
|
||||
"# choose a name for experiment\n",
|
||||
"experiment_name = 'automl-classification-ccard-remote'\n",
|
||||
"experiment_name = \"automl-classification-ccard-remote\"\n",
|
||||
"\n",
|
||||
"experiment = Experiment(ws, experiment_name)\n",
|
||||
"\n",
|
||||
"output = {}\n",
|
||||
"output['Subscription ID'] = ws.subscription_id\n",
|
||||
"output['Workspace'] = ws.name\n",
|
||||
"output['Resource Group'] = ws.resource_group\n",
|
||||
"output['Location'] = ws.location\n",
|
||||
"output['Experiment Name'] = experiment.name\n",
|
||||
"pd.set_option('display.max_colwidth', -1)\n",
|
||||
"outputDf = pd.DataFrame(data = output, index = [''])\n",
|
||||
"output[\"Subscription ID\"] = ws.subscription_id\n",
|
||||
"output[\"Workspace\"] = ws.name\n",
|
||||
"output[\"Resource Group\"] = ws.resource_group\n",
|
||||
"output[\"Location\"] = ws.location\n",
|
||||
"output[\"Experiment Name\"] = experiment.name\n",
|
||||
"pd.set_option(\"display.max_colwidth\", -1)\n",
|
||||
"outputDf = pd.DataFrame(data=output, index=[\"\"])\n",
|
||||
"outputDf.T"
|
||||
]
|
||||
},
|
||||
@@ -150,12 +124,12 @@
|
||||
"# Verify that cluster does not exist already\n",
|
||||
"try:\n",
|
||||
" compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)\n",
|
||||
" print('Found existing cluster, use it.')\n",
|
||||
" print(\"Found existing cluster, use it.\")\n",
|
||||
"except ComputeTargetException:\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS12_V2',\n",
|
||||
" max_nodes=6)\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(\n",
|
||||
" vm_size=\"STANDARD_DS12_V2\", max_nodes=6\n",
|
||||
" )\n",
|
||||
" compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)\n",
|
||||
"\n",
|
||||
"compute_target.wait_for_completion(show_output=True)"
|
||||
]
|
||||
},
|
||||
@@ -184,7 +158,7 @@
|
||||
"data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/creditcard.csv\"\n",
|
||||
"dataset = Dataset.Tabular.from_delimited_files(data)\n",
|
||||
"training_data, validation_data = dataset.random_split(percentage=0.8, seed=223)\n",
|
||||
"label_column_name = 'Class'"
|
||||
"label_column_name = \"Class\""
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -215,19 +189,20 @@
|
||||
"source": [
|
||||
"automl_settings = {\n",
|
||||
" \"n_cross_validations\": 3,\n",
|
||||
" \"primary_metric\": 'AUC_weighted',\n",
|
||||
" \"primary_metric\": \"average_precision_score_weighted\",\n",
|
||||
" \"enable_early_stopping\": True,\n",
|
||||
" \"max_concurrent_iterations\": 2, # This is a limit for testing purpose, please increase it as per cluster size\n",
|
||||
" \"experiment_timeout_hours\": 0.25, # This is a time limit for testing purposes, remove it for real use cases, this will drastically limit ablity to find the best model possible\n",
|
||||
" \"verbosity\": logging.INFO,\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"automl_config = AutoMLConfig(task = 'classification',\n",
|
||||
" debug_log = 'automl_errors.log',\n",
|
||||
"automl_config = AutoMLConfig(\n",
|
||||
" task=\"classification\",\n",
|
||||
" debug_log=\"automl_errors.log\",\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" training_data=training_data,\n",
|
||||
" label_column_name=label_column_name,\n",
|
||||
" **automl_settings\n",
|
||||
" **automl_settings,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
@@ -287,6 +262,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.widgets import RunDetails\n",
|
||||
"\n",
|
||||
"RunDetails(remote_run).show()"
|
||||
]
|
||||
},
|
||||
@@ -353,8 +329,12 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# convert the test data to dataframe\n",
|
||||
"X_test_df = validation_data.drop_columns(columns=[label_column_name]).to_pandas_dataframe()\n",
|
||||
"y_test_df = validation_data.keep_columns(columns=[label_column_name], validate=True).to_pandas_dataframe()"
|
||||
"X_test_df = validation_data.drop_columns(\n",
|
||||
" columns=[label_column_name]\n",
|
||||
").to_pandas_dataframe()\n",
|
||||
"y_test_df = validation_data.keep_columns(\n",
|
||||
" columns=[label_column_name], validate=True\n",
|
||||
").to_pandas_dataframe()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -389,19 +369,25 @@
|
||||
"import itertools\n",
|
||||
"\n",
|
||||
"cf = confusion_matrix(y_test_df.values, y_pred)\n",
|
||||
"plt.imshow(cf,cmap=plt.cm.Blues,interpolation='nearest')\n",
|
||||
"plt.imshow(cf, cmap=plt.cm.Blues, interpolation=\"nearest\")\n",
|
||||
"plt.colorbar()\n",
|
||||
"plt.title('Confusion Matrix')\n",
|
||||
"plt.xlabel('Predicted')\n",
|
||||
"plt.ylabel('Actual')\n",
|
||||
"class_labels = ['False','True']\n",
|
||||
"plt.title(\"Confusion Matrix\")\n",
|
||||
"plt.xlabel(\"Predicted\")\n",
|
||||
"plt.ylabel(\"Actual\")\n",
|
||||
"class_labels = [\"False\", \"True\"]\n",
|
||||
"tick_marks = np.arange(len(class_labels))\n",
|
||||
"plt.xticks(tick_marks, class_labels)\n",
|
||||
"plt.yticks([-0.5,0,1,1.5],['','False','True',''])\n",
|
||||
"plt.yticks([-0.5, 0, 1, 1.5], [\"\", \"False\", \"True\", \"\"])\n",
|
||||
"# plotting text value inside cells\n",
|
||||
"thresh = cf.max() / 2.\n",
|
||||
"thresh = cf.max() / 2.0\n",
|
||||
"for i, j in itertools.product(range(cf.shape[0]), range(cf.shape[1])):\n",
|
||||
" plt.text(j,i,format(cf[i,j],'d'),horizontalalignment='center',color='white' if cf[i,j] >thresh else 'black')\n",
|
||||
" plt.text(\n",
|
||||
" j,\n",
|
||||
" i,\n",
|
||||
" format(cf[i, j], \"d\"),\n",
|
||||
" horizontalalignment=\"center\",\n",
|
||||
" color=\"white\" if cf[i, j] > thresh else \"black\",\n",
|
||||
" )\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
@@ -418,7 +404,7 @@
|
||||
"source": [
|
||||
"This Credit Card fraud Detection dataset is made available under the Open Database License: http://opendatacommons.org/licenses/odbl/1.0/. Any rights in individual contents of the database are licensed under the Database Contents License: http://opendatacommons.org/licenses/dbcl/1.0/ and is available at: https://www.kaggle.com/mlg-ulb/creditcardfraud\n",
|
||||
"\n",
|
||||
"The dataset has been collected and analysed during a research collaboration of Worldline and the Machine Learning Group (http://mlg.ulb.ac.be) of ULB (Universit\u00c3\u00a9 Libre de Bruxelles) on big data mining and fraud detection.\n",
|
||||
"The dataset has been collected and analysed during a research collaboration of Worldline and the Machine Learning Group (http://mlg.ulb.ac.be) of ULB (Université Libre de Bruxelles) on big data mining and fraud detection.\n",
|
||||
"More details on current and past projects on related topics are available on https://www.researchgate.net/project/Fraud-detection-5 and the page of the DefeatFraud project\n",
|
||||
"\n",
|
||||
"Please cite the following works:\n",
|
||||
@@ -431,13 +417,13 @@
|
||||
"\n",
|
||||
"Dal Pozzolo, Andrea Adaptive Machine learning for credit card fraud detection ULB MLG PhD thesis (supervised by G. Bontempi)\n",
|
||||
"\n",
|
||||
"Carcillo, Fabrizio; Dal Pozzolo, Andrea; Le Borgne, Yann-A\u00c3\u00abl; Caelen, Olivier; Mazzer, Yannis; Bontempi, Gianluca. Scarff: a scalable framework for streaming credit card fraud detection with Spark, Information fusion,41, 182-194,2018,Elsevier\n",
|
||||
"Carcillo, Fabrizio; Dal Pozzolo, Andrea; Le Borgne, Yann-Aël; Caelen, Olivier; Mazzer, Yannis; Bontempi, Gianluca. Scarff: a scalable framework for streaming credit card fraud detection with Spark, Information fusion,41, 182-194,2018,Elsevier\n",
|
||||
"\n",
|
||||
"Carcillo, Fabrizio; Le Borgne, Yann-A\u00c3\u00abl; Caelen, Olivier; Bontempi, Gianluca. Streaming active learning strategies for real-life credit card fraud detection: assessment and visualization, International Journal of Data Science and Analytics, 5,4,285-300,2018,Springer International Publishing\n",
|
||||
"Carcillo, Fabrizio; Le Borgne, Yann-Aël; Caelen, Olivier; Bontempi, Gianluca. Streaming active learning strategies for real-life credit card fraud detection: assessment and visualization, International Journal of Data Science and Analytics, 5,4,285-300,2018,Springer International Publishing\n",
|
||||
"\n",
|
||||
"Bertrand Lebichot, Yann-A\u00c3\u00abl Le Borgne, Liyun He, Frederic Obl\u00c3\u00a9, Gianluca Bontempi Deep-Learning Domain Adaptation Techniques for Credit Cards Fraud Detection, INNSBDDL 2019: Recent Advances in Big Data and Deep Learning, pp 78-88, 2019\n",
|
||||
"Bertrand Lebichot, Yann-Aël Le Borgne, Liyun He, Frederic Oblé, Gianluca Bontempi Deep-Learning Domain Adaptation Techniques for Credit Cards Fraud Detection, INNSBDDL 2019: Recent Advances in Big Data and Deep Learning, pp 78-88, 2019\n",
|
||||
"\n",
|
||||
"Fabrizio Carcillo, Yann-A\u00c3\u00abl Le Borgne, Olivier Caelen, Frederic Obl\u00c3\u00a9, Gianluca Bontempi Combining Unsupervised and Supervised Learning in Credit Card Fraud Detection Information Sciences, 2019"
|
||||
"Fabrizio Carcillo, Yann-Aël Le Borgne, Olivier Caelen, Frederic Oblé, Gianluca Bontempi Combining Unsupervised and Supervised Learning in Credit Card Fraud Detection Information Sciences, 2019"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -465,9 +451,9 @@
|
||||
"friendly_name": "Classification of credit card fraudulent transactions using Automated ML",
|
||||
"index_order": 5,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.6 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python3-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -1,21 +1,5 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -91,16 +75,6 @@
|
||||
"This sample notebook may use features that are not available in previous versions of the Azure ML SDK."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"This notebook was created using version 1.38.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -117,18 +91,18 @@
|
||||
"ws = Workspace.from_config()\n",
|
||||
"\n",
|
||||
"# Choose an experiment name.\n",
|
||||
"experiment_name = 'automl-classification-text-dnn'\n",
|
||||
"experiment_name = \"automl-classification-text-dnn\"\n",
|
||||
"\n",
|
||||
"experiment = Experiment(ws, experiment_name)\n",
|
||||
"\n",
|
||||
"output = {}\n",
|
||||
"output['Subscription ID'] = ws.subscription_id\n",
|
||||
"output['Workspace Name'] = ws.name\n",
|
||||
"output['Resource Group'] = ws.resource_group\n",
|
||||
"output['Location'] = ws.location\n",
|
||||
"output['Experiment Name'] = experiment.name\n",
|
||||
"pd.set_option('display.max_colwidth', -1)\n",
|
||||
"outputDf = pd.DataFrame(data = output, index = [''])\n",
|
||||
"output[\"Subscription ID\"] = ws.subscription_id\n",
|
||||
"output[\"Workspace Name\"] = ws.name\n",
|
||||
"output[\"Resource Group\"] = ws.resource_group\n",
|
||||
"output[\"Location\"] = ws.location\n",
|
||||
"output[\"Experiment Name\"] = experiment.name\n",
|
||||
"pd.set_option(\"display.max_colwidth\", -1)\n",
|
||||
"outputDf = pd.DataFrame(data=output, index=[\"\"])\n",
|
||||
"outputDf.T"
|
||||
]
|
||||
},
|
||||
@@ -161,13 +135,15 @@
|
||||
"# Verify that cluster does not exist already\n",
|
||||
"try:\n",
|
||||
" compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)\n",
|
||||
" print('Found existing cluster, use it.')\n",
|
||||
" print(\"Found existing cluster, use it.\")\n",
|
||||
"except ComputeTargetException:\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_NC6\", # CPU for BiLSTM, such as \"STANDARD_DS12_V2\" \n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(\n",
|
||||
" vm_size=\"STANDARD_NC6\", # CPU for BiLSTM, such as \"STANDARD_D2_V2\"\n",
|
||||
" # To use BERT (this is recommended for best performance), select a GPU such as \"STANDARD_NC6\"\n",
|
||||
" # or similar GPU option\n",
|
||||
" # available in your workspace\n",
|
||||
" max_nodes = num_nodes)\n",
|
||||
" max_nodes=num_nodes,\n",
|
||||
" )\n",
|
||||
" compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)\n",
|
||||
"\n",
|
||||
"compute_target.wait_for_completion(show_output=True)"
|
||||
@@ -189,38 +165,52 @@
|
||||
"source": [
|
||||
"data_dir = \"text-dnn-data\" # Local directory to store data\n",
|
||||
"blobstore_datadir = data_dir # Blob store directory to store data in\n",
|
||||
"target_column_name = 'y'\n",
|
||||
"feature_column_name = 'X'\n",
|
||||
"target_column_name = \"y\"\n",
|
||||
"feature_column_name = \"X\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_20newsgroups_data():\n",
|
||||
" '''Fetches 20 Newsgroups data from scikit-learn\n",
|
||||
" \"\"\"Fetches 20 Newsgroups data from scikit-learn\n",
|
||||
" Returns them in form of pandas dataframes\n",
|
||||
" '''\n",
|
||||
" remove = ('headers', 'footers', 'quotes')\n",
|
||||
" \"\"\"\n",
|
||||
" remove = (\"headers\", \"footers\", \"quotes\")\n",
|
||||
" categories = [\n",
|
||||
" 'rec.sport.baseball',\n",
|
||||
" 'rec.sport.hockey',\n",
|
||||
" 'comp.graphics',\n",
|
||||
" 'sci.space',\n",
|
||||
" \"rec.sport.baseball\",\n",
|
||||
" \"rec.sport.hockey\",\n",
|
||||
" \"comp.graphics\",\n",
|
||||
" \"sci.space\",\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
" data = fetch_20newsgroups(subset = 'train', categories = categories,\n",
|
||||
" shuffle = True, random_state = 42,\n",
|
||||
" remove = remove)\n",
|
||||
" data = pd.DataFrame({feature_column_name: data.data, target_column_name: data.target})\n",
|
||||
" data = fetch_20newsgroups(\n",
|
||||
" subset=\"train\",\n",
|
||||
" categories=categories,\n",
|
||||
" shuffle=True,\n",
|
||||
" random_state=42,\n",
|
||||
" remove=remove,\n",
|
||||
" )\n",
|
||||
" data = pd.DataFrame(\n",
|
||||
" {feature_column_name: data.data, target_column_name: data.target}\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" data_train = data[:200]\n",
|
||||
" data_test = data[200:300]\n",
|
||||
"\n",
|
||||
" data_train = remove_blanks_20news(data_train, feature_column_name, target_column_name)\n",
|
||||
" data_train = remove_blanks_20news(\n",
|
||||
" data_train, feature_column_name, target_column_name\n",
|
||||
" )\n",
|
||||
" data_test = remove_blanks_20news(data_test, feature_column_name, target_column_name)\n",
|
||||
"\n",
|
||||
" return data_train, data_test\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def remove_blanks_20news(data, feature_column_name, target_column_name):\n",
|
||||
"\n",
|
||||
" data[feature_column_name] = data[feature_column_name].replace(r'\\n', ' ', regex=True).apply(lambda x: x.strip())\n",
|
||||
" data = data[data[feature_column_name] != '']\n",
|
||||
" data[feature_column_name] = (\n",
|
||||
" data[feature_column_name]\n",
|
||||
" .replace(r\"\\n\", \" \", regex=True)\n",
|
||||
" .apply(lambda x: x.strip())\n",
|
||||
" )\n",
|
||||
" data = data[data[feature_column_name] != \"\"]\n",
|
||||
"\n",
|
||||
" return data"
|
||||
]
|
||||
@@ -243,15 +233,14 @@
|
||||
"if not os.path.isdir(data_dir):\n",
|
||||
" os.mkdir(data_dir)\n",
|
||||
"\n",
|
||||
"train_data_fname = data_dir + '/train_data.csv'\n",
|
||||
"test_data_fname = data_dir + '/test_data.csv'\n",
|
||||
"train_data_fname = data_dir + \"/train_data.csv\"\n",
|
||||
"test_data_fname = data_dir + \"/test_data.csv\"\n",
|
||||
"\n",
|
||||
"data_train.to_csv(train_data_fname, index=False)\n",
|
||||
"data_test.to_csv(test_data_fname, index=False)\n",
|
||||
"\n",
|
||||
"datastore = ws.get_default_datastore()\n",
|
||||
"datastore.upload(src_dir=data_dir, target_path=blobstore_datadir,\n",
|
||||
" overwrite=True)"
|
||||
"datastore.upload(src_dir=data_dir, target_path=blobstore_datadir, overwrite=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -260,7 +249,9 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_dataset = Dataset.Tabular.from_delimited_files(path = [(datastore, blobstore_datadir + '/train_data.csv')])"
|
||||
"train_dataset = Dataset.Tabular.from_delimited_files(\n",
|
||||
" path=[(datastore, blobstore_datadir + \"/train_data.csv\")]\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -285,7 +276,7 @@
|
||||
"source": [
|
||||
"automl_settings = {\n",
|
||||
" \"experiment_timeout_minutes\": 30,\n",
|
||||
" \"primary_metric\": 'AUC_weighted',\n",
|
||||
" \"primary_metric\": \"accuracy\",\n",
|
||||
" \"max_concurrent_iterations\": num_nodes,\n",
|
||||
" \"max_cores_per_iteration\": -1,\n",
|
||||
" \"enable_dnn\": True,\n",
|
||||
@@ -296,13 +287,14 @@
|
||||
" \"enable_stack_ensemble\": False,\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"automl_config = AutoMLConfig(task = 'classification',\n",
|
||||
" debug_log = 'automl_errors.log',\n",
|
||||
"automl_config = AutoMLConfig(\n",
|
||||
" task=\"classification\",\n",
|
||||
" debug_log=\"automl_errors.log\",\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" training_data=train_dataset,\n",
|
||||
" label_column_name=target_column_name,\n",
|
||||
" blocked_models = ['LightGBM', 'XGBoostClassifier'],\n",
|
||||
" **automl_settings\n",
|
||||
" blocked_models=[\"LightGBM\", \"XGBoostClassifier\"],\n",
|
||||
" **automl_settings,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
@@ -342,8 +334,7 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"For local inferencing, you can load the model locally via. the method `remote_run.get_output()`. For more information on the arguments expected by this method, you can run `remote_run.get_output??`.\n",
|
||||
"Note that when the model contains BERT, this step will require pytorch and pytorch-transformers installed in your local environment. The exact versions of these packages can be found in the **automl_env.yml** file located in the local copy of your MachineLearningNotebooks folder here:\n",
|
||||
"MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/automl_env.yml\n"
|
||||
"Note that when the model contains BERT, this step will require pytorch and pytorch-transformers installed in your local environment. The exact versions of these packages can be found in the **automl_env.yml** file located in the local copy of your azureml-examples folder here: \"azureml-examples/python-sdk/tutorials/automl-with-azureml\""
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -370,14 +361,16 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Download the featurization summary JSON file locally\n",
|
||||
"best_run.download_file(\"outputs/featurization_summary.json\", \"featurization_summary.json\")\n",
|
||||
"best_run.download_file(\n",
|
||||
" \"outputs/featurization_summary.json\", \"featurization_summary.json\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Render the JSON as a pandas DataFrame\n",
|
||||
"with open(\"featurization_summary.json\", \"r\") as f:\n",
|
||||
" records = json.load(f)\n",
|
||||
"\n",
|
||||
"featurization_summary = pd.DataFrame.from_records(records)\n",
|
||||
"featurization_summary['Transformations'].tolist()"
|
||||
"featurization_summary[\"Transformations\"].tolist()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -402,7 +395,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"summary_df = get_result_df(automl_run)\n",
|
||||
"best_dnn_run_id = summary_df['run_id'].iloc[0]\n",
|
||||
"best_dnn_run_id = summary_df[\"run_id\"].iloc[0]\n",
|
||||
"best_dnn_run = Run(experiment, best_dnn_run_id)"
|
||||
]
|
||||
},
|
||||
@@ -412,11 +405,11 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model_dir = 'Model' # Local folder where the model will be stored temporarily\n",
|
||||
"model_dir = \"Model\" # Local folder where the model will be stored temporarily\n",
|
||||
"if not os.path.isdir(model_dir):\n",
|
||||
" os.mkdir(model_dir)\n",
|
||||
"\n",
|
||||
"best_dnn_run.download_file('outputs/model.pkl', model_dir + '/model.pkl')"
|
||||
"best_dnn_run.download_file(\"outputs/model.pkl\", model_dir + \"/model.pkl\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -433,11 +426,10 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Register the model\n",
|
||||
"model_name = 'textDNN-20News'\n",
|
||||
"model = Model.register(model_path = model_dir + '/model.pkl',\n",
|
||||
" model_name = model_name,\n",
|
||||
" tags=None,\n",
|
||||
" workspace=ws)"
|
||||
"model_name = \"textDNN-20News\"\n",
|
||||
"model = Model.register(\n",
|
||||
" model_path=model_dir + \"/model.pkl\", model_name=model_name, tags=None, workspace=ws\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -462,7 +454,9 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test_dataset = Dataset.Tabular.from_delimited_files(path = [(datastore, blobstore_datadir + '/test_data.csv')])\n",
|
||||
"test_dataset = Dataset.Tabular.from_delimited_files(\n",
|
||||
" path=[(datastore, blobstore_datadir + \"/test_data.csv\")]\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# preview the first 3 rows of the dataset\n",
|
||||
"test_dataset.take(3).to_pandas_dataframe()"
|
||||
@@ -483,9 +477,9 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"script_folder = os.path.join(os.getcwd(), 'inference')\n",
|
||||
"script_folder = os.path.join(os.getcwd(), \"inference\")\n",
|
||||
"os.makedirs(script_folder, exist_ok=True)\n",
|
||||
"shutil.copy('infer.py', script_folder)"
|
||||
"shutil.copy(\"infer.py\", script_folder)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -494,8 +488,15 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test_run = run_inference(test_experiment, compute_target, script_folder, best_dnn_run,\n",
|
||||
" test_dataset, target_column_name, model_name)"
|
||||
"test_run = run_inference(\n",
|
||||
" test_experiment,\n",
|
||||
" compute_target,\n",
|
||||
" script_folder,\n",
|
||||
" best_dnn_run,\n",
|
||||
" test_dataset,\n",
|
||||
" target_column_name,\n",
|
||||
" model_name,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -564,9 +565,9 @@
|
||||
"friendly_name": "DNN Text Featurization",
|
||||
"index_order": 2,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.6 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python3-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -4,52 +4,65 @@ from azureml.train.estimator import Estimator
|
||||
from azureml.core.run import Run
|
||||
|
||||
|
||||
def run_inference(test_experiment, compute_target, script_folder, train_run,
|
||||
test_dataset, target_column_name, model_name):
|
||||
def run_inference(
|
||||
test_experiment,
|
||||
compute_target,
|
||||
script_folder,
|
||||
train_run,
|
||||
test_dataset,
|
||||
target_column_name,
|
||||
model_name,
|
||||
):
|
||||
|
||||
inference_env = train_run.get_environment()
|
||||
|
||||
est = Estimator(source_directory=script_folder,
|
||||
entry_script='infer.py',
|
||||
est = Estimator(
|
||||
source_directory=script_folder,
|
||||
entry_script="infer.py",
|
||||
script_params={
|
||||
'--target_column_name': target_column_name,
|
||||
'--model_name': model_name
|
||||
"--target_column_name": target_column_name,
|
||||
"--model_name": model_name,
|
||||
},
|
||||
inputs=[
|
||||
test_dataset.as_named_input('test_data')
|
||||
],
|
||||
inputs=[test_dataset.as_named_input("test_data")],
|
||||
compute_target=compute_target,
|
||||
environment_definition=inference_env)
|
||||
environment_definition=inference_env,
|
||||
)
|
||||
|
||||
run = test_experiment.submit(
|
||||
est, tags={
|
||||
'training_run_id': train_run.id,
|
||||
'run_algorithm': train_run.properties['run_algorithm'],
|
||||
'valid_score': train_run.properties['score'],
|
||||
'primary_metric': train_run.properties['primary_metric']
|
||||
})
|
||||
est,
|
||||
tags={
|
||||
"training_run_id": train_run.id,
|
||||
"run_algorithm": train_run.properties["run_algorithm"],
|
||||
"valid_score": train_run.properties["score"],
|
||||
"primary_metric": train_run.properties["primary_metric"],
|
||||
},
|
||||
)
|
||||
|
||||
run.log("run_algorithm", run.tags['run_algorithm'])
|
||||
run.log("run_algorithm", run.tags["run_algorithm"])
|
||||
return run
|
||||
|
||||
|
||||
def get_result_df(remote_run):
|
||||
|
||||
children = list(remote_run.get_children(recursive=True))
|
||||
summary_df = pd.DataFrame(index=['run_id', 'run_algorithm',
|
||||
'primary_metric', 'Score'])
|
||||
summary_df = pd.DataFrame(
|
||||
index=["run_id", "run_algorithm", "primary_metric", "Score"]
|
||||
)
|
||||
goal_minimize = False
|
||||
for run in children:
|
||||
if('run_algorithm' in run.properties and 'score' in run.properties):
|
||||
summary_df[run.id] = [run.id, run.properties['run_algorithm'],
|
||||
run.properties['primary_metric'],
|
||||
float(run.properties['score'])]
|
||||
if('goal' in run.properties):
|
||||
goal_minimize = run.properties['goal'].split('_')[-1] == 'min'
|
||||
if "run_algorithm" in run.properties and "score" in run.properties:
|
||||
summary_df[run.id] = [
|
||||
run.id,
|
||||
run.properties["run_algorithm"],
|
||||
run.properties["primary_metric"],
|
||||
float(run.properties["score"]),
|
||||
]
|
||||
if "goal" in run.properties:
|
||||
goal_minimize = run.properties["goal"].split("_")[-1] == "min"
|
||||
|
||||
summary_df = summary_df.T.sort_values(
|
||||
'Score',
|
||||
ascending=goal_minimize).drop_duplicates(['run_algorithm'])
|
||||
summary_df = summary_df.set_index('run_algorithm')
|
||||
"Score", ascending=goal_minimize
|
||||
).drop_duplicates(["run_algorithm"])
|
||||
summary_df = summary_df.set_index("run_algorithm")
|
||||
|
||||
return summary_df
|
||||
|
||||
@@ -12,19 +12,22 @@ from azureml.core.model import Model
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
'--target_column_name', type=str, dest='target_column_name',
|
||||
help='Target Column Name')
|
||||
"--target_column_name",
|
||||
type=str,
|
||||
dest="target_column_name",
|
||||
help="Target Column Name",
|
||||
)
|
||||
parser.add_argument(
|
||||
'--model_name', type=str, dest='model_name',
|
||||
help='Name of registered model')
|
||||
"--model_name", type=str, dest="model_name", help="Name of registered model"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
target_column_name = args.target_column_name
|
||||
model_name = args.model_name
|
||||
|
||||
print('args passed are: ')
|
||||
print('Target column name: ', target_column_name)
|
||||
print('Name of registered model: ', model_name)
|
||||
print("args passed are: ")
|
||||
print("Target column name: ", target_column_name)
|
||||
print("Name of registered model: ", model_name)
|
||||
|
||||
model_path = Model.get_model_path(model_name)
|
||||
# deserialize the model file back into a sklearn model
|
||||
@@ -32,13 +35,16 @@ model = joblib.load(model_path)
|
||||
|
||||
run = Run.get_context()
|
||||
# get input dataset by name
|
||||
test_dataset = run.input_datasets['test_data']
|
||||
test_dataset = run.input_datasets["test_data"]
|
||||
|
||||
X_test_df = test_dataset.drop_columns(columns=[target_column_name]) \
|
||||
.to_pandas_dataframe()
|
||||
y_test_df = test_dataset.with_timestamp_columns(None) \
|
||||
.keep_columns(columns=[target_column_name]) \
|
||||
X_test_df = test_dataset.drop_columns(
|
||||
columns=[target_column_name]
|
||||
).to_pandas_dataframe()
|
||||
y_test_df = (
|
||||
test_dataset.with_timestamp_columns(None)
|
||||
.keep_columns(columns=[target_column_name])
|
||||
.to_pandas_dataframe()
|
||||
)
|
||||
|
||||
predicted = model.predict_proba(X_test_df)
|
||||
|
||||
@@ -47,11 +53,13 @@ if isinstance(predicted, pd.DataFrame):
|
||||
|
||||
# Use the AutoML scoring module
|
||||
train_labels = model.classes_
|
||||
class_labels = np.unique(np.concatenate((y_test_df.values, np.reshape(train_labels, (-1, 1)))))
|
||||
class_labels = np.unique(
|
||||
np.concatenate((y_test_df.values, np.reshape(train_labels, (-1, 1))))
|
||||
)
|
||||
classification_metrics = list(constants.CLASSIFICATION_SCALAR_SET)
|
||||
scores = scoring.score_classification(y_test_df.values, predicted,
|
||||
classification_metrics,
|
||||
class_labels, train_labels)
|
||||
scores = scoring.score_classification(
|
||||
y_test_df.values, predicted, classification_metrics, class_labels, train_labels
|
||||
)
|
||||
|
||||
print("scores:")
|
||||
print(scores)
|
||||
|
||||
@@ -1,20 +1,5 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved. \n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -75,16 +60,6 @@
|
||||
"This sample notebook may use features that are not available in previous versions of the Azure ML SDK."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"This notebook was created using version 1.38.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -118,17 +93,17 @@
|
||||
"dstor = ws.get_default_datastore()\n",
|
||||
"\n",
|
||||
"# Choose a name for the run history container in the workspace.\n",
|
||||
"experiment_name = 'retrain-noaaweather'\n",
|
||||
"experiment_name = \"retrain-noaaweather\"\n",
|
||||
"experiment = Experiment(ws, experiment_name)\n",
|
||||
"\n",
|
||||
"output = {}\n",
|
||||
"output['Subscription ID'] = ws.subscription_id\n",
|
||||
"output['Workspace'] = ws.name\n",
|
||||
"output['Resource Group'] = ws.resource_group\n",
|
||||
"output['Location'] = ws.location\n",
|
||||
"output['Run History Name'] = experiment_name\n",
|
||||
"pd.set_option('display.max_colwidth', -1)\n",
|
||||
"outputDf = pd.DataFrame(data = output, index = [''])\n",
|
||||
"output[\"Subscription ID\"] = ws.subscription_id\n",
|
||||
"output[\"Workspace\"] = ws.name\n",
|
||||
"output[\"Resource Group\"] = ws.resource_group\n",
|
||||
"output[\"Location\"] = ws.location\n",
|
||||
"output[\"Run History Name\"] = experiment_name\n",
|
||||
"pd.set_option(\"display.max_colwidth\", -1)\n",
|
||||
"outputDf = pd.DataFrame(data=output, index=[\"\"])\n",
|
||||
"outputDf.T"
|
||||
]
|
||||
},
|
||||
@@ -164,12 +139,12 @@
|
||||
"# Verify that cluster does not exist already\n",
|
||||
"try:\n",
|
||||
" compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)\n",
|
||||
" print('Found existing cluster, use it.')\n",
|
||||
" print(\"Found existing cluster, use it.\")\n",
|
||||
"except ComputeTargetException:\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS12_V2',\n",
|
||||
" max_nodes=4)\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(\n",
|
||||
" vm_size=\"STANDARD_DS12_V2\", max_nodes=4\n",
|
||||
" )\n",
|
||||
" compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)\n",
|
||||
"\n",
|
||||
"compute_target.wait_for_completion(show_output=True)"
|
||||
]
|
||||
},
|
||||
@@ -196,12 +171,19 @@
|
||||
"\n",
|
||||
"conda_run_config.environment.docker.enabled = True\n",
|
||||
"\n",
|
||||
"cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]', 'applicationinsights', 'azureml-opendatasets', 'azureml-defaults'], \n",
|
||||
" conda_packages=['numpy==1.16.2'], \n",
|
||||
" pin_sdk_version=False)\n",
|
||||
"cd = CondaDependencies.create(\n",
|
||||
" pip_packages=[\n",
|
||||
" \"azureml-sdk[automl]\",\n",
|
||||
" \"applicationinsights\",\n",
|
||||
" \"azureml-opendatasets\",\n",
|
||||
" \"azureml-defaults\",\n",
|
||||
" ],\n",
|
||||
" conda_packages=[\"numpy==1.16.2\"],\n",
|
||||
" pin_sdk_version=False,\n",
|
||||
")\n",
|
||||
"conda_run_config.environment.python.conda_dependencies = cd\n",
|
||||
"\n",
|
||||
"print('run config is ready')"
|
||||
"print(\"run config is ready\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -242,12 +224,14 @@
|
||||
"from azureml.pipeline.steps import PythonScriptStep\n",
|
||||
"\n",
|
||||
"ds_name = PipelineParameter(name=\"ds_name\", default_value=dataset)\n",
|
||||
"upload_data_step = PythonScriptStep(script_name=\"upload_weather_data.py\", \n",
|
||||
"upload_data_step = PythonScriptStep(\n",
|
||||
" script_name=\"upload_weather_data.py\",\n",
|
||||
" allow_reuse=False,\n",
|
||||
" name=\"upload_weather_data\",\n",
|
||||
" arguments=[\"--ds_name\", ds_name],\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" runconfig=conda_run_config)"
|
||||
" runconfig=conda_run_config,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -264,10 +248,11 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data_pipeline = Pipeline(\n",
|
||||
" description=\"pipeline_with_uploaddata\",\n",
|
||||
" workspace=ws, \n",
|
||||
" steps=[upload_data_step])\n",
|
||||
"data_pipeline_run = experiment.submit(data_pipeline, pipeline_parameters={\"ds_name\":dataset})"
|
||||
" description=\"pipeline_with_uploaddata\", workspace=ws, steps=[upload_data_step]\n",
|
||||
")\n",
|
||||
"data_pipeline_run = experiment.submit(\n",
|
||||
" data_pipeline, pipeline_parameters={\"ds_name\": dataset}\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -307,13 +292,14 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data_prep_step = PythonScriptStep(script_name=\"check_data.py\", \n",
|
||||
"data_prep_step = PythonScriptStep(\n",
|
||||
" script_name=\"check_data.py\",\n",
|
||||
" allow_reuse=False,\n",
|
||||
" name=\"check_data\",\n",
|
||||
" arguments=[\"--ds_name\", ds_name,\n",
|
||||
" \"--model_name\", model_name],\n",
|
||||
" arguments=[\"--ds_name\", ds_name, \"--model_name\", model_name],\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" runconfig=conda_run_config)"
|
||||
" runconfig=conda_run_config,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -323,6 +309,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Dataset\n",
|
||||
"\n",
|
||||
"train_ds = Dataset.get_by_name(ws, dataset)\n",
|
||||
"train_ds = train_ds.drop_columns([\"partition_date\"])"
|
||||
]
|
||||
@@ -348,20 +335,21 @@
|
||||
" \"iteration_timeout_minutes\": 10,\n",
|
||||
" \"experiment_timeout_hours\": 0.25,\n",
|
||||
" \"n_cross_validations\": 3,\n",
|
||||
" \"primary_metric\": 'normalized_root_mean_squared_error',\n",
|
||||
" \"primary_metric\": \"r2_score\",\n",
|
||||
" \"max_concurrent_iterations\": 3,\n",
|
||||
" \"max_cores_per_iteration\": -1,\n",
|
||||
" \"verbosity\": logging.INFO,\n",
|
||||
" \"enable_early_stopping\": True\n",
|
||||
" \"enable_early_stopping\": True,\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"automl_config = AutoMLConfig(task = 'regression',\n",
|
||||
" debug_log = 'automl_errors.log',\n",
|
||||
"automl_config = AutoMLConfig(\n",
|
||||
" task=\"regression\",\n",
|
||||
" debug_log=\"automl_errors.log\",\n",
|
||||
" path=\".\",\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" training_data=train_ds,\n",
|
||||
" label_column_name=target_column_name,\n",
|
||||
" **automl_settings\n",
|
||||
" **automl_settings,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
@@ -373,17 +361,21 @@
|
||||
"source": [
|
||||
"from azureml.pipeline.core import PipelineData, TrainingOutput\n",
|
||||
"\n",
|
||||
"metrics_output_name = 'metrics_output'\n",
|
||||
"best_model_output_name = 'best_model_output'\n",
|
||||
"metrics_output_name = \"metrics_output\"\n",
|
||||
"best_model_output_name = \"best_model_output\"\n",
|
||||
"\n",
|
||||
"metrics_data = PipelineData(name='metrics_data',\n",
|
||||
"metrics_data = PipelineData(\n",
|
||||
" name=\"metrics_data\",\n",
|
||||
" datastore=dstor,\n",
|
||||
" pipeline_output_name=metrics_output_name,\n",
|
||||
" training_output=TrainingOutput(type='Metrics'))\n",
|
||||
"model_data = PipelineData(name='model_data',\n",
|
||||
" training_output=TrainingOutput(type=\"Metrics\"),\n",
|
||||
")\n",
|
||||
"model_data = PipelineData(\n",
|
||||
" name=\"model_data\",\n",
|
||||
" datastore=dstor,\n",
|
||||
" pipeline_output_name=best_model_output_name,\n",
|
||||
" training_output=TrainingOutput(type='Model'))"
|
||||
" training_output=TrainingOutput(type=\"Model\"),\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -393,10 +385,11 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"automl_step = AutoMLStep(\n",
|
||||
" name='automl_module',\n",
|
||||
" name=\"automl_module\",\n",
|
||||
" automl_config=automl_config,\n",
|
||||
" outputs=[metrics_data, model_data],\n",
|
||||
" allow_reuse=False)"
|
||||
" allow_reuse=False,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -413,13 +406,22 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"register_model_step = PythonScriptStep(script_name=\"register_model.py\",\n",
|
||||
"register_model_step = PythonScriptStep(\n",
|
||||
" script_name=\"register_model.py\",\n",
|
||||
" name=\"register_model\",\n",
|
||||
" allow_reuse=False,\n",
|
||||
" arguments=[\"--model_name\", model_name, \"--model_path\", model_data, \"--ds_name\", ds_name],\n",
|
||||
" arguments=[\n",
|
||||
" \"--model_name\",\n",
|
||||
" model_name,\n",
|
||||
" \"--model_path\",\n",
|
||||
" model_data,\n",
|
||||
" \"--ds_name\",\n",
|
||||
" ds_name,\n",
|
||||
" ],\n",
|
||||
" inputs=[model_data],\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" runconfig=conda_run_config)"
|
||||
" runconfig=conda_run_config,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -438,7 +440,8 @@
|
||||
"training_pipeline = Pipeline(\n",
|
||||
" description=\"training_pipeline\",\n",
|
||||
" workspace=ws,\n",
|
||||
" steps=[data_prep_step, automl_step, register_model_step])"
|
||||
" steps=[data_prep_step, automl_step, register_model_step],\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -447,8 +450,10 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"training_pipeline_run = experiment.submit(training_pipeline, pipeline_parameters={\n",
|
||||
" \"ds_name\": dataset, \"model_name\": \"noaaweatherds\"})"
|
||||
"training_pipeline_run = experiment.submit(\n",
|
||||
" training_pipeline,\n",
|
||||
" pipeline_parameters={\"ds_name\": dataset, \"model_name\": \"noaaweatherds\"},\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -477,8 +482,8 @@
|
||||
"pipeline_name = \"Retraining-Pipeline-NOAAWeather\"\n",
|
||||
"\n",
|
||||
"published_pipeline = training_pipeline.publish(\n",
|
||||
" name=pipeline_name, \n",
|
||||
" description=\"Pipeline that retrains AutoML model\")\n",
|
||||
" name=pipeline_name, description=\"Pipeline that retrains AutoML model\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"published_pipeline"
|
||||
]
|
||||
@@ -490,13 +495,17 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.pipeline.core import Schedule\n",
|
||||
"schedule = Schedule.create(workspace=ws, name=\"RetrainingSchedule\",\n",
|
||||
"\n",
|
||||
"schedule = Schedule.create(\n",
|
||||
" workspace=ws,\n",
|
||||
" name=\"RetrainingSchedule\",\n",
|
||||
" pipeline_parameters={\"ds_name\": dataset, \"model_name\": \"noaaweatherds\"},\n",
|
||||
" pipeline_id=published_pipeline.id,\n",
|
||||
" experiment_name=experiment_name,\n",
|
||||
" datastore=dstor,\n",
|
||||
" wait_for_provisioning=True,\n",
|
||||
" polling_interval=1440)"
|
||||
" polling_interval=1440,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -520,8 +529,8 @@
|
||||
"pipeline_name = \"DataIngestion-Pipeline-NOAAWeather\"\n",
|
||||
"\n",
|
||||
"published_pipeline = training_pipeline.publish(\n",
|
||||
" name=pipeline_name, \n",
|
||||
" description=\"Pipeline that updates NOAAWeather Dataset\")\n",
|
||||
" name=pipeline_name, description=\"Pipeline that updates NOAAWeather Dataset\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"published_pipeline"
|
||||
]
|
||||
@@ -533,13 +542,17 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.pipeline.core import Schedule\n",
|
||||
"schedule = Schedule.create(workspace=ws, name=\"RetrainingSchedule-DataIngestion\",\n",
|
||||
"\n",
|
||||
"schedule = Schedule.create(\n",
|
||||
" workspace=ws,\n",
|
||||
" name=\"RetrainingSchedule-DataIngestion\",\n",
|
||||
" pipeline_parameters={\"ds_name\": dataset},\n",
|
||||
" pipeline_id=published_pipeline.id,\n",
|
||||
" experiment_name=experiment_name,\n",
|
||||
" datastore=dstor,\n",
|
||||
" wait_for_provisioning=True,\n",
|
||||
" polling_interval=1440)"
|
||||
" polling_interval=1440,\n",
|
||||
")"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -550,9 +563,9 @@
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.6 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python3-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -31,7 +31,7 @@ try:
|
||||
model = Model(ws, args.model_name)
|
||||
last_train_time = model.created_time
|
||||
print("Model was last trained on {0}.".format(last_train_time))
|
||||
except Exception:
|
||||
except Exception as e:
|
||||
print("Could not get last model train time.")
|
||||
last_train_time = datetime.min.replace(tzinfo=pytz.UTC)
|
||||
|
||||
|
||||
@@ -25,9 +25,11 @@ datasets = [(Dataset.Scenario.TRAINING, train_ds)]
|
||||
|
||||
# Register model with training dataset
|
||||
|
||||
model = Model.register(workspace=ws,
|
||||
model = Model.register(
|
||||
workspace=ws,
|
||||
model_path=args.model_path,
|
||||
model_name=args.model_name,
|
||||
datasets=datasets)
|
||||
datasets=datasets,
|
||||
)
|
||||
|
||||
print("Registered version {0} of model {1}".format(model.version, model.name))
|
||||
|
||||
@@ -16,26 +16,82 @@ if type(run) == _OfflineRun:
|
||||
else:
|
||||
ws = run.experiment.workspace
|
||||
|
||||
usaf_list = ['725724', '722149', '723090', '722159', '723910', '720279',
|
||||
'725513', '725254', '726430', '720381', '723074', '726682',
|
||||
'725486', '727883', '723177', '722075', '723086', '724053',
|
||||
'725070', '722073', '726060', '725224', '725260', '724520',
|
||||
'720305', '724020', '726510', '725126', '722523', '703333',
|
||||
'722249', '722728', '725483', '722972', '724975', '742079',
|
||||
'727468', '722193', '725624', '722030', '726380', '720309',
|
||||
'722071', '720326', '725415', '724504', '725665', '725424',
|
||||
'725066']
|
||||
usaf_list = [
|
||||
"725724",
|
||||
"722149",
|
||||
"723090",
|
||||
"722159",
|
||||
"723910",
|
||||
"720279",
|
||||
"725513",
|
||||
"725254",
|
||||
"726430",
|
||||
"720381",
|
||||
"723074",
|
||||
"726682",
|
||||
"725486",
|
||||
"727883",
|
||||
"723177",
|
||||
"722075",
|
||||
"723086",
|
||||
"724053",
|
||||
"725070",
|
||||
"722073",
|
||||
"726060",
|
||||
"725224",
|
||||
"725260",
|
||||
"724520",
|
||||
"720305",
|
||||
"724020",
|
||||
"726510",
|
||||
"725126",
|
||||
"722523",
|
||||
"703333",
|
||||
"722249",
|
||||
"722728",
|
||||
"725483",
|
||||
"722972",
|
||||
"724975",
|
||||
"742079",
|
||||
"727468",
|
||||
"722193",
|
||||
"725624",
|
||||
"722030",
|
||||
"726380",
|
||||
"720309",
|
||||
"722071",
|
||||
"720326",
|
||||
"725415",
|
||||
"724504",
|
||||
"725665",
|
||||
"725424",
|
||||
"725066",
|
||||
]
|
||||
|
||||
|
||||
def get_noaa_data(start_time, end_time):
|
||||
columns = ['usaf', 'wban', 'datetime', 'latitude', 'longitude', 'elevation',
|
||||
'windAngle', 'windSpeed', 'temperature', 'stationName', 'p_k']
|
||||
columns = [
|
||||
"usaf",
|
||||
"wban",
|
||||
"datetime",
|
||||
"latitude",
|
||||
"longitude",
|
||||
"elevation",
|
||||
"windAngle",
|
||||
"windSpeed",
|
||||
"temperature",
|
||||
"stationName",
|
||||
"p_k",
|
||||
]
|
||||
isd = NoaaIsdWeather(start_time, end_time, cols=columns)
|
||||
noaa_df = isd.to_pandas_dataframe()
|
||||
df_filtered = noaa_df[noaa_df["usaf"].isin(usaf_list)]
|
||||
df_filtered.reset_index(drop=True)
|
||||
print("Received {0} rows of training data between {1} and {2}".format(
|
||||
df_filtered.shape[0], start_time, end_time))
|
||||
print(
|
||||
"Received {0} rows of training data between {1} and {2}".format(
|
||||
df_filtered.shape[0], start_time, end_time
|
||||
)
|
||||
)
|
||||
return df_filtered
|
||||
|
||||
|
||||
@@ -54,11 +110,12 @@ end_time = datetime.utcnow()
|
||||
try:
|
||||
ds = Dataset.get_by_name(ws, args.ds_name)
|
||||
end_time_last_slice = ds.data_changed_time.replace(tzinfo=None)
|
||||
print("Dataset {0} last updated on {1}".format(args.ds_name,
|
||||
end_time_last_slice))
|
||||
print("Dataset {0} last updated on {1}".format(args.ds_name, end_time_last_slice))
|
||||
except Exception:
|
||||
print(traceback.format_exc())
|
||||
print("Dataset with name {0} not found, registering new dataset.".format(args.ds_name))
|
||||
print(
|
||||
"Dataset with name {0} not found, registering new dataset.".format(args.ds_name)
|
||||
)
|
||||
register_dataset = True
|
||||
end_time = datetime(2021, 5, 1, 0, 0)
|
||||
end_time_last_slice = end_time - relativedelta(weeks=2)
|
||||
@@ -66,26 +123,35 @@ except Exception:
|
||||
train_df = get_noaa_data(end_time_last_slice, end_time)
|
||||
|
||||
if train_df.size > 0:
|
||||
print("Received {0} rows of new data after {1}.".format(
|
||||
train_df.shape[0], end_time_last_slice))
|
||||
folder_name = "{}/{:04d}/{:02d}/{:02d}/{:02d}/{:02d}/{:02d}".format(args.ds_name, end_time.year,
|
||||
end_time.month, end_time.day,
|
||||
end_time.hour, end_time.minute,
|
||||
end_time.second)
|
||||
print(
|
||||
"Received {0} rows of new data after {1}.".format(
|
||||
train_df.shape[0], end_time_last_slice
|
||||
)
|
||||
)
|
||||
folder_name = "{}/{:04d}/{:02d}/{:02d}/{:02d}/{:02d}/{:02d}".format(
|
||||
args.ds_name,
|
||||
end_time.year,
|
||||
end_time.month,
|
||||
end_time.day,
|
||||
end_time.hour,
|
||||
end_time.minute,
|
||||
end_time.second,
|
||||
)
|
||||
file_path = "{0}/data.csv".format(folder_name)
|
||||
|
||||
# Add a new partition to the registered dataset
|
||||
os.makedirs(folder_name, exist_ok=True)
|
||||
train_df.to_csv(file_path, index=False)
|
||||
|
||||
dstor.upload_files(files=[file_path],
|
||||
target_path=folder_name,
|
||||
overwrite=True,
|
||||
show_progress=True)
|
||||
dstor.upload_files(
|
||||
files=[file_path], target_path=folder_name, overwrite=True, show_progress=True
|
||||
)
|
||||
else:
|
||||
print("No new data since {0}.".format(end_time_last_slice))
|
||||
|
||||
if register_dataset:
|
||||
ds = Dataset.Tabular.from_delimited_files(dstor.path("{}/**/*.csv".format(
|
||||
args.ds_name)), partition_format='/{partition_date:yyyy/MM/dd/HH/mm/ss}/data.csv')
|
||||
ds = Dataset.Tabular.from_delimited_files(
|
||||
dstor.path("{}/**/*.csv".format(args.ds_name)),
|
||||
partition_format="/{partition_date:yyyy/MM/dd/HH/mm/ss}/data.csv",
|
||||
)
|
||||
ds.register(ws, name=args.ds_name)
|
||||
|
||||
@@ -703,9 +703,9 @@
|
||||
"automated-machine-learning"
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.6 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python3-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -0,0 +1,3 @@
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-contrib-automl-pipeline-steps
|
||||
@@ -697,9 +697,9 @@
|
||||
"Azure ML AutoML"
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.6 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python3-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -73,14 +73,14 @@
|
||||
"import pandas as pd\n",
|
||||
"from azureml.automl.core.featurization import FeaturizationConfig\n",
|
||||
"from azureml.core import Dataset, Experiment, Workspace\n",
|
||||
"from azureml.train.automl import AutoMLConfig\n"
|
||||
"from azureml.train.automl import AutoMLConfig"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This sample notebook may use features that are not available in previous versions of the Azure ML SDK."
|
||||
"This notebook is compatible with Azure ML SDK version 1.35.0 or later."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -89,7 +89,6 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"This notebook was created using version 1.38.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
@@ -429,7 +428,9 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Download the JSON file locally\n",
|
||||
"best_run.download_file(\"outputs/engineered_feature_names.json\", \"engineered_feature_names.json\")\n",
|
||||
"best_run.download_file(\n",
|
||||
" \"outputs/engineered_feature_names.json\", \"engineered_feature_names.json\"\n",
|
||||
")\n",
|
||||
"with open(\"engineered_feature_names.json\", \"r\") as f:\n",
|
||||
" records = json.load(f)\n",
|
||||
"\n",
|
||||
@@ -458,7 +459,9 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Download the featurization summary JSON file locally\n",
|
||||
"best_run.download_file(\"outputs/featurization_summary.json\", \"featurization_summary.json\")\n",
|
||||
"best_run.download_file(\n",
|
||||
" \"outputs/featurization_summary.json\", \"featurization_summary.json\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Render the JSON as a pandas DataFrame\n",
|
||||
"with open(\"featurization_summary.json\", \"r\") as f:\n",
|
||||
@@ -466,7 +469,15 @@
|
||||
"fs = pd.DataFrame.from_records(records)\n",
|
||||
"\n",
|
||||
"# View a summary of the featurization\n",
|
||||
"fs[[\"RawFeatureName\", \"TypeDetected\", \"Dropped\", \"EngineeredFeatureCount\", \"Transformations\"]]"
|
||||
"fs[\n",
|
||||
" [\n",
|
||||
" \"RawFeatureName\",\n",
|
||||
" \"TypeDetected\",\n",
|
||||
" \"Dropped\",\n",
|
||||
" \"EngineeredFeatureCount\",\n",
|
||||
" \"Transformations\",\n",
|
||||
" ]\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -683,9 +694,9 @@
|
||||
"friendly_name": "Forecasting BikeShare Demand",
|
||||
"index_order": 1,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.6 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python3-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -91,7 +91,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This sample notebook may use features that are not available in previous versions of the Azure ML SDK."
|
||||
"This notebook is compatible with Azure ML SDK version 1.35.0 or later."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -100,7 +100,6 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"This notebook was created using version 1.38.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
@@ -399,7 +398,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Retrieve the Best Run details\n",
|
||||
"## Retrieve the Best Run details\n",
|
||||
"Below we retrieve the best Run object from among all the runs in the experiment."
|
||||
]
|
||||
},
|
||||
@@ -428,7 +427,9 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Download the JSON file locally\n",
|
||||
"best_run.download_file(\"outputs/engineered_feature_names.json\", \"engineered_feature_names.json\")\n",
|
||||
"best_run.download_file(\n",
|
||||
" \"outputs/engineered_feature_names.json\", \"engineered_feature_names.json\"\n",
|
||||
")\n",
|
||||
"with open(\"engineered_feature_names.json\", \"r\") as f:\n",
|
||||
" records = json.load(f)\n",
|
||||
"\n",
|
||||
@@ -456,7 +457,9 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Download the featurization summary JSON file locally\n",
|
||||
"best_run.download_file(\"outputs/featurization_summary.json\", \"featurization_summary.json\")\n",
|
||||
"best_run.download_file(\n",
|
||||
" \"outputs/featurization_summary.json\", \"featurization_summary.json\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Render the JSON as a pandas DataFrame\n",
|
||||
"with open(\"featurization_summary.json\", \"r\") as f:\n",
|
||||
@@ -464,7 +467,15 @@
|
||||
"fs = pd.DataFrame.from_records(records)\n",
|
||||
"\n",
|
||||
"# View a summary of the featurization\n",
|
||||
"fs[[\"RawFeatureName\", \"TypeDetected\", \"Dropped\", \"EngineeredFeatureCount\", \"Transformations\"]]"
|
||||
"fs[\n",
|
||||
" [\n",
|
||||
" \"RawFeatureName\",\n",
|
||||
" \"TypeDetected\",\n",
|
||||
" \"Dropped\",\n",
|
||||
" \"EngineeredFeatureCount\",\n",
|
||||
" \"Transformations\",\n",
|
||||
" ]\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -491,7 +502,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Retreiving forecasts from the model\n",
|
||||
"### Retrieving forecasts from the model\n",
|
||||
"We have created a function called `run_forecast` that submits the test data to the best model determined during the training run and retrieves forecasts. This function uses a helper script `forecasting_script` which is uploaded and expecuted on the remote compute."
|
||||
]
|
||||
},
|
||||
@@ -752,9 +763,9 @@
|
||||
"automated-machine-learning"
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.6 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python3-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -85,7 +85,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This sample notebook may use features that are not available in previous versions of the Azure ML SDK."
|
||||
"This notebook is compatible with Azure ML SDK version 1.35.0 or later."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -94,7 +94,6 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"This notebook was created using version 1.38.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
@@ -867,9 +866,9 @@
|
||||
"friendly_name": "Forecasting away from training data",
|
||||
"index_order": 3,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.6 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python3-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -0,0 +1,725 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"source": [
|
||||
"# Automated Machine Learning\n",
|
||||
"**Github DAU Forecasting**\n",
|
||||
"\n",
|
||||
"## Contents\n",
|
||||
"1. [Introduction](#Introduction)\n",
|
||||
"1. [Setup](#Setup)\n",
|
||||
"1. [Data](#Data)\n",
|
||||
"1. [Train](#Train)\n",
|
||||
"1. [Evaluate](#Evaluate)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"source": [
|
||||
"## Introduction\n",
|
||||
"This notebook demonstrates demand forecasting for Github Daily Active Users Dataset using AutoML.\n",
|
||||
"\n",
|
||||
"AutoML highlights here include using Deep Learning forecasts, Arima, Prophet, Remote Execution and Remote Inferencing, and working with the `forecast` function. Please also look at the additional forecasting notebooks, which document lagging, rolling windows, forecast quantiles, other ways to use the forecast function, and forecaster deployment.\n",
|
||||
"\n",
|
||||
"Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n",
|
||||
"\n",
|
||||
"Notebook synopsis:\n",
|
||||
"\n",
|
||||
"1. Creating an Experiment in an existing Workspace\n",
|
||||
"2. Configuration and remote run of AutoML for a time-series model exploring Regression learners, Arima, Prophet and DNNs\n",
|
||||
"4. Evaluating the fitted model using a rolling test "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"source": [
|
||||
"## Setup\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import azureml.core\n",
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"import logging\n",
|
||||
"import warnings\n",
|
||||
"\n",
|
||||
"from pandas.tseries.frequencies import to_offset\n",
|
||||
"\n",
|
||||
"# Squash warning messages for cleaner output in the notebook\n",
|
||||
"warnings.showwarning = lambda *args, **kwargs: None\n",
|
||||
"\n",
|
||||
"from azureml.core.workspace import Workspace\n",
|
||||
"from azureml.core.experiment import Experiment\n",
|
||||
"from azureml.train.automl import AutoMLConfig\n",
|
||||
"from matplotlib import pyplot as plt\n",
|
||||
"from sklearn.metrics import mean_absolute_error, mean_squared_error\n",
|
||||
"from azureml.train.estimator import Estimator"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This notebook is compatible with Azure ML SDK version 1.35.0 or later."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"source": [
|
||||
"As part of the setup you have already created a <b>Workspace</b>. To run AutoML, you also need to create an <b>Experiment</b>. An Experiment corresponds to a prediction problem you are trying to solve, while a Run corresponds to a specific approach to the problem."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ws = Workspace.from_config()\n",
|
||||
"\n",
|
||||
"# choose a name for the run history container in the workspace\n",
|
||||
"experiment_name = \"github-remote-cpu\"\n",
|
||||
"\n",
|
||||
"experiment = Experiment(ws, experiment_name)\n",
|
||||
"\n",
|
||||
"output = {}\n",
|
||||
"output[\"Subscription ID\"] = ws.subscription_id\n",
|
||||
"output[\"Workspace\"] = ws.name\n",
|
||||
"output[\"Resource Group\"] = ws.resource_group\n",
|
||||
"output[\"Location\"] = ws.location\n",
|
||||
"output[\"Run History Name\"] = experiment_name\n",
|
||||
"pd.set_option(\"display.max_colwidth\", -1)\n",
|
||||
"outputDf = pd.DataFrame(data=output, index=[\"\"])\n",
|
||||
"outputDf.T"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"source": [
|
||||
"### Using AmlCompute\n",
|
||||
"You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for your AutoML run. In this tutorial, you use `AmlCompute` as your training compute resource.\n",
|
||||
"\n",
|
||||
"> Note that if you have an AzureML Data Scientist role, you will not have permission to create compute resources. Talk to your workspace or IT admin to create the compute targets described in this section, if they do not already exist."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
|
||||
"from azureml.core.compute_target import ComputeTargetException\n",
|
||||
"\n",
|
||||
"# Choose a name for your CPU cluster\n",
|
||||
"cpu_cluster_name = \"github-cluster\"\n",
|
||||
"\n",
|
||||
"# Verify that cluster does not exist already\n",
|
||||
"try:\n",
|
||||
" compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)\n",
|
||||
" print(\"Found existing cluster, use it.\")\n",
|
||||
"except ComputeTargetException:\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(\n",
|
||||
" vm_size=\"STANDARD_DS12_V2\", max_nodes=4\n",
|
||||
" )\n",
|
||||
" compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)\n",
|
||||
"\n",
|
||||
"compute_target.wait_for_completion(show_output=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"source": [
|
||||
"## Data\n",
|
||||
"Read Github DAU data from file, and preview data."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"source": [
|
||||
"Let's set up what we know about the dataset. \n",
|
||||
"\n",
|
||||
"**Target column** is what we want to forecast.\n",
|
||||
"\n",
|
||||
"**Time column** is the time axis along which to predict.\n",
|
||||
"\n",
|
||||
"**Time series identifier columns** are identified by values of the columns listed `time_series_id_column_names`, for example \"store\" and \"item\" if your data has multiple time series of sales, one series for each combination of store and item sold.\n",
|
||||
"\n",
|
||||
"**Forecast frequency (freq)** This optional parameter represents the period with which the forecast is desired, for example, daily, weekly, yearly, etc. Use this parameter for the correction of time series containing irregular data points or for padding of short time series. The frequency needs to be a pandas offset alias. Please refer to [pandas documentation](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects) for more information.\n",
|
||||
"\n",
|
||||
"This dataset has only one time series. Please see the [orange juice notebook](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/automated-machine-learning/forecasting-orange-juice-sales) for an example of a multi-time series dataset."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"from pandas import DataFrame\n",
|
||||
"from pandas import Grouper\n",
|
||||
"from pandas import concat\n",
|
||||
"from pandas.plotting import register_matplotlib_converters\n",
|
||||
"\n",
|
||||
"register_matplotlib_converters()\n",
|
||||
"plt.figure(figsize=(20, 10))\n",
|
||||
"plt.tight_layout()\n",
|
||||
"\n",
|
||||
"plt.subplot(2, 1, 1)\n",
|
||||
"plt.title(\"Github Daily Active User By Year\")\n",
|
||||
"df = pd.read_csv(\"github_dau_2011-2018_train.csv\", parse_dates=True, index_col=\"date\")\n",
|
||||
"test_df = pd.read_csv(\n",
|
||||
" \"github_dau_2011-2018_test.csv\", parse_dates=True, index_col=\"date\"\n",
|
||||
")\n",
|
||||
"plt.plot(df)\n",
|
||||
"\n",
|
||||
"plt.subplot(2, 1, 2)\n",
|
||||
"plt.title(\"Github Daily Active User By Month\")\n",
|
||||
"groups = df.groupby(df.index.month)\n",
|
||||
"months = concat([DataFrame(x[1].values) for x in groups], axis=1)\n",
|
||||
"months = DataFrame(months)\n",
|
||||
"months.columns = range(1, 49)\n",
|
||||
"months.boxplot()\n",
|
||||
"\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"target_column_name = \"count\"\n",
|
||||
"time_column_name = \"date\"\n",
|
||||
"time_series_id_column_names = []\n",
|
||||
"freq = \"D\" # Daily data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Split Training data into Train and Validation set and Upload to Datastores"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from helper import split_fraction_by_grain\n",
|
||||
"from helper import split_full_for_forecasting\n",
|
||||
"\n",
|
||||
"train, valid = split_full_for_forecasting(df, time_column_name)\n",
|
||||
"train.to_csv(\"train.csv\")\n",
|
||||
"valid.to_csv(\"valid.csv\")\n",
|
||||
"test_df.to_csv(\"test.csv\")\n",
|
||||
"\n",
|
||||
"datastore = ws.get_default_datastore()\n",
|
||||
"datastore.upload_files(\n",
|
||||
" files=[\"./train.csv\"],\n",
|
||||
" target_path=\"github-dataset/tabular/\",\n",
|
||||
" overwrite=True,\n",
|
||||
" show_progress=True,\n",
|
||||
")\n",
|
||||
"datastore.upload_files(\n",
|
||||
" files=[\"./valid.csv\"],\n",
|
||||
" target_path=\"github-dataset/tabular/\",\n",
|
||||
" overwrite=True,\n",
|
||||
" show_progress=True,\n",
|
||||
")\n",
|
||||
"datastore.upload_files(\n",
|
||||
" files=[\"./test.csv\"],\n",
|
||||
" target_path=\"github-dataset/tabular/\",\n",
|
||||
" overwrite=True,\n",
|
||||
" show_progress=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"from azureml.core import Dataset\n",
|
||||
"\n",
|
||||
"train_dataset = Dataset.Tabular.from_delimited_files(\n",
|
||||
" path=[(datastore, \"github-dataset/tabular/train.csv\")]\n",
|
||||
")\n",
|
||||
"valid_dataset = Dataset.Tabular.from_delimited_files(\n",
|
||||
" path=[(datastore, \"github-dataset/tabular/valid.csv\")]\n",
|
||||
")\n",
|
||||
"test_dataset = Dataset.Tabular.from_delimited_files(\n",
|
||||
" path=[(datastore, \"github-dataset/tabular/test.csv\")]\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"source": [
|
||||
"### Setting forecaster maximum horizon \n",
|
||||
"\n",
|
||||
"The forecast horizon is the number of periods into the future that the model should predict. Here, we set the horizon to 12 periods (i.e. 12 months). Notice that this is much shorter than the number of months in the test set; we will need to use a rolling test to evaluate the performance on the whole test set. For more discussion of forecast horizons and guiding principles for setting them, please see the [energy demand notebook](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand). "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"forecast_horizon = 12"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"source": [
|
||||
"## Train\n",
|
||||
"\n",
|
||||
"Instantiate a AutoMLConfig object. This defines the settings and data used to run the experiment.\n",
|
||||
"\n",
|
||||
"|Property|Description|\n",
|
||||
"|-|-|\n",
|
||||
"|**task**|forecasting|\n",
|
||||
"|**primary_metric**|This is the metric that you want to optimize.<br> Forecasting supports the following primary metrics <br><i>spearman_correlation</i><br><i>normalized_root_mean_squared_error</i><br><i>r2_score</i><br><i>normalized_mean_absolute_error</i>\n",
|
||||
"|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|\n",
|
||||
"|**training_data**|Input dataset, containing both features and label column.|\n",
|
||||
"|**label_column_name**|The name of the label column.|\n",
|
||||
"|**enable_dnn**|Enable Forecasting DNNs|\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.automl.core.forecasting_parameters import ForecastingParameters\n",
|
||||
"\n",
|
||||
"forecasting_parameters = ForecastingParameters(\n",
|
||||
" time_column_name=time_column_name,\n",
|
||||
" forecast_horizon=forecast_horizon,\n",
|
||||
" freq=\"D\", # Set the forecast frequency to be daily\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# We will disable the enable_early_stopping flag to ensure the DNN model is recommended for demonstration purpose.\n",
|
||||
"automl_config = AutoMLConfig(\n",
|
||||
" task=\"forecasting\",\n",
|
||||
" primary_metric=\"normalized_root_mean_squared_error\",\n",
|
||||
" experiment_timeout_hours=1,\n",
|
||||
" training_data=train_dataset,\n",
|
||||
" label_column_name=target_column_name,\n",
|
||||
" validation_data=valid_dataset,\n",
|
||||
" verbosity=logging.INFO,\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" max_concurrent_iterations=4,\n",
|
||||
" max_cores_per_iteration=-1,\n",
|
||||
" enable_dnn=True,\n",
|
||||
" enable_early_stopping=False,\n",
|
||||
" forecasting_parameters=forecasting_parameters,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"source": [
|
||||
"We will now run the experiment, starting with 10 iterations of model search. The experiment can be continued for more iterations if more accurate results are required. Validation errors and current status will be shown when setting `show_output=True` and the execution will be synchronous."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"remote_run = experiment.submit(automl_config, show_output=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# If you need to retrieve a run that already started, use the following code\n",
|
||||
"# from azureml.train.automl.run import AutoMLRun\n",
|
||||
"# remote_run = AutoMLRun(experiment = experiment, run_id = '<replace with your run id>')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"source": [
|
||||
"Displaying the run objects gives you links to the visual tools in the Azure Portal. Go try them!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"source": [
|
||||
"### Retrieve the Best Model for Each Algorithm\n",
|
||||
"Below we select the best pipeline from our iterations. The get_output method on automl_classifier returns the best run and the fitted model for the last fit invocation. There are overloads on get_output that allow you to retrieve the best run and fitted model for any logged metric or a particular iteration."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from helper import get_result_df\n",
|
||||
"\n",
|
||||
"summary_df = get_result_df(remote_run)\n",
|
||||
"summary_df"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.run import Run\n",
|
||||
"from azureml.widgets import RunDetails\n",
|
||||
"\n",
|
||||
"forecast_model = \"TCNForecaster\"\n",
|
||||
"if not forecast_model in summary_df[\"run_id\"]:\n",
|
||||
" forecast_model = \"ForecastTCN\"\n",
|
||||
"\n",
|
||||
"best_dnn_run_id = summary_df[\"run_id\"][forecast_model]\n",
|
||||
"best_dnn_run = Run(experiment, best_dnn_run_id)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"best_dnn_run.parent\n",
|
||||
"RunDetails(best_dnn_run.parent).show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"best_dnn_run\n",
|
||||
"RunDetails(best_dnn_run).show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"source": [
|
||||
"## Evaluate on Test Data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"source": [
|
||||
"We now use the best fitted model from the AutoML Run to make forecasts for the test set. \n",
|
||||
"\n",
|
||||
"We always score on the original dataset whose schema matches the training set schema."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Dataset\n",
|
||||
"\n",
|
||||
"test_dataset = Dataset.Tabular.from_delimited_files(\n",
|
||||
" path=[(datastore, \"github-dataset/tabular/test.csv\")]\n",
|
||||
")\n",
|
||||
"# preview the first 3 rows of the dataset\n",
|
||||
"test_dataset.take(5).to_pandas_dataframe()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"compute_target = ws.compute_targets[\"github-cluster\"]\n",
|
||||
"test_experiment = Experiment(ws, experiment_name + \"_test\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import shutil\n",
|
||||
"\n",
|
||||
"script_folder = os.path.join(os.getcwd(), \"inference\")\n",
|
||||
"os.makedirs(script_folder, exist_ok=True)\n",
|
||||
"shutil.copy(\"infer.py\", script_folder)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from helper import run_inference\n",
|
||||
"\n",
|
||||
"test_run = run_inference(\n",
|
||||
" test_experiment,\n",
|
||||
" compute_target,\n",
|
||||
" script_folder,\n",
|
||||
" best_dnn_run,\n",
|
||||
" test_dataset,\n",
|
||||
" valid_dataset,\n",
|
||||
" forecast_horizon,\n",
|
||||
" target_column_name,\n",
|
||||
" time_column_name,\n",
|
||||
" freq,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"RunDetails(test_run).show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from helper import run_multiple_inferences\n",
|
||||
"\n",
|
||||
"summary_df = run_multiple_inferences(\n",
|
||||
" summary_df,\n",
|
||||
" experiment,\n",
|
||||
" test_experiment,\n",
|
||||
" compute_target,\n",
|
||||
" script_folder,\n",
|
||||
" test_dataset,\n",
|
||||
" valid_dataset,\n",
|
||||
" forecast_horizon,\n",
|
||||
" target_column_name,\n",
|
||||
" time_column_name,\n",
|
||||
" freq,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for run_name, run_summary in summary_df.iterrows():\n",
|
||||
" print(run_name)\n",
|
||||
" print(run_summary)\n",
|
||||
" run_id = run_summary.run_id\n",
|
||||
" test_run_id = run_summary.test_run_id\n",
|
||||
" test_run = Run(test_experiment, test_run_id)\n",
|
||||
" test_run.wait_for_completion()\n",
|
||||
" test_score = test_run.get_metrics()[run_summary.primary_metric]\n",
|
||||
" summary_df.loc[summary_df.run_id == run_id, \"Test Score\"] = test_score\n",
|
||||
" print(\"Test Score: \", test_score)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"hideCode": false,
|
||||
"hidePrompt": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"summary_df"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"authors": [
|
||||
{
|
||||
"name": "jialiu"
|
||||
}
|
||||
],
|
||||
"hide_code_all_hidden": false,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python3-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -0,0 +1,455 @@
|
||||
date,count,day_of_week,month_of_year,holiday
|
||||
2017-06-04,104663,6.0,5.0,0.0
|
||||
2017-06-05,155824,0.0,5.0,0.0
|
||||
2017-06-06,164908,1.0,5.0,0.0
|
||||
2017-06-07,170309,2.0,5.0,0.0
|
||||
2017-06-08,164256,3.0,5.0,0.0
|
||||
2017-06-09,153406,4.0,5.0,0.0
|
||||
2017-06-10,97024,5.0,5.0,0.0
|
||||
2017-06-11,103442,6.0,5.0,0.0
|
||||
2017-06-12,160768,0.0,5.0,0.0
|
||||
2017-06-13,166288,1.0,5.0,0.0
|
||||
2017-06-14,163819,2.0,5.0,0.0
|
||||
2017-06-15,157593,3.0,5.0,0.0
|
||||
2017-06-16,149259,4.0,5.0,0.0
|
||||
2017-06-17,95579,5.0,5.0,0.0
|
||||
2017-06-18,98723,6.0,5.0,0.0
|
||||
2017-06-19,159076,0.0,5.0,0.0
|
||||
2017-06-20,163340,1.0,5.0,0.0
|
||||
2017-06-21,163344,2.0,5.0,0.0
|
||||
2017-06-22,159528,3.0,5.0,0.0
|
||||
2017-06-23,146563,4.0,5.0,0.0
|
||||
2017-06-24,92631,5.0,5.0,0.0
|
||||
2017-06-25,96549,6.0,5.0,0.0
|
||||
2017-06-26,153249,0.0,5.0,0.0
|
||||
2017-06-27,160357,1.0,5.0,0.0
|
||||
2017-06-28,159941,2.0,5.0,0.0
|
||||
2017-06-29,156781,3.0,5.0,0.0
|
||||
2017-06-30,144709,4.0,5.0,0.0
|
||||
2017-07-01,89101,5.0,6.0,0.0
|
||||
2017-07-02,93046,6.0,6.0,0.0
|
||||
2017-07-03,144113,0.0,6.0,0.0
|
||||
2017-07-04,143061,1.0,6.0,1.0
|
||||
2017-07-05,154603,2.0,6.0,0.0
|
||||
2017-07-06,157200,3.0,6.0,0.0
|
||||
2017-07-07,147213,4.0,6.0,0.0
|
||||
2017-07-08,92348,5.0,6.0,0.0
|
||||
2017-07-09,97018,6.0,6.0,0.0
|
||||
2017-07-10,157192,0.0,6.0,0.0
|
||||
2017-07-11,161819,1.0,6.0,0.0
|
||||
2017-07-12,161998,2.0,6.0,0.0
|
||||
2017-07-13,160280,3.0,6.0,0.0
|
||||
2017-07-14,146818,4.0,6.0,0.0
|
||||
2017-07-15,93041,5.0,6.0,0.0
|
||||
2017-07-16,97505,6.0,6.0,0.0
|
||||
2017-07-17,156167,0.0,6.0,0.0
|
||||
2017-07-18,162855,1.0,6.0,0.0
|
||||
2017-07-19,162519,2.0,6.0,0.0
|
||||
2017-07-20,159941,3.0,6.0,0.0
|
||||
2017-07-21,148460,4.0,6.0,0.0
|
||||
2017-07-22,93431,5.0,6.0,0.0
|
||||
2017-07-23,98553,6.0,6.0,0.0
|
||||
2017-07-24,156202,0.0,6.0,0.0
|
||||
2017-07-25,162503,1.0,6.0,0.0
|
||||
2017-07-26,158479,2.0,6.0,0.0
|
||||
2017-07-27,158192,3.0,6.0,0.0
|
||||
2017-07-28,147108,4.0,6.0,0.0
|
||||
2017-07-29,93799,5.0,6.0,0.0
|
||||
2017-07-30,97920,6.0,6.0,0.0
|
||||
2017-07-31,152197,0.0,6.0,0.0
|
||||
2017-08-01,158477,1.0,7.0,0.0
|
||||
2017-08-02,159089,2.0,7.0,0.0
|
||||
2017-08-03,157182,3.0,7.0,0.0
|
||||
2017-08-04,146345,4.0,7.0,0.0
|
||||
2017-08-05,92534,5.0,7.0,0.0
|
||||
2017-08-06,97128,6.0,7.0,0.0
|
||||
2017-08-07,151359,0.0,7.0,0.0
|
||||
2017-08-08,159895,1.0,7.0,0.0
|
||||
2017-08-09,158329,2.0,7.0,0.0
|
||||
2017-08-10,155468,3.0,7.0,0.0
|
||||
2017-08-11,144914,4.0,7.0,0.0
|
||||
2017-08-12,92258,5.0,7.0,0.0
|
||||
2017-08-13,95933,6.0,7.0,0.0
|
||||
2017-08-14,147706,0.0,7.0,0.0
|
||||
2017-08-15,151115,1.0,7.0,0.0
|
||||
2017-08-16,157640,2.0,7.0,0.0
|
||||
2017-08-17,156600,3.0,7.0,0.0
|
||||
2017-08-18,146980,4.0,7.0,0.0
|
||||
2017-08-19,94592,5.0,7.0,0.0
|
||||
2017-08-20,99320,6.0,7.0,0.0
|
||||
2017-08-21,145727,0.0,7.0,0.0
|
||||
2017-08-22,160260,1.0,7.0,0.0
|
||||
2017-08-23,160440,2.0,7.0,0.0
|
||||
2017-08-24,157830,3.0,7.0,0.0
|
||||
2017-08-25,145822,4.0,7.0,0.0
|
||||
2017-08-26,94706,5.0,7.0,0.0
|
||||
2017-08-27,99047,6.0,7.0,0.0
|
||||
2017-08-28,152112,0.0,7.0,0.0
|
||||
2017-08-29,162440,1.0,7.0,0.0
|
||||
2017-08-30,162902,2.0,7.0,0.0
|
||||
2017-08-31,159498,3.0,7.0,0.0
|
||||
2017-09-01,145689,4.0,8.0,0.0
|
||||
2017-09-02,93589,5.0,8.0,0.0
|
||||
2017-09-03,100058,6.0,8.0,0.0
|
||||
2017-09-04,140865,0.0,8.0,1.0
|
||||
2017-09-05,165715,1.0,8.0,0.0
|
||||
2017-09-06,167463,2.0,8.0,0.0
|
||||
2017-09-07,164811,3.0,8.0,0.0
|
||||
2017-09-08,156157,4.0,8.0,0.0
|
||||
2017-09-09,101358,5.0,8.0,0.0
|
||||
2017-09-10,107915,6.0,8.0,0.0
|
||||
2017-09-11,167845,0.0,8.0,0.0
|
||||
2017-09-12,172756,1.0,8.0,0.0
|
||||
2017-09-13,172851,2.0,8.0,0.0
|
||||
2017-09-14,171675,3.0,8.0,0.0
|
||||
2017-09-15,159266,4.0,8.0,0.0
|
||||
2017-09-16,103547,5.0,8.0,0.0
|
||||
2017-09-17,110964,6.0,8.0,0.0
|
||||
2017-09-18,170976,0.0,8.0,0.0
|
||||
2017-09-19,177864,1.0,8.0,0.0
|
||||
2017-09-20,173567,2.0,8.0,0.0
|
||||
2017-09-21,172017,3.0,8.0,0.0
|
||||
2017-09-22,161357,4.0,8.0,0.0
|
||||
2017-09-23,104681,5.0,8.0,0.0
|
||||
2017-09-24,111711,6.0,8.0,0.0
|
||||
2017-09-25,173517,0.0,8.0,0.0
|
||||
2017-09-26,180049,1.0,8.0,0.0
|
||||
2017-09-27,178307,2.0,8.0,0.0
|
||||
2017-09-28,174157,3.0,8.0,0.0
|
||||
2017-09-29,161707,4.0,8.0,0.0
|
||||
2017-09-30,110536,5.0,8.0,0.0
|
||||
2017-10-01,106505,6.0,9.0,0.0
|
||||
2017-10-02,157565,0.0,9.0,0.0
|
||||
2017-10-03,164764,1.0,9.0,0.0
|
||||
2017-10-04,163383,2.0,9.0,0.0
|
||||
2017-10-05,162847,3.0,9.0,0.0
|
||||
2017-10-06,153575,4.0,9.0,0.0
|
||||
2017-10-07,107472,5.0,9.0,0.0
|
||||
2017-10-08,116127,6.0,9.0,0.0
|
||||
2017-10-09,174457,0.0,9.0,1.0
|
||||
2017-10-10,185217,1.0,9.0,0.0
|
||||
2017-10-11,185120,2.0,9.0,0.0
|
||||
2017-10-12,180844,3.0,9.0,0.0
|
||||
2017-10-13,170178,4.0,9.0,0.0
|
||||
2017-10-14,112754,5.0,9.0,0.0
|
||||
2017-10-15,121251,6.0,9.0,0.0
|
||||
2017-10-16,183906,0.0,9.0,0.0
|
||||
2017-10-17,188945,1.0,9.0,0.0
|
||||
2017-10-18,187297,2.0,9.0,0.0
|
||||
2017-10-19,183867,3.0,9.0,0.0
|
||||
2017-10-20,173021,4.0,9.0,0.0
|
||||
2017-10-21,115851,5.0,9.0,0.0
|
||||
2017-10-22,126088,6.0,9.0,0.0
|
||||
2017-10-23,189452,0.0,9.0,0.0
|
||||
2017-10-24,194412,1.0,9.0,0.0
|
||||
2017-10-25,192293,2.0,9.0,0.0
|
||||
2017-10-26,190163,3.0,9.0,0.0
|
||||
2017-10-27,177053,4.0,9.0,0.0
|
||||
2017-10-28,114934,5.0,9.0,0.0
|
||||
2017-10-29,125289,6.0,9.0,0.0
|
||||
2017-10-30,189245,0.0,9.0,0.0
|
||||
2017-10-31,191480,1.0,9.0,0.0
|
||||
2017-11-01,182281,2.0,10.0,0.0
|
||||
2017-11-02,186351,3.0,10.0,0.0
|
||||
2017-11-03,175422,4.0,10.0,0.0
|
||||
2017-11-04,118160,5.0,10.0,0.0
|
||||
2017-11-05,127602,6.0,10.0,0.0
|
||||
2017-11-06,191067,0.0,10.0,0.0
|
||||
2017-11-07,197083,1.0,10.0,0.0
|
||||
2017-11-08,194333,2.0,10.0,0.0
|
||||
2017-11-09,193914,3.0,10.0,0.0
|
||||
2017-11-10,179933,4.0,10.0,1.0
|
||||
2017-11-11,121346,5.0,10.0,0.0
|
||||
2017-11-12,131900,6.0,10.0,0.0
|
||||
2017-11-13,196969,0.0,10.0,0.0
|
||||
2017-11-14,201949,1.0,10.0,0.0
|
||||
2017-11-15,198424,2.0,10.0,0.0
|
||||
2017-11-16,196902,3.0,10.0,0.0
|
||||
2017-11-17,183893,4.0,10.0,0.0
|
||||
2017-11-18,122767,5.0,10.0,0.0
|
||||
2017-11-19,130890,6.0,10.0,0.0
|
||||
2017-11-20,194515,0.0,10.0,0.0
|
||||
2017-11-21,198601,1.0,10.0,0.0
|
||||
2017-11-22,191041,2.0,10.0,0.0
|
||||
2017-11-23,170321,3.0,10.0,1.0
|
||||
2017-11-24,155623,4.0,10.0,0.0
|
||||
2017-11-25,115759,5.0,10.0,0.0
|
||||
2017-11-26,128771,6.0,10.0,0.0
|
||||
2017-11-27,199419,0.0,10.0,0.0
|
||||
2017-11-28,207253,1.0,10.0,0.0
|
||||
2017-11-29,205406,2.0,10.0,0.0
|
||||
2017-11-30,200674,3.0,10.0,0.0
|
||||
2017-12-01,187017,4.0,11.0,0.0
|
||||
2017-12-02,129735,5.0,11.0,0.0
|
||||
2017-12-03,139120,6.0,11.0,0.0
|
||||
2017-12-04,205505,0.0,11.0,0.0
|
||||
2017-12-05,208218,1.0,11.0,0.0
|
||||
2017-12-06,202480,2.0,11.0,0.0
|
||||
2017-12-07,197822,3.0,11.0,0.0
|
||||
2017-12-08,180686,4.0,11.0,0.0
|
||||
2017-12-09,123667,5.0,11.0,0.0
|
||||
2017-12-10,130987,6.0,11.0,0.0
|
||||
2017-12-11,193901,0.0,11.0,0.0
|
||||
2017-12-12,194997,1.0,11.0,0.0
|
||||
2017-12-13,192063,2.0,11.0,0.0
|
||||
2017-12-14,186496,3.0,11.0,0.0
|
||||
2017-12-15,170812,4.0,11.0,0.0
|
||||
2017-12-16,110474,5.0,11.0,0.0
|
||||
2017-12-17,118165,6.0,11.0,0.0
|
||||
2017-12-18,176843,0.0,11.0,0.0
|
||||
2017-12-19,179550,1.0,11.0,0.0
|
||||
2017-12-20,173506,2.0,11.0,0.0
|
||||
2017-12-21,165910,3.0,11.0,0.0
|
||||
2017-12-22,145886,4.0,11.0,0.0
|
||||
2017-12-23,95246,5.0,11.0,0.0
|
||||
2017-12-24,88781,6.0,11.0,0.0
|
||||
2017-12-25,98189,0.0,11.0,1.0
|
||||
2017-12-26,121383,1.0,11.0,0.0
|
||||
2017-12-27,135300,2.0,11.0,0.0
|
||||
2017-12-28,136827,3.0,11.0,0.0
|
||||
2017-12-29,127700,4.0,11.0,0.0
|
||||
2017-12-30,93014,5.0,11.0,0.0
|
||||
2017-12-31,82878,6.0,11.0,0.0
|
||||
2018-01-01,86419,0.0,0.0,1.0
|
||||
2018-01-02,147428,1.0,0.0,0.0
|
||||
2018-01-03,162193,2.0,0.0,0.0
|
||||
2018-01-04,163784,3.0,0.0,0.0
|
||||
2018-01-05,158606,4.0,0.0,0.0
|
||||
2018-01-06,113467,5.0,0.0,0.0
|
||||
2018-01-07,118313,6.0,0.0,0.0
|
||||
2018-01-08,175623,0.0,0.0,0.0
|
||||
2018-01-09,183880,1.0,0.0,0.0
|
||||
2018-01-10,183945,2.0,0.0,0.0
|
||||
2018-01-11,181769,3.0,0.0,0.0
|
||||
2018-01-12,170552,4.0,0.0,0.0
|
||||
2018-01-13,115707,5.0,0.0,0.0
|
||||
2018-01-14,121191,6.0,0.0,0.0
|
||||
2018-01-15,176127,0.0,0.0,1.0
|
||||
2018-01-16,188032,1.0,0.0,0.0
|
||||
2018-01-17,189871,2.0,0.0,0.0
|
||||
2018-01-18,189348,3.0,0.0,0.0
|
||||
2018-01-19,177456,4.0,0.0,0.0
|
||||
2018-01-20,123321,5.0,0.0,0.0
|
||||
2018-01-21,128306,6.0,0.0,0.0
|
||||
2018-01-22,186132,0.0,0.0,0.0
|
||||
2018-01-23,197618,1.0,0.0,0.0
|
||||
2018-01-24,196402,2.0,0.0,0.0
|
||||
2018-01-25,192722,3.0,0.0,0.0
|
||||
2018-01-26,179415,4.0,0.0,0.0
|
||||
2018-01-27,125769,5.0,0.0,0.0
|
||||
2018-01-28,133306,6.0,0.0,0.0
|
||||
2018-01-29,194151,0.0,0.0,0.0
|
||||
2018-01-30,198680,1.0,0.0,0.0
|
||||
2018-01-31,198652,2.0,0.0,0.0
|
||||
2018-02-01,195472,3.0,1.0,0.0
|
||||
2018-02-02,183173,4.0,1.0,0.0
|
||||
2018-02-03,124276,5.0,1.0,0.0
|
||||
2018-02-04,129054,6.0,1.0,0.0
|
||||
2018-02-05,190024,0.0,1.0,0.0
|
||||
2018-02-06,198658,1.0,1.0,0.0
|
||||
2018-02-07,198272,2.0,1.0,0.0
|
||||
2018-02-08,195339,3.0,1.0,0.0
|
||||
2018-02-09,183086,4.0,1.0,0.0
|
||||
2018-02-10,122536,5.0,1.0,0.0
|
||||
2018-02-11,133033,6.0,1.0,0.0
|
||||
2018-02-12,185386,0.0,1.0,0.0
|
||||
2018-02-13,184789,1.0,1.0,0.0
|
||||
2018-02-14,176089,2.0,1.0,0.0
|
||||
2018-02-15,171317,3.0,1.0,0.0
|
||||
2018-02-16,162693,4.0,1.0,0.0
|
||||
2018-02-17,116342,5.0,1.0,0.0
|
||||
2018-02-18,122466,6.0,1.0,0.0
|
||||
2018-02-19,172364,0.0,1.0,1.0
|
||||
2018-02-20,185896,1.0,1.0,0.0
|
||||
2018-02-21,188166,2.0,1.0,0.0
|
||||
2018-02-22,189427,3.0,1.0,0.0
|
||||
2018-02-23,178732,4.0,1.0,0.0
|
||||
2018-02-24,132664,5.0,1.0,0.0
|
||||
2018-02-25,134008,6.0,1.0,0.0
|
||||
2018-02-26,200075,0.0,1.0,0.0
|
||||
2018-02-27,207996,1.0,1.0,0.0
|
||||
2018-02-28,204416,2.0,1.0,0.0
|
||||
2018-03-01,201320,3.0,2.0,0.0
|
||||
2018-03-02,188205,4.0,2.0,0.0
|
||||
2018-03-03,131162,5.0,2.0,0.0
|
||||
2018-03-04,138320,6.0,2.0,0.0
|
||||
2018-03-05,207326,0.0,2.0,0.0
|
||||
2018-03-06,212462,1.0,2.0,0.0
|
||||
2018-03-07,209357,2.0,2.0,0.0
|
||||
2018-03-08,194876,3.0,2.0,0.0
|
||||
2018-03-09,193761,4.0,2.0,0.0
|
||||
2018-03-10,133449,5.0,2.0,0.0
|
||||
2018-03-11,142258,6.0,2.0,0.0
|
||||
2018-03-12,208753,0.0,2.0,0.0
|
||||
2018-03-13,210602,1.0,2.0,0.0
|
||||
2018-03-14,214236,2.0,2.0,0.0
|
||||
2018-03-15,210761,3.0,2.0,0.0
|
||||
2018-03-16,196619,4.0,2.0,0.0
|
||||
2018-03-17,133056,5.0,2.0,0.0
|
||||
2018-03-18,141335,6.0,2.0,0.0
|
||||
2018-03-19,211580,0.0,2.0,0.0
|
||||
2018-03-20,219051,1.0,2.0,0.0
|
||||
2018-03-21,215435,2.0,2.0,0.0
|
||||
2018-03-22,211961,3.0,2.0,0.0
|
||||
2018-03-23,196009,4.0,2.0,0.0
|
||||
2018-03-24,132390,5.0,2.0,0.0
|
||||
2018-03-25,140021,6.0,2.0,0.0
|
||||
2018-03-26,205273,0.0,2.0,0.0
|
||||
2018-03-27,212686,1.0,2.0,0.0
|
||||
2018-03-28,210683,2.0,2.0,0.0
|
||||
2018-03-29,189044,3.0,2.0,0.0
|
||||
2018-03-30,170256,4.0,2.0,0.0
|
||||
2018-03-31,125999,5.0,2.0,0.0
|
||||
2018-04-01,126749,6.0,3.0,0.0
|
||||
2018-04-02,186546,0.0,3.0,0.0
|
||||
2018-04-03,207905,1.0,3.0,0.0
|
||||
2018-04-04,201528,2.0,3.0,0.0
|
||||
2018-04-05,188580,3.0,3.0,0.0
|
||||
2018-04-06,173714,4.0,3.0,0.0
|
||||
2018-04-07,125723,5.0,3.0,0.0
|
||||
2018-04-08,142545,6.0,3.0,0.0
|
||||
2018-04-09,204767,0.0,3.0,0.0
|
||||
2018-04-10,212048,1.0,3.0,0.0
|
||||
2018-04-11,210517,2.0,3.0,0.0
|
||||
2018-04-12,206924,3.0,3.0,0.0
|
||||
2018-04-13,191679,4.0,3.0,0.0
|
||||
2018-04-14,126394,5.0,3.0,0.0
|
||||
2018-04-15,137279,6.0,3.0,0.0
|
||||
2018-04-16,208085,0.0,3.0,0.0
|
||||
2018-04-17,213273,1.0,3.0,0.0
|
||||
2018-04-18,211580,2.0,3.0,0.0
|
||||
2018-04-19,206037,3.0,3.0,0.0
|
||||
2018-04-20,191211,4.0,3.0,0.0
|
||||
2018-04-21,125564,5.0,3.0,0.0
|
||||
2018-04-22,136469,6.0,3.0,0.0
|
||||
2018-04-23,206288,0.0,3.0,0.0
|
||||
2018-04-24,212115,1.0,3.0,0.0
|
||||
2018-04-25,207948,2.0,3.0,0.0
|
||||
2018-04-26,205759,3.0,3.0,0.0
|
||||
2018-04-27,181330,4.0,3.0,0.0
|
||||
2018-04-28,130046,5.0,3.0,0.0
|
||||
2018-04-29,120802,6.0,3.0,0.0
|
||||
2018-04-30,170390,0.0,3.0,0.0
|
||||
2018-05-01,169054,1.0,4.0,0.0
|
||||
2018-05-02,197891,2.0,4.0,0.0
|
||||
2018-05-03,199820,3.0,4.0,0.0
|
||||
2018-05-04,186783,4.0,4.0,0.0
|
||||
2018-05-05,124420,5.0,4.0,0.0
|
||||
2018-05-06,130666,6.0,4.0,0.0
|
||||
2018-05-07,196014,0.0,4.0,0.0
|
||||
2018-05-08,203058,1.0,4.0,0.0
|
||||
2018-05-09,198582,2.0,4.0,0.0
|
||||
2018-05-10,191321,3.0,4.0,0.0
|
||||
2018-05-11,183639,4.0,4.0,0.0
|
||||
2018-05-12,122023,5.0,4.0,0.0
|
||||
2018-05-13,128775,6.0,4.0,0.0
|
||||
2018-05-14,199104,0.0,4.0,0.0
|
||||
2018-05-15,200658,1.0,4.0,0.0
|
||||
2018-05-16,201541,2.0,4.0,0.0
|
||||
2018-05-17,196886,3.0,4.0,0.0
|
||||
2018-05-18,188597,4.0,4.0,0.0
|
||||
2018-05-19,121392,5.0,4.0,0.0
|
||||
2018-05-20,126981,6.0,4.0,0.0
|
||||
2018-05-21,189291,0.0,4.0,0.0
|
||||
2018-05-22,203038,1.0,4.0,0.0
|
||||
2018-05-23,205330,2.0,4.0,0.0
|
||||
2018-05-24,199208,3.0,4.0,0.0
|
||||
2018-05-25,187768,4.0,4.0,0.0
|
||||
2018-05-26,117635,5.0,4.0,0.0
|
||||
2018-05-27,124352,6.0,4.0,0.0
|
||||
2018-05-28,180398,0.0,4.0,1.0
|
||||
2018-05-29,194170,1.0,4.0,0.0
|
||||
2018-05-30,200281,2.0,4.0,0.0
|
||||
2018-05-31,197244,3.0,4.0,0.0
|
||||
2018-06-01,184037,4.0,5.0,0.0
|
||||
2018-06-02,121135,5.0,5.0,0.0
|
||||
2018-06-03,129389,6.0,5.0,0.0
|
||||
2018-06-04,200331,0.0,5.0,0.0
|
||||
2018-06-05,207735,1.0,5.0,0.0
|
||||
2018-06-06,203354,2.0,5.0,0.0
|
||||
2018-06-07,200520,3.0,5.0,0.0
|
||||
2018-06-08,182038,4.0,5.0,0.0
|
||||
2018-06-09,120164,5.0,5.0,0.0
|
||||
2018-06-10,125256,6.0,5.0,0.0
|
||||
2018-06-11,194786,0.0,5.0,0.0
|
||||
2018-06-12,200815,1.0,5.0,0.0
|
||||
2018-06-13,197740,2.0,5.0,0.0
|
||||
2018-06-14,192294,3.0,5.0,0.0
|
||||
2018-06-15,173587,4.0,5.0,0.0
|
||||
2018-06-16,105955,5.0,5.0,0.0
|
||||
2018-06-17,110780,6.0,5.0,0.0
|
||||
2018-06-18,174582,0.0,5.0,0.0
|
||||
2018-06-19,193310,1.0,5.0,0.0
|
||||
2018-06-20,193062,2.0,5.0,0.0
|
||||
2018-06-21,187986,3.0,5.0,0.0
|
||||
2018-06-22,173606,4.0,5.0,0.0
|
||||
2018-06-23,111795,5.0,5.0,0.0
|
||||
2018-06-24,116134,6.0,5.0,0.0
|
||||
2018-06-25,185919,0.0,5.0,0.0
|
||||
2018-06-26,193142,1.0,5.0,0.0
|
||||
2018-06-27,188114,2.0,5.0,0.0
|
||||
2018-06-28,183737,3.0,5.0,0.0
|
||||
2018-06-29,171496,4.0,5.0,0.0
|
||||
2018-06-30,107210,5.0,5.0,0.0
|
||||
2018-07-01,111053,6.0,6.0,0.0
|
||||
2018-07-02,176198,0.0,6.0,0.0
|
||||
2018-07-03,184040,1.0,6.0,0.0
|
||||
2018-07-04,169783,2.0,6.0,1.0
|
||||
2018-07-05,177996,3.0,6.0,0.0
|
||||
2018-07-06,167378,4.0,6.0,0.0
|
||||
2018-07-07,106401,5.0,6.0,0.0
|
||||
2018-07-08,112327,6.0,6.0,0.0
|
||||
2018-07-09,182835,0.0,6.0,0.0
|
||||
2018-07-10,187694,1.0,6.0,0.0
|
||||
2018-07-11,185762,2.0,6.0,0.0
|
||||
2018-07-12,184099,3.0,6.0,0.0
|
||||
2018-07-13,170860,4.0,6.0,0.0
|
||||
2018-07-14,106799,5.0,6.0,0.0
|
||||
2018-07-15,108475,6.0,6.0,0.0
|
||||
2018-07-16,175704,0.0,6.0,0.0
|
||||
2018-07-17,183596,1.0,6.0,0.0
|
||||
2018-07-18,179897,2.0,6.0,0.0
|
||||
2018-07-19,183373,3.0,6.0,0.0
|
||||
2018-07-20,169626,4.0,6.0,0.0
|
||||
2018-07-21,106785,5.0,6.0,0.0
|
||||
2018-07-22,112387,6.0,6.0,0.0
|
||||
2018-07-23,180572,0.0,6.0,0.0
|
||||
2018-07-24,186943,1.0,6.0,0.0
|
||||
2018-07-25,185744,2.0,6.0,0.0
|
||||
2018-07-26,183117,3.0,6.0,0.0
|
||||
2018-07-27,168526,4.0,6.0,0.0
|
||||
2018-07-28,105936,5.0,6.0,0.0
|
||||
2018-07-29,111708,6.0,6.0,0.0
|
||||
2018-07-30,179950,0.0,6.0,0.0
|
||||
2018-07-31,185930,1.0,6.0,0.0
|
||||
2018-08-01,183366,2.0,7.0,0.0
|
||||
2018-08-02,182412,3.0,7.0,0.0
|
||||
2018-08-03,173429,4.0,7.0,0.0
|
||||
2018-08-04,106108,5.0,7.0,0.0
|
||||
2018-08-05,110059,6.0,7.0,0.0
|
||||
2018-08-06,178355,0.0,7.0,0.0
|
||||
2018-08-07,185518,1.0,7.0,0.0
|
||||
2018-08-08,183204,2.0,7.0,0.0
|
||||
2018-08-09,181276,3.0,7.0,0.0
|
||||
2018-08-10,168297,4.0,7.0,0.0
|
||||
2018-08-11,106488,5.0,7.0,0.0
|
||||
2018-08-12,111786,6.0,7.0,0.0
|
||||
2018-08-13,178620,0.0,7.0,0.0
|
||||
2018-08-14,181922,1.0,7.0,0.0
|
||||
2018-08-15,172198,2.0,7.0,0.0
|
||||
2018-08-16,177367,3.0,7.0,0.0
|
||||
2018-08-17,166550,4.0,7.0,0.0
|
||||
2018-08-18,107011,5.0,7.0,0.0
|
||||
2018-08-19,112299,6.0,7.0,0.0
|
||||
2018-08-20,176718,0.0,7.0,0.0
|
||||
2018-08-21,182562,1.0,7.0,0.0
|
||||
2018-08-22,181484,2.0,7.0,0.0
|
||||
2018-08-23,180317,3.0,7.0,0.0
|
||||
2018-08-24,170197,4.0,7.0,0.0
|
||||
2018-08-25,109383,5.0,7.0,0.0
|
||||
2018-08-26,113373,6.0,7.0,0.0
|
||||
2018-08-27,180142,0.0,7.0,0.0
|
||||
2018-08-28,191628,1.0,7.0,0.0
|
||||
2018-08-29,191149,2.0,7.0,0.0
|
||||
2018-08-30,187503,3.0,7.0,0.0
|
||||
2018-08-31,172280,4.0,7.0,0.0
|
||||
|
@@ -0,0 +1,183 @@
|
||||
import pandas as pd
|
||||
from azureml.core import Environment
|
||||
from azureml.core.conda_dependencies import CondaDependencies
|
||||
from azureml.train.estimator import Estimator
|
||||
from azureml.core.run import Run
|
||||
from azureml.automl.core.shared import constants
|
||||
|
||||
|
||||
def split_fraction_by_grain(df, fraction, time_column_name, grain_column_names=None):
    """
    Group df by grain and split on last n rows for each group.

    Rows are ordered by ``time_column_name``. Within every grain (rows sharing
    the same values in ``grain_column_names``) the last
    ``int(len(group) * fraction)`` rows go to the tail and the remaining rows
    go to the head. Original index labels are kept on both results.

    :param df: DataFrame to split. It is not modified.
    :param fraction: Fraction of each grain's rows placed in the tail. A value
        <= 0 keeps every row in the head.
    :param time_column_name: Column used to order the rows.
    :param grain_column_names: Column name(s) identifying the individual
        series. When None or empty, the whole frame is one series.
    :return: Tuple ``(df_head, df_tail)``.
    """
    ordered = df.sort_values(time_column_name)

    if grain_column_names:
        # Iterating the groupby yields complete group frames (grouping columns
        # included) in sorted key order, independent of how
        # ``groupby.apply`` treats grouping columns in a given pandas version.
        groups = [group for _, group in ordered.groupby(grain_column_names)]
    else:
        # No grain columns: treat the whole frame as a single series without
        # adding a temporary grouping column to the caller's frame.
        groups = [ordered]

    heads = []
    tails = []
    for group in groups:
        n_tail = int(len(group) * fraction) if fraction > 0 else 0
        n_tail = min(n_tail, len(group))
        # Slice on an explicit position. ``iloc[:-n]`` / ``iloc[-n:]`` with
        # n == 0 would return an empty head and put the whole group in the
        # tail, which is the opposite of what a zero-row tail means.
        split_at = len(group) - n_tail
        heads.append(group.iloc[:split_at])
        tails.append(group.iloc[split_at:])

    if not heads:
        # Nothing to concatenate (e.g. empty input with grain columns).
        empty = ordered.iloc[:0]
        return empty.copy(), empty.copy()

    return pd.concat(heads), pd.concat(tails)
|
||||
|
||||
|
||||
def split_full_for_forecasting(
    df, time_column_name, grain_column_names=None, test_split=0.2
):
    """
    Split ``df`` into train and test frames while keeping its index values.

    The split itself is delegated to ``split_fraction_by_grain``; the index is
    carried through the split as an ordinary column and restored afterwards.

    :param df: DataFrame to split. It is not modified.
    :param time_column_name: Column used to order the rows.
    :param grain_column_names: Column name(s) identifying individual series.
    :param test_split: Fraction of each series placed in the test frame.
    :return: Tuple ``(train_df, test_df)`` indexed like ``df``.
    """
    index_name = df.index.name

    # Carry the index as a column on a copy, so the caller's frame is never
    # mutated and cannot be left with a stray column if the split raises.
    # Assumes that there isn't already a column called tmpindex
    indexed_df = df.assign(tmpindex=df.index)

    train_df, test_df = split_fraction_by_grain(
        indexed_df, test_split, time_column_name, grain_column_names
    )

    train_df = train_df.set_index("tmpindex")
    train_df.index.name = index_name

    test_df = test_df.set_index("tmpindex")
    test_df.index.name = index_name

    return train_df, test_df
|
||||
|
||||
|
||||
def get_result_df(remote_run):
    """
    Summarize the best-scoring completed child run for each algorithm.

    Every child run of ``remote_run`` (fetched recursively) that is completed
    and carries both a ``run_algorithm`` and a ``score`` property contributes
    one row. Rows are sorted by score, ascending when the last reported
    ``goal`` property ends in ``_min`` and descending otherwise, and only the
    first row per algorithm is kept.

    :param remote_run: Parent run exposing ``get_children(recursive=True)``.
    :return: DataFrame indexed by ``run_algorithm`` with the columns
        ``run_id``, ``primary_metric`` and ``Score``.
    """
    summary_columns = ["run_id", "run_algorithm", "primary_metric", "Score"]
    records = []
    goal_minimize = False
    for run in remote_run.get_children(recursive=True):
        # We only count in the completed child runs.
        if (
            run.get_status().lower() != constants.RunState.COMPLETE_RUN
            or "run_algorithm" not in run.properties
            or "score" not in run.properties
        ):
            continue

        records.append(
            [
                run.id,
                run.properties["run_algorithm"],
                # The metric name is informational only, so a run that does
                # not report it is still listed instead of raising KeyError.
                run.properties.get("primary_metric"),
                float(run.properties["score"]),
            ]
        )
        if "goal" in run.properties:
            goal_minimize = run.properties["goal"].split("_")[-1] == "min"

    # Build the frame once from the collected rows rather than inserting one
    # column per run and transposing; this keeps Score numeric and avoids
    # fragmenting the frame when there are many child runs.
    summary_df = pd.DataFrame(records, columns=summary_columns)
    summary_df = summary_df.sort_values(
        "Score", ascending=goal_minimize
    ).drop_duplicates(["run_algorithm"])
    summary_df = summary_df.set_index("run_algorithm")
    return summary_df
|
||||
|
||||
|
||||
def run_inference(
    test_experiment,
    compute_target,
    script_folder,
    train_run,
    test_dataset,
    lookback_dataset,
    max_horizon,
    target_column_name,
    time_column_name,
    freq,
):
    """
    Submit ``infer.py`` to ``test_experiment`` to score the model of ``train_run``.

    The model file and the conda specification are downloaded from the
    training run's ``outputs/`` to ``inference/``; the conda file defines the
    environment of the submitted run. The submitted run is tagged with the
    training run's id, algorithm, validation score and primary metric.

    :param test_experiment: Experiment the inference run is submitted to.
    :param compute_target: Compute target handed to the estimator.
    :param script_folder: Source directory that contains ``infer.py``.
    :param train_run: Training run whose model is scored.
    :param test_dataset: Dataset passed to the script as ``test_data``.
    :param lookback_dataset: Dataset passed to the script as ``lookback_data``.
    :param max_horizon: Value for the script's ``--max_horizon`` argument.
    :param target_column_name: Value for ``--target_column_name``.
    :param time_column_name: Value for ``--time_column_name``.
    :param freq: Value for ``--frequency``.
    :return: The submitted run.
    """
    properties = train_run.properties

    # Default model file name; replaced by the last path segment of the
    # recorded model location when the training run reports one.
    model_file = "model.pkl"
    if "model_data_location" in properties:
        _, model_file = properties["model_data_location"].rsplit("/", 1)

    # NOTE(review): files are written to the literal "inference/" folder, not
    # to script_folder - confirm callers always pass that folder.
    conda_file = "inference/condafile.yml"
    train_run.download_file(
        "outputs/{}".format(model_file), "inference/{}".format(model_file)
    )
    train_run.download_file("outputs/conda_env_v_1_0_0.yml", conda_file)

    environment = Environment("myenv")
    environment.docker.enabled = True
    environment.python.conda_dependencies = CondaDependencies(
        conda_dependencies_file_path=conda_file
    )

    script_arguments = {
        "--max_horizon": max_horizon,
        "--target_column_name": target_column_name,
        "--time_column_name": time_column_name,
        "--frequency": freq,
        "--model_path": model_file,
    }
    named_inputs = [
        test_dataset.as_named_input("test_data"),
        lookback_dataset.as_named_input("lookback_data"),
    ]
    estimator = Estimator(
        source_directory=script_folder,
        entry_script="infer.py",
        script_params=script_arguments,
        inputs=named_inputs,
        compute_target=compute_target,
        environment_definition=environment,
    )

    run_tags = {
        "training_run_id": train_run.id,
        "run_algorithm": properties["run_algorithm"],
        "valid_score": properties["score"],
        "primary_metric": properties["primary_metric"],
    }
    submitted_run = test_experiment.submit(estimator, tags=run_tags)

    # Record the algorithm as a metric too, read back from the run's tags.
    submitted_run.log("run_algorithm", submitted_run.tags["run_algorithm"])
    return submitted_run
|
||||
|
||||
|
||||
def run_multiple_inferences(
    summary_df,
    train_experiment,
    test_experiment,
    compute_target,
    script_folder,
    test_dataset,
    lookback_dataset,
    max_horizon,
    target_column_name,
    time_column_name,
    freq,
):
    """
    Submit one inference run per row of ``summary_df``.

    For every row, the training run named by its ``run_id`` is looked up in
    ``train_experiment`` and scored through ``run_inference``. The id of the
    submitted run is written to the ``test_run_id`` column of that row.

    :param summary_df: Frame with a ``run_id`` column; updated in place.
    :param train_experiment: Experiment that owns the training runs.
    :return: ``summary_df`` with the ``test_run_id`` column filled in.

    The remaining parameters are forwarded unchanged to ``run_inference``.
    """
    for algorithm, summary_row in summary_df.iterrows():
        print(algorithm)
        print(summary_row)
        train_run_id = summary_row.run_id

        inference_run = run_inference(
            test_experiment=test_experiment,
            compute_target=compute_target,
            script_folder=script_folder,
            train_run=Run(train_experiment, train_run_id),
            test_dataset=test_dataset,
            lookback_dataset=lookback_dataset,
            max_horizon=max_horizon,
            target_column_name=target_column_name,
            time_column_name=time_column_name,
            freq=freq,
        )
        print(inference_run)

        # Written inside the loop so ids of already submitted runs are kept
        # even if a later submission fails.
        rows_for_run = summary_df.run_id == train_run_id
        summary_df.loc[rows_for_run, "test_run_id"] = inference_run.id

    return summary_df
|
||||
@@ -0,0 +1,386 @@
|
||||
import argparse
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from pandas.tseries.frequencies import to_offset
|
||||
from sklearn.externals import joblib
|
||||
from sklearn.metrics import mean_absolute_error, mean_squared_error
|
||||
|
||||
from azureml.automl.runtime.shared.score import scoring, constants
|
||||
from azureml.core import Run
|
||||
|
||||
try:
|
||||
import torch
|
||||
|
||||
_torch_present = True
|
||||
except ImportError:
|
||||
_torch_present = False
|
||||
|
||||
|
||||
def align_outputs(
    y_predicted,
    X_trans,
    X_test,
    y_test,
    predicted_column_name="predicted",
    horizon_colname="horizon_origin",
    target_colname=None,
):
    """
    Demonstrates how to get the output aligned to the inputs
    using pandas indexes. Helps understand what happened if
    the output's shape differs from the input shape, or if
    the data got re-sorted by time and grain during forecasting.

    Typical causes of misalignment are:
    * we predicted some periods that were missing in actuals -> drop from eval
    * model was asked to predict past max_horizon -> increase max horizon
    * data at start of X_test was needed for lags -> provide previous periods

    :param y_predicted: Forecast values, one per row of ``X_trans``.
    :param X_trans: Transformed features returned with the forecast; its
        index is attached to the predictions.
    :param X_test: Original test features.
    :param y_test: Actual target values, one per row of ``X_test``.
    :param predicted_column_name: Name of the prediction column in the result.
    :param horizon_colname: Column of ``X_trans`` copied to the result when
        present.
    :param target_colname: Name of the column that receives ``y_test``.
        Defaults to the module-level ``target_column_name``.
    :return: Merged frame restricted to rows where both the actual and the
        predicted value are present.
    """
    if target_colname is None:
        # Backward compatible default: the script-level setting.
        target_colname = target_column_name

    forecast_columns = {predicted_column_name: y_predicted}
    if horizon_colname in X_trans:
        forecast_columns[horizon_colname] = X_trans[horizon_colname]
    df_fcst = pd.DataFrame(forecast_columns)

    # y and X outputs are aligned by forecast() function contract
    df_fcst.index = X_trans.index

    # align original X_test to y_test
    X_test_full = X_test.copy()
    X_test_full[target_colname] = y_test

    # X_test_full's index does not include origin, so reset for merge
    df_fcst = df_fcst.reset_index()
    # drop=True discards the index whatever it is called; dropping a column
    # literally named "index" fails for named or multi-level indexes and
    # removes a genuine feature column of that name.
    X_test_full = X_test_full.reset_index(drop=True)
    together = df_fcst.merge(X_test_full, how="right")

    # drop rows where prediction or actuals are nan
    # happens because of missing actuals
    # or at edges of time due to lags/rolling windows
    has_both = together[[target_colname, predicted_column_name]].notnull().all(axis=1)
    clean = together[has_both]
    return clean
|
||||
|
||||
|
||||
def do_rolling_forecast_with_lookback(
    fitted_model, X_test, y_test, max_horizon, X_lookback, y_lookback, freq="D"
):
    """
    Produce forecasts on a rolling origin over the given test set.

    Each iteration makes a forecast for the next 'max_horizon' periods
    with respect to the current origin, then advances the origin by the
    horizon time duration. The prediction context for each forecast is set so
    that the forecaster uses the actual target values prior to the current
    origin time for constructing lag features.

    The lookback rows are placed in front of the test rows, so their actuals
    are part of the context from the first iteration on.

    This function returns a concatenated DataFrame of rolling forecasts.

    :param fitted_model: Object whose ``forecast(X, y)`` returns
        ``(y_fcst, X_trans)``; the index of ``X_trans`` must have a level
        named after the module-level ``time_column_name``.
    :param X_test: Test features containing the time column.
    :param y_test: Actual targets for ``X_test`` as an array.
    :param max_horizon: Number of ``freq`` periods forecast per iteration.
    :param X_lookback: Features of the periods preceding ``X_test``.
    :param y_lookback: Actual targets for ``X_lookback`` as an array.
    :param freq: Frequency string understood by ``to_offset``.
    """
    print("Using lookback of size: ", y_lookback.size)
    df_list = []
    origin_time = X_test[time_column_name].min()
    last_test_time = X_test[time_column_name].max()
    # DataFrame.append was removed in pandas 2.0; pd.concat produces the same
    # row-wise concatenation with the original index labels.
    X = pd.concat([X_lookback, X_test])
    y = np.concatenate((y_lookback, y_test), axis=0)
    first_time = X[time_column_name].min()
    while origin_time <= last_test_time:
        # Set the horizon time - end date of the forecast
        horizon_time = origin_time + max_horizon * to_offset(freq)

        # Extract test data from an expanding window up-to the horizon
        expand_wind = X[time_column_name] < horizon_time
        X_test_expand = X[expand_wind]
        # All-NaN query vector. The np.float and np.NaN aliases no longer
        # exist in current NumPy releases, so the builtin float and np.nan
        # are used.
        y_query_expand = np.full(len(X_test_expand), np.nan, dtype=float)

        if origin_time != first_time:
            # Set the context by including actuals up-to the origin time
            test_context_expand_wind = X[time_column_name] < origin_time
            context_expand_wind = X_test_expand[time_column_name] < origin_time
            y_query_expand[context_expand_wind] = y[test_context_expand_wind]

        # Print some debug info
        print(
            "Horizon_time:",
            horizon_time,
            " origin_time: ",
            origin_time,
            " max_horizon: ",
            max_horizon,
            " freq: ",
            freq,
        )
        print("expand_wind: ", expand_wind)
        print("y_query_expand")
        print(y_query_expand)
        print("X_test")
        print(X)
        print("X_test_expand")
        print(X_test_expand)
        print("Type of X_test_expand: ", type(X_test_expand))
        print("Type of y_query_expand: ", type(y_query_expand))

        print("y_query_expand")
        print(y_query_expand)

        # Make a forecast out to the maximum horizon
        y_fcst, X_trans = fitted_model.forecast(X_test_expand, y_query_expand)

        print("y_fcst")
        print(y_fcst)

        # Align forecast with test set for dates within
        # the current rolling window
        trans_tindex = X_trans.index.get_level_values(time_column_name)
        trans_roll_wind = (trans_tindex >= origin_time) & (trans_tindex < horizon_time)
        test_roll_wind = expand_wind & (X[time_column_name] >= origin_time)
        df_list.append(
            align_outputs(
                y_fcst[trans_roll_wind],
                X_trans[trans_roll_wind],
                X[test_roll_wind],
                y[test_roll_wind],
            )
        )

        # Advance the origin time
        origin_time = horizon_time

    return pd.concat(df_list, ignore_index=True)
|
||||
|
||||
|
||||
def do_rolling_forecast(fitted_model, X_test, y_test, max_horizon, freq="D"):
    """
    Produce forecasts on a rolling origin over the given test set.

    Each iteration makes a forecast for the next 'max_horizon' periods
    with respect to the current origin, then advances the origin by the
    horizon time duration. The prediction context for each forecast is set so
    that the forecaster uses the actual target values prior to the current
    origin time for constructing lag features.

    This function returns a concatenated DataFrame of rolling forecasts.

    :param fitted_model: Object whose ``forecast(X, y)`` returns
        ``(y_fcst, X_trans)``; the index of ``X_trans`` must have a level
        named after the module-level ``time_column_name``.
    :param X_test: Test features containing the time column.
    :param y_test: Actual targets for ``X_test`` as an array.
    :param max_horizon: Number of ``freq`` periods forecast per iteration.
    :param freq: Frequency string understood by ``to_offset``.
    """
    df_list = []
    first_test_time = X_test[time_column_name].min()
    last_test_time = X_test[time_column_name].max()
    origin_time = first_test_time
    while origin_time <= last_test_time:
        # Set the horizon time - end date of the forecast
        horizon_time = origin_time + max_horizon * to_offset(freq)

        # Extract test data from an expanding window up-to the horizon
        expand_wind = X_test[time_column_name] < horizon_time
        X_test_expand = X_test[expand_wind]
        # All-NaN query vector. The np.float and np.NaN aliases no longer
        # exist in current NumPy releases, so the builtin float and np.nan
        # are used.
        y_query_expand = np.full(len(X_test_expand), np.nan, dtype=float)

        if origin_time != first_test_time:
            # Set the context by including actuals up-to the origin time
            test_context_expand_wind = X_test[time_column_name] < origin_time
            context_expand_wind = X_test_expand[time_column_name] < origin_time
            y_query_expand[context_expand_wind] = y_test[test_context_expand_wind]

        # Print some debug info
        print(
            "Horizon_time:",
            horizon_time,
            " origin_time: ",
            origin_time,
            " max_horizon: ",
            max_horizon,
            " freq: ",
            freq,
        )
        print("expand_wind: ", expand_wind)
        print("y_query_expand")
        print(y_query_expand)
        print("X_test")
        print(X_test)
        print("X_test_expand")
        print(X_test_expand)
        print("Type of X_test_expand: ", type(X_test_expand))
        print("Type of y_query_expand: ", type(y_query_expand))
        print("y_query_expand")
        print(y_query_expand)

        # Make a forecast out to the maximum horizon
        y_fcst, X_trans = fitted_model.forecast(X_test_expand, y_query_expand)

        print("y_fcst")
        print(y_fcst)

        # Align forecast with test set for dates within the
        # current rolling window
        trans_tindex = X_trans.index.get_level_values(time_column_name)
        trans_roll_wind = (trans_tindex >= origin_time) & (trans_tindex < horizon_time)
        test_roll_wind = expand_wind & (X_test[time_column_name] >= origin_time)
        df_list.append(
            align_outputs(
                y_fcst[trans_roll_wind],
                X_trans[trans_roll_wind],
                X_test[test_roll_wind],
                y_test[test_roll_wind],
            )
        )

        # Advance the origin time
        origin_time = horizon_time

    return pd.concat(df_list, ignore_index=True)
|
||||
|
||||
|
||||
def APE(actual, pred):
    """
    Absolute percentage error of each prediction.

    Computes ``100 * |actual - pred| / |actual|`` element-wise, so the result
    has the same length as ``actual`` / ``pred``.
    """
    relative_error = (actual - pred) / actual
    return np.abs(relative_error) * 100
|
||||
|
||||
|
||||
def MAPE(actual, pred):
    """
    Mean absolute percentage error over the usable pairs.

    A pair is skipped when either value is NaN or when the actual value is
    close to zero; the remaining pairs are scored with ``APE`` and averaged.
    """
    has_nan = np.isnan(actual) | np.isnan(pred)
    near_zero = np.isclose(actual, 0.0)
    usable = ~has_nan & ~near_zero
    return np.mean(APE(actual[usable], pred[usable]))
|
||||
|
||||
|
||||
def map_location_cuda(storage, loc):
    """Return storage.cuda(); the loc argument is accepted but not used."""
    cuda_storage = storage.cuda()
    return cuda_storage
|
||||
|
||||
|
||||
# Register the script's command-line options, in the same order as before.
parser = argparse.ArgumentParser()
for option_flag, option_settings in (
    (
        "--max_horizon",
        {
            "type": int,
            "dest": "max_horizon",
            "default": 10,
            "help": "Max Horizon for forecasting",
        },
    ),
    (
        "--target_column_name",
        {"type": str, "dest": "target_column_name", "help": "Target Column Name"},
    ),
    (
        "--time_column_name",
        {"type": str, "dest": "time_column_name", "help": "Time Column Name"},
    ),
    (
        # Note: the --frequency flag is stored under the attribute name "freq".
        "--frequency",
        {"type": str, "dest": "freq", "help": "Frequency of prediction"},
    ),
    (
        "--model_path",
        {
            "type": str,
            "dest": "model_path",
            "default": "model.pkl",
            "help": "Filename of model to be loaded",
        },
    ),
):
    parser.add_argument(option_flag, **option_settings)
|
||||
|
||||
# Parse the command line and unpack each option into a module-level name.
args = parser.parse_args()
max_horizon, target_column_name, time_column_name, freq, model_path = (
    args.max_horizon,
    args.target_column_name,
    args.time_column_name,
    args.freq,
    args.model_path,
)

# Echo the received options, one value per line.
print("args passed are: ")
for option_value in (
    max_horizon,
    target_column_name,
    time_column_name,
    freq,
    model_path,
):
    print(option_value)
|
||||
|
||||
# Fetch the current run and look up its two named input datasets.
run = Run.get_context()
test_dataset = run.input_datasets["test_data"]
lookback_dataset = run.input_datasets["lookback_data"]

grain_column_names = []

# Materialize the test dataset once so it can be printed for inspection.
df = test_dataset.to_pandas_dataframe()
print("Read df")
print(df)

# Split the test dataset into features (target dropped) and target-only views.
X_test_df = test_dataset.drop_columns(columns=[target_column_name])
test_without_timestamps = test_dataset.with_timestamp_columns(None)
y_test_df = test_without_timestamps.keep_columns(columns=[target_column_name])

# Apply the same feature/target split to the lookback dataset.
X_lookback_df = lookback_dataset.drop_columns(columns=[target_column_name])
lookback_without_timestamps = lookback_dataset.with_timestamp_columns(None)
y_lookback_df = lookback_without_timestamps.keep_columns(
    columns=[target_column_name]
)
|
||||
|
||||
# Pick the loader from the model file's extension.
_, ext = os.path.splitext(model_path)
if ext != ".pt":
    # Load the sklearn pipeline.
    fitted_model = joblib.load(model_path)
else:
    # Load the fc-tcn torch model; this path requires _torch_present to be truthy.
    assert _torch_present
    # Use the CUDA mapping function when a GPU is available, otherwise the CPU.
    map_location = map_location_cuda if torch.cuda.is_available() else "cpu"
    with open(model_path, "rb") as fh:
        fitted_model = torch.load(fh, map_location=map_location)
|
||||
|
||||
# Models exposing get_lookback() take the lookback-aware rolling forecast;
# every other model takes the plain rolling forecast.
if not hasattr(fitted_model, "get_lookback"):
    df_all = do_rolling_forecast(
        fitted_model,
        X_test_df.to_pandas_dataframe(),
        # .values.T[0] selects the first column of the target values.
        y_test_df.to_pandas_dataframe().values.T[0],
        max_horizon,
        freq,
    )
else:
    lookback = fitted_model.get_lookback()
    df_all = do_rolling_forecast_with_lookback(
        fitted_model,
        X_test_df.to_pandas_dataframe(),
        y_test_df.to_pandas_dataframe().values.T[0],
        max_horizon,
        # Lookback features and targets are both sliced with [-lookback:].
        X_lookback_df.to_pandas_dataframe()[-lookback:],
        y_lookback_df.to_pandas_dataframe().values.T[0][-lookback:],
        freq,
    )
|
||||
|
||||
# Dump the aligned forecast frame, then its target and prediction columns.
print(df_all)
print("target values:::")
print(df_all[target_column_name])
print("predicted values:::")
print(df_all["predicted"])

# Use the AutoML scoring module
y_test = np.array(df_all[target_column_name])
y_pred = np.array(df_all["predicted"])
regression_metrics = list(constants.REGRESSION_SCALAR_SET)
scores = scoring.score_regression(y_test, y_pred, regression_metrics)

print("scores:")
print(scores)

# Log every AutoML metric to the run under its own name.
for metric_name, metric_value in scores.items():
    run.log(metric_name, metric_value)

# Additionally compute and print RMSE, MAE and MAPE directly.
print("Simple forecasting model")
mse = mean_squared_error(df_all[target_column_name], df_all["predicted"])
rmse = np.sqrt(mse)
print("[Test Data] \nRoot Mean squared error: %.2f" % rmse)
mae = mean_absolute_error(df_all[target_column_name], df_all["predicted"])
print("mean_absolute_error score: %.2f" % mae)
print("MAPE: %.2f" % MAPE(df_all[target_column_name], df_all["predicted"]))

run.log("rmse", rmse)
run.log("mae", mae)
|
||||
@@ -0,0 +1,94 @@
|
||||
---
|
||||
page_type: sample
|
||||
languages:
|
||||
- python
|
||||
products:
|
||||
- azure-machine-learning
|
||||
description: Tutorial showing how to solve complex machine learning time series forecasting problems at scale by using Azure Automated ML and the Hierarchical time series accelerator.
|
||||
---
|
||||
|
||||
## Microsoft Solution Accelerator: Hierarchical Time Series Forecasting
|
||||
|
||||
In most applications, customers have a need to understand their forecasts at a macro and micro level of the business. Whether that be predicting sales of products at different geographic locations, or understanding the expected workforce demand for different organizations at a company, the ability to train a machine learning model to intelligently forecast on hierarchy data is essential.
|
||||
|
||||
This business pattern is common across a wide variety of industries and applicable to many real world use cases. Below are some examples of where the hierarchical time series pattern is useful.
|
||||
|
||||
| Industry | Scenario |
|
||||
|----------------|--------------------------------------------|
|
||||
| *Restaurant Chain* | Building demand forecasting models across thousands of restaurants and several countries. |
|
||||
| *Retail Organization* | Building workforce optimization models for thousands of stores. |
|
||||
| *Retail Organization*| Price optimization models for hundreds of thousands of products available. |
|
||||
|
||||
|
||||
### Technical Summary
|
||||
|
||||
A hierarchical time series is a structure in which the unique series are arranged into a hierarchy based on dimensions such as geography or product type. The table below shows an example of data whose unique attributes form a hierarchy. Our hierarchy is defined by the `product type` such as headphones or tablets, the `product category` which splits product types into accessories and devices, and the `region` the products are sold in. The table below demonstrates the first input of each unique series in the hierarchy.
|
||||
|
||||

|
||||
|
||||
To further visualize this, the leaf levels of the hierarchy contain all the time series with unique combinations of attribute values. Each higher level in the hierarchy will consider one less dimension for defining the time series and will aggregate each set of `child nodes` from the lower level into a `parent node`.
|
||||
|
||||

|
||||
|
||||
> **Note:** If no unique root level exists in the data, Automated Machine Learning will create a node `automl_top_level` for users to train or forecast totals.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
To use this solution accelerator, all you need is access to an [Azure subscription](https://azure.microsoft.com/free/) and an [Azure Machine Learning Workspace](https://docs.microsoft.com/azure/machine-learning/how-to-manage-workspace) that you'll create below.
|
||||
|
||||
A basic understanding of Azure Machine Learning and hierarchical time series concepts will be helpful for understanding the solution. The following resources can help introduce you to these concepts:
|
||||
|
||||
1. [Azure Machine Learning Overview](https://azure.microsoft.com/services/machine-learning/)
|
||||
2. [Azure Machine Learning Tutorials](https://docs.microsoft.com/azure/machine-learning/tutorial-1st-experiment-sdk-setup)
|
||||
3. [Azure Machine Learning Sample Notebooks on Github](https://github.com/Azure/azureml-examples/)
|
||||
4. [Forecasting: Principles and Practice, Hierarchical time series](https://otexts.com/fpp2/hts.html)
|
||||
|
||||
## Getting started
|
||||
|
||||
### 1. Set up the Compute Instance
|
||||
Please create a [Compute Instance](https://docs.microsoft.com/en-us/azure/machine-learning/concept-compute-instance#create) and clone the git repo to your workspace.
|
||||
|
||||
### 2. Run the Notebook
|
||||
|
||||
Once your environment is set up, go to JupyterLab and run the notebook `auto-ml-forecasting-hierarchical-timeseries.ipynb` on the Compute Instance you created. It will run through the steps outlined sequentially. By the end, you'll know how to train, score, and make predictions using the hierarchical time series model pattern on Azure Machine Learning.
|
||||
|
||||
| Notebook | Description |
|
||||
|----------------|--------------------------------------------|
|
||||
| `auto-ml-forecasting-hierarchical-timeseries.ipynb`|Creates a pipeline to train machine learning models for the defined hierarchy and forecast at the desired hierarchy level using Automated ML. |
|
||||
|
||||
|
||||

|
||||
|
||||
## Key Concepts
|
||||
|
||||
### Automated Machine Learning
|
||||
|
||||
[Automated Machine Learning](https://docs.microsoft.com/azure/machine-learning/concept-automated-ml) also referred to as automated ML or AutoML, is the process of automating the time consuming, iterative tasks of machine learning model development. It allows data scientists, analysts, and developers to build ML models with high scale, efficiency, and productivity all while sustaining model quality.
|
||||
|
||||
### Pipelines
|
||||
|
||||
[Pipelines](https://docs.microsoft.com/azure/machine-learning/concept-ml-pipelines) allow you to create workflows in your machine learning projects. These workflows have a number of benefits including speed, simplicity, repeatability, and modularity.
|
||||
|
||||
### ParallelRunStep
|
||||
|
||||
[ParallelRunStep](https://docs.microsoft.com/en-us/python/api/azureml-pipeline-steps/azureml.pipeline.steps.parallel_run_step.parallelrunstep?view=azure-ml-py) enables the parallel training of models and is commonly used for batch inferencing. This [document](https://docs.microsoft.com/azure/machine-learning/how-to-use-parallel-run-step) walks through some of the key concepts around ParallelRunStep.
|
||||
|
||||
### Other Concepts
|
||||
|
||||
In addition to ParallelRunStep, Pipelines and Automated Machine Learning, you'll also be working with the following concepts including [workspace](https://docs.microsoft.com/azure/machine-learning/concept-workspace), [datasets](https://docs.microsoft.com/azure/machine-learning/concept-data#datasets), [compute targets](https://docs.microsoft.com/azure/machine-learning/concept-compute-target#train), [python script steps](https://docs.microsoft.com/python/api/azureml-pipeline-steps/azureml.pipeline.steps.python_script_step.pythonscriptstep?view=azure-ml-py), and [Azure Open Datasets](https://azure.microsoft.com/services/open-datasets/).
|
||||
|
||||
## Contributing
|
||||
|
||||
This project welcomes contributions and suggestions. To learn more visit the [contributing](CONTRIBUTING.md) section.
|
||||
|
||||
Most contributions require you to agree to a Contributor License Agreement (CLA)
|
||||
declaring that you have the right to, and actually do, grant us
|
||||
the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
|
||||
|
||||
When you submit a pull request, a CLA bot will automatically determine whether you need to provide
|
||||
a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
|
||||
provided by the bot. You will only need to do this once across all repos using our CLA.
|
||||
|
||||
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
|
||||
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
|
||||
contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
|
||||
@@ -381,7 +381,7 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Submit the pipeline to run\n",
|
||||
"Next we submit our pipeline to run. The whole training pipeline takes about 1h 11m using a Standard_D12_V2 VM with our current ParallelRunConfig setting."
|
||||
"Next we submit our pipeline to run. The whole training pipeline takes about 1h using a Standard_D16_V3 VM with our current ParallelRunConfig setting."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -399,7 +399,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"training_run.wait_for_completion(show_output=False)"
|
||||
"training_run.wait_for_completion(show_output=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -562,7 +562,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"inference_run = experiment.submit(inference_pipeline)\n",
|
||||
"inference_run.wait_for_completion(show_output=False)"
|
||||
"inference_run.wait_for_completion(show_output=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -571,7 +571,7 @@
|
||||
"source": [
|
||||
"## Retrieve results\n",
|
||||
"\n",
|
||||
"Forecast results can be retrieved through the following code. The prediction results summary and the actual predictions are downloaded the \"forecast_results\" folder"
|
||||
"Forecast results can be retrieved through the following code. The prediction results summary and the actual predictions are downloaded to the forecast_results folder"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -617,9 +617,9 @@
|
||||
"automated-machine-learning"
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.6 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python3-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
|
After Width: | Height: | Size: 65 KiB |
|
After Width: | Height: | Size: 17 KiB |
|
After Width: | Height: | Size: 1.7 KiB |
|
After Width: | Height: | Size: 165 KiB |
|
After Width: | Height: | Size: 1.5 KiB |
|
After Width: | Height: | Size: 1.8 KiB |
|
After Width: | Height: | Size: 31 KiB |
@@ -0,0 +1,3 @@
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-contrib-automl-pipeline-steps
|
||||
@@ -0,0 +1,122 @@
|
||||
---
|
||||
page_type: sample
|
||||
languages:
|
||||
- python
|
||||
products:
|
||||
- azure-machine-learning
|
||||
description: Tutorial showing how to solve complex machine learning time series forecasting problems at scale by using Azure Automated ML and the Many Models solution accelerator.
|
||||
---
|
||||
|
||||

|
||||
# Many Models Solution Accelerator
|
||||
|
||||
<!--
|
||||
Guidelines on README format: https://review.docs.microsoft.com/help/onboard/admin/samples/concepts/readme-template?branch=master
|
||||
|
||||
Guidance on onboarding samples to docs.microsoft.com/samples: https://review.docs.microsoft.com/help/onboard/admin/samples/process/onboarding?branch=master
|
||||
|
||||
Taxonomies for products and languages: https://review.docs.microsoft.com/new-hope/information-architecture/metadata/taxonomies?branch=master
|
||||
-->
|
||||
|
||||
In the real world, many problems can be too complex to be solved by a single machine learning model. Whether that be predicting sales for each individual store, building a predictive maintenance model for hundreds of oil wells, or tailoring an experience to individual users, building a model for each instance can lead to improved results on many machine learning problems.
|
||||
|
||||
This pattern is very common across a wide variety of industries and applicable to many real world use cases. Below are some examples we have seen where this pattern is being used.
|
||||
|
||||
- Energy and utility companies building predictive maintenance models for thousands of oil wells, hundreds of wind turbines or hundreds of smart meters
|
||||
|
||||
- Retail organizations building workforce optimization models for thousands of stores, campaign promotion propensity models, and price optimization models for hundreds of thousands of products they sell
|
||||
|
||||
- Restaurant chains building demand forecasting models across thousands of restaurants
|
||||
|
||||
- Banks and financial institutions building models for cash replenishment for a single ATM or for several ATMs, or building personalized models for individuals
|
||||
|
||||
- Enterprises building revenue forecasting models at each division level
|
||||
|
||||
- Document management companies building text analytics and legal document search models per each state
|
||||
|
||||
Azure Machine Learning (AML) makes it easy to train, operate, and manage hundreds or even thousands of models. This repo will walk you through the end to end process of creating a many models solution from training to scoring to monitoring.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
To use this solution accelerator, all you need is access to an [Azure subscription](https://azure.microsoft.com/free/) and an [Azure Machine Learning Workspace](https://docs.microsoft.com/azure/machine-learning/how-to-manage-workspace) that you'll create below.
|
||||
|
||||
While it's not required, a basic understanding of Azure Machine Learning will be helpful for understanding the solution. The following resources can help introduce you to AML:
|
||||
|
||||
1. [Azure Machine Learning Overview](https://azure.microsoft.com/services/machine-learning/)
|
||||
2. [Azure Machine Learning Tutorials](https://docs.microsoft.com/azure/machine-learning/tutorial-1st-experiment-sdk-setup)
|
||||
3. [Azure Machine Learning Sample Notebooks on Github](https://github.com/Azure/azureml-examples)
|
||||
|
||||
## Getting started
|
||||
|
||||
### 1. Deploy Resources
|
||||
|
||||
Start by deploying the resources to Azure. The button below will deploy Azure Machine Learning and its related resources:
|
||||
|
||||
<a href="https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Fmicrosoft%2Fsolution-accelerator-many-models%2Fmaster%2Fazuredeploy.json" target="_blank">
|
||||
<img src="http://azuredeploy.net/deploybutton.png"/>
|
||||
</a>
|
||||
|
||||
### 2. Configure Development Environment
|
||||
|
||||
Next you'll need to configure your [development environment](https://docs.microsoft.com/azure/machine-learning/how-to-configure-environment) for Azure Machine Learning. We recommend using a [Compute Instance](https://docs.microsoft.com/azure/machine-learning/how-to-configure-environment#compute-instance) as it's the fastest way to get up and running.
|
||||
|
||||
### 3. Run Notebooks
|
||||
|
||||
Once your development environment is set up, run through the Jupyter Notebooks sequentially following the steps outlined. By the end, you'll know how to train, score, and make predictions using the many models pattern on Azure Machine Learning.
|
||||
|
||||

|
||||
|
||||
|
||||
## Contents
|
||||
|
||||
In this repo, you'll train and score a forecasting model for each orange juice brand and for each store at a (simulated) grocery chain. By the end, you'll have forecasted sales by using up to 11,973 models to predict sales for the next few weeks.
|
||||
|
||||
The data used in this sample is simulated based on the [Dominick's Orange Juice Dataset](http://www.cs.unitn.it/~taufer/QMMA/L10-OJ-Data.html#(1)), sales data from a Chicago area grocery store.
|
||||
|
||||
<img src="images/Flow_map.png" width="1000">
|
||||
|
||||
### Using Automated ML to train the models:
|
||||
|
||||
The [`auto-ml-forecasting-many-models.ipynb`](./auto-ml-forecasting-many-models.ipynb) notebook is a guided solution accelerator that demonstrates steps from data preparation, to model training, and forecasting on trained models as well as operationalizing the solution.
|
||||
|
||||
## How-to-videos
|
||||
|
||||
Watch these how-to-videos for a step by step walk-through of the many model solution accelerator to learn how to set up your models using Automated ML.
|
||||
|
||||
### Automated ML
|
||||
|
||||
[](https://channel9.msdn.com/Shows/Docs-AI/Building-Large-Scale-Machine-Learning-Forecasting-Models-using-Azure-Machine-Learnings-Automated-ML)
|
||||
|
||||
## Key concepts
|
||||
|
||||
### ParallelRunStep
|
||||
|
||||
[ParallelRunStep](https://docs.microsoft.com/en-us/python/api/azureml-pipeline-steps/azureml.pipeline.steps.parallel_run_step.parallelrunstep?view=azure-ml-py) enables the parallel training of models and is commonly used for batch inferencing. This [document](https://docs.microsoft.com/azure/machine-learning/how-to-use-parallel-run-step) walks through some of the key concepts around ParallelRunStep.
|
||||
|
||||
### Pipelines
|
||||
|
||||
[Pipelines](https://docs.microsoft.com/azure/machine-learning/concept-ml-pipelines) allow you to create workflows in your machine learning projects. These workflows have a number of benefits including speed, simplicity, repeatability, and modularity.
|
||||
|
||||
### Automated Machine Learning
|
||||
|
||||
[Automated Machine Learning](https://docs.microsoft.com/azure/machine-learning/concept-automated-ml) also referred to as automated ML or AutoML, is the process of automating the time consuming, iterative tasks of machine learning model development. It allows data scientists, analysts, and developers to build ML models with high scale, efficiency, and productivity all while sustaining model quality.
|
||||
|
||||
### Other Concepts
|
||||
|
||||
In addition to ParallelRunStep, Pipelines and Automated Machine Learning, you'll also be working with the following concepts including [workspace](https://docs.microsoft.com/azure/machine-learning/concept-workspace), [datasets](https://docs.microsoft.com/azure/machine-learning/concept-data#datasets), [compute targets](https://docs.microsoft.com/azure/machine-learning/concept-compute-target#train), [python script steps](https://docs.microsoft.com/python/api/azureml-pipeline-steps/azureml.pipeline.steps.python_script_step.pythonscriptstep?view=azure-ml-py), and [Azure Open Datasets](https://azure.microsoft.com/services/open-datasets/).
|
||||
|
||||
## Contributing
|
||||
|
||||
This project welcomes contributions and suggestions. To learn more visit the [contributing](../../../CONTRIBUTING.md) section.
|
||||
|
||||
Most contributions require you to agree to a Contributor License Agreement (CLA)
|
||||
declaring that you have the right to, and actually do, grant us
|
||||
the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
|
||||
|
||||
When you submit a pull request, a CLA bot will automatically determine whether you need to provide
|
||||
a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
|
||||
provided by the bot. You will only need to do this once across all repos using our CLA.
|
||||
|
||||
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
|
||||
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
|
||||
contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
|
||||
@@ -30,7 +30,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"For this notebook we are using a synthetic dataset portraying sales data to predict the quantity of a vartiety of product SKUs across several states, stores, and product categories.\n",
|
||||
"For this notebook we are using a synthetic dataset portraying sales data to predict the quantity of a variety of product skus across several states, stores, and product categories.\n",
|
||||
"\n",
|
||||
"**NOTE: There are limits on how many runs we can do in parallel per workspace, and we currently recommend to set the parallelism to maximum of 320 runs per experiment per workspace. If users want to have more parallelism and increase this limit they might encounter Too Many Requests errors (HTTP 429).**"
|
||||
]
|
||||
@@ -308,7 +308,7 @@
|
||||
"source": [
|
||||
"### Set up training parameters\n",
|
||||
"\n",
|
||||
"This dictionary defines the AutoML and many models settings. For this forecasting task we need to define several settings including the name of the time column, the maximum forecast horizon, and the partition column name definition.\n",
|
||||
"This dictionary defines the AutoML and many models settings. For this forecasting task we need to define several settings including the name of the time column, the maximum forecast horizon, and the partition column name definition.\n",
|
||||
"\n",
|
||||
"| Property | Description|\n",
|
||||
"| :--------------- | :------------------- |\n",
|
||||
@@ -554,12 +554,12 @@
|
||||
"| :--------------- | :------------------- |\n",
|
||||
"| **experiment** | The experiment used for inference run. |\n",
|
||||
"| **inference_data** | The data to use for inferencing. It should be the same schema as used for training.\n",
|
||||
"| **compute_target** | The compute target that runs the inference pipeline.|\n",
|
||||
"| **compute_target** | The compute target that runs the inference pipeline.|\n",
|
||||
"| **node_count** | The number of compute nodes to be used for running the user script. We recommend to start with the number of cores per node (varies by compute sku). |\n",
|
||||
"| **process_count_per_node** | The number of processes per node.\n",
|
||||
"| **train_run_id** | \\[Optional\\] The run id of the hierarchy training, by default it is the latest successful training many model run in the experiment. |\n",
|
||||
"| **train_experiment_name** | \\[Optional\\] The train experiment that contains the train pipeline. This one is only needed when the train pipeline is not in the same experiement as the inference pipeline. |\n",
|
||||
"| **process_count_per_node** | \\[Optional\\] The number of processes per node, by default it's 4. |"
|
||||
"| **process_count_per_node** | The number of processes per node. |\n",
|
||||
"| **train_run_id** | \\[Optional] The run id of the hierarchy training, by default it is the latest successful training many model run in the experiment. |\n",
|
||||
"| **train_experiment_name** | \\[Optional] The train experiment that contains the train pipeline. This one is only needed when the train pipeline is not in the same experiment as the inference pipeline. |\n",
|
||||
"| **process_count_per_node** | \\[Optional] The number of processes per node, by default it's 4. |"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -724,9 +724,9 @@
|
||||
"automated-machine-learning"
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.6 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python3-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
|
After Width: | Height: | Size: 32 KiB |
|
After Width: | Height: | Size: 306 KiB |
|
After Width: | Height: | Size: 2.6 MiB |
|
After Width: | Height: | Size: 106 KiB |
|
After Width: | Height: | Size: 158 KiB |
|
After Width: | Height: | Size: 80 KiB |
|
After Width: | Height: | Size: 68 KiB |
|
After Width: | Height: | Size: 631 KiB |
@@ -0,0 +1,3 @@
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-contrib-automl-pipeline-steps
|
||||
@@ -66,14 +66,14 @@
|
||||
"from azureml.automl.core.featurization import FeaturizationConfig\n",
|
||||
"from azureml.core.experiment import Experiment\n",
|
||||
"from azureml.core.workspace import Workspace\n",
|
||||
"from azureml.train.automl import AutoMLConfig\n"
|
||||
"from azureml.train.automl import AutoMLConfig"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This sample notebook may use features that are not available in previous versions of the Azure ML SDK."
|
||||
"This notebook is compatible with Azure ML SDK version 1.35.0 or later."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -82,7 +82,6 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"This notebook was created using version 1.38.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
@@ -367,7 +366,7 @@
|
||||
"|-|-|\n",
|
||||
"|**time_column_name**|The name of your time column.|\n",
|
||||
"|**forecast_horizon**|The forecast horizon is how many periods forward you would like to forecast. This integer horizon is in units of the timeseries frequency (e.g. daily, weekly).|\n",
|
||||
"|**time_series_id_column_names**|This optional parameter represents the column names used to uniquely identify the time series in data that has multiple rows with the same timestamp. If the time series identifiers are not defined or incorrectly defined, time series identifiers will be created automatically if they exist.|\n",
|
||||
"|**time_series_id_column_names**|The column names used to uniquely identify the time series in data that has multiple rows with the same timestamp. If the time series identifiers are not defined, the data set is assumed to be one time series.|\n",
|
||||
"|**freq**|Forecast frequency. This optional parameter represents the period with which the forecast is desired, for example, daily, weekly, yearly, etc. Use this parameter for the correction of time series containing irregular data points or for padding of short time series. The frequency needs to be a pandas offset alias. Please refer to [pandas documentation](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects) for more information."
|
||||
]
|
||||
},
|
||||
@@ -379,7 +378,7 @@
|
||||
"\n",
|
||||
"The [AutoMLConfig](https://docs.microsoft.com/en-us/python/api/azureml-train-automl-client/azureml.train.automl.automlconfig.automlconfig?view=azure-ml-py) object defines the settings and data for an AutoML training job. Here, we set necessary inputs like the task type, the number of AutoML iterations to try, the training data, and cross-validation parameters.\n",
|
||||
"\n",
|
||||
"For forecasting tasks, there are some additional parameters that can be set in the `ForecastingParameters` class: the name of the column holding the date/time, the timeseries id column names, and the maximum forecast horizon. A time column is required for forecasting, while the time_series_id is optional. If time_series_id columns are not given or incorrectly given, AutoML automatically creates time_series_id columns if they exist. We also pass a list of columns to drop prior to modeling. The _logQuantity_ column is completely correlated with the target quantity, so it must be removed to prevent a target leak.\n",
|
||||
"For forecasting tasks, there are some additional parameters that can be set in the `ForecastingParameters` class: the name of the column holding the date/time, the timeseries id column names, and the maximum forecast horizon. A time column is required for forecasting, while the time_series_id is optional. If time_series_id columns are not given, AutoML assumes that the whole dataset is a single time-series. We also pass a list of columns to drop prior to modeling. The _logQuantity_ column is completely correlated with the target quantity, so it must be removed to prevent a target leak.\n",
|
||||
"\n",
|
||||
"The forecast horizon is given in units of the time-series frequency; for instance, the OJ series frequency is weekly, so a horizon of 20 means that a trained model will estimate sales up to 20 weeks beyond the latest date in the training data for each series. In this example, we set the forecast horizon to the number of samples per series in the test set (n_test_periods). Generally, the value of this parameter will be dictated by business needs. For example, a demand planning application that estimates the next month of sales should set the horizon according to suitable planning time-scales. Please see the [energy_demand notebook](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand) for more discussion of forecast horizon.\n",
|
||||
"\n",
|
||||
@@ -422,6 +421,7 @@
|
||||
"forecasting_parameters = ForecastingParameters(\n",
|
||||
" time_column_name=time_column_name,\n",
|
||||
" forecast_horizon=n_test_periods,\n",
|
||||
" time_series_id_column_names=time_series_id_column_names,\n",
|
||||
" freq=\"W-THU\", # Set the forecast frequency to be weekly (start on each Thursday)\n",
|
||||
")\n",
|
||||
"\n",
|
||||
@@ -503,7 +503,9 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Download the featurization summary JSON file locally\n",
|
||||
"best_run.download_file(\"outputs/featurization_summary.json\", \"featurization_summary.json\")\n",
|
||||
"best_run.download_file(\n",
|
||||
" \"outputs/featurization_summary.json\", \"featurization_summary.json\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Render the JSON as a pandas DataFrame\n",
|
||||
"with open(\"featurization_summary.json\", \"r\") as f:\n",
|
||||
@@ -511,7 +513,15 @@
|
||||
"fs = pd.DataFrame.from_records(records)\n",
|
||||
"\n",
|
||||
"# View a summary of the featurization\n",
|
||||
"fs[[\"RawFeatureName\", \"TypeDetected\", \"Dropped\", \"EngineeredFeatureCount\", \"Transformations\"]]"
|
||||
"fs[\n",
|
||||
" [\n",
|
||||
" \"RawFeatureName\",\n",
|
||||
" \"TypeDetected\",\n",
|
||||
" \"Dropped\",\n",
|
||||
" \"EngineeredFeatureCount\",\n",
|
||||
" \"Transformations\",\n",
|
||||
" ]\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -538,7 +548,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Retreiving forecasts from the model\n",
|
||||
"### Retrieving forecasts from the model\n",
|
||||
"We have created a function called `run_forecast` that submits the test data to the best model determined during the training run and retrieves forecasts. This function uses a helper script `forecasting_script` which is uploaded and executed on the remote compute."
|
||||
]
|
||||
},
|
||||
@@ -808,9 +818,9 @@
|
||||
"friendly_name": "Forecasting orange juice sales with deployment",
|
||||
"index_order": 1,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.6 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python3-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -262,7 +262,7 @@
|
||||
" <li> test_name is the name of the test.\n",
|
||||
" <ul> \n",
|
||||
" <li> ADF: Augmented Dickey-Fuller test </li>\n",
|
||||
" <li> KPSS: Kwiatkowski-Phillips\u00e2\u20ac\u201cSchmidt\u00e2\u20ac\u201cShin test </li>\n",
|
||||
" <li> KPSS: Kwiatkowski-Phillips–Schmidt–Shin test </li>\n",
|
||||
"                        <li> PP: Phillips-Perron test </li>\n",
|
||||
" <li> ADF GLS: Augmented Dickey-Fuller using generalized least squares method </li>\n",
|
||||
" <li> AZ: Andrews-Zivot test </li>\n",
|
||||
@@ -472,9 +472,9 @@
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.6 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python3-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -387,8 +387,8 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Retrieve the best model\n",
|
||||
"Below we select the best model from all the training iterations using get_output method."
|
||||
"### Retrieve the Best Run details\n",
|
||||
"Below we retrieve the best Run object from among all the runs in the experiment."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -397,8 +397,8 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"best_run, fitted_model = remote_run.get_output()\n",
|
||||
"fitted_model.steps"
|
||||
"best_run = remote_run.get_best_child()\n",
|
||||
"best_run"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -571,9 +571,9 @@
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.6 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python3-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -0,0 +1,18 @@
|
||||
---
|
||||
page_type: sample
|
||||
languages:
|
||||
- python
|
||||
products:
|
||||
- azure-machine-learning
|
||||
description: Notebook showing how to use Azure Machine Learning pipelines to do Batch Predictions with an Image Classification model trained using AutoML.
|
||||
---
|
||||
|
||||
# Batch Scoring with an Image Classification Model
|
||||
- Dataset: Toy dataset with images of products found in a fridge
|
||||
- **[Jupyter Notebook](auto-ml-image-classification-multiclass-batch-scoring.ipynb)**
|
||||
- register an Image Classification Multi-Class model already trained using AutoML
|
||||
- create an Inference Dataset
|
||||
- provision compute targets and create a Batch Scoring script
|
||||
- use ParallelRunStep to do batch scoring
|
||||
- build, run, and publish a pipeline
|
||||
- enable a REST endpoint for the pipeline
|
||||
@@ -0,0 +1,950 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License.\n",
|
||||
"\n",
|
||||
"# Batch Predictions for an Image Classification model trained using AutoML\n",
|
||||
"In this notebook, we go over how you can use [Azure Machine Learning pipelines](https://docs.microsoft.com/en-us/azure/machine-learning/tutorial-pipeline-batch-scoring-classification) to run a batch scoring image classification job.\n",
|
||||
"\n",
|
||||
"**Please note:** For this notebook you can use an existing image classification model trained using AutoML for Images or use the simple model training we included below for convenience. For detailed instructions on how to train an image classification model with AutoML, please refer to the official [documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-image-models) and to the [image classification multiclass notebook](https://github.com/Azure/azureml-examples/blob/main/python-sdk/tutorials/automl-with-azureml/image-classification-multiclass/auto-ml-image-classification-multiclass.ipynb)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Important:** This feature is currently in public preview. This preview version is provided without a service-level agreement. Certain features might not be supported or might have constrained capabilities. For more information, see [Supplemental Terms of Use for Microsoft Azure Previews](https://azure.microsoft.com/en-us/support/legal/preview-supplemental-terms/)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Environment Setup\n",
|
||||
"Please follow the [\"Setup a new conda environment\"](https://github.com/Azure/azureml-examples/tree/main/python-sdk/tutorials/automl-with-azureml#3-setup-a-new-conda-environment) instructions to get started."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import azureml.core\n",
|
||||
"\n",
|
||||
"print(\"This notebook was created using version 1.35.0 of the Azure ML SDK.\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK.\")\n",
|
||||
"assert (\n",
|
||||
" azureml.core.VERSION >= \"1.35\"\n",
|
||||
"), \"Please upgrade the Azure ML SDK by running '!pip install --upgrade azureml-sdk' then restart the kernel.\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## You will perform the following tasks:\n",
|
||||
"\n",
|
||||
"* Register a Model already trained using AutoML for Image Classification.\n",
|
||||
"* Create an Inference Dataset.\n",
|
||||
"* Provision compute targets and create a Batch Scoring script.\n",
|
||||
"* Use ParallelRunStep to do batch scoring.\n",
|
||||
"* Build, run, and publish a pipeline.\n",
|
||||
"* Enable a REST endpoint for the pipeline."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Workspace setup\n",
|
||||
"\n",
|
||||
"An [Azure ML Workspace](https://docs.microsoft.com/en-us/azure/machine-learning/concept-azure-machine-learning-architecture#workspace) is an Azure resource that organizes and coordinates the actions of many other Azure resources to assist in executing and sharing machine learning workflows. In particular, an Azure ML Workspace coordinates storage, databases, and compute resources providing added functionality for machine learning experimentation, deployment, inference, and the monitoring of deployed models.\n",
|
||||
"\n",
|
||||
"Create an Azure ML Workspace within your Azure subscription or load an existing workspace."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.workspace import Workspace\n",
|
||||
"\n",
|
||||
"ws = Workspace.from_config()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Workspace default datastore is used to store inference input images and outputs"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def_data_store = ws.get_default_datastore()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Compute target setup\n",
|
||||
"You will need to provide a [Compute Target](https://docs.microsoft.com/en-us/azure/machine-learning/concept-azure-machine-learning-architecture#computes) that will be used for your AutoML model training. AutoML models for image tasks require [GPU SKUs](https://docs.microsoft.com/en-us/azure/virtual-machines/sizes-gpu) such as the ones from the NC, NCv2, NCv3, ND, NDv2 and NCasT4 series. We recommend using the NCsv3-series (with v100 GPUs) for faster training. Using a compute target with a multi-GPU VM SKU will leverage the multiple GPUs to speed up training. Additionally, setting up a compute target with multiple nodes will allow for faster model training by leveraging parallelism, when tuning hyperparameters for your model."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.compute import AmlCompute, ComputeTarget\n",
|
||||
"\n",
|
||||
"cluster_name = \"gpu-cluster-nc6\"\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" compute_target = ws.compute_targets[cluster_name]\n",
|
||||
" print(\"Found existing compute target.\")\n",
|
||||
"except KeyError:\n",
|
||||
" print(\"Creating a new compute target...\")\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(\n",
|
||||
" vm_size=\"Standard_NC6\",\n",
|
||||
" idle_seconds_before_scaledown=600,\n",
|
||||
" min_nodes=0,\n",
|
||||
" max_nodes=4,\n",
|
||||
" )\n",
|
||||
" compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n",
|
||||
"# Can poll for a minimum number of nodes and for a specific timeout.\n",
|
||||
"# If no min_node_count is provided, it will use the scale settings for the cluster.\n",
|
||||
"compute_target.wait_for_completion(\n",
|
||||
" show_output=True, min_node_count=None, timeout_in_minutes=20\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Train an Image Classification model\n",
|
||||
"\n",
|
||||
"In this section we will do a quick model train to use for the batch scoring. For a detailed example on how to train an image classification model, please refer to the official [documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-image-models) or to the [image classification multiclass notebook](https://github.com/Azure/azureml-examples/blob/main/python-sdk/tutorials/automl-with-azureml/image-classification-multiclass/auto-ml-image-classification-multiclass.ipynb). If you already have a model trained in the same workspace, you can skip to section [\"Create data objects\"](#Create-data-objects)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Experiment Setup"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Experiment\n",
|
||||
"\n",
|
||||
"experiment_name = \"automl-image-batchscoring\"\n",
|
||||
"experiment = Experiment(ws, name=experiment_name)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Download dataset with input Training Data\n",
|
||||
"\n",
|
||||
"All images in this notebook are hosted in [this repository](https://github.com/microsoft/computervision-recipes) and are made available under the [MIT license](https://github.com/microsoft/computervision-recipes/blob/master/LICENSE)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import urllib\n",
|
||||
"from zipfile import ZipFile\n",
|
||||
"\n",
|
||||
"# download data\n",
|
||||
"download_url = \"https://cvbp-secondary.z19.web.core.windows.net/datasets/image_classification/fridgeObjects.zip\"\n",
|
||||
"data_file = \"./fridgeObjects.zip\"\n",
|
||||
"urllib.request.urlretrieve(download_url, filename=data_file)\n",
|
||||
"\n",
|
||||
"# extract files\n",
|
||||
"with ZipFile(data_file, \"r\") as zip:\n",
|
||||
" print(\"extracting files...\")\n",
|
||||
" zip.extractall()\n",
|
||||
" print(\"done\")\n",
|
||||
"# delete zip file\n",
|
||||
"os.remove(data_file)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Convert the downloaded data to JSONL"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"src = \"./fridgeObjects/\"\n",
|
||||
"train_validation_ratio = 5\n",
|
||||
"\n",
|
||||
"# Retrieving default datastore that got automatically created when we setup a workspace\n",
|
||||
"workspaceblobstore = ws.get_default_datastore().name\n",
|
||||
"\n",
|
||||
"# Path to the training and validation files\n",
|
||||
"train_annotations_file = os.path.join(src, \"train_annotations.jsonl\")\n",
|
||||
"validation_annotations_file = os.path.join(src, \"validation_annotations.jsonl\")\n",
|
||||
"\n",
|
||||
"# sample json line dictionary\n",
|
||||
"json_line_sample = {\n",
|
||||
" \"image_url\": \"AmlDatastore://\"\n",
|
||||
" + workspaceblobstore\n",
|
||||
" + \"/\"\n",
|
||||
" + os.path.basename(os.path.dirname(src)),\n",
|
||||
" \"label\": \"\",\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"index = 0\n",
|
||||
"# Scan each subdirectory and generate jsonl line\n",
|
||||
"with open(train_annotations_file, \"w\") as train_f:\n",
|
||||
" with open(validation_annotations_file, \"w\") as validation_f:\n",
|
||||
" for className in os.listdir(src):\n",
|
||||
" subDir = src + className\n",
|
||||
" if not os.path.isdir(subDir):\n",
|
||||
" continue\n",
|
||||
"        # Scan each subdirectory\n",
|
||||
" print(\"Parsing \" + subDir)\n",
|
||||
" for image in os.listdir(subDir):\n",
|
||||
" json_line = dict(json_line_sample)\n",
|
||||
" json_line[\"image_url\"] += f\"/{className}/{image}\"\n",
|
||||
" json_line[\"label\"] = className\n",
|
||||
"\n",
|
||||
" if index % train_validation_ratio == 0:\n",
|
||||
" # validation annotation\n",
|
||||
" validation_f.write(json.dumps(json_line) + \"\\n\")\n",
|
||||
" else:\n",
|
||||
" # train annotation\n",
|
||||
" train_f.write(json.dumps(json_line) + \"\\n\")\n",
|
||||
" index += 1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Upload the JSONL file and images to Datastore"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Retrieving default datastore that got automatically created when we setup a workspace\n",
|
||||
"ds = ws.get_default_datastore()\n",
|
||||
"ds.upload(src_dir=\"./fridgeObjects\", target_path=\"fridgeObjects\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Create and register datasets in workspace"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Dataset\n",
|
||||
"from azureml.data import DataType\n",
|
||||
"\n",
|
||||
"# get existing training dataset\n",
|
||||
"training_dataset_name = \"fridgeObjectsTrainingDataset\"\n",
|
||||
"if training_dataset_name in ws.datasets:\n",
|
||||
" training_dataset = ws.datasets.get(training_dataset_name)\n",
|
||||
" print(\"Found the training dataset\", training_dataset_name)\n",
|
||||
"else:\n",
|
||||
" # create training dataset\n",
|
||||
" training_dataset = Dataset.Tabular.from_json_lines_files(\n",
|
||||
" path=ds.path(\"fridgeObjects/train_annotations.jsonl\"),\n",
|
||||
" set_column_types={\"image_url\": DataType.to_stream(ds.workspace)},\n",
|
||||
" )\n",
|
||||
" training_dataset = training_dataset.register(\n",
|
||||
" workspace=ws, name=training_dataset_name\n",
|
||||
" )\n",
|
||||
"# get existing validation dataset\n",
|
||||
"validation_dataset_name = \"fridgeObjectsValidationDataset\"\n",
|
||||
"if validation_dataset_name in ws.datasets:\n",
|
||||
" validation_dataset = ws.datasets.get(validation_dataset_name)\n",
|
||||
" print(\"Found the validation dataset\", validation_dataset_name)\n",
|
||||
"else:\n",
|
||||
" # create validation dataset\n",
|
||||
" validation_dataset = Dataset.Tabular.from_json_lines_files(\n",
|
||||
" path=ds.path(\"fridgeObjects/validation_annotations.jsonl\"),\n",
|
||||
" set_column_types={\"image_url\": DataType.to_stream(ds.workspace)},\n",
|
||||
" )\n",
|
||||
" validation_dataset = validation_dataset.register(\n",
|
||||
" workspace=ws, name=validation_dataset_name\n",
|
||||
" )\n",
|
||||
"print(\"Training dataset name: \" + training_dataset.name)\n",
|
||||
"print(\"Validation dataset name: \" + validation_dataset.name)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Submit 1 training run with default hyperparameters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.automl.core.shared.constants import ImageTask\n",
|
||||
"from azureml.train.automl import AutoMLImageConfig\n",
|
||||
"from azureml.train.hyperdrive import GridParameterSampling, choice\n",
|
||||
"\n",
|
||||
"image_config_vit = AutoMLImageConfig(\n",
|
||||
" task=ImageTask.IMAGE_CLASSIFICATION,\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" training_data=training_dataset,\n",
|
||||
" validation_data=validation_dataset,\n",
|
||||
" hyperparameter_sampling=GridParameterSampling({\"model_name\": choice(\"vitb16r224\")}),\n",
|
||||
" iterations=1,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"automl_image_run = experiment.submit(image_config_vit)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"automl_image_run.wait_for_completion(wait_post_processing=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create data objects\n",
|
||||
"\n",
|
||||
"When building pipelines, `Dataset` objects are used for reading data from workspace datastores, and `PipelineData` objects are used for transferring intermediate data between pipeline steps.\n",
|
||||
"\n",
|
||||
"This batch scoring example only uses one pipeline step, but in use-cases with multiple steps, the typical flow will include:\n",
|
||||
"\n",
|
||||
"1. Using `Dataset` objects as inputs to fetch raw data, performing some transformations, then outputting a `PipelineData` object. \n",
|
||||
"1. Use the previous step's `PipelineData` **output object** as an **input object**, repeated for subsequent steps.\n",
|
||||
"\n",
|
||||
"For this scenario you create `Dataset` objects corresponding to the datastore directories for the input images. You also create a `PipelineData` object for the batch scoring output data. An object reference in the `outputs` array becomes available as an **input** for a subsequent pipeline step, for scenarios where there is more than one step. In this case we are just going to build a single step pipeline.\n",
|
||||
"\n",
|
||||
"It is assumed that an image classification training run was already performed in this workspace and the files are already in the datastore. If this is not the case, please refer to the [documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-image-models) to know how to train an image classification model with AutoML.\n",
|
||||
"\n",
|
||||
"All images in this notebook are hosted in [this repository](https://github.com/microsoft/computervision-recipes) and are made available under the [MIT license](https://github.com/microsoft/computervision-recipes/blob/master/LICENSE)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.dataset import Dataset\n",
|
||||
"from azureml.pipeline.core import PipelineData\n",
|
||||
"\n",
|
||||
"input_images = Dataset.File.from_files((def_data_store, \"fridgeObjects/**/*.jpg\"))\n",
|
||||
"\n",
|
||||
"output_dir = PipelineData(name=\"scores\", datastore=def_data_store)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Next, we need to register the input datasets for batch scoring with the workspace."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"input_images = input_images.register(\n",
|
||||
" workspace=ws, name=\"fridgeObjects_scoring_images\", create_new_version=True\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Retrieve the environment and metrics from the training run"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.experiment import Experiment\n",
|
||||
"from azureml.core import Run\n",
|
||||
"\n",
|
||||
"experiment_name = \"automl-image-batchscoring\"\n",
|
||||
"# If your model was not trained with this notebook, replace the id below\n",
|
||||
"# with the run id of the child training run (i.e., the one ending with HD_0)\n",
|
||||
"training_run_id = automl_image_run.id + \"_HD_0\"\n",
|
||||
"exp = Experiment(ws, experiment_name)\n",
|
||||
"training_run = Run(exp, training_run_id)\n",
|
||||
"\n",
|
||||
"# The below will give only the requested metric\n",
|
||||
"metrics = training_run.get_metrics(\"accuracy\")\n",
|
||||
"best_metric = max(metrics[\"accuracy\"])\n",
|
||||
"print(\"best_metric:\", best_metric)\n",
|
||||
"\n",
|
||||
"# Retrieve the training environment\n",
|
||||
"env = training_run.get_environment()\n",
|
||||
"print(env)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Register model with metric and environment tags\n",
|
||||
"\n",
|
||||
"Now you register the model to your workspace, which allows you to easily retrieve it in the pipeline process. In the `register()` static function, the `model_name` parameter is the key you use to locate your model throughout the SDK.\n",
|
||||
"Tag the model with the metrics and the environment used to train the model."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.model import Model\n",
|
||||
"\n",
|
||||
"tags = dict()\n",
|
||||
"tags[\"accuracy\"] = best_metric\n",
|
||||
"tags[\"env_name\"] = env.name\n",
|
||||
"tags[\"env_version\"] = env.version\n",
|
||||
"\n",
|
||||
"model_name = \"fridgeObjectsClassifier\"\n",
|
||||
"model = training_run.register_model(\n",
|
||||
" model_name=model_name, model_path=\"train_artifacts\", tags=tags\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# List the models from the workspace\n",
|
||||
"models = Model.list(ws, name=model_name, latest=True)\n",
|
||||
"print(model.name)\n",
|
||||
"print(model.tags)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Write a scoring script"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"To do the scoring, you create a batch scoring script `batch_scoring.py`, and write it to the scripts folder in current directory. The script takes a minibatch of input images, applies the classification model, and outputs the predictions to a results file.\n",
|
||||
"\n",
|
||||
"The script `batch_scoring.py` takes the following parameters, which get passed from the `ParallelRunStep` that you create later:\n",
|
||||
"\n",
|
||||
"- `--model_name`: the name of the model being used\n",
|
||||
"\n",
|
||||
"While creating the batch scoring script, refer to the scoring scripts generated under the outputs folder of the AutoML training runs. This will help to identify the right model settings to be used in the batch scoring script init method while loading the model.\n",
|
||||
"Note: The batch scoring script we generate in the subsequent step is different from the scoring script generated by the training runs in the below screenshot. We refer to it just to identify the right model settings to be used in the batch scoring script.\n",
|
||||
"\n",
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# View the batch scoring script. Use the model settings as appropriate for your model.\n",
|
||||
"with open(\"./scripts/batch_scoring.py\", \"r\") as f:\n",
|
||||
" print(f.read())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Build and run the pipeline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create the parallel-run configuration to wrap the inference script\n",
|
||||
"Create the pipeline run configuration specifying the script, environment configuration, and parameters. Specify the compute target you already attached to your workspace as the target of execution of the script. This will set the run configuration of the ParallelRunStep we will define next.\n",
|
||||
"\n",
|
||||
"Refer to this [site](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/machine-learning-pipelines/parallel-run) for more details on ParallelRunStep of Azure Machine Learning Pipelines."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.pipeline.steps import ParallelRunConfig\n",
|
||||
"\n",
|
||||
"parallel_run_config = ParallelRunConfig(\n",
|
||||
" environment=env,\n",
|
||||
" entry_script=\"batch_scoring.py\",\n",
|
||||
" source_directory=\"scripts\",\n",
|
||||
" output_action=\"append_row\",\n",
|
||||
" append_row_file_name=\"parallel_run_step.txt\",\n",
|
||||
" mini_batch_size=\"20\", # Num files to process in one call\n",
|
||||
" error_threshold=1,\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" process_count_per_node=2,\n",
|
||||
" node_count=1,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Create the pipeline step\n",
|
||||
"\n",
|
||||
"A pipeline step is an object that encapsulates everything you need for running a pipeline including:\n",
|
||||
"\n",
|
||||
"* environment and dependency settings\n",
|
||||
"* the compute resource to run the pipeline on\n",
|
||||
"* input and output data, and any custom parameters\n",
|
||||
"* reference to a script to run during the step\n",
|
||||
"\n",
|
||||
"There are multiple classes that inherit from the parent class [`PipelineStep`](https://docs.microsoft.com/en-us/python/api/azureml-pipeline-steps/?view=azure-ml-py) to assist with building a step using certain frameworks and stacks. In this example, you use the [`ParallelRunStep`](https://docs.microsoft.com/en-us/python/api/azureml-contrib-pipeline-steps/azureml.contrib.pipeline.steps.parallelrunstep?view=azure-ml-py) class to define your step logic using a scoring script. `ParallelRunStep` executes the script in a distributed fashion.\n",
|
||||
"\n",
|
||||
"The pipelines infrastructure uses the `ArgumentParser` class to pass parameters into pipeline steps. For example, in the code below the first argument `--model_name` is given the property identifier `model_name`. In the `main()` function, this property is accessed using `Model.get_model_path(args.model_name)`.\n",
|
||||
"\n",
|
||||
"Note: The pipeline in this tutorial only has one step and writes the output to a file, but for multi-step pipelines, you also use `ArgumentParser` to define a directory to write output data for input to subsequent steps. See the [notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/nyc-taxi-data-regression-model-building.ipynb) for an example of passing data between multiple pipeline steps using the `ArgumentParser` design pattern."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.pipeline.steps import ParallelRunStep\n",
|
||||
"from datetime import datetime\n",
|
||||
"\n",
|
||||
"parallel_step_name = \"batchscoring-\" + datetime.now().strftime(\"%Y%m%d%H%M\")\n",
|
||||
"\n",
|
||||
"arguments = [\"--model_name\", model_name]\n",
|
||||
"\n",
|
||||
"# Specify inference batch_size, otherwise uses default value. (This is different from the mini_batch_size above)\n",
|
||||
"# NOTE: Large batch sizes may result in OOM errors.\n",
|
||||
"# arguments = arguments + [\"--batch_size\", \"20\"]\n",
|
||||
"\n",
|
||||
"batch_score_step = ParallelRunStep(\n",
|
||||
" name=parallel_step_name,\n",
|
||||
" inputs=[input_images.as_named_input(\"input_images\")],\n",
|
||||
" output=output_dir,\n",
|
||||
" arguments=arguments,\n",
|
||||
" parallel_run_config=parallel_run_config,\n",
|
||||
" allow_reuse=False,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"For a list of all classes for different step types, see the [steps package](https://docs.microsoft.com/python/api/azureml-pipeline-steps/azureml.pipeline.steps?view=azure-ml-py)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Run the pipeline\n",
|
||||
"\n",
|
||||
"Now you run the pipeline. First create a `Pipeline` object with your workspace reference and the pipeline step you created. The `steps` parameter is an array of steps, and in this case, there is only one step for batch scoring. To build pipelines with multiple steps, you place the steps in order in this array.\n",
|
||||
"\n",
|
||||
"Next use the `Experiment.submit()` function to submit the pipeline for execution. You also specify the custom parameter `param_batch_size`. The `wait_for_completion` function will output logs during the pipeline build process, which allows you to see current progress.\n",
|
||||
"\n",
|
||||
"Note: The first pipeline run takes roughly **15 minutes**, as all dependencies must be downloaded, a Docker image is created, and the Python environment is provisioned/created. Running it again takes significantly less time as those resources are reused. However, total run time depends on the workload of your scripts and processes running in each pipeline step."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Experiment\n",
|
||||
"from azureml.pipeline.core import Pipeline\n",
|
||||
"\n",
|
||||
"pipeline = Pipeline(workspace=ws, steps=[batch_score_step])\n",
|
||||
"pipeline_run = Experiment(ws, \"batch_scoring_automl_image\").submit(pipeline)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# This will output information of the pipeline run, including the link to the details page of portal.\n",
|
||||
"pipeline_run"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Wait the run for completion and show output log to console\n",
|
||||
"pipeline_run.wait_for_completion(show_output=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Download and review output"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import tempfile\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"batch_run = pipeline_run.find_step_run(batch_score_step.name)[0]\n",
|
||||
"batch_output = batch_run.get_output_data(output_dir.name)\n",
|
||||
"\n",
|
||||
"target_dir = tempfile.mkdtemp()\n",
|
||||
"batch_output.download(local_path=target_dir)\n",
|
||||
"result_file = os.path.join(\n",
|
||||
" target_dir, batch_output.path_on_datastore, parallel_run_config.append_row_file_name\n",
|
||||
")\n",
|
||||
"result_file\n",
|
||||
"\n",
|
||||
"# Print the first five lines of the output\n",
|
||||
"with open(result_file) as f:\n",
|
||||
" for x in range(5):\n",
|
||||
" print(next(f))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Choose a random file for visualization"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import random\n",
|
||||
"import json\n",
|
||||
"\n",
|
||||
"with open(result_file, \"r\") as f:\n",
|
||||
" contents = f.readlines()\n",
|
||||
"rand_file = contents[random.randrange(len(contents))]\n",
|
||||
"prediction = json.loads(rand_file)\n",
|
||||
"print(prediction[\"filename\"])\n",
|
||||
"print(prediction[\"probs\"])\n",
|
||||
"print(prediction[\"labels\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Download the image file from the datastore\n",
|
||||
"path = (\n",
|
||||
" \"fridgeObjects\"\n",
|
||||
" + \"/\"\n",
|
||||
" + prediction[\"filename\"].split(\"/\")[-2]\n",
|
||||
" + \"/\"\n",
|
||||
" + prediction[\"filename\"].split(\"/\")[-1]\n",
|
||||
")\n",
|
||||
"path_on_datastore = def_data_store.path(path)\n",
|
||||
"single_image_ds = Dataset.File.from_files(path=path_on_datastore, validate=False)\n",
|
||||
"image = single_image_ds.download()[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%matplotlib inline\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"import matplotlib.image as mpimg\n",
|
||||
"from PIL import Image\n",
|
||||
"import numpy as np\n",
|
||||
"import json\n",
|
||||
"\n",
|
||||
"IMAGE_SIZE = (18, 12)\n",
|
||||
"plt.figure(figsize=IMAGE_SIZE)\n",
|
||||
"img_np = mpimg.imread(image)\n",
|
||||
"img = Image.fromarray(img_np.astype(\"uint8\"), \"RGB\")\n",
|
||||
"x, y = img.size\n",
|
||||
"\n",
|
||||
"fig, ax = plt.subplots(1, figsize=(15, 15))\n",
|
||||
"# Display the image\n",
|
||||
"ax.imshow(img_np)\n",
|
||||
"\n",
|
||||
"label_index = np.argmax(prediction[\"probs\"])\n",
|
||||
"label = prediction[\"labels\"][label_index]\n",
|
||||
"conf_score = prediction[\"probs\"][label_index]\n",
|
||||
"\n",
|
||||
"display_text = \"{} ({})\".format(label, round(conf_score, 3))\n",
|
||||
"print(display_text)\n",
|
||||
"\n",
|
||||
"color = \"red\"\n",
|
||||
"plt.text(30, 30, display_text, color=color, fontsize=30)\n",
|
||||
"\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Publish and run from REST endpoint"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Run the following code to publish the pipeline to your workspace. In your workspace in the portal, you can see metadata for the pipeline including run history and durations. You can also run the pipeline manually from the portal.\n",
|
||||
"\n",
|
||||
"Additionally, publishing the pipeline enables a REST endpoint to rerun the pipeline from any HTTP library on any platform."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"published_pipeline = pipeline_run.publish_pipeline(\n",
|
||||
" name=\"automl-image-batch-scoring\",\n",
|
||||
" description=\"Batch scoring using Automl for Image\",\n",
|
||||
" version=\"1.0\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"published_pipeline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"To run the pipeline from the REST endpoint, you first need an OAuth2 Bearer-type authentication header. This example uses interactive authentication for illustration purposes, but for most production scenarios requiring automated or headless authentication, use service principal authentication as [described in this notebook](https://aka.ms/pl-restep-auth).\n",
|
||||
"\n",
|
||||
"Service principal authentication involves creating an **App Registration** in **Azure Active Directory**, generating a client secret, and then granting your service principal **role access** to your machine learning workspace. You then use the [`ServicePrincipalAuthentication`](https://docs.microsoft.com/python/api/azureml-core/azureml.core.authentication.serviceprincipalauthentication?view=azure-ml-py) class to manage your auth flow.\n",
|
||||
"\n",
|
||||
"Both `InteractiveLoginAuthentication` and `ServicePrincipalAuthentication` inherit from `AbstractAuthentication`, and in both cases you use the `get_authentication_header()` function in the same way to fetch the header."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.authentication import InteractiveLoginAuthentication\n",
|
||||
"\n",
|
||||
"interactive_auth = InteractiveLoginAuthentication()\n",
|
||||
"auth_header = interactive_auth.get_authentication_header()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Get the REST url from the `endpoint` property of the published pipeline object. You can also find the REST url in your workspace in the portal. Build an HTTP POST request to the endpoint, specifying your authentication header. Additionally, add a JSON payload object with the experiment name and the batch size parameter. As a reminder, the `process_count_per_node` is passed through to `ParallelRunStep` because you defined it is defined as a `PipelineParameter` object in the step configuration.\n",
|
||||
"\n",
|
||||
"Make the request to trigger the run. Access the `Id` key from the response dictionary to get the value of the run id."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"rest_endpoint = published_pipeline.endpoint\n",
|
||||
"response = requests.post(\n",
|
||||
" rest_endpoint,\n",
|
||||
" headers=auth_header,\n",
|
||||
" json={\n",
|
||||
" \"ExperimentName\": \"batch_scoring\",\n",
|
||||
" \"ParameterAssignments\": {\"process_count_per_node\": 2},\n",
|
||||
" },\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"try:\n",
|
||||
" response.raise_for_status()\n",
|
||||
"except Exception:\n",
|
||||
" raise Exception(\n",
|
||||
" \"Received bad response from the endpoint: {}\\n\"\n",
|
||||
" \"Response Code: {}\\n\"\n",
|
||||
" \"Headers: {}\\n\"\n",
|
||||
" \"Content: {}\".format(\n",
|
||||
" rest_endpoint, response.status_code, response.headers, response.content\n",
|
||||
" )\n",
|
||||
" )\n",
|
||||
"run_id = response.json().get(\"Id\")\n",
|
||||
"print(\"Submitted pipeline run: \", run_id)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Use the run id to monitor the status of the new run. This will take another 10-15 min to run and will look similar to the previous pipeline run, so if you don't need to see another pipeline run, you can skip watching the full output."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.pipeline.core.run import PipelineRun\n",
|
||||
"\n",
|
||||
"published_pipeline_run = PipelineRun(ws.experiments[\"batch_scoring\"], run_id)\n",
|
||||
"published_pipeline_run"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Wait the run for completion and show output log to console\n",
|
||||
"published_pipeline_run.wait_for_completion(show_output=True)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"authors": [
|
||||
{
|
||||
"name": [
|
||||
"sanpil",
|
||||
"trmccorm",
|
||||
"pansav"
|
||||
]
|
||||
}
|
||||
],
|
||||
"categories": [
|
||||
"tutorials"
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python3-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.8"
|
||||
},
|
||||
"metadata": {
|
||||
"interpreter": {
|
||||
"hash": "0f25b6eb4724eea488a4edd67dd290abce7d142c09986fc811384b5aebc0585a"
|
||||
}
|
||||
},
|
||||
"msauthor": "trbye"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
@@ -0,0 +1,69 @@
|
||||
# Copyright (c) Microsoft. All rights reserved.
|
||||
# Licensed under the MIT license.
|
||||
|
||||
import os
|
||||
import argparse
|
||||
import json
|
||||
|
||||
from azureml.core.model import Model
|
||||
from azureml.automl.core.shared import logging_utilities
|
||||
|
||||
try:
|
||||
from azureml.automl.dnn.vision.common.logging_utils import get_logger
|
||||
from azureml.automl.dnn.vision.common.model_export_utils import (
|
||||
load_model,
|
||||
run_inference_batch,
|
||||
)
|
||||
from azureml.automl.dnn.vision.classification.inference.score import (
|
||||
_score_with_model,
|
||||
)
|
||||
from azureml.automl.dnn.vision.common.utils import _set_logging_parameters
|
||||
except ImportError:
|
||||
from azureml.contrib.automl.dnn.vision.common.logging_utils import get_logger
|
||||
from azureml.contrib.automl.dnn.vision.common.model_export_utils import (
|
||||
load_model,
|
||||
run_inference_batch,
|
||||
)
|
||||
from azureml.contrib.automl.dnn.vision.classification.inference.score import (
|
||||
_score_with_model,
|
||||
)
|
||||
from azureml.contrib.automl.dnn.vision.common.utils import _set_logging_parameters
|
||||
|
||||
# Task identifier handed to _set_logging_parameters() and load_model() in init().
TASK_TYPE = "image-classification"
|
||||
# Module-level logger used by init() and run() for progress and error messages.
logger = get_logger("azureml.automl.core.scoring_script_images")
|
||||
|
||||
|
||||
def init():
    """Configure logging, parse the step arguments and load the model.

    Reads ``--model_name`` (required) and ``--batch_size`` (optional, int) from
    the command line, ignoring any arguments this script does not declare. The
    loaded model and the batch size are stored in the module-level globals
    ``model`` and ``batch_size``.

    Raises:
        Exception: any error raised while loading the model is logged together
            with its traceback and then re-raised unchanged.
    """
    global model, batch_size

    # Set up logging
    _set_logging_parameters(TASK_TYPE, {})

    arg_parser = argparse.ArgumentParser(
        description="Retrieve model_name and batch_size from arguments."
    )
    arg_parser.add_argument("--model_name", dest="model_name", required=True)
    arg_parser.add_argument(
        "--batch_size", dest="batch_size", type=int, required=False
    )
    # parse_known_args() tolerates extra arguments instead of exiting on them.
    known_args, _unrecognized = arg_parser.parse_known_args()

    # Stays None when --batch_size was not supplied.
    batch_size = known_args.batch_size

    model_dir = Model.get_model_path(known_args.model_name)
    model_path = os.path.join(model_dir, "model.pt")
    print(model_path)

    try:
        logger.info("Loading model from path: {}.".format(model_path))
        # No extra model settings are passed to the loader.
        model = load_model(TASK_TYPE, model_path)
        logger.info("Loading successful.")
    except Exception as load_error:
        logging_utilities.log_traceback(load_error, logger)
        raise
|
||||
|
||||
|
||||
def run(mini_batch):
    """Run inference on one mini-batch using the globals set up by ``init``.

    Args:
        mini_batch: the batch of inputs to score; it is forwarded unchanged to
            ``run_inference_batch``.

    Returns:
        Whatever ``run_inference_batch`` returns for this mini-batch.
    """
    logger.info("Running inference.")
    predictions = run_inference_batch(
        model, mini_batch, _score_with_model, batch_size
    )
    logger.info("Finished inferencing.")
    return predictions
|
||||
|
After Width: | Height: | Size: 258 KiB |
@@ -0,0 +1,15 @@
|
||||
---
|
||||
page_type: sample
|
||||
languages:
|
||||
- python
|
||||
products:
|
||||
- azure-machine-learning
|
||||
description: Notebook showing how to use AutoML for training an Image Classification Multi-Class model. We will use a small dataset to train the model, demonstrate how you can tune hyperparameters of the model to optimize model performance and deploy the model to use in inference scenarios.
|
||||
---
|
||||
|
||||
# Image Classification Multi-Class using AutoML for Images
|
||||
- Dataset: Toy dataset with images of products found in a fridge
|
||||
- **[Jupyter Notebook](auto-ml-image-classification-multiclass.ipynb)**
|
||||
- train an Image Classification Multi-Class model using AutoML
|
||||
- tune hyperparameters of the model to optimize model performance
|
||||
- deploy the model to use in inference scenarios
|
||||
@@ -0,0 +1,744 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License.\n",
|
||||
"\n",
|
||||
"# Training an Image Classification Multi-Class model using AutoML\n",
|
||||
"In this notebook, we go over how you can use AutoML for training an Image Classification Multi-Class model. We will use a small dataset to train the model, demonstrate how you can tune hyperparameters of the model to optimize model performance and deploy the model to use in inference scenarios. For detailed information please refer to the [documentation of AutoML for Images](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-image-models)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Important:** This feature is currently in public preview. This preview version is provided without a service-level agreement. Certain features might not be supported or might have constrained capabilities. For more information, see [Supplemental Terms of Use for Microsoft Azure Previews](https://azure.microsoft.com/en-us/support/legal/preview-supplemental-terms/)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Environment Setup\n",
|
||||
"Please follow the [\"Setup a new conda environment\"](https://github.com/Azure/azureml-examples/tree/main/python-sdk/tutorials/automl-with-azureml#3-setup-a-new-conda-environment) instructions to get started."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import azureml.core\n",
|
||||
"\n",
|
||||
"print(\"This notebook was created using version 1.35.0 of the Azure ML SDK.\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK.\")\n",
|
||||
"assert (\n",
|
||||
" azureml.core.VERSION >= \"1.35\"\n",
|
||||
"), \"Please upgrade the Azure ML SDK by running '!pip install --upgrade azureml-sdk' then restart the kernel.\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Workspace setup\n",
|
||||
"In order to train and deploy models in Azure ML, you will first need to set up a workspace.\n",
|
||||
"\n",
|
||||
"An [Azure ML Workspace](https://docs.microsoft.com/en-us/azure/machine-learning/concept-azure-machine-learning-architecture#workspace) is an Azure resource that organizes and coordinates the actions of many other Azure resources to assist in executing and sharing machine learning workflows. In particular, an Azure ML Workspace coordinates storage, databases, and compute resources providing added functionality for machine learning experimentation, deployment, inference, and the monitoring of deployed models.\n",
|
||||
"\n",
|
||||
"Create an Azure ML Workspace within your Azure subscription or load an existing workspace."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.workspace import Workspace\n",
|
||||
"\n",
|
||||
"ws = Workspace.from_config()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Compute target setup\n",
|
||||
"You will need to provide a [Compute Target](https://docs.microsoft.com/en-us/azure/machine-learning/concept-azure-machine-learning-architecture#computes) that will be used for your AutoML model training. AutoML models for image tasks require [GPU SKUs](https://docs.microsoft.com/en-us/azure/virtual-machines/sizes-gpu) such as the ones from the NC, NCv2, NCv3, ND, NDv2 and NCasT4 series. We recommend using the NCsv3-series (with v100 GPUs) for faster training. Using a compute target with a multi-GPU VM SKU will leverage the multiple GPUs to speed up training. Additionally, setting up a compute target with multiple nodes will allow for faster model training by leveraging parallelism, when tuning hyperparameters for your model."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.compute import AmlCompute, ComputeTarget\n",
|
||||
"\n",
|
||||
"cluster_name = \"gpu-cluster-nc6\"\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" compute_target = ws.compute_targets[cluster_name]\n",
|
||||
" print(\"Found existing compute target.\")\n",
|
||||
"except KeyError:\n",
|
||||
" print(\"Creating a new compute target...\")\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(\n",
|
||||
" vm_size=\"Standard_NC6\",\n",
|
||||
" idle_seconds_before_scaledown=600,\n",
|
||||
" min_nodes=0,\n",
|
||||
" max_nodes=4,\n",
|
||||
" )\n",
|
||||
" compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n",
|
||||
"# Can poll for a minimum number of nodes and for a specific timeout.\n",
|
||||
"# If no min_node_count is provided, it will use the scale settings for the cluster.\n",
|
||||
"compute_target.wait_for_completion(\n",
|
||||
" show_output=True, min_node_count=None, timeout_in_minutes=20\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Experiment Setup\n",
|
||||
"Create an [Experiment](https://docs.microsoft.com/en-us/azure/machine-learning/concept-azure-machine-learning-architecture#experiments) in your workspace to track your model training runs"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Experiment\n",
|
||||
"\n",
|
||||
"experiment_name = \"automl-image-multiclass\"\n",
|
||||
"experiment = Experiment(ws, name=experiment_name)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Dataset with input Training Data\n",
|
||||
"\n",
|
||||
"In order to generate models for computer vision, you will need to bring in labeled image data as input for model training in the form of an [AzureML Tabular Dataset](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.tabulardataset). You can either use a dataset that you have exported from a [Data Labeling](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-label-data) project, or create a new Tabular Dataset with your labeled training data."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In this notebook, we use a toy dataset called Fridge Objects, which consists of 134 images of 4 classes of beverage container {can, carton, milk bottle, water bottle} photos taken on different backgrounds.\n",
|
||||
"\n",
|
||||
"All images in this notebook are hosted in [this repository](https://github.com/microsoft/computervision-recipes) and are made available under the [MIT license](https://github.com/microsoft/computervision-recipes/blob/master/LICENSE).\n",
|
||||
"\n",
|
||||
"We first download and unzip the data locally."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import urllib\n",
|
||||
"from zipfile import ZipFile\n",
|
||||
"\n",
|
||||
"# download data\n",
|
||||
"download_url = \"https://cvbp-secondary.z19.web.core.windows.net/datasets/image_classification/fridgeObjects.zip\"\n",
|
||||
"data_file = \"./fridgeObjects.zip\"\n",
|
||||
"urllib.request.urlretrieve(download_url, filename=data_file)\n",
|
||||
"\n",
|
||||
"# extract files\n",
|
||||
"with ZipFile(data_file, \"r\") as zip:\n",
|
||||
" print(\"extracting files...\")\n",
|
||||
" zip.extractall()\n",
|
||||
" print(\"done\")\n",
|
||||
"# delete zip file\n",
|
||||
"os.remove(data_file)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This is a sample image from this dataset:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from IPython.display import Image\n",
|
||||
"\n",
|
||||
"sample_image = \"./fridgeObjects/milk_bottle/99.jpg\"\n",
|
||||
"Image(filename=sample_image)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Convert the downloaded data to JSONL\n",
|
||||
"In this example, the fridge object dataset is stored in a directory. There are four different folders inside:\n",
|
||||
"\n",
|
||||
"- /water_bottle\n",
|
||||
"- /milk_bottle\n",
|
||||
"- /carton\n",
|
||||
"- /can\n",
|
||||
"\n",
|
||||
"This is the most common data format for multiclass image classification. Each folder title corresponds to the image label for the images contained inside.\n",
|
||||
"\n",
|
||||
"In order to use this data to create an [AzureML Tabular Dataset](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.tabulardataset), we first need to convert it to the required JSONL format. Please refer to the [documentation on how to prepare datasets](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-prepare-datasets-for-automl-images).\n",
|
||||
"\n",
|
||||
"The following script is creating two .jsonl files (one for training and one for validation) in the parent folder of the dataset. The train / validation ratio corresponds to 20% of the data going into the validation file."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"src = \"./fridgeObjects/\"\n",
|
||||
"train_validation_ratio = 5\n",
|
||||
"\n",
|
||||
"# Retrieving default datastore that got automatically created when we setup a workspace\n",
|
||||
"workspaceblobstore = ws.get_default_datastore().name\n",
|
||||
"\n",
|
||||
"# Path to the training and validation files\n",
|
||||
"train_annotations_file = os.path.join(src, \"train_annotations.jsonl\")\n",
|
||||
"validation_annotations_file = os.path.join(src, \"validation_annotations.jsonl\")\n",
|
||||
"\n",
|
||||
"# sample json line dictionary\n",
|
||||
"json_line_sample = {\n",
|
||||
" \"image_url\": \"AmlDatastore://\"\n",
|
||||
" + workspaceblobstore\n",
|
||||
" + \"/\"\n",
|
||||
" + os.path.basename(os.path.dirname(src)),\n",
|
||||
" \"label\": \"\",\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"index = 0\n",
|
||||
"# Scan each sub directary and generate jsonl line\n",
|
||||
"with open(train_annotations_file, \"w\") as train_f:\n",
|
||||
" with open(validation_annotations_file, \"w\") as validation_f:\n",
|
||||
" for className in os.listdir(src):\n",
|
||||
" subDir = src + className\n",
|
||||
" if not os.path.isdir(subDir):\n",
|
||||
" continue\n",
|
||||
" # Scan each sub directary\n",
|
||||
" print(\"Parsing \" + subDir)\n",
|
||||
" for image in os.listdir(subDir):\n",
|
||||
" json_line = dict(json_line_sample)\n",
|
||||
" json_line[\"image_url\"] += f\"/{className}/{image}\"\n",
|
||||
" json_line[\"label\"] = className\n",
|
||||
"\n",
|
||||
" if index % train_validation_ratio == 0:\n",
|
||||
" # validation annotation\n",
|
||||
" validation_f.write(json.dumps(json_line) + \"\\n\")\n",
|
||||
" else:\n",
|
||||
" # train annotation\n",
|
||||
" train_f.write(json.dumps(json_line) + \"\\n\")\n",
|
||||
" index += 1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Upload the JSONL file and images to Datastore\n",
|
||||
"In order to use the data for training in Azure ML, we upload it to our Azure ML Workspace via a [Datastore](https://docs.microsoft.com/en-us/azure/machine-learning/concept-azure-machine-learning-architecture#datasets-and-datastores). The datastore provides a mechanism for you to upload/download data and interact with it from your remote compute targets. It is an abstraction over Azure Storage."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Retrieving default datastore that got automatically created when we setup a workspace\n",
|
||||
"ds = ws.get_default_datastore()\n",
|
||||
"ds.upload(src_dir=\"./fridgeObjects\", target_path=\"fridgeObjects\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Finally, we need to create an [AzureML Tabular Dataset](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.tabulardataset) from the data we uploaded to the Datastore. We create one dataset for training and one for validation."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Dataset\n",
|
||||
"from azureml.data import DataType\n",
|
||||
"\n",
|
||||
"# get existing training dataset\n",
|
||||
"training_dataset_name = \"fridgeObjectsTrainingDataset\"\n",
|
||||
"if training_dataset_name in ws.datasets:\n",
|
||||
" training_dataset = ws.datasets.get(training_dataset_name)\n",
|
||||
" print(\"Found the training dataset\", training_dataset_name)\n",
|
||||
"else:\n",
|
||||
" # create training dataset\n",
|
||||
" training_dataset = Dataset.Tabular.from_json_lines_files(\n",
|
||||
" path=ds.path(\"fridgeObjects/train_annotations.jsonl\"),\n",
|
||||
" set_column_types={\"image_url\": DataType.to_stream(ds.workspace)},\n",
|
||||
" )\n",
|
||||
" training_dataset = training_dataset.register(\n",
|
||||
" workspace=ws, name=training_dataset_name\n",
|
||||
" )\n",
|
||||
"# get existing validation dataset\n",
|
||||
"validation_dataset_name = \"fridgeObjectsValidationDataset\"\n",
|
||||
"if validation_dataset_name in ws.datasets:\n",
|
||||
" validation_dataset = ws.datasets.get(validation_dataset_name)\n",
|
||||
" print(\"Found the validation dataset\", validation_dataset_name)\n",
|
||||
"else:\n",
|
||||
" # create validation dataset\n",
|
||||
" validation_dataset = Dataset.Tabular.from_json_lines_files(\n",
|
||||
" path=ds.path(\"fridgeObjects/validation_annotations.jsonl\"),\n",
|
||||
" set_column_types={\"image_url\": DataType.to_stream(ds.workspace)},\n",
|
||||
" )\n",
|
||||
" validation_dataset = validation_dataset.register(\n",
|
||||
" workspace=ws, name=validation_dataset_name\n",
|
||||
" )\n",
|
||||
"print(\"Training dataset name: \" + training_dataset.name)\n",
|
||||
"print(\"Validation dataset name: \" + validation_dataset.name)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Validation dataset is optional. If no validation dataset is specified, by default 20% of your training data will be used for validation. You can control the percentage using the `split_ratio` argument - please refer to the [documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-image-models#model-agnostic-hyperparameters) for more details.\n",
|
||||
"\n",
|
||||
"This is what the training dataset looks like:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"training_dataset.to_pandas_dataframe()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Configuring your AutoML run for image tasks\n",
|
||||
"AutoML allows you to easily train models for Image Classification, Object Detection & Instance Segmentation on your image data. You can control the model algorithm to be used, specify hyperparameter values for your model as well as perform a sweep across the hyperparameter space to generate an optimal model. Parameters for configuring your AutoML Image run are specified using the `AutoMLImageConfig` - please refer to the [documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-image-models#configure-your-experiment-settings) for the details on the parameters that can be used and their values."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"When using AutoML for image tasks, you need to specify the model algorithms using the `model_name` parameter. You can either specify a single model or choose to sweep over multiple models. Please refer to the [documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-image-models#configure-model-algorithms-and-hyperparameters) for the list of supported model algorithms.\n",
|
||||
"\n",
|
||||
"### Using default hyperparameter values for the specified algorithm\n",
|
||||
"Before doing a large sweep to search for the optimal models and hyperparameters, we recommend trying the default values for a given model to get a first baseline. Next, you can explore multiple hyperparameters for the same model before sweeping over multiple models and their parameters. This allows an iterative approach, as with multiple models and multiple hyperparameters for each (as we showcase in the next section), the search space grows exponentially, and you need more iterations to find optimal configurations.\n",
|
||||
"\n",
|
||||
"If you wish to use the default hyperparameter values for a given algorithm (say `vitb16r224`), you can specify the config for your AutoML Image runs as follows:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.automl.core.shared.constants import ImageTask\n",
|
||||
"from azureml.train.automl import AutoMLImageConfig\n",
|
||||
"from azureml.train.hyperdrive import GridParameterSampling, choice\n",
|
||||
"\n",
|
||||
"image_config_vit = AutoMLImageConfig(\n",
|
||||
" task=ImageTask.IMAGE_CLASSIFICATION,\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" training_data=training_dataset,\n",
|
||||
" validation_data=validation_dataset,\n",
|
||||
" hyperparameter_sampling=GridParameterSampling({\"model_name\": choice(\"vitb16r224\")}),\n",
|
||||
" iterations=1,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Submitting an AutoML run for Computer Vision tasks\n",
|
||||
"Once you've created the config settings for your run, you can submit an AutoML run using the config in order to train a vision model using your training dataset."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"automl_image_run = experiment.submit(image_config_vit)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"automl_image_run.wait_for_completion(wait_post_processing=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Hyperparameter sweeping for your AutoML models for computer vision tasks\n",
|
||||
"In this example, we use the AutoMLImageConfig to train an Image Classification model using the following model algorithms: `seresnext`, `resnest50`, `vitb16r224`, and `vits16r224`.\n",
|
||||
"\n",
|
||||
"When using AutoML for Images, you can perform a hyperparameter sweep over a defined parameter space to find the optimal model. In this example, we sweep over the hyperparameters for each algorithm, choosing from a range of values for learning_rate, number_of_epochs, layers_to_freeze, etc., to generate a model with the optimal 'accuracy'. If hyperparameter values are not specified, then default values are used for the specified algorithm.\n",
|
||||
"\n",
|
||||
"We use Random Sampling to pick samples from this parameter space and try a total of 10 iterations with these different samples, running 2 iterations at a time on our compute target, which has been previously set up using 4 nodes. Please note that the more parameters the space has, the more iterations you need to find optimal models.\n",
|
||||
"\n",
|
||||
"We leverage the Bandit early termination policy, which terminates poorly performing configs (those that are not within 20% slack of the best performing config), thus significantly saving compute resources.\n",
|
||||
"\n",
|
||||
"For more details on model and hyperparameter sweeping, please refer to the [documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-tune-hyperparameters)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.automl.core.shared.constants import ImageTask\n",
|
||||
"from azureml.train.automl import AutoMLImageConfig\n",
|
||||
"from azureml.train.hyperdrive import BanditPolicy, RandomParameterSampling\n",
|
||||
"from azureml.train.hyperdrive import choice, uniform\n",
|
||||
"\n",
|
||||
"parameter_space = {\n",
|
||||
" \"learning_rate\": uniform(0.001, 0.01),\n",
|
||||
" \"model\": choice(\n",
|
||||
" {\n",
|
||||
" \"model_name\": choice(\"vitb16r224\", \"vits16r224\"),\n",
|
||||
" \"number_of_epochs\": choice(15, 30),\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"model_name\": choice(\"seresnext\", \"resnest50\"),\n",
|
||||
" \"layers_to_freeze\": choice(0, 2),\n",
|
||||
" },\n",
|
||||
" ),\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"tuning_settings = {\n",
|
||||
" \"iterations\": 10,\n",
|
||||
" \"max_concurrent_iterations\": 2,\n",
|
||||
" \"hyperparameter_sampling\": RandomParameterSampling(parameter_space),\n",
|
||||
" \"early_termination_policy\": BanditPolicy(\n",
|
||||
" evaluation_interval=2, slack_factor=0.2, delay_evaluation=6\n",
|
||||
" ),\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"automl_image_config = AutoMLImageConfig(\n",
|
||||
" task=ImageTask.IMAGE_CLASSIFICATION,\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" training_data=training_dataset,\n",
|
||||
" validation_data=validation_dataset,\n",
|
||||
" **tuning_settings,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"automl_image_run = experiment.submit(automl_image_config)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"automl_image_run.wait_for_completion(wait_post_processing=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"When doing a hyperparameter sweep, it can be useful to use the HyperDrive UI to visualize the different configurations that were tried. You can navigate to this UI by going to the 'Child runs' tab in the UI of the main `automl_image_run` from above, which is the HyperDrive parent run. Then you can go into the 'Child runs' tab of this HyperDrive parent run. Alternatively, the cell below displays the HyperDrive parent run directly, so you can navigate to its 'Child runs' tab from there:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Run\n",
|
||||
"\n",
|
||||
"hyperdrive_run = Run(experiment=experiment, run_id=automl_image_run.id + \"_HD\")\n",
|
||||
"hyperdrive_run"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Register the optimal vision model from the AutoML run\n",
|
||||
"Once the run completes, we can register the model that was created from the best run (configuration that resulted in the best primary metric)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Register the model from the best run\n",
|
||||
"\n",
|
||||
"best_child_run = automl_image_run.get_best_child()\n",
|
||||
"model_name = best_child_run.properties[\"model_name\"]\n",
|
||||
"model = best_child_run.register_model(\n",
|
||||
" model_name=model_name, model_path=\"outputs/model.pt\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Deploy model as a web service\n",
|
||||
"Once you have your trained model, you can deploy the model on Azure. You can deploy your trained model as a web service on Azure Container Instances ([ACI](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-deploy-azure-container-instance)) or Azure Kubernetes Service ([AKS](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-deploy-azure-kubernetes-service)). Please note that ACI only supports small models under 1 GB in size. For testing larger models or for the high-scale production stage, we recommend using AKS.\n",
|
||||
"In this tutorial, we will deploy the model as a web service in AKS."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You will need to first create an AKS compute cluster or use an existing AKS cluster. You can use either GPU or CPU VM SKUs for your deployment cluster"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.compute import ComputeTarget, AksCompute\n",
|
||||
"from azureml.exceptions import ComputeTargetException\n",
|
||||
"\n",
|
||||
"# Choose a name for your cluster\n",
|
||||
"aks_name = \"aks-cpu-mc\"\n",
|
||||
"# Check to see if the cluster already exists\n",
|
||||
"try:\n",
|
||||
" aks_target = ComputeTarget(workspace=ws, name=aks_name)\n",
|
||||
" print(\"Found existing compute target\")\n",
|
||||
"except ComputeTargetException:\n",
|
||||
" print(\"Creating a new compute target...\")\n",
|
||||
" # Provision AKS cluster with a CPU machine\n",
|
||||
" prov_config = AksCompute.provisioning_configuration(vm_size=\"STANDARD_D3_V2\")\n",
|
||||
" # Create the cluster\n",
|
||||
" aks_target = ComputeTarget.create(\n",
|
||||
" workspace=ws, name=aks_name, provisioning_configuration=prov_config\n",
|
||||
" )\n",
|
||||
" aks_target.wait_for_completion(show_output=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Next, you will need to define the [inference configuration](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-image-models#update-inference-configuration), which describes how to set up the web service containing your model. You can use the scoring script and the environment from the training run in your inference config.\n",
|
||||
"\n",
|
||||
"<b>Note:</b> To change the model's settings, open the downloaded scoring script and modify the model_settings variable <i>before</i> deploying the model."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.model import InferenceConfig\n",
|
||||
"\n",
|
||||
"best_child_run.download_file(\n",
|
||||
" \"outputs/scoring_file_v_1_0_0.py\", output_file_path=\"score.py\"\n",
|
||||
")\n",
|
||||
"environment = best_child_run.get_environment()\n",
|
||||
"inference_config = InferenceConfig(entry_script=\"score.py\", environment=environment)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You can then deploy the model as an AKS web service."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Deploy the model from the best run as an AKS web service\n",
|
||||
"from azureml.core.webservice import AksWebservice\n",
|
||||
"from azureml.core.model import Model\n",
|
||||
"\n",
|
||||
"aks_config = AksWebservice.deploy_configuration(\n",
|
||||
" autoscale_enabled=True, cpu_cores=1, memory_gb=5, enable_app_insights=True\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"aks_service = Model.deploy(\n",
|
||||
" ws,\n",
|
||||
" models=[model],\n",
|
||||
" inference_config=inference_config,\n",
|
||||
" deployment_config=aks_config,\n",
|
||||
" deployment_target=aks_target,\n",
|
||||
" name=\"automl-image-test-cpu-mc\",\n",
|
||||
" overwrite=True,\n",
|
||||
")\n",
|
||||
"aks_service.wait_for_deployment(show_output=True)\n",
|
||||
"print(aks_service.state)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Test the web service\n",
|
||||
"Finally, let's test our deployed web service to predict new images. You can pass in any image. In this case, we'll use a random image from the dataset and pass it to the scoring URI."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"# URL for the web service\n",
|
||||
"scoring_uri = aks_service.scoring_uri\n",
|
||||
"\n",
|
||||
"# If the service is authenticated, set the key or token\n",
|
||||
"key, _ = aks_service.get_keys()\n",
|
||||
"\n",
|
||||
"sample_image = \"./test_image.jpg\"\n",
|
||||
"\n",
|
||||
"# Load image data\n",
|
||||
"data = open(sample_image, \"rb\").read()\n",
|
||||
"\n",
|
||||
"# Set the content type\n",
|
||||
"headers = {\"Content-Type\": \"application/octet-stream\"}\n",
|
||||
"\n",
|
||||
"# If authentication is enabled, set the authorization header\n",
|
||||
"headers[\"Authorization\"] = f\"Bearer {key}\"\n",
|
||||
"\n",
|
||||
"# Make the request and display the response\n",
|
||||
"resp = requests.post(scoring_uri, data, headers=headers)\n",
|
||||
"print(resp.text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Visualize predictions\n",
|
||||
"Now that we have scored a test image, we can visualize the prediction for this image"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%matplotlib inline\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"import matplotlib.image as mpimg\n",
|
||||
"from PIL import Image\n",
|
||||
"import numpy as np\n",
|
||||
"import json\n",
|
||||
"\n",
|
||||
"IMAGE_SIZE = (18, 12)\n",
|
||||
"plt.figure(figsize=IMAGE_SIZE)\n",
|
||||
"img_np = mpimg.imread(sample_image)\n",
|
||||
"img = Image.fromarray(img_np.astype(\"uint8\"), \"RGB\")\n",
|
||||
"x, y = img.size\n",
|
||||
"\n",
|
||||
"fig, ax = plt.subplots(1, figsize=(15, 15))\n",
|
||||
"# Display the image\n",
|
||||
"ax.imshow(img_np)\n",
|
||||
"\n",
|
||||
"prediction = json.loads(resp.text)\n",
|
||||
"label_index = np.argmax(prediction[\"probs\"])\n",
|
||||
"label = prediction[\"labels\"][label_index]\n",
|
||||
"conf_score = prediction[\"probs\"][label_index]\n",
|
||||
"\n",
|
||||
"display_text = \"{} ({})\".format(label, round(conf_score, 3))\n",
|
||||
"print(display_text)\n",
|
||||
"\n",
|
||||
"color = \"red\"\n",
|
||||
"plt.text(30, 30, display_text, color=color, fontsize=30)\n",
|
||||
"\n",
|
||||
"plt.show()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python3-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.10"
|
||||
},
|
||||
"nteract": {
|
||||
"version": "nteract-front-end@1.0.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
|
After Width: | Height: | Size: 36 KiB |
|
After Width: | Height: | Size: 272 KiB |
@@ -0,0 +1,15 @@
|
||||
---
|
||||
page_type: sample
|
||||
languages:
|
||||
- python
|
||||
products:
|
||||
- azure-machine-learning
|
||||
description: Notebook showing how to use AutoML for training an Image Classification Multi-Label model. We will use a small dataset to train the model, demonstrate how you can tune hyperparameters of the model to optimize model performance and deploy the model to use in inference scenarios.
|
||||
---
|
||||
|
||||
# Image Classification Multi-Label using AutoML for Images
|
||||
- Dataset: Toy dataset with images of products found in a fridge
|
||||
- **[Jupyter Notebook](auto-ml-image-classification-multilabel.ipynb)**
|
||||
- train an Image Classification Multi-Label model using AutoML
|
||||
- tune hyperparameters of the model to optimize model performance
|
||||
- deploy the model to use in inference scenarios
|
||||
@@ -0,0 +1,742 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License.\n",
|
||||
"\n",
|
||||
"# Training an Image Classification Multi-Label model using AutoML\n",
|
||||
"In this notebook, we go over how you can use AutoML for training an Image Classification Multi-Label model. We will use a small dataset to train the model, demonstrate how you can tune hyperparameters of the model to optimize model performance and deploy the model to use in inference scenarios. For detailed information please refer to the [documentation of AutoML for Images](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-image-models)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Important:** This feature is currently in public preview. This preview version is provided without a service-level agreement. Certain features might not be supported or might have constrained capabilities. For more information, see [Supplemental Terms of Use for Microsoft Azure Previews](https://azure.microsoft.com/en-us/support/legal/preview-supplemental-terms/)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Environment Setup\n",
|
||||
"Please follow the [\"Setup a new conda environment\"](https://github.com/Azure/azureml-examples/tree/main/python-sdk/tutorials/automl-with-azureml#3-setup-a-new-conda-environment) instructions to get started."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import azureml.core\n",
|
||||
"\n",
|
||||
"print(\"This notebook was created using version 1.35.0 of the Azure ML SDK.\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK.\")\n",
|
||||
"assert (\n",
|
||||
" azureml.core.VERSION >= \"1.35\"\n",
|
||||
"), \"Please upgrade the Azure ML SDK by running '!pip install --upgrade azureml-sdk' then restart the kernel.\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Workspace setup\n",
|
||||
"In order to train and deploy models in Azure ML, you will first need to set up a workspace.\n",
|
||||
"\n",
|
||||
"An [Azure ML Workspace](https://docs.microsoft.com/en-us/azure/machine-learning/concept-azure-machine-learning-architecture#workspace) is an Azure resource that organizes and coordinates the actions of many other Azure resources to assist in executing and sharing machine learning workflows. In particular, an Azure ML Workspace coordinates storage, databases, and compute resources providing added functionality for machine learning experimentation, deployment, inference, and the monitoring of deployed models.\n",
|
||||
"\n",
|
||||
"Create an Azure ML Workspace within your Azure subscription or load an existing workspace."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.workspace import Workspace\n",
|
||||
"\n",
|
||||
"ws = Workspace.from_config()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Compute target setup\n",
|
||||
"You will need to provide a [Compute Target](https://docs.microsoft.com/en-us/azure/machine-learning/concept-azure-machine-learning-architecture#computes) that will be used for your AutoML model training. AutoML models for image tasks require [GPU SKUs](https://docs.microsoft.com/en-us/azure/virtual-machines/sizes-gpu) such as the ones from the NC, NCv2, NCv3, ND, NDv2 and NCasT4 series. We recommend using the NCsv3-series (with V100 GPUs) for faster training. Using a compute target with a multi-GPU VM SKU will leverage the multiple GPUs to speed up training. Additionally, setting up a compute target with multiple nodes will allow for faster model training by leveraging parallelism, when tuning hyperparameters for your model."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.compute import AmlCompute, ComputeTarget\n",
|
||||
"\n",
|
||||
"cluster_name = \"gpu-cluster-nc6\"\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" compute_target = ws.compute_targets[cluster_name]\n",
|
||||
" print(\"Found existing compute target.\")\n",
|
||||
"except KeyError:\n",
|
||||
" print(\"Creating a new compute target...\")\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(\n",
|
||||
" vm_size=\"Standard_NC6\",\n",
|
||||
" idle_seconds_before_scaledown=600,\n",
|
||||
" min_nodes=0,\n",
|
||||
" max_nodes=4,\n",
|
||||
" )\n",
|
||||
" compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n",
|
||||
"# Can poll for a minimum number of nodes and for a specific timeout.\n",
|
||||
"# If no min_node_count is provided, it will use the scale settings for the cluster.\n",
|
||||
"compute_target.wait_for_completion(\n",
|
||||
" show_output=True, min_node_count=None, timeout_in_minutes=20\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Experiment Setup\n",
|
||||
"Create an [Experiment](https://docs.microsoft.com/en-us/azure/machine-learning/concept-azure-machine-learning-architecture#experiments) in your workspace to track your model training runs"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Experiment\n",
|
||||
"\n",
|
||||
"experiment_name = \"automl-image-classification-multilabel\"\n",
|
||||
"experiment = Experiment(ws, name=experiment_name)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Dataset with input Training Data\n",
|
||||
"\n",
|
||||
"In order to generate models for computer vision, you will need to bring in labeled image data as input for model training in the form of an [AzureML Tabular Dataset](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.tabulardataset). You can either use a dataset that you have exported from a [Data Labeling](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-label-data) project, or create a new Tabular Dataset with your labeled training data."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In this notebook, we use a toy dataset called Fridge Objects, which consists of 128 photos of beverage containers taken on different backgrounds, annotated with 4 labels {can, carton, milk bottle, water bottle}. It also includes a labels file in .csv format. This is one of the most common data formats for Image Classification Multi-Label: one CSV file that maps the images in a folder to their labels.\n",
|
||||
"\n",
|
||||
"All images in this notebook are hosted in [this repository](https://github.com/microsoft/computervision-recipes) and are made available under the [MIT license](https://github.com/microsoft/computervision-recipes/blob/master/LICENSE).\n",
|
||||
"\n",
|
||||
"We first download and unzip the data locally."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import urllib\n",
|
||||
"from zipfile import ZipFile\n",
|
||||
"\n",
|
||||
"# download data\n",
|
||||
"download_url = \"https://cvbp-secondary.z19.web.core.windows.net/datasets/image_classification/multilabelFridgeObjects.zip\"\n",
|
||||
"data_file = \"./multilabelFridgeObjects.zip\"\n",
|
||||
"urllib.request.urlretrieve(download_url, filename=data_file)\n",
|
||||
"\n",
|
||||
"# extract files\n",
|
||||
"with ZipFile(data_file, \"r\") as zip:\n",
|
||||
" print(\"extracting files...\")\n",
|
||||
" zip.extractall()\n",
|
||||
" print(\"done\")\n",
|
||||
"# delete zip file\n",
|
||||
"os.remove(data_file)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This is a sample image from this dataset:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from IPython.display import Image\n",
|
||||
"\n",
|
||||
"sample_image = \"./multilabelFridgeObjects/images/56.jpg\"\n",
|
||||
"Image(filename=sample_image)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Convert the downloaded data to JSONL\n",
|
||||
"In this example, the fridge object dataset is annotated in a CSV file, where each image corresponds to a line. It defines a mapping of the filename to the labels. Since this is a multi-label classification problem, each image can be associated with multiple labels. In order to use this data to create an [AzureML Tabular Dataset](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.tabulardataset), we first need to convert it to the required JSONL format. Please refer to the [documentation on how to prepare datasets](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-prepare-datasets-for-automl-images).\n",
|
||||
"\n",
|
||||
"The following script creates two .jsonl files (one for training and one for validation) in the root folder of the dataset (`./multilabelFridgeObjects`). The train / validation ratio corresponds to 20% of the data going into the validation file."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"src = \"./multilabelFridgeObjects\"\n",
|
||||
"train_validation_ratio = 5\n",
|
||||
"\n",
|
||||
"# Retrieving default datastore that got automatically created when we setup a workspace\n",
|
||||
"workspaceblobstore = ws.get_default_datastore().name\n",
|
||||
"\n",
|
||||
"# Path to the labels file.\n",
|
||||
"labelFile = os.path.join(src, \"labels.csv\")\n",
|
||||
"\n",
|
||||
"# Path to the training and validation files\n",
|
||||
"train_annotations_file = os.path.join(src, \"train_annotations.jsonl\")\n",
|
||||
"validation_annotations_file = os.path.join(src, \"validation_annotations.jsonl\")\n",
|
||||
"\n",
|
||||
"# sample json line dictionary\n",
|
||||
"json_line_sample = {\n",
|
||||
" \"image_url\": \"AmlDatastore://\" + workspaceblobstore + \"/multilabelFridgeObjects\",\n",
|
||||
" \"label\": [],\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# Read each annotation and convert it to jsonl line\n",
|
||||
"with open(train_annotations_file, \"w\") as train_f:\n",
|
||||
" with open(validation_annotations_file, \"w\") as validation_f:\n",
|
||||
" with open(labelFile, \"r\") as labels:\n",
|
||||
" for i, line in enumerate(labels):\n",
|
||||
" # Skipping the title line and any empty lines.\n",
|
||||
" if i == 0 or len(line.strip()) == 0:\n",
|
||||
" continue\n",
|
||||
" line_split = line.strip().split(\",\")\n",
|
||||
" if len(line_split) != 2:\n",
|
||||
" print(\"Skipping the invalid line: {}\".format(line))\n",
|
||||
" continue\n",
|
||||
" json_line = dict(json_line_sample)\n",
|
||||
" json_line[\"image_url\"] += f\"/images/{line_split[0]}\"\n",
|
||||
" json_line[\"label\"] = line_split[1].strip().split(\" \")\n",
|
||||
"\n",
|
||||
" if i % train_validation_ratio == 0:\n",
|
||||
" # validation annotation\n",
|
||||
" validation_f.write(json.dumps(json_line) + \"\\n\")\n",
|
||||
" else:\n",
|
||||
" # train annotation\n",
|
||||
" train_f.write(json.dumps(json_line) + \"\\n\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Upload the JSONL file and images to Datastore\n",
|
||||
"In order to use the data for training in Azure ML, we upload it to our Azure ML Workspace via a [Datastore](https://docs.microsoft.com/en-us/azure/machine-learning/concept-azure-machine-learning-architecture#datasets-and-datastores). The datastore provides a mechanism for you to upload/download data and interact with it from your remote compute targets. It is an abstraction over Azure Storage."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Retrieving default datastore that got automatically created when we setup a workspace\n",
|
||||
"ds = ws.get_default_datastore()\n",
|
||||
"ds.upload(src_dir=\"./multilabelFridgeObjects\", target_path=\"multilabelFridgeObjects\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Finally, we need to create an [AzureML Tabular Dataset](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.tabulardataset) from the data we uploaded to the Datastore. We create one dataset for training and one for validation."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Dataset\n",
|
||||
"from azureml.data import DataType\n",
|
||||
"\n",
|
||||
"# get existing training dataset\n",
|
||||
"training_dataset_name = \"multilabelFridgeObjectsTrainingDataset\"\n",
|
||||
"if training_dataset_name in ws.datasets:\n",
|
||||
" training_dataset = ws.datasets.get(training_dataset_name)\n",
|
||||
" print(\"Found the training dataset\", training_dataset_name)\n",
|
||||
"else:\n",
|
||||
" # create training dataset\n",
|
||||
" training_dataset = Dataset.Tabular.from_json_lines_files(\n",
|
||||
" path=ds.path(\"multilabelFridgeObjects/train_annotations.jsonl\"),\n",
|
||||
" set_column_types={\"image_url\": DataType.to_stream(ds.workspace)},\n",
|
||||
" )\n",
|
||||
" training_dataset = training_dataset.register(\n",
|
||||
" workspace=ws, name=training_dataset_name\n",
|
||||
" )\n",
|
||||
"# get existing validation dataset\n",
|
||||
"validation_dataset_name = \"multilabelFridgeObjectsValidationDataset\"\n",
|
||||
"if validation_dataset_name in ws.datasets:\n",
|
||||
" validation_dataset = ws.datasets.get(validation_dataset_name)\n",
|
||||
" print(\"Found the validation dataset\", validation_dataset_name)\n",
|
||||
"else:\n",
|
||||
" # create validation dataset\n",
|
||||
" validation_dataset = Dataset.Tabular.from_json_lines_files(\n",
|
||||
" path=ds.path(\"multilabelFridgeObjects/validation_annotations.jsonl\"),\n",
|
||||
" set_column_types={\"image_url\": DataType.to_stream(ds.workspace)},\n",
|
||||
" )\n",
|
||||
" validation_dataset = validation_dataset.register(\n",
|
||||
" workspace=ws, name=validation_dataset_name\n",
|
||||
" )\n",
|
||||
"print(\"Training dataset name: \" + training_dataset.name)\n",
|
||||
"print(\"Validation dataset name: \" + validation_dataset.name)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Validation dataset is optional. If no validation dataset is specified, by default 20% of your training data will be used for validation. You can control the percentage using the `split_ratio` argument - please refer to the [documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-image-models#model-agnostic-hyperparameters) for more details.\n",
|
||||
"\n",
|
||||
"This is what the training dataset looks like:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"training_dataset.to_pandas_dataframe()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Configuring your AutoML run for image tasks\n",
|
||||
"AutoML allows you to easily train models for Image Classification, Object Detection & Instance Segmentation on your image data. You can control the model algorithm to be used, specify hyperparameter values for your model as well as perform a sweep across the hyperparameter space to generate an optimal model. Parameters for configuring your AutoML Image run are specified using the `AutoMLImageConfig` - please refer to the [documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-image-models#configure-your-experiment-settings) for the details on the parameters that can be used and their values."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"When using AutoML for image tasks, you need to specify the model algorithms using the `model_name` parameter. You can either specify a single model or choose to sweep over multiple models. Please refer to the [documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-image-models#configure-model-algorithms-and-hyperparameters) for the list of supported model algorithms.\n",
|
||||
"\n",
|
||||
"### Using default hyperparameter values for the specified algorithm\n",
|
||||
"Before doing a large sweep to search for the optimal models and hyperparameters, we recommend trying the default values for a given model to get a first baseline. Next, you can explore multiple hyperparameters for the same model before sweeping over multiple models and their parameters. This allows an iterative approach, as with multiple models and multiple hyperparameters for each (as we showcase in the next section), the search space grows exponentially, and you need more iterations to find optimal configurations.\n",
|
||||
"\n",
|
||||
"If you wish to use the default hyperparameter values for a given algorithm (say `vitb16r224`), you can specify the config for your AutoML Image runs as follows:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.automl.core.shared.constants import ImageTask\n",
|
||||
"from azureml.train.automl import AutoMLImageConfig\n",
|
||||
"from azureml.train.hyperdrive import GridParameterSampling, choice\n",
|
||||
"\n",
|
||||
"image_config_vit = AutoMLImageConfig(\n",
|
||||
" task=ImageTask.IMAGE_CLASSIFICATION_MULTILABEL,\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" training_data=training_dataset,\n",
|
||||
" validation_data=validation_dataset,\n",
|
||||
" hyperparameter_sampling=GridParameterSampling({\"model_name\": choice(\"vitb16r224\")}),\n",
|
||||
" iterations=1,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Submitting an AutoML run for Computer Vision tasks\n",
|
||||
"Once you've created the config settings for your run, you can submit an AutoML run using the config in order to train a vision model using your training dataset."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"automl_image_run = experiment.submit(image_config_vit)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"automl_image_run.wait_for_completion(wait_post_processing=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Hyperparameter sweeping for your AutoML models for computer vision tasks\n",
|
||||
"In this example, we use the AutoMLImageConfig to train an Image Classification model using the `vitb16r224` and `seresnext` model algorithms.\n",
|
||||
"\n",
|
||||
"When using AutoML for Images, you can perform a hyperparameter sweep over a defined parameter space to find the optimal model. In this example, we sweep over the hyperparameters for each algorithm, choosing from a range of values for learning_rate, grad_accumulation_step, valid_resize_size, etc., to generate a model with the optimal 'accuracy'. If hyperparameter values are not specified, then default values are used for the specified algorithm.\n",
|
||||
"\n",
|
||||
"We use Random Sampling to pick samples from this parameter space and try a total of 10 iterations with these different samples, running 2 iterations at a time on our compute target, which has been previously set up using 4 nodes. Please note that the more parameters the space has, the more iterations you need to find optimal models.\n",
|
||||
"\n",
|
||||
"We leverage the Bandit early termination policy which will terminate poor performing configs (those that are not within 20% slack of the best performing config), thus significantly saving compute resources.\n",
|
||||
"\n",
|
||||
"For more details on model and hyperparameter sweeping, please refer to the [documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-tune-hyperparameters)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.automl.core.shared.constants import ImageTask\n",
|
||||
"from azureml.train.automl import AutoMLImageConfig\n",
|
||||
"from azureml.train.hyperdrive import BanditPolicy, RandomParameterSampling\n",
|
||||
"from azureml.train.hyperdrive import choice, uniform\n",
|
||||
"\n",
|
||||
"parameter_space = {\n",
|
||||
" \"learning_rate\": uniform(0.005, 0.05),\n",
|
||||
" \"model\": choice(\n",
|
||||
" {\n",
|
||||
" \"model_name\": choice(\"vitb16r224\"),\n",
|
||||
" \"number_of_epochs\": choice(15, 30),\n",
|
||||
" \"grad_accumulation_step\": choice(1, 2),\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"model_name\": choice(\"seresnext\"),\n",
|
||||
" # model-specific, valid_resize_size should be larger than or equal to valid_crop_size\n",
|
||||
" \"valid_resize_size\": choice(288, 320, 352),\n",
|
||||
" \"valid_crop_size\": choice(224, 256), # model-specific\n",
|
||||
" \"train_crop_size\": choice(224, 256), # model-specific\n",
|
||||
" },\n",
|
||||
" ),\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"tuning_settings = {\n",
|
||||
" \"iterations\": 10,\n",
|
||||
" \"max_concurrent_iterations\": 2,\n",
|
||||
" \"hyperparameter_sampling\": RandomParameterSampling(parameter_space),\n",
|
||||
" \"early_termination_policy\": BanditPolicy(\n",
|
||||
" evaluation_interval=2, slack_factor=0.2, delay_evaluation=6\n",
|
||||
" ),\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"automl_image_config = AutoMLImageConfig(\n",
|
||||
" task=ImageTask.IMAGE_CLASSIFICATION_MULTILABEL,\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" training_data=training_dataset,\n",
|
||||
" validation_data=validation_dataset,\n",
|
||||
" **tuning_settings,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"automl_image_run = experiment.submit(automl_image_config)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"automl_image_run.wait_for_completion(wait_post_processing=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"When doing a hyperparameter sweep, it can be useful to visualize the different configurations that were tried using the HyperDrive UI. You can navigate to this UI by going to the 'Child runs' tab in the UI of the main `automl_image_run` from above, which is the HyperDrive parent run. Then you can go into the 'Child runs' tab of this HyperDrive parent run. Alternatively, the cell below displays the HyperDrive parent run directly, so you can navigate to its 'Child runs' tab from there:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Run\n",
|
||||
"\n",
|
||||
"hyperdrive_run = Run(experiment=experiment, run_id=automl_image_run.id + \"_HD\")\n",
|
||||
"hyperdrive_run"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Register the optimal vision model from the AutoML run\n",
|
||||
"Once the run completes, we can register the model that was created from the best run (configuration that resulted in the best primary metric)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Register the model from the best run\n",
|
||||
"\n",
|
||||
"best_child_run = automl_image_run.get_best_child()\n",
|
||||
"model_name = best_child_run.properties[\"model_name\"]\n",
|
||||
"model = best_child_run.register_model(\n",
|
||||
" model_name=model_name, model_path=\"outputs/model.pt\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Deploy model as a web service\n",
|
||||
"Once you have your trained model, you can deploy the model on Azure. You can deploy your trained model as a web service on Azure Container Instances ([ACI](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-deploy-azure-container-instance)) or Azure Kubernetes Service ([AKS](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-deploy-azure-kubernetes-service)). Please note that ACI only supports small models under 1 GB in size. For testing larger models or for the high-scale production stage, we recommend using AKS.\n",
|
||||
"In this tutorial, we will deploy the model as a web service in AKS."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You will need to first create an AKS compute cluster or use an existing AKS cluster. You can use either GPU or CPU VM SKUs for your deployment cluster."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.compute import ComputeTarget, AksCompute\n",
|
||||
"from azureml.exceptions import ComputeTargetException\n",
|
||||
"\n",
|
||||
"# Choose a name for your cluster\n",
|
||||
"aks_name = \"aks-cpu-ml\"\n",
|
||||
"# Check to see if the cluster already exists\n",
|
||||
"try:\n",
|
||||
" aks_target = ComputeTarget(workspace=ws, name=aks_name)\n",
|
||||
" print(\"Found existing compute target\")\n",
|
||||
"except ComputeTargetException:\n",
|
||||
" print(\"Creating a new compute target...\")\n",
|
||||
" # Provision AKS cluster with a CPU machine\n",
|
||||
" prov_config = AksCompute.provisioning_configuration(vm_size=\"STANDARD_D3_V2\")\n",
|
||||
" # Create the cluster\n",
|
||||
" aks_target = ComputeTarget.create(\n",
|
||||
" workspace=ws, name=aks_name, provisioning_configuration=prov_config\n",
|
||||
" )\n",
|
||||
" aks_target.wait_for_completion(show_output=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Next, you will need to define the [inference configuration](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-image-models#update-inference-configuration), that describes how to set up the web-service containing your model. You can use the scoring script and the environment from the training run in your inference config.\n",
|
||||
"\n",
|
||||
"<b>Note:</b> To change the model's settings, open the downloaded scoring script and modify the model_settings variable <i>before</i> deploying the model."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.model import InferenceConfig\n",
|
||||
"\n",
|
||||
"best_child_run.download_file(\n",
|
||||
" \"outputs/scoring_file_v_1_0_0.py\", output_file_path=\"score.py\"\n",
|
||||
")\n",
|
||||
"environment = best_child_run.get_environment()\n",
|
||||
"inference_config = InferenceConfig(entry_script=\"score.py\", environment=environment)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You can then deploy the model as an AKS web service."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Deploy the model from the best run as an AKS web service\n",
|
||||
"from azureml.core.webservice import AksWebservice\n",
|
||||
"from azureml.core.model import Model\n",
|
||||
"\n",
|
||||
"aks_config = AksWebservice.deploy_configuration(\n",
|
||||
" autoscale_enabled=True, cpu_cores=1, memory_gb=5, enable_app_insights=True\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"aks_service = Model.deploy(\n",
|
||||
" ws,\n",
|
||||
" models=[model],\n",
|
||||
" inference_config=inference_config,\n",
|
||||
" deployment_config=aks_config,\n",
|
||||
" deployment_target=aks_target,\n",
|
||||
" name=\"automl-image-test-cpu-ml\",\n",
|
||||
" overwrite=True,\n",
|
||||
")\n",
|
||||
"aks_service.wait_for_deployment(show_output=True)\n",
|
||||
"print(aks_service.state)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Test the web service\n",
|
||||
"Finally, let's test our deployed web service to predict new images. You can pass in any image. In this case, we'll use a random image from the dataset and pass it to the scoring URI."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"from IPython.display import Image\n",
|
||||
"\n",
|
||||
"# URL for the web service\n",
|
||||
"scoring_uri = aks_service.scoring_uri\n",
|
||||
"\n",
|
||||
"# If the service is authenticated, set the key or token\n",
|
||||
"key, _ = aks_service.get_keys()\n",
|
||||
"\n",
|
||||
"sample_image = \"./test_image.jpg\"\n",
|
||||
"\n",
|
||||
"# Load image data\n",
|
||||
"data = open(sample_image, \"rb\").read()\n",
|
||||
"\n",
|
||||
"# Set the content type\n",
|
||||
"headers = {\"Content-Type\": \"application/octet-stream\"}\n",
|
||||
"\n",
|
||||
"# If authentication is enabled, set the authorization header\n",
|
||||
"headers[\"Authorization\"] = f\"Bearer {key}\"\n",
|
||||
"\n",
|
||||
"# Make the request and display the response\n",
|
||||
"resp = requests.post(scoring_uri, data, headers=headers)\n",
|
||||
"print(resp.text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Visualize predictions\n",
|
||||
"Now that we have scored a test image, we can visualize the predictions for this image"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%matplotlib inline\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"import matplotlib.image as mpimg\n",
|
||||
"from PIL import Image\n",
|
||||
"import json\n",
|
||||
"\n",
|
||||
"IMAGE_SIZE = (18, 12)\n",
|
||||
"plt.figure(figsize=IMAGE_SIZE)\n",
|
||||
"img_np = mpimg.imread(sample_image)\n",
|
||||
"img = Image.fromarray(img_np.astype(\"uint8\"), \"RGB\")\n",
|
||||
"x, y = img.size\n",
|
||||
"\n",
|
||||
"fig, ax = plt.subplots(1, figsize=(15, 15))\n",
|
||||
"# Display the image\n",
|
||||
"ax.imshow(img_np)\n",
|
||||
"\n",
|
||||
"prediction = json.loads(resp.text)\n",
|
||||
"score_threshold = 0.5\n",
|
||||
"\n",
|
||||
"label_offset_x = 30\n",
|
||||
"label_offset_y = 30\n",
|
||||
"for index, score in enumerate(prediction[\"probs\"]):\n",
|
||||
" if score > score_threshold:\n",
|
||||
" label = prediction[\"labels\"][index]\n",
|
||||
" display_text = \"{} ({})\".format(label, round(score, 3))\n",
|
||||
" print(display_text)\n",
|
||||
"\n",
|
||||
" color = \"red\"\n",
|
||||
" plt.text(label_offset_x, label_offset_y, display_text, color=color, fontsize=30)\n",
|
||||
" label_offset_y += 30\n",
|
||||
"plt.show()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python3-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.10"
|
||||
},
|
||||
"nteract": {
|
||||
"version": "nteract-front-end@1.0.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
|
After Width: | Height: | Size: 155 KiB |
|
After Width: | Height: | Size: 160 KiB |
@@ -0,0 +1,15 @@
|
||||
---
|
||||
page_type: sample
|
||||
languages:
|
||||
- python
|
||||
products:
|
||||
- azure-machine-learning
|
||||
description: Notebook showing how to use AutoML for training an Instance Segmentation model. We will use a small dataset to train the model, demonstrate how you can tune hyperparameters of the model to optimize model performance and deploy the model to use in inference scenarios.
|
||||
---
|
||||
|
||||
# Instance Segmentation using AutoML for Images
|
||||
- Dataset: Toy dataset with images of products found in a fridge
|
||||
- **[Jupyter Notebook](auto-ml-image-instance-segmentation.ipynb)**
|
||||
- train an Instance Segmentation model using AutoML
|
||||
- tune hyperparameters of the model to optimize model performance
|
||||
- deploy the model to use in inference scenarios
|
||||
@@ -0,0 +1,769 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License.\n",
|
||||
"\n",
|
||||
"# Training an Instance Segmentation model using AutoML\n",
|
||||
"In this notebook, we go over how you can use AutoML for training an Instance Segmentation model. We will use a small dataset to train the model, demonstrate how you can tune hyperparameters of the model to optimize model performance and deploy the model to use in inference scenarios. For detailed information please refer to the [documentation of AutoML for Images](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-image-models)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Important:** This feature is currently in public preview. This preview version is provided without a service-level agreement. Certain features might not be supported or might have constrained capabilities. For more information, see [Supplemental Terms of Use for Microsoft Azure Previews](https://azure.microsoft.com/en-us/support/legal/preview-supplemental-terms/)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Environment Setup\n",
|
||||
"Please follow the [\"Setup a new conda environment\"](https://github.com/Azure/azureml-examples/tree/main/python-sdk/tutorials/automl-with-azureml#3-setup-a-new-conda-environment) instructions to get started."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import azureml.core\n",
|
||||
"\n",
|
||||
"print(\"This notebook was created using version 1.35.0 of the Azure ML SDK.\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK.\")\n",
|
||||
"assert (\n",
|
||||
" azureml.core.VERSION >= \"1.35\"\n",
|
||||
"), \"Please upgrade the Azure ML SDK by running '!pip install --upgrade azureml-sdk' then restart the kernel.\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Additional environment setup\n",
|
||||
"You will need to install these additional packages below to run this notebook:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install \"scikit-image==0.17.2\" \"simplification==0.5.1\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Workspace setup\n",
|
||||
"In order to train and deploy models in Azure ML, you will first need to set up a workspace.\n",
|
||||
"\n",
|
||||
"An [Azure ML Workspace](https://docs.microsoft.com/en-us/azure/machine-learning/concept-azure-machine-learning-architecture#workspace) is an Azure resource that organizes and coordinates the actions of many other Azure resources to assist in executing and sharing machine learning workflows. In particular, an Azure ML Workspace coordinates storage, databases, and compute resources providing added functionality for machine learning experimentation, deployment, inference, and the monitoring of deployed models.\n",
|
||||
"\n",
|
||||
"Create an Azure ML Workspace within your Azure subscription or load an existing workspace."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.workspace import Workspace\n",
|
||||
"\n",
|
||||
"ws = Workspace.from_config()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Compute target setup\n",
|
||||
"You will need to provide a [Compute Target](https://docs.microsoft.com/en-us/azure/machine-learning/concept-azure-machine-learning-architecture#computes) that will be used for your AutoML model training. AutoML models for image tasks require [GPU SKUs](https://docs.microsoft.com/en-us/azure/virtual-machines/sizes-gpu) such as the ones from the NC, NCv2, NCv3, ND, NDv2 and NCasT4 series. We recommend using the NCsv3-series (with v100 GPUs) for faster training. Using a compute target with a multi-GPU VM SKU will leverage the multiple GPUs to speed up training. Additionally, setting up a compute target with multiple nodes will allow for faster model training by leveraging parallelism, when tuning hyperparameters for your model."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.compute import AmlCompute, ComputeTarget\n",
|
||||
"\n",
|
||||
"cluster_name = \"gpu-cluster-nc6\"\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" compute_target = ws.compute_targets[cluster_name]\n",
|
||||
" print(\"Found existing compute target.\")\n",
|
||||
"except KeyError:\n",
|
||||
" print(\"Creating a new compute target...\")\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(\n",
|
||||
" vm_size=\"Standard_NC6\",\n",
|
||||
" idle_seconds_before_scaledown=600,\n",
|
||||
" min_nodes=0,\n",
|
||||
" max_nodes=4,\n",
|
||||
" )\n",
|
||||
" compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n",
|
||||
"# Can poll for a minimum number of nodes and for a specific timeout.\n",
|
||||
"# If no min_node_count is provided, it will use the scale settings for the cluster.\n",
|
||||
"compute_target.wait_for_completion(\n",
|
||||
" show_output=True, min_node_count=None, timeout_in_minutes=20\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Experiment Setup\n",
|
||||
"Create an [Experiment](https://docs.microsoft.com/en-us/azure/machine-learning/concept-azure-machine-learning-architecture#experiments) in your workspace to track your model training runs"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Experiment\n",
|
||||
"\n",
|
||||
"experiment_name = \"automl-image-instance-segmentation\"\n",
|
||||
"experiment = Experiment(ws, name=experiment_name)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Dataset with input Training Data\n",
|
||||
"\n",
|
||||
"In order to generate models for computer vision, you will need to bring in labeled image data as input for model training in the form of an [AzureML Tabular Dataset](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.tabulardataset). You can either use a dataset that you have exported from a [Data Labeling](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-label-data) project, or create a new Tabular Dataset with your labeled training data."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In this notebook, we use a toy dataset called Fridge Objects, which includes 128 photos of 4 classes of beverage containers {can, carton, milk bottle, water bottle} taken on different backgrounds.\n",
|
||||
"\n",
|
||||
"All images in this notebook are hosted in [this repository](https://github.com/microsoft/computervision-recipes) and are made available under the [MIT license](https://github.com/microsoft/computervision-recipes/blob/master/LICENSE).\n",
|
||||
"\n",
|
||||
"We first download and unzip the data locally."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import urllib\n",
|
||||
"from zipfile import ZipFile\n",
|
||||
"\n",
|
||||
"# download data\n",
|
||||
"download_url = \"https://cvbp-secondary.z19.web.core.windows.net/datasets/object_detection/odFridgeObjectsMask.zip\"\n",
|
||||
"data_file = \"./odFridgeObjectsMask.zip\"\n",
|
||||
"urllib.request.urlretrieve(download_url, filename=data_file)\n",
|
||||
"\n",
|
||||
"# extract files\n",
|
||||
"with ZipFile(data_file, \"r\") as zip:\n",
|
||||
" print(\"extracting files...\")\n",
|
||||
" zip.extractall()\n",
|
||||
" print(\"done\")\n",
|
||||
"# delete zip file\n",
|
||||
"os.remove(data_file)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This is a sample image from this dataset:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from IPython.display import Image\n",
|
||||
"\n",
|
||||
"Image(filename=\"./odFridgeObjectsMask/images/31.jpg\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Convert the downloaded data to JSONL\n",
|
||||
"In this example, the fridge object dataset is annotated in Pascal VOC format, where each image corresponds to an xml file. Each xml file contains information on where its corresponding image file is located and also contains information about the bounding boxes and the object labels. In order to use this data to create an [AzureML Tabular Dataset](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.tabulardataset), we first need to convert it to the required JSONL format. Please refer to the [documentation on how to prepare datasets](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-prepare-datasets-for-automl-images).\n",
|
||||
"\n",
|
||||
"The following script creates two .jsonl files (one for training and one for validation) in the parent folder of the dataset. The data is split so that 20% of it goes into the validation file."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# The jsonl_converter below relies on scikit-image and simplification.\n",
|
||||
"# If you don't have them installed, install them before converting data by running this cell.\n",
|
||||
"%pip install \"scikit-image==0.17.2\" \"simplification==0.5.1\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from jsonl_converter import convert_mask_in_VOC_to_jsonl\n",
|
||||
"\n",
|
||||
"data_path = \"./odFridgeObjectsMask/\"\n",
|
||||
"convert_mask_in_VOC_to_jsonl(data_path, ws)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Upload the JSONL file and images to Datastore\n",
|
||||
"In order to use the data for training in Azure ML, we upload it to our Azure ML Workspace via a [Datastore](https://docs.microsoft.com/en-us/azure/machine-learning/concept-azure-machine-learning-architecture#datasets-and-datastores). The datastore provides a mechanism for you to upload/download data and interact with it from your remote compute targets. It is an abstraction over Azure Storage."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Retrieving default datastore that got automatically created when we setup a workspace\n",
|
||||
"ds = ws.get_default_datastore()\n",
|
||||
"ds.upload(src_dir=\"./odFridgeObjectsMask\", target_path=\"odFridgeObjectsMask\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Finally, we need to create an [AzureML Tabular Dataset](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.tabulardataset) from the data we uploaded to the Datastore. We create one dataset for training and one for validation."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Dataset\n",
|
||||
"from azureml.data import DataType\n",
|
||||
"\n",
|
||||
"# get existing training dataset\n",
|
||||
"training_dataset_name = \"odFridgeObjectsMaskTrainingDataset\"\n",
|
||||
"if training_dataset_name in ws.datasets:\n",
|
||||
" training_dataset = ws.datasets.get(training_dataset_name)\n",
|
||||
" print(\"Found the training dataset\", training_dataset_name)\n",
|
||||
"else:\n",
|
||||
" # create training dataset\n",
|
||||
" training_dataset = Dataset.Tabular.from_json_lines_files(\n",
|
||||
" path=ds.path(\"odFridgeObjectsMask/train_annotations.jsonl\"),\n",
|
||||
" set_column_types={\"image_url\": DataType.to_stream(ds.workspace)},\n",
|
||||
" )\n",
|
||||
" training_dataset = training_dataset.register(\n",
|
||||
" workspace=ws, name=training_dataset_name\n",
|
||||
" )\n",
|
||||
"# get existing validation dataset\n",
|
||||
"validation_dataset_name = \"odFridgeObjectsMaskValidationDataset\"\n",
|
||||
"if validation_dataset_name in ws.datasets:\n",
|
||||
" validation_dataset = ws.datasets.get(validation_dataset_name)\n",
|
||||
" print(\"Found the validation dataset\", validation_dataset_name)\n",
|
||||
"else:\n",
|
||||
" # create validation dataset\n",
|
||||
" validation_dataset = Dataset.Tabular.from_json_lines_files(\n",
|
||||
" path=ds.path(\"odFridgeObjectsMask/validation_annotations.jsonl\"),\n",
|
||||
" set_column_types={\"image_url\": DataType.to_stream(ds.workspace)},\n",
|
||||
" )\n",
|
||||
" validation_dataset = validation_dataset.register(\n",
|
||||
" workspace=ws, name=validation_dataset_name\n",
|
||||
" )\n",
|
||||
"print(\"Training dataset name: \" + training_dataset.name)\n",
|
||||
"print(\"Validation dataset name: \" + validation_dataset.name)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Validation dataset is optional. If no validation dataset is specified, by default 20% of your training data will be used for validation. You can control the percentage using the `split_ratio` argument - please refer to the [documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-image-models#model-agnostic-hyperparameters) for more details.\n",
|
||||
"\n",
|
||||
"This is what the training dataset looks like:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"training_dataset.to_pandas_dataframe()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Configuring your AutoML run for image tasks\n",
|
||||
"AutoML allows you to easily train models for Image Classification, Object Detection & Instance Segmentation on your image data. You can control the model algorithm to be used, specify hyperparameter values for your model as well as perform a sweep across the hyperparameter space to generate an optimal model. Parameters for configuring your AutoML Image run are specified using the `AutoMLImageConfig` - please refer to the [documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-image-models#configure-your-experiment-settings) for the details on the parameters that can be used and their values."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"When using AutoML for image tasks, you need to specify the model algorithms using the `model_name` parameter. You can either specify a single model or choose to sweep over multiple models. Please refer to the [documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-image-models#configure-model-algorithms-and-hyperparameters) for the list of supported model algorithms."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Using default hyperparameter values for the specified algorithm\n",
|
||||
"Before doing a large sweep to search for the optimal models and hyperparameters, we recommend trying the default values for a given model to get a first baseline. Next, you can explore multiple hyperparameters for the same model before sweeping over multiple models and their parameters. This allows an iterative approach, as with multiple models and multiple hyperparameters for each (as we showcase in the next section), the search space grows exponentially, and you need more iterations to find optimal configurations.\n",
|
||||
"\n",
|
||||
"If you wish to use the default hyperparameter values for a given algorithm (say `maskrcnn`), you can specify the config for your AutoML Image runs as follows:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.automl.core.shared.constants import ImageTask\n",
|
||||
"from azureml.train.automl import AutoMLImageConfig\n",
|
||||
"from azureml.train.hyperdrive import GridParameterSampling, choice\n",
|
||||
"\n",
|
||||
"image_config_maskrcnn = AutoMLImageConfig(\n",
|
||||
" task=ImageTask.IMAGE_INSTANCE_SEGMENTATION,\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" training_data=training_dataset,\n",
|
||||
" validation_data=validation_dataset,\n",
|
||||
" hyperparameter_sampling=GridParameterSampling(\n",
|
||||
" {\"model_name\": choice(\"maskrcnn_resnet50_fpn\")}\n",
|
||||
" ),\n",
|
||||
" iterations=1,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Submitting an AutoML run for Computer Vision tasks\n",
|
||||
"Once you've created the config settings for your run, you can submit an AutoML run using the config in order to train a vision model using your training dataset."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"automl_image_run = experiment.submit(image_config_maskrcnn)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"automl_image_run.wait_for_completion(wait_post_processing=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Hyperparameter sweeping for your AutoML models for computer vision tasks\n",
|
||||
"In this example, we use the AutoMLImageConfig to train an Instance Segmentation model using `maskrcnn_resnet50_fpn` which is pretrained on COCO, a large-scale object detection, segmentation, and captioning dataset that contains over 200K labeled images with over 80 label categories.\n",
|
||||
"\n",
|
||||
"When using AutoML for Images, you can perform a hyperparameter sweep over a defined parameter space to find the optimal model. In this example, we sweep over the hyperparameters of `maskrcnn_resnet50_fpn`, choosing from a range of values for `learning_rate`, `optimizer` and `min_size`, to generate a model with the best primary metric. If hyperparameter values are not specified, then default values are used for the specified algorithm.\n",
|
||||
"\n",
|
||||
"We use Random Sampling to pick samples from this parameter space and try a total of 10 iterations with these different samples, running 2 iterations at a time on our compute target, which has been previously set up using 4 nodes. Please note that the more parameters the space has, the more iterations you need to find optimal models.\n",
|
||||
"\n",
|
||||
"We leverage the Bandit early termination policy, which terminates poorly performing configs (those that are not within 20% slack of the best performing config), thus significantly saving compute resources.\n",
|
||||
"\n",
|
||||
"For more details on model and hyperparameter sweeping, please refer to the [documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-tune-hyperparameters)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.automl.core.shared.constants import ImageTask\n",
|
||||
"from azureml.train.automl import AutoMLImageConfig\n",
|
||||
"from azureml.train.hyperdrive import BanditPolicy, RandomParameterSampling\n",
|
||||
"from azureml.train.hyperdrive import choice, uniform\n",
|
||||
"\n",
|
||||
"parameter_space = {\n",
|
||||
" \"model_name\": choice(\"maskrcnn_resnet50_fpn\"),\n",
|
||||
" \"learning_rate\": uniform(0.0001, 0.001),\n",
|
||||
" #'warmup_cosine_lr_warmup_epochs': choice(0, 3),\n",
|
||||
" \"optimizer\": choice(\"sgd\", \"adam\", \"adamw\"),\n",
|
||||
" \"min_size\": choice(600, 800),\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"tuning_settings = {\n",
|
||||
" \"iterations\": 10,\n",
|
||||
" \"max_concurrent_iterations\": 2,\n",
|
||||
" \"hyperparameter_sampling\": RandomParameterSampling(parameter_space),\n",
|
||||
" \"early_termination_policy\": BanditPolicy(\n",
|
||||
" evaluation_interval=2, slack_factor=0.2, delay_evaluation=6\n",
|
||||
" ),\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"automl_image_config = AutoMLImageConfig(\n",
|
||||
" task=ImageTask.IMAGE_INSTANCE_SEGMENTATION,\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" training_data=training_dataset,\n",
|
||||
" validation_data=validation_dataset,\n",
|
||||
" **tuning_settings,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"automl_image_run = experiment.submit(automl_image_config)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"automl_image_run.wait_for_completion(wait_post_processing=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"When doing a hyperparameter sweep, it can be useful to visualize the different configurations that were tried using the HyperDrive UI. You can navigate to this UI by opening the 'Child runs' tab of the main `automl_image_run` from above, selecting the HyperDrive parent run listed there, and then opening that run's own 'Child runs' tab. Alternatively, the cell below retrieves the HyperDrive parent run directly so that you can navigate to its 'Child runs' tab:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Run\n",
|
||||
"\n",
|
||||
"hyperdrive_run = Run(experiment=experiment, run_id=automl_image_run.id + \"_HD\")\n",
|
||||
"hyperdrive_run"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Register the optimal vision model from the AutoML run\n",
|
||||
"Once the run completes, we can register the model that was created from the best run (configuration that resulted in the best primary metric)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Register the model from the best run\n",
|
||||
"\n",
|
||||
"best_child_run = automl_image_run.get_best_child()\n",
|
||||
"model_name = best_child_run.properties[\"model_name\"]\n",
|
||||
"model = best_child_run.register_model(\n",
|
||||
" model_name=model_name, model_path=\"outputs/model.pt\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Deploy model as a web service\n",
|
||||
"Once you have your trained model, you can deploy the model on Azure. You can deploy your trained model as a web service on Azure Container Instances ([ACI](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-deploy-azure-container-instance)) or Azure Kubernetes Service ([AKS](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-deploy-azure-kubernetes-service)). Please note that ACI only supports small models under 1 GB in size. For testing larger models or for the high-scale production stage, we recommend using AKS.\n",
|
||||
"In this tutorial, we will deploy the model as a web service in AKS."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You will need to first create an AKS compute cluster or use an existing AKS cluster. You can use either GPU or CPU VM SKUs for your deployment cluster"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.compute import ComputeTarget, AksCompute\n",
|
||||
"from azureml.exceptions import ComputeTargetException\n",
|
||||
"\n",
|
||||
"# Choose a name for your cluster\n",
|
||||
"aks_name = \"aks-cpu-is\"\n",
|
||||
"# Check to see if the cluster already exists\n",
|
||||
"try:\n",
|
||||
" aks_target = ComputeTarget(workspace=ws, name=aks_name)\n",
|
||||
" print(\"Found existing compute target\")\n",
|
||||
"except ComputeTargetException:\n",
|
||||
" print(\"Creating a new compute target...\")\n",
|
||||
" # Provision AKS cluster with a CPU machine\n",
|
||||
" prov_config = AksCompute.provisioning_configuration(vm_size=\"STANDARD_D3_V2\")\n",
|
||||
" # Create the cluster\n",
|
||||
" aks_target = ComputeTarget.create(\n",
|
||||
" workspace=ws, name=aks_name, provisioning_configuration=prov_config\n",
|
||||
" )\n",
|
||||
" aks_target.wait_for_completion(show_output=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Next, you will need to define the [inference configuration](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-image-models#update-inference-configuration), which describes how to set up the web service containing your model. You can use the scoring script and the environment from the training run in your inference config.\n",
|
||||
"\n",
|
||||
"<b>Note:</b> To change the model's settings, open the downloaded scoring script and modify the model_settings variable <i>before</i> deploying the model."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.model import InferenceConfig\n",
|
||||
"\n",
|
||||
"best_child_run.download_file(\n",
|
||||
" \"outputs/scoring_file_v_1_0_0.py\", output_file_path=\"score.py\"\n",
|
||||
")\n",
|
||||
"environment = best_child_run.get_environment()\n",
|
||||
"inference_config = InferenceConfig(entry_script=\"score.py\", environment=environment)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You can then deploy the model as an AKS web service."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Deploy the model from the best run as an AKS web service\n",
|
||||
"from azureml.core.webservice import AksWebservice\n",
|
||||
"from azureml.core.model import Model\n",
|
||||
"\n",
|
||||
"aks_config = AksWebservice.deploy_configuration(\n",
|
||||
" autoscale_enabled=True, cpu_cores=1, memory_gb=5, enable_app_insights=True\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"aks_service = Model.deploy(\n",
|
||||
" ws,\n",
|
||||
" models=[model],\n",
|
||||
" inference_config=inference_config,\n",
|
||||
" deployment_config=aks_config,\n",
|
||||
" deployment_target=aks_target,\n",
|
||||
" name=\"automl-image-test-cpu-is\",\n",
|
||||
" overwrite=True,\n",
|
||||
")\n",
|
||||
"aks_service.wait_for_deployment(show_output=True)\n",
|
||||
"print(aks_service.state)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Test the web service\n",
|
||||
"Finally, let's test our deployed web service to predict new images. You can pass in any image. In this case, we'll use a random image from the dataset and pass it to the scoring URI."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"# URL for the web service\n",
|
||||
"scoring_uri = aks_service.scoring_uri\n",
|
||||
"\n",
|
||||
"# If the service is authenticated, set the key or token\n",
|
||||
"key, _ = aks_service.get_keys()\n",
|
||||
"\n",
|
||||
"sample_image = \"./test_image.jpg\"\n",
|
||||
"\n",
|
||||
"# Load image data\n",
|
||||
"data = open(sample_image, \"rb\").read()\n",
|
||||
"\n",
|
||||
"# Set the content type\n",
|
||||
"headers = {\"Content-Type\": \"application/octet-stream\"}\n",
|
||||
"\n",
|
||||
"# If authentication is enabled, set the authorization header\n",
|
||||
"headers[\"Authorization\"] = f\"Bearer {key}\"\n",
|
||||
"\n",
|
||||
"# Make the request and display the response\n",
|
||||
"resp = requests.post(scoring_uri, data, headers=headers)\n",
|
||||
"print(resp.text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Visualize predictions\n",
|
||||
"Now that we have scored a test image, we can visualize the predictions for this image"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%matplotlib inline\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"import matplotlib.image as mpimg\n",
|
||||
"import matplotlib.patches as patches\n",
|
||||
"from matplotlib.lines import Line2D\n",
|
||||
"from PIL import Image\n",
|
||||
"import numpy as np\n",
|
||||
"import json\n",
|
||||
"\n",
|
||||
"IMAGE_SIZE = (18, 12)\n",
|
||||
"plt.figure(figsize=IMAGE_SIZE)\n",
|
||||
"img_np = mpimg.imread(sample_image)\n",
|
||||
"img = Image.fromarray(img_np.astype(\"uint8\"), \"RGB\")\n",
|
||||
"x, y = img.size\n",
|
||||
"\n",
|
||||
"fig, ax = plt.subplots(1, figsize=(15, 15))\n",
|
||||
"# Display the image\n",
|
||||
"ax.imshow(img_np)\n",
|
||||
"\n",
|
||||
"# draw box and label for each detection\n",
|
||||
"detections = json.loads(resp.text)\n",
|
||||
"for detect in detections[\"boxes\"]:\n",
|
||||
" label = detect[\"label\"]\n",
|
||||
" box = detect[\"box\"]\n",
|
||||
" polygon = detect[\"polygon\"]\n",
|
||||
" conf_score = detect[\"score\"]\n",
|
||||
" if conf_score > 0.6:\n",
|
||||
" ymin, xmin, ymax, xmax = (\n",
|
||||
" box[\"topY\"],\n",
|
||||
" box[\"topX\"],\n",
|
||||
" box[\"bottomY\"],\n",
|
||||
" box[\"bottomX\"],\n",
|
||||
" )\n",
|
||||
" topleft_x, topleft_y = x * xmin, y * ymin\n",
|
||||
" width, height = x * (xmax - xmin), y * (ymax - ymin)\n",
|
||||
" print(\n",
|
||||
" \"{}: [{}, {}, {}, {}], {}\".format(\n",
|
||||
" detect[\"label\"],\n",
|
||||
" round(topleft_x, 3),\n",
|
||||
" round(topleft_y, 3),\n",
|
||||
" round(width, 3),\n",
|
||||
" round(height, 3),\n",
|
||||
" round(conf_score, 3),\n",
|
||||
" )\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" color = np.random.rand(3) #'red'\n",
|
||||
" rect = patches.Rectangle(\n",
|
||||
" (topleft_x, topleft_y),\n",
|
||||
" width,\n",
|
||||
" height,\n",
|
||||
" linewidth=2,\n",
|
||||
" edgecolor=color,\n",
|
||||
" facecolor=\"none\",\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" ax.add_patch(rect)\n",
|
||||
" plt.text(topleft_x, topleft_y - 10, label, color=color, fontsize=20)\n",
|
||||
"\n",
|
||||
" polygon_np = np.array(polygon[0])\n",
|
||||
" polygon_np = polygon_np.reshape(-1, 2)\n",
|
||||
" polygon_np[:, 0] *= x\n",
|
||||
" polygon_np[:, 1] *= y\n",
|
||||
" poly = patches.Polygon(polygon_np, True, facecolor=color, alpha=0.4)\n",
|
||||
" ax.add_patch(poly)\n",
|
||||
" poly_line = Line2D(\n",
|
||||
" polygon_np[:, 0],\n",
|
||||
" polygon_np[:, 1],\n",
|
||||
" linewidth=2,\n",
|
||||
" marker=\"o\",\n",
|
||||
" markersize=8,\n",
|
||||
" markerfacecolor=color,\n",
|
||||
" )\n",
|
||||
" ax.add_line(poly_line)\n",
|
||||
"plt.show()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python3-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.10"
|
||||
},
|
||||
"nteract": {
|
||||
"version": "nteract-front-end@1.0.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
|
After Width: | Height: | Size: 148 KiB |
@@ -0,0 +1,213 @@
|
||||
import argparse
|
||||
import os
|
||||
import json
|
||||
import numpy as np
|
||||
import PIL.Image as Image
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
from simplification.cutil import simplify_coords
|
||||
from skimage import measure
|
||||
|
||||
|
||||
def convert_mask_to_polygon(
    mask,
    max_polygon_points=100,
    score_threshold=0.5,
    max_refinement_iterations=25,
    edge_safety_padding=1,
):
    """Trace the outlines of a score mask and return them as normalized polygons.

    :param mask: Pixel mask, where each pixel has an object (float) score in [0, 1], in size ([1, height, width])
    :type mask: <class 'numpy.array'>
    :param max_polygon_points: Maximum number of (x, y) coordinate pairs in polygon;
                               ``None`` disables the point-reduction step
    :type max_polygon_points: Int
    :param score_threshold: Score cutoff for considering a pixel as in object.
    :type score_threshold: Float
    :param max_refinement_iterations: Maximum number of times to refine the polygon
                                      trying to reduce the number of pixels to meet max polygon points.
    :type max_refinement_iterations: Int
    :param edge_safety_padding: Number of pixels to pad the mask with
    :type edge_safety_padding: Int
    :return: normalized polygon coordinates, one flat [x, y, x, y, ...] list per contour
    :rtype: list of list
    """
    pad = edge_safety_padding

    # Threshold the first (only) channel into a 0/1 bitmask
    binary_mask = np.array((mask[0] > score_threshold), dtype=np.uint8)
    image_shape = binary_mask.shape
    rows, cols = image_shape[0], image_shape[1]

    # Embed the bitmask in a zero border so that objects touching the image
    # edge still produce closed contours
    padded_mask = np.zeros((rows + 2 * pad, cols + 2 * pad), dtype=np.uint8)
    padded_mask[pad : rows + pad, pad : cols + pad] = binary_mask

    polygons = []
    for outline in measure.find_contours(padded_mask, 0.5):
        if max_polygon_points is not None:
            # Simplify with a growing tolerance (0, 1, 2, ...) until the outline
            # is short enough or the iteration budget is exhausted
            for tolerance in range(max_refinement_iterations):
                if len(outline) <= max_polygon_points:
                    break
                outline = simplify_coords(outline, tolerance)

        # Interleave as [x, y, x, y, ...] (column 1 -> x, column 0 -> y)
        # and shift back by the padding that was added above
        flat = [0] * (2 * len(outline))
        flat[0::2] = np.ceil(outline[:, 1]) - pad
        flat[1::2] = np.ceil(outline[:, 0]) - pad
        polygons.append(flat)

    return _normalize_contour(polygons, image_shape)
|
||||
|
||||
|
||||
def _normalize_contour(contours, image_shape):
|
||||
|
||||
height, width = image_shape[0], image_shape[1]
|
||||
|
||||
for contour in contours:
|
||||
contour[::2] = [x * 1.0 / width for x in contour[::2]]
|
||||
contour[1::2] = [y * 1.0 / height for y in contour[1::2]]
|
||||
|
||||
return contours
|
||||
|
||||
|
||||
def binarise_mask(mask_fname):
    """Split a colour-encoded instance mask image into one boolean mask per instance.

    :param mask_fname: Path of the mask image file
    :return: Boolean array produced by comparing the mask against every
             non-background id; for a 2-D mask this broadcasts to one
             leading entry per id
    """
    # Open inside a context manager so the file handle is released as soon as
    # the pixel data has been copied into the array
    with Image.open(mask_fname) as mask_image:
        mask = np.array(mask_image)

    # instances are encoded as different colors; np.unique returns them sorted
    obj_ids = np.unique(mask)
    # first id is the background, so remove it
    # NOTE(review): this assumes the smallest value present is always the
    # background; a mask without background pixels would lose its first object
    obj_ids = obj_ids[1:]

    # split the color-encoded mask into a set of binary masks
    binary_masks = mask == obj_ids[:, None, None]
    return binary_masks
|
||||
|
||||
|
||||
def parsing_mask(mask_fname):
    """Convert every instance in a mask image into normalized polygons.

    For this particular dataset, each mask image was originally built by merging
    the per-object binary masks in the order of the bounding boxes listed in the
    matching PASCAL VOC annotation file, so the polygons returned here follow the
    order of the objects in that annotation file.
    https://github.com/microsoft/computervision-recipes/blob/master/utils_cv/detection/dataset.py

    :param mask_fname: Path of the mask image file
    :return: list with one ``convert_mask_to_polygon`` result per instance
    """

    def _with_channel_axis(instance_mask):
        # convert_mask_to_polygon reads mask[0], so 2-D masks get a leading axis
        if len(instance_mask.shape) == 2:
            return instance_mask[np.newaxis, :]
        return instance_mask

    return [
        convert_mask_to_polygon(_with_channel_axis(instance_mask))
        for instance_mask in binarise_mask(mask_fname)
    ]
|
||||
|
||||
|
||||
def convert_mask_in_VOC_to_jsonl(base_dir, workspace):
    """Convert PASCAL VOC annotations and segmentation masks into JSONL files.

    Every ``.xml`` file in ``<base_dir>/annotations`` is parsed, its matching
    ``<base_dir>/segmentation-masks/<name>.png`` mask is converted to polygons
    with ``parsing_mask``, and one JSON line per image is written to either
    ``<base_dir>/train_annotations.jsonl`` or
    ``<base_dir>/validation_annotations.jsonl``. Both output files are
    (re)created on every call.

    :param base_dir: Dataset folder holding the ``annotations`` and
                     ``segmentation-masks`` sub-folders; its own name is used
                     as the folder name inside the datastore URL
    :param workspace: Object exposing ``get_default_datastore()`` whose result
                      has a ``name`` attribute (the notebook passes its
                      Azure ML workspace)
    :return: None
    """
    src = base_dir
    # Entries whose position in the sorted listing is a multiple of this value
    # go to the validation file, all others to the training file
    train_validation_ratio = 5

    # Retrieving default datastore that got automatically created when we setup a workspace
    workspaceblobstore = workspace.get_default_datastore().name

    # Path to the annotations
    annotations_folder = os.path.join(src, "annotations")
    mask_folder = os.path.join(src, "segmentation-masks")

    # Path to the training and validation files
    train_annotations_file = os.path.join(src, "train_annotations.jsonl")
    validation_annotations_file = os.path.join(src, "validation_annotations.jsonl")

    # abspath strips any trailing separator, so "data/x" and "data/x/" both
    # yield "x" (basename(dirname(...)) returned the parent for "data/x")
    dataset_folder = os.path.basename(os.path.abspath(src))
    image_url_prefix = (
        "AmlDatastore://" + workspaceblobstore + "/" + dataset_folder + "/" + "images"
    )

    # Read each annotation and convert it to jsonl line
    with open(train_annotations_file, "w") as train_f, open(
        validation_annotations_file, "w"
    ) as validation_f:
        # Sorted so the train/validation split is reproducible; os.listdir
        # returns entries in arbitrary order
        for i, filename in enumerate(sorted(os.listdir(annotations_folder))):
            if not filename.endswith(".xml"):
                print("Skipping unknown file: {}".format(filename))
                continue

            annotation_path = os.path.join(annotations_folder, filename)
            print("Parsing " + annotation_path)

            root = ET.parse(annotation_path).getroot()

            width = int(root.find("size/width").text)
            height = int(root.find("size/height").text)

            # convert mask into polygon
            mask_fname = os.path.join(mask_folder, filename[:-4] + ".png")
            polygons = parsing_mask(mask_fname)

            labels = []
            # polygons[index] relies on the mask instances being in the same
            # order as the <object> elements of the annotation file
            for index, voc_object in enumerate(root.findall("object")):
                labels.append(
                    {
                        "label": voc_object.find("name").text,
                        "bbox": "null",
                        "isCrowd": int(voc_object.find("difficult").text),
                        "polygon": polygons[index],
                    }
                )

            # build the jsonl line from a fresh dict so that no nested state
            # is shared between images
            image_filename = root.find("filename").text
            _, file_extension = os.path.splitext(image_filename)
            json_line = {
                "image_url": image_url_prefix + "/" + image_filename,
                "image_details": {
                    "format": file_extension[1:],
                    "width": width,
                    "height": height,
                },
                "label": labels,
            }

            if i % train_validation_ratio == 0:
                # validation annotation
                validation_f.write(json.dumps(json_line) + "\n")
            else:
                # train annotation
                train_f.write(json.dumps(json_line) + "\n")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Imported here because the workspace is only needed when this file is run
    # as a script; importing the module from a notebook does not require it
    from azureml.core import Workspace

    parser = argparse.ArgumentParser(allow_abbrev=False)
    parser.add_argument(
        "--data_path",
        type=str,
        required=True,  # without it os.path.join would receive None
        help="the directory contains images, annotations, and masks",
    )

    # Unknown extra arguments are tolerated and ignored
    args, _ = parser.parse_known_args()
    data_path = args.data_path

    # convert_mask_in_VOC_to_jsonl needs the workspace to look up the default
    # datastore name; load it from the local workspace config file
    convert_mask_in_VOC_to_jsonl(data_path, Workspace.from_config())
|
||||
|
After Width: | Height: | Size: 156 KiB |
@@ -0,0 +1,15 @@
|
||||
---
|
||||
page_type: sample
|
||||
languages:
|
||||
- python
|
||||
products:
|
||||
- azure-machine-learning
|
||||
description: Notebook showing how to use AutoML for training an Object Detection model. We will use a small dataset to train the model, demonstrate how you can tune hyperparameters of the model to optimize model performance and deploy the model to use in inference scenarios.
|
||||
---
|
||||
|
||||
# Object Detection using AutoML for Images
|
||||
- Dataset: Toy dataset with images of products found in a fridge
|
||||
- **[Jupyter Notebook](auto-ml-image-object-detection.ipynb)**
|
||||
- train an Object Detection model using AutoML
|
||||
- tune hyperparameters of the model to optimize model performance
|
||||
- deploy the model to use in inference scenarios
|
||||
@@ -0,0 +1,835 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License.\n",
|
||||
"\n",
|
||||
"# Training an Object Detection model using AutoML\n",
|
||||
"In this notebook, we go over how you can use AutoML for training an Object Detection model. We will use a small dataset to train the model, demonstrate how you can tune hyperparameters of the model to optimize model performance and deploy the model to use in inference scenarios. For detailed information please refer to the [documentation of AutoML for Images](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-image-models)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Important:** This feature is currently in public preview. This preview version is provided without a service-level agreement. Certain features might not be supported or might have constrained capabilities. For more information, see [Supplemental Terms of Use for Microsoft Azure Previews](https://azure.microsoft.com/en-us/support/legal/preview-supplemental-terms/)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Environment Setup\n",
|
||||
"Please follow the [\"Setup a new conda environment\"](https://github.com/Azure/azureml-examples/tree/main/python-sdk/tutorials/automl-with-azureml#3-setup-a-new-conda-environment) instructions to get started."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import azureml.core\n",
|
||||
"\n",
|
||||
"print(\"This notebook was created using version 1.35.0 of the Azure ML SDK.\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK.\")\n",
|
||||
"assert (\n",
|
||||
" azureml.core.VERSION >= \"1.35\"\n",
|
||||
"), \"Please upgrade the Azure ML SDK by running '!pip install --upgrade azureml-sdk' then restart the kernel.\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Workspace setup\n",
|
||||
"In order to train and deploy models in Azure ML, you will first need to set up a workspace.\n",
|
||||
"\n",
|
||||
"An [Azure ML Workspace](https://docs.microsoft.com/en-us/azure/machine-learning/concept-azure-machine-learning-architecture#workspace) is an Azure resource that organizes and coordinates the actions of many other Azure resources to assist in executing and sharing machine learning workflows. In particular, an Azure ML Workspace coordinates storage, databases, and compute resources providing added functionality for machine learning experimentation, deployment, inference, and the monitoring of deployed models.\n",
|
||||
"\n",
|
||||
"Create an Azure ML Workspace within your Azure subscription or load an existing workspace."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.workspace import Workspace\n",
|
||||
"\n",
|
||||
"ws = Workspace.from_config()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Compute target setup\n",
|
||||
"You will need to provide a [Compute Target](https://docs.microsoft.com/en-us/azure/machine-learning/concept-azure-machine-learning-architecture#computes) that will be used for your AutoML model training. AutoML models for image tasks require [GPU SKUs](https://docs.microsoft.com/en-us/azure/virtual-machines/sizes-gpu) such as the ones from the NC, NCv2, NCv3, ND, NDv2 and NCasT4 series. We recommend using the NCsv3-series (with v100 GPUs) for faster training. Using a compute target with a multi-GPU VM SKU will leverage the multiple GPUs to speed up training. Additionally, setting up a compute target with multiple nodes will allow for faster model training by leveraging parallelism, when tuning hyperparameters for your model."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.compute import AmlCompute, ComputeTarget\n",
|
||||
"\n",
|
||||
"cluster_name = \"gpu-cluster-nc6\"\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" compute_target = ws.compute_targets[cluster_name]\n",
|
||||
" print(\"Found existing compute target.\")\n",
|
||||
"except KeyError:\n",
|
||||
" print(\"Creating a new compute target...\")\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(\n",
|
||||
" vm_size=\"Standard_NC6\",\n",
|
||||
" idle_seconds_before_scaledown=600,\n",
|
||||
" min_nodes=0,\n",
|
||||
" max_nodes=4,\n",
|
||||
" )\n",
|
||||
" compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n",
|
||||
"# Can poll for a minimum number of nodes and for a specific timeout.\n",
|
||||
"# If no min_node_count is provided, it will use the scale settings for the cluster.\n",
|
||||
"compute_target.wait_for_completion(\n",
|
||||
" show_output=True, min_node_count=None, timeout_in_minutes=20\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Experiment Setup\n",
|
||||
"Create an [Experiment](https://docs.microsoft.com/en-us/azure/machine-learning/concept-azure-machine-learning-architecture#experiments) in your workspace to track your model training runs"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Experiment\n",
|
||||
"\n",
|
||||
"experiment_name = \"automl-image-object-detection\"\n",
|
||||
"experiment = Experiment(ws, name=experiment_name)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Dataset with input Training Data\n",
|
||||
"\n",
|
||||
"In order to generate models for computer vision, you will need to bring in labeled image data as input for model training in the form of an [AzureML Tabular Dataset](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.tabulardataset). You can either use a dataset that you have exported from a [Data Labeling](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-label-data) project, or create a new Tabular Dataset with your labeled training data."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In this notebook, we use a toy dataset called Fridge Objects, which consists of 128 photos of 4 classes of beverage containers {can, carton, milk bottle, water bottle} taken on different backgrounds.\n",
|
||||
"\n",
|
||||
"All images in this notebook are hosted in [this repository](https://github.com/microsoft/computervision-recipes) and are made available under the [MIT license](https://github.com/microsoft/computervision-recipes/blob/master/LICENSE).\n",
|
||||
"\n",
|
||||
"We first download and unzip the data locally."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import urllib\n",
|
||||
"from zipfile import ZipFile\n",
|
||||
"\n",
|
||||
"# download data\n",
|
||||
"download_url = \"https://cvbp-secondary.z19.web.core.windows.net/datasets/object_detection/odFridgeObjects.zip\"\n",
|
||||
"data_file = \"./odFridgeObjects.zip\"\n",
|
||||
"urllib.request.urlretrieve(download_url, filename=data_file)\n",
|
||||
"\n",
|
||||
"# extract files\n",
|
||||
"with ZipFile(data_file, \"r\") as zip:\n",
|
||||
" print(\"extracting files...\")\n",
|
||||
" zip.extractall()\n",
|
||||
" print(\"done\")\n",
|
||||
"# delete zip file\n",
|
||||
"os.remove(data_file)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This is a sample image from this dataset:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from IPython.display import Image\n",
|
||||
"\n",
|
||||
"Image(filename=\"./odFridgeObjects/images/31.jpg\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Convert the downloaded data to JSONL\n",
|
||||
"In this example, the fridge object dataset is annotated in Pascal VOC format, where each image corresponds to an xml file. Each xml file contains information on where its corresponding image file is located and also contains information about the bounding boxes and the object labels. In order to use this data to create an [AzureML Tabular Dataset](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.tabulardataset), we first need to convert it to the required JSONL format. Please refer to the [documentation on how to prepare datasets](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-prepare-datasets-for-automl-images).\n",
|
||||
"\n",
|
||||
"The following script creates two .jsonl files (one for training and one for validation) in the root folder of the dataset (`./odFridgeObjects`). The train / validation ratio corresponds to roughly 20% of the data going into the validation file."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"import os\n",
|
||||
"import xml.etree.ElementTree as ET\n",
|
||||
"\n",
|
||||
"src = \"./odFridgeObjects/\"\n",
|
||||
"train_validation_ratio = 5\n",
|
||||
"\n",
|
||||
"# Retrieving default datastore that got automatically created when we setup a workspace\n",
|
||||
"workspaceblobstore = ws.get_default_datastore().name\n",
|
||||
"\n",
|
||||
"# Path to the annotations\n",
|
||||
"annotations_folder = os.path.join(src, \"annotations\")\n",
|
||||
"\n",
|
||||
"# Path to the training and validation files\n",
|
||||
"train_annotations_file = os.path.join(src, \"train_annotations.jsonl\")\n",
|
||||
"validation_annotations_file = os.path.join(src, \"validation_annotations.jsonl\")\n",
|
||||
"\n",
|
||||
"# sample json line dictionary\n",
|
||||
"json_line_sample = {\n",
|
||||
" \"image_url\": \"AmlDatastore://\"\n",
|
||||
" + workspaceblobstore\n",
|
||||
" + \"/\"\n",
|
||||
" + os.path.basename(os.path.dirname(src))\n",
|
||||
" + \"/\"\n",
|
||||
" + \"images\",\n",
|
||||
" \"image_details\": {\"format\": None, \"width\": None, \"height\": None},\n",
|
||||
" \"label\": [],\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# Read each annotation and convert it to jsonl line\n",
|
||||
"with open(train_annotations_file, \"w\") as train_f:\n",
|
||||
" with open(validation_annotations_file, \"w\") as validation_f:\n",
|
||||
" for i, filename in enumerate(os.listdir(annotations_folder)):\n",
|
||||
" if filename.endswith(\".xml\"):\n",
|
||||
" print(\"Parsing \" + os.path.join(src, filename))\n",
|
||||
"\n",
|
||||
" root = ET.parse(os.path.join(annotations_folder, filename)).getroot()\n",
|
||||
"\n",
|
||||
" width = int(root.find(\"size/width\").text)\n",
|
||||
" height = int(root.find(\"size/height\").text)\n",
|
||||
"\n",
|
||||
" labels = []\n",
|
||||
" for object in root.findall(\"object\"):\n",
|
||||
" name = object.find(\"name\").text\n",
|
||||
" xmin = object.find(\"bndbox/xmin\").text\n",
|
||||
" ymin = object.find(\"bndbox/ymin\").text\n",
|
||||
" xmax = object.find(\"bndbox/xmax\").text\n",
|
||||
" ymax = object.find(\"bndbox/ymax\").text\n",
|
||||
" isCrowd = int(object.find(\"difficult\").text)\n",
|
||||
" labels.append(\n",
|
||||
" {\n",
|
||||
" \"label\": name,\n",
|
||||
" \"topX\": float(xmin) / width,\n",
|
||||
" \"topY\": float(ymin) / height,\n",
|
||||
" \"bottomX\": float(xmax) / width,\n",
|
||||
" \"bottomY\": float(ymax) / height,\n",
|
||||
" \"isCrowd\": isCrowd,\n",
|
||||
" }\n",
|
||||
" )\n",
|
||||
" # build the jsonl file\n",
|
||||
" image_filename = root.find(\"filename\").text\n",
|
||||
" _, file_extension = os.path.splitext(image_filename)\n",
|
||||
" json_line = dict(json_line_sample)\n",
|
||||
" json_line[\"image_url\"] = json_line[\"image_url\"] + \"/\" + image_filename\n",
|
||||
" json_line[\"image_details\"][\"format\"] = file_extension[1:]\n",
|
||||
" json_line[\"image_details\"][\"width\"] = width\n",
|
||||
" json_line[\"image_details\"][\"height\"] = height\n",
|
||||
" json_line[\"label\"] = labels\n",
|
||||
"\n",
|
||||
" if i % train_validation_ratio == 0:\n",
|
||||
" # validation annotation\n",
|
||||
" validation_f.write(json.dumps(json_line) + \"\\n\")\n",
|
||||
" else:\n",
|
||||
" # train annotation\n",
|
||||
" train_f.write(json.dumps(json_line) + \"\\n\")\n",
|
||||
" else:\n",
|
||||
" print(\"Skipping unknown file: {}\".format(filename))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Convert annotation file from COCO to JSONL\n",
|
||||
"If you want to try a dataset in COCO format, the script below shows how to convert it to `jsonl` format. The file \"odFridgeObjects_coco.json\" contains the annotation information for the `odFridgeObjects` dataset."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Generate jsonl file from coco file\n",
|
||||
"!python coco2jsonl.py \\\n",
|
||||
"--input_coco_file_path \"./odFridgeObjects_coco.json\" \\\n",
|
||||
"--output_dir \"./odFridgeObjects\" --output_file_name \"odFridgeObjects_from_coco.jsonl\" \\\n",
|
||||
"--task_type \"ObjectDetection\" \\\n",
|
||||
"--base_url \"AmlDatastore://workspaceblobstore/odFridgeObjects/images/\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Visualize bounding boxes\n",
|
||||
"Please refer to the \"Visualize data\" section in the following [tutorial](https://docs.microsoft.com/en-us/azure/machine-learning/tutorial-auto-train-image-models#visualize-data) to see how to easily visualize your ground truth bounding boxes before starting to train."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Upload the JSONL file and images to Datastore\n",
|
||||
"In order to use the data for training in Azure ML, we upload it to our Azure ML Workspace via a [Datastore](https://docs.microsoft.com/en-us/azure/machine-learning/concept-azure-machine-learning-architecture#datasets-and-datastores). The datastore provides a mechanism for you to upload/download data and interact with it from your remote compute targets. It is an abstraction over Azure Storage."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Retrieving default datastore that got automatically created when we setup a workspace\n",
|
||||
"ds = ws.get_default_datastore()\n",
|
||||
"ds.upload(src_dir=\"./odFridgeObjects\", target_path=\"odFridgeObjects\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Finally, we need to create an [AzureML Tabular Dataset](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.tabulardataset) from the data we uploaded to the Datastore. We create one dataset for training and one for validation."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Dataset\n",
|
||||
"from azureml.data import DataType\n",
|
||||
"\n",
|
||||
"# get existing training dataset\n",
|
||||
"training_dataset_name = \"odFridgeObjectsTrainingDataset\"\n",
|
||||
"if training_dataset_name in ws.datasets:\n",
|
||||
" training_dataset = ws.datasets.get(training_dataset_name)\n",
|
||||
" print(\"Found the training dataset\", training_dataset_name)\n",
|
||||
"else:\n",
|
||||
" # create training dataset\n",
|
||||
" training_dataset = Dataset.Tabular.from_json_lines_files(\n",
|
||||
" path=ds.path(\"odFridgeObjects/train_annotations.jsonl\"),\n",
|
||||
" set_column_types={\"image_url\": DataType.to_stream(ds.workspace)},\n",
|
||||
" )\n",
|
||||
" training_dataset = training_dataset.register(\n",
|
||||
" workspace=ws, name=training_dataset_name\n",
|
||||
" )\n",
|
||||
"# get existing validation dataset\n",
|
||||
"validation_dataset_name = \"odFridgeObjectsValidationDataset\"\n",
|
||||
"if validation_dataset_name in ws.datasets:\n",
|
||||
" validation_dataset = ws.datasets.get(validation_dataset_name)\n",
|
||||
" print(\"Found the validation dataset\", validation_dataset_name)\n",
|
||||
"else:\n",
|
||||
" # create validation dataset\n",
|
||||
" validation_dataset = Dataset.Tabular.from_json_lines_files(\n",
|
||||
" path=ds.path(\"odFridgeObjects/validation_annotations.jsonl\"),\n",
|
||||
" set_column_types={\"image_url\": DataType.to_stream(ds.workspace)},\n",
|
||||
" )\n",
|
||||
" validation_dataset = validation_dataset.register(\n",
|
||||
" workspace=ws, name=validation_dataset_name\n",
|
||||
" )\n",
|
||||
"print(\"Training dataset name: \" + training_dataset.name)\n",
|
||||
"print(\"Validation dataset name: \" + validation_dataset.name)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Validation dataset is optional. If no validation dataset is specified, by default 20% of your training data will be used for validation. You can control the percentage using the `split_ratio` argument - please refer to the [documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-image-models#model-agnostic-hyperparameters) for more details.\n",
|
||||
"\n",
|
||||
"This is what the training dataset looks like:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"training_dataset.to_pandas_dataframe()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Configuring your AutoML run for image tasks\n",
|
||||
"AutoML allows you to easily train models for Image Classification, Object Detection & Instance Segmentation on your image data. You can control the model algorithm to be used, specify hyperparameter values for your model as well as perform a sweep across the hyperparameter space to generate an optimal model. Parameters for configuring your AutoML Image run are specified using the `AutoMLImageConfig` - please refer to the [documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-image-models#configure-your-experiment-settings) for the details on the parameters that can be used and their values."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"When using AutoML for image tasks, you need to specify the model algorithms using the `model_name` parameter. You can either specify a single model or choose to sweep over multiple models. Please refer to the [documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-image-models#configure-model-algorithms-and-hyperparameters) for the list of supported model algorithms."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Using default hyperparameter values for the specified algorithm\n",
|
||||
"Before doing a large sweep to search for the optimal models and hyperparameters, we recommend trying the default values for a given model to get a first baseline. Next, you can explore multiple hyperparameters for the same model before sweeping over multiple models and their parameters. This allows an iterative approach, as with multiple models and multiple hyperparameters for each (as we showcase in the next section), the search space grows exponentially, and you need more iterations to find optimal configurations.\n",
|
||||
"\n",
|
||||
"If you wish to use the default hyperparameter values for a given algorithm (say `yolov5`), you can specify the config for your AutoML Image runs as follows:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.automl.core.shared.constants import ImageTask\n",
|
||||
"from azureml.train.automl import AutoMLImageConfig\n",
|
||||
"from azureml.train.hyperdrive import GridParameterSampling, choice\n",
|
||||
"\n",
|
||||
"image_config_yolov5 = AutoMLImageConfig(\n",
|
||||
" task=ImageTask.IMAGE_OBJECT_DETECTION,\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" training_data=training_dataset,\n",
|
||||
" validation_data=validation_dataset,\n",
|
||||
" hyperparameter_sampling=GridParameterSampling({\"model_name\": choice(\"yolov5\")}),\n",
|
||||
" iterations=1,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Submitting an AutoML run for Computer Vision tasks\n",
|
||||
"Once you've created the config settings for your run, you can submit an AutoML run using the config in order to train a vision model using your training dataset."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"automl_image_run = experiment.submit(image_config_yolov5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"automl_image_run.wait_for_completion(wait_post_processing=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Hyperparameter sweeping for your AutoML models for computer vision tasks\n",
|
||||
"\n",
|
||||
"In this example, we use the AutoMLImageConfig to train an Object Detection model using `yolov5` and `fasterrcnn_resnet50_fpn`, both of which are pretrained on COCO, a large-scale object detection, segmentation, and captioning dataset that contains over 200K labeled images with over 80 label categories.\n",
|
||||
"\n",
|
||||
"When using AutoML for Images, you can perform a hyperparameter sweep over a defined parameter space to find the optimal model. In this example, we sweep over the hyperparameters for each algorithm, choosing from a range of values for `learning_rate`, `optimizer`, `lr_scheduler`, etc., to generate a model with the optimal primary metric. If hyperparameter values are not specified, then default values are used for the specified algorithm.\n",
|
||||
"\n",
|
||||
"We use Random Sampling to pick samples from this parameter space and try a total of 10 iterations with these different samples, running 2 iterations at a time on our compute target, which has been previously set up using 4 nodes. Please note that the more parameters the space has, the more iterations you need to find optimal models.\n",
|
||||
"\n",
|
||||
"We leverage the Bandit early termination policy, which terminates poorly performing configs (those that are not within 20% slack of the best performing config), thus significantly saving compute resources.\n",
|
||||
"\n",
|
||||
"For more details on model and hyperparameter sweeping, please refer to the [documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-tune-hyperparameters)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.automl.core.shared.constants import ImageTask\n",
|
||||
"from azureml.train.automl import AutoMLImageConfig\n",
|
||||
"from azureml.train.hyperdrive import BanditPolicy, RandomParameterSampling\n",
|
||||
"from azureml.train.hyperdrive import choice, uniform\n",
|
||||
"\n",
|
||||
"parameter_space = {\n",
|
||||
" \"model\": choice(\n",
|
||||
" {\n",
|
||||
" \"model_name\": choice(\"yolov5\"),\n",
|
||||
" \"learning_rate\": uniform(0.0001, 0.01),\n",
|
||||
" \"model_size\": choice(\"small\", \"medium\"), # model-specific\n",
|
||||
" #'img_size': choice(640, 704, 768), # model-specific; might need GPU with large memory\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"model_name\": choice(\"fasterrcnn_resnet50_fpn\"),\n",
|
||||
" \"learning_rate\": uniform(0.0001, 0.001),\n",
|
||||
" \"optimizer\": choice(\"sgd\", \"adam\", \"adamw\"),\n",
|
||||
" \"min_size\": choice(600, 800), # model-specific\n",
|
||||
" #'warmup_cosine_lr_warmup_epochs': choice(0, 3),\n",
|
||||
" },\n",
|
||||
" ),\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"tuning_settings = {\n",
|
||||
" \"iterations\": 10,\n",
|
||||
" \"max_concurrent_iterations\": 2,\n",
|
||||
" \"hyperparameter_sampling\": RandomParameterSampling(parameter_space),\n",
|
||||
" \"early_termination_policy\": BanditPolicy(\n",
|
||||
" evaluation_interval=2, slack_factor=0.2, delay_evaluation=6\n",
|
||||
" ),\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"automl_image_config = AutoMLImageConfig(\n",
|
||||
" task=ImageTask.IMAGE_OBJECT_DETECTION,\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" training_data=training_dataset,\n",
|
||||
" validation_data=validation_dataset,\n",
|
||||
" **tuning_settings,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"automl_image_run = experiment.submit(automl_image_config)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"automl_image_run.wait_for_completion(wait_post_processing=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"When doing a hyperparameter sweep, it can be useful to visualize the different configurations that were tried using the HyperDrive UI. You can navigate to this UI by going to the 'Child runs' tab in the UI of the main `automl_image_run` from above, which is the HyperDrive parent run. Then you can go into the 'Child runs' tab of this HyperDrive parent run. Alternatively, the cell below displays the HyperDrive parent run directly, so you can navigate to its 'Child runs' tab from there:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core import Run\n",
|
||||
"\n",
|
||||
"hyperdrive_run = Run(experiment=experiment, run_id=automl_image_run.id + \"_HD\")\n",
|
||||
"hyperdrive_run"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Register the optimal vision model from the AutoML run\n",
|
||||
"Once the run completes, we can register the model that was created from the best run (configuration that resulted in the best primary metric)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Register the model from the best run\n",
|
||||
"\n",
|
||||
"best_child_run = automl_image_run.get_best_child()\n",
|
||||
"model_name = best_child_run.properties[\"model_name\"]\n",
|
||||
"model = best_child_run.register_model(\n",
|
||||
" model_name=model_name, model_path=\"outputs/model.pt\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Deploy model as a web service\n",
|
||||
"Once you have your trained model, you can deploy the model on Azure. You can deploy your trained model as a web service on Azure Container Instances ([ACI](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-deploy-azure-container-instance)) or Azure Kubernetes Service ([AKS](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-deploy-azure-kubernetes-service)). Please note that ACI only supports small models under 1 GB in size. For testing larger models or for the high-scale production stage, we recommend using AKS.\n",
|
||||
"In this tutorial, we will deploy the model as a web service in AKS."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You will need to first create an AKS compute cluster or use an existing AKS cluster. You can use either GPU or CPU VM SKUs for your deployment cluster"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.compute import ComputeTarget, AksCompute\n",
|
||||
"from azureml.exceptions import ComputeTargetException\n",
|
||||
"\n",
|
||||
"# Choose a name for your cluster\n",
|
||||
"aks_name = \"aks-cpu-od\"\n",
|
||||
"# Check to see if the cluster already exists\n",
|
||||
"try:\n",
|
||||
" aks_target = ComputeTarget(workspace=ws, name=aks_name)\n",
|
||||
" print(\"Found existing compute target\")\n",
|
||||
"except ComputeTargetException:\n",
|
||||
" print(\"Creating a new compute target...\")\n",
|
||||
" # Provision AKS cluster with a CPU machine\n",
|
||||
" prov_config = AksCompute.provisioning_configuration(vm_size=\"STANDARD_D3_V2\")\n",
|
||||
" # Create the cluster\n",
|
||||
" aks_target = ComputeTarget.create(\n",
|
||||
" workspace=ws, name=aks_name, provisioning_configuration=prov_config\n",
|
||||
" )\n",
|
||||
" aks_target.wait_for_completion(show_output=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Next, you will need to define the [inference configuration](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-auto-train-image-models#update-inference-configuration), that describes how to set up the web-service containing your model. You can use the scoring script and the environment from the training run in your inference config.\n",
|
||||
"\n",
|
||||
"<b>Note:</b> To change the model's settings, open the downloaded scoring script and modify the model_settings variable <i>before</i> deploying the model."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.model import InferenceConfig\n",
|
||||
"\n",
|
||||
"best_child_run.download_file(\n",
|
||||
" \"outputs/scoring_file_v_1_0_0.py\", output_file_path=\"score.py\"\n",
|
||||
")\n",
|
||||
"environment = best_child_run.get_environment()\n",
|
||||
"inference_config = InferenceConfig(entry_script=\"score.py\", environment=environment)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You can then deploy the model as an AKS web service."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Deploy the model from the best run as an AKS web service\n",
|
||||
"from azureml.core.webservice import AksWebservice\n",
|
||||
"from azureml.core.model import Model\n",
|
||||
"\n",
|
||||
"aks_config = AksWebservice.deploy_configuration(\n",
|
||||
" autoscale_enabled=True, cpu_cores=1, memory_gb=5, enable_app_insights=True\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"aks_service = Model.deploy(\n",
|
||||
" ws,\n",
|
||||
" models=[model],\n",
|
||||
" inference_config=inference_config,\n",
|
||||
" deployment_config=aks_config,\n",
|
||||
" deployment_target=aks_target,\n",
|
||||
" name=\"automl-image-test-cpu-od\",\n",
|
||||
" overwrite=True,\n",
|
||||
")\n",
|
||||
"aks_service.wait_for_deployment(show_output=True)\n",
|
||||
"print(aks_service.state)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Test the web service\n",
|
||||
"Finally, let's test our deployed web service to predict new images. You can pass in any image. In this case, we'll use a random image from the dataset and pass it to the scoring URI."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"# URL for the web service\n",
|
||||
"scoring_uri = aks_service.scoring_uri\n",
|
||||
"\n",
|
||||
"# If the service is authenticated, set the key or token\n",
|
||||
"key, _ = aks_service.get_keys()\n",
|
||||
"\n",
|
||||
"sample_image = \"./test_image.jpg\"\n",
|
||||
"\n",
|
||||
"# Load image data\n",
|
||||
"data = open(sample_image, \"rb\").read()\n",
|
||||
"\n",
|
||||
"# Set the content type\n",
|
||||
"headers = {\"Content-Type\": \"application/octet-stream\"}\n",
|
||||
"\n",
|
||||
"# If authentication is enabled, set the authorization header\n",
|
||||
"headers[\"Authorization\"] = f\"Bearer {key}\"\n",
|
||||
"\n",
|
||||
"# Make the request and display the response\n",
|
||||
"resp = requests.post(scoring_uri, data, headers=headers)\n",
|
||||
"print(resp.text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Visualize detections\n",
|
||||
"Now that we have scored a test image, we can visualize the bounding boxes for this image"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%matplotlib inline\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"import matplotlib.image as mpimg\n",
|
||||
"import matplotlib.patches as patches\n",
|
||||
"from PIL import Image\n",
|
||||
"import numpy as np\n",
|
||||
"import json\n",
|
||||
"\n",
|
||||
"IMAGE_SIZE = (18, 12)\n",
|
||||
"plt.figure(figsize=IMAGE_SIZE)\n",
|
||||
"img_np = mpimg.imread(sample_image)\n",
|
||||
"img = Image.fromarray(img_np.astype(\"uint8\"), \"RGB\")\n",
|
||||
"x, y = img.size\n",
|
||||
"\n",
|
||||
"fig, ax = plt.subplots(1, figsize=(15, 15))\n",
|
||||
"# Display the image\n",
|
||||
"ax.imshow(img_np)\n",
|
||||
"\n",
|
||||
"# draw box and label for each detection\n",
|
||||
"detections = json.loads(resp.text)\n",
|
||||
"for detect in detections[\"boxes\"]:\n",
|
||||
" label = detect[\"label\"]\n",
|
||||
" box = detect[\"box\"]\n",
|
||||
" conf_score = detect[\"score\"]\n",
|
||||
" if conf_score > 0.6:\n",
|
||||
" ymin, xmin, ymax, xmax = (\n",
|
||||
" box[\"topY\"],\n",
|
||||
" box[\"topX\"],\n",
|
||||
" box[\"bottomY\"],\n",
|
||||
" box[\"bottomX\"],\n",
|
||||
" )\n",
|
||||
" topleft_x, topleft_y = x * xmin, y * ymin\n",
|
||||
" width, height = x * (xmax - xmin), y * (ymax - ymin)\n",
|
||||
" print(\n",
|
||||
" \"{}: [{}, {}, {}, {}], {}\".format(\n",
|
||||
" detect[\"label\"],\n",
|
||||
" round(topleft_x, 3),\n",
|
||||
" round(topleft_y, 3),\n",
|
||||
" round(width, 3),\n",
|
||||
" round(height, 3),\n",
|
||||
" round(conf_score, 3),\n",
|
||||
" )\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" color = np.random.rand(3) #'red'\n",
|
||||
" rect = patches.Rectangle(\n",
|
||||
" (topleft_x, topleft_y),\n",
|
||||
" width,\n",
|
||||
" height,\n",
|
||||
" linewidth=3,\n",
|
||||
" edgecolor=color,\n",
|
||||
" facecolor=\"none\",\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" ax.add_patch(rect)\n",
|
||||
" plt.text(topleft_x, topleft_y - 10, label, color=color, fontsize=20)\n",
|
||||
"plt.show()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python3-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.10"
|
||||
},
|
||||
"nteract": {
|
||||
"version": "nteract-front-end@1.0.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -0,0 +1,127 @@
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
# Define Converters
|
||||
|
||||
|
||||
class CocoToJSONLinesConverter:
    """Base class for converters that turn COCO data into JSON Lines records."""

    def convert(self):
        """Return the converted records; subclasses must override this."""
        raise NotImplementedError


class BoundingBoxConverter(CocoToJSONLinesConverter):
    """Convert COCO object-detection annotations into JSON Lines records.

    One record is produced per entry in ``coco_data["images"]``; each record
    has the keys ``image_url``, ``image_details`` and ``label`` (a list of
    bounding-box dictionaries).
    """

    def __init__(self, coco_data):
        """Prepare empty output records and the category-id -> name lookup.

        :param coco_data: parsed COCO dictionary; the keys ``images`` and
            ``categories`` are read here, ``annotations`` is read by ``convert``
        :type coco_data: dict
        """
        self.coco_data = coco_data
        # Filled by _populate_image_url: COCO image id -> index into json_lines_data.
        self.image_id_to_data_index = {}
        # One placeholder record per image, in the same order as coco_data["images"].
        self.json_lines_data = [
            {"image_url": "", "image_details": {}, "label": []}
            for _ in coco_data["images"]
        ]
        self.categories = {
            category["id"]: category["name"] for category in coco_data["categories"]
        }

    def _populate_image_url(self, index, coco_image):
        """Store the image file name as the record URL and remember its index."""
        self.json_lines_data[index]["image_url"] = coco_image["file_name"]
        self.image_id_to_data_index[coco_image["id"]] = index

    def _populate_image_details(self, index, coco_image):
        """Fill in format (text after the last '.'), width and height."""
        file_name = coco_image["file_name"]
        details = self.json_lines_data[index]["image_details"]
        details["format"] = file_name[file_name.rfind(".") + 1 :]
        details["width"] = coco_image["width"]
        details["height"] = coco_image["height"]

    def _populate_bbox_in_label(self, label, annotation, image_details):
        """Write normalized corner coordinates of the annotation box into ``label``.

        ``annotation["bbox"]`` is read as ``[x, y, width, height]``.
        """
        bbox = annotation["bbox"]
        # Heuristic kept from the original: if every value is below 1.5 the box
        # is treated as already normalized, so no division by image size is done.
        if max(bbox) < 1.5:
            width = 1
            height = 1
        else:
            width = image_details["width"]
            height = image_details["height"]
        label["topX"] = bbox[0] / width
        label["topY"] = bbox[1] / height
        label["bottomX"] = (bbox[0] + bbox[2]) / width
        label["bottomY"] = (bbox[1] + bbox[3]) / height

    def _populate_label(self, annotation):
        """Build the label for one annotation and append it to its image record.

        Raises ``KeyError`` if the annotation references an image id or
        category id that is not present in the COCO data.
        """
        index = self.image_id_to_data_index[annotation["image_id"]]
        image_details = self.json_lines_data[index]["image_details"]
        label = {"label": self.categories[annotation["category_id"]]}
        self._populate_bbox_in_label(label, annotation, image_details)
        self._populate_isCrowd(label, annotation)
        self.json_lines_data[index]["label"].append(label)

    def _populate_isCrowd(self, label, annotation):
        """Copy the optional COCO ``iscrowd`` flag into ``label["isCrowd"]``."""
        if "iscrowd" in annotation:
            label["isCrowd"] = annotation["iscrowd"]

    def convert(self):
        """Populate and return the list of JSON Lines records.

        Images are processed first so that every annotation can be routed to
        the record of the image it belongs to.
        """
        for index, coco_image in enumerate(self.coco_data["images"]):
            self._populate_image_url(index, coco_image)
            self._populate_image_details(index, coco_image)
        for annotation in self.coco_data["annotations"]:
            self._populate_label(annotation)
        return self.json_lines_data
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Parse arguments that are passed into the script
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_coco_file_path", type=str, required=True)
    parser.add_argument("--output_dir", type=str, required=True)
    parser.add_argument("--output_file_name", type=str, required=True)
    # Only one task type is supported, so it is optional and defaults to it
    # (a default combined with required=True would never be used).
    parser.add_argument(
        "--task_type",
        type=str,
        choices=["ObjectDetection"],
        default="ObjectDetection",
    )
    # Optional prefix that replaces the directory part of every image path.
    parser.add_argument("--base_url", type=str, default=None)

    args = parser.parse_args()

    input_coco_file_path = args.input_coco_file_path
    output_dir = args.output_dir
    output_file_path = os.path.join(output_dir, args.output_file_name)
    task_type = args.task_type
    base_url = args.base_url

    def read_coco_file(coco_file):
        """Load and return the parsed JSON content of the COCO file."""
        with open(coco_file) as f_in:
            return json.load(f_in)

    def write_json_lines(converter, filename, base_url=None):
        """Run the converter and write one compact JSON object per line.

        When ``base_url`` is given, each record's ``image_url`` becomes
        ``base_url`` followed by the part of the path after the last "/".
        """
        json_lines_data = converter.convert()
        with open(filename, "w") as outfile:
            for json_line in json_lines_data:
                if base_url is not None:
                    image_url = json_line["image_url"]
                    json_line["image_url"] = (
                        base_url + image_url[image_url.rfind("/") + 1 :]
                    )
                json.dump(json_line, outfile, separators=(",", ":"))
                outfile.write("\n")
        print(f"Conversion completed. Converted {len(json_lines_data)} lines.")

    coco_data = read_coco_file(input_coco_file_path)

    print("Converting for {}".format(task_type))

    # Defined in azureml.contrib.dataset.labeled_dataset.LabeledDatasetTask.OBJECT_DETECTION.value
    if task_type == "ObjectDetection":
        converter = BoundingBoxConverter(coco_data)
        write_json_lines(converter, output_file_path, base_url)
    else:
        # Defensive only: argparse ``choices`` already rejects other values.
        print("ERROR: Invalid Task Type")
|
||||
|
After Width: | Height: | Size: 150 KiB |
|
After Width: | Height: | Size: 156 KiB |
@@ -0,0 +1,327 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
import torch
|
||||
import time
|
||||
import torchvision
|
||||
from PIL import Image
|
||||
from typing import Any, Dict, List
|
||||
|
||||
|
||||
def letterbox(
    img,
    new_shape=(640, 640),
    color=(114, 114, 114),
    auto=True,
    scaleFill=False,
    scaleup=True,
):
    """Resize an image to fit ``new_shape`` keeping aspect ratio, then pad it.

    https://github.com/ultralytics/yolov3/issues/232

    :param img: an image
    :type img: <class 'numpy.ndarray'>
    :param new_shape: target shape in [height, width]; a single int is
        expanded to a square (new_shape, new_shape)
    :type new_shape: <class 'int'> or <class 'tuple'>
    :param color: color for pad area
    :type color: <class 'tuple'>
    :param auto: minimum rectangle (padding is reduced modulo 64)
    :type auto: bool
    :param scaleFill: stretch the image without pad (only used when auto is False)
    :type scaleFill: bool
    :param scaleup: scale up
    :type scaleup: bool
    :return: letterbox image, scale ratio, padded area in (width, height) in each side
    :rtype: <class 'numpy.ndarray'>, <class 'tuple'>, <class 'tuple'>
    """
    shape = img.shape[:2]  # current shape [height, width]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)

    # Scale ratio (new / old)
    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
    if not scaleup:  # only scale down, do not scale up (for better test mAP)
        r = min(r, 1.0)

    # Compute padding
    ratio = r, r  # width, height ratios
    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))  # (width, height)
    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
    if auto:  # minimum rectangle
        dw, dh = np.mod(dw, 64), np.mod(dh, 64)  # wh padding
    elif scaleFill:  # stretch
        dw, dh = 0.0, 0.0
        new_unpad = (new_shape[1], new_shape[0])
        ratio = new_shape[1] / shape[1], new_shape[0] / shape[0]  # width, height ratios

    dw /= 2  # divide padding into 2 sides
    dh /= 2

    if shape[::-1] != new_unpad:  # resize
        img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
    # The -0.1 / +0.1 offsets send the extra pixel of an odd total padding
    # to the bottom / right side.
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    img = cv2.copyMakeBorder(
        img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color
    )  # add border
    return img, ratio, (dw, dh)
|
||||
|
||||
|
||||
def clip_coords(boxes, img_shape):
    """Clip xyxy bounding boxes in place to the image shape (height, width)

    :param boxes: bbox in (x1, y1, x2, y2) columns; modified in place
    :type boxes: <class 'torch.Tensor'>
    :param img_shape: image shape
    :type img_shape: <class 'tuple'>: (height, width)
    :return: None
    """
    height, width = img_shape[0], img_shape[1]
    boxes[:, 0].clamp_(0, width)  # x1
    boxes[:, 1].clamp_(0, height)  # y1
    boxes[:, 2].clamp_(0, width)  # x2
    boxes[:, 3].clamp_(0, height)  # y2
|
||||
|
||||
|
||||
def unpad_bbox(boxes, img_shape, pad):
    """Correct bbox coordinates by removing the padded area from letterbox image

    When ``boxes`` is given it is shifted and clipped in place.

    :param boxes: bbox absolute coordinates from prediction, or None
    :type boxes: <class 'torch.Tensor'>
    :param img_shape: image shape
    :type img_shape: <class 'tuple'>: (height, width)
    :param pad: pad used in letterbox image for inference
    :type pad: <class 'tuple'>: (width, height)
    :return: (unpadded) image height and width
    :rtype: <class 'tuple'>: (height, width)
    """
    dw, dh = pad
    # Same rounding as letterbox uses to split the padding between the two sides.
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    img_width = img_shape[1] - (left + right)
    img_height = img_shape[0] - (top + bottom)

    if boxes is not None:
        # Move the origin from the padded image to the unpadded image.
        boxes[:, 0] -= left
        boxes[:, 1] -= top
        boxes[:, 2] -= left
        boxes[:, 3] -= top
        clip_coords(boxes, (img_height, img_width))

    return img_height, img_width
|
||||
|
||||
|
||||
def _convert_to_rcnn_output(output, height, width, pad):
    """Repackage one image's detections as a boxes/labels/scores dictionary.

    :param output: detections as nx6 (x1, y1, x2, y2, conf, cls), or None
    :param height: height of the padded (letterbox) image
    :param width: width of the padded (letterbox) image
    :param pad: pad used in letterbox image, (width, height)
    :return: label dictionary and the unpadded image shape (height, width);
        the dictionary values stay empty lists when ``output`` is None
    """
    # output: nx6 (x1, y1, x2, y2, conf, cls)
    rcnn_label: Dict[str, List[Any]] = {"boxes": [], "labels": [], "scores": []}

    # Adjust bbox to effective image bounds
    # NOTE(review): unpad_bbox edits the slice it is given in place; presumably
    # that slice is a view, so the coordinates in ``output`` change too - confirm.
    img_height, img_width = unpad_bbox(
        output[:, :4] if output is not None else None, (height, width), pad
    )

    if output is not None:
        rcnn_label["boxes"] = output[:, :4]
        rcnn_label["labels"] = output[:, 5:6].long()
        rcnn_label["scores"] = output[:, 4:5]

    return rcnn_label, (img_height, img_width)
|
||||
|
||||
|
||||
def xywh2xyxy(x):
    """Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right

    The input is not modified; a new array/tensor of the same shape is returned.

    :param x: bbox coordinates in [x center, y center, w, h]
    :type x: <class 'numpy.ndarray'> or torch.Tensor
    :return: new bbox coordinates in [x1, y1, x2, y2]
    :rtype: <class 'numpy.ndarray'> or torch.Tensor
    """
    y = torch.zeros_like(x) if isinstance(x, torch.Tensor) else np.zeros_like(x)
    half_w = x[:, 2] / 2
    half_h = x[:, 3] / 2
    y[:, 0] = x[:, 0] - half_w  # top left x
    y[:, 1] = x[:, 1] - half_h  # top left y
    y[:, 2] = x[:, 0] + half_w  # bottom right x
    y[:, 3] = x[:, 1] + half_h  # bottom right y
    return y
|
||||
|
||||
|
||||
def box_iou(box1, box2):
    """Return intersection-over-union (Jaccard index) of boxes.
    Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
    https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py

    :param box1: bbox in (Tensor[N, 4]), N for multiple bboxes and 4 for the box coordinates
    :type box1: <class 'torch.Tensor'>
    :param box2: bbox in (Tensor[M, 4]), M is for multiple bboxes
    :type box2: <class 'torch.Tensor'>
    :return: iou of box1 to box2 in (Tensor[N, M]), the NxM matrix containing the pairwise
        IoU values for every element in boxes1 and boxes2
    :rtype: <class 'torch.Tensor'>
    """

    def box_area(box):
        # box = 4xn
        return (box[2] - box[0]) * (box[3] - box[1])

    area1 = box_area(box1.t())
    area2 = box_area(box2.t())

    # inter(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2)
    right_bottom = torch.min(box1[:, None, 2:], box2[:, 2:])
    left_top = torch.max(box1[:, None, :2], box2[:, :2])
    # clamp(0) turns negative extents (non-overlapping boxes) into zero area
    inter = (right_bottom - left_top).clamp(0).prod(2)

    # iou = inter / (area1 + area2 - inter)
    return inter / (area1[:, None] + area2 - inter)
|
||||
|
||||
|
||||
def non_max_suppression(
    prediction,
    conf_thres=0.1,
    iou_thres=0.6,
    multi_label=False,
    merge=False,
    classes=None,
    agnostic=False,
):
    """Performs per-class Non-Maximum Suppression (NMS) on inference results

    :param prediction: predictions
    :type prediction: <class 'torch.Tensor'>
    :param conf_thres: confidence threshold
    :type conf_thres: float
    :param iou_thres: IoU threshold
    :type iou_thres: float
    :param multi_label: enable to have multiple labels in each box?
    :type multi_label: bool
    :param merge: Merge NMS (boxes merged using weighted mean)
    :type merge: bool
    :param classes: specific target class
    :type classes:
    :param agnostic: enable class agnostic NMS?
    :type agnostic: bool
    :return: one entry per image; each entry is the detections with shape
        nx6 (x1, y1, x2, y2, conf, cls), or None when the image has no
        detection left or was not reached before the time limit
    :rtype: <class 'list'>
    """
    if prediction.dtype is torch.float16:
        prediction = prediction.float()  # to FP32

    nc = prediction[0].shape[1] - 5  # number of classes
    xc = prediction[..., 4] > conf_thres  # candidates

    # min_wh = 2
    max_wh = 4096  # (pixels) maximum box width and height
    max_det = 300  # maximum number of detections per image
    time_limit = 10.0  # seconds to quit after
    redundant = True  # require redundant detections
    if multi_label and nc < 2:
        multi_label = False  # multiple labels per box (adds 0.5ms/img)

    t = time.time()
    output = [None] * prediction.shape[0]
    for xi, x in enumerate(prediction):  # image index, image inference
        # Apply constraints
        # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0  # width-height
        x = x[xc[xi]]  # confidence

        # If none remain process next image
        if not x.shape[0]:
            continue

        # Compute conf
        x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf

        # Box (center x, center y, width, height) to (x1, y1, x2, y2)
        box = xywh2xyxy(x[:, :4])

        # Detections matrix nx6 (xyxy, conf, cls)
        if multi_label:
            i, j = (x[:, 5:] > conf_thres).nonzero().t()
            x = torch.cat((box[i], x[i, j + 5, None], j[:, None].float()), 1)
        else:  # best class only
            conf, j = x[:, 5:].max(1, keepdim=True)
            x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres]

        # Filter by class
        if classes:
            x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]

        # Apply finite constraint
        # if not torch.isfinite(x).all():
        #     x = x[torch.isfinite(x).all(1)]

        # If none remain process next image
        n = x.shape[0]  # number of boxes
        if not n:
            continue

        # Sort by confidence
        # x = x[x[:, 4].argsort(descending=True)]

        # Batched NMS
        # Offsetting every box by class * max_wh keeps boxes of different
        # classes from overlapping, so a single NMS call acts per class.
        c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
        boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
        i = torchvision.ops.boxes.nms(boxes, scores, iou_thres)
        if i.shape[0] > max_det:  # limit detections
            i = i[:max_det]
        if merge and (1 < n < 3e3):
            try:  # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
                iou = box_iou(boxes[i], boxes) > iou_thres  # iou matrix
                weights = iou * scores[None]  # box weights
                x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(
                    1, keepdim=True
                )  # merged boxes
                if redundant:
                    i = i[iou.sum(1) > 1]  # require redundancy
            except Exception:  # possible CUDA error https://github.com/ultralytics/yolov3/issues/1139
                # Best effort: report and fall back to the unmerged NMS result.
                print(
                    "[WARNING: possible CUDA error ({} {} {} {})]".format(
                        x, i, x.shape, i.shape
                    )
                )

        output[xi] = x[i]
        if (time.time() - t) > time_limit:
            break  # time limit exceeded

    return output
|
||||
|
||||
|
||||
def _read_image(ignore_data_errors: bool, image_url: str, use_cv2: bool = False):
|
||||
try:
|
||||
if use_cv2:
|
||||
# cv2 can return None in some error cases
|
||||
img = cv2.imread(image_url) # BGR
|
||||
if img is None:
|
||||
print("cv2.imread returned None")
|
||||
return img
|
||||
else:
|
||||
image = Image.open(image_url).convert("RGB")
|
||||
return image
|
||||
except Exception as ex:
|
||||
if ignore_data_errors:
|
||||
msg = "Exception occurred when trying to read the image. This file will be ignored."
|
||||
print(msg)
|
||||
else:
|
||||
print(str(ex), has_pii=True)
|
||||
return None
|
||||
|
||||
|
||||
def preprocess(image_url, img_size=640):
    """Load an image and turn it into a normalized, batched model input.

    :param image_url: path of the image to read
    :type image_url: str
    :param img_size: target letterbox size passed to ``letterbox`` as new_shape
    :type img_size: int
    :return: on success ``(np_image, pad)`` where np_image is a float32 array
        with a leading batch axis and values scaled by 1/255, and pad is the
        padding reported by ``letterbox``; when the image cannot be read
        ``(image_url, None, None)``
    """
    img0 = _read_image(
        ignore_data_errors=False, image_url=image_url, use_cv2=True
    )  # cv2.imread(image_url)  # BGR
    if img0 is None:
        # NOTE(review): this failure path returns three values while the
        # success path returns two; kept as is for compatibility - confirm
        # what callers expect before changing it.
        return image_url, None, None

    img, ratio, pad = letterbox(img0, new_shape=img_size, auto=False, scaleup=False)

    # Convert
    img = img[:, :, ::-1].transpose(2, 0, 1)  # BGR to RGB, to 3x640x640
    img = np.ascontiguousarray(img)
    # Add the batch axis directly on the ndarray; the previous round trip
    # through torch.from_numpy produced the same array.
    np_image = np.expand_dims(img, axis=0)
    np_image = np_image.astype(np.float32) / 255.0
    return np_image, pad
|
||||
@@ -1,21 +1,5 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -90,16 +74,6 @@
|
||||
"This sample notebook may use features that are not available in previous versions of the Azure ML SDK."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"This notebook was created using version 1.38.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
@@ -109,18 +83,18 @@
|
||||
"ws = Workspace.from_config()\n",
|
||||
"\n",
|
||||
"# choose a name for experiment\n",
|
||||
"experiment_name = 'automl-classification-ccard-local'\n",
|
||||
"experiment_name = \"automl-classification-ccard-local\"\n",
|
||||
"\n",
|
||||
"experiment = Experiment(ws, experiment_name)\n",
|
||||
"\n",
|
||||
"output = {}\n",
|
||||
"output['Subscription ID'] = ws.subscription_id\n",
|
||||
"output['Workspace'] = ws.name\n",
|
||||
"output['Resource Group'] = ws.resource_group\n",
|
||||
"output['Location'] = ws.location\n",
|
||||
"output['Experiment Name'] = experiment.name\n",
|
||||
"pd.set_option('display.max_colwidth', -1)\n",
|
||||
"outputDf = pd.DataFrame(data = output, index = [''])\n",
|
||||
"output[\"Subscription ID\"] = ws.subscription_id\n",
|
||||
"output[\"Workspace\"] = ws.name\n",
|
||||
"output[\"Resource Group\"] = ws.resource_group\n",
|
||||
"output[\"Location\"] = ws.location\n",
|
||||
"output[\"Experiment Name\"] = experiment.name\n",
|
||||
"pd.set_option(\"display.max_colwidth\", -1)\n",
|
||||
"outputDf = pd.DataFrame(data=output, index=[\"\"])\n",
|
||||
"outputDf.T"
|
||||
]
|
||||
},
|
||||
@@ -142,7 +116,7 @@
|
||||
"data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/creditcard.csv\"\n",
|
||||
"dataset = Dataset.Tabular.from_delimited_files(data)\n",
|
||||
"training_data, validation_data = dataset.random_split(percentage=0.8, seed=223)\n",
|
||||
"label_column_name = 'Class'"
|
||||
"label_column_name = \"Class\""
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -173,17 +147,18 @@
|
||||
"source": [
|
||||
"automl_settings = {\n",
|
||||
" \"n_cross_validations\": 3,\n",
|
||||
" \"primary_metric\": 'AUC_weighted',\n",
|
||||
" \"primary_metric\": \"average_precision_score_weighted\",\n",
|
||||
" \"experiment_timeout_hours\": 0.25, # This is a time limit for testing purposes, remove it for real use cases, this will drastically limit ability to find the best model possible\n",
|
||||
" \"verbosity\": logging.INFO,\n",
|
||||
" \"enable_stack_ensemble\": False\n",
|
||||
" \"enable_stack_ensemble\": False,\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"automl_config = AutoMLConfig(task = 'classification',\n",
|
||||
" debug_log = 'automl_errors.log',\n",
|
||||
"automl_config = AutoMLConfig(\n",
|
||||
" task=\"classification\",\n",
|
||||
" debug_log=\"automl_errors.log\",\n",
|
||||
" training_data=training_data,\n",
|
||||
" label_column_name=label_column_name,\n",
|
||||
" **automl_settings\n",
|
||||
" **automl_settings,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
@@ -240,6 +215,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.widgets import RunDetails\n",
|
||||
"\n",
|
||||
"RunDetails(local_run).show()"
|
||||
]
|
||||
},
|
||||
@@ -288,8 +264,12 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# convert the test data to dataframe\n",
|
||||
"X_test_df = validation_data.drop_columns(columns=[label_column_name]).to_pandas_dataframe()\n",
|
||||
"y_test_df = validation_data.keep_columns(columns=[label_column_name], validate=True).to_pandas_dataframe()"
|
||||
"X_test_df = validation_data.drop_columns(\n",
|
||||
" columns=[label_column_name]\n",
|
||||
").to_pandas_dataframe()\n",
|
||||
"y_test_df = validation_data.keep_columns(\n",
|
||||
" columns=[label_column_name], validate=True\n",
|
||||
").to_pandas_dataframe()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -324,19 +304,25 @@
|
||||
"import itertools\n",
|
||||
"\n",
|
||||
"cf = confusion_matrix(y_test_df.values, y_pred)\n",
|
||||
"plt.imshow(cf,cmap=plt.cm.Blues,interpolation='nearest')\n",
|
||||
"plt.imshow(cf, cmap=plt.cm.Blues, interpolation=\"nearest\")\n",
|
||||
"plt.colorbar()\n",
|
||||
"plt.title('Confusion Matrix')\n",
|
||||
"plt.xlabel('Predicted')\n",
|
||||
"plt.ylabel('Actual')\n",
|
||||
"class_labels = ['False','True']\n",
|
||||
"plt.title(\"Confusion Matrix\")\n",
|
||||
"plt.xlabel(\"Predicted\")\n",
|
||||
"plt.ylabel(\"Actual\")\n",
|
||||
"class_labels = [\"False\", \"True\"]\n",
|
||||
"tick_marks = np.arange(len(class_labels))\n",
|
||||
"plt.xticks(tick_marks, class_labels)\n",
|
||||
"plt.yticks([-0.5,0,1,1.5],['','False','True',''])\n",
|
||||
"plt.yticks([-0.5, 0, 1, 1.5], [\"\", \"False\", \"True\", \"\"])\n",
|
||||
"# plotting text value inside cells\n",
|
||||
"thresh = cf.max() / 2.\n",
|
||||
"thresh = cf.max() / 2.0\n",
|
||||
"for i, j in itertools.product(range(cf.shape[0]), range(cf.shape[1])):\n",
|
||||
" plt.text(j,i,format(cf[i,j],'d'),horizontalalignment='center',color='white' if cf[i,j] >thresh else 'black')\n",
|
||||
" plt.text(\n",
|
||||
" j,\n",
|
||||
" i,\n",
|
||||
" format(cf[i, j], \"d\"),\n",
|
||||
" horizontalalignment=\"center\",\n",
|
||||
" color=\"white\" if cf[i, j] > thresh else \"black\",\n",
|
||||
" )\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
@@ -363,7 +349,10 @@
|
||||
"client = ExplanationClient.from_run(best_run)\n",
|
||||
"engineered_explanations = client.download_model_explanation(raw=False)\n",
|
||||
"print(engineered_explanations.get_feature_importance_dict())\n",
|
||||
"print(\"You can visualize the engineered explanations under the 'Explanations (preview)' tab in the AutoML run at:-\\n\" + best_run.get_portal_url())"
|
||||
"print(\n",
|
||||
" \"You can visualize the engineered explanations under the 'Explanations (preview)' tab in the AutoML run at:-\\n\"\n",
|
||||
" + best_run.get_portal_url()\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -382,7 +371,10 @@
|
||||
"source": [
|
||||
"raw_explanations = client.download_model_explanation(raw=True)\n",
|
||||
"print(raw_explanations.get_feature_importance_dict())\n",
|
||||
"print(\"You can visualize the raw explanations under the 'Explanations (preview)' tab in the AutoML run at:-\\n\" + best_run.get_portal_url())"
|
||||
"print(\n",
|
||||
" \"You can visualize the raw explanations under the 'Explanations (preview)' tab in the AutoML run at:-\\n\"\n",
|
||||
" + best_run.get_portal_url()\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -398,7 +390,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"automl_run, fitted_model = local_run.get_output(metric='accuracy')"
|
||||
"automl_run, fitted_model = local_run.get_output(metric=\"accuracy\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -432,12 +424,18 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.train.automl.runtime.automl_explain_utilities import automl_setup_model_explanations\n",
|
||||
"from azureml.train.automl.runtime.automl_explain_utilities import (\n",
|
||||
" automl_setup_model_explanations,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"automl_explainer_setup_obj = automl_setup_model_explanations(fitted_model, X=X_train, \n",
|
||||
" X_test=X_test, y=y_train, \n",
|
||||
" task='classification',\n",
|
||||
" automl_run=automl_run)"
|
||||
"automl_explainer_setup_obj = automl_setup_model_explanations(\n",
|
||||
" fitted_model,\n",
|
||||
" X=X_train,\n",
|
||||
" X_test=X_test,\n",
|
||||
" y=y_train,\n",
|
||||
" task=\"classification\",\n",
|
||||
" automl_run=automl_run,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -455,13 +453,18 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.interpret.mimic_wrapper import MimicWrapper\n",
|
||||
"explainer = MimicWrapper(ws, automl_explainer_setup_obj.automl_estimator,\n",
|
||||
"\n",
|
||||
"explainer = MimicWrapper(\n",
|
||||
" ws,\n",
|
||||
" automl_explainer_setup_obj.automl_estimator,\n",
|
||||
" explainable_model=automl_explainer_setup_obj.surrogate_model,\n",
|
||||
" init_dataset=automl_explainer_setup_obj.X_transform, run=automl_explainer_setup_obj.automl_run,\n",
|
||||
" init_dataset=automl_explainer_setup_obj.X_transform,\n",
|
||||
" run=automl_explainer_setup_obj.automl_run,\n",
|
||||
" features=automl_explainer_setup_obj.engineered_feature_names,\n",
|
||||
" feature_maps=[automl_explainer_setup_obj.feature_map],\n",
|
||||
" classes=automl_explainer_setup_obj.classes,\n",
|
||||
" explainer_kwargs=automl_explainer_setup_obj.surrogate_model_params)"
|
||||
" explainer_kwargs=automl_explainer_setup_obj.surrogate_model_params,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -479,9 +482,14 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Compute the engineered explanations\n",
|
||||
"engineered_explanations = explainer.explain(['local', 'global'], eval_dataset=automl_explainer_setup_obj.X_test_transform)\n",
|
||||
"engineered_explanations = explainer.explain(\n",
|
||||
" [\"local\", \"global\"], eval_dataset=automl_explainer_setup_obj.X_test_transform\n",
|
||||
")\n",
|
||||
"print(engineered_explanations.get_feature_importance_dict())\n",
|
||||
"print(\"You can visualize the engineered explanations under the 'Explanations (preview)' tab in the AutoML run at:-\\n\" + automl_run.get_portal_url())"
|
||||
"print(\n",
|
||||
" \"You can visualize the engineered explanations under the 'Explanations (preview)' tab in the AutoML run at:-\\n\"\n",
|
||||
" + automl_run.get_portal_url()\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -499,12 +507,18 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Compute the raw explanations\n",
|
||||
"raw_explanations = explainer.explain(['local', 'global'], get_raw=True,\n",
|
||||
"raw_explanations = explainer.explain(\n",
|
||||
" [\"local\", \"global\"],\n",
|
||||
" get_raw=True,\n",
|
||||
" raw_feature_names=automl_explainer_setup_obj.raw_feature_names,\n",
|
||||
" eval_dataset=automl_explainer_setup_obj.X_test_transform,\n",
|
||||
" raw_eval_dataset=automl_explainer_setup_obj.X_test_raw)\n",
|
||||
" raw_eval_dataset=automl_explainer_setup_obj.X_test_raw,\n",
|
||||
")\n",
|
||||
"print(raw_explanations.get_feature_importance_dict())\n",
|
||||
"print(\"You can visualize the raw explanations under the 'Explanations (preview)' tab in the AutoML run at:-\\n\" + automl_run.get_portal_url())"
|
||||
"print(\n",
|
||||
" \"You can visualize the raw explanations under the 'Explanations (preview)' tab in the AutoML run at:-\\n\"\n",
|
||||
" + automl_run.get_portal_url()\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -524,15 +538,17 @@
|
||||
"import joblib\n",
|
||||
"\n",
|
||||
"# Initialize the ScoringExplainer\n",
|
||||
"scoring_explainer = TreeScoringExplainer(explainer.explainer, feature_maps=[automl_explainer_setup_obj.feature_map])\n",
|
||||
"scoring_explainer = TreeScoringExplainer(\n",
|
||||
" explainer.explainer, feature_maps=[automl_explainer_setup_obj.feature_map]\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Pickle scoring explainer locally to './scoring_explainer.pkl'\n",
|
||||
"scoring_explainer_file_name = 'scoring_explainer.pkl'\n",
|
||||
"with open(scoring_explainer_file_name, 'wb') as stream:\n",
|
||||
"scoring_explainer_file_name = \"scoring_explainer.pkl\"\n",
|
||||
"with open(scoring_explainer_file_name, \"wb\") as stream:\n",
|
||||
" joblib.dump(scoring_explainer, stream)\n",
|
||||
"\n",
|
||||
"# Upload the scoring explainer to the automl run\n",
|
||||
"automl_run.upload_file('outputs/scoring_explainer.pkl', scoring_explainer_file_name)"
|
||||
"automl_run.upload_file(\"outputs/scoring_explainer.pkl\", scoring_explainer_file_name)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -551,10 +567,12 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Register trained automl model present in the 'outputs' folder in the artifacts\n",
|
||||
"original_model = automl_run.register_model(model_name='automl_model', \n",
|
||||
" model_path='outputs/model.pkl')\n",
|
||||
"scoring_explainer_model = automl_run.register_model(model_name='scoring_explainer',\n",
|
||||
" model_path='outputs/scoring_explainer.pkl')"
|
||||
"original_model = automl_run.register_model(\n",
|
||||
" model_name=\"automl_model\", model_path=\"outputs/model.pkl\"\n",
|
||||
")\n",
|
||||
"scoring_explainer_model = automl_run.register_model(\n",
|
||||
" model_name=\"scoring_explainer\", model_path=\"outputs/scoring_explainer.pkl\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -575,7 +593,7 @@
|
||||
"from azureml.automl.core.shared import constants\n",
|
||||
"from azureml.core.environment import Environment\n",
|
||||
"\n",
|
||||
"automl_run.download_file(constants.CONDA_ENV_FILE_PATH, 'myenv.yml')\n",
|
||||
"automl_run.download_file(constants.CONDA_ENV_FILE_PATH, \"myenv.yml\")\n",
|
||||
"myenv = Environment.from_conda_specification(name=\"myenv\", file_path=\"myenv.yml\")\n",
|
||||
"myenv"
|
||||
]
|
||||
@@ -598,7 +616,9 @@
|
||||
"import joblib\n",
|
||||
"import pandas as pd\n",
|
||||
"from azureml.core.model import Model\n",
|
||||
"from azureml.train.automl.runtime.automl_explain_utilities import automl_setup_model_explanations\n",
|
||||
"from azureml.train.automl.runtime.automl_explain_utilities import (\n",
|
||||
" automl_setup_model_explanations,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def init():\n",
|
||||
@@ -607,28 +627,35 @@
|
||||
"\n",
|
||||
" # Retrieve the path to the model file using the model name\n",
|
||||
" # Assume original model is named original_prediction_model\n",
|
||||
" automl_model_path = Model.get_model_path('automl_model')\n",
|
||||
" scoring_explainer_path = Model.get_model_path('scoring_explainer')\n",
|
||||
" automl_model_path = Model.get_model_path(\"automl_model\")\n",
|
||||
" scoring_explainer_path = Model.get_model_path(\"scoring_explainer\")\n",
|
||||
"\n",
|
||||
" automl_model = joblib.load(automl_model_path)\n",
|
||||
" scoring_explainer = joblib.load(scoring_explainer_path)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def run(raw_data):\n",
|
||||
" data = pd.read_json(raw_data, orient='records') \n",
|
||||
" data = pd.read_json(raw_data, orient=\"records\")\n",
|
||||
" # Make prediction\n",
|
||||
" predictions = automl_model.predict(data)\n",
|
||||
" # Setup for inferencing explanations\n",
|
||||
" automl_explainer_setup_obj = automl_setup_model_explanations(automl_model,\n",
|
||||
" X_test=data, task='classification')\n",
|
||||
" automl_explainer_setup_obj = automl_setup_model_explanations(\n",
|
||||
" automl_model, X_test=data, task=\"classification\"\n",
|
||||
" )\n",
|
||||
" # Retrieve model explanations for engineered explanations\n",
|
||||
" engineered_local_importance_values = scoring_explainer.explain(automl_explainer_setup_obj.X_test_transform)\n",
|
||||
" engineered_local_importance_values = scoring_explainer.explain(\n",
|
||||
" automl_explainer_setup_obj.X_test_transform\n",
|
||||
" )\n",
|
||||
" # Retrieve model explanations for raw explanations\n",
|
||||
" raw_local_importance_values = scoring_explainer.explain(automl_explainer_setup_obj.X_test_transform, get_raw=True)\n",
|
||||
" raw_local_importance_values = scoring_explainer.explain(\n",
|
||||
" automl_explainer_setup_obj.X_test_transform, get_raw=True\n",
|
||||
" )\n",
|
||||
" # You can return any data type as long as it is JSON-serializable\n",
|
||||
" return {'predictions': predictions.tolist(),\n",
|
||||
" 'engineered_local_importance_values': engineered_local_importance_values,\n",
|
||||
" 'raw_local_importance_values': raw_local_importance_values}\n"
|
||||
" return {\n",
|
||||
" \"predictions\": predictions.tolist(),\n",
|
||||
" \"engineered_local_importance_values\": engineered_local_importance_values,\n",
|
||||
" \"raw_local_importance_values\": raw_local_importance_values,\n",
|
||||
" }"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -647,7 +674,7 @@
|
||||
"source": [
|
||||
"from azureml.core.model import InferenceConfig\n",
|
||||
"\n",
|
||||
"inf_config = InferenceConfig(entry_script='score.py', environment=myenv)"
|
||||
"inf_config = InferenceConfig(entry_script=\"score.py\", environment=myenv)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -668,17 +695,17 @@
|
||||
"from azureml.core.compute_target import ComputeTargetException\n",
|
||||
"\n",
|
||||
"# Choose a name for your cluster.\n",
|
||||
"aks_name = 'scoring-explain'\n",
|
||||
"aks_name = \"scoring-explain\"\n",
|
||||
"\n",
|
||||
"# Verify that cluster does not exist already\n",
|
||||
"try:\n",
|
||||
" aks_target = ComputeTarget(workspace=ws, name=aks_name)\n",
|
||||
" print('Found existing cluster, use it.')\n",
|
||||
" print(\"Found existing cluster, use it.\")\n",
|
||||
"except ComputeTargetException:\n",
|
||||
" prov_config = AksCompute.provisioning_configuration(vm_size='STANDARD_D3_V2')\n",
|
||||
" aks_target = ComputeTarget.create(workspace=ws, \n",
|
||||
" name=aks_name,\n",
|
||||
" provisioning_configuration=prov_config)\n",
|
||||
" prov_config = AksCompute.provisioning_configuration(vm_size=\"STANDARD_D3_V2\")\n",
|
||||
" aks_target = ComputeTarget.create(\n",
|
||||
" workspace=ws, name=aks_name, provisioning_configuration=prov_config\n",
|
||||
" )\n",
|
||||
"aks_target.wait_for_completion(show_output=True)"
|
||||
]
|
||||
},
|
||||
@@ -708,14 +735,16 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"aks_service_name ='model-scoring-local-aks'\n",
|
||||
"aks_service_name = \"model-scoring-local-aks\"\n",
|
||||
"\n",
|
||||
"aks_service = Model.deploy(workspace=ws,\n",
|
||||
"aks_service = Model.deploy(\n",
|
||||
" workspace=ws,\n",
|
||||
" name=aks_service_name,\n",
|
||||
" models=[scoring_explainer_model, original_model],\n",
|
||||
" inference_config=inf_config,\n",
|
||||
" deployment_config=aks_config,\n",
|
||||
" deployment_target=aks_target)\n",
|
||||
" deployment_target=aks_target,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"aks_service.wait_for_deployment(show_output=True)\n",
|
||||
"print(aks_service.state)"
|
||||
@@ -752,18 +781,24 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Serialize the first row of the test data into json\n",
|
||||
"X_test_json = X_test_df[:1].to_json(orient='records')\n",
|
||||
"X_test_json = X_test_df[:1].to_json(orient=\"records\")\n",
|
||||
"print(X_test_json)\n",
|
||||
"\n",
|
||||
"# Call the service to get the predictions and the engineered and raw explanations\n",
|
||||
"output = aks_service.run(X_test_json)\n",
|
||||
"\n",
|
||||
"# Print the predicted value\n",
|
||||
"print('predictions:\\n{}\\n'.format(output['predictions']))\n",
|
||||
"print(\"predictions:\\n{}\\n\".format(output[\"predictions\"]))\n",
|
||||
"# Print the engineered feature importances for the predicted value\n",
|
||||
"print('engineered_local_importance_values:\\n{}\\n'.format(output['engineered_local_importance_values']))\n",
|
||||
"print(\n",
|
||||
" \"engineered_local_importance_values:\\n{}\\n\".format(\n",
|
||||
" output[\"engineered_local_importance_values\"]\n",
|
||||
" )\n",
|
||||
")\n",
|
||||
"# Print the raw feature importances for the predicted value\n",
|
||||
"print('raw_local_importance_values:\\n{}\\n'.format(output['raw_local_importance_values']))\n"
|
||||
"print(\n",
|
||||
" \"raw_local_importance_values:\\n{}\\n\".format(output[\"raw_local_importance_values\"])\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -797,14 +832,14 @@
|
||||
"This Credit Card fraud Detection dataset is made available under the Open Database License: http://opendatacommons.org/licenses/odbl/1.0/. Any rights in individual contents of the database are licensed under the Database Contents License: http://opendatacommons.org/licenses/dbcl/1.0/ and is available at: https://www.kaggle.com/mlg-ulb/creditcardfraud\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"The dataset has been collected and analysed during a research collaboration of Worldline and the Machine Learning Group (http://mlg.ulb.ac.be) of ULB (Universit\u00c3\u0192\u00c2\u00a9 Libre de Bruxelles) on big data mining and fraud detection. More details on current and past projects on related topics are available on https://www.researchgate.net/project/Fraud-detection-5 and the page of the DefeatFraud project\n",
|
||||
"The dataset has been collected and analysed during a research collaboration of Worldline and the Machine Learning Group (http://mlg.ulb.ac.be) of ULB (Université Libre de Bruxelles) on big data mining and fraud detection. More details on current and past projects on related topics are available on https://www.researchgate.net/project/Fraud-detection-5 and the page of the DefeatFraud project\n",
|
||||
"Please cite the following works: \n",
|
||||
"\u00c3\u00a2\u00e2\u201a\u00ac\u00c2\u00a2\tAndrea Dal Pozzolo, Olivier Caelen, Reid A. Johnson and Gianluca Bontempi. Calibrating Probability with Undersampling for Unbalanced Classification. In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015\n",
|
||||
"\u00c3\u00a2\u00e2\u201a\u00ac\u00c2\u00a2\tDal Pozzolo, Andrea; Caelen, Olivier; Le Borgne, Yann-Ael; Waterschoot, Serge; Bontempi, Gianluca. Learned lessons in credit card fraud detection from a practitioner perspective, Expert systems with applications,41,10,4915-4928,2014, Pergamon\n",
|
||||
"\u00c3\u00a2\u00e2\u201a\u00ac\u00c2\u00a2\tDal Pozzolo, Andrea; Boracchi, Giacomo; Caelen, Olivier; Alippi, Cesare; Bontempi, Gianluca. Credit card fraud detection: a realistic modeling and a novel learning strategy, IEEE transactions on neural networks and learning systems,29,8,3784-3797,2018,IEEE\n",
|
||||
"•\tAndrea Dal Pozzolo, Olivier Caelen, Reid A. Johnson and Gianluca Bontempi. Calibrating Probability with Undersampling for Unbalanced Classification. In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015\n",
|
||||
"•\tDal Pozzolo, Andrea; Caelen, Olivier; Le Borgne, Yann-Ael; Waterschoot, Serge; Bontempi, Gianluca. Learned lessons in credit card fraud detection from a practitioner perspective, Expert systems with applications,41,10,4915-4928,2014, Pergamon\n",
|
||||
"•\tDal Pozzolo, Andrea; Boracchi, Giacomo; Caelen, Olivier; Alippi, Cesare; Bontempi, Gianluca. Credit card fraud detection: a realistic modeling and a novel learning strategy, IEEE transactions on neural networks and learning systems,29,8,3784-3797,2018,IEEE\n",
|
||||
"o\tDal Pozzolo, Andrea Adaptive Machine learning for credit card fraud detection ULB MLG PhD thesis (supervised by G. Bontempi)\n",
|
||||
"\u00c3\u00a2\u00e2\u201a\u00ac\u00c2\u00a2\tCarcillo, Fabrizio; Dal Pozzolo, Andrea; Le Borgne, Yann-A\u00c3\u0192\u00c2\u00abl; Caelen, Olivier; Mazzer, Yannis; Bontempi, Gianluca. Scarff: a scalable framework for streaming credit card fraud detection with Spark, Information fusion,41, 182-194,2018,Elsevier\n",
|
||||
"\u00c3\u00a2\u00e2\u201a\u00ac\u00c2\u00a2\tCarcillo, Fabrizio; Le Borgne, Yann-A\u00c3\u0192\u00c2\u00abl; Caelen, Olivier; Bontempi, Gianluca. Streaming active learning strategies for real-life credit card fraud detection: assessment and visualization, International Journal of Data Science and Analytics, 5,4,285-300,2018,Springer International Publishing"
|
||||
"•\tCarcillo, Fabrizio; Dal Pozzolo, Andrea; Le Borgne, Yann-Aël; Caelen, Olivier; Mazzer, Yannis; Bontempi, Gianluca. Scarff: a scalable framework for streaming credit card fraud detection with Spark, Information fusion,41, 182-194,2018,Elsevier\n",
|
||||
"•\tCarcillo, Fabrizio; Le Borgne, Yann-Aël; Caelen, Olivier; Bontempi, Gianluca. Streaming active learning strategies for real-life credit card fraud detection: assessment and visualization, International Journal of Data Science and Analytics, 5,4,285-300,2018,Springer International Publishing"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -832,9 +867,9 @@
|
||||
"friendly_name": "Classification of credit card fraudulent transactions using Automated ML",
|
||||
"index_order": 5,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.6 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python3-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -0,0 +1,698 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Automated Machine Learning\n",
|
||||
"_**New metric features in Azure AutoML**_\n",
|
||||
"\n",
|
||||
"## Contents\n",
|
||||
"1. [Introduction](#Introduction)\n",
|
||||
"1. [Setup](#Setup)\n",
|
||||
"1. [Train](#Train)\n",
|
||||
"1. [Results](#Results)\n",
|
||||
"1. [Test](#Test)\n",
|
||||
"1. [Acknowledgements](#Acknowledgements)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Introduction\n",
|
||||
"\n",
|
||||
"In this example notebook we use the sklearn datasets, [digits](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html) and [boston](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_boston.html) to help you get familiar with binary classification metrics and confidence intervals. The goal is to learn how to use these features through the examples. \n",
|
||||
"\n",
|
||||
"This notebook is using remote compute to train the model.\n",
|
||||
"\n",
|
||||
"If you are using an Azure Machine Learning Compute Instance, you are all set. Otherwise, go through the [configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) notebook first if you haven't already to establish your connection to the AzureML Workspace. \n",
|
||||
"\n",
|
||||
"In this notebook you will learn:\n",
|
||||
"1. How to have binary classification metrics calculated for AutoML runs\n",
|
||||
"2. How to find binary classification metrics in UI and how to retrieve the values through code\n",
|
||||
"3. How to have confidence intervals calculated for both classification and regression AutoML runs\n",
|
||||
"4. How to find confidence intervals in UI and how to retrieve the values through code"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Setup\n",
|
||||
"\n",
|
||||
"As part of the setup you have already created an Azure ML `Workspace` object. For Automated ML you will need to create an `Experiment` object, which is a named object in a `Workspace` used to run experiments."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import logging\n",
|
||||
"\n",
|
||||
"import pandas as pd\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"from azureml.core.experiment import Experiment\n",
|
||||
"from azureml.core.workspace import Workspace\n",
|
||||
"from azureml.core.dataset import Dataset\n",
|
||||
"from azureml.train.automl import AutoMLConfig"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ws = Workspace.from_config()\n",
|
||||
"\n",
|
||||
"experiment_name = \"metrics-new-feature-test\"\n",
|
||||
"\n",
|
||||
"experiment = Experiment(ws, experiment_name)\n",
|
||||
"\n",
|
||||
"output = {}\n",
|
||||
"output[\"Subscription ID\"] = ws.subscription_id\n",
|
||||
"output[\"Workspace\"] = ws.name\n",
|
||||
"output[\"Resource Group\"] = ws.resource_group\n",
|
||||
"output[\"Location\"] = ws.location\n",
|
||||
"output[\"Experiment Name\"] = experiment.name\n",
|
||||
"pd.set_option(\"display.max_colwidth\", -1)\n",
|
||||
"outputDf = pd.DataFrame(data=output, index=[\"\"])\n",
|
||||
"outputDf.T"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create or Attach existing AmlCompute\n",
|
||||
"A compute target is required to execute the Automated ML run. In this tutorial, you create AmlCompute as your training compute resource.\n",
|
||||
"\n",
|
||||
"> Note that if you have an AzureML Data Scientist role, you will not have permission to create compute resources. Talk to your workspace or IT admin to create the compute targets described in this section, if they do not already exist.\n",
|
||||
"\n",
|
||||
"#### Creation of AmlCompute takes approximately 5 minutes. \n",
|
||||
"If the AmlCompute with that name is already in your workspace this code will skip the creation process.\n",
|
||||
"As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
|
||||
"from azureml.core.compute_target import ComputeTargetException\n",
|
||||
"\n",
|
||||
"# Choose a name for your CPU cluster\n",
|
||||
"cpu_cluster_name = \"cpu-cluster-1\"\n",
|
||||
"\n",
|
||||
"# Verify that cluster does not exist already\n",
|
||||
"try:\n",
|
||||
" compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)\n",
|
||||
" print(\"Found existing cluster, use it.\")\n",
|
||||
"except ComputeTargetException:\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(\n",
|
||||
" vm_size=\"STANDARD_DS12_V2\", max_nodes=6\n",
|
||||
" )\n",
|
||||
" compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)\n",
|
||||
"\n",
|
||||
"compute_target.wait_for_completion(show_output=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Load Data\n",
|
||||
"\n",
|
||||
"We load datasets from sklearn and save to local files to register them to workspace.\n",
|
||||
"\n",
|
||||
"For classification, we use [digits dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html#sklearn.datasets.load_digits)\n",
|
||||
"\n",
|
||||
"For regression, we use [boston dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_boston.html#sklearn.datasets.load_boston)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import sklearn.datasets\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def load_classification_data():\n",
|
||||
" if os.path.exists(\"./data/digits.csv\"):\n",
|
||||
" print(\"Find downloaded dataset. Loading\")\n",
|
||||
" else:\n",
|
||||
" print(\"Downloading dataset\")\n",
|
||||
" os.makedirs(\"./data\", exist_ok=True)\n",
|
||||
" classification_dataset = sklearn.datasets.load_digits()\n",
|
||||
" X = classification_dataset[\"data\"]\n",
|
||||
" y = classification_dataset[\"target\"]\n",
|
||||
" full_data = np.concatenate([X, y.reshape(-1, 1)], axis=1).astype(\"int\")\n",
|
||||
" columns = [\"feature_{}\".format(i) for i in range(X.shape[1])] + [\"label\"]\n",
|
||||
" full_data = pd.DataFrame(data=full_data, columns=columns)\n",
|
||||
" full_data.to_csv(\"./data/digits.csv\", index=False)\n",
|
||||
" print(\"Dataset downloaded\")\n",
|
||||
" ws = Workspace.from_config()\n",
|
||||
" datastore = ws.get_default_datastore()\n",
|
||||
" datastore.upload(\n",
|
||||
" src_dir=\"./data\", target_path=\"data/new-metric-features/\", overwrite=True\n",
|
||||
" )\n",
|
||||
" data = Dataset.Tabular.from_delimited_files(\n",
|
||||
" path=[(datastore, (\"data/new-metric-features/digits.csv\"))]\n",
|
||||
" )\n",
|
||||
" train, test = data.random_split(percentage=0.8, seed=101)\n",
|
||||
" validation, test = test.random_split(percentage=0.5, seed=47)\n",
|
||||
" return train, validation, test, np.arange(10), \"label\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"(\n",
|
||||
" digit_train,\n",
|
||||
" digit_validation,\n",
|
||||
" digit_test,\n",
|
||||
" labels,\n",
|
||||
" label_column_name,\n",
|
||||
") = load_classification_data()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Binary Classification Metrics\n",
|
||||
"\n",
|
||||
"In this section we will explain how to set parameters for AutoML runs to have binary classification metrics calculated.\n",
|
||||
"\n",
|
||||
"## Binary Classification Metrics\n",
|
||||
"Binary classification metrics will be calculated for AutoML in two cases:\n",
|
||||
"1. There are exactly two classes.\n",
|
||||
"2. parameter `positive_label` in `AutoMLConfig` is specified as an existing class.\n",
|
||||
"\n",
|
||||
"When a `positive_label` is specified for multiclass classification tasks, all other classes will be treated as the negative class when calculating the binary classification metrics.\n",
|
||||
"\n",
|
||||
"When there are exactly two classes, `np.unique()` will be used to sort the classes and the class with the larger index will be used as the positive class. However, we recommend always specifying a `positive_label` when you want to calculate binary classification metrics, to make sure that they are calculated for the correct class. In the example below, we use class `4` as the positive class."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"automl_settings = {\n",
|
||||
" \"primary_metric\": \"AUC_weighted\",\n",
|
||||
" \"enable_early_stopping\": True,\n",
|
||||
" \"max_concurrent_iterations\": 6,\n",
|
||||
" \"experiment_timeout_hours\": 0.25,\n",
|
||||
" \"verbosity\": logging.INFO,\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"automl_config = AutoMLConfig(\n",
|
||||
" task=\"classification\",\n",
|
||||
" debug_log=\"automl_errors.log\",\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" training_data=digit_train,\n",
|
||||
" validation_data=digit_validation,\n",
|
||||
" label_column_name=label_column_name,\n",
|
||||
" positive_label=4, # specify the positive class with this parameter\n",
|
||||
" **automl_settings\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"classification_run = experiment.submit(automl_config, show_output=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"classification_run.wait_for_completion(show_output=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Find Binary Metrics in UI\n",
|
||||
"\n",
|
||||
"After training, you can click the link above to visit the page of this run. You can find all training runs under the `Child runs` tab:\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"Then, under the `Metrics` tab, you can find some metric names that end with `_binary`. They are the binary classification metrics with the specified positive class.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"## Retrieve Binary Metrics with Code\n",
|
||||
"\n",
|
||||
"You can also retrieve the metric values for any training run with code. The returned value will be a dictionary with the structure `{name: value}`. The example below retrieves the metrics of the best trained model."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"best_run, fitted_model = classification_run.get_output()\n",
|
||||
"training_metrics = best_run.get_metrics()\n",
|
||||
"training_metrics[\"AUC_binary\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"With data downloaded, you can also calculate the binary classification metrics with other classes as the positive class. \n",
|
||||
"\n",
|
||||
"To calculate metrics with code, you will need to import Azure AutoML's scoring modules and specify the value of `positive_label` as desired. See the example code below:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.automl.runtime.shared.score import constants, scoring\n",
|
||||
"\n",
|
||||
"test_df = digit_test.to_pandas_dataframe()\n",
|
||||
"y_test = test_df[label_column_name]\n",
|
||||
"test_df = test_df.drop(columns=[label_column_name])\n",
|
||||
"y_pred_proba = fitted_model.predict_proba(test_df)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for positive_label in range(10):\n",
|
||||
" metrics = scoring.score_classification(\n",
|
||||
" y_test,\n",
|
||||
" y_pred_proba,\n",
|
||||
" constants.CLASSIFICATION_SCALAR_SET,\n",
|
||||
" labels,\n",
|
||||
" labels,\n",
|
||||
" positive_label=positive_label,\n",
|
||||
" )\n",
|
||||
" print(\n",
|
||||
" \"AUC_binary for label {} is {:.4f}\".format(\n",
|
||||
" positive_label, metrics[\"AUC_binary\"]\n",
|
||||
" )\n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Wrong Value of `positive_label` Fails the Run\n",
|
||||
"\n",
|
||||
"The value of `positive_label` passed into `AutoMLConfig` must be exactly the same as it is in the dataset. If you pass in a `positive_label` that cannot be found in the training dataset, the run will fail. See the example below, where the correct value `4` is replaced by its string version, `'4'`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"automl_settings = {\n",
|
||||
" \"primary_metric\": \"AUC_weighted\",\n",
|
||||
" \"enable_early_stopping\": True,\n",
|
||||
" \"max_concurrent_iterations\": 6,\n",
|
||||
" \"experiment_timeout_hours\": 0.25,\n",
|
||||
" \"verbosity\": logging.INFO,\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"automl_config = AutoMLConfig(\n",
|
||||
" task=\"classification\",\n",
|
||||
" debug_log=\"automl_errors.log\",\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" training_data=digit_train,\n",
|
||||
" validation_data=digit_validation,\n",
|
||||
" label_column_name=label_column_name,\n",
|
||||
" positive_label=\"4\", # replace the correct integer value with its string version\n",
|
||||
" **automl_settings\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"classification_run = experiment.submit(automl_config, show_output=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"classification_run.wait_for_completion(show_output=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Confidence Interval\n",
|
||||
"\n",
|
||||
"We calculate confidence intervals for metrics by bootstrapping, and we give conservative estimates. Like binary classification metrics, you can find the confidence intervals in the UI and also retrieve them with code. \n",
|
||||
"\n",
|
||||
"To calculate confidence intervals in AutoML runs, we need to pass two other parameters to `AutoMLConfig`:\n",
|
||||
"1. `enable_metric_confidence = True` to tell the run to calculate confidence intervals\n",
|
||||
"2. `test_data` to activate a test run, as confidence intervals will only be calculated for test runs.\n",
|
||||
"\n",
|
||||
"Currently, if the task is classification, only primary metrics will have their confidence intervals logged with the run. To get confidence intervals for other metrics, you can use code. We will provide examples below."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"automl_settings = {\n",
|
||||
" \"primary_metric\": \"AUC_weighted\",\n",
|
||||
" \"enable_early_stopping\": True,\n",
|
||||
" \"max_concurrent_iterations\": 6,\n",
|
||||
" \"experiment_timeout_hours\": 0.25,\n",
|
||||
" \"verbosity\": logging.INFO,\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"automl_config = AutoMLConfig(\n",
|
||||
" task=\"classification\",\n",
|
||||
" debug_log=\"automl_errors.log\",\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" training_data=digit_train,\n",
|
||||
" validation_data=digit_validation,\n",
|
||||
" test_data=digit_test, # if you only have a test set, you can pass validation set here, instead of at validation_data\n",
|
||||
" label_column_name=label_column_name,\n",
|
||||
" enable_metric_confidence=True,\n",
|
||||
" **automl_settings\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"classification_run = experiment.submit(automl_config, show_output=False)\n",
|
||||
"classification_run.wait_for_completion(show_output=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Find Confidence Interval in UI\n",
|
||||
"\n",
|
||||
"To locate the confidence intervals in the UI, we must first find the run that produced the best model, as only the best model is run on the test set. To do so, click the link above for the AutoML run and go to the `Models` tab. The model listed at the top is the one with the best performance:\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"Then, for this best model, go to its `Child runs` tab and click the run labeled `Test model`.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"For this test run, under tab `Metrics`, you can find some metrics whose names end with `extras`. By switching `View as` from `Chart` to `Table`, you can find the confidence intervals for those metrics.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"## Find Confidence Interval with Code\n",
|
||||
"\n",
|
||||
"You can retrieve the `Run` object for the test run with the following code, and get the confidence intervals from its metrics."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"best_run, fitted_model = classification_run.get_output()\n",
|
||||
"test_run = next(best_run.get_children(type=\"automl.model_test\"))\n",
|
||||
"test_run.wait_for_completion(show_output=False, wait_post_processing=True)\n",
|
||||
"test_metrics = test_run.get_metrics()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"CIs = {\"metric_name\": [], \"lower_ci_95\": [], \"upper_ci_95\": [], \"value\": []}\n",
|
||||
"\n",
|
||||
"for key, ci in test_metrics.items():\n",
|
||||
" if key.endswith(\"extras\"):\n",
|
||||
" CIs[\"metric_name\"].append(key[:-7]) # remove \"_extras\" to get metric name\n",
|
||||
" for ci_key, ci_value in ci.items():\n",
|
||||
" CIs[ci_key].append(ci_value)\n",
|
||||
"\n",
|
||||
"pd.DataFrame(CIs)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Alternatively, you can retrieve the best model, run inference yourself, and get confidence intervals for all metrics. However, since computing the confidence intervals involves a large number of bootstrap resamples, this will take some time."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test_df = digit_test.to_pandas_dataframe()\n",
|
||||
"y_test = test_df[label_column_name]\n",
|
||||
"test_df = test_df.drop(columns=[label_column_name])\n",
|
||||
"y_pred_proba = fitted_model.predict_proba(test_df)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.automl.runtime._ml_engine.classification_ml_engine import (\n",
|
||||
" evaluate_classifier,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"test_metrics = evaluate_classifier(\n",
|
||||
" y_test,\n",
|
||||
" y_pred_proba,\n",
|
||||
" constants.CLASSIFICATION_SCALAR_SET,\n",
|
||||
" labels,\n",
|
||||
" labels,\n",
|
||||
" enable_metric_confidence=True,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"CIs = {\"metric_name\": [], \"lower_ci_95\": [], \"upper_ci_95\": [], \"value\": []}\n",
|
||||
"\n",
|
||||
"for key, ci in test_metrics.items():\n",
|
||||
" if key.endswith(\"extras\"):\n",
|
||||
" CIs[\"metric_name\"].append(key[:-7]) # remove \"_extras\" to get metric name\n",
|
||||
" for ci_key, ci_value in ci.items():\n",
|
||||
" CIs[ci_key].append(ci_value)\n",
|
||||
"\n",
|
||||
"pd.DataFrame(CIs)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Confidence Interval for Regression\n",
|
||||
"\n",
|
||||
"Confidence intervals are also supported for regression runs, and all of them can be found in the UI by following the exact same steps as for a classification run. Here we only provide example code for a regression run, screenshots of the confidence intervals, and code to retrieve them."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def load_regression_data():\n",
|
||||
" if os.path.exists(\"./data/boston.csv\"):\n",
|
||||
" print(\"Find downloaded dataset. Loading\")\n",
|
||||
" else:\n",
|
||||
" print(\"Downloading dataset\")\n",
|
||||
" os.makedirs(\"./data\", exist_ok=True)\n",
|
||||
" regression_data = sklearn.datasets.load_boston()\n",
|
||||
" X = regression_data[\"data\"]\n",
|
||||
" y = regression_data[\"target\"]\n",
|
||||
" full_data = np.concatenate([X, y.reshape(-1, 1)], axis=1)\n",
|
||||
" columns = [\"feature_{}\".format(i) for i in range(X.shape[1])] + [\"label\"]\n",
|
||||
" full_data = pd.DataFrame(data=full_data, columns=columns)\n",
|
||||
" full_data.to_csv(\"./data/boston.csv\", index=False)\n",
|
||||
" print(\"Dataset downloaded\")\n",
|
||||
" ws = Workspace.from_config()\n",
|
||||
" datastore = ws.get_default_datastore()\n",
|
||||
" datastore.upload(\n",
|
||||
" src_dir=\"./data\", target_path=\"data/new-metric-features/\", overwrite=True\n",
|
||||
" )\n",
|
||||
" data = Dataset.Tabular.from_delimited_files(\n",
|
||||
" path=[(datastore, (\"data/new-metric-features/boston.csv\"))]\n",
|
||||
" )\n",
|
||||
" train, test = data.random_split(percentage=0.8, seed=101)\n",
|
||||
" validation, test = test.random_split(percentage=0.5, seed=47)\n",
|
||||
" return train, validation, test, \"label\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"boston_train, boston_validation, boston_test, label_column_name = load_regression_data()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"automl_settings = {\n",
|
||||
" \"primary_metric\": \"normalized_root_mean_squared_error\",\n",
|
||||
" \"enable_early_stopping\": True,\n",
|
||||
" \"max_concurrent_iterations\": 6,\n",
|
||||
" \"experiment_timeout_hours\": 0.25,\n",
|
||||
" \"verbosity\": logging.INFO,\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"automl_config = AutoMLConfig(\n",
|
||||
" task=\"regression\",\n",
|
||||
" debug_log=\"automl_errors.log\",\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" training_data=boston_train,\n",
|
||||
" validation_data=boston_validation,\n",
|
||||
" test_data=boston_test, # if you only have a test set, you can pass validation set here, instead of at validation_data\n",
|
||||
" label_column_name=label_column_name,\n",
|
||||
" enable_metric_confidence=True,\n",
|
||||
" **automl_settings\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"regression_run = experiment.submit(automl_config, show_output=False)\n",
|
||||
"regression_run.wait_for_completion(show_output=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"best_run, fitted_model = regression_run.get_output()\n",
|
||||
"test_run = next(best_run.get_children(type=\"automl.model_test\"))\n",
|
||||
"test_run.wait_for_completion(show_output=False, wait_post_processing=True)\n",
|
||||
"test_metrics = test_run.get_metrics()\n",
|
||||
"\n",
|
||||
"CIs = {\"metric_name\": [], \"lower_ci_95\": [], \"upper_ci_95\": [], \"value\": []}\n",
|
||||
"\n",
|
||||
"for key, ci in test_metrics.items():\n",
|
||||
" if key.endswith(\"extras\"):\n",
|
||||
" CIs[\"metric_name\"].append(key[:-7]) # remove \"_extras\" to get metric name\n",
|
||||
" for ci_key, ci_value in ci.items():\n",
|
||||
" CIs[ci_key].append(ci_value)\n",
|
||||
"\n",
|
||||
"pd.DataFrame(CIs)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"authors": [
|
||||
{
|
||||
"name": "lifengwei"
|
||||
}
|
||||
],
|
||||
"category": "tutorial",
|
||||
"compute": [
|
||||
"AML Compute"
|
||||
],
|
||||
"datasets": [
|
||||
"Digits",
|
||||
"Boston"
|
||||
],
|
||||
"deployment": [
|
||||
"None"
|
||||
],
|
||||
"exclude_from_index": false,
|
||||
"file_extension": ".py",
|
||||
"framework": [
|
||||
"None"
|
||||
],
|
||||
"friendly_name": "New metric features in Azure AutoML",
|
||||
"index_order": 5,
|
||||
"interpreter": {
|
||||
"hash": "cc0892e042a269bcf4aec58f0c86eb5e2be478ff7be4e5f6b2680e2af1718f2e"
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.7.0 64-bit ('pypi': conda)",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.0"
|
||||
},
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"tags": [
|
||||
"remote_run",
|
||||
"AutomatedML"
|
||||
],
|
||||
"task": "Classification",
|
||||
"version": "3.6.7"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
|
After Width: | Height: | Size: 17 KiB |
|
After Width: | Height: | Size: 28 KiB |
|
After Width: | Height: | Size: 19 KiB |
|
After Width: | Height: | Size: 72 KiB |
|
After Width: | Height: | Size: 81 KiB |
|
After Width: | Height: | Size: 31 KiB |
|
After Width: | Height: | Size: 30 KiB |
@@ -1,21 +1,5 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -78,6 +62,7 @@
|
||||
"import azureml.core\n",
|
||||
"from azureml.core.experiment import Experiment\n",
|
||||
"from azureml.core.workspace import Workspace\n",
|
||||
"\n",
|
||||
"from azureml.automl.core.featurization import FeaturizationConfig\n",
|
||||
"from azureml.train.automl import AutoMLConfig\n",
|
||||
"from azureml.core.dataset import Dataset"
|
||||
@@ -90,16 +75,6 @@
|
||||
"This sample notebook may use features that are not available in previous versions of the Azure ML SDK."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"This notebook was created using version 1.38.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
@@ -109,17 +84,17 @@
|
||||
"ws = Workspace.from_config()\n",
|
||||
"\n",
|
||||
"# Choose a name for the experiment.\n",
|
||||
"experiment_name = 'automl-regression-hardware-explain'\n",
|
||||
"experiment_name = \"automl-regression-hardware-explain\"\n",
|
||||
"experiment = Experiment(ws, experiment_name)\n",
|
||||
"\n",
|
||||
"output = {}\n",
|
||||
"output['Subscription ID'] = ws.subscription_id\n",
|
||||
"output['Workspace Name'] = ws.name\n",
|
||||
"output['Resource Group'] = ws.resource_group\n",
|
||||
"output['Location'] = ws.location\n",
|
||||
"output['Experiment Name'] = experiment.name\n",
|
||||
"pd.set_option('display.max_colwidth', -1)\n",
|
||||
"outputDf = pd.DataFrame(data = output, index = [''])\n",
|
||||
"output[\"Subscription ID\"] = ws.subscription_id\n",
|
||||
"output[\"Workspace Name\"] = ws.name\n",
|
||||
"output[\"Resource Group\"] = ws.resource_group\n",
|
||||
"output[\"Location\"] = ws.location\n",
|
||||
"output[\"Experiment Name\"] = experiment.name\n",
|
||||
"pd.set_option(\"display.max_colwidth\", -1)\n",
|
||||
"outputDf = pd.DataFrame(data=output, index=[\"\"])\n",
|
||||
"outputDf.T"
|
||||
]
|
||||
},
|
||||
@@ -152,12 +127,12 @@
|
||||
"# Verify that cluster does not exist already\n",
|
||||
"try:\n",
|
||||
" compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)\n",
|
||||
" print('Found existing cluster, use it.')\n",
|
||||
" print(\"Found existing cluster, use it.\")\n",
|
||||
"except ComputeTargetException:\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS12_V2',\n",
|
||||
" max_nodes=4)\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(\n",
|
||||
" vm_size=\"STANDARD_DS12_V2\", max_nodes=4\n",
|
||||
" )\n",
|
||||
" compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)\n",
|
||||
"\n",
|
||||
"compute_target.wait_for_completion(show_output=True)"
|
||||
]
|
||||
},
|
||||
@@ -176,7 +151,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = 'https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/machineData.csv'\n",
|
||||
"data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/machineData.csv\"\n",
|
||||
"\n",
|
||||
"dataset = Dataset.Tabular.from_delimited_files(data)\n",
|
||||
"\n",
|
||||
@@ -185,12 +160,20 @@
|
||||
"\n",
|
||||
"\n",
|
||||
"# Register the train dataset with your workspace\n",
|
||||
"train_data.register(workspace = ws, name = 'machineData_train_dataset',\n",
|
||||
" description = 'hardware performance training data',\n",
|
||||
" create_new_version=True)\n",
|
||||
"train_data.register(\n",
|
||||
" workspace=ws,\n",
|
||||
" name=\"machineData_train_dataset\",\n",
|
||||
" description=\"hardware performance training data\",\n",
|
||||
" create_new_version=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Register the test dataset with your workspace\n",
|
||||
"test_data.register(workspace = ws, name = 'machineData_test_dataset', description = 'hardware performance test data', create_new_version=True)\n",
|
||||
"test_data.register(\n",
|
||||
" workspace=ws,\n",
|
||||
" name=\"machineData_test_dataset\",\n",
|
||||
" description=\"hardware performance test data\",\n",
|
||||
" create_new_version=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"label = \"ERP\"\n",
|
||||
"\n",
|
||||
@@ -249,14 +232,18 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"featurization_config = FeaturizationConfig()\n",
|
||||
"featurization_config.blocked_transformers = ['LabelEncoder']\n",
|
||||
"featurization_config.blocked_transformers = [\"LabelEncoder\"]\n",
|
||||
"# featurization_config.drop_columns = ['MMIN']\n",
|
||||
"featurization_config.add_column_purpose('MYCT', 'Numeric')\n",
|
||||
"featurization_config.add_column_purpose('VendorName', 'CategoricalHash')\n",
|
||||
"featurization_config.add_column_purpose(\"MYCT\", \"Numeric\")\n",
|
||||
"featurization_config.add_column_purpose(\"VendorName\", \"CategoricalHash\")\n",
|
||||
"# default strategy mean, add transformer param for 3 columns\n",
|
||||
"featurization_config.add_transformer_params('Imputer', ['CACH'], {\"strategy\": \"median\"})\n",
|
||||
"featurization_config.add_transformer_params('Imputer', ['CHMIN'], {\"strategy\": \"median\"})\n",
|
||||
"featurization_config.add_transformer_params('Imputer', ['PRP'], {\"strategy\": \"most_frequent\"})\n",
|
||||
"featurization_config.add_transformer_params(\"Imputer\", [\"CACH\"], {\"strategy\": \"median\"})\n",
|
||||
"featurization_config.add_transformer_params(\n",
|
||||
" \"Imputer\", [\"CHMIN\"], {\"strategy\": \"median\"}\n",
|
||||
")\n",
|
||||
"featurization_config.add_transformer_params(\n",
|
||||
" \"Imputer\", [\"PRP\"], {\"strategy\": \"most_frequent\"}\n",
|
||||
")\n",
|
||||
"# featurization_config.add_transformer_params('HashOneHotEncoder', [], {\"number_of_bits\": 3})"
|
||||
]
|
||||
},
|
||||
@@ -276,17 +263,18 @@
|
||||
" \"max_concurrent_iterations\": 4,\n",
|
||||
" \"max_cores_per_iteration\": -1,\n",
|
||||
" \"n_cross_validations\": 5,\n",
|
||||
" \"primary_metric\": 'normalized_root_mean_squared_error',\n",
|
||||
" \"verbosity\": logging.INFO\n",
|
||||
" \"primary_metric\": \"normalized_root_mean_squared_error\",\n",
|
||||
" \"verbosity\": logging.INFO,\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"automl_config = AutoMLConfig(task = 'regression',\n",
|
||||
" debug_log = 'automl_errors.log',\n",
|
||||
"automl_config = AutoMLConfig(\n",
|
||||
" task=\"regression\",\n",
|
||||
" debug_log=\"automl_errors.log\",\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" featurization=featurization_config,\n",
|
||||
" training_data=train_data,\n",
|
||||
" label_column_name=label,\n",
|
||||
" **automl_settings\n",
|
||||
" **automl_settings,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
@@ -360,7 +348,9 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Download the featurization summary JSON file locally\n",
|
||||
"best_run.download_file(\"outputs/featurization_summary.json\", \"featurization_summary.json\")\n",
|
||||
"best_run.download_file(\n",
|
||||
" \"outputs/featurization_summary.json\", \"featurization_summary.json\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Render the JSON as a pandas DataFrame\n",
|
||||
"with open(\"featurization_summary.json\", \"r\") as f:\n",
|
||||
@@ -394,6 +384,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.widgets import RunDetails\n",
|
||||
"\n",
|
||||
"RunDetails(remote_run).show()"
|
||||
]
|
||||
},
|
||||
@@ -441,7 +432,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open('train_explainer.py', 'r') as cefr:\n",
|
||||
"with open(\"train_explainer.py\", \"r\") as cefr:\n",
|
||||
" print(cefr.read())"
|
||||
]
|
||||
},
|
||||
@@ -463,32 +454,36 @@
|
||||
"import os\n",
|
||||
"\n",
|
||||
"# create script folder\n",
|
||||
"script_folder = './sample_projects/automl-regression-hardware'\n",
|
||||
"script_folder = \"./sample_projects/automl-regression-hardware\"\n",
|
||||
"if not os.path.exists(script_folder):\n",
|
||||
" os.makedirs(script_folder)\n",
|
||||
"\n",
|
||||
"# Copy the sample script to script folder.\n",
|
||||
"shutil.copy('train_explainer.py', script_folder)\n",
|
||||
"shutil.copy(\"train_explainer.py\", script_folder)\n",
|
||||
"\n",
|
||||
"# Create the explainer script that will run on the remote compute.\n",
|
||||
"script_file_name = script_folder + '/train_explainer.py'\n",
|
||||
"script_file_name = script_folder + \"/train_explainer.py\"\n",
|
||||
"\n",
|
||||
"# Open the sample script for modification\n",
|
||||
"with open(script_file_name, 'r') as cefr:\n",
|
||||
"with open(script_file_name, \"r\") as cefr:\n",
|
||||
" content = cefr.read()\n",
|
||||
"\n",
|
||||
"# Replace the values in train_explainer.py file with the appropriate values\n",
|
||||
"content = content.replace('<<experiment_name>>', automl_run.experiment.name) # your experiment name.\n",
|
||||
"content = content.replace('<<run_id>>', automl_run.id) # Run-id of the AutoML run for which you want to explain the model.\n",
|
||||
"content = content.replace('<<target_column_name>>', 'ERP') # Your target column name\n",
|
||||
"content = content.replace('<<task>>', 'regression') # Training task type\n",
|
||||
"content = content.replace(\n",
|
||||
" \"<<experiment_name>>\", automl_run.experiment.name\n",
|
||||
") # your experiment name.\n",
|
||||
"content = content.replace(\n",
|
||||
" \"<<run_id>>\", automl_run.id\n",
|
||||
") # Run-id of the AutoML run for which you want to explain the model.\n",
|
||||
"content = content.replace(\"<<target_column_name>>\", \"ERP\") # Your target column name\n",
|
||||
"content = content.replace(\"<<task>>\", \"regression\") # Training task type\n",
|
||||
"# Name of your training dataset registered with your workspace\n",
|
||||
"content = content.replace('<<train_dataset_name>>', 'machineData_train_dataset') \n",
|
||||
"content = content.replace(\"<<train_dataset_name>>\", \"machineData_train_dataset\")\n",
|
||||
"# Name of your test dataset registered with your workspace\n",
|
||||
"content = content.replace('<<test_dataset_name>>', 'machineData_test_dataset')\n",
|
||||
"content = content.replace(\"<<test_dataset_name>>\", \"machineData_test_dataset\")\n",
|
||||
"\n",
|
||||
"# Write sample file into your script folder.\n",
|
||||
"with open(script_file_name, 'w') as cefw:\n",
|
||||
"with open(script_file_name, \"w\") as cefw:\n",
|
||||
" cefw.write(content)"
|
||||
]
|
||||
},
|
||||
@@ -506,6 +501,8 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.runconfig import RunConfiguration\n",
|
||||
"from azureml.core.conda_dependencies import CondaDependencies\n",
|
||||
"import pkg_resources\n",
|
||||
"\n",
|
||||
"# create a new RunConfig object\n",
|
||||
"conda_run_config = RunConfiguration(framework=\"python\")\n",
|
||||
@@ -515,7 +512,9 @@
|
||||
"conda_run_config.environment.docker.enabled = True\n",
|
||||
"\n",
|
||||
"# specify CondaDependencies obj\n",
|
||||
"conda_run_config.environment.python.conda_dependencies = automl_run.get_environment().python.conda_dependencies"
|
||||
"conda_run_config.environment.python.conda_dependencies = (\n",
|
||||
" automl_run.get_environment().python.conda_dependencies\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -535,9 +534,11 @@
|
||||
"# Now submit a run on AmlCompute for model explanations\n",
|
||||
"from azureml.core.script_run_config import ScriptRunConfig\n",
|
||||
"\n",
|
||||
"script_run_config = ScriptRunConfig(source_directory=script_folder,\n",
|
||||
" script='train_explainer.py',\n",
|
||||
" run_config=conda_run_config)\n",
|
||||
"script_run_config = ScriptRunConfig(\n",
|
||||
" source_directory=script_folder,\n",
|
||||
" script=\"train_explainer.py\",\n",
|
||||
" run_config=conda_run_config,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"run = experiment.submit(script_run_config)\n",
|
||||
"\n",
|
||||
@@ -579,10 +580,16 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.interpret import ExplanationClient\n",
|
||||
"\n",
|
||||
"client = ExplanationClient.from_run(automl_run)\n",
|
||||
"engineered_explanations = client.download_model_explanation(raw=False, comment='engineered explanations')\n",
|
||||
"engineered_explanations = client.download_model_explanation(\n",
|
||||
" raw=False, comment=\"engineered explanations\"\n",
|
||||
")\n",
|
||||
"print(engineered_explanations.get_feature_importance_dict())\n",
|
||||
"print(\"You can visualize the engineered explanations under the 'Explanations (preview)' tab in the AutoML run at:-\\n\" + automl_run.get_portal_url())"
|
||||
"print(\n",
|
||||
" \"You can visualize the engineered explanations under the 'Explanations (preview)' tab in the AutoML run at:-\\n\"\n",
|
||||
" + automl_run.get_portal_url()\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -599,9 +606,14 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"raw_explanations = client.download_model_explanation(raw=True, comment='raw explanations')\n",
|
||||
"raw_explanations = client.download_model_explanation(\n",
|
||||
" raw=True, comment=\"raw explanations\"\n",
|
||||
")\n",
|
||||
"print(raw_explanations.get_feature_importance_dict())\n",
|
||||
"print(\"You can visualize the raw explanations under the 'Explanations (preview)' tab in the AutoML run at:-\\n\" + automl_run.get_portal_url())"
|
||||
"print(\n",
|
||||
" \"You can visualize the raw explanations under the 'Explanations (preview)' tab in the AutoML run at:-\\n\"\n",
|
||||
" + automl_run.get_portal_url()\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -623,10 +635,12 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Register trained automl model present in the 'outputs' folder in the artifacts\n",
|
||||
"original_model = automl_run.register_model(model_name='automl_model', \n",
|
||||
" model_path='outputs/model.pkl')\n",
|
||||
"scoring_explainer_model = automl_run.register_model(model_name='scoring_explainer',\n",
|
||||
" model_path='outputs/scoring_explainer.pkl')"
|
||||
"original_model = automl_run.register_model(\n",
|
||||
" model_name=\"automl_model\", model_path=\"outputs/model.pkl\"\n",
|
||||
")\n",
|
||||
"scoring_explainer_model = automl_run.register_model(\n",
|
||||
" model_name=\"scoring_explainer\", model_path=\"outputs/scoring_explainer.pkl\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -647,7 +661,6 @@
|
||||
"\n",
|
||||
"with open(\"myenv.yml\", \"w\") as f:\n",
|
||||
" f.write(conda_dep.serialize_to_string())\n",
|
||||
"\n",
|
||||
"with open(\"myenv.yml\", \"r\") as f:\n",
|
||||
" print(f.read())"
|
||||
]
|
||||
@@ -683,22 +696,30 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.core.webservice import Webservice\n",
|
||||
"from azureml.core.model import InferenceConfig\n",
|
||||
"from azureml.core.webservice import AciWebservice\n",
|
||||
"from azureml.core.model import Model\n",
|
||||
"from azureml.core.environment import Environment\n",
|
||||
"\n",
|
||||
"aciconfig = AciWebservice.deploy_configuration(cpu_cores=2, \n",
|
||||
"aciconfig = AciWebservice.deploy_configuration(\n",
|
||||
" cpu_cores=2,\n",
|
||||
" memory_gb=2,\n",
|
||||
" tags={\"data\": \"Machine Data\", \n",
|
||||
" \"method\" : \"local_explanation\"}, \n",
|
||||
" description='Get local explanations for Machine test data')\n",
|
||||
" tags={\"data\": \"Machine Data\", \"method\": \"local_explanation\"},\n",
|
||||
" description=\"Get local explanations for Machine test data\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"myenv = Environment.from_conda_specification(name=\"myenv\", file_path=\"myenv.yml\")\n",
|
||||
"inference_config = InferenceConfig(entry_script=\"score_explain.py\", environment=myenv)\n",
|
||||
"\n",
|
||||
"# Use configs and models generated above\n",
|
||||
"service = Model.deploy(ws, 'model-scoring', [scoring_explainer_model, original_model], inference_config, aciconfig)\n",
|
||||
"service = Model.deploy(\n",
|
||||
" ws,\n",
|
||||
" \"model-scoring\",\n",
|
||||
" [scoring_explainer_model, original_model],\n",
|
||||
" inference_config,\n",
|
||||
" aciconfig,\n",
|
||||
")\n",
|
||||
"service.wait_for_deployment(show_output=True)"
|
||||
]
|
||||
},
|
||||
@@ -732,19 +753,19 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if service.state == 'Healthy':\n",
|
||||
"if service.state == \"Healthy\":\n",
|
||||
" X_test = test_data.drop_columns([label]).to_pandas_dataframe()\n",
|
||||
" # Serialize the first row of the test data into json\n",
|
||||
" X_test_json = X_test[:1].to_json(orient='records')\n",
|
||||
" X_test_json = X_test[:1].to_json(orient=\"records\")\n",
|
||||
" print(X_test_json)\n",
|
||||
" # Call the service to get the predictions and the engineered and raw explanations\n",
|
||||
" output = service.run(X_test_json)\n",
|
||||
" # Print the predicted value\n",
|
||||
" print(output['predictions'])\n",
|
||||
" print(output[\"predictions\"])\n",
|
||||
" # Print the engineered feature importances for the predicted value\n",
|
||||
" print(output['engineered_local_importance_values'])\n",
|
||||
" print(output[\"engineered_local_importance_values\"])\n",
|
||||
" # Print the raw feature importances for the predicted value\n",
|
||||
" print(output['raw_local_importance_values'])"
|
||||
" print(output[\"raw_local_importance_values\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -780,14 +801,14 @@
|
||||
"# preview the first 3 rows of the dataset\n",
|
||||
"\n",
|
||||
"test_data = test_data.to_pandas_dataframe()\n",
|
||||
"y_test = test_data['ERP'].fillna(0)\n",
|
||||
"test_data = test_data.drop('ERP', 1)\n",
|
||||
"y_test = test_data[\"ERP\"].fillna(0)\n",
|
||||
"test_data = test_data.drop(\"ERP\", 1)\n",
|
||||
"test_data = test_data.fillna(0)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"train_data = train_data.to_pandas_dataframe()\n",
|
||||
"y_train = train_data['ERP'].fillna(0)\n",
|
||||
"train_data = train_data.drop('ERP', 1)\n",
|
||||
"y_train = train_data[\"ERP\"].fillna(0)\n",
|
||||
"train_data = train_data.drop(\"ERP\", 1)\n",
|
||||
"train_data = train_data.fillna(0)"
|
||||
]
|
||||
},
|
||||
@@ -814,27 +835,41 @@
|
||||
"from sklearn.metrics import mean_squared_error, r2_score\n",
|
||||
"\n",
|
||||
"# Set up a multi-plot chart.\n",
|
||||
"f, (a0, a1) = plt.subplots(1, 2, gridspec_kw = {'width_ratios':[1, 1], 'wspace':0, 'hspace': 0})\n",
|
||||
"f.suptitle('Regression Residual Values', fontsize = 18)\n",
|
||||
"f, (a0, a1) = plt.subplots(\n",
|
||||
" 1, 2, gridspec_kw={\"width_ratios\": [1, 1], \"wspace\": 0, \"hspace\": 0}\n",
|
||||
")\n",
|
||||
"f.suptitle(\"Regression Residual Values\", fontsize=18)\n",
|
||||
"f.set_figheight(6)\n",
|
||||
"f.set_figwidth(16)\n",
|
||||
"\n",
|
||||
"# Plot residual values of training set.\n",
|
||||
"a0.axis([0, 360, -100, 100])\n",
|
||||
"a0.plot(y_residual_train, 'bo', alpha = 0.5)\n",
|
||||
"a0.plot([-10,360],[0,0], 'r-', lw = 3)\n",
|
||||
"a0.text(16,170,'RMSE = {0:.2f}'.format(np.sqrt(mean_squared_error(y_train, y_pred_train))), fontsize = 12)\n",
|
||||
"a0.text(16,140,'R2 score = {0:.2f}'.format(r2_score(y_train, y_pred_train)),fontsize = 12)\n",
|
||||
"a0.set_xlabel('Training samples', fontsize = 12)\n",
|
||||
"a0.set_ylabel('Residual Values', fontsize = 12)\n",
|
||||
"a0.plot(y_residual_train, \"bo\", alpha=0.5)\n",
|
||||
"a0.plot([-10, 360], [0, 0], \"r-\", lw=3)\n",
|
||||
"a0.text(\n",
|
||||
" 16,\n",
|
||||
" 170,\n",
|
||||
" \"RMSE = {0:.2f}\".format(np.sqrt(mean_squared_error(y_train, y_pred_train))),\n",
|
||||
" fontsize=12,\n",
|
||||
")\n",
|
||||
"a0.text(\n",
|
||||
" 16, 140, \"R2 score = {0:.2f}\".format(r2_score(y_train, y_pred_train)), fontsize=12\n",
|
||||
")\n",
|
||||
"a0.set_xlabel(\"Training samples\", fontsize=12)\n",
|
||||
"a0.set_ylabel(\"Residual Values\", fontsize=12)\n",
|
||||
"\n",
|
||||
"# Plot residual values of test set.\n",
|
||||
"a1.axis([0, 90, -100, 100])\n",
|
||||
"a1.plot(y_residual_test, 'bo', alpha = 0.5)\n",
|
||||
"a1.plot([-10,360],[0,0], 'r-', lw = 3)\n",
|
||||
"a1.text(5,170,'RMSE = {0:.2f}'.format(np.sqrt(mean_squared_error(y_test, y_pred_test))), fontsize = 12)\n",
|
||||
"a1.text(5,140,'R2 score = {0:.2f}'.format(r2_score(y_test, y_pred_test)),fontsize = 12)\n",
|
||||
"a1.set_xlabel('Test samples', fontsize = 12)\n",
|
||||
"a1.plot(y_residual_test, \"bo\", alpha=0.5)\n",
|
||||
"a1.plot([-10, 360], [0, 0], \"r-\", lw=3)\n",
|
||||
"a1.text(\n",
|
||||
" 5,\n",
|
||||
" 170,\n",
|
||||
" \"RMSE = {0:.2f}\".format(np.sqrt(mean_squared_error(y_test, y_pred_test))),\n",
|
||||
" fontsize=12,\n",
|
||||
")\n",
|
||||
"a1.text(5, 140, \"R2 score = {0:.2f}\".format(r2_score(y_test, y_pred_test)), fontsize=12)\n",
|
||||
"a1.set_xlabel(\"Test samples\", fontsize=12)\n",
|
||||
"a1.set_yticklabels([])\n",
|
||||
"\n",
|
||||
"plt.show()"
|
||||
@@ -847,9 +882,11 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%matplotlib inline\n",
|
||||
"test_pred = plt.scatter(y_test, y_pred_test, color='')\n",
|
||||
"test_test = plt.scatter(y_test, y_test, color='g')\n",
|
||||
"plt.legend((test_pred, test_test), ('prediction', 'truth'), loc='upper left', fontsize=8)\n",
|
||||
"test_pred = plt.scatter(y_test, y_pred_test, color=\"\")\n",
|
||||
"test_test = plt.scatter(y_test, y_test, color=\"g\")\n",
|
||||
"plt.legend(\n",
|
||||
" (test_pred, test_test), (\"prediction\", \"truth\"), loc=\"upper left\", fontsize=8\n",
|
||||
")\n",
|
||||
"plt.show()"
|
||||
]
|
||||
}
|
||||
@@ -881,9 +918,9 @@
|
||||
"friendly_name": "Automated ML run with featurization and model explainability.",
|
||||
"index_order": 5,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.6 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python3-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
import pandas as pd
|
||||
import joblib
|
||||
from azureml.core.model import Model
|
||||
from azureml.train.automl.runtime.automl_explain_utilities import automl_setup_model_explanations
|
||||
from azureml.train.automl.runtime.automl_explain_utilities import (
|
||||
automl_setup_model_explanations,
|
||||
)
|
||||
|
||||
|
||||
def init():
|
||||
@@ -11,8 +13,8 @@ def init():
|
||||
|
||||
# Retrieve the path to the model file using the model name
|
||||
# Assume original model is named original_prediction_model
|
||||
automl_model_path = Model.get_model_path('automl_model')
|
||||
scoring_explainer_path = Model.get_model_path('scoring_explainer')
|
||||
automl_model_path = Model.get_model_path("automl_model")
|
||||
scoring_explainer_path = Model.get_model_path("scoring_explainer")
|
||||
|
||||
automl_model = joblib.load(automl_model_path)
|
||||
scoring_explainer = joblib.load(scoring_explainer_path)
|
||||
@@ -20,17 +22,24 @@ def init():
|
||||
|
||||
def run(raw_data):
|
||||
# Get predictions and explanations for each data point
|
||||
data = pd.read_json(raw_data, orient='records')
|
||||
data = pd.read_json(raw_data, orient="records")
|
||||
# Make prediction
|
||||
predictions = automl_model.predict(data)
|
||||
# Setup for inferencing explanations
|
||||
automl_explainer_setup_obj = automl_setup_model_explanations(automl_model,
|
||||
X_test=data, task='regression')
|
||||
automl_explainer_setup_obj = automl_setup_model_explanations(
|
||||
automl_model, X_test=data, task="regression"
|
||||
)
|
||||
# Retrieve model explanations for engineered explanations
|
||||
engineered_local_importance_values = scoring_explainer.explain(automl_explainer_setup_obj.X_test_transform)
|
||||
engineered_local_importance_values = scoring_explainer.explain(
|
||||
automl_explainer_setup_obj.X_test_transform
|
||||
)
|
||||
# Retrieve model explanations for raw explanations
|
||||
raw_local_importance_values = scoring_explainer.explain(automl_explainer_setup_obj.X_test_transform, get_raw=True)
|
||||
raw_local_importance_values = scoring_explainer.explain(
|
||||
automl_explainer_setup_obj.X_test_transform, get_raw=True
|
||||
)
|
||||
# You can return any data type as long as it is JSON-serializable
|
||||
return {'predictions': predictions.tolist(),
|
||||
'engineered_local_importance_values': engineered_local_importance_values,
|
||||
'raw_local_importance_values': raw_local_importance_values}
|
||||
return {
|
||||
"predictions": predictions.tolist(),
|
||||
"engineered_local_importance_values": engineered_local_importance_values,
|
||||
"raw_local_importance_values": raw_local_importance_values,
|
||||
}
|
||||
|
||||
@@ -10,11 +10,13 @@ from azureml.core.dataset import Dataset
|
||||
from azureml.core.run import Run
|
||||
from azureml.interpret.mimic_wrapper import MimicWrapper
|
||||
from azureml.interpret.scoring.scoring_explainer import TreeScoringExplainer
|
||||
from azureml.train.automl.runtime.automl_explain_utilities import automl_setup_model_explanations, \
|
||||
automl_check_model_if_explainable
|
||||
from azureml.train.automl.runtime.automl_explain_utilities import (
|
||||
automl_setup_model_explanations,
|
||||
automl_check_model_if_explainable,
|
||||
)
|
||||
|
||||
|
||||
OUTPUT_DIR = './outputs/'
|
||||
OUTPUT_DIR = "./outputs/"
|
||||
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
||||
|
||||
# Get workspace from the run context
|
||||
@@ -22,63 +24,77 @@ run = Run.get_context()
|
||||
ws = run.experiment.workspace
|
||||
|
||||
# Get the AutoML run object from the experiment name and the workspace
|
||||
experiment = Experiment(ws, '<<experiment_name>>')
|
||||
automl_run = Run(experiment=experiment, run_id='<<run_id>>')
|
||||
experiment = Experiment(ws, "<<experiment_name>>")
|
||||
automl_run = Run(experiment=experiment, run_id="<<run_id>>")
|
||||
|
||||
# Check if this AutoML model is explainable
|
||||
if not automl_check_model_if_explainable(automl_run):
|
||||
raise Exception("Model explanations are currently not supported for " + automl_run.get_properties().get(
|
||||
'run_algorithm'))
|
||||
raise Exception(
|
||||
"Model explanations are currently not supported for "
|
||||
+ automl_run.get_properties().get("run_algorithm")
|
||||
)
|
||||
|
||||
# Download the best model from the artifact store
|
||||
automl_run.download_file(name=MODEL_PATH, output_file_path='model.pkl')
|
||||
automl_run.download_file(name=MODEL_PATH, output_file_path="model.pkl")
|
||||
|
||||
# Load the AutoML model into memory
|
||||
fitted_model = joblib.load('model.pkl')
|
||||
fitted_model = joblib.load("model.pkl")
|
||||
|
||||
# Get the train dataset from the workspace
|
||||
train_dataset = Dataset.get_by_name(workspace=ws, name='<<train_dataset_name>>')
|
||||
train_dataset = Dataset.get_by_name(workspace=ws, name="<<train_dataset_name>>")
|
||||
# Drop the labeled column to get the training set.
|
||||
X_train = train_dataset.drop_columns(columns=['<<target_column_name>>'])
|
||||
y_train = train_dataset.keep_columns(columns=['<<target_column_name>>'], validate=True)
|
||||
X_train = train_dataset.drop_columns(columns=["<<target_column_name>>"])
|
||||
y_train = train_dataset.keep_columns(columns=["<<target_column_name>>"], validate=True)
|
||||
|
||||
# Get the test dataset from the workspace
|
||||
test_dataset = Dataset.get_by_name(workspace=ws, name='<<test_dataset_name>>')
|
||||
test_dataset = Dataset.get_by_name(workspace=ws, name="<<test_dataset_name>>")
|
||||
# Drop the labeled column to get the testing set.
|
||||
X_test = test_dataset.drop_columns(columns=['<<target_column_name>>'])
|
||||
X_test = test_dataset.drop_columns(columns=["<<target_column_name>>"])
|
||||
|
||||
# Setup the class for explaining the AutoML models
|
||||
automl_explainer_setup_obj = automl_setup_model_explanations(fitted_model, '<<task>>',
|
||||
X=X_train, X_test=X_test,
|
||||
y=y_train,
|
||||
automl_run=automl_run)
|
||||
automl_explainer_setup_obj = automl_setup_model_explanations(
|
||||
fitted_model, "<<task>>", X=X_train, X_test=X_test, y=y_train, automl_run=automl_run
|
||||
)
|
||||
|
||||
# Initialize the Mimic Explainer
|
||||
explainer = MimicWrapper(ws, automl_explainer_setup_obj.automl_estimator, LGBMExplainableModel,
|
||||
explainer = MimicWrapper(
|
||||
ws,
|
||||
automl_explainer_setup_obj.automl_estimator,
|
||||
LGBMExplainableModel,
|
||||
init_dataset=automl_explainer_setup_obj.X_transform,
|
||||
run=automl_explainer_setup_obj.automl_run,
|
||||
features=automl_explainer_setup_obj.engineered_feature_names,
|
||||
feature_maps=[automl_explainer_setup_obj.feature_map],
|
||||
classes=automl_explainer_setup_obj.classes)
|
||||
classes=automl_explainer_setup_obj.classes,
|
||||
)
|
||||
|
||||
# Compute the engineered explanations
|
||||
engineered_explanations = explainer.explain(['local', 'global'], tag='engineered explanations',
|
||||
eval_dataset=automl_explainer_setup_obj.X_test_transform)
|
||||
engineered_explanations = explainer.explain(
|
||||
["local", "global"],
|
||||
tag="engineered explanations",
|
||||
eval_dataset=automl_explainer_setup_obj.X_test_transform,
|
||||
)
|
||||
|
||||
# Compute the raw explanations
|
||||
raw_explanations = explainer.explain(['local', 'global'], get_raw=True, tag='raw explanations',
|
||||
raw_explanations = explainer.explain(
|
||||
["local", "global"],
|
||||
get_raw=True,
|
||||
tag="raw explanations",
|
||||
raw_feature_names=automl_explainer_setup_obj.raw_feature_names,
|
||||
eval_dataset=automl_explainer_setup_obj.X_test_transform,
|
||||
raw_eval_dataset=automl_explainer_setup_obj.X_test_raw)
|
||||
raw_eval_dataset=automl_explainer_setup_obj.X_test_raw,
|
||||
)
|
||||
|
||||
print("Engineered and raw explanations computed successfully")
|
||||
|
||||
# Initialize the ScoringExplainer
|
||||
scoring_explainer = TreeScoringExplainer(explainer.explainer, feature_maps=[automl_explainer_setup_obj.feature_map])
|
||||
scoring_explainer = TreeScoringExplainer(
|
||||
explainer.explainer, feature_maps=[automl_explainer_setup_obj.feature_map]
|
||||
)
|
||||
|
||||
# Pickle scoring explainer locally
|
||||
with open('scoring_explainer.pkl', 'wb') as stream:
|
||||
with open("scoring_explainer.pkl", "wb") as stream:
|
||||
joblib.dump(scoring_explainer, stream)
|
||||
|
||||
# Upload the scoring explainer to the automl run
|
||||
automl_run.upload_file('outputs/scoring_explainer.pkl', 'scoring_explainer.pkl')
|
||||
automl_run.upload_file("outputs/scoring_explainer.pkl", "scoring_explainer.pkl")
|
||||
|
||||
@@ -1,21 +1,5 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -86,16 +70,6 @@
|
||||
"This sample notebook may use features that are not available in previous versions of the Azure ML SDK."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"This notebook was created using version 1.38.0 of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
@@ -105,18 +79,18 @@
|
||||
"ws = Workspace.from_config()\n",
|
||||
"\n",
|
||||
"# Choose a name for the experiment.\n",
|
||||
"experiment_name = 'automl-regression'\n",
|
||||
"experiment_name = \"automl-regression\"\n",
|
||||
"\n",
|
||||
"experiment = Experiment(ws, experiment_name)\n",
|
||||
"\n",
|
||||
"output = {}\n",
|
||||
"output['Subscription ID'] = ws.subscription_id\n",
|
||||
"output['Workspace'] = ws.name\n",
|
||||
"output['Resource Group'] = ws.resource_group\n",
|
||||
"output['Location'] = ws.location\n",
|
||||
"output['Run History Name'] = experiment_name\n",
|
||||
"pd.set_option('display.max_colwidth', -1)\n",
|
||||
"outputDf = pd.DataFrame(data = output, index = [''])\n",
|
||||
"output[\"Subscription ID\"] = ws.subscription_id\n",
|
||||
"output[\"Workspace\"] = ws.name\n",
|
||||
"output[\"Resource Group\"] = ws.resource_group\n",
|
||||
"output[\"Location\"] = ws.location\n",
|
||||
"output[\"Run History Name\"] = experiment_name\n",
|
||||
"pd.set_option(\"display.max_colwidth\", -1)\n",
|
||||
"outputDf = pd.DataFrame(data=output, index=[\"\"])\n",
|
||||
"outputDf.T"
|
||||
]
|
||||
},
|
||||
@@ -143,10 +117,11 @@
|
||||
"# Verify that cluster does not exist already\n",
|
||||
"try:\n",
|
||||
" compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)\n",
|
||||
" print('Found existing cluster, use it.')\n",
|
||||
" print(\"Found existing cluster, use it.\")\n",
|
||||
"except ComputeTargetException:\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS12_V2',\n",
|
||||
" max_nodes=4)\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(\n",
|
||||
" vm_size=\"STANDARD_DS12_V2\", max_nodes=4\n",
|
||||
" )\n",
|
||||
" compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)\n",
|
||||
"\n",
|
||||
"compute_target.wait_for_completion(show_output=True)"
|
||||
@@ -179,7 +154,7 @@
|
||||
"# Split the dataset into train and test datasets\n",
|
||||
"train_data, test_data = dataset.random_split(percentage=0.8, seed=223)\n",
|
||||
"\n",
|
||||
"label = \"ERP\"\n"
|
||||
"label = \"ERP\""
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -213,7 +188,7 @@
|
||||
"source": [
|
||||
"automl_settings = {\n",
|
||||
" \"n_cross_validations\": 3,\n",
|
||||
" \"primary_metric\": 'normalized_root_mean_squared_error',\n",
|
||||
" \"primary_metric\": \"r2_score\",\n",
|
||||
" \"enable_early_stopping\": True,\n",
|
||||
" \"experiment_timeout_hours\": 0.3, # for real scenarios we recommend a timeout of at least one hour\n",
|
||||
" \"max_concurrent_iterations\": 4,\n",
|
||||
@@ -221,11 +196,12 @@
|
||||
" \"verbosity\": logging.INFO,\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"automl_config = AutoMLConfig(task = 'regression',\n",
|
||||
"automl_config = AutoMLConfig(\n",
|
||||
" task=\"regression\",\n",
|
||||
" compute_target=compute_target,\n",
|
||||
" training_data=train_data,\n",
|
||||
" label_column_name=label,\n",
|
||||
" **automl_settings\n",
|
||||
" **automl_settings,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
@@ -281,6 +257,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.widgets import RunDetails\n",
|
||||
"\n",
|
||||
"RunDetails(remote_run).show()"
|
||||
]
|
||||
},
|
||||
@@ -366,12 +343,12 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"y_test = test_data.keep_columns('ERP').to_pandas_dataframe()\n",
|
||||
"test_data = test_data.drop_columns('ERP').to_pandas_dataframe()\n",
|
||||
"y_test = test_data.keep_columns(\"ERP\").to_pandas_dataframe()\n",
|
||||
"test_data = test_data.drop_columns(\"ERP\").to_pandas_dataframe()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"y_train = train_data.keep_columns('ERP').to_pandas_dataframe()\n",
|
||||
"train_data = train_data.drop_columns('ERP').to_pandas_dataframe()\n"
|
||||
"y_train = train_data.keep_columns(\"ERP\").to_pandas_dataframe()\n",
|
||||
"train_data = train_data.drop_columns(\"ERP\").to_pandas_dataframe()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -397,27 +374,41 @@
|
||||
"from sklearn.metrics import mean_squared_error, r2_score\n",
|
||||
"\n",
|
||||
"# Set up a multi-plot chart.\n",
|
||||
"f, (a0, a1) = plt.subplots(1, 2, gridspec_kw = {'width_ratios':[1, 1], 'wspace':0, 'hspace': 0})\n",
|
||||
"f.suptitle('Regression Residual Values', fontsize = 18)\n",
|
||||
"f, (a0, a1) = plt.subplots(\n",
|
||||
" 1, 2, gridspec_kw={\"width_ratios\": [1, 1], \"wspace\": 0, \"hspace\": 0}\n",
|
||||
")\n",
|
||||
"f.suptitle(\"Regression Residual Values\", fontsize=18)\n",
|
||||
"f.set_figheight(6)\n",
|
||||
"f.set_figwidth(16)\n",
|
||||
"\n",
|
||||
"# Plot residual values of training set.\n",
|
||||
"a0.axis([0, 360, -100, 100])\n",
|
||||
"a0.plot(y_residual_train, 'bo', alpha = 0.5)\n",
|
||||
"a0.plot([-10,360],[0,0], 'r-', lw = 3)\n",
|
||||
"a0.text(16,170,'RMSE = {0:.2f}'.format(np.sqrt(mean_squared_error(y_train, y_pred_train))), fontsize = 12)\n",
|
||||
"a0.text(16,140,'R2 score = {0:.2f}'.format(r2_score(y_train, y_pred_train)),fontsize = 12)\n",
|
||||
"a0.set_xlabel('Training samples', fontsize = 12)\n",
|
||||
"a0.set_ylabel('Residual Values', fontsize = 12)\n",
|
||||
"a0.plot(y_residual_train, \"bo\", alpha=0.5)\n",
|
||||
"a0.plot([-10, 360], [0, 0], \"r-\", lw=3)\n",
|
||||
"a0.text(\n",
|
||||
" 16,\n",
|
||||
" 170,\n",
|
||||
" \"RMSE = {0:.2f}\".format(np.sqrt(mean_squared_error(y_train, y_pred_train))),\n",
|
||||
" fontsize=12,\n",
|
||||
")\n",
|
||||
"a0.text(\n",
|
||||
" 16, 140, \"R2 score = {0:.2f}\".format(r2_score(y_train, y_pred_train)), fontsize=12\n",
|
||||
")\n",
|
||||
"a0.set_xlabel(\"Training samples\", fontsize=12)\n",
|
||||
"a0.set_ylabel(\"Residual Values\", fontsize=12)\n",
|
||||
"\n",
|
||||
"# Plot residual values of test set.\n",
|
||||
"a1.axis([0, 90, -100, 100])\n",
|
||||
"a1.plot(y_residual_test, 'bo', alpha = 0.5)\n",
|
||||
"a1.plot([-10,360],[0,0], 'r-', lw = 3)\n",
|
||||
"a1.text(5,170,'RMSE = {0:.2f}'.format(np.sqrt(mean_squared_error(y_test, y_pred_test))), fontsize = 12)\n",
|
||||
"a1.text(5,140,'R2 score = {0:.2f}'.format(r2_score(y_test, y_pred_test)),fontsize = 12)\n",
|
||||
"a1.set_xlabel('Test samples', fontsize = 12)\n",
|
||||
"a1.plot(y_residual_test, \"bo\", alpha=0.5)\n",
|
||||
"a1.plot([-10, 360], [0, 0], \"r-\", lw=3)\n",
|
||||
"a1.text(\n",
|
||||
" 5,\n",
|
||||
" 170,\n",
|
||||
" \"RMSE = {0:.2f}\".format(np.sqrt(mean_squared_error(y_test, y_pred_test))),\n",
|
||||
" fontsize=12,\n",
|
||||
")\n",
|
||||
"a1.text(5, 140, \"R2 score = {0:.2f}\".format(r2_score(y_test, y_pred_test)), fontsize=12)\n",
|
||||
"a1.set_xlabel(\"Test samples\", fontsize=12)\n",
|
||||
"a1.set_yticklabels([])\n",
|
||||
"\n",
|
||||
"plt.show()"
|
||||
@@ -430,9 +421,11 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%matplotlib inline\n",
|
||||
"test_pred = plt.scatter(y_test, y_pred_test, color='')\n",
|
||||
"test_test = plt.scatter(y_test, y_test, color='g')\n",
|
||||
"plt.legend((test_pred, test_test), ('prediction', 'truth'), loc='upper left', fontsize=8)\n",
|
||||
"test_pred = plt.scatter(y_test, y_pred_test, color=\"\")\n",
|
||||
"test_test = plt.scatter(y_test, y_test, color=\"g\")\n",
|
||||
"plt.legend(\n",
|
||||
" (test_pred, test_test), (\"prediction\", \"truth\"), loc=\"upper left\", fontsize=8\n",
|
||||
")\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
@@ -455,9 +448,9 @@
|
||||
"automated-machine-learning"
|
||||
],
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.6",
|
||||
"display_name": "Python 3.6 - AzureML",
|
||||
"language": "python",
|
||||
"name": "python36"
|
||||
"name": "python3-azureml"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
||||
@@ -70,7 +70,7 @@
|
||||
"\n",
|
||||
"import urllib.request\n",
|
||||
"\n",
|
||||
"onnx_model_url = \"https://github.com/onnx/models/blob/main/vision/body_analysis/emotion_ferplus/model/emotion-ferplus-7.tar.gz?raw=true\"\n",
|
||||
"onnx_model_url = \"https://github.com/onnx/models/blob/master/vision/body_analysis/emotion_ferplus/model/emotion-ferplus-7.tar.gz?raw=true\"\n",
|
||||
"\n",
|
||||
"urllib.request.urlretrieve(onnx_model_url, filename=\"emotion-ferplus-7.tar.gz\")\n",
|
||||
"\n",
|
||||
|
||||
@@ -70,7 +70,7 @@
|
||||
"\n",
|
||||
"import urllib.request\n",
|
||||
"\n",
|
||||
"onnx_model_url = \"https://github.com/onnx/models/blob/main/vision/classification/mnist/model/mnist-7.tar.gz?raw=true\"\n",
|
||||
"onnx_model_url = \"https://github.com/onnx/models/blob/master/vision/classification/mnist/model/mnist-7.tar.gz?raw=true\"\n",
|
||||
"\n",
|
||||
"urllib.request.urlretrieve(onnx_model_url, filename=\"mnist-7.tar.gz\")"
|
||||
]
|
||||
|
||||
@@ -5,6 +5,17 @@ import argparse
|
||||
import os
|
||||
from azureml.core import Run
|
||||
|
||||
|
||||
def get_dict(dict_str):
    """Parse a dict-like command-line string into a ``dict`` of strings.

    The expected input looks like ``{'key1': 'value1'\\; 'key2': 'value2'}``:
    an optional pair of surrounding braces, with ``key: value`` pairs
    separated by the literal two-character sequence ``\\;``. Surrounding
    whitespace and single quotes are removed from every key and value.

    Args:
        dict_str: The string representation to parse.

    Returns:
        A dict mapping each parsed key to its parsed value (both ``str``).
        An empty mapping (``"{}"`` or ``""``) yields ``{}``.

    Raises:
        ValueError: If a non-empty pair contains no ``:`` separator.
    """
    new_dict = {}
    for pair in dict_str.strip("{}").split(r'\;'):
        pair = pair.strip()
        if not pair:
            # Empty segment: produced by an empty dict string or a trailing
            # separator. Nothing to record, so skip it instead of failing.
            continue
        # Split on the first colon only so that values which themselves
        # contain ':' (e.g. timestamps) are kept intact.
        key, sep, value = pair.partition(":")
        if not sep:
            raise ValueError(
                "Expected 'key: value' pair but got: {!r}".format(pair)
            )
        new_dict[key.strip().strip("'")] = value.strip().strip("'")

    return new_dict
|
||||
|
||||
|
||||
print("Cleans the input data")
|
||||
|
||||
# Get the input green_taxi_data. To learn more about how to access dataset in your script, please
|
||||
@@ -12,6 +23,7 @@ print("Cleans the input data")
|
||||
run = Run.get_context()
|
||||
raw_data = run.input_datasets["raw_data"]
|
||||
|
||||
|
||||
parser = argparse.ArgumentParser("cleanse")
|
||||
parser.add_argument("--output_cleanse", type=str, help="cleaned taxi data directory")
|
||||
parser.add_argument("--useful_columns", type=str, help="useful columns to keep")
|
||||
@@ -26,8 +38,8 @@ print("Argument 3(output cleansed taxi data path): %s" % args.output_cleanse)
|
||||
# These functions ensure that null data is removed from the dataset,
|
||||
# which will help increase machine learning model accuracy.
|
||||
|
||||
useful_columns = eval(args.useful_columns.replace(';', ','))
|
||||
columns = eval(args.columns.replace(';', ','))
|
||||
useful_columns = [s.strip().strip("'") for s in args.useful_columns.strip("[]").split(r'\;')]
|
||||
columns = get_dict(args.columns)
|
||||
|
||||
new_df = (raw_data.to_pandas_dataframe()
|
||||
.dropna(how='all')
|
||||
|
||||
@@ -254,7 +254,6 @@
|
||||
"- conda-forge\n",
|
||||
"dependencies:\n",
|
||||
"- python=3.6.2\n",
|
||||
"- pip=21.3.1\n",
|
||||
"- pip:\n",
|
||||
" - azureml-defaults\n",
|
||||
" - azureml-opendatasets\n",
|
||||
|
||||
@@ -431,7 +431,6 @@
|
||||
"- conda-forge\n",
|
||||
"dependencies:\n",
|
||||
"- python=3.6.2\n",
|
||||
"- pip=21.3.1\n",
|
||||
"- pip:\n",
|
||||
" - h5py<=2.10.0\n",
|
||||
" - azureml-defaults\n",
|
||||
|
||||
@@ -262,7 +262,6 @@
|
||||
"- conda-forge\n",
|
||||
"dependencies:\n",
|
||||
"- python=3.6.2\n",
|
||||
"- pip=21.3.1\n",
|
||||
"- pip:\n",
|
||||
" - azureml-defaults\n",
|
||||
" - torch==1.6.0\n",
|
||||
|
||||