mirror of
https://github.com/Azure/MachineLearningNotebooks.git
synced 2025-12-22 10:35:12 -05:00
270 lines
9.1 KiB
Plaintext
270 lines
9.1 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Copyright (c) Microsoft Corporation. All rights reserved. \n",
|
|
"Licensed under the MIT License."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Use LightGBM Estimator in Azure Machine Learning\n",
|
|
"In this notebook we will demonstrate how to run a training job using LightGBM Estimator. [LightGBM](https://lightgbm.readthedocs.io/en/latest/) is a gradient boosting framework that uses tree based learning algorithms. "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Prerequisites\n",
|
|
"This notebook uses the azureml-contrib-gbdt package. If you don't already have the package, please install it by uncommenting the cell below."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#!pip install azureml-contrib-gbdt"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from azureml.core import Workspace, Run, Experiment\n",
|
|
"import shutil, os\n",
|
|
"from azureml.widgets import RunDetails\n",
|
|
"from azureml.contrib.gbdt import LightGBM\n",
|
|
"from azureml.train.dnn import Mpi\n",
|
|
"from azureml.core.compute import AmlCompute, ComputeTarget\n",
|
|
"from azureml.core.compute_target import ComputeTargetException"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"If you are using an AzureML Compute Instance, you are all set. Otherwise, go through the [configuration.ipynb](../../../configuration.ipynb) notebook to install the Azure Machine Learning Python SDK and create an Azure ML Workspace."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Set up machine learning resources"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"ws = Workspace.from_config()\n",
|
|
"\n",
|
|
"print('Workspace name: ' + ws.name, \n",
|
|
" 'Azure region: ' + ws.location, \n",
|
|
" 'Subscription id: ' + ws.subscription_id, \n",
|
|
" 'Resource group: ' + ws.resource_group, sep = '\\n')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"cluster_vm_size = \"STANDARD_DS14_V2\"\n",
|
|
"cluster_min_nodes = 0\n",
|
|
"cluster_max_nodes = 20\n",
|
|
"cpu_cluster_name = 'TrainingCompute2' \n",
|
|
"\n",
|
|
"try:\n",
|
|
" cpu_cluster = AmlCompute(ws, cpu_cluster_name)\n",
|
|
" if cpu_cluster and type(cpu_cluster) is AmlCompute:\n",
|
|
" print('found compute target: ' + cpu_cluster_name)\n",
|
|
"except ComputeTargetException:\n",
|
|
" print('creating a new compute target...')\n",
|
|
" provisioning_config = AmlCompute.provisioning_configuration(vm_size = cluster_vm_size, \n",
|
|
" vm_priority = 'lowpriority', \n",
|
|
" min_nodes = cluster_min_nodes, \n",
|
|
" max_nodes = cluster_max_nodes)\n",
|
|
" cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, provisioning_config)\n",
|
|
" \n",
|
|
" # can poll for a minimum number of nodes and for a specific timeout. \n",
|
|
" # if no min node count is provided it will use the scale settings for the cluster\n",
|
|
" cpu_cluster.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)\n",
|
|
" \n",
|
|
" # For a more detailed view of current Azure Machine Learning Compute status, use get_status()\n",
|
|
" print(cpu_cluster.get_status().serialize())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"From this point, you can either upload the training data files directly or use a Datastore for training data storage\n",
|
|
"## Upload training file from local"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"scripts_folder = \"scripts_folder\"\n",
|
|
"if not os.path.isdir(scripts_folder):\n",
|
|
" os.mkdir(scripts_folder)\n",
|
|
"shutil.copy('./train.conf', os.path.join(scripts_folder, 'train.conf'))\n",
|
|
"shutil.copy('./binary0.train', os.path.join(scripts_folder, 'binary0.train'))\n",
|
|
"shutil.copy('./binary1.train', os.path.join(scripts_folder, 'binary1.train'))\n",
|
|
"shutil.copy('./binary0.test', os.path.join(scripts_folder, 'binary0.test'))\n",
|
|
"shutil.copy('./binary1.test', os.path.join(scripts_folder, 'binary1.test'))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"training_data_list=[\"binary0.train\", \"binary1.train\"]\n",
|
|
"validation_data_list = [\"binary0.test\", \"binary1.test\"]\n",
|
|
"lgbm = LightGBM(source_directory=scripts_folder, \n",
|
|
" compute_target=cpu_cluster, \n",
|
|
" distributed_training=Mpi(),\n",
|
|
" node_count=2,\n",
|
|
" lightgbm_config='train.conf',\n",
|
|
" data=training_data_list,\n",
|
|
" valid=validation_data_list\n",
|
|
" )\n",
|
|
"experiment_name = 'lightgbm-estimator-test'\n",
|
|
"experiment = Experiment(ws, name=experiment_name)\n",
|
|
"run = experiment.submit(lgbm, tags={\"test public docker image\": None})\n",
|
|
"RunDetails(run).show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"run.wait_for_completion(show_output=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Use data reference"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from azureml.core.datastore import Datastore\n",
|
|
"from azureml.data.data_reference import DataReference\n",
|
|
"datastore = ws.get_default_datastore()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"datastore.upload(src_dir='.',\n",
|
|
" target_path='.',\n",
|
|
" show_progress=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"training_data_list=[\"binary0.train\", \"binary1.train\"]\n",
|
|
"validation_data_list = [\"binary0.test\", \"binary1.test\"]\n",
|
|
"lgbm = LightGBM(source_directory='.', \n",
|
|
" compute_target=cpu_cluster, \n",
|
|
" distributed_training=Mpi(),\n",
|
|
" node_count=2,\n",
|
|
" inputs=[datastore.as_mount()],\n",
|
|
" lightgbm_config='train.conf',\n",
|
|
" data=training_data_list,\n",
|
|
" valid=validation_data_list\n",
|
|
" )\n",
|
|
"experiment_name = 'lightgbm-estimator-test'\n",
|
|
"experiment = Experiment(ws, name=experiment_name)\n",
|
|
"run = experiment.submit(lgbm, tags={\"use datastore.as_mount()\": None})\n",
|
|
"RunDetails(run).show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"run.wait_for_completion(show_output=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# uncomment below and run if compute resources are no longer needed\n",
|
|
"# cpu_cluster.delete() "
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"authors": [
|
|
{
|
|
"name": "jingywa"
|
|
}
|
|
],
|
|
"kernelspec": {
|
|
"display_name": "Python 3.6",
|
|
"language": "python",
|
|
"name": "python36"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.6.9"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
} |