updating to use AML base image and system managed dependencies

2025-12-19 17:17:04 -05:00 · 2019-09-24 20:47:15 -07:00
parent 6622a6c5f2
commit f252308005
3 changed files with 632 additions and 649 deletions
--- a/contrib/RAPIDS/azure-ml-with-nvidia-rapids.ipynb
+++ b/contrib/RAPIDS/azure-ml-with-nvidia-rapids.ipynb
@@ -9,6 +9,13 @@
    "Licensed under the MIT License."
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/contrib/RAPIDS/azure-ml-with-nvidia-rapids/azure-ml-with-nvidia-rapids.png)"
+   ]
+  },
  {
   "cell_type": "markdown",
   "metadata": {},
@@ -20,7 +27,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-        "The [RAPIDS](https://www.developer.nvidia.com/rapids) suite of software libraries from NVIDIA enables the execution of end-to-end data science and analytics pipelines entirely on GPUs. In many machine learning projects, a significant portion of the model training time is spent in setting up the data; this stage of the process is known as Extraction, Transformation and Loading, or ETL. By using the DataFrame API for ETL\u00c3\u201a\u00c2\u00a0and GPU-capable ML algorithms in RAPIDS, data preparation and training models can be done in GPU-accelerated end-to-end pipelines without incurring serialization costs between the pipeline stages. This notebook demonstrates how to use NVIDIA RAPIDS to prepare data and train model\u00c2\u00a0in Azure.\n",
+    "The [RAPIDS](https://www.developer.nvidia.com/rapids) suite of software libraries from NVIDIA enables the execution of end-to-end data science and analytics pipelines entirely on GPUs. In many machine learning projects, a significant portion of the model training time is spent in setting up the data; this stage of the process is known as Extraction, Transformation and Loading, or ETL. By using the DataFrame API for ETL and GPU-capable ML algorithms in RAPIDS, data preparation and training models can be done in GPU-accelerated end-to-end pipelines without incurring serialization costs between the pipeline stages. This notebook demonstrates how to use NVIDIA RAPIDS to prepare data and train a model in Azure.\n",
    " \n",
    "In this notebook, we will do the following:\n",
    " \n",
@@ -119,8 +126,10 @@
   "outputs": [],
   "source": [
    "ws = Workspace.from_config()\n",
+    "\n",
    "# if a locally-saved configuration file for the workspace is not available, use the following to load workspace\n",
    "# ws = Workspace(subscription_id=subscription_id, resource_group=resource_group, workspace_name=workspace_name)\n",
+    "\n",
    "print('Workspace name: ' + ws.name, \n",
    "      'Azure region: ' + ws.location, \n",
    "      'Subscription id: ' + ws.subscription_id, \n",
@@ -161,7 +170,7 @@
    "if gpu_cluster_name in ws.compute_targets:\n",
    "    gpu_cluster = ws.compute_targets[gpu_cluster_name]\n",
    "    if gpu_cluster and type(gpu_cluster) is AmlCompute:\n",
-        "        print('found compute target. just use it. ' + gpu_cluster_name)\n",
+    "        print('Found compute target. Will use {0} '.format(gpu_cluster_name))\n",
    "else:\n",
    "    print(\"creating new cluster\")\n",
    "    # vm_size parameter below could be modified to one of the RAPIDS-supported VM types\n",
@@ -183,7 +192,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-        "The _process&#95;data.py_ script used in the step below is a slightly modified implementation of [RAPIDS E2E example](https://github.com/rapidsai/notebooks/blob/master/mortgage/E2E.ipynb)."
+    "The _process&#95;data.py_ script used in the step below is a slightly modified implementation of [RAPIDS Mortgage E2E example](https://github.com/rapidsai/notebooks-contrib/blob/master/intermediate_notebooks/E2E/mortgage/mortgage_e2e.ipynb)."
   ]
  },
  {
@@ -194,10 +203,7 @@
   "source": [
    "# copy process_data.py into the script folder\n",
    "import shutil\n",
-        "shutil.copy('./process_data.py', os.path.join(scripts_folder, 'process_data.py'))\n",
-        "\n",
-        "with open(os.path.join(scripts_folder, './process_data.py'), 'r') as process_data_script:\n",
-        "    print(process_data_script.read())"
+    "shutil.copy('./process_data.py', os.path.join(scripts_folder, 'process_data.py'))"
   ]
  },
  {
@@ -221,13 +227,6 @@
    "### Downloading Data"
   ]
  },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "<font color='red'>Important</font>: Python package progressbar2 is necessary to run the following cell. If it is not available in your environment where this notebook is running, please install it."
-      ]
-    },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -237,7 +236,6 @@
    "import tarfile\n",
    "import hashlib\n",
    "from urllib.request import urlretrieve\n",
-        "from progressbar import ProgressBar\n",
    "\n",
    "def validate_downloaded_data(path):\n",
    "    if(os.path.isdir(path) and os.path.exists(path + '//names.csv')) :\n",
@@ -267,7 +265,7 @@
    "        url_format = 'http://rapidsai-data.s3-website.us-east-2.amazonaws.com/notebook-mortgage-data/{0}.tgz'\n",
    "        url = url_format.format(fileroot)\n",
    "        print(\"...Downloading file :{0}\".format(filename))\n",
-        "        urlretrieve(url, filename,show_progress)\n",
+    "        urlretrieve(url, filename)\n",
    "        pbar.finish()\n",
    "        print(\"...File :{0} finished downloading\".format(filename))\n",
    "    else:\n",
@@ -282,9 +280,7 @@
    "    so_far = 0\n",
    "    for member_info in members:\n",
    "        tar.extract(member_info,path=path)\n",
-        "        show_progress(so_far, 1, numFiles)\n",
    "        so_far += 1\n",
-        "    pbar.finish()\n",
    "    print(\"...All {0} files have been decompressed\".format(numFiles))\n",
    "    tar.close()"
   ]
@@ -324,7 +320,9 @@
    "\n",
    "# download and uncompress data in a local directory before uploading to data store\n",
    "# directory specified in src_dir parameter below should have the acq, perf directories with data and names.csv file\n",
-        "ds.upload(src_dir=path, target_path=fileroot, overwrite=True, show_progress=True)\n",
+    "\n",
+    "# ---->>>> UNCOMMENT THE LINE BELOW TO UPLOAD YOUR DATA IF YOU HAVE NOT ALREADY DONE SO <<<<----\n",
+    "# ds.upload(src_dir=path, target_path=fileroot, overwrite=True, show_progress=True)\n",
    "\n",
    "# data already uploaded to the datastore\n",
    "data_ref = DataReference(data_reference_name='data', datastore=ds, path_on_datastore=fileroot)"
@@ -360,7 +358,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-        "The following code shows how to use an existing image from [Docker Hub](https://hub.docker.com/r/rapidsai/rapidsai/) that has a prebuilt conda environment named 'rapids' when creating a RunConfiguration. Note that this conda environment does not include azureml-defaults package that is required for using AML functionality like metrics tracking, model management etc. This package is automatically installed when you use 'Specify package dependencies' option and that is why it is the recommended option to create RunConfiguraiton in AML."
+    "The following code shows how to install RAPIDS using conda. The `rapids.yml` file contains the list of packages necessary to run this tutorial. **NOTE:** The initial build of the image might take up to 20 minutes as the service needs to build and cache the new image; once the image is built, subsequent runs use the cached image and the overhead is minimal."
   ]
  },
  {
@@ -369,17 +367,13 @@
   "metadata": {},
   "outputs": [],
   "source": [
-        "run_config = RunConfiguration()\n",
+    "cd = CondaDependencies(conda_dependencies_file_path='rapids.yml')\n",
+    "run_config = RunConfiguration(conda_dependencies=cd)\n",
    "run_config.framework = 'python'\n",
-        "run_config.environment.python.user_managed_dependencies = True\n",
-        "run_config.environment.python.interpreter_path = '/conda/envs/rapids/bin/python'\n",
    "run_config.target = gpu_cluster_name\n",
    "run_config.environment.docker.enabled = True\n",
    "run_config.environment.docker.gpu_support = True\n",
-        "run_config.environment.docker.base_image = \"rapidsai/rapidsai:cuda9.2-runtime-ubuntu18.04\"\n",
-        "# run_config.environment.docker.base_image_registry.address = '<registry_url>' # not required if the base_image is in Docker hub\n",
-        "# run_config.environment.docker.base_image_registry.username = '<user_name>' # needed only for private images\n",
-        "# run_config.environment.docker.base_image_registry.password = '<password>' # needed only for private images\n",
+    "run_config.environment.docker.base_image = \"mcr.microsoft.com/azureml/base-gpu:intelmpi2018.3-cuda10.0-cudnn7-ubuntu16.04\"\n",
    "run_config.environment.spark.precache_packages = False\n",
    "run_config.data_references={'data':data_ref.to_config()}"
   ]
@@ -388,14 +382,14 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-        "#### Specify package dependencies"
+    "#### Using Docker"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-        "The following code shows how to list package dependencies in a conda environment definition file (rapids.yml) when creating a RunConfiguration"
+    "Alternatively, you can specify the RAPIDS Docker image."
   ]
  },
  {
@@ -404,16 +398,17 @@
   "metadata": {},
   "outputs": [],
   "source": [
-        "# cd = CondaDependencies(conda_dependencies_file_path='rapids.yml')\n",
-        "# run_config = RunConfiguration(conda_dependencies=cd)\n",
+    "# run_config = RunConfiguration()\n",
    "# run_config.framework = 'python'\n",
+    "# run_config.environment.python.user_managed_dependencies = True\n",
+    "# run_config.environment.python.interpreter_path = '/conda/envs/rapids/bin/python'\n",
    "# run_config.target = gpu_cluster_name\n",
    "# run_config.environment.docker.enabled = True\n",
    "# run_config.environment.docker.gpu_support = True\n",
-        "# run_config.environment.docker.base_image = \"<image>\"\n",
-        "# run_config.environment.docker.base_image_registry.address = '<registry_url>' # not required if the base_image is in Docker hub\n",
-        "# run_config.environment.docker.base_image_registry.username = '<user_name>' # needed only for private images\n",
-        "# run_config.environment.docker.base_image_registry.password = '<password>' # needed only for private images\n",
+    "# run_config.environment.docker.base_image = \"rapidsai/rapidsai:cuda9.2-runtime-ubuntu18.04\"\n",
+    "# # run_config.environment.docker.base_image_registry.address = '<registry_url>' # not required if the base_image is in Docker hub\n",
+    "# # run_config.environment.docker.base_image_registry.username = '<user_name>' # needed only for private images\n",
+    "# # run_config.environment.docker.base_image_registry.password = '<password>' # needed only for private images\n",
    "# run_config.environment.spark.precache_packages = False\n",
    "# run_config.data_references={'data':data_ref.to_config()}"
   ]
@@ -551,9 +546,9 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-      "version": "3.6.6"
+   "version": "3.6.8"
  }
 },
 "nbformat": 4,
-  "nbformat_minor": 2
+ "nbformat_minor": 4
 }
--- a/contrib/RAPIDS/process_data.py
+++ b/contrib/RAPIDS/process_data.py
@@ -15,21 +15,6 @@ from glob import glob
 import os
 import argparse

-def initialize_rmm_pool():
-    from librmm_cffi import librmm_config as rmm_cfg
-
-    rmm_cfg.use_pool_allocator = True
-    #rmm_cfg.initial_pool_size = 2<<30 # set to 2GiB. Default is 1/2 total GPU memory
-    import cudf
-    return cudf._gdf.rmm_initialize()
-
-def initialize_rmm_no_pool():
-    from librmm_cffi import librmm_config as rmm_cfg
-    
-    rmm_cfg.use_pool_allocator = False
-    import cudf
-    return cudf._gdf.rmm_initialize()
-
 def run_dask_task(func, **kwargs):
    task = func(**kwargs)
    return task
@@ -207,26 +192,26 @@ def gpu_load_names(col_path):

 def create_ever_features(gdf, **kwargs):
    everdf = gdf[['loan_id', 'current_loan_delinquency_status']]
-    everdf = everdf.groupby('loan_id', method='hash').max()
+    everdf = everdf.groupby('loan_id', method='hash').max().reset_index()
    del(gdf)
-    everdf['ever_30'] = (everdf['max_current_loan_delinquency_status'] >= 1).astype('int8')
-    everdf['ever_90'] = (everdf['max_current_loan_delinquency_status'] >= 3).astype('int8')
-    everdf['ever_180'] = (everdf['max_current_loan_delinquency_status'] >= 6).astype('int8')
-    everdf.drop_column('max_current_loan_delinquency_status')
+    everdf['ever_30'] = (everdf['current_loan_delinquency_status'] >= 1).astype('int8')
+    everdf['ever_90'] = (everdf['current_loan_delinquency_status'] >= 3).astype('int8')
+    everdf['ever_180'] = (everdf['current_loan_delinquency_status'] >= 6).astype('int8')
+    everdf.drop_column('current_loan_delinquency_status')
    return everdf

 def create_delinq_features(gdf, **kwargs):
    delinq_gdf = gdf[['loan_id', 'monthly_reporting_period', 'current_loan_delinquency_status']]
    del(gdf)
-    delinq_30 = delinq_gdf.query('current_loan_delinquency_status >= 1')[['loan_id', 'monthly_reporting_period']].groupby('loan_id', method='hash').min()
-    delinq_30['delinquency_30'] = delinq_30['min_monthly_reporting_period']
-    delinq_30.drop_column('min_monthly_reporting_period')
-    delinq_90 = delinq_gdf.query('current_loan_delinquency_status >= 3')[['loan_id', 'monthly_reporting_period']].groupby('loan_id', method='hash').min()
-    delinq_90['delinquency_90'] = delinq_90['min_monthly_reporting_period']
-    delinq_90.drop_column('min_monthly_reporting_period')
-    delinq_180 = delinq_gdf.query('current_loan_delinquency_status >= 6')[['loan_id', 'monthly_reporting_period']].groupby('loan_id', method='hash').min()
-    delinq_180['delinquency_180'] = delinq_180['min_monthly_reporting_period']
-    delinq_180.drop_column('min_monthly_reporting_period')
+    delinq_30 = delinq_gdf.query('current_loan_delinquency_status >= 1')[['loan_id', 'monthly_reporting_period']].groupby('loan_id', method='hash').min().reset_index()
+    delinq_30['delinquency_30'] = delinq_30['monthly_reporting_period']
+    delinq_30.drop_column('monthly_reporting_period')
+    delinq_90 = delinq_gdf.query('current_loan_delinquency_status >= 3')[['loan_id', 'monthly_reporting_period']].groupby('loan_id', method='hash').min().reset_index()
+    delinq_90['delinquency_90'] = delinq_90['monthly_reporting_period']
+    delinq_90.drop_column('monthly_reporting_period')
+    delinq_180 = delinq_gdf.query('current_loan_delinquency_status >= 6')[['loan_id', 'monthly_reporting_period']].groupby('loan_id', method='hash').min().reset_index()
+    delinq_180['delinquency_180'] = delinq_180['monthly_reporting_period']
+    delinq_180.drop_column('monthly_reporting_period')
    del(delinq_gdf)
    delinq_merge = delinq_30.merge(delinq_90, how='left', on=['loan_id'], type='hash')
    delinq_merge['delinquency_90'] = delinq_merge['delinquency_90'].fillna(np.dtype('datetime64[ms]').type('1970-01-01').astype('datetime64[ms]'))
@@ -279,16 +264,15 @@ def create_joined_df(gdf, everdf, **kwargs):
 def create_12_mon_features(joined_df, **kwargs):
    testdfs = []
    n_months = 12
+
    for y in range(1, n_months + 1):
        tmpdf = joined_df[['loan_id', 'timestamp_year', 'timestamp_month', 'delinquency_12', 'upb_12']]
        tmpdf['josh_months'] = tmpdf['timestamp_year'] * 12 + tmpdf['timestamp_month']
        tmpdf['josh_mody_n'] = ((tmpdf['josh_months'].astype('float64') - 24000 - y) / 12).floor()
-        tmpdf = tmpdf.groupby(['loan_id', 'josh_mody_n'], method='hash').agg({'delinquency_12': 'max','upb_12': 'min'})
-        tmpdf['delinquency_12'] = (tmpdf['max_delinquency_12']>3).astype('int32')
-        tmpdf['delinquency_12'] +=(tmpdf['min_upb_12']==0).astype('int32')
-        tmpdf.drop_column('max_delinquency_12')
-        tmpdf['upb_12'] = tmpdf['min_upb_12']
-        tmpdf.drop_column('min_upb_12')
+        tmpdf = tmpdf.groupby(['loan_id', 'josh_mody_n'], method='hash').agg({'delinquency_12': 'max','upb_12': 'min'}).reset_index()
+        tmpdf['delinquency_12'] = (tmpdf['delinquency_12']>3).astype('int32')
+        tmpdf['delinquency_12'] +=(tmpdf['upb_12']==0).astype('int32')
+        tmpdf['upb_12'] = tmpdf['upb_12']
        tmpdf['timestamp_year'] = (((tmpdf['josh_mody_n'] * n_months) + 24000 + (y - 1)) / 12).floor().astype('int16')
        tmpdf['timestamp_month'] = np.int8(y)
        tmpdf.drop_column('josh_mody_n')
@@ -329,6 +313,7 @@ def last_mile_cleaning(df, **kwargs):
        'delinquency_30', 'delinquency_90', 'delinquency_180', 'upb_12',
        'zero_balance_effective_date','foreclosed_after', 'disposition_date','timestamp'
    ]
+
    for column in drop_list:
        df.drop_column(column)
    for col, dtype in df.dtypes.iteritems():
@@ -342,7 +327,6 @@ def last_mile_cleaning(df, **kwargs):
    return df.to_arrow(preserve_index=False)

 def main():
-    #print('XGBOOST_BUILD_DOC is ' + os.environ['XGBOOST_BUILD_DOC'])
    parser = argparse.ArgumentParser("rapidssample")
    parser.add_argument("--data_dir", type=str, help="location of data")
    parser.add_argument("--num_gpu", type=int, help="Number of GPUs to use", default=1)
@@ -364,7 +348,6 @@ def main():
    print('data_dir = {0}'.format(data_dir))
    print('num_gpu = {0}'.format(num_gpu))
    print('part_count = {0}'.format(part_count))
-    #part_count = part_count + 1 # adding one because the usage below is not inclusive
    print('end_year = {0}'.format(end_year))
    print('cpu_predictor = {0}'.format(cpu_predictor))
    
@@ -385,12 +368,10 @@ def main():
    perf_data_path = "{0}/perf".format(data_dir) #"/rapids/data/mortgage/perf"
    col_names_path = "{0}/names.csv".format(data_dir) # "/rapids/data/mortgage/names.csv"
    start_year = 2000
-#end_year = 2000 # end_year is inclusive -- converted to parameter
-#part_count = 2 # the number of data files to train against -- converted to parameter

-    client.run(initialize_rmm_pool)
    client
-    print(client.ncores())
+    print('--->>> Workers used: {0}'.format(client.ncores()))
+
    # NOTE: The ETL calculates additional features which are then dropped before creating the XGBoost DMatrix.
    # This can be optimized to avoid calculating the dropped features.
    print("Reading ...")
@@ -414,14 +395,9 @@ def main():
            
    wait(gpu_dfs)
    t2 = datetime.datetime.now()
-    print("Reading time ...")
-    print(t2-t1)
-    print('len(gpu_dfs) is {0}'.format(len(gpu_dfs)))
+    print("Reading time: {0}".format(str(t2-t1)))
+    print('--->>> Number of data parts: {0}'.format(len(gpu_dfs)))

-    client.run(cudf._gdf.rmm_finalize)
-    client.run(initialize_rmm_no_pool)
-    client
-    print(client.ncores())
    dxgb_gpu_params = {
        'nround':            100,
        'max_depth':         8,
@@ -438,7 +414,7 @@ def main():
        'n_gpus':            1, 
        'distributed_dask':  True,
        'loss':              'ls',
-        'objective':         'gpu:reg:linear',
+        'objective':         'reg:squarederror',
        'max_features':      'auto',
        'criterion':         'friedman_mse',
        'grow_policy':       'lossguide',
@@ -446,13 +422,13 @@ def main():
    }
      
    if cpu_predictor:
-        print('Training using CPUs')
+        print('\n---->>>> Training using CPUs <<<<----\n')
        dxgb_gpu_params['predictor'] = 'cpu_predictor'
        dxgb_gpu_params['tree_method'] = 'hist'
        dxgb_gpu_params['objective'] = 'reg:linear'
        
    else:
-        print('Training using GPUs')
+        print('\n---->>>> Training using GPUs <<<<----\n')
    
    print('Training parameters are {0}'.format(dxgb_gpu_params))
    
@@ -482,13 +458,12 @@ def main():
    gc.collect()
    wait(gpu_dfs)

+    # TRAIN THE MODEL
    labels = None
    t1 = datetime.datetime.now()
    bst = dxgb_gpu.train(client, dxgb_gpu_params, gpu_dfs, labels, num_boost_round=dxgb_gpu_params['nround'])
    t2 = datetime.datetime.now()
-    print("Training time ...")
-    print(t2-t1)
-    print('str(bst) is {0}'.format(str(bst)))
+    print('\n---->>>> Training time: {0} <<<<----\n'.format(str(t2-t1)))
    print('Exiting script')

 if __name__ == '__main__':
--- a/contrib/RAPIDS/rapids.yml
+++ b/contrib/RAPIDS/rapids.yml
@@ -1,35 +1,48 @@
-name: rapids
+name: rapids0.9
 channels:
  - nvidia
- numba
- conda-forge
+  - rapidsai/label/xgboost
  - rapidsai
- defaults
+  - conda-forge
+  - numba
  - pytorch
-
 dependencies:
- arrow-cpp=0.12.0
- bokeh
- cffi=1.11.5
- cmake=3.12
- cuda92
- cython==0.29
- dask=1.1.1
- distributed=1.25.3
- faiss-gpu=1.5.0
- numba=0.42
- numpy=1.15.4
- nvstrings
- pandas=0.23.4
- pyarrow=0.12.0
- scikit-learn
- scipy
- cudf
- cuml
- python=3.6.2
+  - python=3.7
+  - pytorch
+  - cudatoolkit=10.0
+  - dask-cuda=0.9.1
+  - cudf=0.9.*
+  - cuml=0.9.*
+  - cugraph=0.9.*
+  - rapidsai/label/xgboost::xgboost=0.90.rapidsdev1
+  - rapidsai/label/xgboost::dask-xgboost=0.2.*
+  - conda-forge::numpy=1.16.4
+  - cython
+  - dask
+  - distributed=2.3.2
+  - pynvml=8.0.2
+  - gcsfs
+  - requests
+  - jupyterhub
  - jupyterlab
+  - matplotlib
+  - ipywidgets
+  - ipyvolume
+  - seaborn
+  - scipy
+  - pandas
+  - boost
+  - nodejs
+  - pytest
+  - pip
  - pip:
-  - file:/rapids/xgboost/python-package/dist/xgboost-0.81-py3-none-any.whl
-  - git+https://github.com/rapidsai/dask-xgboost@dask-cudf
-  - git+https://github.com/rapidsai/dask-cudf@master
-  - git+https://github.com/rapidsai/dask-cuda@master
+     - git+https://github.com/cupy/cupy.git
+     - setuptools
+     - torch
+     - torchvision
+     - pytorch-ignite
+     - graphviz
+     - networkx
+     - dask-kubernetes
+     - dask_labextension
+     - jupyterlab-nvdashboard