Mirror of https://github.com/Azure/MachineLearningNotebooks.git
Commit: updating to use AML base image and system-managed dependencies
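The hunks below update the RAPIDS mortgage ETL/XGBoost training script and its conda environment spec so that the run builds on the Azure ML base image with service-managed conda dependencies. As a rough illustration of that setup (not part of this commit), a submission script might point Azure ML at the updated spec roughly as follows; the file names rapids.yml and process_data.py and the experiment name are placeholders:

# Hypothetical sketch: submit the training script with the updated conda spec,
# letting Azure ML manage dependencies on top of its default GPU base image.
# File names ('rapids.yml', 'process_data.py') and the experiment name are assumptions.
from azureml.core import Environment, Experiment, ScriptRunConfig, Workspace
from azureml.core.runconfig import DEFAULT_GPU_IMAGE

ws = Workspace.from_config()

env = Environment.from_conda_specification(name='rapids0.9', file_path='rapids.yml')
env.docker.enabled = True
env.docker.base_image = DEFAULT_GPU_IMAGE        # AML-maintained base image
env.python.user_managed_dependencies = False     # "system managed": AML builds the conda env

src = ScriptRunConfig(source_directory='.', script='process_data.py',
                      arguments=['--data_dir', 'data', '--num_gpu', '1'])
src.run_config.environment = env

run = Experiment(ws, 'rapids-mortgage').submit(src)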
@@ -15,21 +15,6 @@ from glob import glob
 import os
 import argparse
 
-def initialize_rmm_pool():
-    from librmm_cffi import librmm_config as rmm_cfg
-
-    rmm_cfg.use_pool_allocator = True
-    #rmm_cfg.initial_pool_size = 2<<30 # set to 2GiB. Default is 1/2 total GPU memory
-    import cudf
-    return cudf._gdf.rmm_initialize()
-
-def initialize_rmm_no_pool():
-    from librmm_cffi import librmm_config as rmm_cfg
-
-    rmm_cfg.use_pool_allocator = False
-    import cudf
-    return cudf._gdf.rmm_initialize()
-
 def run_dask_task(func, **kwargs):
     task = func(**kwargs)
     return task
@@ -207,26 +192,26 @@ def gpu_load_names(col_path):
 
 def create_ever_features(gdf, **kwargs):
     everdf = gdf[['loan_id', 'current_loan_delinquency_status']]
-    everdf = everdf.groupby('loan_id', method='hash').max()
+    everdf = everdf.groupby('loan_id', method='hash').max().reset_index()
     del(gdf)
-    everdf['ever_30'] = (everdf['max_current_loan_delinquency_status'] >= 1).astype('int8')
-    everdf['ever_90'] = (everdf['max_current_loan_delinquency_status'] >= 3).astype('int8')
-    everdf['ever_180'] = (everdf['max_current_loan_delinquency_status'] >= 6).astype('int8')
-    everdf.drop_column('max_current_loan_delinquency_status')
+    everdf['ever_30'] = (everdf['current_loan_delinquency_status'] >= 1).astype('int8')
+    everdf['ever_90'] = (everdf['current_loan_delinquency_status'] >= 3).astype('int8')
+    everdf['ever_180'] = (everdf['current_loan_delinquency_status'] >= 6).astype('int8')
+    everdf.drop_column('current_loan_delinquency_status')
     return everdf
 
 def create_delinq_features(gdf, **kwargs):
     delinq_gdf = gdf[['loan_id', 'monthly_reporting_period', 'current_loan_delinquency_status']]
     del(gdf)
-    delinq_30 = delinq_gdf.query('current_loan_delinquency_status >= 1')[['loan_id', 'monthly_reporting_period']].groupby('loan_id', method='hash').min()
-    delinq_30['delinquency_30'] = delinq_30['min_monthly_reporting_period']
-    delinq_30.drop_column('min_monthly_reporting_period')
-    delinq_90 = delinq_gdf.query('current_loan_delinquency_status >= 3')[['loan_id', 'monthly_reporting_period']].groupby('loan_id', method='hash').min()
-    delinq_90['delinquency_90'] = delinq_90['min_monthly_reporting_period']
-    delinq_90.drop_column('min_monthly_reporting_period')
-    delinq_180 = delinq_gdf.query('current_loan_delinquency_status >= 6')[['loan_id', 'monthly_reporting_period']].groupby('loan_id', method='hash').min()
-    delinq_180['delinquency_180'] = delinq_180['min_monthly_reporting_period']
-    delinq_180.drop_column('min_monthly_reporting_period')
+    delinq_30 = delinq_gdf.query('current_loan_delinquency_status >= 1')[['loan_id', 'monthly_reporting_period']].groupby('loan_id', method='hash').min().reset_index()
+    delinq_30['delinquency_30'] = delinq_30['monthly_reporting_period']
+    delinq_30.drop_column('monthly_reporting_period')
+    delinq_90 = delinq_gdf.query('current_loan_delinquency_status >= 3')[['loan_id', 'monthly_reporting_period']].groupby('loan_id', method='hash').min().reset_index()
+    delinq_90['delinquency_90'] = delinq_90['monthly_reporting_period']
+    delinq_90.drop_column('monthly_reporting_period')
+    delinq_180 = delinq_gdf.query('current_loan_delinquency_status >= 6')[['loan_id', 'monthly_reporting_period']].groupby('loan_id', method='hash').min().reset_index()
+    delinq_180['delinquency_180'] = delinq_180['monthly_reporting_period']
+    delinq_180.drop_column('monthly_reporting_period')
     del(delinq_gdf)
     delinq_merge = delinq_30.merge(delinq_90, how='left', on=['loan_id'], type='hash')
     delinq_merge['delinquency_90'] = delinq_merge['delinquency_90'].fillna(np.dtype('datetime64[ms]').type('1970-01-01').astype('datetime64[ms]'))
@@ -279,16 +264,15 @@ def create_joined_df(gdf, everdf, **kwargs):
 def create_12_mon_features(joined_df, **kwargs):
     testdfs = []
     n_months = 12
 
     for y in range(1, n_months + 1):
         tmpdf = joined_df[['loan_id', 'timestamp_year', 'timestamp_month', 'delinquency_12', 'upb_12']]
         tmpdf['josh_months'] = tmpdf['timestamp_year'] * 12 + tmpdf['timestamp_month']
         tmpdf['josh_mody_n'] = ((tmpdf['josh_months'].astype('float64') - 24000 - y) / 12).floor()
-        tmpdf = tmpdf.groupby(['loan_id', 'josh_mody_n'], method='hash').agg({'delinquency_12': 'max','upb_12': 'min'})
-        tmpdf['delinquency_12'] = (tmpdf['max_delinquency_12']>3).astype('int32')
-        tmpdf['delinquency_12'] +=(tmpdf['min_upb_12']==0).astype('int32')
-        tmpdf.drop_column('max_delinquency_12')
-        tmpdf['upb_12'] = tmpdf['min_upb_12']
-        tmpdf.drop_column('min_upb_12')
+        tmpdf = tmpdf.groupby(['loan_id', 'josh_mody_n'], method='hash').agg({'delinquency_12': 'max','upb_12': 'min'}).reset_index()
+        tmpdf['delinquency_12'] = (tmpdf['delinquency_12']>3).astype('int32')
+        tmpdf['delinquency_12'] +=(tmpdf['upb_12']==0).astype('int32')
+        tmpdf['upb_12'] = tmpdf['upb_12']
         tmpdf['timestamp_year'] = (((tmpdf['josh_mody_n'] * n_months) + 24000 + (y - 1)) / 12).floor().astype('int16')
         tmpdf['timestamp_month'] = np.int8(y)
         tmpdf.drop_column('josh_mody_n')
@@ -329,6 +313,7 @@ def last_mile_cleaning(df, **kwargs):
         'delinquency_30', 'delinquency_90', 'delinquency_180', 'upb_12',
         'zero_balance_effective_date','foreclosed_after', 'disposition_date','timestamp'
     ]
 
     for column in drop_list:
         df.drop_column(column)
     for col, dtype in df.dtypes.iteritems():
@@ -342,7 +327,6 @@ def last_mile_cleaning(df, **kwargs):
     return df.to_arrow(preserve_index=False)
 
 def main():
-    #print('XGBOOST_BUILD_DOC is ' + os.environ['XGBOOST_BUILD_DOC'])
     parser = argparse.ArgumentParser("rapidssample")
     parser.add_argument("--data_dir", type=str, help="location of data")
     parser.add_argument("--num_gpu", type=int, help="Number of GPUs to use", default=1)
@@ -364,7 +348,6 @@ def main():
     print('data_dir = {0}'.format(data_dir))
     print('num_gpu = {0}'.format(num_gpu))
     print('part_count = {0}'.format(part_count))
-    #part_count = part_count + 1 # adding one because the usage below is not inclusive
     print('end_year = {0}'.format(end_year))
     print('cpu_predictor = {0}'.format(cpu_predictor))
 
@@ -380,19 +363,17 @@ def main():
     client
     print(client.ncores())
 
     # to download data for this notebook, visit https://rapidsai.github.io/demos/datasets/mortgage-data and update the following paths accordingly
     acq_data_path = "{0}/acq".format(data_dir) #"/rapids/data/mortgage/acq"
     perf_data_path = "{0}/perf".format(data_dir) #"/rapids/data/mortgage/perf"
     col_names_path = "{0}/names.csv".format(data_dir) # "/rapids/data/mortgage/names.csv"
     start_year = 2000
-    #end_year = 2000 # end_year is inclusive -- converted to parameter
-    #part_count = 2 # the number of data files to train against -- converted to parameter
 
-    client.run(initialize_rmm_pool)
     client
-    print(client.ncores())
+    print('--->>> Workers used: {0}'.format(client.ncores()))
+
     # NOTE: The ETL calculates additional features which are then dropped before creating the XGBoost DMatrix.
     # This can be optimized to avoid calculating the dropped features.
     print("Reading ...")
     t1 = datetime.datetime.now()
     gpu_dfs = []
@@ -414,14 +395,9 @@ def main():
 
     wait(gpu_dfs)
     t2 = datetime.datetime.now()
-    print("Reading time ...")
-    print(t2-t1)
-    print('len(gpu_dfs) is {0}'.format(len(gpu_dfs)))
+    print("Reading time: {0}".format(str(t2-t1)))
+    print('--->>> Number of data parts: {0}'.format(len(gpu_dfs)))
 
-    client.run(cudf._gdf.rmm_finalize)
-    client.run(initialize_rmm_no_pool)
-    client
-    print(client.ncores())
     dxgb_gpu_params = {
         'nround': 100,
         'max_depth': 8,
@@ -438,7 +414,7 @@ def main():
         'n_gpus': 1,
         'distributed_dask': True,
         'loss': 'ls',
-        'objective': 'gpu:reg:linear',
+        'objective': 'reg:squarederror',
         'max_features': 'auto',
         'criterion': 'friedman_mse',
         'grow_policy': 'lossguide',
@@ -446,13 +422,13 @@ def main():
     }
 
     if cpu_predictor:
-        print('Training using CPUs')
+        print('\n---->>>> Training using CPUs <<<<----\n')
         dxgb_gpu_params['predictor'] = 'cpu_predictor'
         dxgb_gpu_params['tree_method'] = 'hist'
         dxgb_gpu_params['objective'] = 'reg:linear'
 
     else:
-        print('Training using GPUs')
+        print('\n---->>>> Training using GPUs <<<<----\n')
 
     print('Training parameters are {0}'.format(dxgb_gpu_params))
 
@@ -481,14 +457,13 @@ def main():
     gpu_dfs = [gpu_df.persist() for gpu_df in gpu_dfs]
     gc.collect()
     wait(gpu_dfs)
 
+    # TRAIN THE MODEL
     labels = None
     t1 = datetime.datetime.now()
     bst = dxgb_gpu.train(client, dxgb_gpu_params, gpu_dfs, labels, num_boost_round=dxgb_gpu_params['nround'])
     t2 = datetime.datetime.now()
-    print("Training time ...")
-    print(t2-t1)
-    print('str(bst) is {0}'.format(str(bst)))
+    print('\n---->>>> Training time: {0} <<<<----\n'.format(str(t2-t1)))
     print('Exiting script')
 
 if __name__ == '__main__':
@@ -1,35 +1,48 @@
-name: rapids
+name: rapids0.9
 channels:
 - nvidia
-- numba
-- conda-forge
-- rapidsai
-- defaults
+- rapidsai/label/xgboost
+- rapidsai
+- conda-forge
+- numba
 - pytorch
 
 dependencies:
-- arrow-cpp=0.12.0
-- bokeh
-- cffi=1.11.5
-- cmake=3.12
-- cuda92
-- cython==0.29
-- dask=1.1.1
-- distributed=1.25.3
-- faiss-gpu=1.5.0
-- numba=0.42
-- numpy=1.15.4
-- nvstrings
-- pandas=0.23.4
-- pyarrow=0.12.0
-- scikit-learn
-- scipy
-- cudf
-- cuml
-- python=3.6.2
-- jupyterlab
-- pip:
-  - file:/rapids/xgboost/python-package/dist/xgboost-0.81-py3-none-any.whl
-  - git+https://github.com/rapidsai/dask-xgboost@dask-cudf
-  - git+https://github.com/rapidsai/dask-cudf@master
-  - git+https://github.com/rapidsai/dask-cuda@master
+- python=3.7
+- pytorch
+- cudatoolkit=10.0
+- dask-cuda=0.9.1
+- cudf=0.9.*
+- cuml=0.9.*
+- cugraph=0.9.*
+- rapidsai/label/xgboost::xgboost=0.90.rapidsdev1
+- rapidsai/label/xgboost::dask-xgboost=0.2.*
+- conda-forge::numpy=1.16.4
+- cython
+- dask
+- distributed=2.3.2
+- pynvml=8.0.2
+- gcsfs
+- requests
+- jupyterhub
+- jupyterlab
+- matplotlib
+- ipywidgets
+- ipyvolume
+- seaborn
+- scipy
+- pandas
+- boost
+- nodejs
+- pytest
+- pip
+- pip:
+  - git+https://github.com/cupy/cupy.git
+  - setuptools
+  - torch
+  - torchvision
+  - pytorch-ignite
+  - graphviz
+  - networkx
+  - dask-kubernetes
+  - dask_labextension
+  - jupyterlab-nvdashboard