Updated notebooks to use dataprep

This commit is contained in:
Jeff Shepherd
2019-06-26 14:23:20 -07:00
parent cd3c980a6e
commit 3d2552174d
4 changed files with 3033 additions and 3087 deletions

View File

@@ -77,6 +77,7 @@
"import pandas as pd\n", "import pandas as pd\n",
"import os\n", "import os\n",
"from sklearn import datasets\n", "from sklearn import datasets\n",
"import azureml.dataprep as dprep\n",
"from sklearn.model_selection import train_test_split\n", "from sklearn.model_selection import train_test_split\n",
"\n", "\n",
"import azureml.core\n", "import azureml.core\n",
@@ -220,30 +221,12 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"%%writefile $project_folder/get_data.py\n", "data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv\"\n",
"\n", "dflow = dprep.auto_read_file(data)\n",
"import pandas as pd\n", "dflow.get_profile()\n",
"from sklearn.model_selection import train_test_split\n", "X_train = dflow.drop_columns(columns=['y'])\n",
"\n", "y_train = dflow.keep_columns(columns=['y'], validate_column_exists=True)\n",
"def _read_x_y(file_name, label_col):\n", "dflow.head()"
" df = pd.read_csv(file_name)\n",
" y = None\n",
" if label_col in df.columns:\n",
" y = df.pop(label_col)\n",
" y = y.values[:, None]\n",
" X = df.values\n",
" return X, y\n",
" \n",
"def get_data():\n",
" # Load the bank marketing datasets.\n",
" from sklearn.datasets import load_diabetes\n",
" from sklearn.model_selection import train_test_split\n",
"\n",
" X_train, y_train = _read_x_y('https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv', \"y\")\n",
"\n",
" columns = ['age','job','marital','education','default','housing','loan','contact','month','day_of_week','duration','campaign','pdays','previous','poutcome','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed','y']\n",
"\n",
" return { \"X\" : X_train, \"y\" : y_train[:,0] }"
] ]
}, },
{ {
@@ -288,7 +271,8 @@
" debug_log = 'automl_errors.log',\n", " debug_log = 'automl_errors.log',\n",
" path = project_folder,\n", " path = project_folder,\n",
" run_configuration=conda_run_config,\n", " run_configuration=conda_run_config,\n",
" data_script = project_folder + \"/get_data.py\",\n", " X = X_train,\n",
" y = y_train,\n",
" **automl_settings\n", " **automl_settings\n",
" )" " )"
] ]
@@ -631,14 +615,10 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"def _read_x_y(file_name, label_col):\n", "# Load the bank marketing datasets.\n",
" df = pd.read_csv(file_name)\n", "from sklearn.datasets import load_diabetes\n",
" y = None\n", "from sklearn.model_selection import train_test_split\n",
" if label_col in df.columns:\n", "from numpy import array"
" y = df.pop(label_col)\n",
" y = y.values[:, None]\n",
" X = df.values\n",
" return X, y"
] ]
}, },
{ {
@@ -647,15 +627,22 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Load the bank marketing datasets.\n", "data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_validate.csv\"\n",
"from sklearn.datasets import load_diabetes\n", "dflow = dprep.auto_read_file(data)\n",
"from sklearn.model_selection import train_test_split\n", "dflow.get_profile()\n",
"from numpy import array\n", "X_test = dflow.drop_columns(columns=['y'])\n",
"\n", "y_test = dflow.keep_columns(columns=['y'], validate_column_exists=True)\n",
"\n", "dflow.head()"
"X_test, y_test = _read_x_y('https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_validate.csv',\"y\")\n", ]
"\n", },
"columns = ['age','job','marital','education','default','housing','loan','contact','month','day_of_week','duration','campaign','pdays','previous','poutcome','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed','y']" {
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X_test = X_test.to_pandas_dataframe()\n",
"y_test = y_test.to_pandas_dataframe()"
] ]
}, },
{ {
@@ -665,8 +652,9 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"y_pred = fitted_model.predict(X_test)\n", "y_pred = fitted_model.predict(X_test)\n",
"actual = array(y_test.tolist())\n", "actual = array(y_test)\n",
"print(y_pred.shape, \" \", actual[:,0].shape)" "actual = actual[:,0]\n",
"print(y_pred.shape, \" \", actual.shape)"
] ]
}, },
{ {
@@ -685,10 +673,9 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"y_test = y_test[:,0]# Plot outputs\n",
"%matplotlib notebook\n", "%matplotlib notebook\n",
"test_pred = plt.scatter(y_test, y_pred, color='b')\n", "test_pred = plt.scatter(actual, y_pred, color='b')\n",
"test_test = plt.scatter(y_test, y_test, color='g')\n", "test_test = plt.scatter(actual, actual, color='g')\n",
"plt.legend((test_pred, test_test), ('prediction', 'truth'), loc='upper left', fontsize=8)\n", "plt.legend((test_pred, test_test), ('prediction', 'truth'), loc='upper left', fontsize=8)\n",
"plt.show()" "plt.show()"
] ]

View File

@@ -75,6 +75,7 @@
"import pandas as pd\n", "import pandas as pd\n",
"import os\n", "import os\n",
"from sklearn.model_selection import train_test_split\n", "from sklearn.model_selection import train_test_split\n",
"import azureml.dataprep as dprep\n",
"\n", "\n",
"import azureml.core\n", "import azureml.core\n",
"from azureml.core.experiment import Experiment\n", "from azureml.core.experiment import Experiment\n",
@@ -217,19 +218,13 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"%%writefile $project_folder/get_data.py\n", "data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/creditcard.csv\"\n",
"\n", "dflow = dprep.auto_read_file(data)\n",
"import pandas as pd\n", "dflow.get_profile()\n",
"from sklearn.model_selection import train_test_split\n", "X = dflow.drop_columns(columns=['Class'])\n",
"\n", "y = dflow.keep_columns(columns=['Class'], validate_column_exists=True)\n",
" \n", "X_train, X_test = X.random_split(percentage=0.8, seed=223)\n",
"def get_data():\n", "y_train, y_test = y.random_split(percentage=0.8, seed=223)"
" cards = pd.read_csv(\"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/creditcard.csv\")\n",
" y = cards.Class\n",
" x = cards.drop('Class', axis=1)\n",
" X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=1)\n",
" \n",
" return { \"X\" : X_train, \"y\" : y_train.values}"
] ]
}, },
{ {
@@ -281,7 +276,8 @@
" debug_log = 'automl_errors_20190417.log',\n", " debug_log = 'automl_errors_20190417.log',\n",
" path = project_folder,\n", " path = project_folder,\n",
" run_configuration=conda_run_config,\n", " run_configuration=conda_run_config,\n",
" data_script = project_folder + \"/get_data.py\",\n", " X = X_train,\n",
" y = y_train,\n",
" **automl_settings\n", " **automl_settings\n",
" )" " )"
] ]
@@ -621,11 +617,9 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"cards = pd.read_csv(\"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/creditcard.csv\")\n", "#Randomly select and test\n",
"print(cards.head())\n", "X_test = X_test.to_pandas_dataframe()\n",
"y = cards.Class\n", "y_test = y_test.to_pandas_dataframe()\n"
"x = cards.drop('Class', axis=1)\n",
"X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=1)\n"
] ]
}, },
{ {
@@ -678,14 +672,14 @@
"This Credit Card fraud Detection dataset is made available under the Open Database License: http://opendatacommons.org/licenses/odbl/1.0/. Any rights in individual contents of the database are licensed under the Database Contents License: http://opendatacommons.org/licenses/dbcl/1.0/ and is available at: https://www.kaggle.com/mlg-ulb/creditcardfraud\n", "This Credit Card fraud Detection dataset is made available under the Open Database License: http://opendatacommons.org/licenses/odbl/1.0/. Any rights in individual contents of the database are licensed under the Database Contents License: http://opendatacommons.org/licenses/dbcl/1.0/ and is available at: https://www.kaggle.com/mlg-ulb/creditcardfraud\n",
"\n", "\n",
"\n", "\n",
"The dataset has been collected and analysed during a research collaboration of Worldline and the Machine Learning Group (http://mlg.ulb.ac.be) of ULB (Universit\u00c3\u00a9 Libre de Bruxelles) on big data mining and fraud detection. More details on current and past projects on related topics are available on https://www.researchgate.net/project/Fraud-detection-5 and the page of the DefeatFraud project\n", "The dataset has been collected and analysed during a research collaboration of Worldline and the Machine Learning Group (http://mlg.ulb.ac.be) of ULB (Université Libre de Bruxelles) on big data mining and fraud detection. More details on current and past projects on related topics are available on https://www.researchgate.net/project/Fraud-detection-5 and the page of the DefeatFraud project\n",
"Please cite the following works: \n", "Please cite the following works: \n",
"\u00e2\u20ac\u00a2\tAndrea Dal Pozzolo, Olivier Caelen, Reid A. Johnson and Gianluca Bontempi. Calibrating Probability with Undersampling for Unbalanced Classification. In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015\n", "\tAndrea Dal Pozzolo, Olivier Caelen, Reid A. Johnson and Gianluca Bontempi. Calibrating Probability with Undersampling for Unbalanced Classification. In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015\n",
"\u00e2\u20ac\u00a2\tDal Pozzolo, Andrea; Caelen, Olivier; Le Borgne, Yann-Ael; Waterschoot, Serge; Bontempi, Gianluca. Learned lessons in credit card fraud detection from a practitioner perspective, Expert systems with applications,41,10,4915-4928,2014, Pergamon\n", "\tDal Pozzolo, Andrea; Caelen, Olivier; Le Borgne, Yann-Ael; Waterschoot, Serge; Bontempi, Gianluca. Learned lessons in credit card fraud detection from a practitioner perspective, Expert systems with applications,41,10,4915-4928,2014, Pergamon\n",
"\u00e2\u20ac\u00a2\tDal Pozzolo, Andrea; Boracchi, Giacomo; Caelen, Olivier; Alippi, Cesare; Bontempi, Gianluca. Credit card fraud detection: a realistic modeling and a novel learning strategy, IEEE transactions on neural networks and learning systems,29,8,3784-3797,2018,IEEE\n", "\tDal Pozzolo, Andrea; Boracchi, Giacomo; Caelen, Olivier; Alippi, Cesare; Bontempi, Gianluca. Credit card fraud detection: a realistic modeling and a novel learning strategy, IEEE transactions on neural networks and learning systems,29,8,3784-3797,2018,IEEE\n",
"o\tDal Pozzolo, Andrea Adaptive Machine learning for credit card fraud detection ULB MLG PhD thesis (supervised by G. Bontempi)\n", "o\tDal Pozzolo, Andrea Adaptive Machine learning for credit card fraud detection ULB MLG PhD thesis (supervised by G. Bontempi)\n",
"\u00e2\u20ac\u00a2\tCarcillo, Fabrizio; Dal Pozzolo, Andrea; Le Borgne, Yann-A\u00c3\u00abl; Caelen, Olivier; Mazzer, Yannis; Bontempi, Gianluca. Scarff: a scalable framework for streaming credit card fraud detection with Spark, Information fusion,41, 182-194,2018,Elsevier\n", "\tCarcillo, Fabrizio; Dal Pozzolo, Andrea; Le Borgne, Yann-Aël; Caelen, Olivier; Mazzer, Yannis; Bontempi, Gianluca. Scarff: a scalable framework for streaming credit card fraud detection with Spark, Information fusion,41, 182-194,2018,Elsevier\n",
"\u00e2\u20ac\u00a2\tCarcillo, Fabrizio; Le Borgne, Yann-A\u00c3\u00abl; Caelen, Olivier; Bontempi, Gianluca. Streaming active learning strategies for real-life credit card fraud detection: assessment and visualization, International Journal of Data Science and Analytics, 5,4,285-300,2018,Springer International Publishing" "\tCarcillo, Fabrizio; Le Borgne, Yann-Aël; Caelen, Olivier; Bontempi, Gianluca. Streaming active learning strategies for real-life credit card fraud detection: assessment and visualization, International Journal of Data Science and Analytics, 5,4,285-300,2018,Springer International Publishing"
] ]
} }
], ],

View File

@@ -71,6 +71,7 @@
"import pandas as pd\n", "import pandas as pd\n",
"import os\n", "import os\n",
"from sklearn.model_selection import train_test_split\n", "from sklearn.model_selection import train_test_split\n",
"import azureml.dataprep as dprep\n",
" \n", " \n",
"\n", "\n",
"import azureml.core\n", "import azureml.core\n",
@@ -212,25 +213,14 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"%%writefile $project_folder/get_data.py\n", "data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/compresive_strength_concrete.csv\"\n",
"\n", "dflow = dprep.auto_read_file(data)\n",
"import pandas as pd\n", "dflow.get_profile()\n",
"from sklearn.model_selection import train_test_split\n", "X = dflow.drop_columns(columns=['CONCRETE'])\n",
"\n", "y = dflow.keep_columns(columns=['CONCRETE'], validate_column_exists=True)\n",
"def _read_x_y(file_name, label_col):\n", "X_train, X_test = X.random_split(percentage=0.8, seed=223)\n",
" df = pd.read_csv(file_name)\n", "y_train, y_test = y.random_split(percentage=0.8, seed=223) \n",
" y = None\n", "dflow.head()"
" if label_col in df.columns:\n",
" y = df.pop(label_col)\n",
" y = y.values[:, None]\n",
" X = df.values\n",
" return X, y\n",
" \n",
"def get_data():\n",
" X,y = _read_x_y(\"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/compresive_strength_concrete.csv\",\"CONCRETE\")\n",
" X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)\n",
" \n",
" return { \"X\" : X_train, \"y\" : y_train[:,0] }"
] ]
}, },
{ {
@@ -282,7 +272,8 @@
" debug_log = 'automl.log',\n", " debug_log = 'automl.log',\n",
" path = project_folder,\n", " path = project_folder,\n",
" run_configuration=conda_run_config,\n", " run_configuration=conda_run_config,\n",
" data_script = project_folder + \"/get_data.py\",\n", " X = X_train,\n",
" y = y_train,\n",
" **automl_settings\n", " **automl_settings\n",
" )" " )"
] ]
@@ -311,7 +302,7 @@
"source": [ "source": [
"## Results\n", "## Results\n",
"Widget for Monitoring Runs\n", "Widget for Monitoring Runs\n",
"The widget will first report a \u00e2\u20ac\u0153loading status while running the first iteration. After completing the first iteration, an auto-updating graph and table will be shown. The widget will refresh once per minute, so you should see the graph update as child runs complete.\n", "The widget will first report a loading status while running the first iteration. After completing the first iteration, an auto-updating graph and table will be shown. The widget will refresh once per minute, so you should see the graph update as child runs complete.\n",
"Note: The widget displays a link at the bottom. Use this link to open a web interface to explore the individual run details." "Note: The widget displays a link at the bottom. Use this link to open a web interface to explore the individual run details."
] ]
}, },
@@ -664,14 +655,14 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"def _read_x_y(file_name, label_col):\n", "X_test = X_test.to_pandas_dataframe()\n",
" df = pd.read_csv(file_name)\n", "y_test = y_test.to_pandas_dataframe()\n",
" y = None\n", "y_test = np.array(y_test)\n",
" if label_col in df.columns:\n", "y_test = y_test[:,0]\n",
" y = df.pop(label_col)\n", "X_train = X_train.to_pandas_dataframe()\n",
" y = y.values[:, None]\n", "y_train = y_train.to_pandas_dataframe()\n",
" X = df.values\n", "y_train = np.array(y_train)\n",
" return X, y" "y_train = y_train[:,0]"
] ]
}, },
{ {
@@ -687,9 +678,6 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"X,y = _read_x_y(\"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/compresive_strength_concrete.csv\",\"CONCRETE\")\n",
"X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)\n",
"\n",
"y_pred_train = fitted_model.predict(X_train)\n", "y_pred_train = fitted_model.predict(X_train)\n",
"y_residual_train = y_train - y_pred_train\n", "y_residual_train = y_train - y_pred_train\n",
"\n", "\n",

View File

@@ -71,6 +71,7 @@
"import pandas as pd\n", "import pandas as pd\n",
"import os\n", "import os\n",
"from sklearn.model_selection import train_test_split\n", "from sklearn.model_selection import train_test_split\n",
"import azureml.dataprep as dprep\n",
" \n", " \n",
"\n", "\n",
"import azureml.core\n", "import azureml.core\n",
@@ -212,25 +213,14 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"%%writefile $project_folder/get_data.py\n", "data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/machineData.csv\"\n",
"\n", "dflow = dprep.auto_read_file(data)\n",
"import pandas as pd\n", "dflow.get_profile()\n",
"from sklearn.model_selection import train_test_split\n", "X = dflow.drop_columns(columns=['ERP'])\n",
"\n", "y = dflow.keep_columns(columns=['ERP'], validate_column_exists=True)\n",
"def _read_x_y(file_name, label_col):\n", "X_train, X_test = X.random_split(percentage=0.8, seed=223)\n",
" df = pd.read_csv(file_name)\n", "y_train, y_test = y.random_split(percentage=0.8, seed=223) \n",
" y = None\n", "dflow.head()"
" if label_col in df.columns:\n",
" y = df.pop(label_col)\n",
" y = y.values[:, None]\n",
" X = df.values\n",
" return X, y\n",
" \n",
"def get_data():\n",
" X,y = _read_x_y(\"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/machineData.csv\",\"ERP\")\n",
" X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)\n",
" \n",
" return { \"X\" : X_train, \"y\" : y_train[:,0] }"
] ]
}, },
{ {
@@ -283,7 +273,8 @@
" debug_log = 'automl_errors_20190417.log',\n", " debug_log = 'automl_errors_20190417.log',\n",
" path = project_folder,\n", " path = project_folder,\n",
" run_configuration=conda_run_config,\n", " run_configuration=conda_run_config,\n",
" data_script = project_folder + \"/get_data.py\",\n", " X = X_train,\n",
" y = y_train,\n",
" **automl_settings\n", " **automl_settings\n",
" )" " )"
] ]
@@ -334,16 +325,6 @@
"RunDetails(remote_run).show() " "RunDetails(remote_run).show() "
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.train.automl.run import AutoMLRun\n",
"setup_run = AutoMLRun(experiment, remote_run.id + \"_setup\")"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
@@ -690,18 +671,14 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"def _read_x_y(file_name, label_col):\n", "X_test = X_test.to_pandas_dataframe()\n",
" df = pd.read_csv(file_name)\n", "y_test = y_test.to_pandas_dataframe()\n",
" y_split = None\n", "y_test = np.array(y_test)\n",
" if label_col in df.columns:\n", "y_test = y_test[:,0]\n",
" y_split = df.pop(label_col)\n", "X_train = X_train.to_pandas_dataframe()\n",
" y_split = y_split.values[:, None]\n", "y_train = y_train.to_pandas_dataframe()\n",
" X_split = df.values\n", "y_train = np.array(y_train)\n",
" return X_split, y_split\n", "y_train = y_train[:,0]"
" \n",
"\n",
"X,y = _read_x_y(\"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/machineData.csv\",\"ERP\")\n",
"X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)"
] ]
}, },
{ {