mirror of
https://github.com/Azure/MachineLearningNotebooks.git
synced 2025-12-20 01:27:06 -05:00
Updated notebooks to use dataprep
This commit is contained in:
@@ -77,6 +77,7 @@
|
|||||||
"import pandas as pd\n",
|
"import pandas as pd\n",
|
||||||
"import os\n",
|
"import os\n",
|
||||||
"from sklearn import datasets\n",
|
"from sklearn import datasets\n",
|
||||||
|
"import azureml.dataprep as dprep\n",
|
||||||
"from sklearn.model_selection import train_test_split\n",
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
"\n",
|
"\n",
|
||||||
"import azureml.core\n",
|
"import azureml.core\n",
|
||||||
@@ -220,30 +221,12 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"%%writefile $project_folder/get_data.py\n",
|
"data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv\"\n",
|
||||||
"\n",
|
"dflow = dprep.auto_read_file(data)\n",
|
||||||
"import pandas as pd\n",
|
"dflow.get_profile()\n",
|
||||||
"from sklearn.model_selection import train_test_split\n",
|
"X_train = dflow.drop_columns(columns=['y'])\n",
|
||||||
"\n",
|
"y_train = dflow.keep_columns(columns=['y'], validate_column_exists=True)\n",
|
||||||
"def _read_x_y(file_name, label_col):\n",
|
"dflow.head()"
|
||||||
" df = pd.read_csv(file_name)\n",
|
|
||||||
" y = None\n",
|
|
||||||
" if label_col in df.columns:\n",
|
|
||||||
" y = df.pop(label_col)\n",
|
|
||||||
" y = y.values[:, None]\n",
|
|
||||||
" X = df.values\n",
|
|
||||||
" return X, y\n",
|
|
||||||
" \n",
|
|
||||||
"def get_data():\n",
|
|
||||||
" # Load the bank marketing datasets.\n",
|
|
||||||
" from sklearn.datasets import load_diabetes\n",
|
|
||||||
" from sklearn.model_selection import train_test_split\n",
|
|
||||||
"\n",
|
|
||||||
" X_train, y_train = _read_x_y('https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv', \"y\")\n",
|
|
||||||
"\n",
|
|
||||||
" columns = ['age','job','marital','education','default','housing','loan','contact','month','day_of_week','duration','campaign','pdays','previous','poutcome','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed','y']\n",
|
|
||||||
"\n",
|
|
||||||
" return { \"X\" : X_train, \"y\" : y_train[:,0] }"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -288,7 +271,8 @@
|
|||||||
" debug_log = 'automl_errors.log',\n",
|
" debug_log = 'automl_errors.log',\n",
|
||||||
" path = project_folder,\n",
|
" path = project_folder,\n",
|
||||||
" run_configuration=conda_run_config,\n",
|
" run_configuration=conda_run_config,\n",
|
||||||
" data_script = project_folder + \"/get_data.py\",\n",
|
" X = X_train,\n",
|
||||||
|
" y = y_train,\n",
|
||||||
" **automl_settings\n",
|
" **automl_settings\n",
|
||||||
" )"
|
" )"
|
||||||
]
|
]
|
||||||
@@ -631,14 +615,10 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def _read_x_y(file_name, label_col):\n",
|
"# Load the bank marketing datasets.\n",
|
||||||
" df = pd.read_csv(file_name)\n",
|
"from sklearn.datasets import load_diabetes\n",
|
||||||
" y = None\n",
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
" if label_col in df.columns:\n",
|
"from numpy import array"
|
||||||
" y = df.pop(label_col)\n",
|
|
||||||
" y = y.values[:, None]\n",
|
|
||||||
" X = df.values\n",
|
|
||||||
" return X, y"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -647,15 +627,22 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Load the bank marketing datasets.\n",
|
"data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_validate.csv\"\n",
|
||||||
"from sklearn.datasets import load_diabetes\n",
|
"dflow = dprep.auto_read_file(data)\n",
|
||||||
"from sklearn.model_selection import train_test_split\n",
|
"dflow.get_profile()\n",
|
||||||
"from numpy import array\n",
|
"X_test = dflow.drop_columns(columns=['y'])\n",
|
||||||
"\n",
|
"y_test = dflow.keep_columns(columns=['y'], validate_column_exists=True)\n",
|
||||||
"\n",
|
"dflow.head()"
|
||||||
"X_test, y_test = _read_x_y('https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_validate.csv',\"y\")\n",
|
]
|
||||||
"\n",
|
},
|
||||||
"columns = ['age','job','marital','education','default','housing','loan','contact','month','day_of_week','duration','campaign','pdays','previous','poutcome','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed','y']"
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"X_test = X_test.to_pandas_dataframe()\n",
|
||||||
|
"y_test = y_test.to_pandas_dataframe()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -665,8 +652,9 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"y_pred = fitted_model.predict(X_test)\n",
|
"y_pred = fitted_model.predict(X_test)\n",
|
||||||
"actual = array(y_test.tolist())\n",
|
"actual = array(y_test)\n",
|
||||||
"print(y_pred.shape, \" \", actual[:,0].shape)"
|
"actual = actual[:,0]\n",
|
||||||
|
"print(y_pred.shape, \" \", actual.shape)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -685,10 +673,9 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"y_test = y_test[:,0]# Plot outputs\n",
|
|
||||||
"%matplotlib notebook\n",
|
"%matplotlib notebook\n",
|
||||||
"test_pred = plt.scatter(y_test, y_pred, color='b')\n",
|
"test_pred = plt.scatter(actual, y_pred, color='b')\n",
|
||||||
"test_test = plt.scatter(y_test, y_test, color='g')\n",
|
"test_test = plt.scatter(actual, actual, color='g')\n",
|
||||||
"plt.legend((test_pred, test_test), ('prediction', 'truth'), loc='upper left', fontsize=8)\n",
|
"plt.legend((test_pred, test_test), ('prediction', 'truth'), loc='upper left', fontsize=8)\n",
|
||||||
"plt.show()"
|
"plt.show()"
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -75,6 +75,7 @@
|
|||||||
"import pandas as pd\n",
|
"import pandas as pd\n",
|
||||||
"import os\n",
|
"import os\n",
|
||||||
"from sklearn.model_selection import train_test_split\n",
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
|
"import azureml.dataprep as dprep\n",
|
||||||
"\n",
|
"\n",
|
||||||
"import azureml.core\n",
|
"import azureml.core\n",
|
||||||
"from azureml.core.experiment import Experiment\n",
|
"from azureml.core.experiment import Experiment\n",
|
||||||
@@ -217,19 +218,13 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"%%writefile $project_folder/get_data.py\n",
|
"data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/creditcard.csv\"\n",
|
||||||
"\n",
|
"dflow = dprep.auto_read_file(data)\n",
|
||||||
"import pandas as pd\n",
|
"dflow.get_profile()\n",
|
||||||
"from sklearn.model_selection import train_test_split\n",
|
"X = dflow.drop_columns(columns=['Class'])\n",
|
||||||
"\n",
|
"y = dflow.keep_columns(columns=['Class'], validate_column_exists=True)\n",
|
||||||
" \n",
|
"X_train, X_test = X.random_split(percentage=0.8, seed=223)\n",
|
||||||
"def get_data():\n",
|
"y_train, y_test = y.random_split(percentage=0.8, seed=223)"
|
||||||
" cards = pd.read_csv(\"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/creditcard.csv\")\n",
|
|
||||||
" y = cards.Class\n",
|
|
||||||
" x = cards.drop('Class', axis=1)\n",
|
|
||||||
" X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=1)\n",
|
|
||||||
" \n",
|
|
||||||
" return { \"X\" : X_train, \"y\" : y_train.values}"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -281,7 +276,8 @@
|
|||||||
" debug_log = 'automl_errors_20190417.log',\n",
|
" debug_log = 'automl_errors_20190417.log',\n",
|
||||||
" path = project_folder,\n",
|
" path = project_folder,\n",
|
||||||
" run_configuration=conda_run_config,\n",
|
" run_configuration=conda_run_config,\n",
|
||||||
" data_script = project_folder + \"/get_data.py\",\n",
|
" X = X_train,\n",
|
||||||
|
" y = y_train,\n",
|
||||||
" **automl_settings\n",
|
" **automl_settings\n",
|
||||||
" )"
|
" )"
|
||||||
]
|
]
|
||||||
@@ -621,11 +617,9 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"cards = pd.read_csv(\"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/creditcard.csv\")\n",
|
"#Randomly select and test\n",
|
||||||
"print(cards.head())\n",
|
"X_test = X_test.to_pandas_dataframe()\n",
|
||||||
"y = cards.Class\n",
|
"y_test = y_test.to_pandas_dataframe()\n"
|
||||||
"x = cards.drop('Class', axis=1)\n",
|
|
||||||
"X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=1)\n"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -678,14 +672,14 @@
|
|||||||
"This Credit Card fraud Detection dataset is made available under the Open Database License: http://opendatacommons.org/licenses/odbl/1.0/. Any rights in individual contents of the database are licensed under the Database Contents License: http://opendatacommons.org/licenses/dbcl/1.0/ and is available at: https://www.kaggle.com/mlg-ulb/creditcardfraud\n",
|
"This Credit Card fraud Detection dataset is made available under the Open Database License: http://opendatacommons.org/licenses/odbl/1.0/. Any rights in individual contents of the database are licensed under the Database Contents License: http://opendatacommons.org/licenses/dbcl/1.0/ and is available at: https://www.kaggle.com/mlg-ulb/creditcardfraud\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"The dataset has been collected and analysed during a research collaboration of Worldline and the Machine Learning Group (http://mlg.ulb.ac.be) of ULB (Universit\u00c3\u00a9 Libre de Bruxelles) on big data mining and fraud detection. More details on current and past projects on related topics are available on https://www.researchgate.net/project/Fraud-detection-5 and the page of the DefeatFraud project\n",
|
"The dataset has been collected and analysed during a research collaboration of Worldline and the Machine Learning Group (http://mlg.ulb.ac.be) of ULB (Université Libre de Bruxelles) on big data mining and fraud detection. More details on current and past projects on related topics are available on https://www.researchgate.net/project/Fraud-detection-5 and the page of the DefeatFraud project.\n",
|
||||||
"Please cite the following works: \n",
|
"Please cite the following works: \n",
|
||||||
"\u00e2\u20ac\u00a2\tAndrea Dal Pozzolo, Olivier Caelen, Reid A. Johnson and Gianluca Bontempi. Calibrating Probability with Undersampling for Unbalanced Classification. In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015\n",
|
"•\tAndrea Dal Pozzolo, Olivier Caelen, Reid A. Johnson and Gianluca Bontempi. Calibrating Probability with Undersampling for Unbalanced Classification. In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015\n",
|
||||||
"\u00e2\u20ac\u00a2\tDal Pozzolo, Andrea; Caelen, Olivier; Le Borgne, Yann-Ael; Waterschoot, Serge; Bontempi, Gianluca. Learned lessons in credit card fraud detection from a practitioner perspective, Expert systems with applications,41,10,4915-4928,2014, Pergamon\n",
|
"•\tDal Pozzolo, Andrea; Caelen, Olivier; Le Borgne, Yann-Ael; Waterschoot, Serge; Bontempi, Gianluca. Learned lessons in credit card fraud detection from a practitioner perspective, Expert systems with applications,41,10,4915-4928,2014, Pergamon\n",
|
||||||
"\u00e2\u20ac\u00a2\tDal Pozzolo, Andrea; Boracchi, Giacomo; Caelen, Olivier; Alippi, Cesare; Bontempi, Gianluca. Credit card fraud detection: a realistic modeling and a novel learning strategy, IEEE transactions on neural networks and learning systems,29,8,3784-3797,2018,IEEE\n",
|
"•\tDal Pozzolo, Andrea; Boracchi, Giacomo; Caelen, Olivier; Alippi, Cesare; Bontempi, Gianluca. Credit card fraud detection: a realistic modeling and a novel learning strategy, IEEE transactions on neural networks and learning systems,29,8,3784-3797,2018,IEEE\n",
|
||||||
"o\tDal Pozzolo, Andrea Adaptive Machine learning for credit card fraud detection ULB MLG PhD thesis (supervised by G. Bontempi)\n",
|
"o\tDal Pozzolo, Andrea Adaptive Machine learning for credit card fraud detection ULB MLG PhD thesis (supervised by G. Bontempi)\n",
|
||||||
"\u00e2\u20ac\u00a2\tCarcillo, Fabrizio; Dal Pozzolo, Andrea; Le Borgne, Yann-A\u00c3\u00abl; Caelen, Olivier; Mazzer, Yannis; Bontempi, Gianluca. Scarff: a scalable framework for streaming credit card fraud detection with Spark, Information fusion,41, 182-194,2018,Elsevier\n",
|
"•\tCarcillo, Fabrizio; Dal Pozzolo, Andrea; Le Borgne, Yann-Aël; Caelen, Olivier; Mazzer, Yannis; Bontempi, Gianluca. Scarff: a scalable framework for streaming credit card fraud detection with Spark, Information fusion,41, 182-194,2018,Elsevier\n",
|
||||||
"\u00e2\u20ac\u00a2\tCarcillo, Fabrizio; Le Borgne, Yann-A\u00c3\u00abl; Caelen, Olivier; Bontempi, Gianluca. Streaming active learning strategies for real-life credit card fraud detection: assessment and visualization, International Journal of Data Science and Analytics, 5,4,285-300,2018,Springer International Publishing"
|
"•\tCarcillo, Fabrizio; Le Borgne, Yann-Aël; Caelen, Olivier; Bontempi, Gianluca. Streaming active learning strategies for real-life credit card fraud detection: assessment and visualization, International Journal of Data Science and Analytics, 5,4,285-300,2018,Springer International Publishing"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|||||||
@@ -71,6 +71,7 @@
|
|||||||
"import pandas as pd\n",
|
"import pandas as pd\n",
|
||||||
"import os\n",
|
"import os\n",
|
||||||
"from sklearn.model_selection import train_test_split\n",
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
|
"import azureml.dataprep as dprep\n",
|
||||||
" \n",
|
" \n",
|
||||||
"\n",
|
"\n",
|
||||||
"import azureml.core\n",
|
"import azureml.core\n",
|
||||||
@@ -212,25 +213,14 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"%%writefile $project_folder/get_data.py\n",
|
"data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/compresive_strength_concrete.csv\"\n",
|
||||||
"\n",
|
"dflow = dprep.auto_read_file(data)\n",
|
||||||
"import pandas as pd\n",
|
"dflow.get_profile()\n",
|
||||||
"from sklearn.model_selection import train_test_split\n",
|
"X = dflow.drop_columns(columns=['CONCRETE'])\n",
|
||||||
"\n",
|
"y = dflow.keep_columns(columns=['CONCRETE'], validate_column_exists=True)\n",
|
||||||
"def _read_x_y(file_name, label_col):\n",
|
"X_train, X_test = X.random_split(percentage=0.8, seed=223)\n",
|
||||||
" df = pd.read_csv(file_name)\n",
|
"y_train, y_test = y.random_split(percentage=0.8, seed=223) \n",
|
||||||
" y = None\n",
|
"dflow.head()"
|
||||||
" if label_col in df.columns:\n",
|
|
||||||
" y = df.pop(label_col)\n",
|
|
||||||
" y = y.values[:, None]\n",
|
|
||||||
" X = df.values\n",
|
|
||||||
" return X, y\n",
|
|
||||||
" \n",
|
|
||||||
"def get_data():\n",
|
|
||||||
" X,y = _read_x_y(\"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/compresive_strength_concrete.csv\",\"CONCRETE\")\n",
|
|
||||||
" X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)\n",
|
|
||||||
" \n",
|
|
||||||
" return { \"X\" : X_train, \"y\" : y_train[:,0] }"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -282,7 +272,8 @@
|
|||||||
" debug_log = 'automl.log',\n",
|
" debug_log = 'automl.log',\n",
|
||||||
" path = project_folder,\n",
|
" path = project_folder,\n",
|
||||||
" run_configuration=conda_run_config,\n",
|
" run_configuration=conda_run_config,\n",
|
||||||
" data_script = project_folder + \"/get_data.py\",\n",
|
" X = X_train,\n",
|
||||||
|
" y = y_train,\n",
|
||||||
" **automl_settings\n",
|
" **automl_settings\n",
|
||||||
" )"
|
" )"
|
||||||
]
|
]
|
||||||
@@ -311,7 +302,7 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"## Results\n",
|
"## Results\n",
|
||||||
"Widget for Monitoring Runs\n",
|
"Widget for Monitoring Runs\n",
|
||||||
"The widget will first report a \u00e2\u20ac\u0153loading status while running the first iteration. After completing the first iteration, an auto-updating graph and table will be shown. The widget will refresh once per minute, so you should see the graph update as child runs complete.\n",
|
"The widget will first report a “loading” status while running the first iteration. After completing the first iteration, an auto-updating graph and table will be shown. The widget will refresh once per minute, so you should see the graph update as child runs complete.\n",
|
||||||
"Note: The widget displays a link at the bottom. Use this link to open a web interface to explore the individual run details."
|
"Note: The widget displays a link at the bottom. Use this link to open a web interface to explore the individual run details."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -664,14 +655,14 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def _read_x_y(file_name, label_col):\n",
|
"X_test = X_test.to_pandas_dataframe()\n",
|
||||||
" df = pd.read_csv(file_name)\n",
|
"y_test = y_test.to_pandas_dataframe()\n",
|
||||||
" y = None\n",
|
"y_test = np.array(y_test)\n",
|
||||||
" if label_col in df.columns:\n",
|
"y_test = y_test[:,0]\n",
|
||||||
" y = df.pop(label_col)\n",
|
"X_train = X_train.to_pandas_dataframe()\n",
|
||||||
" y = y.values[:, None]\n",
|
"y_train = y_train.to_pandas_dataframe()\n",
|
||||||
" X = df.values\n",
|
"y_train = np.array(y_train)\n",
|
||||||
" return X, y"
|
"y_train = y_train[:,0]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -687,9 +678,6 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"X,y = _read_x_y(\"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/compresive_strength_concrete.csv\",\"CONCRETE\")\n",
|
|
||||||
"X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)\n",
|
|
||||||
"\n",
|
|
||||||
"y_pred_train = fitted_model.predict(X_train)\n",
|
"y_pred_train = fitted_model.predict(X_train)\n",
|
||||||
"y_residual_train = y_train - y_pred_train\n",
|
"y_residual_train = y_train - y_pred_train\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
|||||||
@@ -71,6 +71,7 @@
|
|||||||
"import pandas as pd\n",
|
"import pandas as pd\n",
|
||||||
"import os\n",
|
"import os\n",
|
||||||
"from sklearn.model_selection import train_test_split\n",
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
|
"import azureml.dataprep as dprep\n",
|
||||||
" \n",
|
" \n",
|
||||||
"\n",
|
"\n",
|
||||||
"import azureml.core\n",
|
"import azureml.core\n",
|
||||||
@@ -212,25 +213,14 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"%%writefile $project_folder/get_data.py\n",
|
"data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/machineData.csv\"\n",
|
||||||
"\n",
|
"dflow = dprep.auto_read_file(data)\n",
|
||||||
"import pandas as pd\n",
|
"dflow.get_profile()\n",
|
||||||
"from sklearn.model_selection import train_test_split\n",
|
"X = dflow.drop_columns(columns=['ERP'])\n",
|
||||||
"\n",
|
"y = dflow.keep_columns(columns=['ERP'], validate_column_exists=True)\n",
|
||||||
"def _read_x_y(file_name, label_col):\n",
|
"X_train, X_test = X.random_split(percentage=0.8, seed=223)\n",
|
||||||
" df = pd.read_csv(file_name)\n",
|
"y_train, y_test = y.random_split(percentage=0.8, seed=223) \n",
|
||||||
" y = None\n",
|
"dflow.head()"
|
||||||
" if label_col in df.columns:\n",
|
|
||||||
" y = df.pop(label_col)\n",
|
|
||||||
" y = y.values[:, None]\n",
|
|
||||||
" X = df.values\n",
|
|
||||||
" return X, y\n",
|
|
||||||
" \n",
|
|
||||||
"def get_data():\n",
|
|
||||||
" X,y = _read_x_y(\"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/machineData.csv\",\"ERP\")\n",
|
|
||||||
" X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)\n",
|
|
||||||
" \n",
|
|
||||||
" return { \"X\" : X_train, \"y\" : y_train[:,0] }"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -283,7 +273,8 @@
|
|||||||
" debug_log = 'automl_errors_20190417.log',\n",
|
" debug_log = 'automl_errors_20190417.log',\n",
|
||||||
" path = project_folder,\n",
|
" path = project_folder,\n",
|
||||||
" run_configuration=conda_run_config,\n",
|
" run_configuration=conda_run_config,\n",
|
||||||
" data_script = project_folder + \"/get_data.py\",\n",
|
" X = X_train,\n",
|
||||||
|
" y = y_train,\n",
|
||||||
" **automl_settings\n",
|
" **automl_settings\n",
|
||||||
" )"
|
" )"
|
||||||
]
|
]
|
||||||
@@ -334,16 +325,6 @@
|
|||||||
"RunDetails(remote_run).show() "
|
"RunDetails(remote_run).show() "
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from azureml.train.automl.run import AutoMLRun\n",
|
|
||||||
"setup_run = AutoMLRun(experiment, remote_run.id + \"_setup\")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
@@ -690,18 +671,14 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def _read_x_y(file_name, label_col):\n",
|
"X_test = X_test.to_pandas_dataframe()\n",
|
||||||
" df = pd.read_csv(file_name)\n",
|
"y_test = y_test.to_pandas_dataframe()\n",
|
||||||
" y_split = None\n",
|
"y_test = np.array(y_test)\n",
|
||||||
" if label_col in df.columns:\n",
|
"y_test = y_test[:,0]\n",
|
||||||
" y_split = df.pop(label_col)\n",
|
"X_train = X_train.to_pandas_dataframe()\n",
|
||||||
" y_split = y_split.values[:, None]\n",
|
"y_train = y_train.to_pandas_dataframe()\n",
|
||||||
" X_split = df.values\n",
|
"y_train = np.array(y_train)\n",
|
||||||
" return X_split, y_split\n",
|
"y_train = y_train[:,0]"
|
||||||
" \n",
|
|
||||||
"\n",
|
|
||||||
"X,y = _read_x_y(\"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/machineData.csv\",\"ERP\")\n",
|
|
||||||
"X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|||||||
Reference in New Issue
Block a user