Updated notebooks to use dataprep

2025-12-20 09:37:04 -05:00 · 2019-06-26 14:23:20 -07:00
parent cd3c980a6e
commit 3d2552174d
4 changed files with 3033 additions and 3087 deletions
--- a/how-to-use-azureml/automated-machine-learning/classification-bank-marketing/auto-ml-classification-bank-marketing.ipynb
+++ b/how-to-use-azureml/automated-machine-learning/classification-bank-marketing/auto-ml-classification-bank-marketing.ipynb
@@ -77,6 +77,7 @@
    "import pandas as pd\n",
    "import os\n",
    "from sklearn import datasets\n",
    "import azureml.dataprep as dprep\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "import azureml.core\n",
@@ -220,30 +221,12 @@
   "metadata": {},
   "outputs": [],
   "source": [
-        "%%writefile $project_folder/get_data.py\n",
+    "data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv\"\n",
-        "\n",
+    "dflow = dprep.auto_read_file(data)\n",
-        "import pandas as pd\n",
+    "dflow.get_profile()\n",
-        "from sklearn.model_selection import train_test_split\n",
+    "X_train = dflow.drop_columns(columns=['y'])\n",
-        "\n",
+    "y_train = dflow.keep_columns(columns=['y'], validate_column_exists=True)\n",
-        "def _read_x_y(file_name, label_col):\n",
+    "dflow.head()"
        "        df = pd.read_csv(file_name)\n",
        "        y = None\n",
        "        if label_col in df.columns:\n",
        "            y = df.pop(label_col)\n",
        "            y = y.values[:, None]\n",
        "        X = df.values\n",
        "        return X, y\n",
        "    \n",
        "def get_data():\n",
        "    # Load the bank marketing datasets.\n",
        "    from sklearn.datasets import load_diabetes\n",
        "    from sklearn.model_selection import train_test_split\n",
        "\n",
        "    X_train, y_train =  _read_x_y('https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv', \"y\")\n",
        "\n",
        "    columns = ['age','job','marital','education','default','housing','loan','contact','month','day_of_week','duration','campaign','pdays','previous','poutcome','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed','y']\n",
        "\n",
        "    return { \"X\" : X_train, \"y\" : y_train[:,0] }"
   ]
  },
  {
@@ -288,7 +271,8 @@
    "                             debug_log = 'automl_errors.log',\n",
    "                             path = project_folder,\n",
    "                             run_configuration=conda_run_config,\n",
-        "                             data_script = project_folder + \"/get_data.py\",\n",
+    "                             X = X_train,\n",
    "                             y = y_train,\n",
    "                             **automl_settings\n",
    "                            )"
   ]
@@ -631,14 +615,10 @@
   "metadata": {},
   "outputs": [],
   "source": [
-        "def _read_x_y(file_name, label_col):\n",
+    "# Load the bank marketing datasets.\n",
-        "        df = pd.read_csv(file_name)\n",
+    "from sklearn.datasets import load_diabetes\n",
-        "        y = None\n",
+    "from sklearn.model_selection import train_test_split\n",
-        "        if label_col in df.columns:\n",
+    "from numpy import array"
        "            y = df.pop(label_col)\n",
        "            y = y.values[:, None]\n",
        "        X = df.values\n",
        "        return X, y"
   ]
  },
  {
@@ -647,15 +627,22 @@
   "metadata": {},
   "outputs": [],
   "source": [
-        "# Load the bank marketing datasets.\n",
+    "data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_validate.csv\"\n",
-        "from sklearn.datasets import load_diabetes\n",
+    "dflow = dprep.auto_read_file(data)\n",
-        "from sklearn.model_selection import train_test_split\n",
+    "dflow.get_profile()\n",
-        "from numpy import array\n",
+    "X_test = dflow.drop_columns(columns=['y'])\n",
-        "\n",
+    "y_test = dflow.keep_columns(columns=['y'], validate_column_exists=True)\n",
-        "\n",
+    "dflow.head()"
-        "X_test, y_test =  _read_x_y('https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_validate.csv',\"y\")\n",
+   ]
-        "\n",
+  },
-        "columns = ['age','job','marital','education','default','housing','loan','contact','month','day_of_week','duration','campaign','pdays','previous','poutcome','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed','y']"
+  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_test = X_test.to_pandas_dataframe()\n",
    "y_test = y_test.to_pandas_dataframe()"
   ]
  },
  {
@@ -665,8 +652,9 @@
   "outputs": [],
   "source": [
    "y_pred  = fitted_model.predict(X_test)\n",
-        "actual = array(y_test.tolist())\n",
+    "actual = array(y_test)\n",
-        "print(y_pred.shape, \" \", actual[:,0].shape)"
+    "actual = actual[:,0]\n",
    "print(y_pred.shape, \" \", actual.shape)"
   ]
  },
  {
@@ -685,10 +673,9 @@
   "metadata": {},
   "outputs": [],
   "source": [
        "y_test = y_test[:,0]# Plot outputs\n",
    "%matplotlib notebook\n",
-        "test_pred = plt.scatter(y_test, y_pred, color='b')\n",
+    "test_pred = plt.scatter(actual, y_pred, color='b')\n",
-        "test_test = plt.scatter(y_test, y_test, color='g')\n",
+    "test_test = plt.scatter(actual, actual, color='g')\n",
    "plt.legend((test_pred, test_test), ('prediction', 'truth'), loc='upper left', fontsize=8)\n",
    "plt.show()"
   ]
--- a/how-to-use-azureml/automated-machine-learning/classification-credit-card-fraud/auto-ml-classification-credit-card-fraud.ipynb
+++ b/how-to-use-azureml/automated-machine-learning/classification-credit-card-fraud/auto-ml-classification-credit-card-fraud.ipynb
@@ -75,6 +75,7 @@
    "import pandas as pd\n",
    "import os\n",
    "from sklearn.model_selection import train_test_split\n",
    "import azureml.dataprep as dprep\n",
    "\n",
    "import azureml.core\n",
    "from azureml.core.experiment import Experiment\n",
@@ -217,19 +218,13 @@
   "metadata": {},
   "outputs": [],
   "source": [
-        "%%writefile $project_folder/get_data.py\n",
+    "data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/creditcard.csv\"\n",
-        "\n",
+    "dflow = dprep.auto_read_file(data)\n",
-        "import pandas as pd\n",
+    "dflow.get_profile()\n",
-        "from sklearn.model_selection import train_test_split\n",
+    "X = dflow.drop_columns(columns=['Class'])\n",
-        "\n",
+    "y = dflow.keep_columns(columns=['Class'], validate_column_exists=True)\n",
-        "    \n",
+    "X_train, X_test = X.random_split(percentage=0.8, seed=223)\n",
-        "def get_data():\n",
+    "y_train, y_test = y.random_split(percentage=0.8, seed=223)"
        "    cards = pd.read_csv(\"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/creditcard.csv\")\n",
        "    y = cards.Class\n",
        "    x = cards.drop('Class', axis=1)\n",
        "    X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=1)\n",
        "    \n",
        "    return { \"X\" : X_train, \"y\" : y_train.values}"
   ]
  },
  {
@@ -281,7 +276,8 @@
    "                             debug_log = 'automl_errors_20190417.log',\n",
    "                             path = project_folder,\n",
    "                             run_configuration=conda_run_config,\n",
-        "                             data_script = project_folder + \"/get_data.py\",\n",
+    "                             X = X_train,\n",
    "                             y = y_train,\n",
    "                             **automl_settings\n",
    "                            )"
   ]
@@ -621,11 +617,9 @@
   "metadata": {},
   "outputs": [],
   "source": [
-        "cards = pd.read_csv(\"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/creditcard.csv\")\n",
+    "#Randomly select and test\n",
-        "print(cards.head())\n",
+    "X_test = X_test.to_pandas_dataframe()\n",
-        "y = cards.Class\n",
+    "y_test = y_test.to_pandas_dataframe()\n"
        "x = cards.drop('Class', axis=1)\n",
        "X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=1)\n"
   ]
  },
  {
@@ -678,14 +672,14 @@
    "This Credit Card fraud Detection dataset is made available under the Open Database License: http://opendatacommons.org/licenses/odbl/1.0/. Any rights in individual contents of the database are licensed under the Database Contents License: http://opendatacommons.org/licenses/dbcl/1.0/ and is available at: https://www.kaggle.com/mlg-ulb/creditcardfraud\n",
    "\n",
    "\n",
-        "The dataset has been collected and analysed during a research collaboration of Worldline and the Machine Learning Group (http://mlg.ulb.ac.be) of ULB (Universit\u00c3\u00a9 Libre de Bruxelles) on big data mining and fraud detection. More details on current and past projects on related topics are available on https://www.researchgate.net/project/Fraud-detection-5 and the page of the DefeatFraud project\n",
+    "The dataset has been collected and analysed during a research collaboration of Worldline and the Machine Learning Group (http://mlg.ulb.ac.be) of ULB (Université Libre de Bruxelles) on big data mining and fraud detection. More details on current and past projects on related topics are available on https://www.researchgate.net/project/Fraud-detection-5 and the page of the DefeatFraud project.\n",
    "Please cite the following works: \n",
-        "\u00e2\u20ac\u00a2\tAndrea Dal Pozzolo, Olivier Caelen, Reid A. Johnson and Gianluca Bontempi. Calibrating Probability with Undersampling for Unbalanced Classification. In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015\n",
+    "•\tAndrea Dal Pozzolo, Olivier Caelen, Reid A. Johnson and Gianluca Bontempi. Calibrating Probability with Undersampling for Unbalanced Classification. In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015\n",
-        "\u00e2\u20ac\u00a2\tDal Pozzolo, Andrea; Caelen, Olivier; Le Borgne, Yann-Ael; Waterschoot, Serge; Bontempi, Gianluca. Learned lessons in credit card fraud detection from a practitioner perspective, Expert systems with applications,41,10,4915-4928,2014, Pergamon\n",
+    "•\tDal Pozzolo, Andrea; Caelen, Olivier; Le Borgne, Yann-Ael; Waterschoot, Serge; Bontempi, Gianluca. Learned lessons in credit card fraud detection from a practitioner perspective, Expert systems with applications,41,10,4915-4928,2014, Pergamon\n",
-        "\u00e2\u20ac\u00a2\tDal Pozzolo, Andrea; Boracchi, Giacomo; Caelen, Olivier; Alippi, Cesare; Bontempi, Gianluca. Credit card fraud detection: a realistic modeling and a novel learning strategy, IEEE transactions on neural networks and learning systems,29,8,3784-3797,2018,IEEE\n",
+    "•\tDal Pozzolo, Andrea; Boracchi, Giacomo; Caelen, Olivier; Alippi, Cesare; Bontempi, Gianluca. Credit card fraud detection: a realistic modeling and a novel learning strategy, IEEE transactions on neural networks and learning systems,29,8,3784-3797,2018,IEEE\n",
    "o\tDal Pozzolo, Andrea Adaptive Machine learning for credit card fraud detection ULB MLG PhD thesis (supervised by G. Bontempi)\n",
-        "\u00e2\u20ac\u00a2\tCarcillo, Fabrizio; Dal Pozzolo, Andrea; Le Borgne, Yann-A\u00c3\u00abl; Caelen, Olivier; Mazzer, Yannis; Bontempi, Gianluca. Scarff: a scalable framework for streaming credit card fraud detection with Spark, Information fusion,41, 182-194,2018,Elsevier\n",
+    "•\tCarcillo, Fabrizio; Dal Pozzolo, Andrea; Le Borgne, Yann-Aël; Caelen, Olivier; Mazzer, Yannis; Bontempi, Gianluca. Scarff: a scalable framework for streaming credit card fraud detection with Spark, Information fusion,41, 182-194,2018,Elsevier\n",
-        "\u00e2\u20ac\u00a2\tCarcillo, Fabrizio; Le Borgne, Yann-A\u00c3\u00abl; Caelen, Olivier; Bontempi, Gianluca. Streaming active learning strategies for real-life credit card fraud detection: assessment and visualization, International Journal of Data Science and Analytics, 5,4,285-300,2018,Springer International Publishing"
+    "•\tCarcillo, Fabrizio; Le Borgne, Yann-Aël; Caelen, Olivier; Bontempi, Gianluca. Streaming active learning strategies for real-life credit card fraud detection: assessment and visualization, International Journal of Data Science and Analytics, 5,4,285-300,2018,Springer International Publishing"
   ]
  }
 ],
--- a/how-to-use-azureml/automated-machine-learning/regression-concrete-strength/auto-ml-regression-concrete-strength.ipynb
+++ b/how-to-use-azureml/automated-machine-learning/regression-concrete-strength/auto-ml-regression-concrete-strength.ipynb
@@ -71,6 +71,7 @@
    "import pandas as pd\n",
    "import os\n",
    "from sklearn.model_selection import train_test_split\n",
    "import azureml.dataprep as dprep\n",
    " \n",
    "\n",
    "import azureml.core\n",
@@ -212,25 +213,14 @@
   "metadata": {},
   "outputs": [],
   "source": [
-        "%%writefile $project_folder/get_data.py\n",
+    "data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/compresive_strength_concrete.csv\"\n",
-        "\n",
+    "dflow = dprep.auto_read_file(data)\n",
-        "import pandas as pd\n",
+    "dflow.get_profile()\n",
-        "from sklearn.model_selection import train_test_split\n",
+    "X = dflow.drop_columns(columns=['CONCRETE'])\n",
-        "\n",
+    "y = dflow.keep_columns(columns=['CONCRETE'], validate_column_exists=True)\n",
-        "def _read_x_y(file_name, label_col):\n",
+    "X_train, X_test = X.random_split(percentage=0.8, seed=223)\n",
-        "        df = pd.read_csv(file_name)\n",
+    "y_train, y_test = y.random_split(percentage=0.8, seed=223) \n",
-        "        y = None\n",
+    "dflow.head()"
        "        if label_col in df.columns:\n",
        "            y = df.pop(label_col)\n",
        "            y = y.values[:, None]\n",
        "        X = df.values\n",
        "        return X, y\n",
        "    \n",
        "def get_data():\n",
        "    X,y = _read_x_y(\"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/compresive_strength_concrete.csv\",\"CONCRETE\")\n",
        "    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)\n",
        "    \n",
        "    return { \"X\" : X_train, \"y\" : y_train[:,0] }"
   ]
  },
  {
@@ -282,7 +272,8 @@
    "                             debug_log = 'automl.log',\n",
    "                             path = project_folder,\n",
    "                             run_configuration=conda_run_config,\n",
-        "                             data_script = project_folder + \"/get_data.py\",\n",
+    "                             X = X_train,\n",
    "                             y = y_train,\n",
    "                             **automl_settings\n",
    "                            )"
   ]
@@ -311,7 +302,7 @@
   "source": [
    "## Results\n",
    "Widget for Monitoring Runs\n",
-        "The widget will first report a \u00e2\u20ac\u0153loading status while running the first iteration. After completing the first iteration, an auto-updating graph and table will be shown. The widget will refresh once per minute, so you should see the graph update as child runs complete.\n",
+    "The widget will first report a “loading” status while running the first iteration. After completing the first iteration, an auto-updating graph and table will be shown. The widget will refresh once per minute, so you should see the graph update as child runs complete.\n",
    "Note: The widget displays a link at the bottom. Use this link to open a web interface to explore the individual run details."
   ]
  },
@@ -664,14 +655,14 @@
   "metadata": {},
   "outputs": [],
   "source": [
-        "def _read_x_y(file_name, label_col):\n",
+    "X_test = X_test.to_pandas_dataframe()\n",
-        "        df = pd.read_csv(file_name)\n",
+    "y_test = y_test.to_pandas_dataframe()\n",
-        "        y = None\n",
+    "y_test = np.array(y_test)\n",
-        "        if label_col in df.columns:\n",
+    "y_test = y_test[:,0]\n",
-        "            y = df.pop(label_col)\n",
+    "X_train = X_train.to_pandas_dataframe()\n",
-        "            y = y.values[:, None]\n",
+    "y_train = y_train.to_pandas_dataframe()\n",
-        "        X = df.values\n",
+    "y_train = np.array(y_train)\n",
-        "        return X, y"
+    "y_train = y_train[:,0]"
   ]
  },
  {
@@ -687,9 +678,6 @@
   "metadata": {},
   "outputs": [],
   "source": [
        "X,y = _read_x_y(\"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/compresive_strength_concrete.csv\",\"CONCRETE\")\n",
        "X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)\n",
        "\n",
    "y_pred_train = fitted_model.predict(X_train)\n",
    "y_residual_train = y_train - y_pred_train\n",
    "\n",
--- a/how-to-use-azureml/automated-machine-learning/regression-hardware-performance/auto-ml-regression-hardware-performance.ipynb
+++ b/how-to-use-azureml/automated-machine-learning/regression-hardware-performance/auto-ml-regression-hardware-performance.ipynb
@@ -71,6 +71,7 @@
    "import pandas as pd\n",
    "import os\n",
    "from sklearn.model_selection import train_test_split\n",
    "import azureml.dataprep as dprep\n",
    " \n",
    "\n",
    "import azureml.core\n",
@@ -212,25 +213,14 @@
   "metadata": {},
   "outputs": [],
   "source": [
-        "%%writefile $project_folder/get_data.py\n",
+    "data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/machineData.csv\"\n",
-        "\n",
+    "dflow = dprep.auto_read_file(data)\n",
-        "import pandas as pd\n",
+    "dflow.get_profile()\n",
-        "from sklearn.model_selection import train_test_split\n",
+    "X = dflow.drop_columns(columns=['ERP'])\n",
-        "\n",
+    "y = dflow.keep_columns(columns=['ERP'], validate_column_exists=True)\n",
-        "def _read_x_y(file_name, label_col):\n",
+    "X_train, X_test = X.random_split(percentage=0.8, seed=223)\n",
-        "        df = pd.read_csv(file_name)\n",
+    "y_train, y_test = y.random_split(percentage=0.8, seed=223) \n",
-        "        y = None\n",
+    "dflow.head()"
        "        if label_col in df.columns:\n",
        "            y = df.pop(label_col)\n",
        "            y = y.values[:, None]\n",
        "        X = df.values\n",
        "        return X, y\n",
        "    \n",
        "def get_data():\n",
        "    X,y = _read_x_y(\"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/machineData.csv\",\"ERP\")\n",
        "    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)\n",
        "    \n",
        "    return { \"X\" : X_train, \"y\" : y_train[:,0] }"
   ]
  },
  {
@@ -283,7 +273,8 @@
    "                             debug_log = 'automl_errors_20190417.log',\n",
    "                             path = project_folder,\n",
    "                             run_configuration=conda_run_config,\n",
-        "                             data_script = project_folder + \"/get_data.py\",\n",
+    "                             X = X_train,\n",
    "                             y = y_train,\n",
    "                             **automl_settings\n",
    "                            )"
   ]
@@ -334,16 +325,6 @@
    "RunDetails(remote_run).show() "
   ]
  },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "from azureml.train.automl.run import AutoMLRun\n",
        "setup_run = AutoMLRun(experiment, remote_run.id + \"_setup\")"
      ]
    },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -690,18 +671,14 @@
   "metadata": {},
   "outputs": [],
   "source": [
-        "def _read_x_y(file_name, label_col):\n",
+    "X_test = X_test.to_pandas_dataframe()\n",
-        "    df = pd.read_csv(file_name)\n",
+    "y_test = y_test.to_pandas_dataframe()\n",
-        "    y_split = None\n",
+    "y_test = np.array(y_test)\n",
-        "    if label_col in df.columns:\n",
+    "y_test = y_test[:,0]\n",
-        "        y_split = df.pop(label_col)\n",
+    "X_train = X_train.to_pandas_dataframe()\n",
-        "        y_split = y_split.values[:, None]\n",
+    "y_train = y_train.to_pandas_dataframe()\n",
-        "    X_split = df.values\n",
+    "y_train = np.array(y_train)\n",
-        "    return X_split, y_split\n",
+    "y_train = y_train[:,0]"
        "    \n",
        "\n",
        "X,y = _read_x_y(\"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/machineData.csv\",\"ERP\")\n",
        "X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)"
   ]
  },
  {