MachineLearningNotebooks/how-to-use-azureml/azure-databricks/amlsdk/ingest-data-02.ipynb

{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Azure ML & Azure Databricks notebooks by Parashar Shah.\n",
        "\n",
        "Copyright (c) Microsoft Corporation. All rights reserved.\n",
        "\n",
        "Licensed under the MIT License."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/azure-databricks/amlsdk/ingest-data-02.png)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# Data Ingestion"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "import os\n",
        "import urllib\n",
        "import urllib.request"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Download AdultCensusIncome.csv from Azure CDN. This file has 32,561 rows.\n",
        "# The copy under /dbfs doubles as a cache: the download is skipped when it already exists.\n",
        "dataurl = \"https://amldockerdatasets.azureedge.net/AdultCensusIncome.csv\"\n",
        "datafile = \"AdultCensusIncome.csv\"\n",
        "datafile_dbfs = os.path.join(\"/dbfs\", datafile)\n",
        "\n",
        "if os.path.isfile(datafile_dbfs):\n",
        "    print(\"found {} at {}\".format(datafile, datafile_dbfs))\n",
        "else:\n",
        "    print(\"downloading {} to {}\".format(datafile, datafile_dbfs))\n",
        "    try:\n",
        "        urllib.request.urlretrieve(dataurl, datafile_dbfs)\n",
        "    except Exception:\n",
        "        # A failed transfer can leave a truncated file behind. Remove it so the\n",
        "        # isfile() check above does not treat it as a finished download next run.\n",
        "        if os.path.isfile(datafile_dbfs):\n",
        "            os.remove(datafile_dbfs)\n",
        "        raise"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Load the csv file into a Spark dataframe, treating the first row as the\n",
        "# header and letting Spark infer the column types.\n",
        "data_all = (\n",
        "    sqlContext.read\n",
        "    .format('csv')\n",
        "    .options(\n",
        "        header='true',\n",
        "        inferSchema='true',\n",
        "        ignoreLeadingWhiteSpace='true',\n",
        "        ignoreTrailingWhiteSpace='true',\n",
        "    )\n",
        "    .load(datafile)\n",
        ")\n",
        "\n",
        "# Show (row count, column count) followed by the inferred schema.\n",
        "print(\"({}, {})\".format(data_all.count(), len(data_all.columns)))\n",
        "data_all.printSchema()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Rename columns: every hyphen in a column name becomes an underscore.\n",
        "columns_new = [name.replace(\"-\", \"_\") for name in data_all.columns]\n",
        "data_all = data_all.toDF(*columns_new)\n",
        "\n",
        "# Print the schema again so the new names are visible.\n",
        "data_all.printSchema()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Preview the first five rows of the renamed dataframe.\n",
        "# NOTE(review): `display` is not defined in this notebook; presumably the Databricks notebook built-in - confirm.\n",
        "display(data_all.limit(5))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# Data Preparation"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Choose feature columns and the label column.\n",
        "label = \"income\"\n",
        "\n",
        "# Keep the features in their original dataframe order. A set iterates in an\n",
        "# order that can change between Python sessions (string hashing is randomized),\n",
        "# which would make the column order of the selected and saved data unstable.\n",
        "# NOTE(review): an unstable column order may also change which rows randomSplit\n",
        "# assigns to each split despite the fixed seed - confirm against Spark docs.\n",
        "xvars = [column for column in data_all.columns if column != label]\n",
        "\n",
        "print(\"label = {}\".format(label))\n",
        "print(\"features = {}\".format(xvars))\n",
        "\n",
        "# Features first, label last.\n",
        "data = data_all.select([*xvars, label])\n",
        "\n",
        "# Split data into train and test (weights 0.75 / 0.25) with a fixed seed.\n",
        "train, test = data.randomSplit([0.75, 0.25], seed=123)\n",
        "\n",
        "print(\"train ({}, {})\".format(train.count(), len(train.columns)))\n",
        "print(\"test ({}, {})\".format(test.count(), len(test.columns)))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# Data Persistence"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Persist the train and test splits to intermediate storage as parquet,\n",
        "# replacing any output left by a previous run.\n",
        "\n",
        "# Train split. The /dbfs-prefixed path is only used in the status message.\n",
        "train_data_path = \"AdultCensusIncomeTrain\"\n",
        "train_data_path_dbfs = os.path.join(\"/dbfs\", train_data_path)\n",
        "train.write.mode('overwrite').parquet(train_data_path)\n",
        "\n",
        "# Test split, handled the same way.\n",
        "test_data_path = \"AdultCensusIncomeTest\"\n",
        "test_data_path_dbfs = os.path.join(\"/dbfs\", test_data_path)\n",
        "test.write.mode('overwrite').parquet(test_data_path)\n",
        "\n",
        "print(\"train and test datasets saved to {} and {}\".format(train_data_path_dbfs, test_data_path_dbfs))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": []
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/azure-databricks/amlsdk/ingest-data-02.png)"
      ]
    }
  ],
  "metadata": {
    "authors": [
      {
        "name": "pasha"
      }
    ],
    "kernelspec": {
      "display_name": "Python 3.6",
      "language": "python",
      "name": "python36"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.6"
    },
    "name": "ingest-data-02",
    "notebookId": 3836944406456362
  },
  "nbformat": 4,
  "nbformat_minor": 1
}