{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Azure ML & Azure Databricks notebooks by Parashar Shah.\n",
    "\n",
    "Copyright (c) Microsoft Corporation. All rights reserved.\n",
    "\n",
    "Licensed under the MIT License."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Ingestion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "# Import the submodule explicitly: a bare `import urllib` does not load `urllib.request`.\n",
    "import urllib.request"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download AdultCensusIncome.csv from Azure CDN. This file has 32,561 rows.\n",
    "dataurl = \"https://amldockerdatasets.azureedge.net/AdultCensusIncome.csv\"\n",
    "datafile = \"AdultCensusIncome.csv\"\n",
    "# Path of the file under /dbfs; it is checked and written with ordinary file I/O below.\n",
    "datafile_dbfs = os.path.join(\"/dbfs\", datafile)\n",
    "\n",
    "# Skip the download when the file already exists so the cell can be re-run cheaply.\n",
    "if os.path.isfile(datafile_dbfs):\n",
    "    print(\"found {} at {}\".format(datafile, datafile_dbfs))\n",
    "else:\n",
    "    print(\"downloading {} to {}\".format(datafile, datafile_dbfs))\n",
    "    urllib.request.urlretrieve(dataurl, datafile_dbfs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a Spark dataframe out of the csv file.\n",
    "# NOTE(review): Spark is given the bare file name while the download wrote to /dbfs/<name>;\n",
    "# presumably both resolve to the same DBFS location -- confirm on the target cluster.\n",
    "data_all = (\n",
    "    sqlContext.read.format('csv')\n",
    "    .options(header='true', inferSchema='true', ignoreLeadingWhiteSpace='true', ignoreTrailingWhiteSpace='true')\n",
    "    .load(datafile)\n",
    ")\n",
    "print(\"({}, {})\".format(data_all.count(), len(data_all.columns)))\n",
    "data_all.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rename columns: replace every \"-\" in a column name with \"_\".\n",
    "columns_new = [col.replace(\"-\", \"_\") for col in data_all.columns]\n",
    "data_all = data_all.toDF(*columns_new)\n",
    "data_all.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "display(data_all.limit(5))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Preparation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Choose feature columns and the label column.\n",
    "label = \"income\"\n",
    "# Keep the features as a list in source-column order. A set has no defined iteration\n",
    "# order, which would let the selected column order change from one run to the next.\n",
    "xvars = [col for col in data_all.columns if col != label]\n",
    "\n",
    "print(\"label = {}\".format(label))\n",
    "print(\"features = {}\".format(xvars))\n",
    "\n",
    "# Features first, label as the last column.\n",
    "data = data_all.select([*xvars, label])\n",
    "\n",
    "# Split data into train and test (75% / 25%) with a fixed seed.\n",
    "train, test = data.randomSplit([0.75, 0.25], seed=123)\n",
    "\n",
    "print(\"train ({}, {})\".format(train.count(), len(train.columns)))\n",
    "print(\"test ({}, {})\".format(test.count(), len(test.columns)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Persistence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the train and test data sets to intermediate storage\n",
    "train_data_path = \"AdultCensusIncomeTrain\"\n",
    "test_data_path = \"AdultCensusIncomeTest\"\n",
    "\n",
    "# /dbfs-prefixed forms of the same paths, derived from the variables above so the\n",
    "# message printed below cannot drift from the locations actually written.\n",
    "train_data_path_dbfs = os.path.join(\"/dbfs\", train_data_path)\n",
    "test_data_path_dbfs = os.path.join(\"/dbfs\", test_data_path)\n",
    "\n",
    "# mode('overwrite') replaces any existing output, so the cell can be re-run.\n",
    "train.write.mode('overwrite').parquet(train_data_path)\n",
    "test.write.mode('overwrite').parquet(test_data_path)\n",
    "print(\"train and test datasets saved to {} and {}\".format(train_data_path_dbfs, test_data_path_dbfs))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/azure-databricks/amlsdk/ingest-data-02.png)"
   ]
  }
 ],
 "metadata": {
  "authors": [
   {
    "name": "pasha"
   }
  ],
  "kernelspec": {
   "display_name": "Python 3.6",
   "language": "python",
   "name": "python36"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  },
  "name": "ingest-data-02",
  "notebookId": 3836944406456362
 },
 "nbformat": 4,
 "nbformat_minor": 1
}