mirror of
https://github.com/Azure/MachineLearningNotebooks.git
synced 2025-12-21 01:55:07 -05:00
186 lines
5.3 KiB
Plaintext
186 lines
5.3 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Azure ML & Azure Databricks notebooks by Parashar Shah.\n",
|
|
"\n",
|
|
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
|
"\n",
|
|
"Licensed under the MIT License."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Data Ingestion"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import os\n",
|
|
"import urllib.request"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Download AdultCensusIncome.csv from Azure CDN. This file has 32,561 rows.\n",
|
|
"dataurl = \"https://amldockerdatasets.azureedge.net/AdultCensusIncome.csv\"\n",
|
|
"datafile = \"AdultCensusIncome.csv\"\n",
|
|
"datafile_dbfs = os.path.join(\"/dbfs\", datafile)\n",
|
|
"\n",
|
|
"if os.path.isfile(datafile_dbfs):\n",
|
|
" print(\"found {} at {}\".format(datafile, datafile_dbfs))\n",
|
|
"else:\n",
|
|
" print(\"downloading {} to {}\".format(datafile, datafile_dbfs))\n",
|
|
" urllib.request.urlretrieve(dataurl, datafile_dbfs)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Create a Spark dataframe out of the csv file.\n",
|
|
"data_all = sqlContext.read.format('csv').options(header='true', inferSchema='true', ignoreLeadingWhiteSpace='true', ignoreTrailingWhiteSpace='true').load(datafile)\n",
|
|
"print(\"({}, {})\".format(data_all.count(), len(data_all.columns)))\n",
|
|
"data_all.printSchema()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#renaming columns\n",
|
|
"columns_new = [col.replace(\"-\", \"_\") for col in data_all.columns]\n",
|
|
"data_all = data_all.toDF(*columns_new)\n",
|
|
"data_all.printSchema()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"display(data_all.limit(5))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Data Preparation"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Choose feature columns and the label column.\n",
|
|
"label = \"income\"\n",
|
|
"# Build the feature list in schema order; a set would reorder columns from run to run.\n",
|
|
"xvars = [col for col in data_all.columns if col != label]\n",
|
|
"\n",
|
|
"print(\"label = {}\".format(label))\n",
|
|
"print(\"features = {}\".format(xvars))\n",
|
|
"\n",
|
|
"data = data_all.select([*xvars, label])\n",
|
|
"\n",
|
|
"# Split data into train and test.\n",
|
|
"train, test = data.randomSplit([0.75, 0.25], seed=123)\n",
|
|
"\n",
|
|
"print(\"train ({}, {})\".format(train.count(), len(train.columns)))\n",
|
|
"print(\"test ({}, {})\".format(test.count(), len(test.columns)))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Data Persistence"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Write the train and test data sets to intermediate storage\n",
|
|
"train_data_path = \"AdultCensusIncomeTrain\"\n",
|
|
"test_data_path = \"AdultCensusIncomeTest\"\n",
|
|
"\n",
|
|
"train_data_path_dbfs = os.path.join(\"/dbfs\", train_data_path)\n",
|
|
"test_data_path_dbfs = os.path.join(\"/dbfs\", test_data_path)\n",
|
|
"\n",
|
|
"train.write.mode('overwrite').parquet(train_data_path)\n",
|
|
"test.write.mode('overwrite').parquet(test_data_path)\n",
|
|
"print(\"train and test datasets saved to {} and {}\".format(train_data_path_dbfs, test_data_path_dbfs))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
""
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"authors": [
|
|
{
|
|
"name": "pasha"
|
|
}
|
|
],
|
|
"kernelspec": {
|
|
"display_name": "Python 3.6",
|
|
"language": "python",
|
|
"name": "python36"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.6.6"
|
|
},
|
|
"name": "ingest-data-02",
|
|
"notebookId": 3836944406456362
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 1
|
|
} |