Azure ML & Azure Databricks notebooks by Parashar Shah.

Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

# Data Ingestion

In [None]:
import os
import urllib
import urllib.request

In [None]:
# Download AdultCensusIncome.csv from Azure CDN. This file has 32,561 rows.
dataurl = "https://amldockerdatasets.azureedge.net/AdultCensusIncome.csv"
datafile = "AdultCensusIncome.csv"
# Local path under /dbfs where the downloaded file is stored.
datafile_dbfs = os.path.join("/dbfs", datafile)

if os.path.isfile(datafile_dbfs):
    # Reuse the copy left by an earlier run instead of downloading again.
    print("found {} at {}".format(datafile, datafile_dbfs))
else:
    print("downloading {} to {}".format(datafile, datafile_dbfs))
    try:
        urllib.request.urlretrieve(dataurl, datafile_dbfs)
    except BaseException:
        # A failed or cancelled download can leave a truncated file behind,
        # which the isfile() check above would accept as complete on the
        # next run. Remove it, then re-raise the original error unchanged.
        if os.path.isfile(datafile_dbfs):
            os.remove(datafile_dbfs)
        raise

In [None]:
# Load the csv file into a Spark dataframe with the header, schema
# inference and leading/trailing whitespace options switched on.
csv_options = {
    'header': 'true',
    'inferSchema': 'true',
    'ignoreLeadingWhiteSpace': 'true',
    'ignoreTrailingWhiteSpace': 'true',
}
data_all = sqlContext.read.format('csv').options(**csv_options).load(datafile)

# Report the dataframe shape as (rows, columns), followed by its schema.
n_rows = data_all.count()
n_cols = len(data_all.columns)
print("({}, {})".format(n_rows, n_cols))
data_all.printSchema()

In [None]:
# Rename the columns so that every hyphen in a name becomes an underscore,
# then print the schema to show the new names.
columns_new = [name.replace("-", "_") for name in data_all.columns]
data_all = data_all.toDF(*columns_new)
data_all.printSchema()

In [None]:
# Render a five-row sample of the dataframe in the notebook.
preview = data_all.limit(5)
display(preview)

# Data Preparation

In [None]:
# Choose feature columns and the label column.
label = "income"
xvars = set(data_all.columns) - {label}

print("label = {}".format(label))
print("features = {}".format(xvars))

# Select the features in their original dataframe order, with the label
# last. Iterating over the set directly would give an arbitrary column
# order that is not guaranteed to be the same from run to run, so the
# train/test datasets could come out with differently ordered columns.
feature_columns = [name for name in data_all.columns if name in xvars]
data = data_all.select(feature_columns + [label])

# Split data into train and test (75% / 25%) with a fixed seed.
train, test = data.randomSplit([0.75, 0.25], seed=123)

# Report the shape of each split as (rows, columns).
print("train ({}, {})".format(train.count(), len(train.columns)))
print("test ({}, {})".format(test.count(), len(test.columns)))

# Data Persistence

In [None]:
# Persist the train and test data sets to intermediate storage as parquet.
train_data_path = "AdultCensusIncomeTrain"
test_data_path = "AdultCensusIncomeTest"

# The same locations prefixed with /dbfs; used in the message below.
train_data_path_dbfs = os.path.join("/dbfs", train_data_path)
test_data_path_dbfs = os.path.join("/dbfs", test_data_path)

# Overwrite mode replaces any output left at these paths by an earlier run.
train.write.parquet(train_data_path, mode='overwrite')
test.write.parquet(test_data_path, mode='overwrite')

saved_message = "train and test datasets saved to {} and {}"
print(saved_message.format(train_data_path_dbfs, test_data_path_dbfs))

![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/azure-databricks/amlsdk/ingest-data-02.png)