From 143590cfb4ab3b0a8f8457dafd188ebc5297a2b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Shan=C3=A9=20Winner?= <43390034+swinner95@users.noreply.github.com> Date: Wed, 21 Aug 2019 10:30:39 -0700 Subject: [PATCH] Delete new-york-taxi_scale-out.ipynb --- .../new-york-taxi_scale-out.ipynb | 135 ------------------ 1 file changed, 135 deletions(-) delete mode 100644 work-with-data/dataprep/case-studies/new-york-taxi/new-york-taxi_scale-out.ipynb diff --git a/work-with-data/dataprep/case-studies/new-york-taxi/new-york-taxi_scale-out.ipynb b/work-with-data/dataprep/case-studies/new-york-taxi/new-york-taxi_scale-out.ipynb deleted file mode 100644 index fd69f736..00000000 --- a/work-with-data/dataprep/case-studies/new-york-taxi/new-york-taxi_scale-out.ipynb +++ /dev/null @@ -1,135 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Scale-Out Data Preparation\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Once we are done with preparing and featurizing the data locally, we can run the same steps on the full dataset in scale-out mode. The new york taxi cab data is about 300GB in total, which is perfect for scale-out. Let's start by downloading the package we saved earlier to disk. Feel free to run the `new_york_taxi_cab.ipynb` notebook to generate the package yourself, in which case you may comment out the download code and set the `package_path` to where the package is saved." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from tempfile import mkdtemp\n", - "from os import path\n", - "from urllib.request import urlretrieve\n", - "\n", - "dflow_root = mkdtemp()\n", - "dflow_path = path.join(dflow_root, \"new_york_taxi.dprep\")\n", - "print(\"Downloading Dataflow to: {}\".format(dflow_path))\n", - "urlretrieve(\"https://dprepdata.blob.core.windows.net/demo/new_york_taxi_v2.dprep\", dflow_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's load the package we just downloaded." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep\n", - "\n", - "df = dprep.Dataflow.open(dflow_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's replace the datasources with the full dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from uuid import uuid4\n", - "\n", - "other_step = df._get_steps()[7].arguments['dataflows'][0]['anonymousSteps'][0]\n", - "other_step['id'] = str(uuid4())\n", - "other_step['arguments']['path']['target'] = 1\n", - "other_step['arguments']['path']['resourceDetails'][0]['path'] = 'https://wranglewestus.blob.core.windows.net/nyctaxi/yellow_tripdata*'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "green_dsource = dprep.BlobDataSource(\"https://wranglewestus.blob.core.windows.net/nyctaxi/green_tripdata*\")\n", - "df = df.replace_datasource(green_dsource)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Once we have replaced the datasource, we can now run the same steps on the full dataset. We will print the first 5 rows of the spark DataFrame. Since we are running on the full dataset, this might take a little while depending on your spark cluster's size." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "spark_df = df.to_spark_dataframe()\n", - "spark_df.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/case-studies/new-york-taxi/new-york-taxi_scale-out.png)" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License.", - "skip_execute_as_test": true - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file