From 82d8353d54c03378064ec2a4daf65492cdae704c Mon Sep 17 00:00:00 2001 From: Roope Astala Date: Tue, 23 Apr 2019 10:04:32 -0400 Subject: [PATCH] adding work-with-data --- .../work-with-data/dataprep/README.md | 177 +++ .../new-york-taxi/new-york-taxi.ipynb | 508 +++++++ .../new-york-taxi_scale-out.ipynb | 129 ++ .../dataprep/data/adls-dpreptestfiles.crt | 45 + .../dataprep/data/chicago-aldermen-2015.csv | 54 + .../dataprep/data/crime-dirty.csv | 15 + .../dataprep/data/crime-spring.csv | 11 + .../dataprep/data/crime-winter.csv | 11 + .../work-with-data/dataprep/data/crime.dprep | 204 +++ .../dataprep/data/crime.parquet | Bin 0 -> 3607 bytes .../work-with-data/dataprep/data/crime.txt | 10 + .../work-with-data/dataprep/data/crime.xlsx | Bin 0 -> 16109 bytes .../work-with-data/dataprep/data/crime.zip | Bin 0 -> 3685 bytes .../dataprep/data/crime_duplicate_headers.csv | 12 + .../dataprep/data/crime_fixed_width_file.txt | 10 + .../data/crime_multiple_separators.csv | 11 + .../dataprep/data/crime_partfiles/_SUCCESS | 0 ...8e77b-f17a-4c20-972c-aa382e830fca-c000.csv | 914 ++++++++++++ ...8e77b-f17a-4c20-972c-aa382e830fca-c000.csv | 921 ++++++++++++ ...8e77b-f17a-4c20-972c-aa382e830fca-c000.csv | 930 ++++++++++++ ...8e77b-f17a-4c20-972c-aa382e830fca-c000.csv | 953 ++++++++++++ ...8e77b-f17a-4c20-972c-aa382e830fca-c000.csv | 923 ++++++++++++ ...8e77b-f17a-4c20-972c-aa382e830fca-c000.csv | 887 +++++++++++ ...8e77b-f17a-4c20-972c-aa382e830fca-c000.csv | 971 ++++++++++++ ...8e77b-f17a-4c20-972c-aa382e830fca-c000.csv | 759 ++++++++++ .../work-with-data/dataprep/data/json.json | 1306 +++++++++++++++++ .../work-with-data/dataprep/data/map_func.py | 4 + .../dataprep/data/median_income.csv | 251 ++++ .../data/median_income_transformed.csv | 251 ++++ .../dataprep/data/parquet.parquet | Bin 0 -> 3091 bytes ...7a7-c3cd-4926-92b2-ba2dcd3f95b7.gz.parquet | Bin 0 -> 6078 bytes ...7a7-c3cd-4926-92b2-ba2dcd3f95b7.gz.parquet | Bin 0 -> 5083 bytes .../dataprep/data/secrets.dprep | 63 + 
.../add-column-using-expression.ipynb | 269 ++++ .../append-columns-and-rows.ipynb | 245 ++++ .../dataprep/how-to-guides/assertions.ipynb | 127 ++ .../how-to-guides/auto-read-file.ipynb | 183 +++ .../dataprep/how-to-guides/cache.ipynb | 188 +++ .../how-to-guides/column-manipulations.ipynb | 557 +++++++ .../column-type-transforms.ipynb | 467 ++++++ .../custom-python-transforms.ipynb | 225 +++ .../how-to-guides/data-ingestion.ipynb | 908 ++++++++++++ .../dataprep/how-to-guides/data-profile.ipynb | 200 +++ .../dataprep/how-to-guides/datastore.ipynb | 197 +++ .../derive-column-by-example.ipynb | 181 +++ .../how-to-guides/external-references.ipynb | 112 ++ .../dataprep/how-to-guides/filtering.ipynb | 215 +++ .../dataprep/how-to-guides/fuzzy-group.ipynb | 205 +++ .../how-to-guides/impute-missing-values.ipynb | 141 ++ .../dataprep/how-to-guides/join.ipynb | 259 ++++ .../how-to-guides/label-encoder.ipynb | 162 ++ .../how-to-guides/min-max-scaler.ipynb | 233 +++ .../how-to-guides/one-hot-encoder.ipynb | 173 +++ .../how-to-guides/open-save-dataflows.ipynb | 165 +++ .../quantile-transformation.ipynb | 85 ++ .../dataprep/how-to-guides/random-split.ipynb | 139 ++ ...replace-datasource-replace-reference.ipynb | 124 ++ .../how-to-guides/replace-fill-error.ipynb | 233 +++ .../dataprep/how-to-guides/secrets.ipynb | 134 ++ .../how-to-guides/semantic-types.ipynb | 158 ++ .../split-column-by-example.ipynb | 214 +++ .../how-to-guides/subsetting-sampling.ipynb | 211 +++ .../dataprep/how-to-guides/summarize.ipynb | 584 ++++++++ .../working-with-file-streams.ipynb | 186 +++ .../dataprep/how-to-guides/writing-data.ipynb | 177 +++ .../getting-started/getting-started.ipynb | 435 ++++++ 66 files changed, 18422 insertions(+) create mode 100644 how-to-use-azureml/work-with-data/dataprep/README.md create mode 100644 how-to-use-azureml/work-with-data/dataprep/case-studies/new-york-taxi/new-york-taxi.ipynb create mode 100644 
how-to-use-azureml/work-with-data/dataprep/case-studies/new-york-taxi/new-york-taxi_scale-out.ipynb create mode 100644 how-to-use-azureml/work-with-data/dataprep/data/adls-dpreptestfiles.crt create mode 100644 how-to-use-azureml/work-with-data/dataprep/data/chicago-aldermen-2015.csv create mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime-dirty.csv create mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime-spring.csv create mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime-winter.csv create mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime.dprep create mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime.parquet create mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime.txt create mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime.xlsx create mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime.zip create mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime_duplicate_headers.csv create mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime_fixed_width_file.txt create mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime_multiple_separators.csv create mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime_partfiles/_SUCCESS create mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime_partfiles/part-00000-0b08e77b-f17a-4c20-972c-aa382e830fca-c000.csv create mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime_partfiles/part-00001-0b08e77b-f17a-4c20-972c-aa382e830fca-c000.csv create mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime_partfiles/part-00002-0b08e77b-f17a-4c20-972c-aa382e830fca-c000.csv create mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime_partfiles/part-00003-0b08e77b-f17a-4c20-972c-aa382e830fca-c000.csv create mode 100644 
how-to-use-azureml/work-with-data/dataprep/data/crime_partfiles/part-00004-0b08e77b-f17a-4c20-972c-aa382e830fca-c000.csv create mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime_partfiles/part-00005-0b08e77b-f17a-4c20-972c-aa382e830fca-c000.csv create mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime_partfiles/part-00006-0b08e77b-f17a-4c20-972c-aa382e830fca-c000.csv create mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime_partfiles/part-00007-0b08e77b-f17a-4c20-972c-aa382e830fca-c000.csv create mode 100644 how-to-use-azureml/work-with-data/dataprep/data/json.json create mode 100644 how-to-use-azureml/work-with-data/dataprep/data/map_func.py create mode 100644 how-to-use-azureml/work-with-data/dataprep/data/median_income.csv create mode 100644 how-to-use-azureml/work-with-data/dataprep/data/median_income_transformed.csv create mode 100644 how-to-use-azureml/work-with-data/dataprep/data/parquet.parquet create mode 100644 how-to-use-azureml/work-with-data/dataprep/data/parquet_dataset/Arrest=false/part-00000-34f8a7a7-c3cd-4926-92b2-ba2dcd3f95b7.gz.parquet create mode 100644 how-to-use-azureml/work-with-data/dataprep/data/parquet_dataset/Arrest=true/part-00000-34f8a7a7-c3cd-4926-92b2-ba2dcd3f95b7.gz.parquet create mode 100644 how-to-use-azureml/work-with-data/dataprep/data/secrets.dprep create mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/add-column-using-expression.ipynb create mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/append-columns-and-rows.ipynb create mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/assertions.ipynb create mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/auto-read-file.ipynb create mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/cache.ipynb create mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/column-manipulations.ipynb create mode 100644 
how-to-use-azureml/work-with-data/dataprep/how-to-guides/column-type-transforms.ipynb create mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/custom-python-transforms.ipynb create mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/data-ingestion.ipynb create mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/data-profile.ipynb create mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/datastore.ipynb create mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/derive-column-by-example.ipynb create mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/external-references.ipynb create mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/filtering.ipynb create mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/fuzzy-group.ipynb create mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/impute-missing-values.ipynb create mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/join.ipynb create mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/label-encoder.ipynb create mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/min-max-scaler.ipynb create mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/one-hot-encoder.ipynb create mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/open-save-dataflows.ipynb create mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/quantile-transformation.ipynb create mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/random-split.ipynb create mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/replace-datasource-replace-reference.ipynb create mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/replace-fill-error.ipynb create mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/secrets.ipynb create mode 100644 
how-to-use-azureml/work-with-data/dataprep/how-to-guides/semantic-types.ipynb create mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/split-column-by-example.ipynb create mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/subsetting-sampling.ipynb create mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/summarize.ipynb create mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/working-with-file-streams.ipynb create mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/writing-data.ipynb create mode 100644 how-to-use-azureml/work-with-data/dataprep/tutorials/getting-started/getting-started.ipynb diff --git a/how-to-use-azureml/work-with-data/dataprep/README.md b/how-to-use-azureml/work-with-data/dataprep/README.md new file mode 100644 index 00000000..bf862c72 --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/README.md @@ -0,0 +1,177 @@ +# Azure Machine Learning Data Prep SDK + +You will find in this repo: +- [How-To Guide Notebooks](how-to-guides) for more in-depth feature examples. +- [Case Study Notebooks](case-studies/new-york-taxi) that show in-depth scenario examples of features. +- [Getting Started Tutorial](tutorials/getting-started/getting-started.ipynb) for a quick introduction to the Data Prep SDK and some of its main features. + +## Installation +Here are the [SDK installation steps](https://docs.microsoft.com/python/api/overview/azure/dataprep/intro?view=azure-dataprep-py#install). + +## Documentation +Here is more information on how to use the new Data Prep SDK: +- [SDK overview and API reference docs](http://aka.ms/data-prep-sdk) that show different classes, methods, and function parameters for the SDK. +- [Tutorial: Prep NYC taxi data](https://docs.microsoft.com/azure/machine-learning/service/tutorial-data-prep) for regression modeling and then run automated machine learning to build the model. 
+- [How to load data](https://docs.microsoft.com/azure/machine-learning/service/how-to-load-data) is an overview guide on how to load data using the Data Prep SDK. +- [How to transform data](https://docs.microsoft.com/azure/machine-learning/service/how-to-transform-data) is an overview guide on how to transform data. +- [How to write data](https://docs.microsoft.com/azure/machine-learning/service/how-to-write-data) is an overview guide on how to write data to different storage locations. + +## Known Issues + +- **If running version 0.1.0**: To fix "Error Message: Cannot run the event loop while another loop is running", downgrade Tornado version to 4.5.3. Restart any running kernels for the change to take effect. +``` +pip install -U tornado==4.5.3 +``` + +## Release Notes + +### 2019-03-25 (version 1.1.0) + +Breaking changes +- The concept of the Data Prep Package has been deprecated and is no longer supported. Instead of persisting multiple Dataflows in one Package, you can persist Dataflows individually. + - How-to guide: [Opening and Saving Dataflows notebook](https://aka.ms/aml-data-prep-open-save-dataflows-nb) + +New features +- Data Prep can now recognize columns that match a particular Semantic Type, and split accordingly. The STypes currently supported include: email address, geographic coordinates (latitude & longitude), IPv4 and IPv6 addresses, US phone number, and US zip code. + - How-to guide: [Semantic Types notebook](https://aka.ms/aml-data-prep-semantic-types-nb) +- Data Prep now supports the following operations to generate a resultant column from two numeric columns: subtract, multiply, divide, and modulo. +- You can call `verify_has_data()` on a Dataflow to check whether the Dataflow would produce records if executed. + +Bug fixes and improvements +- You can now specify the number of bins to use in a histogram for numeric column profiles. 
+- The `read_pandas_dataframe` transform now requires the DataFrame to have string- or byte- typed column names. +- Fixed a bug in the `fill_nulls` transform, where values were not correctly filled in if the column was missing. + +### 2019-03-11 (version 1.0.17) + +New features +- Now supports adding two numeric columns to generate a resultant column using the expression language. + +Bug fixes and improvements +- Improved the documentation and parameter checking for random_split. + +### 2019-02-27 (version 1.0.16) + +Bug fix +- Fixed a Service Principal authentication issue that was caused by an API change. + +### 2019-02-25 (version 1.0.15) + +New features +- Data Prep now supports writing file streams from a dataflow. Also provides the ability to manipulate the file stream names to create new file names. + - How-to guide: [Working With File Streams notebook](https://aka.ms/aml-data-prep-file-stream-nb) + +Bug fixes and improvements +- Improved performance of t-Digest on large data sets. +- Data Prep now supports reading data from a DataPath. +- One hot encoding now works on boolean and numeric columns. +- Other miscellaneous bug fixes. + +### 2019-02-11 (version 1.0.12) + +New features +- Data Prep now supports reading from an Azure SQL database using Datastore. + +Changes +- Significantly improved the memory performance of certain operations on large data. +- `read_pandas_dataframe()` now requires `temp_folder` to be specified. +- The `name` property on `ColumnProfile` has been deprecated - use `column_name` instead. + +### 2019-01-28 (version 1.0.8) + +Bug fixes +- Significantly improved the performance of getting data profiles. +- Fixed minor bugs related to error reporting. + +### 2019-01-14 (version 1.0.7) + +New features +- Datastore improvements (documented in [Datastore how-to-guide](https://aka.ms/aml-data-prep-datastore-nb)) + - Added ability to read from and write to Azure File Share and ADLS Datastores in scale-up. 
+ - When using Datastores, Data Prep now supports using service principal authentication instead of interactive authentication. + - Added support for wasb and wasbs urls. + +### 2019-01-09 (version 1.0.6) + +Bug fixes +- Fixed bug with reading from public readable Azure Blob containers on Spark. + +### 2018-12-19 (version 1.0.4) + +New features +- `to_bool` function now allows mismatched values to be converted to Error values. This is the new default mismatch behavior for `to_bool` and `set_column_types`, whereas the previous default behavior was to convert mismatched values to False. +- When calling `to_pandas_dataframe`, there is a new option to interpret null/missing values in numeric columns as NaN. +- Added ability to check the return type of some expressions to ensure type consistency and fail early. +- You can now call `parse_json` to parse values in a column as JSON objects and expand them into multiple columns. + +Bug fixes +- Fixed a bug that crashed `set_column_types` in Python 3.5.2. +- Fixed a bug that crashed when connecting to Datastore using an AML image. + +### 2018-12-07 (version 0.5.3) + +Fixed missing dependency issue for .NET Core2 on Ubuntu 16. + +### 2018-12-03 (version 0.5.2) + +Breaking changes +- `SummaryFunction.N` was renamed to `SummaryFunction.Count`. + +Bug fixes +- Use latest AML Run Token when reading from and writing to datastores on remote runs. Previously, if the AML Run Token is updated in Python, the Data Prep runtime will not be updated with the updated AML Run Token. 
+- Additional clearer error messages +- to_spark_dataframe() will no longer crash when Spark uses Kryo serialization +- Value Count Inspector can now show more than 1000 unique values +- Random Split no longer fails if the original Dataflow doesn’t have a name + +### 2018-11-19 (version 0.5.0) + +New features +- Created a new DataPrep CLI to execute DataPrep packages and view the data profile for a dataset or dataflow +- Redesigned SetColumnType API to improve usability +- Renamed smart_read_file to auto_read_file +- Now includes skew and kurtosis in the Data Profile +- Can sample with stratified sampling +- Can read from zip files that contain CSV files +- Can split datasets row-wise with Random Split (e.g. into test-train sets) +- Can get all the column data types from a dataflow or a data profile by calling .dtypes +- Can get the row count from a dataflow or a data profile by calling .row_count + +Bug fixes +- Fixed long to double conversion +- Fixed assert after any add column +- Fixed an issue with FuzzyGrouping, where it would not detect groups in some cases +- Fixed sort function to respect multi-column sort order +- Fixed and/or expressions to be similar to how Pandas handles them +- Fixed reading from dbfs path. 
+- Made error messages more understandable +- Now no longer fails when reading on remote compute target using AML token +- Now no longer fails on Linux DSVM +- Now no longer crashes when non-string values are in string predicates +- Now handles assertion errors when Dataflow should fail correctly +- Now supports dbutils mounted storage locations on Azure Databricks + +### 2018-11-05 (version 0.4.0) + +New features +- Type Count added to Data Profile +- Value Count and Histogram is now available +- More percentiles in Data Profile +- The Median is available in Summarize +- Python 3.7 is now supported +- When you save a dataflow that contains datastores to a Data Prep package, the datastore information will be persisted as part of the Data Prep package +- Writing to datastore is now supported + +Bug fixes +- 64bit unsigned integer overflows are now handled properly on Linux +- Fixed incorrect text label for plain text files in smart_read +- String column type now shows up in metrics view +- Type count now is fixed to show ValueKinds mapped to single FieldType instead of individual ones +- Write_to_csv no longer fails when path is provided as a string +- When using Replace, leaving “find” blank will no longer fail + +## Datasets License Information + +IMPORTANT: Please read the notice and find out more about this NYC Taxi and Limousine Commission dataset here: http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml + +IMPORTANT: Please read the notice and find out more about this Chicago Police Department dataset here: https://catalog.data.gov/dataset/crimes-2001-to-present-398a4 \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/case-studies/new-york-taxi/new-york-taxi.ipynb b/how-to-use-azureml/work-with-data/dataprep/case-studies/new-york-taxi/new-york-taxi.ipynb new file mode 100644 index 00000000..16578f1a --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/case-studies/new-york-taxi/new-york-taxi.ipynb @@ -0,0 
+1,508 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Cleaning up New York Taxi Cab data\n", + "Copyright (c) Microsoft Corporation. All rights reserved.
\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's use DataPrep to clean and featurize the data which can then be used to predict taxi trip duration. We will not use the For Hire Vehicle (FHV) datasets as they are not really taxi rides and they don't provide drop-off time and geo-coordinates." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import display\n", + "from os import path\n", + "from tempfile import mkdtemp\n", + "\n", + "import pandas as pd\n", + "import azureml.dataprep as dprep" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's take a quick peek at yellow cab data and green cab data to see what the data looks like. DataPrep supports globing, so you will notice below that we have added a `*` in the path.\n", + "\n", + "*We are using a small sample of the taxi data for this demo. You can find a bigger sample ~6GB by changing \"green-small\" to \"green-sample\" and \"yellow-small\" to \"yellow-sample\" in the paths below.*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pd.set_option('display.max_columns', None)\n", + "\n", + "cache_location = mkdtemp()\n", + "dataset_root = \"https://dprepdata.blob.core.windows.net/demo\"\n", + "\n", + "green_path = \"/\".join([dataset_root, \"green-small/*\"])\n", + "yellow_path = \"/\".join([dataset_root, \"yellow-small/*\"])\n", + "\n", + "print(\"Retrieving data from the following two sources:\")\n", + "print(green_path)\n", + "print(yellow_path)\n", + "\n", + "green_df = dprep.read_csv(path=green_path, header=dprep.PromoteHeadersMode.GROUPED)\n", + "yellow_df = dprep.auto_read_file(path=yellow_path)\n", + "\n", + "display(green_df.head(5))\n", + "display(yellow_df.head(5))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data 
Cleanup" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's define some shortcut transforms that will apply to all Dataflows." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_columns = dprep.ColumnSelector(term=\".*\", use_regex=True)\n", + "drop_if_all_null = [all_columns, dprep.ColumnRelationship(dprep.ColumnRelationship.ALL)]\n", + "useful_columns = [\n", + " \"cost\", \"distance\"\"distance\", \"dropoff_datetime\", \"dropoff_latitude\", \"dropoff_longitude\",\n", + " \"passengers\", \"pickup_datetime\", \"pickup_latitude\", \"pickup_longitude\", \"store_forward\", \"vendor\"\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's first work with the green taxi data and get it into a good shape that then can be combined with the yellow taxi data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tmp_df = (green_df\n", + " .replace_na(columns=all_columns)\n", + " .drop_nulls(*drop_if_all_null)\n", + " .rename_columns(column_pairs={\n", + " \"VendorID\": \"vendor\",\n", + " \"lpep_pickup_datetime\": \"pickup_datetime\",\n", + " \"Lpep_dropoff_datetime\": \"dropoff_datetime\",\n", + " \"lpep_dropoff_datetime\": \"dropoff_datetime\",\n", + " \"Store_and_fwd_flag\": \"store_forward\",\n", + " \"store_and_fwd_flag\": \"store_forward\",\n", + " \"Pickup_longitude\": \"pickup_longitude\",\n", + " \"Pickup_latitude\": \"pickup_latitude\",\n", + " \"Dropoff_longitude\": \"dropoff_longitude\",\n", + " \"Dropoff_latitude\": \"dropoff_latitude\",\n", + " \"Passenger_count\": \"passengers\",\n", + " \"Fare_amount\": \"cost\",\n", + " \"Trip_distance\": \"distance\"\n", + " })\n", + " .keep_columns(columns=useful_columns))\n", + "tmp_df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "green_df = tmp_df" + 
] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's do the same thing to yellow taxi data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tmp_df = (yellow_df\n", + " .replace_na(columns=all_columns)\n", + " .drop_nulls(*drop_if_all_null)\n", + " .rename_columns(column_pairs={\n", + " \"vendor_name\": \"vendor\",\n", + " \"VendorID\": \"vendor\",\n", + " \"vendor_id\": \"vendor\",\n", + " \"Trip_Pickup_DateTime\": \"pickup_datetime\",\n", + " \"tpep_pickup_datetime\": \"pickup_datetime\",\n", + " \"Trip_Dropoff_DateTime\": \"dropoff_datetime\",\n", + " \"tpep_dropoff_datetime\": \"dropoff_datetime\",\n", + " \"store_and_forward\": \"store_forward\",\n", + " \"store_and_fwd_flag\": \"store_forward\",\n", + " \"Start_Lon\": \"pickup_longitude\",\n", + " \"Start_Lat\": \"pickup_latitude\",\n", + " \"End_Lon\": \"dropoff_longitude\",\n", + " \"End_Lat\": \"dropoff_latitude\",\n", + " \"Passenger_Count\": \"passengers\",\n", + " \"passenger_count\": \"passengers\",\n", + " \"Fare_Amt\": \"cost\",\n", + " \"fare_amount\": \"cost\",\n", + " \"Trip_Distance\": \"distance\",\n", + " \"trip_distance\": \"distance\"\n", + " })\n", + " .keep_columns(columns=useful_columns))\n", + "tmp_df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "yellow_df = tmp_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's now append the rows from the `yellow_df` to `green_df`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "combined_df = green_df.append_rows(dataflows=[yellow_df])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's take a look at the pickup and drop-off coordinates' data profile to see how the data is distributed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "decimal_type = dprep.TypeConverter(data_type=dprep.FieldType.DECIMAL)\n", + "combined_df = combined_df.set_column_types(type_conversions={\n", + " \"pickup_longitude\": decimal_type,\n", + " \"pickup_latitude\": decimal_type,\n", + " \"dropoff_longitude\": decimal_type,\n", + " \"dropoff_latitude\": decimal_type\n", + "})\n", + "combined_df.keep_columns(columns=[\n", + " \"pickup_longitude\", \"pickup_latitude\", \n", + " \"dropoff_longitude\", \"dropoff_latitude\"\n", + "]).get_profile()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From the data profile, we can see that there are coordinates that are missing and coordinates that are not in New York. Let's filter out coordinates not in the [city border](https://mapmakerapp.com?map=5b60a055a191245990310739f658)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tmp_df = (combined_df\n", + " .drop_nulls(\n", + " columns=[\"pickup_longitude\", \"pickup_latitude\", \"dropoff_longitude\", \"dropoff_latitude\"],\n", + " column_relationship=dprep.ColumnRelationship(dprep.ColumnRelationship.ANY)\n", + " ) \n", + " .filter(dprep.f_and(\n", + " dprep.col(\"pickup_longitude\") <= -73.72,\n", + " dprep.col(\"pickup_longitude\") >= -74.09,\n", + " dprep.col(\"pickup_latitude\") <= 40.88,\n", + " dprep.col(\"pickup_latitude\") >= 40.53,\n", + " dprep.col(\"dropoff_longitude\") <= -73.72,\n", + " dprep.col(\"dropoff_longitude\") >= -74.09,\n", + " dprep.col(\"dropoff_latitude\") <= 40.88,\n", + " dprep.col(\"dropoff_latitude\") >= 40.53\n", + " )))\n", + "tmp_df.keep_columns(columns=[\n", + " \"pickup_longitude\", \"pickup_latitude\", \n", + " \"dropoff_longitude\", \"dropoff_latitude\"\n", + "]).get_profile()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ 
+ "combined_df = tmp_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's take a look at the data profile for the `store_forward` column." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "combined_df.keep_columns(columns='store_forward').get_profile()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From the data profile of `store_forward` above, we can see that the data is inconsistent and there are missing values. Let's fix them." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "combined_df = combined_df.replace(columns=\"store_forward\", find=\"0\", replace_with=\"N\").fill_nulls(\"store_forward\", \"N\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's now split the pick up and drop off datetimes into a date column and a time column. We will use `split_column_by_example` to perform the split. If the `example` parameter of `split_column_by_example` is omitted, we will automatically try to figure out where to split based on the data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tmp_df = (combined_df\n", + " .split_column_by_example(source_column=\"pickup_datetime\")\n", + " .split_column_by_example(source_column=\"dropoff_datetime\"))\n", + "tmp_df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "combined_df = tmp_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's rename the columns generated by `split_column_by_example` into meaningful names." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tmp_df = (combined_df\n", + " .rename_columns(column_pairs={\n", + " \"pickup_datetime_1\": \"pickup_date\",\n", + " \"pickup_datetime_2\": \"pickup_time\",\n", + " \"dropoff_datetime_1\": \"dropoff_date\",\n", + " \"dropoff_datetime_2\": \"dropoff_time\"\n", + " }))\n", + "tmp_df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "combined_df = tmp_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Feature Engineering" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Datetime features" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's split the pickup and drop-off date further into day of week, day of month, and month. For pickup and drop-off time columns, we will split it into hour, minute, and second." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tmp_df = (combined_df\n", + " .derive_column_by_example(\n", + " source_columns=\"pickup_date\", \n", + " new_column_name=\"pickup_weekday\", \n", + " example_data=[(\"2009-01-04\", \"Sunday\"), (\"2013-08-22\", \"Thursday\")]\n", + " )\n", + " .derive_column_by_example(\n", + " source_columns=\"dropoff_date\",\n", + " new_column_name=\"dropoff_weekday\",\n", + " example_data=[(\"2013-08-22\", \"Thursday\"), (\"2013-11-03\", \"Sunday\")]\n", + " )\n", + " .split_column_by_example(source_column=\"pickup_date\")\n", + " .split_column_by_example(source_column=\"pickup_time\")\n", + " .split_column_by_example(source_column=\"dropoff_date\")\n", + " .split_column_by_example(source_column=\"dropoff_time\")\n", + " .split_column_by_example(source_column=\"pickup_time_1\")\n", + " .split_column_by_example(source_column=\"dropoff_time_1\")\n", + " .drop_columns(columns=[\n", + " 
\"pickup_date\", \"pickup_time\", \"dropoff_date\", \"dropoff_time\", \n", + " \"pickup_date_1\", \"dropoff_date_1\", \"pickup_time_1\", \"dropoff_time_1\"\n", + " ])\n", + " .rename_columns(column_pairs={\n", + " \"pickup_date_2\": \"pickup_month\",\n", + " \"pickup_date_3\": \"pickup_monthday\",\n", + " \"pickup_time_1_1\": \"pickup_hour\",\n", + " \"pickup_time_1_2\": \"pickup_minute\",\n", + " \"pickup_time_2\": \"pickup_second\",\n", + " \"dropoff_date_2\": \"dropoff_month\",\n", + " \"dropoff_date_3\": \"dropoff_monthday\",\n", + " \"dropoff_time_1_1\": \"dropoff_hour\",\n", + " \"dropoff_time_1_2\": \"dropoff_minute\",\n", + " \"dropoff_time_2\": \"dropoff_second\"\n", + " }))\n", + "tmp_df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "combined_df = tmp_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From the data above, we can see that the pickup and drop-off date and time components produced from the transforms above looks good. Let's drop the `pickup_datetime` and `dropoff_datetime` columns as they are no longer needed." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tmp_df = combined_df.drop_columns(columns=[\"pickup_datetime\", \"dropoff_datetime\"])\n", + "tmp_df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "combined_df = tmp_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's now save the transformation steps into a DataPrep package so we can use it to to run on spark." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_path = path.join(mkdtemp(), \"new_york_taxi.dprep\")\n", + "combined_df.save(file_path=dflow_path)" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/case-studies/new-york-taxi/new-york-taxi_scale-out.ipynb b/how-to-use-azureml/work-with-data/dataprep/case-studies/new-york-taxi/new-york-taxi_scale-out.ipynb new file mode 100644 index 00000000..66e5fc00 --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/case-studies/new-york-taxi/new-york-taxi_scale-out.ipynb @@ -0,0 +1,129 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Scale-Out Data Preparation\n", + "Copyright (c) Microsoft Corporation. All rights reserved.
\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once we are done with preparing and featurizing the data locally, we can run the same steps on the full dataset in scale-out mode. The new york taxi cab data is about 300GB in total, which is perfect for scale-out. Let's start by downloading the package we saved earlier to disk. Feel free to run the `new_york_taxi_cab.ipynb` notebook to generate the package yourself, in which case you may comment out the download code and set the `package_path` to where the package is saved." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from tempfile import mkdtemp\n", + "from os import path\n", + "from urllib.request import urlretrieve\n", + "\n", + "dflow_root = mkdtemp()\n", + "dflow_path = path.join(dflow_root, \"new_york_taxi.dprep\")\n", + "print(\"Downloading Dataflow to: {}\".format(dflow_path))\n", + "urlretrieve(\"https://dprepdata.blob.core.windows.net/demo/new_york_taxi_v2.dprep\", dflow_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's load the package we just downloaded." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep\n", + "\n", + "df = dprep.Dataflow.open(dflow_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's replace the datasources with the full dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from uuid import uuid4\n", + "\n", + "other_step = df._get_steps()[7].arguments['dataflows'][0]['anonymousSteps'][0]\n", + "other_step['id'] = str(uuid4())\n", + "other_step['arguments']['path']['target'] = 1\n", + "other_step['arguments']['path']['resourceDetails'][0]['path'] = 'https://wranglewestus.blob.core.windows.net/nyctaxi/yellow_tripdata*'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "green_dsource = dprep.BlobDataSource(\"https://wranglewestus.blob.core.windows.net/nyctaxi/green_tripdata*\")\n", + "df = df.replace_datasource(green_dsource)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once we have replaced the datasource, we can now run the same steps on the full dataset. We will print the first 5 rows of the spark DataFrame. Since we are running on the full dataset, this might take a little while depending on your spark cluster's size." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spark_df = df.take(5).to_pandas_dataframe()\n", + "spark_df.head(5)" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + }, + "skip_execute_as_test": true + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/data/adls-dpreptestfiles.crt b/how-to-use-azureml/work-with-data/dataprep/data/adls-dpreptestfiles.crt new file mode 100644 index 00000000..98498f95 --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/data/adls-dpreptestfiles.crt @@ -0,0 +1,45 @@ +-----BEGIN PRIVATE KEY----- +MIIEvwIBADANBgkqhkiG9w0BAQEFAASCBKkwggSlAgEAAoIBAQDmkkyF0BwipZow +Wd1AMkRkySx0y079JPxpsYhv4i1xXKdoa9bpFqwoXmJpeQM1JWnU4UeZzFeM86qK +AhQvL4KV4kibcP2ENvu2NKFEdotO3uxPJ+6GlcYwMYzy+tUj008KnnRZfTrR78sJ +tIl3C6lnVL0ICihksG59P1sskRq3PvOjXLAdEZalwDjZ4ZPoNDZdj6nUjB2l8zqu +pKAt5mR+bJ9Sox4yrDuNhMmFt5QsRDRe3wUqdV+C9OCWHmjlmsjrYw7p9YmjBDvC +5U7mF0Mk/XeYFzj0pkXKQVqBL6xqig+q5ob0szYfg19iDeFhS3iIsRcJGEnRVW/A +NpsBZyKrAgMBAAECggEBANlvP8C1F8NInhZYuIAwpzTQTh86Fxw8g9h8dijkh2wv +LyQXBk07d1B+aZoDZ5X32UzKwcX04N9obfvFqBkzWZdVFJmZvUmwvEEActBoZkkT +io+/HX5HweVy5PPCvbsSK6jc8uXtZcnSs4tMeJIOKkvqqnTpd1w00Y1FcQqfMC16 +4p7o8wbt6OFoFAYqcxeVYVwDzCTLZD3+iJaqmntkBkoDndJy52yXQmMq5z1wbQVp +BL6+L9nTvmouy64jiHVSKOx8nnWThYfHsXoPv+rYywjeuK/v3hyaTAwogs36ooEn +SnuTBRvJcumN9Q0XIVlxKMVBcGyyAP+0yNKGz5NQgdECgYEA/I/Uq1E3epPJgEWR +Bub+LpCgwtrw/lgKncb/Q/AiE9qoXobUe4KNU8aGaNMb7uVNLckY7cOluLS6SQb3 +Mzwk2Jl0G3vk8rW46tZWvSYB8+zAR2Rz7seUOT9SE5OmvwpnHrnp3nRr1vvVd2bp 
+Q/ypwMLrwWQN51Kr+oTS74bUbrkCgYEA6bXVIUyao7z2Q3qAr6h+6JEWDbkJA7hJ +BjHIOXvxd1tMoJJX+X9+IE/2XoJaUkGCb0vrM/hi1cyQFmS4Or/J6IWSZu8oBpDr +EBmIK3PF1nrzNvWD28wM46c6ScehyWSm/u4bJWSm9liTX3dv5Kpa6ym7yLKc3c0B +ECpSJM+5SoMCgYEAq585Tukzn/IJPUcIk/4nv5C8DW0l0lAVdr2g/JOTNJajTwik +HwHJ86G1+Elsc9wRpAlBDWCjnm4BIFrBZGl8SEuOoJaCL4PZEotwCbxoG09IIbtb +JGkuifBDX9Y3ux3gkPqYt3e5SC99EVQ3MuHgoIJUHehVolmFUAkuJWIjvNECgYEA +5pU0VspRuELzZdgzpxvDOooLDDcHodfslGQBfFXBA1Xc4IACtHMJaa/7D3vkyUtA ++bYZtQjX2sEdWDq/WZdoCjXfIBfNkczhXt0R8G0lQFvGIu9QzUchYGrZo3mHMkBQ +Uy1xMw9/e4YgwQwCJcW+Nk7Sq00uX9enuN9IdHFOCykCgYAqAGMK6CH1tlpjvHrf +k+ZhigYxTXBlsVVvK1BIGGaiwzDpn65zeQp4aLOjSZkI1LuRi3tfTiZ321jRd64J +4lGk5Jurqv5grDmxROX/U50wEYbI9ncu/thU7syUdxDiqxHPI2RMG50mRcm3a55p +ZCNSqkMlcXyA0U1z8C1ILNUsbA== +-----END PRIVATE KEY----- +-----BEGIN CERTIFICATE----- +MIICoTCCAYkCAgPoMA0GCSqGSIb3DQEBBQUAMBQxEjAQBgNVBAMMCUNMSS1Mb2dp +bjAiGA8yMDE4MDcxMzIzMjA0N1oYDzIwMTkwNzEzMjMyMDQ5WjAUMRIwEAYDVQQD +DAlDTEktTG9naW4wggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDmkkyF +0BwipZowWd1AMkRkySx0y079JPxpsYhv4i1xXKdoa9bpFqwoXmJpeQM1JWnU4UeZ +zFeM86qKAhQvL4KV4kibcP2ENvu2NKFEdotO3uxPJ+6GlcYwMYzy+tUj008KnnRZ +fTrR78sJtIl3C6lnVL0ICihksG59P1sskRq3PvOjXLAdEZalwDjZ4ZPoNDZdj6nU +jB2l8zqupKAt5mR+bJ9Sox4yrDuNhMmFt5QsRDRe3wUqdV+C9OCWHmjlmsjrYw7p +9YmjBDvC5U7mF0Mk/XeYFzj0pkXKQVqBL6xqig+q5ob0szYfg19iDeFhS3iIsRcJ +GEnRVW/ANpsBZyKrAgMBAAEwDQYJKoZIhvcNAQEFBQADggEBAI4VlaFb9NsXMLdT +Cw5/pk0Xo2Qi6483RGTy8vzrw88IE7f3juB/JWG+rayjtW5bBRx2fae4/ZIdZ4zg +N2FDKn2PQPAc9m9pcKyUKUvWOC8ixSkrUmeQew0l1AXU0hsPSlJ7/7ZK4efoyB47 +hj71fsyKdyKbisZDcUFBq/S8PazdPF0YOD1W/4A2tW0cSMg+jmFWynuUTdWt3SU8 +CwBGqdiSKT5faJuYwIWnRXDEQS3ObRn1OFEfFdd4d2sxjxydWKRgnINnGlBdiFAT +KzCozVr+75cO2ErH6x5C0hLQGG5BxXbaijyxyvaRNokTMVVv6OaDEnjzCGfJ72Yf +2wgitNc= +-----END CERTIFICATE----- diff --git a/how-to-use-azureml/work-with-data/dataprep/data/chicago-aldermen-2015.csv b/how-to-use-azureml/work-with-data/dataprep/data/chicago-aldermen-2015.csv new file mode 100644 index 00000000..a0cae0ba --- /dev/null +++ 
b/how-to-use-azureml/work-with-data/dataprep/data/chicago-aldermen-2015.csv @@ -0,0 +1,54 @@ +"Retrieved from https://en.wikipedia.org/wiki/Chicago_City_Council on November 6, 2018" + + +Ward,Name,Took Office,Party +1,Proco Joe Moreno,2010*,Dem +2,Brian Hopkins,2015,Dem +3,Pat Dowell,2007,Dem +4,Sophia King,2016*,Dem +5,Leslie Hairston,1999,Dem +6,Roderick Sawyer,2011,Dem +7,Gregory Mitchell,2015,Dem +8,Michelle A. Harris,2006*,Dem +9,Anthony Beale,1999,Dem +10,Susie Sadlowski Garza,2015,Dem +11,Patrick Daley Thompson,2015,Dem +12,George Cardenas,2003,Dem +13,Marty Quinn,2011,Dem +14,Edward M. Burke,1969,Dem +15,Raymond Lopez,2015,Dem +16,Toni Foulkes,2007,Dem +17,David H. Moore,2015,Dem +18,Derrick Curtis,2015,Dem +19,Matthew O'Shea,2011,Dem +20,Willie Cochran,2007,Dem +21,Howard Brookins Jr.,2003,Dem +22,Ricardo Muñoz,1993*,Dem +23,Silvana Tabares,2018*,Dem +24,"Michael Scott, Jr.",2015,Dem +25,Daniel Solis,1996*,Dem +26,Roberto Maldonado,2009*,Dem +27,"Walter Burnett, Jr.",1995,Dem +28,Jason Ervin,2011*,Dem +29,Chris Taliaferro,2015,Dem +30,Ariel Reboyras,2003,Dem +31,Milly Santiago,2015,Dem +32,Scott Waguespack,2007,Dem +33,Deb Mell,2013*,Dem +34,Carrie Austin,1994*,Dem +35,Carlos Ramirez-Rosa,2015,Dem +36,Gilbert Villegas,2015,Dem +37,Emma Mitts,2000*,Dem +38,Nicholas Sposato,2011,Ind +39,Margaret Laurino,1994*,Dem +40,Patrick J. O'Connor,1983,Dem +41,Anthony Napolitano,2015,Rep +42,Brendan Reilly,2007,Dem +43,Michele Smith,2011,Dem +44,Thomas M. 
Tunney,2002*,Dem +45,John Arena,2011,Dem +46,James Cappleman,2011,Dem +47,Ameya Pawar,2011,Dem +48,Harry Osterman,2011,Dem +49,Joe Moore,1991,Dem +50,Debra Silverstein,2011,Dem diff --git a/how-to-use-azureml/work-with-data/dataprep/data/crime-dirty.csv b/how-to-use-azureml/work-with-data/dataprep/data/crime-dirty.csv new file mode 100644 index 00000000..ef7beb0b --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/data/crime-dirty.csv @@ -0,0 +1,15 @@ +File updated 11/2/2018 + + + +ID|Case Number|Date|Block|IUCR|Primary Type|Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|Updated On|Latitude|Longitude|Location +10140490|HY329907|07/05/2015 11:50:00 PM|050XX N NEWLAND AVE|0820|THEFT|$500 AND UNDER|STREET|false|false|1613|016|41|10|06|1129230|1933315|2015|07/12/2015 12:42:46 PM|41.973309466|-87.800174996|(41.973309466, -87.800174996) +10139776|HY329265|07/05/2015 11:30:00 PM|011XX W MORSE AVE|0460|BATTERY|SIMPLE|STREET|false|true|2431|024|49|1|08B|1167370|1946271|2015|07/12/2015 12:42:46 PM|42.008124017|-87.65955018|(42.008124017, -87.65955018) +10140270|HY329253|07/05/2015 11:20:00 PM|121XX S FRONT AVE|0486|BATTERY|DOMESTIC BATTERY SIMPLE|STREET|false|true|0532||9|53|08B|||2015|07/12/2015 12:42:46 PM||| +10139885|HY329308|07/05/2015 11:19:00 PM|051XX W DIVISION ST|0610|BURGLARY|FORCIBLE ENTRY|SMALL RETAIL STORE|false|false|1531|015|37|25|05|1141721|1907465|2015|07/12/2015 12:42:46 PM|41.902152027|-87.754883404|(41.902152027, -87.754883404) +10140379|HY329556|07/05/2015 11:00:00 PM|012XX W LAKE ST|0930|MOTOR VEHICLE THEFT|THEFT/RECOVERY: AUTOMOBILE|STREET|false|false|1215|012|27|28|07|1168413|1901632|2015|07/12/2015 12:42:46 PM|41.885610142|-87.657008701|(41.885610142, -87.657008701) +10140868|HY330421|07/05/2015 10:54:00 PM|118XX S PEORIA ST|1320|CRIMINAL DAMAGE|TO VEHICLE|VEHICLE NON-COMMERCIAL|false|false|0524|005|34|53|14|1172409|1826485|2015|07/12/2015 12:42:46 
PM|41.6793109|-87.644545209|(41.6793109, -87.644545209) +10139762|HY329232|07/05/2015 10:42:00 PM|026XX W 37TH PL|1020|ARSON|BY FIRE|VACANT LOT/LAND|false|false|0911|009|12|58|09|1159436|1879658|2015|07/12/2015 12:42:46 PM|41.825500607|-87.690578042|(41.825500607, -87.690578042) +10139722|HY329228|07/05/2015 10:30:00 PM|016XX S CENTRAL PARK AVE|1811|NARCOTICS|POSS: CANNABIS 30GMS OR LESS|ALLEY|true|false|1021|010|24|29|18|1152687|1891389|2015|07/12/2015 12:42:46 PM|41.857827814|-87.715028789|(41.857827814, -87.715028789) +10139774|HY329209|07/05/2015 10:15:00 PM|048XX N ASHLAND AVE|1310|CRIMINAL DAMAGE|TO PROPERTY|APARTMENT|false|false|2032|020|46|3|14|1164821|1932394|2015|07/12/2015 12:42:46 PM|41.970099796|-87.669324377|(41.970099796, -87.669324377) +10139697|HY329177|07/05/2015 10:10:00 PM|058XX S ARTESIAN AVE|1320|CRIMINAL DAMAGE|TO VEHICLE|ALLEY|false|false|0824|008|16|63|14|1160997|1865851|2015|07/12/2015 12:42:46 PM|41.787580282|-87.685233078|(41.787580282, -87.685233078) diff --git a/how-to-use-azureml/work-with-data/dataprep/data/crime-spring.csv b/how-to-use-azureml/work-with-data/dataprep/data/crime-spring.csv new file mode 100644 index 00000000..3750a186 --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/data/crime-spring.csv @@ -0,0 +1,11 @@ +ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location +10498554,HZ239907,4/4/2016 23:56,007XX E 111TH ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,OTHER,FALSE,FALSE,531,5,9,50,11,1183356,1831503,2016,5/11/2016 15:48,41.69283384,-87.60431945,"(41.692833841, -87.60431945)" +10516598,HZ258664,4/15/2016 17:00,082XX S MARSHFIELD AVE,890,THEFT,FROM BUILDING,RESIDENCE,FALSE,FALSE,614,6,21,71,6,1166776,1850053,2016,5/12/2016 15:48,41.74410697,-87.66449429,"(41.744106973, -87.664494285)" +10519196,HZ261252,4/15/2016 10:00,104XX S SACRAMENTO 
AVE,1154,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,RESIDENCE,FALSE,FALSE,2211,22,19,74,11,,,2016,5/12/2016 15:50,,, +10519591,HZ261534,4/15/2016 9:00,113XX S PRAIRIE AVE,1120,DECEPTIVE PRACTICE,FORGERY,RESIDENCE,FALSE,FALSE,531,5,9,49,10,,,2016,5/13/2016 15:51,,, +10534446,HZ277630,4/15/2016 10:00,055XX N KEDZIE AVE,890,THEFT,FROM BUILDING,"SCHOOL, PUBLIC, BUILDING",FALSE,FALSE,1712,17,40,13,6,,,2016,5/25/2016 15:59,,, +10535059,HZ278872,4/15/2016 4:30,004XX S KILBOURN AVE,810,THEFT,OVER $500,RESIDENCE,FALSE,FALSE,1131,11,24,26,6,,,2016,5/25/2016 15:59,,, +10499802,HZ240778,4/15/2016 10:00,010XX N MILWAUKEE AVE,1152,DECEPTIVE PRACTICE,ILLEGAL USE CASH CARD,RESIDENCE,FALSE,FALSE,1213,12,27,24,11,,,2016,5/27/2016 15:45,,, +10522293,HZ264802,4/15/2016 16:00,019XX W DIVISION ST,1110,DECEPTIVE PRACTICE,BOGUS CHECK,RESTAURANT,FALSE,FALSE,1424,14,1,24,11,1163094,1908003,2016,5/16/2016 15:48,41.90320604,-87.67636193,"(41.903206037, -87.676361925)" +10523111,HZ265911,4/15/2016 8:00,061XX N SHERIDAN RD,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,RESIDENCE,FALSE,FALSE,2433,24,48,77,11,,,2016,5/16/2016 15:50,,, +10525877,HZ268138,4/15/2016 15:00,023XX W EASTWOOD AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,FALSE,FALSE,1911,19,47,4,11,,,2016,5/18/2016 15:50,,, diff --git a/how-to-use-azureml/work-with-data/dataprep/data/crime-winter.csv b/how-to-use-azureml/work-with-data/dataprep/data/crime-winter.csv new file mode 100644 index 00000000..4c70d468 --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/data/crime-winter.csv @@ -0,0 +1,11 @@ +ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location +10378283,HZ114126,1/10/2016 11:00,033XX W IRVING PARK RD,610,BURGLARY,FORCIBLE ENTRY,RESIDENCE-GARAGE,TRUE,FALSE,1724,17,33,16,5,1153593,1926401,2016,5/22/2016 
15:51,41.95388599,-87.71077048,"(41.95388599, -87.710770479)" +10382154,HZ118288,1/10/2016 21:00,055XX S FRANCISCO AVE,1754,OFFENSE INVOLVING CHILDREN,AGG SEX ASSLT OF CHILD FAM MBR,RESIDENCE,FALSE,TRUE,824,8,14,63,2,1157983,1867874,2016,6/1/2016 15:51,41.79319349,-87.69622926,"(41.793193489, -87.696229255)" +10374287,HZ110730,1/10/2016 11:50,043XX W ARMITAGE AVE,5002,OTHER OFFENSE,OTHER VEHICLE OFFENSE,STREET,FALSE,TRUE,2522,25,30,20,26,1146917,1912931,2016,6/7/2016 15:55,41.91705356,-87.73565764,"(41.917053561, -87.735657637)" +10374662,HZ110403,1/10/2016 1:30,073XX S CLAREMONT AVE,497,BATTERY,AGGRAVATED DOMESTIC BATTERY: OTHER DANG WEAPON,STREET,FALSE,TRUE,835,8,18,66,04B,1162007,1855951,2016,2/4/2016 15:44,41.76039236,-87.68180481,"(41.760392356, -87.681804812)" +10374720,HZ110836,1/10/2016 7:30,079XX S RHODES AVE,890,THEFT,FROM BUILDING,OTHER,FALSE,FALSE,624,6,6,44,6,1181279,1852568,2016,2/4/2016 15:44,41.75068679,-87.61127681,"(41.75068679, -87.611276811)" +10375178,HZ110832,1/10/2016 14:20,057XX S KEDZIE AVE,460,BATTERY,SIMPLE,RESTAURANT,FALSE,FALSE,824,8,14,63,08B,1156029,1866379,2016,2/4/2016 15:44,41.78913051,-87.7034346,"(41.78913051, -87.703434602)" +10398695,HZ135279,1/10/2016 23:00,031XX S PARNELL AVE,620,BURGLARY,UNLAWFUL ENTRY,RESIDENCE-GARAGE,FALSE,FALSE,915,9,11,60,5,1173138,1884117,2016,2/4/2016 15:44,41.8374442,-87.64017699,"(41.837444199, -87.640176991)" +10402270,HZ138745,1/10/2016 11:00,051XX S ELIZABETH ST,620,BURGLARY,UNLAWFUL ENTRY,APARTMENT,FALSE,FALSE,934,9,16,61,5,,,2016,2/4/2016 6:53,,, +10380619,HZ116583,1/10/2016 9:41,091XX S PAXTON AVE,4387,OTHER OFFENSE,VIOLATE ORDER OF PROTECTION,RESIDENCE,TRUE,TRUE,413,4,7,48,26,1192434,1844707,2016,2/2/2016 15:56,41.72885134,-87.57065553,"(41.728851343, -87.570655525)" +10400131,HZ136171,1/10/2016 18:00,0000X W TERMINAL ST,810,THEFT,OVER $500,AIRPORT BUILDING NON-TERMINAL - SECURE AREA,FALSE,FALSE,1651,16,41,76,6,,,2016,2/2/2016 15:58,,, diff --git 
a/how-to-use-azureml/work-with-data/dataprep/data/crime.dprep b/how-to-use-azureml/work-with-data/dataprep/data/crime.dprep new file mode 100644 index 00000000..58a84196 --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/data/crime.dprep @@ -0,0 +1,204 @@ +{ + "id": "75637565-60ad-4baa-87d3-396a7930cfe7", + "blocks": [ + { + "id": "ba5a8061-129e-4618-953a-ce3e89c8f2cb", + "type": "Microsoft.DPrep.GetFilesBlock", + "arguments": { + "path": { + "target": 0, + "resourceDetails": [ + { + "path": "./crime-spring.csv" + } + ] + } + }, + "isEnabled": true, + "name": null, + "annotation": null + }, + { + "id": "1b345643-6b60-4ca1-99f9-2a64ae932a23", + "type": "Microsoft.DPrep.ParseDelimitedBlock", + "arguments": { + "columnHeadersMode": 1, + "fileEncoding": 0, + "handleQuotedLineBreaks": false, + "preview": false, + "separator": ",", + "skipRowsMode": 0 + }, + "isEnabled": true, + "name": null, + "annotation": null + }, + { + "id": "12cf73a2-1487-4915-bfa7-c86be7de08c0", + "type": "Microsoft.DPrep.SetColumnTypesBlock", + "arguments": { + "columnConversion": [ + { + "column": { + "type": 2, + "details": { + "selectedColumn": "ID" + } + }, + "typeProperty": 3 + }, + { + "column": { + "type": 2, + "details": { + "selectedColumn": "IUCR" + } + }, + "typeProperty": 3 + }, + { + "column": { + "type": 2, + "details": { + "selectedColumn": "Domestic" + } + }, + "typeProperty": 1 + }, + { + "column": { + "type": 2, + "details": { + "selectedColumn": "Beat" + } + }, + "typeProperty": 3 + }, + { + "column": { + "type": 2, + "details": { + "selectedColumn": "District" + } + }, + "typeProperty": 3 + }, + { + "column": { + "type": 2, + "details": { + "selectedColumn": "Ward" + } + }, + "typeProperty": 3 + }, + { + "column": { + "type": 2, + "details": { + "selectedColumn": "Community Area" + } + }, + "typeProperty": 3 + }, + { + "column": { + "type": 2, + "details": { + "selectedColumn": "Year" + } + }, + "typeProperty": 3 + }, + { + "column": { + "type": 2, + "details": { + 
"selectedColumn": "Longitude" + } + }, + "typeProperty": 3 + }, + { + "column": { + "type": 2, + "details": { + "selectedColumn": "Arrest" + } + }, + "typeProperty": 1 + }, + { + "column": { + "type": 2, + "details": { + "selectedColumn": "X Coordinate" + } + }, + "typeProperty": 3 + }, + { + "column": { + "type": 2, + "details": { + "selectedColumn": "Updated On" + } + }, + "typeArguments": { + "dateTimeFormats": [ + "%m/%d/%Y %I:%M:%S %p" + ] + }, + "typeProperty": 4 + }, + { + "column": { + "type": 2, + "details": { + "selectedColumn": "Date" + } + }, + "typeArguments": { + "dateTimeFormats": [ + "%m/%d/%Y %I:%M:%S %p" + ] + }, + "typeProperty": 4 + }, + { + "column": { + "type": 2, + "details": { + "selectedColumn": "Y Coordinate" + } + }, + "typeProperty": 3 + }, + { + "column": { + "type": 2, + "details": { + "selectedColumn": "Latitude" + } + }, + "typeProperty": 3 + } + ] + }, + "isEnabled": true, + "name": null, + "annotation": null + }, + { + "id": "dfd62543-9285-412b-a930-0aeaaffde699", + "type": "Microsoft.DPrep.HandlePathColumnBlock", + "arguments": { + "pathColumnOperation": 0 + }, + "isEnabled": true, + "name": null, + "annotation": null + } + ], + "inspectors": [] +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/data/crime.parquet b/how-to-use-azureml/work-with-data/dataprep/data/crime.parquet new file mode 100644 index 0000000000000000000000000000000000000000..dc06b73120f80a4d13b3791adc1f7d12c8ebe9f2 GIT binary patch literal 3607 zcma)C|F3u8=U$UC_VhDu7gTPl~CF)EHUDcvN-%wV>;*fCvbw__w(sbtGW z8>WQHrG(RVbtxw*Su471QmNIpDw|L1pk|+WXU5n));a$1KF{y_yZ@f&`3##8<_e?W zatO|b=EJ7oQxDcdp=|czQOKWROiXk%nS(VUg3G2d-G|DtKG?h@H&^dvLfyf*^nrl*F>@&dI+%Xv@5E&Nx++#+LA+4dN{Gk7BfLoXT8xj=O1_$_?*Gd-8#% z19C-m=wUghsWkI|AG>bX%X6#EMsGBof9kTeu?Fieaai0N{>0hY;ZbzQYnoX^eJi(i zPKfjG1xKx2Ja5@7Pp^hFp5uoH3pJi^V&Hc4q_y+e?@DTPAqHY*&@)I`e#})xXE$_~ zt=u*J#>z*dg=yEr*ZcS=mA{^FEm^p+RcJGC5smrr*+KpKyf!!@;vM=)cG}kXO9Y(X 
z+=R-6>B5$VJE0q+v%d^ul-_^N!U-N^Y*im>m|*Hf~dJ4L;N+NzY2-^(5EgMXnWRFMZ=m9iH3`;>WAbO_+xB+ z1ukD9b57gjf@Avk?`ii-q{(orsSZXO!=DA$#A%vdiH6@zZdiExAA}SaBDQ^A#zm<@ zzdt-?pGaY|>_q!$)b-B8hF(qFWhHBaMUQDMq!re<#V#}N8I?Bg6{1d`@s4dPmdJXr zoo}Dp)!N#8m0Dz!5q)kS`tWzL476L`mAU?V{O^IuP|e?&IR&%6Rp*$FfjcH|&+3pX zghm%~C5b=?O{I~%E?APk6$d5P%4f}ZQ@Q)Ha_^CjmKV=QpKOcV_BhhQ&%Zh44(4_e z-OpnwbDpte-?T$)kL?L-;O`qL73UA$*PmhR;@G_-QP#LJDytxO%|Jt2@YfqNwr^OK ziYx7%X?;`*H`fox*JLj<>Uk-QByT?|F`=br*!?V8cZ5*gr7X13l+oKl+1I3wy$B5t z|4kXQ7=%VTF>t9O)sd0;dSlTL(ROD7t!0@-T94AMP54Xmou3D-#}eoA{US%i=JZb| zMrqt|!mPC?7ew@S+Z^-T9=SWM=I~;A zxmdT9`g({NK^OQsR6r*Ki)u^`t&w-wvo|zo&%T{a*OoAKdLY)+$s44-H;Ke)P9*X} z4Ab3#@t;ukRnJ3P(>g5J57EJw0t6U+ZIJm%_{W+cfz6buZPJULp%ni2cb(F6c6eeE z{D>ZZ;ezv_y4@XAcG5q`OPQW|sUBjfvj%0%{hJ6U)X{9Xe3})*qZ^0+%H5KAYm>Zvu}wgH z!9H^GM-TjhWFqs7*h_}hSns`2*Q%*0u%7>aSU+MZv|q+e{|%NFn?GBNm<$c^WVIPiQFOqh)} zLkF=?FcEwh25=Jjw9qZ07`}8i(A5KbSQbadSBr5>p@h3xB~WQ$gTe#C)B++~DiCp` zNwn2T34E0tF~;I2a;1UUCWX3)DlvFL?qfG7RZul3@M+Z4gng~Kb4zevgP!^T8L5M^gr&+5K$7_=W zSyN2Sspex8dps2;n0xBsQRb>@!&r(br30eWLH6JsvA+sYVWLJlu>}Zr98GD100Ouy z3>DVXa7bIUzz_sv07${<45^!i5N|M)4`?6+#0Py{AV>qXp;SXa27odHoryoS6#+ZZ zlw34m0C?3<2O3W3Cc>3tD8p!g0IrUt<0frCqR@<{6rceM6gtUN7^TTlGZF~mMu`P7 z09-WI0pnvbj$AYbNRi2eG*zR6h}9J@G66u>bR950PzZDrq8x{S96-aSItUpvt(7te z$W(49I^ZwEr$tL3KmoLQhOP|bBZ)xe5TyVDasc(2>FoV!OcAgZqFjamqjGF90h~Cz zTCg6twE$qRr4B?!8UflN%2pr(@-C`wq$bQcLKUGYsh|>6fZey!nZawPU+XI!4dejo zvC%=gGohtY&;U?Twi61WJt+v3j-zB^fE=8f-dPk=l!FypeQP;|@@2HGd}*TU;&5|v Tb@Fg>Tj-5?1m0sd_(}K&3>y?D literal 0 HcmV?d00001 diff --git a/how-to-use-azureml/work-with-data/dataprep/data/crime.txt b/how-to-use-azureml/work-with-data/dataprep/data/crime.txt new file mode 100644 index 00000000..d6d8b8d7 --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/data/crime.txt @@ -0,0 +1,10 @@ +10140490 HY329907 7/5/2015 23:50 050XX N NEWLAND AVE 820 THEFT +10139776 HY329265 7/5/2015 23:30 011XX W MORSE AVE 460 BATTERY +10140270 HY329253 7/5/2015 23:20 121XX S FRONT AVE 486 BATTERY 
+10139885 HY329308 7/5/2015 23:19 051XX W DIVISION ST 610 BURGLARY +10140379 HY329556 7/5/2015 23:00 012XX W LAKE ST 930 MOTOR VEHICLE THEFT +10140868 HY330421 7/5/2015 22:54 118XX S PEORIA ST 1320 CRIMINAL DAMAGE +10139762 HY329232 7/5/2015 22:42 026XX W 37TH PL 1020 ARSON +10139722 HY329228 7/5/2015 22:30 016XX S CENTRAL PARK AVE 1811 NARCOTICS +10139774 HY329209 7/5/2015 22:15 048XX N ASHLAND AVE 1310 CRIMINAL DAMAGE +10139697 HY329177 7/5/2015 22:10 058XX S ARTESIAN AVE 1320 CRIMINAL DAMAGE diff --git a/how-to-use-azureml/work-with-data/dataprep/data/crime.xlsx b/how-to-use-azureml/work-with-data/dataprep/data/crime.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..21f200b4e38cb035371f6a27f7325b6af6302bca GIT binary patch literal 16109 zcmeHuRdih0uC19l#+WH)cFYWkF~*pgnIXo+%*@OfGcz+Yl__Rs##iY+=XQ6x?;H31 zzI*n^uBtIgb7*TzOH*1}vXbBs=paxaFd!fx#2~XyIM*JaARw?%ARuo*V8GOct*smk ztsJx!U2F{PHR-=tS`cMJfKg_GfC2yipX-0I1$q<*t-2UdT9ps@McR~lIogJTqn-#TUToG-z5w|3jx>t8669L4GG*5&^_@6o{D?@A>r+1M6MFj1@3IFpHPs2DS`T1I*PNLtvzUa<@o;k90BxhRKdq@H&jRJ zwM8dipu7d3kVajPrUGqkX0p#ORQPqh9o4$(h7dP%gjBsdd_|Ec#Y@klE-fSP2= zfG}49phH$2fop2HbGl2?wNX{hmmCx~C$xA`SnpJ$>1L%a z$>)X!Xw(?H4`D}QU0x&>#fRUoS;DBdvlg!Tf=f2Ced)Hj{ zR=biwE+*HA+s5}_dn-H2+B+r&!)*_q&3D&w-@?W&U2Zz;YCV`oN)wqcG8U!!Uj75QUov>w&ixSwWKB!1sQsI-%Sj*H zcNtrlkqcC;7zVR%nJRhp$CSs}(Z*J*9)MQ(d&qnp=i5yI29rK8?;!*K@x_9{+1k!r z&)VAjXOb)VtZtpdf$pho`U<|0=G_P>8zc8kM7^jvm@91zerf=cLIpCVRyx}1{i{cW zxEKRYSW{Y9AVdGorSsT*c;i!zHLkv-Z*Hozw8M`?C|&Y$N`TZTL%k#oGhz|rcbjgU zq~r*ZQ3J45_7(V zb(Na^u*aARMg%Bm=L86CTGP>6El28IGoT`oP&Ang=1YU9rF3iH^LpnK&Psy-B%%{j z;n__Pv$vK>8kX2tcWt2cx0H-GVCg6?173ajGTwRBIcjQf^!I9-AT=ID9y|PY7QR}3 zI$sHVZi(;fh}4Vizvz*7Ay>i4sQh5Tw)4b6^s0d>B|~XJ&W5D0D&uU{nD$@B&oy5! zdxWp&QV26;RVfw~#VE?(fyj6NsD&f4md9(UGbB~PbjRA{gkm2Sr! 
zb-RCwXP^226J-&X7N~Nh$d-qGp4db^1=jt!o|hg?#BS8~IPY3A^e3xPic8kyf~1Vz zMQ582zT$(n+vpeBAucy6zt%bc zG-D-NfV72v5x>i0a_k9PMQ*}G4jJnu(}>X{=hX&2spBdYWfdMo(^ASzD&u4zJ5XbxB4_bWynpU zsyp{-q3!Q8lTCxu@vjVEZ(?ZZV9)T|2h-0~kgBF>4QN2~ zXzzGN_FvNvLBAFx9n20t?r}^;u7^GOzCjrz2e^oftLfEyeMAx+n!{`SAg>!D{biv0 z$H)&8k)6BpjA7A3x>5E^3&$x;{PGnE67@WaW9|L? z=lT7wul7%*!|m2uQ#B^2-DdVvo>ak(<$#qDttrE*eS<>>(E){ch8S5WZX8tw1^2Z02G#S1P1LmF>GaXwM{6x3^C`iWoNmY~%!BRi zWf4Y;cGBp`m%S}_pQG#4hgGegT81vRbOjGzs1%vapGWrj#b2MylB-&ortSEKtEF!g z9ReH8Ev@LaogLYeER&KCzhZ^SuJCxIq=1|hZe=8qdDA5u&U*WOwv%%r;RS}0IR!Gc^pGkeF0AbZp^FC*O5bwrvZis_X?v%vdSL>qV zXYr`MS8HRr)>kSPac6GH=UuZtxuS!Vmg_`5^70V9&lkSE4%St_0=D3i@4cazs%ngxL>!jtD1X`gYv{(h0_;c4cN#7T9W{>lIEL9zWEL z;?0%od+QrJO4(V+_Pd889wu%q6oy0MahK~8_e+`7nFn&PmFY9}^UX7jR>G=tbL>QW zttT-cK+J0}p}5zutRwVoE&6AAg{34>}JuMI?yOi z?LhldO9sKoP|BDpQp;Il5OwYv8xd5~h4#duFCCg3yLq_@G>5|Wnp+0i){?TpfH{u| zz$`<86Wi#+^Bn=w&@drrXUn1AYJqd~d&yluPsxWwjD%3f9-srG=9rXFCr zb_>a&W5%>ke1?&O=?GRNT-qw;f?DDJU1?%6>eNlmFUy!f9!D}SJvDZa^N zhu(>*q7%1&QG%8Lp=`N!(fqs|#PoHLS%KDq4^T;2>s(P2!ilACOr8GtYPgCpKX7KD zb&tkyVBi{crn6k>8njy-5yU9wq>9`@=xIP^4&x!ex7ravpI2=QWH;Cfi(ucQnl0#( z(cyUMDj2IMbYnOBymdN^#2*jwQ_PjIdQ3aknZbqPf+-ZH(n5SbS$ylP{;gLZb~*uh$>e`6qZb!|P<=OB=`4efw(RHcF^{Ov)cb{f>=Vr0aJ3&ksR z1RTe@BM<~n?TwI^cqk$md&ayVBq;+n%nv+un|1^w;2S8)jOw8ViB7b<>9ka?Q}zB{ zZpl)!%3+ozLZ*vfkHlgap;m5DE<9WNRbenTcY7n9g#0UgtpEznz8&eJ}k zo!F+r_hY_sa^w zyUA_=DQ(A!s5aUL?Rw~nAbSkaNPvqfXr?+Kw6e`^w890b3&c~OTnM9*O64k-lp9G; zESF;pt7C{9ccJ>QP*Yy$2U9<%alX&1i&i(=xRbcaFLBQuGPkq_?Nsdjc8FfTdIJw?D^##ayD1KwmhSJ*~5ctv?_Vcv^)u?5OthlGw z90IRj;i8St>Z;$K(Q{kfz^?JxsHG)aWt?g|(^E&$ z6a8Y`9JleAS~fPVbH)ubd~GIlXZhy&y}rid1}t$0P>&(Dj^xP^@T$*%5_qn+jPFWt zXmH;)8RigGrn8{j3_^nIcK_7lr#WgB;GZ^Zk2BTlsd?0@MnQL$Tk+V#g(1}LD_+zC zdZUn{EX^u;t9mo>V@nO0IF+ZIL$O?ArT4usRu--DENZbtDtGNzf)jcTpsRufzlyaq zz0=~6Ie0Ux)N;q1m2yLd6US!?mmFsRfn5%5A*P!Dv*atdFXIPz@vGa$~P_tkh zEFz2E>p-%uYeZ@V_^R=;OTe;~740_pfns)ASOf+wX0=aNldr;mO0BicK`8@KtX9_) z9jB?b{-|=lEZ^!BG;y+UY5#B 
z-4vK|6f_u@J4gEl4`#0CmSWDJH9u(oKt7nU`(4^xkiBomDIV@%S!*qxe_GRSQ6c~r zSyj$p_@Ex^;n6RaUau7iAHZj*D)%hJ-sgX7uGUhfuF6-SzIGvl_G#&mmf^xcYXX7S z{xvOva9I85hs%nlP-aCE#kRay#EB#927)TR4^ zinBO9ZIO&rwSP#WiK!%(J{IGBq}@KEbSSZ;_kd~}LiT#z#Fc0if)Vs3an!i?RNkby zHjyAaJwY{{XT)^1-B%*NRGafz{Na}7y`h`%v>w}NwIh&rh}q3AEvL4f99S{0h?$MO z3hVhE?Cb+ZAZ%zPUif1~Dx%2iyPjUDXqRDFPKJ=RjU)Pxze0f%`vte*Av|1M30%9K z3(B}2tV@nZJWXe<#ehFsKW^>Wen4fbHrw8xwtWoYYQZXA;pL=;b>Nr$T4tW1E_!iB zmsT)MLQbB&!u;euP(_%(8JNt2)=1Bd9W_{!*qZn3z{hWeEGmNc+B~Dj7ZZH);p4M? z-f7L&mGr8sR7Ui?3)7%ydO`JWf;T!etAcP`cmJA?121`a^g%cRn?G_i2`A@Jbp~B< zJxTBPo)Q9E%CLTCBwo(sq1B;jrP?hlyJYnc??%xN}Us zmD609Hb*7lem|K9YX1jE+MNYuHl$D-7L<}kopjK74Q3$S=mpY^_u!qGxKQ4LK+5qd zg$;xB&?j&<;S6PV($>sH>BW7EutX2aSkkuj$L=t z6bzL6;tP&b56hI!Iox?wF*=XHvC{6E(?>8o#>&JilskX95H7EotU0aR7MNW3a-%;u zN8(S;f$}kJ;Zh{G2C2qHhxwM48i3Kgfa@9Ds@toj{SmeBFshKwO&9Sdj6ml%#<9Ni zP23xjyN0b0rORH8(6^FXpNMfQwMh6z&4%6W5E}+xoh9!Aw4o40r#uC77%yI5ZG?gE*Iz2#7jF`LFxNZg@|q*LppWVgV;?a5M!(<$ku-p<=-$lRiqs$9d!Nj0C_&9#J~lvAtJpL=)+|c1VJi=&FUcQgYrrlKgjfs+C}iS38!sHfpV*;yi}YJ z)58D}A7Oi3ucHCbcFxY%;iy@sn*3xO1pj6npTt=_bSG}|8EF15jH7PLxMnsZyL2_5 zfgjB3N^Hiz4i3)0Z4$f&QIA@G2wJ`m_>dwTHEgmguqgy=FwVRPJy)NharcM4n_Zfr zak-V{AR5!KFhE*rN3joS<`F_}m)L|mfV!ae!O_eC*{6(eqCCYX@tAt-+SZm@`M~wyNn@{_cdm=|~R1=y~N+lbwgM`ma$6)|y*x&o* zF_={vFHpgl*0U?zOGeIc0fcPy9nOp&D}A>pTMj=Ut&DDZtS_kIim3zwl@W0oRq4?9 zykc9f8R|Zh#0NQ-X^WfY zfr)bB9<}avx)j8sJ22niumQq2DwFDjh#^{xeJvO8bl$5LgRy;=7J@QEY@jWnfUW&r zEN*NBp1Z9OD`$t9ryE0{N-;!MF6H|u7NhXG% zWtNq6@b*z4KjAfmjQTTN6P*}WbJu4*(VC8L`Vkmvx$5XSf6^W_I);=xY7|*rXAL>Q zG`#l9^qC&FQJbr~{qi!y(#4kAx8mHc2$_0w{>^8s{*d__gs_k#eZN?Q0v}SUQ=goe z?Jk~QEr(rU9u4+YoY$EUw(^{O8jNTklpq`6UYEYz?wP#j&UF)+emCE&;DI$%cYoCi z%JfG+k*&O`876Mn-El#6S`Za`U=DLI6?`&uh+#o_NxbO#AV=^u{p#8!k!1{C0gGn5 zyZ@Nya}6BW+8Z(pNn#r@1EPrB@ZQtu~h(SfhFWVk4BCsoZbL1I)1$60I$WwqDzjT@jSiIlJ5KH8gTt$4lteoN6e#QPR8KJeTz7hv>XmN zy`i=+v+f(peLU#s6Fd?(>=1U?R1_6K0JpB2N3%rw^f1}ibh`Ocmj}hw^Qj;$D+aID zaLzX)N;^C3*@r;^C6ra7*T;UI;89bKW%F4O*Rh78*M28yVawL0=S^mY_n{}@!x``K 
z>XlWswgyXx5Ce9QcnNufFOpp^Nkd~^fh~@|x4}EHP})90gMjoA{oL>UtqI~_VrXf| z@Z0^j{>PE(Xb3JldOOwwKZ3pU)BDW`^40Zmi+CC4?ACxu#rPNuE`?*$h@^Od<#O2=Jg(*9z<(`gtlgNBYts&-+ZRx*9w0^gr;>-_;NuiK8@~xG@}CJ z!}&~fZT0SX$zv_q`YUr;5plZ)e|o=KGv~%741=z{@HeXnK7y^$RO51yh~JOo6{J*1 z*tgX}Pnh{YzJbY;iEj+{BFId@R5gT9a<2CP$fse@to_m6=`<)(AxeC{E9dii$tMl2 zV`R>a)6@x4!Ii)HqSrGhRrf@bIm=GOkk0G&a<+dN(9ZkyYVW>VyQ;C4TG9%dTtVSv zHbVRL`59|fyZzlU5Kt1(&ITOe_9WtP*1b~>Yh{K$?yP}mO9Zvmt)0o7XVMC53+@) z-F&f$@Vt>MJgNLcCFHVxs1vp`*LrdC@Y$J7y3|U{Wn)lt&_tB^3B5!K$n7pmrt*gT zVpxUY4q`A`P(DZJltO!@a47!wX5ql?d0yrXsfwVh6O?kFz-e+}I~{U)`Wj)qz~L@K zBHyKp(N)Xx?n3ifUUvghU7#L}o`FT%eZwa7a!2xSsnC-rHetcKPKqlfE zOm+F7PwOkW!xeD-R%7iY<@NVOzGKo$c{*gop&A`~_>r@?3!{}o3IV!sZ z7#hRd^H|FYV{2RQFPM};sB5|RZ$(xlG�Y6u9g?9=Ac@xSQb+-PGTVqR5$h^XU4+z#(9*cn4yYO<71`Lk36xpw#*%W-F|BJgAD|$$5yt8PpyMGD7|QF_tTZ(r{PAs~Yx}kk;JQ zBz?KVCc4Ca&gHYc*6iKuW3kb+7GJsHn0eBrTHp1}wV+v2i@Vp*yVHxkbeWdQZ5HC# zlVQl-W+7(}h?dDvXvRW-ToXm38R!_vUGn=x+T~bMx=q`6{1nVf5fcb%!iFmyslig$ zieFgXW$Nb!IMQHbOq4!;y8IFmHvP%Cv7{-v2p%%}{tJ~xm~etb{+=4|>1yfBq352W zvuXkq<>EB9u9Ok3oFlfJHj)s!ma(taVv@?FgUY1rK|+?=2it8pMoSEgFWWckqOIEj zf=hz_MtHHi`WOW@NiCseaAQdyTw_>@l0aqk7+8oR926FtY+cM8O^aJabLmJ!9je0^ zrx?Ale$aA{#y_g|H6;biFv~RRHR9T_9E2gGnv0T9G%4&Z9 zW)Hi2{tHwi>5Mv7wnBpwj(ED&owIH{4pvowNGz8|%%>qJrl>@>*d97r``&p&XUJr}C3uQ6Hk_0zMAj)g(ICC+eNfrR zqznxqw#;I3XVxJ-Phv6nO{>fbJK^k*J(TH=aegoL!Hh%9egO9t;`a;TnJVf$tLIO2 zc4g-DjYw#-E)p$2W;{Z`I~nORMwq2lTRwFMS69mcYFKDt5M*Z9?kpWYaPXd&QSc9> zji@;a&>l*p-8~E~x6B$rJA3iC+I;0Gkl&LPvQ8kSe3-%=h-K3WS&mT+gIR0gq@a8C zq_TiskyxOMeXHA!qUa-^gykxT_;LT18Ms30aQbGLe$B$ynmkl zWOJKt)DJZzL`r)EtMyX`3|QQBkeP^PM-lbN4UEiWZR>!+NFqd6`=ty1+< zMAwyyNj#pjR#HTyrCBT4Lw2}YyLxpPGjc=E9MwYTI0^2;dfeyCXCfMXooy-)gEo*` zv*x;2;J(w})9*W0F`2(22IlTG7PV*$2dlOH@CFD6 z_(hFi-KysyuE1n2LA^=NVM|O#sohD;y=)sGb+K|Nm2`t_5J?~(UzBXJXQp7~Ck4F# zM0y9_{SZAabs?Sd#je}^#FyHar&RZ$(JB^_$*a)3b3=9Cn%X`iwK_s#^MQLw0W=*X zhIOoV0!lR!iEG{kJ+368AViA+ytyOEFTbGxwTNb6GpP-k-pm@hv`)W*6(Ki9&}##-pu6UZqYRWVXMx 
zvsG=A_gYvT=8Vjy_jIX}VuQSEWDMx`mQ-ME%1ik+q?0I-wOFs3ZmQve(vDorYDrxTPHgU-H+9|}sO5>RI zT}C0lo5aUt3R>Oc+Q9ardqSx_V7vR@Wg)pTmjR>x>I5a=F#CHalQIE2%{xR z_5|fR>9qEk2+h5YF!iZ(W?QG!qfpmKS2XMg?2B#v6QlNuVHW#k0*kfXD2X)zow62d zqh^`X;xh6xAR|Y4n}0R-1QjCBXD}P{>>@F1!RzVaNb<~NdioTK&?KU9yu0X?3C^9N z`3BsYIgiDftu?%b7Ord7p{gaM%A|+abrY}|L$s&O^pQ_Zg+6Khpm0_$s}f7Z`k3Oe{6d>%0QX81Gj0VU_n68|K;o^x^{*J3J!LrR>r^iyt|TSL=GFeNBPXF zP8Od<-Y3SLZll=2Arxb8OdAL?mP`4dcMedF-_XTo?{D42->JdIcx)}q^WCOyri`?| zXt(CP2G`M~JP5KFsR)a( zhCVJmjks-EJzfi@J~bMxEY#gT%`d$m-~%YPa@UD^?yUxAm#;c{SLP_YsG6Isr5+HR zIN~9u9bBv_{Ln@@FbQ!{rr&ucQ5n1TWSS2shu_%M^x9CkR-Oz`h=S098A|=o2kFp=)lV zyM=p+t%nxP%r6!gdjYpa@oZ_j0NY}R&g(T6NWm=0m^j-uHWw|stKcPAj|Gn|R?GbS zRo)so<>9)oOLWy1_Yf{}4ogDvJ14sSd+-$|E`4m=89UA0Tk)=*N{lXlP~Qw$vKU3F$?GriL0g6z z2~d;o6kWSVaIK&8qHy{~h%jZFD;zB5>nb~~B5Ov^aiDO-@=f9fI8sO&W23~XJ3eMu zOtQ9Co4+$EHpfDZ@1z$4$H`Gd1pwYf5Gy;X4E7+m$P%F%zo8c8L69S1Ch)Z>ltS+v zd#9(j01+eOQfSF!q_&Ve*I$P}D`LmSw6)d)aVyp}YCT^25AQ6w7eU6C~Ne<$eKUE48{P(efQw4VW9vuw3;b76Nn4Gf_^XjNpK!frcJsv zH<6;vHz77$)xscom6YhMJ%d>?DKeaCl8)B(gdZdRRh7O*7AUdEuMMJT@e7%6s$bE} z8iaE}p`#DZ2t_hOmk=J>+6au^U~eem2Y6n37Mqc~5IhOI-sgbhzaY}J{o_m_w6PVwYK9)(vncEUwyI>{9!7@55 z9Z#h0G<{n#_@_@2LH&^`81R*Ta%9ig-JU&*mEo{Ok$gLg;z2HqgZCl06D7rFmKWO4 zH4L9>PieP2lzq5(&#Vi|` zXdack4GKw6+u$@(niaCp#T>Nmvq`kQE>ZYUY#?QLEw7&>b@KxtW8x%T&Awd@zMQ6j z4+X!?Sd;l_ULzr+d77i|+bPvk$xjmul1J-sW{dF-d_Erb1)AXFO``Jy9&G60$cnx5 zKRUtq`p23spaalifPhf^?gTc#wc5eZPQlQ@;pd{w@XK!|dOcICBgIjPWm#-qmdfM3 zd?fT`2f17t+dYJsqLt4{xlR&OzGN&7woDH4FrkByA|iR-cW?Q`66iH=K%1P1d!cP| zcJo;l;y0ekE3kE~r9Mjmfcn|XkY4mV@FVQeV1Diau+Ty{2+Uv%RUoI~CU8|P+2Ee$e?9~+>&guk?{m|^Vg`|~zF?soy3 z55(;zZ;zX{6t6;fAGZz``}3;j&XMOz&IzcUHQ>Hq#8oQabCnich@QQ`K90~kEr2T; z?^mrATCHb~Q+5;FDj>kTVC~y5q@x1f_49q$y1n=1l5wm`eUFpM2y@H+FR?7=Sf(aIYrL(VvR(abAjF4#-*2@9vE{1x1EUT1VqaShm*y*Plk6 z#Gn!lV5LxJ1)*X@m-W6tng;D%(?YIu_=?0TF|NAvq9;^fzRaf{taP+&oZ#G7Tnsf^ z7=(JE@KmS8%l>&&W40OXTa^ghJ6fA^%eo0s(9G<-I>L3d<(Px_n9KK$3wQL;_g@B; zC^|K327Av9(4N$hxAMh8pmF?6O6cqqv_6Iu9HSWu3l%WsF{N3+>N<=N^qpQ`^6wlL 
z9*eplp7WEWF$v)$T?euY%x%5g^HwqZ<8t(3F{BapGb;ge7RsOfP}j!hKW+##*}tyT zXhCZLBZA*4_zfY#3NGiOEP_`_wix?Wg)V4xo$=><{j55&z>=t|`G`8Xc~xXcSJHce zbp9B5<^H-sF0nmth90}95QG-QyvqQg<>RlKPbfIi=Xs{y0Ya%Dy>TxOqLvDQkprMc zct0vp-pgr72G#k`t$Q1eOJK;G2Lc=-vw#PBQgkxZ&&ZTrJgI(&txHleY@k#2^*Daa zK&10IzCZK)8jMgf>lhkV?SIE`O=zspAuD%tX_6&pM^Bv(@bdT+{)tQ3ZnZe=J)#AN zigLc7?1g2fRp*r?;BA8J_wYG<6*wy4+b(nUprc8mD-ORsDZ9Qm`qy=pI0x@Y$e&TB z-pqmcN>u6MA#E>nm#;XtzD3YT#eMvS>g#9U)0zWyW?lO}h>wH-`@<@#@%$y-uKSyP zqA(8oD_vZK^EG4ktgyZemBiw;G5#N#%K^B;XHEN;Sxd9;1PA+*=$Q~_=omQPdko~W zy?|_g-))^}4|zO4Kiu9Xd5ymuG-e@ROK*Gu?ymejqjuYLc5MP(*y;Ckc7Iw*9qk>g zE&t=i|Mp=JkeayBZ~$;Iy^VW7!gSLkath5Mo0jq{7Lzv4&DC^xquY?KQq{>!#$r0_ z%rGg%bx~E;BGU1LJ7HMY?p8jo6X&fuHm|6IfGmjG+s}IiR1bC9`r%-QzDn7wZ}a!S zFjq$TJQI1?^8NcTbif(1j?wYczW3{zY@<@^LVKz`CM~nZ*?2DS_`Ms4hm|Ci!7#yM z7W$CIqiRg8@fs&w?z&lhNL>`#GPQX6D0ysa9K(Jt8L3<*t8itG!w^55*ppZ~1)1-= zy6KSkF40D4fmgj``u>0&?%+_*AB>LL+M^l3L(mq8FeFUCMC^~+%|lFz$24Y}PuUx3 z$n$ABk&zymhOP=NmjCsr&+yqolLf#|k?4E6DgH(OQD#V2c}Vkz&a}UjO1fS8c4CBG zFe$ZNs(h4gK3Wk>9(72xmXwI5G!KutiJD)0+bQ#%B1RR{+?ExQ^envna^u;0e3xKI zGU`y}DJj>|8|Hj>sT{!}Dsk-BN%-eNnrG!#GPS*oPHitrw=Ah9`LV+$+FTpHWwi=` zx;n&VMgf#}m}eMOJN?<@4*{+EtBu}W(|!&Y+f0nYZm0TH!n*bF{AY_@3c}pg??-;z zt+uBEfG&*PF3p!b(5_o#{hR-a4Nx#TU{C0uPoe(d3cr8*hcl_Nl79vG>w&mG1JnU? 
z#-EPI{VMqDv7kRh_kof6%ORm(h5vdM;SW&|kV?4Ug#Vk93BTg}I+^|l(jmfsDe<>C z^RD<$&(O=t#zasqF#P|aNj`mj@{9{YwSHNF;0)GG^086L88ud?re{=|b1^l(N z`U4P|{x`s1>g#{3xBus$n2i4u(BG=JUqS!M{{PSu1O%TM1mthEz^~$eC2fBe&tv_Q r_`fLJuhM_bmw%QvVgHk~{(ogpSxHD>!2aZiC?E0o30umrdm0kqt5$Q!kk4P5@A`zr_ zk*Cs&AR>W)3Q`x}zW0v1dv@lWxo76bH~0Se&boPMkOaP|Cr^X=Ax3h;-y=54lDEwi($f+BT!c1;_MwGs1+2dS7=IU+vvwN=Ar zSK2+a-s*R8ST1(LwAF@&$fKY!OliqcAgjZf4^er7k?=1`x_4|qYZfb=E6v_LPf?O> z?q2)a6?lhDjeX03BN?GNYauAV9e%t~&#Rv6MtrprJ z5Zpp&&l*-ewlCk6FeqlUS&qocDS&peom}y}n7in%7Vu(nZ4;(!Ua+Hi{t333+?G-& z_pHzmgbwFr0r4`E^!0I{cHNPqwXtQVZ~~wX z%>A}>-9pu+u349etL+t&%YW)%u9Ma?W#`Y79R-FEkmr4>>wplQ zH?H4AvszOKBZQ(RJHycHDDQ$>`$40^Wto+A$&xI{gpO&B~;6&B38P$f@leMig}!5=n$UTMJ3lgL+2nY%$Uoa?P@=I z?lJqlqABe$lVJGRH|xsVo#l)*N?weC<>R_a=MeUM%?}prY-0?2W6?+gOWcLQm9ah} zQ=b&xxT|Mh94yVG3n`&R7mPLuWt*cg2M68}n8APb4yP%|MrUvR8sa&!K6I2O`AY~;^yH;Orky1@D@KJ#~kA(mxqL; zM>2hyQcV13`tq(T`&+Xwt#g!Q8Jf!IJ|5Rk&57O@{HXR-??=!c+Ui>SRpPB5yS}@? zOiPQ-hPip~jd|?B@oj1naj|^vRE5a2m(J(YGEAfV^7BT>o^R~FZ$iQ`!%(2@QB#D{ z)M365?)i5~?F2l&_wbl>ne^R<&Z;q1J7JpFZg~+gPu$8US48p$PIF&>gWW4S=Ob~D^~B8NXj)h0x$Qkl0PjXJ4$1GC-|uP z&K0*ejp(I7IQg80rGJA_6b+ApX@*25eS*nXjuZ1pv8|yHwLK?G9R=OJqh) zKYP9?=JHSjU%I4B3zkb-G2+e7$m9FiDmqrcG}d;zC$b(an0z6$ zPQ|Ngh{H6s1yeUP=$Ev6Y4At_a=)aQU6?RdUMg|Qq99qh7V9#HOEy0$ zNnvp(%bKMyyKdI*oN&cpMO&3D8@O%$sP{Uk+}>if=3=kWb-2PRMc`uNJ20b1^wY~J zwEW>zO$uJdG+gC%x%J+Rq1ta^ec$_87@+Gv;m*e|8926OcTt{;zy(k%pCx;oTs0io z|DCJ6P;Ay&cEr{J)m|HJ`m%lF&{vHNy^nWKBs$&r@j7A)A_*Ex zyiZfgpAb0WKX-RT&`{7??U^2FXFs?(F5rbNVf%3P>4(dqA!U{%%ZG7pb zl>@jSJg2KndyI-<*XuRs_udZIw_Sf82UBAF@M(zzg3EicFEJ7BMLE}%~& zUD$;=Q!NPXkg0bie&B0?rG*k6srW!YY$jkRJiP$MG&G^K%4wUzyjsVJnlAmA_&X$1(&e5erex^qVQ2H`#Usid* zhgqyPhka?37}jXJmYqa=Sd}$S-<)_TFRT`vAtGYA8a31?yC)-_Y>Y9#;P=iw+JrV< zOg`9YR;f0{YDiE2na5JA`H+FQEgX~1%8Rpw9|$NX^M$+X=!#e3OW0mAaZRv;rL#zn zB~-;#V2)-z4HtKz1FYTJ-)t?M*a!8Mhjx5D34D+G**V~U;NCBoeWSU#2y^=b6{jf< zDO0iAXkDFXX+5i0hlw5gddJvKO3D|a{)6L1FMJICaJ0XPv(%)30khK4>8QNpvf__>JIfH?;K{OcM0KHm#<|pZ 
z4_qFDF2QjzRL^F@=#q7BH+LV0JcS#S7Gft`oqmS&|#^)vyl1*@W^R@yfVhaM&|n3 zm!R$lNsE$aXj_1nwlQ?-d%2q4$m6ZKBXWcx8M!*(|4mHbv!3SH68Q7{ulb4guV*p< zC!kKb#{v~NN%r6Icl`f0OuyrAotf{?@$8@W>36f*p7^^P}gydKvd zZC2bkA$|ad{sTbbgg8{m!VXCdbx=OM2_UWdE^c@y#$ zki-L4u!bW?QL;~UVUmg_c%G*>Gryt zn_a?Wr+c68Y}j}AEi&Gq8wzs-F1a$dkuh09|9(uQ8u22(ar<*u;hJC}oy|MKDZ%@Bkd2avs z>q7JI(-+M1i)+UG?Bc>aVLp~Q+)w8kPv@H7oo@btj{h$!h#4eQ8~qSNNQf;Y^qxdN zBC43Rp6(&Z2@~WLKQ_pk22pqB+%VhB}wk1 zj;&K0UP741GOTiEPsE{QrEM$kKocs*yQ>>etfNEA1r#@-30MP>%0N=PbQs-0c62os zlbft8VfmN>hcyzFM{bs&DXw^^f|?pt5gN@_O|mHV0l$brk{C|PmdT?imT)MN$v8?? z4wFtu)1oABVKL2-k^~3J#r8-gO=O%vP0EN3fcb$8CbfPWQA_^6#KeO7)mVU+GIV|F z*#@kLX;m@=Gf7rXHO$c4E`7{a|#!8T2G2T9CWx=bz%|Hb>l-g&?ls>nGOr} zq&yGHS*^BTorOMN^v=o#u@vsX3$O!m&22q!C%>lr@ASu8&{bo=@BZITpCQnlqPYj$I`U^E}>^7oBPuFcx=;Vz~~0i|68n#m>nt zrYE~go$xx-JZZ&^-vY84r` zA_BE4RT=Gc>bOz0m7-D?ip#iBsTFFiB8sCOrPOt9UJ_m~VTzPr_uX^9@0@ebx#yga z7@ovK0lEo6mwKTta796>42%i@D6XC<4aZkZ3S80Q7I0sslj#1WnPRb1CfUWI=I7I^ z!)n7pg!_XJ&)dt!X(Rl~a;lNKF@0OR`^N1jxviyuUhk!?2@nM7Zz5lKL&drDTW8y9h8 zk8`O?Y@6j)`-v4`okD>|r<9ETfS{ivLg*fOfD3({`#on@l+=Bmu%RSeyzFluc~&0j zOuIeS+4D!~i{^Z8Z;5!rj$?_v%YO|o;r5bWKA!NQxLb6^je4wkdesq^;(sBF{Q@Ji z^^P&oNS#|`{RPrM%;_SoGXRB4Jpe`nYeleywuLr_#*AydeEj0olgY>zweFtx7&q_Sy>q*3 zTj!0gyIuEN>_%#6cD=t_0Z~fw#p!xa7z1K$JtHP6dIn}4o-|#QVA&PLV{XlfV@&|0 zC$>Cr(2x>{JIH~&Xhd+%+6JNK~|X>(Q92+@3Xm~t9-jU zyz}^%8SgFlwPnhvYcE=~;xl!fbI&-&rR9d!*)LqN+TCZ))A%)(JhPi2Vx~mJ#0rE_ zn8RCqE|(de5XnqVh>S`a#Lu39-c=h)5HMSPbDP1>6A0RB9vF1xWV>RnQ{+X@R9pI$ zL&b}hdpKa#*|L4q>n~aCpPsgkE4xMMo3xYaI)YQPr+|O&NaMjg)@LapN#52~dvnK3 zUB7c`O6d4P?~v1P-m>n#=;YGjcxs*SJ*g!>=msa438DmVfoR&as3bvbxDbX+D2z%Q z#1*-?TP_y^aCu_k*SISH#wx%hfe{2lfMqbWVo&*+a$T|17uP})oW6v<_$&-Bt^kOO z!z{A^?pbr)LdDUqTsW zL+wH8Tr0PJ@U~~i=(x!d$`PJD4m(CLz`JMT9~1oM<`;|l`5%N1a!XdI4fXlau|oc$ z@IR^&+fN|L*G9$Vg?;yY4A|b*(ty*G(Ks~4ygI-p#^p*W>*xtWX3I!n?Xnob8{8jV zz)u{uz{64|k$8R>iOaWaiTNwXgR*=;zk{sc&@ViFX9W(Ed4qLU+6PybuV&WXC6y_} zeeXK5)Cxi0-kHotXHPoSX1!TP524g0f8`cL0-dOz2XTky+P3TsF=#t*ee$5%T1Paj 
zW){9+Q_nO#*fus&6DR0%@Nz1=8#0gk>4CG9h_p?*S?G_Zb2 z#h|qI4+ntNv>r61zv0ywI-@@OKiFc7D_mK=O(zm4nw32fCn2L48fLDmT6(mjr>W_D z;@?{USfABChNh3%K#er8DZp`67@TPIz1ZaI&o$!_JT@SLox6B^J0uD^uw>`eo&UWO zY1x5m2NonfxfIHAa7-CptKl6E;uarWQ@FAy{i~Tv7tBvhAGx3XWZNo1Px&Vkrex&J zr@nZ!MshiR)JL{$vs3SY#}CVcZhcyG;|2h^{1G#W_%;r(W&f;-ps+j1O&B%n&_w4+ z|4nYP=#sEw*Tgx!fsdMOcJD4#?2d56ZE-v_x4zi1&SvbheW!jw%GE3H?9nW8E!^sP zj&rVST31O|#m4%!;v0W#YEyRD&*cS-$G|IAMXiZA64n0?z_DmM9bJk6+}w%1IfKU^ivNT2uB z_xojaU4q=Z!|2Fiy=C_t4^n%oRHtikkeA!~Y$h^O0?J<8}+$4|+vc}|)Z);qs$ydBau9ZOkJRv3103%^)+F73FN&G)&Xz_AUSjB{#9 zzP7Ch392Y5a719CCb>SN$I$X}C7O8|dD1y@l>~hUpmrW4j1(IQP#Rp=3nY2+jD&nu zrd$K-14@B=_w3C)Fr( zwMuo4ac`thEtTjtGF}k^NJ))UtK@lFrBrVhIYKVc>X+0=Wu8`}lo~$CGbI|CUg987 zt5o?pN^L<#xJE9~+eVF!h|LhFWwRQ>~FHb8v6)tu(VOa++MC(YtMzoGXJz zWf`Izg9Sopwl-e|??Q*R6{>Tx4Bxujpc?&xj?!IR8LrPzHkEY<(4Qdec7%b_3@Foh;PAsFdN%@(K>Q#-EZEIf43y%^r&7qS`q<&XJ`(*t z3Ce=uu>Su?clKTiVSqm3I72y^zS#;2gR!B{>Fo2i1Q(t!I8`S?{|14wVC>Z)nO$Lz z190mH}|wxXZ6f=7a}KO6ekjm`v*#cah1yGZmZ zD|j{-8^ZL(zBH170z**ylN{%XrsqIuFh)#U>|-dAH~16455mkji$Iw^bpS9Qu-I!y z5y1WjV+1!MP;La9k4@~e7y-tgml4>6K$#I}J`k~2k0yYxJVql0)V+FiKZV)M!)|pW zuDt*0;^4yw{XPQa!I?>>nOcXLBx{?GBK=dHZm)-4&M+>iO{bf(@zBT=GD<}`f zhSD$T)f~DDaAjcQ)tW4hBv%5P5(n>3oV@wj9N4Jl=gYMmjd~$RCD%$|IhJ4>?H9AO zu*I-~Sg;@DOSLR7mO?4dmgTX6(igLGBr2UKHWF?dj>**dnb~sqos*xPEzvE5AW-hb zGIFwl7B5-i)sHo4fdq-J_vB;^okZ6&3A9$J>uM5J?$8PFPD?Pnu6|u>9zNQ%mk*!C zq`wcJ#Mo6$r-k9O8OIBPAyci+mP>Nr#t_p(Hboc5&Qu0 zilC~Zwg^*P8c>8Nr3mN;0ad6UNQJniNFb5YA3jh8LP9ALf~pdQ7AWfM%d;lyKujc( zJCAej%-p&6%+ls+cMyQE5cOq*4jOvGV4(~Efchyb{VrsYunWAqTNn|3{JwST)&=ls zC5LWbPJ#1@!Cy;YFT~FwmZ*uAtgftBTv5SL2F21Ox&zYh?7P%7(9}eNJ*UrHm?n_n z1Vz+);i~W)&-4BnxjN_Ka9h`jD**KUwB&D-<(m?aQpL6^&Hh5ltuiL0aD* zC@nQ+j3YyRvt6k=BbIXAG^W@gaBtFzW}DmhG^iFb+2!>%uBN0vn$+CnbgHgyXdf6^`g!S8iKB#Aq~gE;g#m3W z0IjZ}(jp}$bt;sU26y+?iwOb%wDNH5n)zP|pjVv!Mh=aIaE{>ta}*wE<>8q`6amZk zriQ{&OpW@&2k(yS>0_}izh$uv~XrgRR5g8#HPqJx(e>|DCb zB4@27C!*rHo*r;Wg$D?hCJjE#bnk?@MhEtO_t_uWy}Ve$K<)2NnMttnG5xSPWWk<3 
z@-tbYv7z4nwQ?xX(j?-*uEBpxz?#m@r6NzB>UGecxpjYd_*$D#z0y55X630~hnT6m z>jBs)RWDUb+j(yFvJf8iCF@%2UViny2ktzl|?a$pdQK0fgHJt3WS z=4a#*Q@odrZRn2s*YjbPcpnhPxjIVM>;Z%c8pUol&Ud#Y!#+MX7ee%~5BIPMUK*a< z5Q_RY2m``oyt&KWt~5+-v0#{uZRzgZ(#tEI#U4KHj|F=Z!DvM3o8gW6S)rr*g+Tx* z4W4M2k0*kDX z!RV8?Gr#!fACZ`k>JhVecSPck$d57lsNN}Qy-M-vV?}6w&h-*;+sPvsoZ?ieyULV2 zUW5V2o~7!#i3ATYGOeR8!dQ$e-s6u-ey$slAIV$f&NH%@NgzlA?BI3jd`2_}tq^l*0^9wRf-*^MzviT%qQy3gpuw$i} zzFUH@V)7x}-I6SBj26?6rXt)6nTi-5P0cclkzx9iQgpc4v=*`L`t0aeLXV}lZs^e1d(wm^^%adBJCnr*fS8y1F| zJ+NDz#zm995D(jtWGKW6W(@@LWj2M=Rmj^i5PYJS zOL_Pd&LF&%3PpWjF7~-r|tdFhYSiail pE9a_s&gTzQFRrL$c}HdC;y?ux;3@;v0e@gVxCK8c;eSv${s&hv$<6=( literal 0 HcmV?d00001 diff --git a/how-to-use-azureml/work-with-data/dataprep/data/secrets.dprep b/how-to-use-azureml/work-with-data/dataprep/data/secrets.dprep new file mode 100644 index 00000000..bf156e3c --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/data/secrets.dprep @@ -0,0 +1,63 @@ +{ + "id": "b308e5b8-9b2a-47f8-9d32-0f542b4a34a4", + "name": "read_csv_duplicate_headers", + "blocks": [ + { + "id": "8d9ec228-6a4b-4abf-afb7-65f58dda1581", + "type": "Microsoft.DPrep.GetFilesBlock", + "arguments": { + "path": { + "target": 1, + "resourceDetails": [ + { + "path": "https://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv", + "sas": { + "id": "https://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv", + "secretType": "AzureMLSecret" + }, + "storageAccountName": null, + "storageAccountKey": null + } + ] + } + }, + "isEnabled": true, + "name": null, + "annotation": null + }, + { + "id": "4ad0460f-ec65-47c0-a0a4-44345404a462", + "type": "Microsoft.DPrep.ParseDelimitedBlock", + "arguments": { + "columnHeadersMode": 3, + "fileEncoding": 0, + "handleQuotedLineBreaks": false, + "preview": false, + "separator": ",", + "skipRows": 0, + "skipRowsMode": 0 + }, + "isEnabled": true, + "name": null, + "annotation": null + }, + { + "id": 
"1a3e11ba-5854-48da-aa47-53af61beb782", + "type": "Microsoft.DPrep.DropColumnsBlock", + "arguments": { + "columns": { + "type": 0, + "details": { + "selectedColumns": [ + "Path" + ] + } + } + }, + "isEnabled": true, + "name": null, + "annotation": null + } + ], + "inspectors": [] +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/add-column-using-expression.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/add-column-using-expression.ipynb new file mode 100644 index 00000000..a225aca6 --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/add-column-using-expression.ipynb @@ -0,0 +1,269 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Add Column using Expression\n", + "Copyright (c) Microsoft Corporation. All rights reserved.
\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With Azure ML Data Prep you can add a new column to data with `Dataflow.add_column` by using a Data Prep expression to calculate the value from existing columns. This is similar to using Python to create a [new script column](./custom-python-transforms.ipynb#New-Script-Column) except the Data Prep expressions are more limited and will execute faster. The expressions used are the same as for [filtering rows](./filtering.ipynb#Filtering-rows) and hence have the same functions and operators available.\n", + "

\n", + "Here we add additional columns. First we get input data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# loading data\n", + "dflow = dprep.auto_read_file('../data/crime-spring.csv')\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### `substring(start, length)`\n", + "Add a new column \"Case Category\" using the `substring(start, length)` expression to extract the prefix from the \"Case Number\" column." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "case_category = dflow.add_column(new_column_name='Case Category',\n", + " prior_column='Case Number',\n", + " expression=dflow['Case Number'].substring(0, 2))\n", + "case_category.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### `substring(start)`\n", + "Add a new column \"Case Id\" using the `substring(start)` expression to extract just the number from \"Case Number\" column and then convert it to numeric." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "case_id = dflow.add_column(new_column_name='Case Id',\n", + " prior_column='Case Number',\n", + " expression=dflow['Case Number'].substring(2))\n", + "case_id = case_id.to_number('Case Id')\n", + "case_id.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### `length()`\n", + "Using the length() expression, add a new numeric column \"Length\", which contains the length of the string in \"Primary Type\"." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_length = dflow.add_column(new_column_name='Length',\n", + " prior_column='Primary Type',\n", + " expression=dflow['Primary Type'].length())\n", + "dflow_length.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### `col(column1) + col(column2)`\n", + "Add a new column \"Total\" to show the result of adding the values in the \"FBI Code\" column to the \"Community Area\" column." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_total = dflow.add_column(new_column_name='Total',\n", + " prior_column='FBI Code',\n", + " expression=dflow['Community Area']+dflow['FBI Code'])\n", + "dflow_total.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### `col(column1) - col(column2)`\n", + "Add a new column \"Subtract\" to show the result of subtracting the values in the \"FBI Code\" column from the \"Community Area\" column." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_diff = dflow.add_column(new_column_name='Difference',\n", + " prior_column='FBI Code',\n", + " expression=dflow['Community Area']-dflow['FBI Code'])\n", + "dflow_diff.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### `col(column1) * col(column2)`\n", + "Add a new column \"Product\" to show the result of multiplying the values in the \"FBI Code\" column to the \"Community Area\" column." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_prod = dflow.add_column(new_column_name='Product',\n", + " prior_column='FBI Code',\n", + " expression=dflow['Community Area']*dflow['FBI Code'])\n", + "dflow_prod.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### `col(column1) / col(column2)`\n", + "Add a new column \"True Quotient\" to show the result of true (decimal) division of the values in \"Community Area\" column by the \"FBI Code\" column." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_true_div = dflow.add_column(new_column_name='True Quotient',\n", + " prior_column='FBI Code',\n", + " expression=dflow['Community Area']/dflow['FBI Code'])\n", + "dflow_true_div.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### `col(column1) // col(column2)`\n", + "Add a new column \"Floor Quotient\" to show the result of floor (integer) division of the values in \"Community Area\" column by the \"FBI Code\" column." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_floor_div = dflow.add_column(new_column_name='Floor Quotient',\n", + " prior_column='FBI Code',\n", + " expression=dflow['Community Area']//dflow['FBI Code'])\n", + "dflow_floor_div.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### `col(column1) % col(column2)`\n", + "Add a new column \"Mod\" to show the result of applying the modulo operation on the \"FBI Code\" column and the \"Community Area\" column." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_mod = dflow.add_column(new_column_name='Mod',\n", + " prior_column='FBI Code',\n", + " expression=dflow['Community Area']%dflow['FBI Code'])\n", + "dflow_mod.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### `col(column1) ** col(column2)`\n", + "Add a new column \"Power\" to show the result of applying the exponentiation operation when the base is the \"Community Area\" column and the exponent is \"FBI Code\" column." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_pow = dflow.add_column(new_column_name='Power',\n", + " prior_column='FBI Code',\n", + " expression=dflow['Community Area']**dflow['FBI Code'])\n", + "dflow_pow.head(5)" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/append-columns-and-rows.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/append-columns-and-rows.ipynb new file mode 100644 index 00000000..fa6f5e36 --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/append-columns-and-rows.ipynb @@ -0,0 +1,245 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Append Columns and Rows\n", + "Copyright (c) Microsoft Corporation. All rights reserved.
\n", + "Licensed under the MIT License.
\n", + "\n", + "Often the data we want does not come in a single dataset: they are coming from different locations, have features that are separated, or are simply not homogeneous. Unsurprisingly, we typically want to work with a single dataset at a time.\n", + "\n", + "Azure ML Data Prep allows the concatenation of two or more dataflows by means of column and row appends.\n", + "\n", + "We will demonstrate this by defining a single dataflow that will pull data from multiple datasets.\n", + "\n", + "## Table of Contents\n", + "[append_columns(dataflows)](#append_columns)
\n", + "[append_rows(dataflows)](#append_rows)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `append_columns(dataflows)`\n", + "We can append data width-wise, which will change some or all existing rows and potentially adding rows (based on an assumption that data in two datasets are aligned on row number).\n", + "\n", + "However we cannot do this if the reference dataflows have clashing schema with the target dataflow. Observe:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.dataprep import auto_read_file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = auto_read_file(path='../data/crime-dirty.csv')\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_chicago = auto_read_file(path='../data/chicago-aldermen-2015.csv')\n", + "dflow_chicago.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.dataprep import ExecutionError\n", + "try:\n", + " dflow_combined_by_column = dflow.append_columns([dflow_chicago])\n", + " dflow_combined_by_column.head(5)\n", + "except ExecutionError:\n", + " print('Cannot append_columns with schema clash!')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As expected, we cannot call `append_columns` with target dataflows that have clashing schema.\n", + "\n", + "We can make the call once we rename or drop the offending columns. In more complex scenarios, we could opt to skip or filter to make rows align before appending columns. Here we will choose to simply drop the clashing column." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_combined_by_column = dflow.append_columns([dflow_chicago.drop_columns(['Ward'])])\n", + "dflow_combined_by_column.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that the resultant schema has more columns in the first N records (N being the number of records in `dataflow` and the extra columns being the width of the schema of our reference dataflow, chicago, minus the `Ward` column). From the N+1th record onwards, we will only have a schema width matching that of the `Ward`-less chicago set.\n", + "\n", + "Why is this? As much as possible, the data from the reference dataflow(s) will be attached to existing rows in the target dataflow. If there are not enough rows in the target dataflow to attach to, we simply append them as new rows.\n", + "\n", + "Note that these are appends, not joins (for joins please reference [Join](join.ipynb)), so the append may not be logically correct, but will take effect as long as there are no schema clashes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Ward-less data after we skip the first N rows\n", + "dflow_len = dflow.row_count\n", + "dflow_combined_by_column.skip(dflow_len).head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `append_rows(dataflows)`\n", + "We can append data length-wise, which will only have the effect of adding new rows. No existing data will be changed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.dataprep import auto_read_file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = auto_read_file(path='../data/crime-dirty.csv')\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_spring = auto_read_file(path='../data/crime-spring.csv')\n", + "dflow_spring.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_chicago = auto_read_file(path='../data/chicago-aldermen-2015.csv')\n", + "dflow_chicago.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_combined_by_row = dflow.append_rows([dflow_chicago, dflow_spring])\n", + "dflow_combined_by_row.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that neither schema nor data has changed for the target dataflow.\n", + "\n", + "If we skip ahead, we will see our target dataflows' data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# chicago data\n", + "dflow_len = dflow.row_count\n", + "dflow_combined_by_row.skip(dflow_len).head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# crimes spring data\n", + "dflow_chicago_len = dflow_chicago.row_count\n", + "dflow_combined_by_row.skip(dflow_len + dflow_chicago_len).head(5)" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/assertions.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/assertions.ipynb new file mode 100644 index 00000000..38cea497 --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/assertions.ipynb @@ -0,0 +1,127 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Assertions\n", + "Copyright (c) Microsoft Corporation. All rights reserved.
\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Frequently, the data we work with while cleaning and preparing data is just a subset of the total data we will need to work with in production. It is also common to be working on a snapshot of a live dataset that is continuously updated and augmented.\n", + "\n", + "In these cases, some of the assumptions we make as part of our cleaning might turn out to be false. Columns that originally only contained numbers within a certain range might actually contain a wider range of values in later executions. These errors often result in either broken pipelines or bad data.\n", + "\n", + "Azure ML Data Prep supports creating assertions on data, which are evaluated as the pipeline is executed. These assertions enable us to verify that our assumptions on the data continue to be accurate and, when not, to handle failures in a clean way." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To demonstrate, we will load a dataset and then add some assertions based on what we can see in the first few rows." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.dataprep import auto_read_file\n", + "\n", + "dflow = auto_read_file('../data/crime-dirty.csv')\n", + "dflow.get_profile()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see there are latitude and longitude columns present in this dataset. By definition, these are constrained to specific ranges of values. We can assert that this is indeed the case so that if any records come through with invalid values, we detect them." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.dataprep import value\n", + "\n", + "dflow = dflow.assert_value('Latitude', (value <= 90) & (value >= -90), error_code='InvalidLatitude')\n", + "dflow = dflow.assert_value('Longitude', (value <= 180) & (value >= -180), error_code='InvalidLongitude')\n", + "dflow.keep_columns(['Latitude', 'Longitude']).get_profile()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Any assertion failures are represented as Errors in the resulting dataset. From the profile above, you can see that the Error Count for both of these columns is 1. We can use a filter to retrieve the error and see what value caused the assertion to fail." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.dataprep import col\n", + "\n", + "dflow_error = dflow.filter(col('Latitude').is_error())\n", + "error = dflow_error.head(10)['Latitude'][0]\n", + "print(error.originalValue)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Our assertion failed because we were not removing missing values from our data. At this point, we have two options: we can go back and edit our code to avoid this error in the first place or we can resolve it now. In this case, we will just filter these out." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.dataprep import LocalFileOutput\n", + "dflow_clean = dflow.filter(~dflow['Latitude'].is_error())\n", + "dflow_clean.get_profile()" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/auto-read-file.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/auto-read-file.ipynb new file mode 100644 index 00000000..3dc06fa3 --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/auto-read-file.ipynb @@ -0,0 +1,183 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Auto Read File\n", + "Copyright (c) Microsoft Corporation. All rights reserved.
\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Data Prep has the ability to load different kinds of text files. The `auto_read_file` entry point can take any text based file (including excel, json and parquet) and auto-detect how to parse the file. It will also attempt to auto-detect the types of each column and apply type transformations to the columns it detects.\n", + "\n", + "The result will be a Dataflow object that has all the steps added that are required to read the given file(s) and convert their columns to the predicted types. No parameters are required beyond the file path or `FileDataSource` object." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_auto = dprep.auto_read_file('../data/crime_multiple_separators.csv')\n", + "dflow_auto.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_auto1 = dprep.auto_read_file('../data/crime.xlsx')\n", + "dflow_auto1.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_auto2 = dprep.auto_read_file('../data/crime.parquet')\n", + "dflow_auto2.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Looking at the data, we can see that there are two empty columns either side of the 'Completed' column.\n", + "If we compare the dataframe to a few rows from the original file:\n", + "```\n", + "ID |CaseNumber| |Completed|\n", + "10140490 |HY329907| |Y|\n", + "10139776 |HY329265| |Y|\n", + "```\n", + "We can see that the `|`'s have disappeared in the dataframe. 
This is because `|` is a very common separator character in csv files, so `auto_read_file` guessed it was the column separator. For this data we actually want the `|`'s to remain and instead use space as the column separator.\n", + "\n", + "To achieve this we can use `detect_file_format`. It takes a file path or datasource object and gives back a `FileFormatBuilder` which has learnt some information about the supplied data.\n", + "This is what `auto_read_file` is using behind the scenes to 'learn' the contents of the given file and determine how to parse it. With the `FileFormatBuilder` we can take advantage of the intelligent learning aspect of `auto_read_file` but have the chance to modify some of the learnt information." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ffb = dprep.detect_file_format('../data/crime_multiple_separators.csv')\n", + "ffb_2 = dprep.detect_file_format('../data/crime.xlsx')\n", + "ffb_3 = dprep.detect_file_format('../data/crime_fixed_width_file.txt')\n", + "ffb_4 = dprep.detect_file_format('../data/json.json')\n", + "\n", + "print(ffb.file_format)\n", + "print(ffb_2.file_format)\n", + "print(ffb_3.file_format)\n", + "print(type(ffb_4.file_format))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After calling `detect_file_format` we get a `FileFormatBuilder` that has had `learn` called on it. This means the `file_format` attribute will be populated with a `Properties` object, it contains all the information that was learnt about the file. As we can see above different file types have corresponding file_formats detected. \n", + "Continuing with our delimited example we can change any of these values and then call `ffb.to_dataflow()` to create a `Dataflow` that has the steps required to parse the datasource." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ffb.file_format.separator = ' '\n", + "dflow = ffb.to_dataflow()\n", + "df = dflow.to_pandas_dataframe()\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The result is our desired dataframe with `|`'s included.\n", + "\n", + "If we refer back to the original data output by `auto_read_file`, the 'ID' column was also detected as numeric and converted to a number data type instead of remaining a string like in the data above.\n", + "We can perform type inference on our new dataflow using the `dataflow.builders` property. This property exposes different builders that can `learn` from a dataflow and `apply` the learning to produce a new dataflow, very similar to the pattern we used above for the `FileFormatBuilder`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ctb = dflow.builders.set_column_types()\n", + "ctb.learn()\n", + "ctb.conversion_candidates" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After learning `ctb.conversion_candidates` has been populated with information about the inferred types for each column, it is possible for there to be multiple candidate types per column, in this example there is only one type for each column.\n", + "\n", + "The candidates look correct, we only want to convert `ID` to be an integer column, so applying this `ColumnTypesBuilder` should result in a Dataflow with our columns converted to their respective types." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_converted = ctb.to_dataflow()\n", + "\n", + "df_converted = dflow_converted.to_pandas_dataframe()\n", + "df_converted" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/cache.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/cache.ipynb new file mode 100644 index 00000000..dfbf65ba --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/cache.ipynb @@ -0,0 +1,188 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Cache\n", + "Copyright (c) Microsoft Corporation. All rights reserved.
\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A Dataflow can be cached as a file on your disk during a local run by calling `dflow_cached = dflow.cache(directory_path)`. Doing this will run all the steps in the Dataflow, `dflow`, and save the cached data to the specified `directory_path`. The returned Dataflow, `dflow_cached`, has a Caching Step added at the end. Any subsequent runs on on the Dataflow `dflow_cached` will reuse the cached data, and the steps before the Caching Step will not be run again.\n", + "\n", + "Caching avoids running transforms multiple times, which can make local runs more efficient. Here are common places to use Caching:\n", + "- after reading data from remote\n", + "- after expensive transforms, such as Sort\n", + "- after transforms that change the shape of data, such as Sampling, Filter and Summarize\n", + "\n", + "Caching Step will be ignored during scale-out run invoked by `to_spark_dataframe()`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will start by reading in a dataset and applying some transforms to the Dataflow." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep\n", + "dflow = dprep.read_csv(path='../data/crime-spring.csv')\n", + "dflow = dflow.take_sample(probability=0.2, seed=7)\n", + "dflow = dflow.sort_asc(columns='Primary Type')\n", + "dflow = dflow.keep_columns(['ID', 'Case Number', 'Date', 'Primary Type'])\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we will choose a directory to store the cached data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from pathlib import Path\n", + "cache_dir = str(Path(os.getcwd(), 'dataflow-cache'))\n", + "cache_dir" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will now call `dflow.cache(directory_path)` to cache the Dataflow to your directory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_cached = dflow.cache(directory_path=cache_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we will check steps in the `dflow_cached` to see that all of the previous steps were cached." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "[s.step_type for s in dflow_cached._get_steps()]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We also check the data stored in the cache directory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.listdir(cache_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Running against `dflow_cached` will reuse the cached data and skip running all of the previous steps again." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_cached.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Adding additional steps to `dflow_cached` will also reuse the cache data and skip running the steps prior to the Cache Step." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_cached_take = dflow_cached.take(10)\n", + "dflow_cached_skip = dflow_cached.skip(10).take(10)\n", + "\n", + "df_cached_take = dflow_cached_take.to_pandas_dataframe()\n", + "df_cached_skip = dflow_cached_skip.to_pandas_dataframe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# shutil.rmtree will then clean up the cached data \n", + "import shutil\n", + "shutil.rmtree(path=cache_dir)" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/column-manipulations.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/column-manipulations.ipynb new file mode 100644 index 00000000..4b5d61d8 --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/column-manipulations.ipynb @@ -0,0 +1,557 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Column Manipulations\n", + "Copyright (c) Microsoft Corporation. All rights reserved.
\n", + "Licensed under the MIT License.
\n", + "\n", + "Azure ML Data Prep has many methods for manipulating columns, including basic CUD operations and several other more complex manipulations.\n", + "\n", + "This notebook will focus primarily on data-agnostic operations. For all other column manipulation operations, we will link to their specific how-to guide.\n", + "\n", + "## Table of Contents\n", + "[ColumnSelector](#ColumnSelector)
\n", + "[add_column](#add_column)
\n", + "[append_columns](#append_columns)
\n", + "[drop_columns](#drop_columns)
\n", + "[duplicate_column](#duplicate_column)
\n", + "[fuzzy_group_column](#fuzzy_group_column)
\n", + "[keep_columns](#keep_columns)
\n", + "[map_column](#map_column)
\n", + "[new_script_column](#new_script_column)
\n", + "[rename_columns](#rename_columns)
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ColumnSelector\n", + "`ColumnSelector` is a Data Prep class that allows us to select columns by name. The idea is to be able to describe columns generally instead of explicitly, using a search term or regex expression, with various options.\n", + "\n", + "Note that a `ColumnSelector` does not represent the columns they match themselves, but the selector of the described columns. Therefore if we use the same `ColumnSelector` on two different dataflows, we may get different results depending on the columns of each dataflow.\n", + "\n", + "Column manipulations that can utilize `ColumnSelector` will be noted in their respective sections in this book." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.dataprep import auto_read_file\n", + "dflow = auto_read_file(path='../data/crime-dirty.csv')\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "All parameters to a `ColumnSelector` are shown here for completeness. We will use `keep_columns` in our example, which will keep only the columns in the dataflow that we tell it to keep.\n", + "\n", + "In the below example, we match all columns with the letter 'i'. Because we set `ignore_case` to false and `match_whole_word` to false, then any column that contains 'i' or 'I' will be selected." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.dataprep import ColumnSelector\n", + "column_selector = ColumnSelector(term=\"i\",\n", + " use_regex=False,\n", + " ignore_case=True,\n", + " match_whole_word=False,\n", + " invert=False)\n", + "dflow_selected = dflow.keep_columns(column_selector)\n", + "dflow_selected.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we set `invert` to true, we get the opposite of what we matched earlier." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "column_selector = ColumnSelector(term=\"i\",\n", + " use_regex=False,\n", + " ignore_case=True,\n", + " match_whole_word=False,\n", + " invert=True)\n", + "dflow_selected = dflow.keep_columns(column_selector)\n", + "dflow_selected.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we change the search term to 'I' and set case sensitivity to true, we get only the handful of columns that contain an upper case 'I'." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "column_selector = ColumnSelector(term=\"I\",\n", + " use_regex=False,\n", + " ignore_case=False,\n", + " match_whole_word=False,\n", + " invert=False)\n", + "dflow_selected = dflow.keep_columns(column_selector)\n", + "dflow_selected.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And if we set `match_whole_word` to true, we get no results at all as there is no column called 'I'." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "column_selector = ColumnSelector(term=\"I\",\n", + " use_regex=False,\n", + " ignore_case=False,\n", + " match_whole_word=True,\n", + " invert=False)\n", + "dflow_selected = dflow.keep_columns(column_selector)\n", + "dflow_selected.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, the `use_regex` flag dictates whether or not to treat the search term as a regex. It can be combined still with the other options.\n", + "\n", + "Here we define all columns that begin with the capital letter 'I'." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "column_selector = ColumnSelector(term=\"I.*\",\n", + " use_regex=True,\n", + " ignore_case=True,\n", + " match_whole_word=True,\n", + " invert=False)\n", + "dflow_selected = dflow.keep_columns(column_selector)\n", + "dflow_selected.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## add_column" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Please see [add-column-using-expression](add-column-using-expression.ipynb)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## append_columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Please see [append-columns-and-rows](append-columns-and-rows.ipynb)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## drop_columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Data Prep supports dropping columns one or more columns in a single statement. Supports `ColumnSelector`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.dataprep import auto_read_file\n", + "dflow = auto_read_file(path='../data/crime-dirty.csv')\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that there are 22 columns to begin with. We will now drop the 'ID' column and observe that the resulting dataflow contains 21 columns." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_dropped = dflow.drop_columns('ID')\n", + "dflow_dropped.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also drop more than one column at once by passing a list of column names." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_dropped = dflow_dropped.drop_columns(['IUCR', 'Description'])\n", + "dflow_dropped.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## duplicate_column" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Data Prep supports duplicating columns one or more columns in a single statement.\n", + "\n", + "Duplicated columns are placed to the immediate right of their source column." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.dataprep import auto_read_file\n", + "dflow = auto_read_file(path='../data/crime-dirty.csv')\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We decide which column(s) to duplicate and what the new column name(s) should be with a key value pairing (dictionary)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_dupe = dflow.duplicate_column({'ID': 'ID2', 'IUCR': 'IUCR_Clone'})\n", + "dflow_dupe.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## fuzzy_group_column" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Please see [fuzzy-group](fuzzy-group.ipynb)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## keep_columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Data Prep supports keeping one or more columns in a single statement. The resulting dataflow will contain only the column(s) specified; dropping all the other columns. Supports `ColumnSelector`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.dataprep import auto_read_file\n", + "dflow = auto_read_file(path='../data/crime-dirty.csv')\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_keep = dflow.keep_columns(['ID', 'Date', 'Description'])\n", + "dflow_keep.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Similar to `drop_columns`, we can pass a single column name or a list of them." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_keep = dflow_keep.keep_columns('ID')\n", + "dflow_keep.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## map_column" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Data Prep supports string mapping. For a column containing strings, we can provide specific mappings from an original value to a new value, and then produce a new column that contains the mapped values.\n", + "\n", + "The mapped columns are placed to the immediate right of their source column." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.dataprep import auto_read_file\n", + "dflow = auto_read_file(path='../data/crime-dirty.csv')\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.dataprep import ReplacementsValue\n", + "replacements = [ReplacementsValue('THEFT', 'THEFT2'), ReplacementsValue('BATTERY', 'BATTERY!!!')]\n", + "dflow_mapped = dflow.map_column(column='Primary Type', \n", + " new_column_id='Primary Type V2',\n", + " replacements=replacements)\n", + "dflow_mapped.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## new_script_column" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Please see [custom-python-transforms](custom-python-transforms.ipynb)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## rename_columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Data Prep supports renaming one or more columns in a single statement." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.dataprep import auto_read_file\n", + "dflow = auto_read_file(path='../data/crime-dirty.csv')\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We decide which column(s) to rename and what the new column name(s) should be with a key value pairing (dictionary)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_renamed = dflow.rename_columns({'ID': 'ID2', 'IUCR': 'IUCR_Clone'})\n", + "dflow_renamed.head(5)" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/column-type-transforms.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/column-type-transforms.ipynb new file mode 100644 index 00000000..87f457e5 --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/column-type-transforms.ipynb @@ -0,0 +1,467 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Column Type Transforms\n", + "Copyright (c) Microsoft Corporation. 
All rights reserved.
\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When consuming a data set, it is highly useful to know as much as possible about the data. Column types can help you understand more about each column, and enable type-specific transformations later. This provides much more insight than treating all data as strings.\n", + "\n", + "In this notebook, you will learn about:\n", + "- [Built-in column types](#types)\n", + "- How to:\n", + " - [Convert to long (integer)](#long)\n", + " - [Convert to double (floating point or decimal number)](#double)\n", + " - [Convert to boolean](#boolean)\n", + " - [Convert to datetime](#datetime)\n", + "- [How to use `ColumnTypesBuilder` to get suggested column types and convert them](#builder)\n", + "- [How to convert column type for multiple columns if types are known](#multiple-columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set up" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dprep.read_csv('../data/crime-winter.csv')\n", + "dflow = dflow.keep_columns(['Case Number', 'Date', 'IUCR', 'Arrest', 'Longitude', 'Latitude'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Built-in column types" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Currently, Data Prep supports the following column types: string, long (integer), double (floating point or decimal number), boolean, and datetime.\n", + "\n", + "In the previous step, a data set was read in as a Dataflow, with only a few interesting columns kept. We will use this Dataflow to explore column types throughout the notebook." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From the first few rows of the Dataflow, you can see that the columns contain different types of data. However, by looking at `dtypes`, you can see that `read_csv()` treats all columns as string columns.\n", + "\n", + "Note that `auto_read_file()` is a data ingestion function that infers column types. Learn more about it [here](./auto-read-file.ipynb)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Converting to long (integer)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Suppose the \"IUCR\" column should only contain integers. You can call `to_long` to convert the column type of \"IUCR\" to `FieldType.INTEGER`. If you look at the data profile ([learn more about data profiles](./data-profile.ipynb)), you will see numeric metrics populated for that column such as mean, variance, quantiles, etc. This is helpful for understanding the shape and distribution of numeric data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_conversion = dflow.to_long('IUCR')\n", + "profile = dflow_conversion.get_profile()\n", + "profile" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Converting to double (floating point or decimal number)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Suppose the \"Latitude\" and \"Longitude\" columns should only contain decimal numbers. 
You can call `to_number` to convert the column type of \"Latitude\" and \"Longitude\"
Let's try the conversion again with different `false_values`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_conversion = dflow_conversion.to_bool('Arrest',\n", + " true_values=['1', 'TRUE'],\n", + " false_values=['0', 'FALSE'],\n", + " mismatch_as=dprep.MismatchAsOption.ASERROR)\n", + "dflow_conversion.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This time, all the string values 'FALSE' have been successfully converted to the boolean value `False`. Take another look at the data profile." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "profile = dflow_conversion.get_profile()\n", + "profile" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Suppose the \"Date\" column should only contain datetime values. You can convert its column type to `FieldType.DateTime` using the `to_datetime` function. Typically, datetime formats can be confusing or inconsistent. Next, we will show you all the tools that can help correctly converting the column to `DateTime`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the first example, directly call `to_datetime` with only the column name. Data Prep will inspect the data in this column and learn what format should be used for the conversion.\n", + "\n", + "Note that if there is data in the column that cannot be converted to datetime, an Error value will be created in that cell." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_conversion_date = dflow_conversion.to_datetime('Date')\n", + "dflow_conversion_date.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this case, we can see that '1/10/2016 11:00' was converted using the format `%m/%d/%Y %H:%M`.\n", + "\n", + "The data in this column is actually somewhat ambiguous. Should the dates be 'October 1' or 'January 10'? The function `to_datetime` determines that both are possible, but defaults to month-first (US format).\n", + "\n", + "If the data was supposed to be day-first, you can customize the conversion." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_alternate_conversion = dflow_conversion.to_datetime('Date', date_time_formats=['%d/%m/%Y %H:%M'])\n", + "dflow_alternate_conversion.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using `ColumnTypesBuilder`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Data Prep can help you automatically detect what are the likely column types.\n", + "\n", + "You can call `dflow.builders.set_column_types()` to get a `ColumnTypesBuilder`. Then, calling `learn()` on it will trigger Data Prep to inspect the data in each column. As a result, you can see the suggested column types for each column (conversion candidates)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "builder = dflow.builders.set_column_types()\n", + "builder.learn()\n", + "builder" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this case, Data Prep suggested the correct column types for \"Arrest\", \"Case Number\", \"Latitude\", and \"Longitude\".\n", + "\n", + "However, for \"Date\", it has suggested two possible date formats: month-first, or day-first. The ambiguity must be resolved before you complete the conversion. To use the month-first format, you can call `builder.ambiguous_date_conversions_keep_month_day()`. Otherwise, call `builder.ambiguous_date_conversions_keep_day_month()`. Note that if there were multiple datetime columns with ambiguous date conversions, calling one of these functions will apply the resolution to all of them.\n", + "\n", + "If you want to skip all the ambiguous date column conversions instead, you can call: `builder.ambiguous_date_conversions_drop()`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "builder.ambiguous_date_conversions_keep_month_day()\n", + "builder.conversion_candidates" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The conversion candidate for \"IUCR\" is currently `FieldType.INTEGER`. If you know that \"IUCR\" should be floating point (called `FieldType.DECIMAL`), you can tweak the builder to change the conversion candidate for that specific column. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "builder.conversion_candidates['IUCR'] = dprep.FieldType.DECIMAL\n", + "builder" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this case we are happy with \"IUCR\" as `FieldType.INTEGER`. So we set it back. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "builder.conversion_candidates['IUCR'] = dprep.FieldType.INTEGER\n", + "builder" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once you are happy with the conversion candidates, you can complete the conversion by calling `builder.to_dataflow()`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_converion_using_builder = builder.to_dataflow()\n", + "dflow_converion_using_builder.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Convert column types for multiple columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you already know the column types, you can simply call `dflow.set_column_types()`. This function allows you to specify multiple columns, and the desired column type for each one. Here's how you can convert all five columns at once.\n", + "\n", + "Note that `set_column_types` only supports a subset of column type conversions. For example, we cannot specify the true/false values for a boolean conversion, so the results of this operation is incorrect for the \"Arrest\" column." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_conversion_using_set = dflow.set_column_types({\n", + " 'IUCR': dprep.FieldType.INTEGER,\n", + " 'Latitude': dprep.FieldType.DECIMAL,\n", + " 'Longitude': dprep.FieldType.DECIMAL,\n", + " 'Arrest': dprep.FieldType.BOOLEAN,\n", + " 'Date': (dprep.FieldType.DATE, ['%m/%d/%Y %H:%M']),\n", + "})\n", + "dflow_conversion_using_set.head(5)" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/custom-python-transforms.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/custom-python-transforms.ipynb new file mode 100644 index 00000000..59c0b8d8 --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/custom-python-transforms.ipynb @@ -0,0 +1,225 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Custom Python Transforms\n", + "Copyright (c) Microsoft Corporation. All rights reserved.
\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There will be scenarios when the easiest thing for you to do is just to write some Python code. This SDK provides three extension points that you can use.\n", + "\n", + "1. New Script Column\n", + "2. New Script Filter\n", + "3. Transform Partition\n", + "\n", + "Each of these are supported in both the scale-up and the scale-out runtime. A key advantage of using these extension points is that you don't need to pull all of the data in order to create a dataframe. Your custom python code will be run just like other transforms, at scale, by partition, and typically in parallel." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initial data prep" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We start by loading crime data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep\n", + "col = dprep.col\n", + "\n", + "dflow = dprep.read_csv(path='../data/crime-spring.csv')\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We trim the dataset down and keep only the columns we are interested in. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dflow.keep_columns(['Case Number','Primary Type', 'Description', 'Latitude', 'Longitude'])\n", + "dflow = dflow.replace_na(columns=['Latitude', 'Longitude'], custom_na_list='')\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We look for null values using a filter. We found some, so now we'll look at a way to fill these missing values." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow.filter(col('Latitude').is_null()).head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Transform Partition" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We want to replace all null values with a 0, so we decide to use a handy pandas function. This code will be run by partition, not on all of the dataset at a time. This means that on a large dataset, this code may run in parallel as the runtime processes the data partition by partition." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pt_dflow = dflow\n", + "dflow = pt_dflow.transform_partition(\"\"\"\n", + "def transform(df, index):\n", + " df['Latitude'].fillna('0',inplace=True)\n", + " df['Longitude'].fillna('0',inplace=True)\n", + " return df\n", + "\"\"\")\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transform Partition With File" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Being able to use any python code to manipulate your data as a pandas DataFrame is extremely useful for complex and specific data operations that DataPrep doesn't handle natively. Though the code isn't very testable unfortunately, it's just sitting inside a string.\n", + "So to improve code testability and ease of script writing there is another transform_partiton interface that takes the path to a python script which must contain a function matching the 'transform' signature defined above.\n", + "\n", + "The `script_path` argument should be a relative path to ensure Dataflow portability. Here `map_func.py` contains the same code as in the previous example." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = pt_dflow.transform_partition_with_file('../data/map_func.py')\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## New Script Column" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We want to create a new column that has both the latitude and longitude. We can achieve it easily using [Data Prep expression](./add-column-using-expression.ipynb), which is faster in execution. Alternatively, We can do this using Python code by using the `new_script_column()` method on the dataflow. Note that we use custom Python code here for demo purpose only. In practise, you should always use Data Prep native functions as a preferred method, and use custom Python code when the functionality is not available in Data Prep. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dflow.new_script_column(new_column_name='coordinates', insert_after='Longitude', script=\"\"\"\n", + "def newvalue(row):\n", + " return '(' + row['Latitude'] + ', ' + row['Longitude'] + ')'\n", + "\"\"\")\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## New Script Filter" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we want to filter the dataset down to only the crimes that incurred over $300 in loss. We can build a Python expression that returns True if we want to keep the row, and False to drop the row." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dflow.new_script_filter(\"\"\"\n", + "def includerow(row):\n", + " val = row['Description']\n", + " return 'OVER $ 300' in val\n", + "\"\"\")\n", + "dflow.head(5)" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/data-ingestion.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/data-ingestion.ipynb new file mode 100644 index 00000000..7444938c --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/data-ingestion.ipynb @@ -0,0 +1,908 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data Ingestion\n", + "Copyright (c) Microsoft Corporation. All rights reserved.
\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Data Prep has the ability to load different types of input data. You can use auto-reading functionality to detect the type of a file, or directly specify a file type and its parameters." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Table of Contents\n", + "[Read Lines](#lines)
\n", + "[Read CSV](#csv)
\n", + "[Read Compressed CSV](#compressed-csv)
\n", + "[Read Excel](#excel)
\n", + "[Read Fixed Width Files](#fixed-width)
\n", + "[Read Parquet](#parquet)
\n", + "[Read Part Files Using Globbing](#globbing)
\n", + "[Read JSON](#json)
\n", + "[Read SQL](#sql)
\n", + "[Read From Azure Blob](#azure-blob)
\n", + "[Read From ADLS](#adls)
\n", + "[Read Pandas DataFrame](#pandas-df)
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read Lines" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "One of the simplest ways to read data using Data Prep is to just read it as text lines." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dprep.read_lines(path='../data/crime.txt')\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With ingestion done, you can go ahead and start prepping the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = dflow.to_pandas_dataframe()\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read CSV" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When reading delimited files, you can let the underlying runtime infer the parsing parameters (e.g. separator, encoding, whether to use headers, etc.) simply by not providing them. In this case, you can read a file by specifying only its location, then retrieve the first 10 rows to evaluate the result." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_duplicate_headers = dprep.read_csv(path='../data/crime_duplicate_headers.csv')\n", + "dflow_duplicate_headers.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From the result, you can see that the delimiter and encoding were correctly detected. Column headers were also detected. However, the first line seems to be a duplicate of the column headers. One of the parameters is a number of lines to skip from the files being read. 
You can use this to filter out the duplicate line." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_skip_headers = dprep.read_csv(path='../data/crime_duplicate_headers.csv', skip_rows=1)\n", + "dflow_skip_headers.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now the data set contains the correct headers and the extraneous row has been skipped by read_csv. Next, look at the data types of the columns." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_skip_headers.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Unfortunately, all of the columns came back as strings. This is because, by default, Data Prep will not change the type of the data. Since the data source is a text file, all values are kept as strings. In this case, however, numeric columns should be parsed as numbers. To do this, set the `inference_arguments` parameter to a new instance of the `InferenceArguments` class, which will trigger type inference to be performed.\n", + "Note that setting inference arguments at this step also requires you to choose a strategy for dealing with ambiguous dates. The example below shows the month before day option." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_inferred_types = dprep.read_csv(path='../data/crime_duplicate_headers.csv',\n", + " skip_rows=1,\n", + " inference_arguments=dprep.InferenceArguments(day_first=False))\n", + "dflow_inferred_types.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now several of the columns were correctly detected as numbers and their `FieldType` is Decimal.\n", + "\n", + "With ingestion done, the data set is ready to start preparing." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = dflow_inferred_types.to_pandas_dataframe()\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read Compressed CSV" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Data Prep can also read delimited files compressed in an archive. The `archive_options` parameter specifies the type of archive and glob pattern of entries in the archive.\n", + "\n", + "At this moment, only reading from ZIP archives is supported." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.dataprep import ArchiveOptions, ArchiveType\n", + "\n", + "dflow = dprep.read_csv(path='../data/crime.zip',\n", + " archive_options=ArchiveOptions(archive_type=ArchiveType.ZIP, entry_glob='*10-20.csv'))\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read Excel" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Data Prep can also load Excel files using the `read_excel` method." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_default_sheet = dprep.read_excel(path='../data/crime.xlsx')\n", + "dflow_default_sheet.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, the first sheet of the Excel document has been loaded. You could achieve the same result by specifying the name of the desired sheet explicitly." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_second_sheet = dprep.read_excel(path='../data/crime.xlsx', sheet_name='Sheet2')\n", + "dflow_second_sheet.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As you can see, the table in the second sheet had headers as well as three empty rows, so you can modify the arguments accordingly." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_skipped_rows = dprep.read_excel(path='../data/crime.xlsx',\n", + " sheet_name='Sheet2',\n", + " use_column_headers=True,\n", + " skip_rows=3)\n", + "dflow_skipped_rows.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = dflow_skipped_rows.to_pandas_dataframe()\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read Fixed Width Files" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For fixed-width files, you can specify a list of offsets. The first column is always assumed to start at offset 0." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_fixed_width = dprep.read_fwf('../data/crime.txt', offsets=[8, 17, 26, 33, 56, 58, 74])\n", + "dflow_fixed_width.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Looking at the data, you can see that the first row was used as headers. In this particular case, however, there are no headers in the file, so the first row should be treated as data.\n", + "\n", + "Passing in `PromoteHeadersMode.NONE` to the `header` keyword argument avoids header detection and gets the correct data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_no_headers = dprep.read_fwf('../data/crime.txt',\n", + " offsets=[8, 17, 26, 33, 56, 58, 74],\n", + " header=dprep.PromoteHeadersMode.NONE)\n", + "dflow_no_headers.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = dflow_no_headers.to_pandas_dataframe()\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read Parquet" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Data Prep has two different methods for reading data stored as Parquet.\n", + "\n", + "Currently, both methods require the `pyarrow` package to be installed in your Python environment. This can be done via `pip install azureml-dataprep[parquet]`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Read Parquet File" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For reading single `.parquet` files, or a folder full of only Parquet files, use `read_parquet_file`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dprep.read_parquet_file('../data/crime.parquet')\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Parquet data is explicitly typed so no type inference is needed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Read Parquet Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A Parquet Dataset is different from a Parquet file in that it could be a folder containing a number of Parquet files within a complex directory structure. It may have a hierarchical structure that partitions the data by value of a column. These more complex forms of Parquet data are commonly produced by Spark/HIVE.\n", + "\n", + "For these more complex data sets, you can use `read_parquet_dataset`, which uses pyarrow to handle complex Parquet layouts. This will also handle single Parquet files, though these are better read using `read_parquet_file`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dprep.read_parquet_dataset('../data/parquet_dataset')\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above data was partitioned by the value of the `Arrest` column. It is a boolean column in the original crime0 data set and hence was partitioned by `Arrest=true` and `Arrest=false`.\n", + "\n", + "The directory structure is printed below for clarity." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "for path, dirs, files in os.walk('../data/parquet_dataset'):\n", + " level = path.replace('../data/parquet_dataset', '').count(os.sep)\n", + " indent = ' ' * (level)\n", + " print(indent + os.path.basename(path) + '/')\n", + " fileindent = ' ' * (level + 1)\n", + " for f in files:\n", + " print(fileindent + f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read Part Files Using Globbing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Data Prep supports globbing, which allows you to read partitioned files (or any other type of files) in a folder. Globbing is supported by all of the read transformations that take in file paths, such as `read_csv`, `read_lines`, etc. By specifying `../data/crime_partfiles/part-*` in the path, we will read all files start with `part-`in `crime_partfiles` folder and return them in one Dataflow. [`auto_read_file`](./auto-read-file.ipynb) will detect column types of your part files and parse them automatically." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_partfiles = dprep.auto_read_file(path='../data/crime_partfiles/part-*')\n", + "dflow_partfiles.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read JSON" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Data Prep can also load JSON files." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_json = dprep.read_json(path='../data/json.json')\n", + "dflow_json.head(15)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When you use `read_json`, Data Prep will attempt to extract data from the file into a table. You can also control the file encoding Data Prep should use as well as whether Data Prep should flatten nested JSON arrays.\n", + "\n", + "Choosing the option to flatten nested arrays could result in a much larger number of rows." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_flat_arrays = dprep.read_json(path='../data/json.json', flatten_nested_arrays=True)\n", + "dflow_flat_arrays.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read SQL" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Data Prep can also fetch data from SQL servers. Currently, only Microsoft SQL Server is supported.\n", + "\n", + "To read data from a SQL server, first create a data source object that contains the connection information." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "secret = dprep.register_secret(value=\"dpr3pTestU$er\", id=\"dprepTestUser\")\n", + "ds = dprep.MSSQLDataSource(server_name=\"dprep-sql-test.database.windows.net\",\n", + " database_name=\"dprep-sql-test\",\n", + " user_name=\"dprepTestUser\",\n", + " password=secret)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As you can see, the password parameter of `MSSQLDataSource` accepts a Secret object. You can get a Secret object in two ways:\n", + "1. Register the secret and its value with the execution engine.\n", + "2. 
Create the secret with just an id (useful if the secret value was already registered in the execution environment).\n", + "\n", + "Now that you have created a data source object, you can proceed to read data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dprep.read_sql(ds, \"SELECT top 100 * FROM [SalesLT].[Product]\")\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = dflow.to_pandas_dataframe()\n", + "df.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read from Azure Blob" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can read files stored in public Azure Blob by directly passing a file url. To read file from a protected Blob, pass SAS (Shared Access Signature) URI with both resource URI and SAS token in the path." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dprep.read_csv(path='https://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv', skip_rows=1)\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read from ADLS" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are two ways the Data Prep API can acquire the necessary OAuth token to access Azure DataLake Storage:\n", + "1. Retrieve the access token from a recent login session of the user's [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest) login.\n", + "2. Use a ServicePrincipal (SP) and a certificate as a secret." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using Access Token from a recent Azure CLI session" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "On your local machine, run the following command:\n", + "```\n", + "az login\n", + "```\n", + "If your user account is a member of more than one Azure tenant, you need to specify the tenant, either in the AAD url hostname form '.onmicrosoft.com' or the tenantId GUID. The latter can be retrieved as follows:\n", + "```\n", + "az account show --query tenantId\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "dflow = read_csv(path = DataLakeDataSource(path='adl://dpreptestfiles.azuredatalakestore.net/farmers-markets.csv', tenant='microsoft.onmicrosoft.com'))\n", + "head = dflow.head(5)\n", + "head\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a ServicePrincipal via Azure CLI" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A ServicePrincipal and the corresponding certificate can be created via [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest).\n", + "This particular SP is configured as Reader, with its scope reduced to just the ADLS account 'dpreptestfiles'.\n", + "```\n", + "az account set --subscription \"Data Wrangling development\"\n", + "az ad sp create-for-rbac -n \"SP-ADLS-dpreptestfiles\" --create-cert --role reader --scopes /subscriptions/35f16a99-532a-4a47-9e93-00305f6c40f2/resourceGroups/dpreptestfiles/providers/Microsoft.DataLakeStore/accounts/dpreptestfiles\n", + "```\n", + "This command emits the appId and the path to the certificate file (usually in the home folder). 
The .crt file contains both the public certificate and the private key in PEM format.\n", + "\n", + "Extract the thumbprint with:\n", + "```\n", + "openssl x509 -in adls-dpreptestfiles.crt -noout -fingerprint\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Configure ADLS Account for ServicePrincipal" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To configure the ACL for the ADLS filesystem, use the objectId of the user or, here, ServicePrincipal:\n", + "```\n", + "az ad sp show --id \"8dd38f34-1fcb-4ff9-accd-7cd60b757174\" --query objectId\n", + "```\n", + "Configure Read and Execute access for the ADLS file system. Since the underlying HDFS ACL model doesn't support inheritance, folders and files need to be ACL-ed individually.\n", + "```\n", + "az dls fs access set-entry --account dpreptestfiles --acl-spec \"user:e37b9b1f-6a5e-4bee-9def-402b956f4e6f:r-x\" --path /\n", + "az dls fs access set-entry --account dpreptestfiles --acl-spec \"user:e37b9b1f-6a5e-4bee-9def-402b956f4e6f:r--\" --path /farmers-markets.csv\n", + "```\n", + "\n", + "References:\n", + "- [az ad sp](https://docs.microsoft.com/en-us/cli/azure/ad/sp?view=azure-cli-latest)\n", + "- [az dls fs access](https://docs.microsoft.com/en-us/cli/azure/dls/fs/access?view=azure-cli-latest)\n", + "- [ACL model for ADLS](https://github.com/MicrosoftDocs/azure-docs/blob/master/articles/data-lake-store/data-lake-store-access-control.md)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "certThumbprint = 'C2:08:9D:9E:D1:74:FC:EB:E9:7E:63:96:37:1C:13:88:5E:B9:2C:84'\n", + "certificate = ''\n", + "with open('../data/adls-dpreptestfiles.crt', 'rt', encoding='utf-8') as crtFile:\n", + " certificate = crtFile.read()\n", + "\n", + "servicePrincipalAppId = \"8dd38f34-1fcb-4ff9-accd-7cd60b757174\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Acquire an 
OAuth Access Token" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use the adal package (via: `pip install adal`) to create an authentication context on the MSFT tenant and acquire an OAuth access token. Note that for ADLS, the `resource` in the token request must be for 'datalake.azure.net', which is different from most other Azure resources." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import adal\n", + "from azureml.dataprep.api.datasources import DataLakeDataSource\n", + "\n", + "ctx = adal.AuthenticationContext('https://login.microsoftonline.com/microsoft.onmicrosoft.com')\n", + "token = ctx.acquire_token_with_client_certificate('https://datalake.azure.net/', servicePrincipalAppId, certificate, certThumbprint)\n", + "dflow = dprep.read_csv(path = DataLakeDataSource(path='adl://dpreptestfiles.azuredatalakestore.net/crime-spring.csv', accessToken=token['accessToken']))\n", + "dflow.to_pandas_dataframe().head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read Pandas DataFrame" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are situations where you may already have some data in the form of a pandas DataFrame.\n", + "The steps taken to get to this DataFrame may be non-trivial or not easy to convert to Data Prep Steps. The `read_pandas_dataframe` reader can take a DataFrame and use it as the data source for a Dataflow.\n", + "\n", + "You can pass in a path to a directory (that doesn't exist yet) for Data Prep to store the contents of the DataFrame; otherwise, a temporary directory will be made in the system's temp folder. The files written to this directory will be named `part-00000` and so on; they are written out in Data Prep's internal row-based file format." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dprep.read_excel(path='../data/crime.xlsx')\n", + "dflow = dflow.drop_columns(columns=['Column1'])\n", + "df = dflow.to_pandas_dataframe()\n", + "df.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After loading in the data you can now do `read_pandas_dataframe`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import shutil\n", + "cache_dir = 'dflow_df'\n", + "shutil.rmtree(cache_dir, ignore_errors=True)\n", + "dflow_df = dprep.read_pandas_dataframe(df, cache_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_df.head(5)" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/data-profile.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/data-profile.ipynb new file mode 100644 index 00000000..d723321a --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/data-profile.ipynb @@ -0,0 +1,200 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data Profile\n", + "Copyright (c) Microsoft Corporation. All rights reserved.
\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A DataProfile collects summary statistics on each column of the data produced by a Dataflow. This can be used to:\n", + "- Understand the input data.\n", + "- Determine which columns might need further preparation.\n", + "- Verify that data preparation operations produced the desired result.\n", + "\n", + "`Dataflow.get_profile()` executes the Dataflow, calculates profile information, and returns a newly constructed DataProfile." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep\n", + "\n", + "dflow = dprep.auto_read_file('../data/crime-spring.csv')\n", + "\n", + "profile = dflow.get_profile()\n", + "profile" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A DataProfile contains a collection of ColumnProfiles, indexed by column name. Each ColumnProfile has attributes for the calculated column statistics. For non-numeric columns, profiles include only basic statistics like min, max, and error count. For numeric columns, profiles also include statistical moments and estimated quantiles." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "profile.columns['Beat']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also extract and filter data from profiles by using list and dict comprehensions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "variances = [c.variance for c in profile.columns.values() if c.variance]\n", + "variances" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "column_types = {c.name: c.type for c in profile.columns.values()}\n", + "column_types" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If a column has fewer than a thousand unique values, its ColumnProfile contains a summary of values with their respective counts." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "profile.columns['Primary Type'].value_counts" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Numeric ColumnProfiles include an estimated histogram of the data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "profile.columns['District'].histogram" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To configure the number of bins in the histogram, you can pass an integer as the `number_of_histogram_bins` parameter." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "profile_more_bins = dflow.get_profile(number_of_histogram_bins=5)\n", + "profile_more_bins.columns['District'].histogram" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For columns containing data of mixed types, the ColumnProfile also provides counts of each type." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "profile.columns['X Coordinate'].type_counts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#TEST CELL: Profile.Compare\n", + "import azureml.dataprep as dprep\n", + "import math\n", + "\n", + "lhs_dflow = dprep.auto_read_file('../data/crime-spring.csv')\n", + "lhs_profile = lhs_dflow.get_profile(number_of_histogram_bins=100)\n", + "rhs_dflow = dprep.auto_read_file('../data/crime-winter.csv')\n", + "rhs_profile = rhs_dflow.get_profile(number_of_histogram_bins=100)\n", + "\n", + "diff = lhs_profile.compare(rhs_profile)\n", + "\n", + "expected_col1 = dprep.ColumnProfileDifference()\n", + "expected_col1.difference_in_count_in_percent = 0\n", + "expected_col1.difference_in_histograms = 135349.66146244822\n", + "\n", + "for actual, expected in zip(diff.column_profile_difference, [expected_col1]) :\n", + " assert math.isclose(actual.difference_in_count_in_percent, expected.difference_in_count_in_percent)\n", + " assert math.isclose(actual.difference_in_histograms, expected.difference_in_histograms)\n", + " break\n" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/datastore.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/datastore.ipynb new file mode 100644 index 00000000..478b8770 --- /dev/null +++ 
b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/datastore.ipynb @@ -0,0 +1,197 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Reading from and Writing to Datastores" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved.
\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A datastore is a reference that points to an Azure storage service like a blob container for example. It belongs to a workspace and a workspace can have many datastores.\n", + "\n", + "A data path points to a path on the underlying Azure storage service the datastore references. For example, given a datastore named `blob` that points to an Azure blob container, a data path can point to `/test/data/titanic.csv` in the blob container." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read data from Datastore" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Data Prep supports reading data from a `Datastore` or a `DataPath` or a `DataReference`. \n", + "\n", + "Passing in a datastore into all the `read_*` methods of Data Prep will result in reading everything in the underlying Azure storage service. To read a specific folder or file in the underlying storage, you have to pass in a data reference." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Workspace, Datastore\n", + "from azureml.data.datapath import DataPath\n", + "\n", + "import azureml.dataprep as dprep" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, get or create a workspace. Feel free to replace `subscription_id`, `resource_group`, and `workspace_name` with other values." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "subscription_id = '35f16a99-532a-4a47-9e93-00305f6c40f2'\n", + "resource_group = 'DataStoreTest'\n", + "workspace_name = 'dataprep-centraleuap'\n", + "\n", + "workspace = Workspace(subscription_id=subscription_id, resource_group=resource_group, workspace_name=workspace_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "workspace.datastores" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can now read a crime data set from the datastore. If you are using your own workspace, the `crime0-10.csv` will not be there by default. You will have to upload the data to the datastore yourself." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "datastore = Datastore(workspace=workspace, name='dataprep_blob')\n", + "dflow = dprep.read_csv(path=datastore.path('crime0-10.csv'))\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also read from an Azure SQL database. To do that, you will first get an Azure SQL database datastore instance and pass it to Data Prep for reading." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "datastore = Datastore(workspace=workspace, name='test_sql')\n", + "dflow_sql = dprep.read_sql(data_source=datastore, query='SELECT * FROM team')\n", + "dflow_sql.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Write data to Datastore" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also write a dataflow to a datastore. The code below will write the file you read in earlier to the folder in the datastore." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dest_datastore = Datastore(workspace, 'dataprep_blob_key')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow.write_to_csv(directory_path=dest_datastore.path('output/crime0-10')).run_local()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now you can read all the files in the `dataprep_adls` datastore which references an Azure Data Lake store." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "datastore = Datastore(workspace=workspace, name='dataprep_adls')\n", + "dflow_adls = dprep.read_csv(path=DataPath(datastore, path_on_datastore='/input/crime0-10.csv'))\n", + "dflow_adls.head(5)" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/derive-column-by-example.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/derive-column-by-example.ipynb new file mode 100644 index 00000000..06924d37 --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/derive-column-by-example.ipynb @@ -0,0 +1,181 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Derive Column By Example\n", + "Copyright (c) Microsoft Corporation. All rights reserved.
\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "One of the more advanced tools in Data Prep is the ability to derive columns by providing examples of desired results and letting Data Prep generate code to achieve the intended derivation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dprep.read_csv(path = '../data/crime-spring.csv')\n", + "df = dflow.head(5)\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As you can see, this is a fairly simple file, but let's assume that we need to be able to join this with a dataset where date and time come in a format 'Apr 4, 2016 | 10PM-12AM'.\n", + "\n", + "Let's wrangle the data into the shape we need." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "builder = dflow.builders.derive_column_by_example(source_columns = ['Date'], new_column_name = 'date_timerange')\n", + "builder.add_example(source_data = df.iloc[0], example_value = 'Apr 4, 2016 10PM-12AM')\n", + "builder.preview() # will preview top 10 rows" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The code above first creates a builder for the derived column by providing an array of source columns to consider ('DATE') and name for the new column to be added.\n", + "\n", + "Then, we provide the first example by passing in the first row (index 0) of the DataFrame printed above and giving an expected value for the derived column.\n", + "\n", + "Finally, we call `builder.preview()` and observe the derived column next to the source column." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Everything looks good here. 
However, we just noticed that it's not quite what we wanted. We forgot to separate date and time range by '|' to generate the format we need.\n", + "\n", + "To fix that, we will add another example. This time, instead of passing in a row from the preview, we just construct a dictionary of column name to value for the source_data parameter." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "builder.add_example(source_data = {'Date': '4/15/2016 10:00'}, example_value = 'Apr 15, 2016 | 10AM-12PM')\n", + "builder.preview()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This clearly had negative effects, as now the only rows that have any values in derived column are the ones that match exactly with the examples we have provided.\n", + "\n", + "Let's look at the examples:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "examples = builder.list_examples()\n", + "examples" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we can see that we have provided inconsistent examples. To fix the issue, we need to replace the first example with a correct one (including '|' between date and time).\n", + "\n", + "We can achieve this by deleting examples that are incorrect (by either passing in example_row from examples DataFrame, or by just passing in example_id value) and then adding new modified examples back." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "builder.delete_example(example_id = -1)\n", + "builder.add_example(examples.iloc[0], 'Apr 4, 2016 | 10PM-12AM')\n", + "builder.preview()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now this looks correct and we can finally call to_dataflow() on the builder, which would return a dataflow with the desired derived columns added." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = builder.to_dataflow()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = dflow.to_pandas_dataframe()\n", + "df" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/external-references.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/external-references.ipynb new file mode 100644 index 00000000..0392c9cd --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/external-references.ipynb @@ -0,0 +1,112 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# External References\n", + "Copyright (c) Microsoft Corporation. All rights reserved.
\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In addition to opening existing Dataflows in code and modifying them, it is also possible to create and persist Dataflows that reference another Dataflow that has been persisted to a .dprep file. In this case, executing this Dataflow will load and execute the referenced Dataflow dynamically, and then execute the steps in the referencing Dataflow." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To demonstrate, we will create a Dataflow that loads and transforms some data. After that, we will persist this Dataflow to disk. To learn more about saving and opening .dprep files, see: [Opening and Saving Dataflows](./open-save-dataflows.ipynb)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep\n", + "import tempfile\n", + "import os\n", + "\n", + "dflow = dprep.auto_read_file('../data/crime.txt')\n", + "dflow = dflow.drop_errors(['Column7', 'Column8', 'Column9'], dprep.ColumnRelationship.ANY)\n", + "dflow_path = os.path.join(tempfile.gettempdir(), 'package.dprep')\n", + "dflow.save(dflow_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have a .dprep file, we can create a new Dataflow that references it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_new = dprep.Dataflow.reference(dprep.ExternalReference(dflow_path))\n", + "dflow_new.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When executed, the new Dataflow returns the same results as the one we saved to the .dprep file. Since this reference is resolved on execution, updating the referenced Dataflow results in the changes being visible when re-executing the referencing Dataflow." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dflow.take(5)\n", + "dflow.save(dflow_path)\n", + "\n", + "dflow_new.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we can see, even though we did not modify `dflow_new`, it now returns only 5 records, as the referenced Dataflow was updated with the result from `dflow.take(5)`." + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/filtering.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/filtering.ipynb new file mode 100644 index 00000000..d1e76e5c --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/filtering.ipynb @@ -0,0 +1,215 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Filtering\n", + "Copyright (c) Microsoft Corporation. All rights reserved.
\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Azure ML Data Prep has the ability to filter out columns or rows using `Dataflow.drop_columns` or `Dataflow.filter`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# initial set up\n", + "import azureml.dataprep as dprep\n", + "from datetime import datetime\n", + "dflow = dprep.read_csv(path='../data/crime-spring.csv')\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Filtering columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To filter columns, use `Dataflow.drop_columns`. This method takes a list of columns to drop or a more complex argument called `ColumnSelector`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Filtering columns with list of strings" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example, `drop_columns` takes a list of strings. Each string should exactly match the desired column to drop." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dflow.drop_columns(['ID', 'Location Description', 'Ward', 'Community Area', 'FBI Code'])\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Filtering columns with regex" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively, a `ColumnSelector` can be used to drop columns that match a regex expression. In this example, we drop all the columns that match the expression `Column*|.*longitud|.*latitude`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dflow.drop_columns(dprep.ColumnSelector('Column*|.*longitud|.*latitude', True, True))\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Filtering rows" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To filter rows, use `DataFlow.filter`. This method takes an `Expression` as an argument, and returns a new dataflow with the rows in which the expression evaluates to `True`. Expressions are built by indexing the `Dataflow` with a column name (`dataflow['myColumn']`) and regular operators (`>`, `<`, `>=`, `<=`, `==`, `!=`)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Filtering rows with simple expressions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Index into the Dataflow specifying the column name as a string argument `dataflow['column_name']` and in combination with one of the following standard operators `>, <, >=, <=, ==, !=`, build an expression such as `dataflow['District'] > 9`. 
Finally, pass the built expression into the `Dataflow.filter` function.\n", + "\n", + "In this example, `dataflow.filter(dataflow['District'] > 9)` returns a new dataflow with the rows in which the value of \"District\" is greater than '9'.\n", + "\n", + "*Note that \"District\" is first converted to numeric, which allows us to build an expression comparing it against other numeric values.*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dflow.to_number(['District'])\n", + "dflow = dflow.filter(dflow['District'] > 9)\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Filtering rows with complex expressions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To filter using complex expressions, combine one or more simple expressions with the operators `&`, `|`, and `~`. Please note that the precedence of these operators is lower than that of the comparison operators; therefore, you'll need to use parentheses to group clauses together. \n", + "\n", + "In this example, `Dataflow.filter` returns a new dataflow with the rows in which \"Primary Type\" equals 'DECEPTIVE PRACTICE' and \"District\" is greater than or equal to '10'."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dflow.to_number(['District'])\n", + "dflow = dflow.filter((dflow['Primary Type'] == 'DECEPTIVE PRACTICE') & (dflow['District'] >= 10))\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It is also possible to filter rows combining more than one expression builder to create a nested expression.\n", + "\n", + "*Note that `'Date'` and `'Updated On'` are first converted to datetime, which allows us to build an expression comparing it against other datetime values.*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dflow.to_datetime(['Date', 'Updated On'], ['%Y-%m-%d %H:%M:%S'])\n", + "dflow = dflow.to_number(['District', 'Y Coordinate'])\n", + "comparison_date = datetime(2016,4,13)\n", + "dflow = dflow.filter(\n", + " ((dflow['Date'] > comparison_date) | (dflow['Updated On'] > comparison_date))\n", + " | ((dflow['Y Coordinate'] > 1900000) & (dflow['District'] > 10.0)))\n", + "dflow.head(5)" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/fuzzy-group.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/fuzzy-group.ipynb new file mode 100644 index 00000000..2a91d463 --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/fuzzy-group.ipynb @@ -0,0 +1,205 @@ +{ + "cells": [ + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "# Fuzzy Grouping\n", + "Copyright (c) Microsoft Corporation. All rights reserved.
\n", + "Licensed under the MIT License.
\n", + "\n", + "Unprepared data often represents the same entity with multiple values; examples include different spellings, varying capitalizations, and abbreviations. This is common when working with data gathered from multiple sources or through human input. One way to canonicalize and reconcile these variants is to use Data Prep's fuzzy_group_column (also known as \"text clustering\") functionality.\n", + "\n", + "Data Prep inspects a column to determine clusters of similar values. A new column is added in which clustered values are replaced with the canonical value of its cluster, thus significantly reducing the number of distinct values. You can control the degree of similarity required for values to be clustered together, override canonical form, and set clusters if automatic clustering did not provide the desired results.\n", + "\n", + "Let's explore the capabilities of `fuzzy_group_column` by first reading in a dataset and inspecting it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dprep.read_json(path='../data/json.json')\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As you can see above, the column `inspections.business.city` contains several forms of the city name \"San Francisco\".\n", + "Let's add a column with values replaced by the automatically detected canonical form. 
To do so call fuzzy_group_column() on an existing Dataflow:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_clean = dflow.fuzzy_group_column(source_column='inspections.business.city',\n", + " new_column_name='city_grouped',\n", + " similarity_threshold=0.8,\n", + " similarity_score_column_name='similarity_score')\n", + "dflow_clean.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The arguments `source_column` and `new_column_name` are required, whereas the others are optional.\n", + "If `similarity_threshold` is provided, it will be used to control the required similarity level for the values to be grouped together.\n", + "If `similarity_score_column_name` is provided, a second new column will be added to show similarity score between every pair of original and canonical values.\n", + "\n", + "In the resulting data set, you can see that all the different variations of representing \"San Francisco\" in the data were normalized to the same string, \"San Francisco\".\n", + "\n", + "But what if you want more control over what gets grouped, what doesn't, and what the canonical value should be? 
\n", + "\n", + "To get more control over grouping, canonical values, and exceptions, you need to use the `FuzzyGroupBuilder` class.\n", + "Let's see what it has to offer below:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "builder = dflow.builders.fuzzy_group_column(source_column='inspections.business.city',\n", + " new_column_name='city_grouped',\n", + " similarity_threshold=0.8,\n", + " similarity_score_column_name='similarity_score')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# calling learn() to get fuzzy groups\n", + "builder.learn()\n", + "builder.groups" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here you can see that `fuzzy_group_column` detected one group with four values that all map to \"San Francisco\" as the canonical value.\n", + "You can see the effects of changing the similarity threshold next:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "builder.similarity_threshold = 0.9\n", + "builder.learn()\n", + "builder.groups" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that you are using a similarity threshold of `0.9`, two distinct groups of values are generated.\n", + "\n", + "Let's tweak some of the detected groups before completing the builder and getting back the Dataflow with the resulting fuzzy grouped column." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "builder.similarity_threshold = 0.8\n", + "builder.learn()\n", + "groups = builder.groups\n", + "groups" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# change the canonical value for the first group\n", + "groups[0]['canonicalValue'] = 'SANFRAN'\n", + "duplicates = groups[0]['duplicates']\n", + "# remove the last duplicate value from the cluster\n", + "duplicates = duplicates[:-1]\n", + "# assign modified duplicate array back\n", + "groups[0]['duplicates'] = duplicates\n", + "# assign modified groups back to builder\n", + "builder.groups = groups\n", + "builder.groups" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, the canonical value is modified to be used for the single fuzzy group and removed 'S.F.' from this group's duplicates list.\n", + "\n", + "You can mutate the copy of the `groups` list from the builder (be careful to keep the structure of objects inside this list). After getting the desired groups in the list, you can update the builder with it.\n", + "\n", + "Now you can get a dataflow with the FuzzyGroup step in it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_clean = builder.to_dataflow()\n", + "\n", + "df = dflow_clean.to_pandas_dataframe()\n", + "df" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/impute-missing-values.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/impute-missing-values.ipynb new file mode 100644 index 00000000..21eec9ff --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/impute-missing-values.ipynb @@ -0,0 +1,141 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Impute missing values\n", + "Copyright (c) Microsoft Corporation. All rights reserved.
\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Azure ML Data Prep has the ability to impute missing values in specified columns. In this case, we will attempt to impute the missing _Latitude_ and _Longitude_ values in the input data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# loading input data\n", + "dflow = dprep.read_csv(path= '../data/crime-spring.csv')\n", + "dflow = dflow.keep_columns(['ID', 'Arrest', 'Latitude', 'Longitude'])\n", + "dflow = dflow.to_number(['Latitude', 'Longitude'])\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The third record from input data has _Latitude_ and _Longitude_ missing. To impute those missing values, we can use `ImputeMissingValuesBuilder` to learn a fixed program which imputes the columns with either a calculated `MIN`, `MAX` or `MEAN` value or a `CUSTOM` value. When `group_by_columns` is specified, missing values will be imputed by group with `MIN`, `MAX` and `MEAN` calculated per group." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Firstly, let us quickly see check the `MEAN` value of _Latitude_ column." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_mean = dflow.summarize(group_by_columns=['Arrest'],\n", + " summary_columns=[dprep.SummaryColumnsValue(column_id='Latitude',\n", + " summary_column_name='Latitude_MEAN',\n", + " summary_function=dprep.SummaryFunction.MEAN)])\n", + "dflow_mean = dflow_mean.filter(dprep.col('Arrest') == 'FALSE')\n", + "dflow_mean.head(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `MEAN` value of _Latitude_ looks good. 
So we will impute _Latitude_ with it. As for `Longitude`, we will impute it using `42` based on external knowledge." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# impute with MEAN\n", + "impute_mean = dprep.ImputeColumnArguments(column_id='Latitude',\n", + " impute_function=dprep.ReplaceValueFunction.MEAN)\n", + "# impute with custom value 42\n", + "impute_custom = dprep.ImputeColumnArguments(column_id='Longitude',\n", + " custom_impute_value=42)\n", + "# get instance of ImputeMissingValuesBuilder\n", + "impute_builder = dflow.builders.impute_missing_values(impute_columns=[impute_mean, impute_custom],\n", + " group_by_columns=['Arrest'])\n", + "# call learn() to learn a fixed program to impute missing values\n", + "impute_builder.learn()\n", + "# call to_dataflow() to get a dataflow with impute step added\n", + "dflow_imputed = impute_builder.to_dataflow()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# check impute result\n", + "dflow_imputed.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As the result above, the missing _Latitude_ has been imputed with the `MEAN` value of `Arrest=='false'` group, and the missing _Longitude_ has been imputed with `42`." 
+ ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/join.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/join.ipynb new file mode 100644 index 00000000..60038d52 --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/join.ipynb @@ -0,0 +1,259 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Join\n", + "Copyright (c) Microsoft Corporation. All rights reserved.
\n", + "Licensed under the MIT License.
\n", + "\n", + "In Data Prep you can easily join two Dataflows." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, get the left side of the data into a shape that is ready for the join." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get the first Dataflow and derive desired key column\n", + "dflow_left = dprep.read_csv(path='https://dpreptestfiles.blob.core.windows.net/testfiles/BostonWeather.csv')\n", + "dflow_left = dflow_left.derive_column_by_example(source_columns='DATE', new_column_name='date_timerange',\n", + " example_data=[('11/11/2015 0:54', 'Nov 11, 2015 | 12AM-2AM'),\n", + " ('2/1/2015 0:54', 'Feb 1, 2015 | 12AM-2AM'),\n", + " ('1/29/2015 20:54', 'Jan 29, 2015 | 8PM-10PM')])\n", + "dflow_left = dflow_left.drop_columns(['DATE'])\n", + "\n", + "# convert types and summarize data\n", + "dflow_left = dflow_left.set_column_types(type_conversions={'HOURLYDRYBULBTEMPF': dprep.TypeConverter(dprep.FieldType.DECIMAL)})\n", + "dflow_left = dflow_left.filter(expression=~dflow_left['HOURLYDRYBULBTEMPF'].is_error())\n", + "dflow_left = dflow_left.summarize(group_by_columns=['date_timerange'],summary_columns=[dprep.SummaryColumnsValue('HOURLYDRYBULBTEMPF', dprep.api.engineapi.typedefinitions.SummaryFunction.MEAN, 'HOURLYDRYBULBTEMPF_Mean')] )\n", + "\n", + "# cache the result so the steps above are not executed every time we pull on the data\n", + "import os\n", + "from pathlib import Path\n", + "cache_dir = str(Path(os.getcwd(), 'dataflow-cache'))\n", + "dflow_left.cache(directory_path=cache_dir)\n", + "dflow_left.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's prepare the data for the right side of the join." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get the second Dataflow and desired key column\n", + "dflow_right = dprep.read_csv(path='https://dpreptestfiles.blob.core.windows.net/bike-share/*-hubway-tripdata.csv')\n", + "dflow_right = dflow_right.keep_columns(['starttime', 'start station id'])\n", + "dflow_right = dflow_right.derive_column_by_example(source_columns='starttime', new_column_name='l_date_timerange',\n", + " example_data=[('2015-01-01 00:21:44', 'Jan 1, 2015 | 12AM-2AM')])\n", + "dflow_right = dflow_right.drop_columns('starttime')\n", + "\n", + "# cache the results\n", + "dflow_right.cache(directory_path=cache_dir)\n", + "dflow_right.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are three ways you can join two Dataflows in Data Prep:\n", + "1. Create a `JoinBuilder` object for interactive join configuration.\n", + "2. Call ```join()``` on one of the Dataflows and pass in the other along with all other arguments.\n", + "3. Call ```Dataflow.join()``` method and pass in two Dataflows along with all other arguments.\n", + "\n", + "We will explore the builder object as it simplifies the determination of correct arguments. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# construct a builder for joining dataflow_l with dataflow_r\n", + "join_builder = dflow_left.builders.join(right_dataflow=dflow_right, left_column_prefix='l', right_column_prefix='r')\n", + "\n", + "join_builder" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "So far the builder has no properties set except default values.\n", + "From here you can set each of the options and preview its effect on the join result or use Data Prep to determine some of them.\n", + "\n", + "Let's start with determining appropriate column prefixes for left and right side of the join and lists of columns that would not conflict and therefore don't need to be prefixed." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "join_builder.detect_column_info()\n", + "join_builder" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can see that Data Prep has performed a pull on both Dataflows to determine the column names in them. Given that `dataflow_r` already had a column starting with `l_` new prefix got generated which would not collide with any column names that are already present.\n", + "Additionally columns in each Dataflow that won't conflict during join would remain unprefixed.\n", + "This apprach to column naming is crucial for join robustness to schema changes in the data. 
Let's say that at some time in the future the data consumed by left Dataflow will also have `l_date_timerange` column in it.\n", + "Configured as above, the join will still run as expected and the new column will be prefixed with `l2_`, ensuring that if column `l_date_timerange` was consumed by some other future transformation it remains unaffected.\n", + "\n", + "Note: `KEY_generated` is appended to both lists and is reserved for Data Prep use in case Autojoin is performed.\n", + "\n", + "### Autojoin\n", + "Autojoin is a Data prep feature that determines suitable join arguments given data on both sides. In some cases Autojoin can even derive a key column from a number of available columns in the data.\n", + "Here is how you can use Autojoin:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# generate join suggestions\n", + "join_builder.generate_suggested_join()\n", + "\n", + "# list generated suggestions\n", + "join_builder.list_join_suggestions()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's select the first suggestion and preview the result of the join." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# apply first suggestion\n", + "join_builder.apply_suggestion(0)\n", + "\n", + "join_builder.preview(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, get our new joined Dataflow." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_autojoined = join_builder.to_dataflow().drop_columns(['l_date_timerange'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Joining two Dataflows without pulling the data\n", + "\n", + "If you don't want to pull on data and know what join should look like, you can always use the join method on the Dataflow." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_joined = dprep.Dataflow.join(left_dataflow=dflow_left,\n", + " right_dataflow=dflow_right,\n", + " join_key_pairs=[('date_timerange', 'l_date_timerange')],\n", + " left_column_prefix='l2_',\n", + " right_column_prefix='r_')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_joined.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_joined = dflow_joined.filter(expression=dflow_joined['r_start station id'] == '67')\n", + "df = dflow_joined.to_pandas_dataframe()\n", + "df" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/label-encoder.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/label-encoder.ipynb new file mode 100644 index 00000000..31b0cbe2 --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/label-encoder.ipynb @@ -0,0 +1,162 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Label Encoder\n", + "Copyright (c) Microsoft Corporation. All rights reserved.
\n", + "Licensed under the MIT License.
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Data Prep has the ability to encode labels with values between 0 and (number of classes - 1) using `label_encode`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep\n", + "from datetime import datetime\n", + "dflow = dprep.read_csv(path='../data/crime-spring.csv')\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use `label_encode` from a Dataflow, simply specify the source column and the new column name. `label_encode` will figure out all the distinct values or classes in the source column, and it will return a new Dataflow with a new column containing the labels." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dflow.label_encode(source_column='Primary Type', new_column_name='Primary Type Label')\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To have more control over the encoded labels, create a builder with `dataflow.builders.label_encode`.\n", + "The builder allows you to preview and modify the encoded labels before generating a new Dataflow with the results. \n", + "To get started, create a builder object with `dataflow.builders.label_encode` specifying the source column and the new column name. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "builder = dflow.builders.label_encode(source_column='Location Description', new_column_name='Location Description Label')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To generate the encoded labels, call the `learn` method on the builder object:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "builder.learn()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To check the result, access the generated labels through the property `encoded_labels`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "builder.encoded_labels" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To modify the generated results, just assign a new value to `encoded_labels`. The following example adds a missing label not found in the sample data. `builder.encoded_labels` is saved into a variable `encoded_labels`, modified, and assigned back to `builder.encoded_labels`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "encoded_labels = builder.encoded_labels\n", + "encoded_labels['TOWNHOUSE'] = 6\n", + "\n", + "builder.encoded_labels = encoded_labels\n", + "builder.encoded_labels" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once the desired results are achieved, call `builder.to_dataflow` to get the new Dataflow with the encoded labels." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataflow = builder.to_dataflow()\n", + "dataflow.head(5)" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/min-max-scaler.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/min-max-scaler.ipynb new file mode 100644 index 00000000..725a7471 --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/min-max-scaler.ipynb @@ -0,0 +1,233 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Min-Max Scaler\n", + "Copyright (c) Microsoft Corporation. All rights reserved.
\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The min-max scaler scales all values in a column to a desired range (typically [0, 1]). This is also known as feature scaling or unity-based normalization. Min-max scaling is commonly used to normalize numeric columns in a data set for machine learning algorithms." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, load a data set containing information about crime in Chicago. Keep only a few columns." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dprep.read_csv('../data/crime-spring.csv')\n", + "dflow = dflow.keep_columns(columns=['ID', 'District', 'FBI Code'])\n", + "dflow = dflow.to_number(columns=['District', 'FBI Code'])\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using `get_profile()`, you can see the shape of the numeric columns such as the minimum, maximum, count, and number of error values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow.get_profile()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To apply min-max scaling, call the function `min_max_scaler` on the Dataflow and specify the column name. This will trigger a full data scan over the column to determine the min and max values and perform the scaling. Note that the min and max values of the column are preserved at this point. If the same dataflow steps are performed over a different dataset, the min-max scaler must be re-executed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_district = dflow.min_max_scale(column='District')\n", + "dflow_district.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Look at the data profile to see that the \"District\" column is now scaled; the min is 0 and the max is 1. Any error values and missing values from the source column are preserved." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_district.get_profile()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also specify a custom range for the scaling. Instead of [0, 1], let's choose [-10, 10]." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_district_range = dflow.min_max_scale(column='District', range_min=-10, range_max=10)\n", + "dflow_district_range.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In some cases, you may want to manually provide the min and max of the data in the source column. For example, you may want to avoid a full data scan because the dataset is large and we already know the min and max. You can provide the known min and max to the `min_max_scaler` function. The column will be scaled using the provided values. For example, if you want to scale the `FBI Code` column with 6 (`data_min`) becoming 0 (`range_min`), the program will scan the data to get `data_max`, which will become 1 (`range_max`)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_fbi = dflow.min_max_scale(column='FBI Code', data_min=6)\n", + "dflow_fbi.get_profile()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using a Min-Max Scaler builder" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For more flexibility when constructing the arguments for the min-max scaling, you can use a Min-Max Scaler builder." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "builder = dflow.builders.min_max_scale(column='District')\n", + "builder" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Calling `builder.learn()` will trigger a full data scan to see what `data_min` and `data_max` are. You can choose whether to use these values or set custom values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "builder.learn()\n", + "builder" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want to provide custom values for any of the arguments, you can update the builder object." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "builder.range_max = 10\n", + "builder.data_min = 6\n", + "builder" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When you are satisfied with the arguments, you will call `builder.to_dataflow()` to get the result. Note that the min and max values of the source column is preserved by the builder at this point. If you need to get the true `data_min` and `data_max` values again, you will need to set those arguments on the builder to `None` and then call `builder.learn()` again." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_builder = builder.to_dataflow()\n", + "dflow_builder.head(5)" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/one-hot-encoder.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/one-hot-encoder.ipynb new file mode 100644 index 00000000..7e4208d3 --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/one-hot-encoder.ipynb @@ -0,0 +1,173 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# One Hot Encoder\n", + "Copyright (c) Microsoft Corporation. All rights reserved. \n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Azure ML Data Prep has the ability to perform one hot encoding on a selected column using `one_hot_encode`. The result Dataflow will have a new binary column for each categorical label encountered in the selected column." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep\n", + "dflow = dprep.read_csv(path='../data/crime-spring.csv')\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use `one_hot_encode` from a Dataflow, simply specify the source column. 
`one_hot_encode` will figure out all the distinct values or categorical labels in the source column using the current data, and it will return a new Dataflow with a new binary column for each categorical label. Note that the categorical labels are remembered in the Dataflow step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_result = dflow.one_hot_encode(source_column='Location Description')\n", + "dflow_result.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By default, all the new columns will use the `source_column` name as a prefix. However, if you would like to specify your own prefix, simply pass a `prefix` string as a second parameter." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_result = dflow.one_hot_encode(source_column='Location Description', prefix='LOCATION_')\n", + "dflow_result.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To have more control over the categorical labels, create a builder using `dataflow.builders.one_hot_encode`. The builder allows you to preview and modify the categorical labels before generating a new Dataflow with the results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "builder = dflow.builders.one_hot_encode(source_column='Location Description', prefix='LOCATION_')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To generate the categorical labels, call the `learn` method on the builder object:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "builder.learn()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To preview the categorical labels, simply access them through the property `categorical_labels` on the builder object:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "builder.categorical_labels" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To modify the generated `categorical_labels`, assign a new value to `categorical_labels` or modify the existing one. The following example adds a missing label not found on the sample data to `categorical_labels`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "builder.categorical_labels.append('TOWNHOUSE')\n", + "builder.categorical_labels" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once the desired results are achieved, call `builder.to_dataflow` to get the new Dataflow with the encoded labels." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_result = builder.to_dataflow()\n", + "dflow_result.head(5)" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/open-save-dataflows.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/open-save-dataflows.ipynb new file mode 100644 index 00000000..03661fb2 --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/open-save-dataflows.ipynb @@ -0,0 +1,165 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Opening and Saving Dataflows\n", + "Copyright (c) Microsoft Corporation. All rights reserved.
\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once you have built a Dataflow, you can save it to a `.dprep` file. This persists all of the information in your Dataflow including steps you've added, examples and programs from by-example steps, computed aggregations, etc.\n", + "\n", + "You can also open `.dprep` files to access any Dataflows you have previously persisted." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Open\n", + "\n", + "Use the `open()` method of the Dataflow class to load existing `.dprep` files." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "dflow_path = os.path.join(os.getcwd(), '..', 'data', 'crime.dprep')\n", + "print(dflow_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.dataprep import Dataflow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = Dataflow.open(dflow_path)\n", + "head = dflow.head(5)\n", + "head" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Edit\n", + "\n", + "After a Dataflow is loaded, it can be further edited as needed. In this example, a filter is added." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.dataprep import col" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dflow.filter(col('Description') != 'SIMPLE')\n", + "head = dflow.head(5)\n", + "head" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save\n", + "\n", + "Use the `save()` method of the Dataflow class to write out the `.dprep` file." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import tempfile\n", + "temp_dir = tempfile._get_default_tempdir()\n", + "temp_file_name = next(tempfile._get_candidate_names())\n", + "temp_dflow_path = os.path.join(temp_dir, temp_file_name + '.dprep')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow.save(temp_dflow_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Round-trip\n", + "\n", + "This illustrates the ability to load the edited Dataflow back in and use it, in this case to get a pandas DataFrame." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_to_open = Dataflow.open(temp_dflow_path)\n", + "df = dflow_to_open.to_pandas_dataframe()\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if os.path.isfile(temp_dflow_path):\n", + " os.remove(temp_dflow_path)" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/quantile-transformation.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/quantile-transformation.ipynb new file mode 100644 index 00000000..ddb4fe02 --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/quantile-transformation.ipynb @@ -0,0 +1,85 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Quantile Transformation\n", + "Copyright (c) Microsoft Corporation. All rights reserved.
\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "DataPrep has the ability to perform quantile transformation to a numeric column. This transformation can transform the data into a normal or uniform distribution. Values bigger than the learnt boundaries will simply be clipped to the learnt boundaries when applying quantile transformation.\n", + "\n", + "Let's load a sample of the median income of california households in different suburbs from the 1990 census data. From the data profile, we can see that the minimum value and maximum value is 0.9946 and 15 respectively." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep\n", + "\n", + "dflow = dprep.read_csv(path='../data/median_income.csv').set_column_types(type_conversions={\n", + " 'median_income': dprep.TypeConverter(dprep.FieldType.DECIMAL)\n", + "})\n", + "dflow.get_profile()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's now apply quantile transformation to `median_income` and see how that affects the data. 
We will apply quantile transformation twice: once mapping the data to a Uniform(0, 1) distribution, and once mapping it to a Normal(0, 1) distribution.
\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Azure ML Data Prep provides the functionality of splitting a data set into two. When training a machine learning model, it is often desirable to train the model on a subset of data, then validate the model on a different subset." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `random_split(percentage, seed=None)` function in Data Prep takes in a Dataflow and randomly splitting it into two distinct subsets (approximately by the percentage specified)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `seed` parameter is optional. If a seed is not provided, a stable one is generated, ensuring that the results for a specific Dataflow remain consistent. Different calls to `random_split` will receive different seeds." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To demonstrate, you can go through the following example. First, you can read the first 10,000 lines from a file. Since the contents of the file don't matter, just the first two columns can be used for a simple example." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dprep.read_csv(path='https://dpreptestfiles.blob.core.windows.net/testfiles/crime0.csv').take(10000)\n", + "dflow = dflow.keep_columns(['ID', 'Date'])\n", + "profile = dflow.get_profile()\n", + "print('Row count: %d' % (profile.columns['ID'].count))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, you can call `random_split` with the percentage set to 10% (the actual split ratio will be an approximation of `percentage`). You can take a look at the row count of the first returned Dataflow. 
You should see that `dflow_test` has approximately 1,000 rows (10% of 10,000)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "(dflow_test, dflow_train) = dflow.random_split(percentage=0.1)\n", + "profile_test = dflow_test.get_profile()\n", + "print('Row count of \"test\": %d' % (profile_test.columns['ID'].count))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now you can take a look at the row count of the second returned Dataflow. The row count of `dflow_test` and `dflow_train` sums exactly to 10,000, because `random_split` results in two subsets that make up the original Dataflow." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "profile_train = dflow_train.get_profile()\n", + "print('Row count of \"train\": %d' % (profile_train.columns['ID'].count))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To specify a fixed seed, simply provide it to the `random_split` function." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "(dflow_test, dflow_train) = dflow.random_split(percentage=0.1, seed=12345)" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/replace-datasource-replace-reference.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/replace-datasource-replace-reference.ipynb new file mode 100644 index 00000000..14164bd0 --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/replace-datasource-replace-reference.ipynb @@ -0,0 +1,124 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Replace DataSource Reference\n", + "Copyright (c) Microsoft Corporation. All rights reserved.
\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A common practice when performing DataPrep is to build up a script or set of cleaning operations on a smaller example file locally. This is quicker and easier than dealing with large amounts of data initially.\n", + "\n", + "After building a Dataflow that performs the desired steps, it's time to run it against the larger dataset, which may be stored in the cloud, or even locally just in a different file. This is where we can use `Dataflow.replace_datasource` to get a Dataflow identical to the one built on the small data, but referencing the newly specified DataSource." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep\n", + "\n", + "dflow = dprep.read_csv('../data/crime-spring.csv')\n", + "df = dflow.to_pandas_dataframe()\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we have the first 10 rows of a dataset called 'Crime'. The original dataset is over 100MB (admittedly not that large of a dataset but this is just an example).\n", + "\n", + "We'll perform a few cleaning operations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_dropped = dflow.drop_columns(['Location', 'Updated On', 'X Coordinate', 'Y Coordinate', 'Description'])\n", + "sctb = dflow_dropped.builders.set_column_types()\n", + "sctb.learn(inference_arguments=dprep.InferenceArguments(day_first=False))\n", + "dflow_typed = sctb.to_dataflow()\n", + "dflow_typed.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have a Dataflow with all our desired steps, we're ready to run against the 'full' dataset stored in Azure Blob.\n", + "All we need to do is pass the BlobDataSource into `replace_datasource` and we'll get back an identical Dataflow with the new DataSource substituted in." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_replaced = dflow_typed.replace_datasource(dprep.BlobDataSource('https://dpreptestfiles.blob.core.windows.net/testfiles/crime0.csv'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "'replaced_dflow' will now pull data from the 168MB (729734 rows) version of Crime0.csv stored in Azure Blob!\n", + "\n", + "NOTE: Dataflows can also be created by referencing a different Dataflow. Instead of using `replace_datasource`, there is a corresponding `replace_reference` method.\n", + "\n", + "We should be careful now since pulling all that data down and putting it in a pandas dataframe isn't an ideal way to inspect the result of our Dataflow. So instead, to see that our steps are being applied to all the new data, we can add a `take_sample` step, which will select records at random (based on a given probability) to be returned.\n", + "\n", + "The probability below takes the ~730000 rows down to a more inspectable ~73, though the number will vary each time `to_pandas_dataframe()` is run, since they are being randomly selected based on the probability." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_random_sample= dflow_replaced.take_sample(probability=0.0001)\n", + "sample = dflow_random_sample.to_pandas_dataframe()\n", + "sample" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/replace-fill-error.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/replace-fill-error.ipynb new file mode 100644 index 00000000..1e3b83f7 --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/replace-fill-error.ipynb @@ -0,0 +1,233 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Replace, Fill, Error\n", + "Copyright (c) Microsoft Corporation. All rights reserved.
\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can use the methods in this notebook to change values in your dataset.\n", + "\n", + "* replace - use this method to replace a value with another value. You can also use this to replace null with a value, or a value with null\n", + "* error - use this method to replace a value with an error.\n", + "* fill_nulls - this method lets you fill all nulls in a column with a certain value.\n", + "* fill_errors - this method lets you fill all errors in a column with a certain value." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dprep.read_csv('../data/crime-spring.csv')\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dflow.to_datetime('Date', ['%m/%d/%Y %H:%M'])\n", + "dflow = dflow.to_number(['IUCR', 'District', 'FBI Code'])\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Replace " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### String\n", + "Use `replace` to swap a string value with another string value." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dflow.replace('Primary Type', 'THEFT', 'STOLEN')\n", + "head = dflow.head(5)\n", + "head" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use `replace` to remove a certain string value from the column, replacing it with null. Note that Pandas shows null values as None." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dflow.replace('Primary Type', 'DECEPTIVE PRACTICE', None)\n", + "head = dflow.head(5)\n", + "head" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Numeric\n", + "Use `replace` to swap a numeric value with another numeric value." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dflow.replace('District', 5, 1)\n", + "head = dflow.head(5)\n", + "head" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Date\n", + "Use `replace` to swap in a new Date for an existing Date in the data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datetime import datetime, timezone\n", + "dflow = dflow.replace('Date', \n", + " datetime(2016, 4, 15, 9, 0, tzinfo=timezone.utc), \n", + " datetime(2018, 7, 4, 0, 0, tzinfo=timezone.utc))\n", + "head = dflow.head(5)\n", + "head" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Error \n", + "\n", + "The `error` method lets you create Error values. You can pass to this function the value that you want to find, along with the Error code to use in any Errors created." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dflow.error('IUCR', 890, 'Invalid value')\n", + "head = dflow.head(5)\n", + "head" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fill Nulls \n", + "\n", + "Use the `fill_nulls` method to replace all null values in columns with another value. This is similar to Panda's fillna() method." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dflow.fill_nulls('Primary Type', 'N/A')\n", + "head = dflow.head(5)\n", + "head" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fill Errors \n", + "\n", + "Use the `fill_errors` method to replace all error values in columns with another value." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dflow.fill_errors('IUCR', -1)\n", + "head = dflow.head(5)\n", + "head" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/secrets.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/secrets.ipynb new file mode 100644 index 00000000..8714e00a --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/secrets.ipynb @@ -0,0 +1,134 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Providing Secrets\n", + "Copyright (c) Microsoft Corporation. All rights reserved.
\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Currently, secrets are only persisted for the lifetime of the engine process. Even if the dataflow is saved to a file, the secrets are not persisted in the dprep file. If you started a new session (i.e. start a new engine process), loaded a dataflow and wanted to run it, you will need to call `use_secrets` to register the required secrets to use during execution, otherwise the execution will fail as the required secrets are not available.\n", + "\n", + "In this notebook, we will:\n", + "1. Loading a previously saved dataflow\n", + "2. Call `get_missing_secrets` to determine the missing secrets\n", + "3. Call `use_secrets` and pass in the missing secrets to register it with the engine for this session\n", + "4. Call `head` to see the a preview of the data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep\n", + "\n", + "import os" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's load the previously saved dataflow." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dprep.Dataflow.open(file_path='../data/secrets.dprep')\n", + "dflow" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can call `get_missing_secrets` to see which required secrets are missing in the engine." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow.get_missing_secrets()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can now read the secrets from an environment variable, put it in a secret dictionary, and call `use_secrets` with the secrets. 
This will register the secrets in the engine so you don't need to provide them again in this session.\n", + "\n", + "_Note: It is a bad practice to have secrets in files that will be checked into source control._" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sas = os.environ['SCENARIOS_SECRETS']\n", + "secrets = {\n", + " 'https://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv': sas\n", + "}\n", + "dflow.use_secrets(secrets=secrets)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can now call `head` without passing in `secrets` and the engine will successfully execute. Here is a preview of the data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow.head(5)" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/semantic-types.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/semantic-types.ipynb new file mode 100644 index 00000000..1e6b5553 --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/semantic-types.ipynb @@ -0,0 +1,158 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Semantic Types\n", + "Copyright (c) Microsoft Corporation. All rights reserved.
\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Some string values can be recognized as semantic types. For example, email addresses, US zip codes or IP addresses have specific formats that can be recognized, and then split in specific ways.\n", + "\n", + "When getting a DataProfile you can optionally ask to collect counts of values recognized as semantic types. [`Dataflow.get_profile()`](./data-profile.ipynb) executes the Dataflow, calculates profile information, and returns a newly constructed DataProfile. Semantic type counts can be included in the data profile by calling `get_profile` with the `include_stype_counts` argument set to true.\n", + "\n", + "The `stype_counts` property of the DataProfile will then include entries for columns where some semantic types were recognized for some values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep\n", + "dflow = dprep.read_json(path='../data/json.json')\n", + "\n", + "profile = dflow.get_profile(include_stype_counts=True)\n", + "\n", + "print(\"row count: \" + str(profile.row_count))\n", + "profile.stype_counts" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To see all the supported semantic types, you can examine the `SType` enumeration. More types will be added over time." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "[t.name for t in dprep.SType]\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can filter the found semantic types down to just those where all non-empty values matched. The `DataProfile.stype_counts` gives a list of semantic type counts for each column, where at least some matches were found. 
Those lists are in desecending order of count, so here we consider only the first in each list, as that will be the one with the highest count of values that match.\n", + "\n", + "In this example, the column `inspections.business.postal_code` looks to be a US zip code." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stypes_counts = profile.stype_counts\n", + "all_match = [\n", + " (column, stypes_counts[column][0].stype)\n", + " for column in stypes_counts\n", + " if profile.row_count - profile.columns[column].empty_count == stypes_counts[column][0].count\n", + "]\n", + "all_match" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can use semantic types to compute new columns. The new columns are the values split up into elements, or canonicalized.\n", + "\n", + "Here we reduce our data down to just the `postal` column so we can better see what a `split_stype` operation can do." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_postal = dflow.keep_columns(['inspections.business.postal_code']).rename_columns({'inspections.business.postal_code': 'postal'})\n", + "dflow_postal.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With `SType.ZipCode`, values are split into their basic five digit zip code and the plus-four add-on of the Zip+4 format." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_split = dflow_postal.split_stype('postal', dprep.SType.ZIPCODE)\n", + "dflow_split.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`split_stype` also allows you to specify the fields of the stype to use and the name of the new columns. For example, if you just needed to strip the plus four from our zip codes, you could use this." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_no_plus4 = dflow_postal.split_stype('postal', dprep.SType.ZIPCODE, ['zip'], ['zipNoPlus4'])\n", + "dflow_no_plus4.head(5)" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/split-column-by-example.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/split-column-by-example.ipynb new file mode 100644 index 00000000..cf4ef5ec --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/split-column-by-example.ipynb @@ -0,0 +1,214 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Split column by example\n", + "Copyright (c) Microsoft Corporation. All rights reserved.
\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "DataPrep also offers you a way to easily split a column into multiple columns.\n", + "The SplitColumnByExampleBuilder class lets you generate a proper split program that will work even when the cases are not trivial, like in example below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dprep.read_lines(path='../data/crime.txt')\n", + "df = dflow.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df['Line'].iloc[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As you can see above, you can't split this particular file by space character as it will create too many columns.\n", + "That's where split_column_by_example could be quite useful." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "builder = dflow.builders.split_column_by_example('Line', keep_delimiters=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "builder.preview()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Couple things to take note of here. No examples were given, and yet DataPrep was able to generate quite reasonable split program. \n", + "We have passed keep_delimiters=True so we can see all the data split into columns. In practice, though, delimiters are rarely useful, so let's exclude them." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "builder.keep_delimiters = False\n", + "builder.preview()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This looks pretty good already, except that one case number is split into 2 columns. Taking the first row as an example, we want to keep case number as \"HY329907\" instead of \"HY\" and \"329907\" seperately. \n", + "If we request generation of suggested examples we will get a list of examples that require input." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "suggestions = builder.generate_suggested_examples()\n", + "suggestions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "suggestions.iloc[0]['Line']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Having retrieved source value we can now provide an example of desired split.\n", + "Notice that we chose not to split date and time but rather keep them together in one column." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "builder.add_example(example=(suggestions['Line'].iloc[0], ['10140490','HY329907','7/5/2015 23:50','050XX N NEWLAND AVE','820','THEFT']))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "builder.preview()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we can see from the preview, some of the crime types (`Line_6`) do not show up as expected. Let's try to add one more example. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "builder.add_example(example=(df['Line'].iloc[1],['10139776','HY329265','7/5/2015 23:30','011XX W MORSE AVE','460','BATTERY']))\n", + "builder.preview()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This looks just like what we need. Let's get a dataflow with split columns and drop the original column." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = builder.to_dataflow()\n", + "dflow = dflow.drop_columns(['Line'])\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we have successfully split the data into useful columns through examples. " + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/subsetting-sampling.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/subsetting-sampling.ipynb new file mode 100644 index 00000000..fded6681 --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/subsetting-sampling.ipynb @@ -0,0 +1,211 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Sampling and Subsetting\n", + "Copyright (c) Microsoft Corporation. All rights reserved.
\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once a Dataflow has been created, it is possible to act on only a subset of the records contained in it. This can help when working with very large datasets or when only a portion of the records is truly relevant." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Head\n", + "\n", + "The `head` method will take the number of records specified, run them through the transformations in the Dataflow, and then return the result as a Pandas dataframe." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep\n", + "\n", + "dflow = dprep.read_csv('../data/crime_duplicate_headers.csv')\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Take\n", + "\n", + "The `take` method adds a step to the Dataflow that will keep the number of records specified (counting from the beginning) and drop the rest. Unlike `head`, which does not modify the Dataflow, all operations applied on a Dataflow on which `take` has been applied will affect only the records kept." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_top_five = dflow.take(5)\n", + "dflow_top_five.to_pandas_dataframe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Skip\n", + "\n", + "It is also possible to skip a certain number of records in a Dataflow, such that transformations are only applied after a specific point. Depending on the underlying data source, a Dataflow with a `skip` step might still have to scan through the data in order to skip past the records." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_skip_top_one = dflow_top_five.skip(1)\n", + "dflow_skip_top_one.to_pandas_dataframe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Take Sample\n", + "\n", + "In addition to taking records from the top of the dataset, it's also possible to take a random sample of the dataset. This is done through the `take_sample(probability, seed=None)` method. This method will scan through all of the records available in the Dataflow and include them based on the probability specified. The `seed` parameter is optional. If a seed is not provided, a stable one is generated, ensuring that the results for a specific Dataflow remain consistent. Different calls to `take_sample` will receive different seeds." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_sampled = dflow.take_sample(0.1)\n", + "dflow_sampled.to_pandas_dataframe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`skip`, `take`, and `take_sample` can all be combined. With this, we can achieve behaviors like getting a random 10% sample of the middle N records of a dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "seed = 1\n", + "dflow_nested_sample = dflow.skip(1).take(5).take_sample(0.5, seed)\n", + "dflow_nested_sample.to_pandas_dataframe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Take Stratified Sample\n", + "Besides sampling all by a probability, we also have stratified sampling, provided the strata and strata weights, the probability to sample each stratum with.\n", + "This is done through the `take_stratified_sample(columns, fractions, seed=None)` method.\n", + "For all records, we will group each record by the columns specified to stratify, and based on the stratum x weight information in `fractions`, include said record.\n", + "\n", + "Seed behavior is same as in `take_sample`.\n", + "\n", + "If a stratum is not specified or the record cannot be grouped by said stratum, we default the weight to sample by to 0 (it will not be included).\n", + "\n", + "The order of `fractions` must match the order of `columns`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fractions = {}\n", + "fractions[('ASSAULT',)] = 0.5\n", + "fractions[('BATTERY',)] = 0.2\n", + "fractions[('ARSON',)] = 0.5\n", + "fractions[('THEFT',)] = 1.0\n", + "\n", + "columns = ['Primary Type']\n", + "\n", + "single_strata_sample = dflow.take_stratified_sample(columns=columns, fractions = fractions, seed = 42)\n", + "single_strata_sample.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Stratified sampling on multiple columns is also supported." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fractions = {}\n", + "fractions[('ASSAULT', '560')] = 0.5\n", + "fractions[('BATTERY', '460')] = 0.2\n", + "fractions[('ARSON', '1020')] = 0.5\n", + "fractions[('THEFT', '820')] = 1.0\n", + "\n", + "columns = ['Primary Type', 'IUCR']\n", + "\n", + "multi_strata_sample = dflow.take_stratified_sample(columns=columns, fractions = fractions, seed = 42)\n", + "multi_strata_sample.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Caching\n", + "It is usually a good idea to cache the sampled Dataflow for later uses.\n", + "\n", + "See [here](cache.ipynb) for more details about caching." + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/summarize.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/summarize.ipynb new file mode 100644 index 00000000..83208445 --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/summarize.ipynb @@ -0,0 +1,584 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Summarize\n", + "Copyright (c) Microsoft Corporation. All rights reserved.
\n", + "Licensed under the MIT License.
\n", + "\n", + "Azure ML Data Prep can help summarize your data by providing you a synopsis based on aggregates over specific columns.\n", + "\n", + "## Table of Contents\n", + "[Overview](#overview)
\n", + "[Summary Functions](#summary)
\n", + "* [SummaryFunction.MIN](#min)
\n", + "* [SummaryFunction.MAX](#max)
\n", + "* [SummaryFunction.MEAN](#mean)
\n", + "* [SummaryFunction.MEDIAN](#median)
\n", + "* [SummaryFunction.VAR](#var)
\n", + "* [SummaryFunction.SD](#sd)
\n", + "* [SummaryFunction.COUNT](#count)
\n", + "* [SummaryFunction.SUM](#sum)
\n", + "* [SummaryFunction.SKEWNESS](#skewness)
\n", + "* [SummaryFunction.KURTOSIS](#kurtosis)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Overview\n", + "Before we drill down into each aggregate function, let us observe `summarize` end to end.\n", + "\n", + "We will start by reading some data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep\n", + "dflow = dprep.auto_read_file(path='../data/crime-dirty.csv')\n", + "dflow.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next we count (`SummaryFunction.COUNT`) the number of rows with column ID with non-null values grouped by Primary Type." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_summarize = dflow.summarize(\n", + " summary_columns=[\n", + " dprep.SummaryColumnsValue(\n", + " column_id='ID',\n", + " summary_column_name='Primary Type ID Counts', \n", + " summary_function=dprep.SummaryFunction.COUNT)],\n", + " group_by_columns=['Primary Type'])\n", + "dflow_summarize.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we choose to not group by anything, we will instead get a single record over the entire dataset. Here we will get the number of rows that have the column ID with non-null values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_summarize_nogroup = dflow.summarize(\n", + " summary_columns=[\n", + " dprep.SummaryColumnsValue(\n", + " column_id='ID',\n", + " summary_column_name='ID Count', \n", + " summary_function=dprep.SummaryFunction.COUNT)])\n", + "dflow_summarize_nogroup.head(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Conversely, we can group by multiple columns." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_summarize_2group = dflow.summarize(\n", + " summary_columns=[\n", + " dprep.SummaryColumnsValue(\n", + " column_id='ID',\n", + " summary_column_name='Primary Type & Location Description ID Counts', \n", + " summary_function=dprep.SummaryFunction.COUNT)],\n", + " group_by_columns=['Primary Type', 'Location Description'])\n", + "dflow_summarize_2group.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In a similar vein, we can compute multiple aggregates in a single summary. Each aggregate function is independent and it is possible to aggregate the same column multiple times." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_summarize_multi_agg = dflow.summarize(\n", + " summary_columns=[\n", + " dprep.SummaryColumnsValue(\n", + " column_id='ID',\n", + " summary_column_name='Primary Type ID Counts', \n", + " summary_function=dprep.SummaryFunction.COUNT),\n", + " dprep.SummaryColumnsValue(\n", + " column_id='ID',\n", + " summary_column_name='Primary Type Min ID', \n", + " summary_function=dprep.SummaryFunction.MIN),\n", + " dprep.SummaryColumnsValue(\n", + " column_id='Date',\n", + " summary_column_name='Primary Type Max Date', \n", + " summary_function=dprep.SummaryFunction.MAX)],\n", + " group_by_columns=['Primary Type'])\n", + "dflow_summarize_multi_agg.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we wanted this summary data back into our original data set, we can make use of `join_back` and optionally `join_back_columns_prefix` for easy naming distinctions. Summary columns will be added to the end. `group_by_columns` is not necessary for using `join_back`, however the behavior will be more like an append instead of a join." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_summarize_join = dflow.summarize(\n", + " summary_columns=[\n", + " dprep.SummaryColumnsValue(\n", + " column_id='ID',\n", + " summary_column_name='Primary Type ID Counts', \n", + " summary_function=dprep.SummaryFunction.COUNT)],\n", + " group_by_columns=['Primary Type'],\n", + " join_back=True,\n", + " join_back_columns_prefix='New_')\n", + "dflow_summarize_join.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summary Functions\n", + "Here we will go over all the possible aggregates in Data Prep.\n", + "The most up to date set of functions can be found by enumerating the `SummaryFunction` enum." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep\n", + "[x.name for x in dprep.SummaryFunction]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### SummaryFunction.MIN\n", + "Data Prep can aggregate and find the minimum value of a column." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep\n", + "dflow = dprep.auto_read_file(path='../data/crime-dirty.csv')\n", + "dflow_min = dflow.summarize(\n", + " summary_columns=[\n", + " dprep.SummaryColumnsValue(\n", + " column_id='Date',\n", + " summary_column_name='Primary Type Min Date', \n", + " summary_function=dprep.SummaryFunction.MIN)],\n", + " group_by_columns=['Primary Type'])\n", + "dflow_min.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### SummaryFunction.MAX\n", + "Data Prep can find the maximum value of a column." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep\n", + "dflow = dprep.auto_read_file(path='../data/crime-dirty.csv')\n", + "dflow_min = dflow.summarize(\n", + " summary_columns=[\n", + " dprep.SummaryColumnsValue(\n", + " column_id='Date',\n", + " summary_column_name='Primary Type Max Date', \n", + " summary_function=dprep.SummaryFunction.MAX)],\n", + " group_by_columns=['Primary Type'])\n", + "dflow_min.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### SummaryFunction.MEAN\n", + "Data Prep can find the statistical mean of a column." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep\n", + "dflow = dprep.auto_read_file(path='../data/crime-dirty.csv')\n", + "dflow_min = dflow.summarize(\n", + " summary_columns=[\n", + " dprep.SummaryColumnsValue(\n", + " column_id='Latitude',\n", + " summary_column_name='Primary Type Latitude Mean', \n", + " summary_function=dprep.SummaryFunction.MEAN)],\n", + " group_by_columns=['Primary Type'])\n", + "dflow_min.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### SummaryFunction.MEDIAN\n", + "Data Prep can find the median value of a column." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep\n", + "dflow = dprep.auto_read_file(path='../data/crime-dirty.csv')\n", + "dflow_min = dflow.summarize(\n", + " summary_columns=[\n", + " dprep.SummaryColumnsValue(\n", + " column_id='Latitude',\n", + " summary_column_name='Primary Type Latitude Median', \n", + " summary_function=dprep.SummaryFunction.MEDIAN)],\n", + " group_by_columns=['Primary Type'])\n", + "dflow_min.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### SummaryFunction.VAR\n", + "Data Prep can find the statistical variance of a column. We will need more than one data point to calculate this, otherwise we will be unable to give results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep\n", + "dflow = dprep.auto_read_file(path='../data/crime-dirty.csv')\n", + "dflow_min = dflow.summarize(\n", + " summary_columns=[\n", + " dprep.SummaryColumnsValue(\n", + " column_id='Latitude',\n", + " summary_column_name='Primary Type Latitude Variance', \n", + " summary_function=dprep.SummaryFunction.VAR)],\n", + " group_by_columns=['Primary Type'])\n", + "dflow_min.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that despite there being two cases of BATTERY, one of them is missing geographical location, thus only CRIMINAL DAMAGE can yield variance information. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### SummaryFunction.SD\n", + "Data Prep can find the standard deviation of a column. We will need more than one data point to calculate this, otherwise we will be unable to give results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep\n", + "dflow = dprep.auto_read_file(path='../data/crime-dirty.csv')\n", + "dflow_min = dflow.summarize(\n", + " summary_columns=[\n", + " dprep.SummaryColumnsValue(\n", + " column_id='Latitude',\n", + " summary_column_name='Primary Type Latitude Standard Deviation', \n", + " summary_function=dprep.SummaryFunction.SD)],\n", + " group_by_columns=['Primary Type'])\n", + "dflow_min.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Similar to when we calculate variance, despite there being two cases of BATTERY, one of them is missing geographical location, thus only CRIMINAL DAMAGE can yield variance information. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### SummaryFunction.COUNT\n", + "Data Prep can count the number of rows that have a column with non-null values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep\n", + "dflow = dprep.auto_read_file(path='../data/crime-dirty.csv')\n", + "dflow_min = dflow.summarize(\n", + " summary_columns=[\n", + " dprep.SummaryColumnsValue(\n", + " column_id='Latitude',\n", + " summary_column_name='Primary Type Latitude Count', \n", + " summary_function=dprep.SummaryFunction.COUNT)],\n", + " group_by_columns=['Primary Type'])\n", + "dflow_min.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that despite there being two cases of BATTERY, one of them is missing geographical location, thus when we group by Primary Type, we only get a count of one for Latitude." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### SummaryFunction.SUM\n", + "Data Prep can aggregate and sum the values of a column. Our dataset does not have many numerical facts, but here we sum IDs grouped by Primary Type." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep\n", + "dflow = dprep.auto_read_file(path='../data/crime-dirty.csv')\n", + "dflow_min = dflow.summarize(\n", + " summary_columns=[\n", + " dprep.SummaryColumnsValue(\n", + " column_id='ID',\n", + " summary_column_name='Primary Type ID Sum', \n", + " summary_function=dprep.SummaryFunction.SUM)],\n", + " group_by_columns=['Primary Type'])\n", + "dflow_min.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### SummaryFunction.SKEWNESS\n", + "Data Prep can calculate the skewness of data in a column. We will need more than one data point to calculate this, otherwise we will be unable to give results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep\n", + "dflow = dprep.auto_read_file(path='../data/crime-dirty.csv')\n", + "dflow_min = dflow.summarize(\n", + " summary_columns=[\n", + " dprep.SummaryColumnsValue(\n", + " column_id='Latitude',\n", + " summary_column_name='Primary Type Latitude Skewness', \n", + " summary_function=dprep.SummaryFunction.SKEWNESS)],\n", + " group_by_columns=['Primary Type'])\n", + "dflow_min.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### SummaryFunction.KURTOSIS\n", + "Data Prep can calculate the kurtosis of data in a column. We will need more than one data point to calculate this, otherwise we will be unable to give results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep\n", + "dflow = dprep.auto_read_file(path='../data/crime-dirty.csv')\n", + "dflow_min = dflow.summarize(\n", + " summary_columns=[\n", + " dprep.SummaryColumnsValue(\n", + " column_id='Latitude',\n", + " summary_column_name='Primary Type Latitude Kurtosis', \n", + " summary_function=dprep.SummaryFunction.KURTOSIS)],\n", + " group_by_columns=['Primary Type'])\n", + "dflow_min.head(10)" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/working-with-file-streams.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/working-with-file-streams.ipynb new file mode 100644 index 00000000..90552eb1 --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/working-with-file-streams.ipynb @@ -0,0 +1,186 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Working With File Streams\n", + "Copyright (c) Microsoft Corporation. All rights reserved.
\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In addition to loading and parsing tabular data (see [here](./data-ingestion.ipynb) for more details), Data Prep also supports a variety of operations on raw file streams. \n", + "\n", + "File streams are usually created by calling `Dataflow.get_files`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dprep.Dataflow.get_files(path='../data/*.csv')\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The result of this operation is a Dataflow with a single column named \"Path\". This column contains values of type `StreamInfo`, each of which represents a different file matched by the search pattern specified when calling `get_files`. The string representation of a `StreamInfo` follows this pattern:\n", + "\n", + "StreamInfo(_Location_://_ResourceIdentifier_\\[_Arguments_\\])\n", + "\n", + "Location is the type of storage where the stream is located (e.g. Azure Blob, Local, or ADLS); ResourceIdentifier is the name of the file within that storage, such as a file path; and Arguments is a list of arguments required to load and read the file.\n", + "\n", + "On their own, `StreamInfo` objects are not particularly useful; however, you can use them as input to other functions." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Retrieving File Names\n", + "\n", + "In the example above, we matched a set of CSV files by using a search pattern and got back a column with several `StreamInfo` objects, each representing a different file. Now, we will extract the file path and name for each of these values into a new string column." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dflow.add_column(expression=dprep.get_stream_name(dflow['Path']),\n", + " new_column_name='FilePath',\n", + " prior_column='Path')\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `get_stream_name` function will return the full name of the file referenced by a `StreamInfo`. In the case of a local file, this will be an absolute path. From here, you can use the `derive_column_by_example` method to extract just the file name." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "first_file_path = dflow.head(1)['FilePath'][0]\n", + "first_file_name = os.path.basename(first_file_path)\n", + "dflow = dflow.derive_column_by_example(new_column_name='FileName',\n", + " source_columns=['FilePath'],\n", + " example_data=(first_file_path, first_file_name))\n", + "dflow = dflow.drop_columns(['FilePath'])\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Writing Streams\n", + "\n", + "Whenever you have a column containing `StreamInfo` objects, it's possible to write these out to any of the locations Data Prep supports. You can do this by calling `Dataflow.write_streams`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow.write_streams(streams_column='Path', base_path=dprep.LocalFileOutput('./test_out/')).run_local()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `base_path` parameter specifies the location the files will be written to. By default, the name of the file will be the resource identifier of the stream with any invalid characters replaced by `_`. In the case of streams referencing local files, this would be the full path of the original file. 
You can also specify the desired file names by referencing a column containing them:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow.write_streams(streams_column='Path',\n", + " base_path=dprep.LocalFileOutput('./test_out/'),\n", + " file_names_column='FileName').run_local()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using this functionality, you can transfer files from any source to any destination supported by Data Prep. In addition, since the streams are just values in the Dataflow, you can use all of the functionality available.\n", + "\n", + "Here, for example, we will write out only the files that start with the prefix \"crime-\". The resulting file names will have the prefix stripped and will be written to a folder named \"crime\"." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "prefix = 'crime-'\n", + "dflow = dflow.filter(dflow['FileName'].starts_with(prefix))\n", + "dflow = dflow.add_column(expression=dflow['FileName'].substring(len(prefix)),\n", + " new_column_name='CleanName',\n", + " prior_column='FileName')\n", + "dflow.write_streams(streams_column='Path',\n", + " base_path=dprep.LocalFileOutput('./test_out/crime/'),\n", + " file_names_column='CleanName').run_local()" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/writing-data.ipynb 
b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/writing-data.ipynb new file mode 100644 index 00000000..46b17cff --- /dev/null +++ b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/writing-data.ipynb @@ -0,0 +1,177 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Writing Data\n", + "Copyright (c) Microsoft Corporation. All rights reserved.
\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It is possible to write out the data at any point in a Dataflow. These writes are added as steps to the resulting Dataflow and will be executed every time the Dataflow is executed. Since there are no limitations to how many write steps there are in a pipeline, this makes it easy to write out intermediate results for troubleshooting or to be picked up by other pipelines.\n", + "\n", + "It is important to note that the execution of each write results in a full pull of the data in the Dataflow. For example, a Dataflow with three write steps will read and process every record in the dataset three times." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Writing to Files" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Data can be written to files in any of our supported locations (Local File System, Azure Blob Storage, and Azure Data Lake Storage). In order to parallelize the write, the data is written to multiple partition files. A sentinel file named SUCCESS is also output once the write has completed. This makes it possible to identify when an intermediate write has completed without having to wait for the whole pipeline to complete.\n", + "\n", + "> When running a Dataflow in Spark, attempting to execute a write to an existing folder will fail. It is important to ensure the folder is empty or use a different target location per execution.\n", + "\n", + "The following file formats are currently supported:\n", + "- Delimited Files (CSV, TSV, etc.)\n", + "- Parquet Files" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll start by loading data into a Dataflow which will be re-used with different formats." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow = dprep.auto_read_file('../data/crime.txt')\n", + "dflow = dflow.to_number('Column2')\n", + "dflow.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Delimited Files" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we create a dataflow with a write step.\n", + "\n", + "This operation is lazy until we invoke `run_local` (or any operation that forces execution like `to_pandas_dataframe`), only then will we execute the write operation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_write = dflow.write_to_csv(directory_path=dprep.LocalFileOutput('./test_out/'))\n", + "\n", + "dflow_write.run_local()\n", + "\n", + "dflow_written_files = dprep.read_csv('./test_out/part-*')\n", + "dflow_written_files.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The data we wrote out contains several errors in the numeric columns due to numbers that we were unable to parse. When written out to CSV, these are replaced with the string \"ERROR\" by default. We can parameterize this as part of our write call. In the same vein, it is also possible to set what string to use to represent null values." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_write_errors = dflow.write_to_csv(directory_path=dprep.LocalFileOutput('./test_out/'), \n", + " error='BadData',\n", + " na='NA')\n", + "dflow_write_errors.run_local()\n", + "dflow_written = dprep.read_csv('./test_out/part-*')\n", + "dflow_written.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Parquet Files" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Similar to `write_to_csv`, `write_to_parquet` returns a new Dataflow with a Write Parquet Step which hasn't been executed yet.\n", + "\n", + "Then we run the Dataflow with `run_local`, which executes the write operation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dflow_write_parquet = dflow.write_to_parquet(directory_path=dprep.LocalFileOutput('./test_parquet_out/'),\n", + " error='MiscreantData')\n", + "\n", + "dflow_write_parquet.run_local()\n", + "\n", + "dflow_written_parquet = dprep.read_parquet_file('./test_parquet_out/part-*')\n", + "dflow_written_parquet.head(5)" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/tutorials/getting-started/getting-started.ipynb b/how-to-use-azureml/work-with-data/dataprep/tutorials/getting-started/getting-started.ipynb new file mode 100644 index 00000000..23ad7a4c --- /dev/null +++ 
b/how-to-use-azureml/work-with-data/dataprep/tutorials/getting-started/getting-started.ipynb @@ -0,0 +1,435 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Getting started with Azure ML Data Prep SDK\n", + "Copyright (c) Microsoft Corporation. All rights reserved.
\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "Wonder how you can make the most of the Azure ML Data Prep SDK? In this \"Getting Started\" guide, we'll demonstrate how to do your normal data wrangling with this SDK and showcase a few highlights that make this SDK shine. Using a sample of this [Kaggle crime dataset](https://www.kaggle.com/currie32/crimes-in-chicago/home) as an example, we'll cover how to:\n", + "\n", + "* [Read in data](#Read)\n", + "* [Profile your data](#Profile)\n", + "* [Append rows](#Append)\n", + "* [Apply common data science transforms](#Data-science-transforms)\n", + " * [Summarize](#Summarize)\n", + " * [Join](#Join)\n", + " * [Filter](#Filter)\n", + " * [Replace](#Replace)\n", + "* [Consume your cleaned dataset](#Consume)\n", + "* [Explore advanced features](#Explore)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import display\n", + "from os import path\n", + "from tempfile import mkdtemp\n", + "\n", + "import pandas as pd\n", + "import azureml.dataprep as dprep\n", + "\n", + "# Paths for datasets\n", + "file_crime_dirty = '../../data/crime-dirty.csv'\n", + "file_crime_spring = '../../data/crime-spring.csv'\n", + "file_crime_winter = '../../data/crime-winter.csv'\n", + "file_aldermen = '../../data/chicago-aldermen-2015.csv'\n", + "\n", + "# Seed\n", + "RAND_SEED = 7251" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read in data\n", + "\n", + "Azure ML Data Prep supports many different file reading formats (i.e. CSV, Excel, Parquet) and the ability to infer column types automatically. 
To see how powerful the `auto_read_file` capability is, let's take a peek at the `crime-dirty.csv`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dprep.read_csv(path=file_crime_dirty).head(7)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A common occurrence in many datasets is to have a column of values with commas; in our case, the last column represents location in the form of longitude-latitude pair. The default CSV reader interprets this comma as a delimiter and thus splits the data into two columns. Furthermore, it incorrectly reads in the header as the column name. Normally, we would need to `skip` the header and specify the delimiter as `|`, but our `auto_read_file` eliminates that work:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "crime_dirty = dprep.auto_read_file(path=file_crime_dirty)\n", + "\n", + "crime_dirty.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__Advanced features:__ if you'd like to specify the file type and adjust how you want to read files in, you can see the list of our specialized file readers and how to use them [here](../../how-to-guides/data-ingestion.ipynb)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Profile your data\n", + "\n", + "Let's understand what our data looks like. Azure ML Data Prep facilitates this process by offering data profiles that help us glimpse into column types and column summary statistics. 
Notice that our auto file reader automatically guessed the column type:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "crime_dirty.get_profile()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Append rows\n", + "\n", + "What if your data is split across multiple files? We support the ability to append multiple datasets column-wise and row-wise. Here, we demonstrate how you can coalesce datasets row-wise:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Datasets with the same schema as crime_dirty\n", + "crime_winter = dprep.auto_read_file(path=file_crime_winter)\n", + "crime_spring = dprep.auto_read_file(path=file_crime_spring)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "crime = (crime_dirty.append_rows(dataflows=[crime_winter, crime_spring]))\n", + "\n", + "crime.take_sample(probability=0.25, seed=RAND_SEED).head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__Advanced features:__ you can learn how to append column-wise and how to deal with appending data with different schemas [here](../../how-to-guides/append-columns-and-rows.ipynb)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Apply common data science transforms\n", + "\n", + "Azure ML Data Prep supports almost all common data science transforms found in other industry-standard data science libraries. Here, we'll explore the ability to `summarize`, `join`, `filter`, and `replace`. 
\n", + "\n", + "__Advanced features:__\n", + "* We also provide \"smart\" transforms not found in pandas that use machine learning to [derive new columns](../../how-to-guides/derive-column-by-example.ipynb), [split columns](../../how-to-guides/split-column-by-example.ipynb), and [fuzzy grouping](../../how-to-guides/fuzzy-group.ipynb).\n", + "* Finally, we also help featurize your dataset to prepare it for machine learning; learn more about our featurizers like [one-hot encoder](../../how-to-guides/one-hot-encoder.ipynb), [label encoder](../../how-to-guides/label-encoder.ipynb), [min-max scaler](../../how-to-guides/min-max-scaler.ipynb), and [random (train-test) split](../../how-to-guides/random-split.ipynb).\n", + "* Our complete list of example Notebooks for transforms can be found in our [How-to Guides](../../how-to-guides)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Summarize\n", + "\n", + "Let's see which wards had the most crimes in our sample dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "crime_summary = (crime\n", + " .summarize(\n", + " summary_columns=[\n", + " dprep.SummaryColumnsValue(\n", + " column_id='ID', \n", + " summary_column_name='total_ward_crimes', \n", + " summary_function=dprep.SummaryFunction.COUNT\n", + " )\n", + " ],\n", + " group_by_columns=['Ward']\n", + " )\n", + ")\n", + "\n", + "(crime_summary\n", + " .sort(sort_order=[('total_ward_crimes', True)])\n", + " .head(5)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Join\n", + "\n", + "Let's annotate each observation with more information about the ward where the crime occurred. 
Let's do so by joining `crime` with a dataset which lists the current aldermen for each ward:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "aldermen = dprep.auto_read_file(path=file_aldermen)\n", + "\n", + "aldermen.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "crime.join(\n", + " left_dataflow=crime,\n", + " right_dataflow=aldermen,\n", + " join_key_pairs=[\n", + " ('Ward', 'Ward')\n", + " ]\n", + ").head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__Advanced features:__ [Learn more](../../how-to-guides/join.ipynb) about how you can do all variants of `join`, like inner-, left-, right-, anti-, and semi-joins." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Filter\n", + "\n", + "Let's look at theft crimes:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "theft = crime.filter(crime['Primary Type'] == 'THEFT')\n", + "\n", + "theft.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Replace\n", + "\n", + "Notice that our `theft` dataset has empty strings in column `Location`. 
Let's replace those with a missing value:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "theft_replaced = (theft\n", + " .replace_na(\n", + " columns=['Location'], \n", + " use_empty_string_as_na=True\n", + " )\n", + ")\n", + "\n", + "theft_replaced.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__Advanced features:__ [Learn more](../../how-to-guides/replace-fill-error.ipynb) about more advanced `replace` and `fill` capabilities." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Consume your cleaned dataset\n", + "\n", + "Azure ML Data Prep allows you to \"choose your own adventure\" once you're done wrangling. You can:\n", + "\n", + "1. Write to a pandas dataframe\n", + "2. Execute on Spark\n", + "3. Consume directly in Azure Machine Learning models\n", + "\n", + "In this quickstart guide, we'll show how you can export to a pandas dataframe.\n", + "\n", + "__Advanced features:__ \n", + "* One of the beautiful features of Azure ML Data Prep is that you only need to write your code once and choose whether to scale up or out.\n", + "* You can directly consume your new DataFlow in model builders like Azure Machine Learning's [automated machine learning](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/automated-machine-learning/dataprep/auto-ml-dataprep.ipynb)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "theft_replaced.to_pandas_dataframe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explore advanced features\n", + "\n", + "Congratulations on finishing your introduction to the Azure ML Data Prep SDK! 
If you'd like more detailed tutorials on how to construct machine learning datasets or dive deeper into all of its functionality, you can find more information in our detailed notebooks [here](../../how-to-guides). There, we cover topics including how to:\n", + "\n", + "* [Cache your Dataflow to speed up your iterations](../../how-to-guides/cache.ipynb)\n", + "* [Add your custom Python transforms](../../how-to-guides/custom-python-transforms.ipynb)\n", + "* [Impute missing values](../../how-to-guides/impute-missing-values.ipynb)\n", + "* [Sample your data](../../how-to-guides/subsetting-sampling.ipynb)\n", + "* [Reference and link between Dataflows](../../how-to-guides/join.ipynb)" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "sihhu" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file