From 2e404cfc3a4ebccda9e743edd39bd03732671b9e Mon Sep 17 00:00:00 2001 From: vizhur Date: Mon, 14 Oct 2019 22:30:58 +0000 Subject: [PATCH] update samples from Release-153 as a part of 1.0.69 SDK release --- README.md | 4 - configuration.ipynb | 2 +- .../RAPIDS/azure-ml-with-nvidia-rapids.ipynb | 69 +- contrib/RAPIDS/process_data.py | 93 +- end-to-end-samples/README.md | 0 .../automated-machine-learning/automl_env.yml | 2 +- .../automl_env_mac.yml | 2 +- .../auto-ml-classification-bank-marketing.yml | 1 + ...to-ml-classification-credit-card-fraud.yml | 1 + .../auto-ml-dataset-remote-execution.yml | 1 + .../auto-ml-forecasting-energy-demand.ipynb | 4 +- .../auto-ml-forecasting-energy-demand.yml | 3 +- .../automl-forecasting-function.ipynb | 615 +++ .../automl-forecasting-function.yml | 9 + ...to-ml-forecasting-orange-juice-sales.ipynb | 2 +- ...ml-model-explanations-remote-compute.ipynb | 2 +- ...o-ml-model-explanations-remote-compute.yml | 3 +- .../auto-ml-model-explanation.ipynb | 6 +- .../auto-ml-model-explanation.yml | 3 +- .../auto-ml-regression-concrete-strength.yml | 1 + ...uto-ml-regression-hardware-performance.yml | 1 + .../auto-ml-remote-amlcompute-with-onnx.yml | 1 + .../auto-ml-remote-amlcompute.yml | 1 + .../explain-model-on-amlcompute.ipynb | 18 +- .../explain-model-on-amlcompute.yml | 5 +- .../remote-explanation/train_explain.py | 4 +- ...ve-retrieve-explanations-run-history.ipynb | 20 +- ...save-retrieve-explanations-run-history.yml | 5 +- ...ain-explain-model-locally-and-deploy.ipynb | 10 +- ...train-explain-model-locally-and-deploy.yml | 5 +- ...plain-model-on-amlcompute-and-deploy.ipynb | 12 +- ...explain-model-on-amlcompute-and-deploy.yml | 6 +- .../scoring-time/train_explain.py | 6 +- ...eature-transformations-explain-local.ipynb | 18 +- ...-feature-transformations-explain-local.yml | 5 +- .../explain-binary-classification-local.ipynb | 18 +- .../explain-binary-classification-local.yml | 5 +- ...lain-multiclass-classification-local.ipynb | 18 +- ...xplain-multiclass-classification-local.yml | 5 +- .../explain-regression-local.ipynb | 18 +- .../tabular-data/explain-regression-local.yml | 5 +- ...eature-transformations-explain-local.ipynb | 22 +- ...-feature-transformations-explain-local.yml | 5 +- .../aml-pipelines-data-transfer.ipynb | 38 +- ...l-pipelines-how-to-use-estimatorstep.ipynb | 6 +- ...asing-datapath-and-pipelineparameter.ipynb | 6 +- .../pytorch_mnist.py | 2 +- .../tf_mnist.py | 2 +- ...rparameter-tune-deploy-with-tensorflow.yml | 3 +- .../tf_mnist_with_checkpoint.py | 2 +- .../train-tensorflow-resume-training.yml | 2 +- .../data-drift/azure-ml-datadrift.ipynb | 42 +- .../data-drift/azure-ml-datadrift.yml | 2 +- .../logging-api/logging-api.ipynb | 2 +- .../tensorboard/tensorboard.yml | 2 +- .../export-run-history-to-tensorboard.yml | 2 +- how-to-use-azureml/work-with-data/README.md | 21 +- .../work-with-data/dataprep/README.md | 300 -- .../new-york-taxi/new-york-taxi.ipynb | 513 -- .../new-york-taxi_scale-out.ipynb | 135 - .../dataprep/data/10x10-float64-csr.npz | Bin 1015 -> 0 bytes .../dataprep/data/ADLSgen2-datapreptest.crt | 45 - .../dataprep/data/adls-dpreptestfiles.crt | 45 - .../dataprep/data/chicago-aldermen-2015.csv | 54 - .../dataprep/data/crime-dirty.csv | 15 - .../dataprep/data/crime-full.csv | 1001 ---- .../dataprep/data/crime-spring.csv | 11 - .../dataprep/data/crime-winter.csv | 11 - .../work-with-data/dataprep/data/crime.dprep | 204 - .../work-with-data/dataprep/data/crime.jsonl | 10 - .../dataprep/data/crime.parquet | Bin 3607 -> 0 bytes 
.../work-with-data/dataprep/data/crime.txt | 10 - .../work-with-data/dataprep/data/crime.xlsx | Bin 16109 -> 0 bytes .../work-with-data/dataprep/data/crime.zip | Bin 3685 -> 0 bytes .../dataprep/data/crime_duplicate_headers.csv | 12 - .../dataprep/data/crime_fixed_width_file.txt | 10 - .../data/crime_multiple_separators.csv | 11 - .../dataprep/data/crime_partfiles/_SUCCESS | 0 ...8e77b-f17a-4c20-972c-aa382e830fca-c000.csv | 914 ---- ...8e77b-f17a-4c20-972c-aa382e830fca-c000.csv | 921 ---- ...8e77b-f17a-4c20-972c-aa382e830fca-c000.csv | 930 ---- ...8e77b-f17a-4c20-972c-aa382e830fca-c000.csv | 953 ---- ...8e77b-f17a-4c20-972c-aa382e830fca-c000.csv | 923 ---- ...8e77b-f17a-4c20-972c-aa382e830fca-c000.csv | 887 ---- ...8e77b-f17a-4c20-972c-aa382e830fca-c000.csv | 971 ---- ...8e77b-f17a-4c20-972c-aa382e830fca-c000.csv | 759 --- .../work-with-data/dataprep/data/file-url.csv | 4 - .../work-with-data/dataprep/data/json.json | 1306 ----- .../dataprep/data/large_dflow.json | 4415 ----------------- .../work-with-data/dataprep/data/map_func.py | 4 - .../dataprep/data/median_income.csv | 251 - .../data/median_income_transformed.csv | 251 - .../dataprep/data/parquet.parquet | Bin 3091 -> 0 bytes ...7a7-c3cd-4926-92b2-ba2dcd3f95b7.gz.parquet | Bin 6078 -> 0 bytes ...7a7-c3cd-4926-92b2-ba2dcd3f95b7.gz.parquet | Bin 5083 -> 0 bytes .../dataprep/data/secrets.dprep | 63 - .../dataprep/data/stream-path.csv | 11 - .../add-column-using-expression.ipynb | 360 -- .../append-columns-and-rows.ipynb | 251 - .../dataprep/how-to-guides/assertions.ipynb | 133 - .../how-to-guides/auto-read-file.ipynb | 189 - .../dataprep/how-to-guides/cache.ipynb | 194 - .../how-to-guides/column-manipulations.ipynb | 563 --- .../column-type-transforms.ipynb | 473 -- .../custom-python-transforms.ipynb | 232 - .../how-to-guides/data-ingestion.ipynb | 1210 ----- .../dataprep/how-to-guides/data-profile.ipynb | 179 - .../dataprep/how-to-guides/datastore.ipynb | 246 - .../derive-column-by-example.ipynb | 187 - .../how-to-guides/external-references.ipynb | 118 - .../dataprep/how-to-guides/filtering.ipynb | 220 - .../dataprep/how-to-guides/fuzzy-group.ipynb | 211 - .../how-to-guides/impute-missing-values.ipynb | 147 - .../dataprep/how-to-guides/join.ipynb | 265 - .../how-to-guides/label-encoder.ipynb | 168 - .../how-to-guides/min-max-scaler.ipynb | 239 - .../how-to-guides/one-hot-encoder.ipynb | 179 - .../how-to-guides/open-save-dataflows.ipynb | 184 - .../quantile-transformation.ipynb | 91 - .../dataprep/how-to-guides/random-split.ipynb | 170 - ...replace-datasource-replace-reference.ipynb | 130 - .../how-to-guides/replace-fill-error.ipynb | 239 - .../dataprep/how-to-guides/secrets.ipynb | 140 - .../how-to-guides/semantic-types.ipynb | 164 - .../split-column-by-example.ipynb | 220 - .../how-to-guides/subsetting-sampling.ipynb | 240 - .../dataprep/how-to-guides/summarize.ipynb | 590 --- .../working-with-file-streams.ipynb | 212 - .../dataprep/how-to-guides/writing-data.ipynb | 183 - .../getting-started/getting-started.ipynb | 433 -- .../dataset-api-change-notice.md | 5 +- ...tabular-timeseries-dataset-filtering.ipynb | 30 +- .../datasets-tutorial/train-dataset/iris.csv | 151 + .../train-with-datasets.ipynb | 61 +- .../work-with-data/datasets/README.md | 22 - .../train-dataset/Titanic.csv | 892 ---- index.md | 155 +- setup-environment/configuration.ipynb | 2 +- .../img-classification-part1-training.ipynb | 3 +- 139 files changed, 1250 insertions(+), 25949 deletions(-) delete mode 100644 end-to-end-samples/README.md create mode 100644 
how-to-use-azureml/automated-machine-learning/forecasting-high-frequency/automl-forecasting-function.ipynb create mode 100644 how-to-use-azureml/automated-machine-learning/forecasting-high-frequency/automl-forecasting-function.yml delete mode 100644 how-to-use-azureml/work-with-data/dataprep/README.md delete mode 100644 how-to-use-azureml/work-with-data/dataprep/case-studies/new-york-taxi/new-york-taxi.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/case-studies/new-york-taxi/new-york-taxi_scale-out.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/10x10-float64-csr.npz delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/ADLSgen2-datapreptest.crt delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/adls-dpreptestfiles.crt delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/chicago-aldermen-2015.csv delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime-dirty.csv delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime-full.csv delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime-spring.csv delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime-winter.csv delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime.dprep delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime.jsonl delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime.parquet delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime.txt delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime.xlsx delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime.zip delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime_duplicate_headers.csv delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime_fixed_width_file.txt delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime_multiple_separators.csv delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime_partfiles/_SUCCESS delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime_partfiles/part-00000-0b08e77b-f17a-4c20-972c-aa382e830fca-c000.csv delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime_partfiles/part-00001-0b08e77b-f17a-4c20-972c-aa382e830fca-c000.csv delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime_partfiles/part-00002-0b08e77b-f17a-4c20-972c-aa382e830fca-c000.csv delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime_partfiles/part-00003-0b08e77b-f17a-4c20-972c-aa382e830fca-c000.csv delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime_partfiles/part-00004-0b08e77b-f17a-4c20-972c-aa382e830fca-c000.csv delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime_partfiles/part-00005-0b08e77b-f17a-4c20-972c-aa382e830fca-c000.csv delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime_partfiles/part-00006-0b08e77b-f17a-4c20-972c-aa382e830fca-c000.csv delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/crime_partfiles/part-00007-0b08e77b-f17a-4c20-972c-aa382e830fca-c000.csv delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/file-url.csv delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/json.json delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/large_dflow.json delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/map_func.py delete mode 100644 
how-to-use-azureml/work-with-data/dataprep/data/median_income.csv delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/median_income_transformed.csv delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/parquet.parquet delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/parquet_dataset/Arrest=false/part-00000-34f8a7a7-c3cd-4926-92b2-ba2dcd3f95b7.gz.parquet delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/parquet_dataset/Arrest=true/part-00000-34f8a7a7-c3cd-4926-92b2-ba2dcd3f95b7.gz.parquet delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/secrets.dprep delete mode 100644 how-to-use-azureml/work-with-data/dataprep/data/stream-path.csv delete mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/add-column-using-expression.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/append-columns-and-rows.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/assertions.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/auto-read-file.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/cache.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/column-manipulations.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/column-type-transforms.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/custom-python-transforms.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/data-ingestion.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/data-profile.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/datastore.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/derive-column-by-example.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/external-references.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/filtering.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/fuzzy-group.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/impute-missing-values.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/join.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/label-encoder.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/min-max-scaler.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/one-hot-encoder.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/open-save-dataflows.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/quantile-transformation.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/random-split.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/replace-datasource-replace-reference.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/replace-fill-error.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/secrets.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/semantic-types.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/split-column-by-example.ipynb delete mode 100644 
how-to-use-azureml/work-with-data/dataprep/how-to-guides/subsetting-sampling.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/summarize.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/working-with-file-streams.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/how-to-guides/writing-data.ipynb delete mode 100644 how-to-use-azureml/work-with-data/dataprep/tutorials/getting-started/getting-started.ipynb rename how-to-use-azureml/work-with-data/{datasets => }/dataset-api-change-notice.md (97%) rename how-to-use-azureml/work-with-data/{datasets => }/datasets-tutorial/tabular-timeseries-dataset-filtering.ipynb (96%) create mode 100644 how-to-use-azureml/work-with-data/datasets-tutorial/train-dataset/iris.csv rename how-to-use-azureml/work-with-data/{datasets => }/datasets-tutorial/train-with-datasets.ipynb (93%) delete mode 100644 how-to-use-azureml/work-with-data/datasets/README.md delete mode 100644 how-to-use-azureml/work-with-data/datasets/datasets-tutorial/train-dataset/Titanic.csv diff --git a/README.md b/README.md index c898f7b8..b9e9241b 100644 --- a/README.md +++ b/README.md @@ -12,10 +12,6 @@ pip install azureml-sdk Read more detailed instructions on [how to set up your environment](./NBSETUP.md) using Azure Notebook service, your own Jupyter notebook server, or Docker. ## How to navigate and use the example notebooks? - -This [index](https://github.com/Azure/MachineLearningNotebooks/blob/master/index.md) should assist in navigating the Azure Machine Learning notebook samples and encourage efficient retrieval of topics and content. - - If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, you should always run the [Configuration](./configuration.ipynb) notebook first when setting up a notebook library on a new machine or in a new environment. It configures your notebook library to connect to an Azure Machine Learning workspace, and sets up your workspace and compute to be used by many of the other examples. If you want to... diff --git a/configuration.ipynb b/configuration.ipynb index 95532139..d73d1345 100644 --- a/configuration.ipynb +++ b/configuration.ipynb @@ -103,7 +103,7 @@ "source": [ "import azureml.core\n", "\n", - "print(\"This notebook was created using version 1.0.65 of the Azure ML SDK\")\n", + "print(\"This notebook was created using version 1.0.69 of the Azure ML SDK\")\n", "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")" ] }, diff --git a/contrib/RAPIDS/azure-ml-with-nvidia-rapids.ipynb b/contrib/RAPIDS/azure-ml-with-nvidia-rapids.ipynb index 97fecf56..3d8007cc 100644 --- a/contrib/RAPIDS/azure-ml-with-nvidia-rapids.ipynb +++ b/contrib/RAPIDS/azure-ml-with-nvidia-rapids.ipynb @@ -9,6 +9,13 @@ "Licensed under the MIT License." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/contrib/RAPIDS/azure-ml-with-nvidia-rapids/azure-ml-with-nvidia-rapids.png)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -20,7 +27,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The [RAPIDS](https://www.developer.nvidia.com/rapids) suite of software libraries from NVIDIA enables the execution of end-to-end data science and analytics pipelines entirely on GPUs. 
In many machine learning projects, a significant portion of the model training time is spent in setting up the data; this stage of the process is known as Extraction, Transformation and Loading, or ETL. By using the DataFrame API for ETL\u00c3\u201a\u00c2\u00a0and GPU-capable ML algorithms in RAPIDS, data preparation and training models can be done in GPU-accelerated end-to-end pipelines without incurring serialization costs between the pipeline stages. This notebook demonstrates how to use NVIDIA RAPIDS to prepare data and train model\u00c2\u00a0in Azure.\n", + "The [RAPIDS](https://www.developer.nvidia.com/rapids) suite of software libraries from NVIDIA enables the execution of end-to-end data science and analytics pipelines entirely on GPUs. In many machine learning projects, a significant portion of the model training time is spent in setting up the data; this stage of the process is known as Extraction, Transformation and Loading, or ETL. By using the DataFrame API for ETL\u00c2\u00a0and GPU-capable ML algorithms in RAPIDS, data preparation and training models can be done in GPU-accelerated end-to-end pipelines without incurring serialization costs between the pipeline stages. This notebook demonstrates how to use NVIDIA RAPIDS to prepare data and train model\u00c3\u201a\u00c2\u00a0in Azure.\n", " \n", "In this notebook, we will do the following:\n", " \n", @@ -119,8 +126,10 @@ "outputs": [], "source": [ "ws = Workspace.from_config()\n", + "\n", "# if a locally-saved configuration file for the workspace is not available, use the following to load workspace\n", "# ws = Workspace(subscription_id=subscription_id, resource_group=resource_group, workspace_name=workspace_name)\n", + "\n", "print('Workspace name: ' + ws.name, \n", " 'Azure region: ' + ws.location, \n", " 'Subscription id: ' + ws.subscription_id, \n", @@ -161,7 +170,7 @@ "if gpu_cluster_name in ws.compute_targets:\n", " gpu_cluster = ws.compute_targets[gpu_cluster_name]\n", " if gpu_cluster and type(gpu_cluster) is AmlCompute:\n", - " print('found compute target. just use it. ' + gpu_cluster_name)\n", + " print('Found compute target. Will use {0} '.format(gpu_cluster_name))\n", "else:\n", " print(\"creating new cluster\")\n", " # vm_size parameter below could be modified to one of the RAPIDS-supported VM types\n", @@ -183,7 +192,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The _process_data.py_ script used in the step below is a slightly modified implementation of [RAPIDS E2E example](https://github.com/rapidsai/notebooks/blob/master/mortgage/E2E.ipynb)." + "The _process_data.py_ script used in the step below is a slightly modified implementation of [RAPIDS Mortgage E2E example](https://github.com/rapidsai/notebooks-contrib/blob/master/intermediate_notebooks/E2E/mortgage/mortgage_e2e.ipynb)." ] }, { @@ -194,10 +203,7 @@ "source": [ "# copy process_data.py into the script folder\n", "import shutil\n", - "shutil.copy('./process_data.py', os.path.join(scripts_folder, 'process_data.py'))\n", - "\n", - "with open(os.path.join(scripts_folder, './process_data.py'), 'r') as process_data_script:\n", - " print(process_data_script.read())" + "shutil.copy('./process_data.py', os.path.join(scripts_folder, 'process_data.py'))" ] }, { @@ -221,13 +227,6 @@ "### Downloading Data" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Important: Python package progressbar2 is necessary to run the following cell. If it is not available in your environment where this notebook is running, please install it." 
- ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import tarfile\n", "import hashlib\n", "from urllib.request import urlretrieve\n", - "from progressbar import ProgressBar\n", "\n", "def validate_downloaded_data(path):\n", " if(os.path.isdir(path) and os.path.exists(path + '//names.csv')) :\n", @@ -267,7 +265,7 @@ " url_format = 'http://rapidsai-data.s3-website.us-east-2.amazonaws.com/notebook-mortgage-data/{0}.tgz'\n", " url = url_format.format(fileroot)\n", " print(\"...Downloading file :{0}\".format(filename))\n", - " urlretrieve(url, filename,show_progress)\n", + " urlretrieve(url, filename)\n", " pbar.finish()\n", " print(\"...File :{0} finished downloading\".format(filename))\n", " else:\n", @@ -282,9 +280,7 @@ " so_far = 0\n", " for member_info in members:\n", " tar.extract(member_info,path=path)\n", - " show_progress(so_far, 1, numFiles)\n", " so_far += 1\n", - " pbar.finish()\n", " print(\"...All {0} files have been decompressed\".format(numFiles))\n", " tar.close()" ] @@ -324,7 +320,9 @@ "\n", "# download and uncompress data in a local directory before uploading to data store\n", "# directory specified in src_dir parameter below should have the acq, perf directories with data and names.csv file\n", + "\n", + "# ---->>>> UNCOMMENT THE BELOW LINE TO UPLOAD YOUR DATA IF NOT DONE SO ALREADY <<<<----\n", + "# ds.upload(src_dir=path, target_path=fileroot, overwrite=True, show_progress=True)\n", "\n", "# data already uploaded to the datastore\n", "data_ref = DataReference(data_reference_name='data', datastore=ds, path_on_datastore=fileroot)" @@ -360,7 +358,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The following code shows how to use an existing image from [Docker Hub](https://hub.docker.com/r/rapidsai/rapidsai/) that has a prebuilt conda environment named 'rapids' when creating a RunConfiguration. Note that this conda environment does not include azureml-defaults package that is required for using AML functionality like metrics tracking, model management etc. This package is automatically installed when you use 'Specify package dependencies' option and that is why it is the recommended option to create RunConfiguraiton in AML." + "The following code shows how to install RAPIDS using conda. The `rapids.yml` file contains the list of packages necessary to run this tutorial. **NOTE:** Initial build of the image might take up to 20 minutes as the service needs to build and cache the new image; once the image is built, the subsequent runs use the cached image and the overhead is minimal."
] }, { @@ -369,17 +367,13 @@ "metadata": {}, "outputs": [], "source": [ - "run_config = RunConfiguration()\n", + "cd = CondaDependencies(conda_dependencies_file_path='rapids.yml')\n", + "run_config = RunConfiguration(conda_dependencies=cd)\n", "run_config.framework = 'python'\n", - "run_config.environment.python.user_managed_dependencies = True\n", - "run_config.environment.python.interpreter_path = '/conda/envs/rapids/bin/python'\n", "run_config.target = gpu_cluster_name\n", "run_config.environment.docker.enabled = True\n", "run_config.environment.docker.gpu_support = True\n", - "run_config.environment.docker.base_image = \"rapidsai/rapidsai:cuda9.2-runtime-ubuntu18.04\"\n", - "# run_config.environment.docker.base_image_registry.address = '' # not required if the base_image is in Docker hub\n", - "# run_config.environment.docker.base_image_registry.username = '' # needed only for private images\n", - "# run_config.environment.docker.base_image_registry.password = '' # needed only for private images\n", + "run_config.environment.docker.base_image = \"mcr.microsoft.com/azureml/base-gpu:intelmpi2018.3-cuda10.0-cudnn7-ubuntu16.04\"\n", "run_config.environment.spark.precache_packages = False\n", "run_config.data_references={'data':data_ref.to_config()}" ] @@ -388,14 +382,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Specify package dependencies" + "#### Using Docker" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The following code shows how to list package dependencies in a conda environment definition file (rapids.yml) when creating a RunConfiguration" + "Alternatively, you can specify RAPIDS Docker image." ] }, { @@ -404,16 +398,17 @@ "metadata": {}, "outputs": [], "source": [ - "# cd = CondaDependencies(conda_dependencies_file_path='rapids.yml')\n", - "# run_config = RunConfiguration(conda_dependencies=cd)\n", + "# run_config = RunConfiguration()\n", "# run_config.framework = 'python'\n", + "# run_config.environment.python.user_managed_dependencies = True\n", + "# run_config.environment.python.interpreter_path = '/conda/envs/rapids/bin/python'\n", "# run_config.target = gpu_cluster_name\n", "# run_config.environment.docker.enabled = True\n", "# run_config.environment.docker.gpu_support = True\n", - "# run_config.environment.docker.base_image = \"\"\n", - "# run_config.environment.docker.base_image_registry.address = '' # not required if the base_image is in Docker hub\n", - "# run_config.environment.docker.base_image_registry.username = '' # needed only for private images\n", - "# run_config.environment.docker.base_image_registry.password = '' # needed only for private images\n", + "# run_config.environment.docker.base_image = \"rapidsai/rapidsai:cuda9.2-runtime-ubuntu18.04\"\n", + "# # run_config.environment.docker.base_image_registry.address = '' # not required if the base_image is in Docker hub\n", + "# # run_config.environment.docker.base_image_registry.username = '' # needed only for private images\n", + "# # run_config.environment.docker.base_image_registry.password = '' # needed only for private images\n", "# run_config.environment.spark.precache_packages = False\n", "# run_config.data_references={'data':data_ref.to_config()}" ] @@ -551,9 +546,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.6" + "version": "3.6.8" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } \ No newline at end of file diff --git a/contrib/RAPIDS/process_data.py b/contrib/RAPIDS/process_data.py index 
474cc83a..be8d54de 100644 --- a/contrib/RAPIDS/process_data.py +++ b/contrib/RAPIDS/process_data.py @@ -15,21 +15,6 @@ from glob import glob import os import argparse -def initialize_rmm_pool(): - from librmm_cffi import librmm_config as rmm_cfg - - rmm_cfg.use_pool_allocator = True - #rmm_cfg.initial_pool_size = 2<<30 # set to 2GiB. Default is 1/2 total GPU memory - import cudf - return cudf._gdf.rmm_initialize() - -def initialize_rmm_no_pool(): - from librmm_cffi import librmm_config as rmm_cfg - - rmm_cfg.use_pool_allocator = False - import cudf - return cudf._gdf.rmm_initialize() - def run_dask_task(func, **kwargs): task = func(**kwargs) return task @@ -207,26 +192,26 @@ def gpu_load_names(col_path): def create_ever_features(gdf, **kwargs): everdf = gdf[['loan_id', 'current_loan_delinquency_status']] - everdf = everdf.groupby('loan_id', method='hash').max() + everdf = everdf.groupby('loan_id', method='hash').max().reset_index() del(gdf) - everdf['ever_30'] = (everdf['max_current_loan_delinquency_status'] >= 1).astype('int8') - everdf['ever_90'] = (everdf['max_current_loan_delinquency_status'] >= 3).astype('int8') - everdf['ever_180'] = (everdf['max_current_loan_delinquency_status'] >= 6).astype('int8') - everdf.drop_column('max_current_loan_delinquency_status') + everdf['ever_30'] = (everdf['current_loan_delinquency_status'] >= 1).astype('int8') + everdf['ever_90'] = (everdf['current_loan_delinquency_status'] >= 3).astype('int8') + everdf['ever_180'] = (everdf['current_loan_delinquency_status'] >= 6).astype('int8') + everdf.drop_column('current_loan_delinquency_status') return everdf def create_delinq_features(gdf, **kwargs): delinq_gdf = gdf[['loan_id', 'monthly_reporting_period', 'current_loan_delinquency_status']] del(gdf) - delinq_30 = delinq_gdf.query('current_loan_delinquency_status >= 1')[['loan_id', 'monthly_reporting_period']].groupby('loan_id', method='hash').min() - delinq_30['delinquency_30'] = delinq_30['min_monthly_reporting_period'] - delinq_30.drop_column('min_monthly_reporting_period') - delinq_90 = delinq_gdf.query('current_loan_delinquency_status >= 3')[['loan_id', 'monthly_reporting_period']].groupby('loan_id', method='hash').min() - delinq_90['delinquency_90'] = delinq_90['min_monthly_reporting_period'] - delinq_90.drop_column('min_monthly_reporting_period') - delinq_180 = delinq_gdf.query('current_loan_delinquency_status >= 6')[['loan_id', 'monthly_reporting_period']].groupby('loan_id', method='hash').min() - delinq_180['delinquency_180'] = delinq_180['min_monthly_reporting_period'] - delinq_180.drop_column('min_monthly_reporting_period') + delinq_30 = delinq_gdf.query('current_loan_delinquency_status >= 1')[['loan_id', 'monthly_reporting_period']].groupby('loan_id', method='hash').min().reset_index() + delinq_30['delinquency_30'] = delinq_30['monthly_reporting_period'] + delinq_30.drop_column('monthly_reporting_period') + delinq_90 = delinq_gdf.query('current_loan_delinquency_status >= 3')[['loan_id', 'monthly_reporting_period']].groupby('loan_id', method='hash').min().reset_index() + delinq_90['delinquency_90'] = delinq_90['monthly_reporting_period'] + delinq_90.drop_column('monthly_reporting_period') + delinq_180 = delinq_gdf.query('current_loan_delinquency_status >= 6')[['loan_id', 'monthly_reporting_period']].groupby('loan_id', method='hash').min().reset_index() + delinq_180['delinquency_180'] = delinq_180['monthly_reporting_period'] + delinq_180.drop_column('monthly_reporting_period') del(delinq_gdf) delinq_merge = delinq_30.merge(delinq_90, how='left', 
on=['loan_id'], type='hash') delinq_merge['delinquency_90'] = delinq_merge['delinquency_90'].fillna(np.dtype('datetime64[ms]').type('1970-01-01').astype('datetime64[ms]')) @@ -279,16 +264,15 @@ def create_joined_df(gdf, everdf, **kwargs): def create_12_mon_features(joined_df, **kwargs): testdfs = [] n_months = 12 + for y in range(1, n_months + 1): tmpdf = joined_df[['loan_id', 'timestamp_year', 'timestamp_month', 'delinquency_12', 'upb_12']] tmpdf['josh_months'] = tmpdf['timestamp_year'] * 12 + tmpdf['timestamp_month'] tmpdf['josh_mody_n'] = ((tmpdf['josh_months'].astype('float64') - 24000 - y) / 12).floor() - tmpdf = tmpdf.groupby(['loan_id', 'josh_mody_n'], method='hash').agg({'delinquency_12': 'max','upb_12': 'min'}) - tmpdf['delinquency_12'] = (tmpdf['max_delinquency_12']>3).astype('int32') - tmpdf['delinquency_12'] +=(tmpdf['min_upb_12']==0).astype('int32') - tmpdf.drop_column('max_delinquency_12') - tmpdf['upb_12'] = tmpdf['min_upb_12'] - tmpdf.drop_column('min_upb_12') + tmpdf = tmpdf.groupby(['loan_id', 'josh_mody_n'], method='hash').agg({'delinquency_12': 'max','upb_12': 'min'}).reset_index() + tmpdf['delinquency_12'] = (tmpdf['delinquency_12']>3).astype('int32') + tmpdf['delinquency_12'] +=(tmpdf['upb_12']==0).astype('int32') + tmpdf['upb_12'] = tmpdf['upb_12'] tmpdf['timestamp_year'] = (((tmpdf['josh_mody_n'] * n_months) + 24000 + (y - 1)) / 12).floor().astype('int16') tmpdf['timestamp_month'] = np.int8(y) tmpdf.drop_column('josh_mody_n') @@ -329,6 +313,7 @@ def last_mile_cleaning(df, **kwargs): 'delinquency_30', 'delinquency_90', 'delinquency_180', 'upb_12', 'zero_balance_effective_date','foreclosed_after', 'disposition_date','timestamp' ] + for column in drop_list: df.drop_column(column) for col, dtype in df.dtypes.iteritems(): @@ -342,7 +327,6 @@ def last_mile_cleaning(df, **kwargs): return df.to_arrow(preserve_index=False) def main(): - #print('XGBOOST_BUILD_DOC is ' + os.environ['XGBOOST_BUILD_DOC']) parser = argparse.ArgumentParser("rapidssample") parser.add_argument("--data_dir", type=str, help="location of data") parser.add_argument("--num_gpu", type=int, help="Number of GPUs to use", default=1) @@ -364,7 +348,6 @@ def main(): print('data_dir = {0}'.format(data_dir)) print('num_gpu = {0}'.format(num_gpu)) print('part_count = {0}'.format(part_count)) - #part_count = part_count + 1 # adding one because the usage below is not inclusive print('end_year = {0}'.format(end_year)) print('cpu_predictor = {0}'.format(cpu_predictor)) @@ -380,19 +363,17 @@ def main(): client print(client.ncores()) -# to download data for this notebook, visit https://rapidsai.github.io/demos/datasets/mortgage-data and update the following paths accordingly + # to download data for this notebook, visit https://rapidsai.github.io/demos/datasets/mortgage-data and update the following paths accordingly acq_data_path = "{0}/acq".format(data_dir) #"/rapids/data/mortgage/acq" perf_data_path = "{0}/perf".format(data_dir) #"/rapids/data/mortgage/perf" col_names_path = "{0}/names.csv".format(data_dir) # "/rapids/data/mortgage/names.csv" start_year = 2000 -#end_year = 2000 # end_year is inclusive -- converted to parameter -#part_count = 2 # the number of data files to train against -- converted to parameter - client.run(initialize_rmm_pool) client - print(client.ncores()) -# NOTE: The ETL calculates additional features which are then dropped before creating the XGBoost DMatrix. -# This can be optimized to avoid calculating the dropped features. 
+ print('--->>> Workers used: {0}'.format(client.ncores())) + + # NOTE: The ETL calculates additional features which are then dropped before creating the XGBoost DMatrix. + # This can be optimized to avoid calculating the dropped features. print("Reading ...") t1 = datetime.datetime.now() gpu_dfs = [] @@ -414,14 +395,9 @@ def main(): wait(gpu_dfs) t2 = datetime.datetime.now() - print("Reading time ...") - print(t2-t1) - print('len(gpu_dfs) is {0}'.format(len(gpu_dfs))) - - client.run(cudf._gdf.rmm_finalize) - client.run(initialize_rmm_no_pool) - client - print(client.ncores()) + print("Reading time: {0}".format(str(t2-t1))) + print('--->>> Number of data parts: {0}'.format(len(gpu_dfs))) + dxgb_gpu_params = { 'nround': 100, 'max_depth': 8, @@ -438,7 +414,7 @@ def main(): 'n_gpus': 1, 'distributed_dask': True, 'loss': 'ls', - 'objective': 'gpu:reg:linear', + 'objective': 'reg:squarederror', 'max_features': 'auto', 'criterion': 'friedman_mse', 'grow_policy': 'lossguide', @@ -446,13 +422,13 @@ def main(): } if cpu_predictor: - print('Training using CPUs') + print('\n---->>>> Training using CPUs <<<<----\n') dxgb_gpu_params['predictor'] = 'cpu_predictor' dxgb_gpu_params['tree_method'] = 'hist' dxgb_gpu_params['objective'] = 'reg:linear' else: - print('Training using GPUs') + print('\n---->>>> Training using GPUs <<<<----\n') print('Training parameters are {0}'.format(dxgb_gpu_params)) @@ -481,14 +457,13 @@ def main(): gpu_dfs = [gpu_df.persist() for gpu_df in gpu_dfs] gc.collect() wait(gpu_dfs) - + + # TRAIN THE MODEL labels = None t1 = datetime.datetime.now() bst = dxgb_gpu.train(client, dxgb_gpu_params, gpu_dfs, labels, num_boost_round=dxgb_gpu_params['nround']) t2 = datetime.datetime.now() - print("Training time ...") - print(t2-t1) - print('str(bst) is {0}'.format(str(bst))) + print('\n---->>>> Training time: {0} <<<<----\n'.format(str(t2-t1))) print('Exiting script') if __name__ == '__main__': diff --git a/end-to-end-samples/README.md b/end-to-end-samples/README.md deleted file mode 100644 index e69de29b..00000000 diff --git a/how-to-use-azureml/automated-machine-learning/automl_env.yml b/how-to-use-azureml/automated-machine-learning/automl_env.yml index 20bf96b9..d39415ab 100644 --- a/how-to-use-azureml/automated-machine-learning/automl_env.yml +++ b/how-to-use-azureml/automated-machine-learning/automl_env.yml @@ -22,6 +22,6 @@ dependencies: - azureml-train-automl - azureml-widgets - azureml-explain-model - - azureml-contrib-explain-model + - azureml-contrib-interpret - pandas_ml diff --git a/how-to-use-azureml/automated-machine-learning/automl_env_mac.yml b/how-to-use-azureml/automated-machine-learning/automl_env_mac.yml index 179e46b5..02c528c1 100644 --- a/how-to-use-azureml/automated-machine-learning/automl_env_mac.yml +++ b/how-to-use-azureml/automated-machine-learning/automl_env_mac.yml @@ -23,6 +23,6 @@ dependencies: - azureml-train-automl - azureml-widgets - azureml-explain-model - - azureml-contrib-explain-model + - azureml-contrib-interpret - pandas_ml diff --git a/how-to-use-azureml/automated-machine-learning/classification-bank-marketing/auto-ml-classification-bank-marketing.yml b/how-to-use-azureml/automated-machine-learning/classification-bank-marketing/auto-ml-classification-bank-marketing.yml index 4c8a39ca..8301106f 100644 --- a/how-to-use-azureml/automated-machine-learning/classification-bank-marketing/auto-ml-classification-bank-marketing.yml +++ b/how-to-use-azureml/automated-machine-learning/classification-bank-marketing/auto-ml-classification-bank-marketing.yml @@ 
-2,6 +2,7 @@ name: auto-ml-classification-bank-marketing dependencies: - pip: - azureml-sdk + - interpret - azureml-defaults - azureml-explain-model - azureml-train-automl diff --git a/how-to-use-azureml/automated-machine-learning/classification-credit-card-fraud/auto-ml-classification-credit-card-fraud.yml b/how-to-use-azureml/automated-machine-learning/classification-credit-card-fraud/auto-ml-classification-credit-card-fraud.yml index f4a3601e..6ff39d2c 100644 --- a/how-to-use-azureml/automated-machine-learning/classification-credit-card-fraud/auto-ml-classification-credit-card-fraud.yml +++ b/how-to-use-azureml/automated-machine-learning/classification-credit-card-fraud/auto-ml-classification-credit-card-fraud.yml @@ -2,6 +2,7 @@ name: auto-ml-classification-credit-card-fraud dependencies: - pip: - azureml-sdk + - interpret - azureml-defaults - azureml-explain-model - azureml-train-automl diff --git a/how-to-use-azureml/automated-machine-learning/dataset-remote-execution/auto-ml-dataset-remote-execution.yml b/how-to-use-azureml/automated-machine-learning/dataset-remote-execution/auto-ml-dataset-remote-execution.yml index aa6e4e65..120cf810 100644 --- a/how-to-use-azureml/automated-machine-learning/dataset-remote-execution/auto-ml-dataset-remote-execution.yml +++ b/how-to-use-azureml/automated-machine-learning/dataset-remote-execution/auto-ml-dataset-remote-execution.yml @@ -2,6 +2,7 @@ name: auto-ml-dataset-remote-execution dependencies: - pip: - azureml-sdk + - interpret - azureml-defaults - azureml-explain-model - azureml-train-automl diff --git a/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand/auto-ml-forecasting-energy-demand.ipynb b/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand/auto-ml-forecasting-energy-demand.ipynb index 3d9d497d..eccade14 100644 --- a/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand/auto-ml-forecasting-energy-demand.ipynb +++ b/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand/auto-ml-forecasting-energy-demand.ipynb @@ -619,7 +619,7 @@ "source": [ "engineered_explanations = explainer.explain(['local', 'global'], eval_dataset=automl_explainer_setup_obj.X_test_transform)\n", "print(engineered_explanations.get_feature_importance_dict())\n", - "from azureml.contrib.explain.model.visualize import ExplanationDashboard\n", + "from azureml.contrib.interpret.visualize import ExplanationDashboard\n", "ExplanationDashboard(engineered_explanations, automl_explainer_setup_obj.automl_estimator, automl_explainer_setup_obj.X_test_transform)" ] }, @@ -641,7 +641,7 @@ " raw_feature_names=automl_explainer_setup_obj.raw_feature_names,\n", " eval_dataset=automl_explainer_setup_obj.X_test_transform)\n", "print(raw_explanations.get_feature_importance_dict())\n", - "from azureml.contrib.explain.model.visualize import ExplanationDashboard\n", + "from azureml.contrib.interpret.visualize import ExplanationDashboard\n", "ExplanationDashboard(raw_explanations, automl_explainer_setup_obj.automl_pipeline, automl_explainer_setup_obj.X_test_raw)" ] }, diff --git a/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand/auto-ml-forecasting-energy-demand.yml b/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand/auto-ml-forecasting-energy-demand.yml index 693b5f4d..4a4aeabd 100644 --- a/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand/auto-ml-forecasting-energy-demand.yml +++ 
b/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand/auto-ml-forecasting-energy-demand.yml @@ -2,10 +2,11 @@ name: auto-ml-forecasting-energy-demand dependencies: - pip: - azureml-sdk + - interpret - azureml-train-automl - azureml-widgets - matplotlib - pandas_ml - statsmodels - azureml-explain-model - - azureml-contrib-explain-model + - azureml-contrib-interpret diff --git a/how-to-use-azureml/automated-machine-learning/forecasting-high-frequency/automl-forecasting-function.ipynb b/how-to-use-azureml/automated-machine-learning/forecasting-high-frequency/automl-forecasting-function.ipynb new file mode 100644 index 00000000..882dc050 --- /dev/null +++ b/how-to-use-azureml/automated-machine-learning/forecasting-high-frequency/automl-forecasting-function.ipynb @@ -0,0 +1,615 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Automated Machine Learning\n", + "\n", + "## Forecasting away from training data\n", + "\n", + "This notebook demonstrates the full interface to the `forecast()` function. \n", + "\n", + "The best known and most frequent usage of `forecast` enables forecasting on test sets that immediately follow the training data. \n", + "\n", + "However, in many use cases it is necessary to continue using the model for some time before retraining it. This happens especially in **high frequency forecasting** when forecasts need to be made more frequently than the model can be retrained. Examples include Internet of Things applications and predictive cloud resource scaling.\n", + "\n", + "Here we show how to use the `forecast()` function when a time gap exists between the training data and the prediction period.\n", + "\n", + "Terminology:\n", + "* forecast origin: the last period when the target value is known\n", + "* forecast period(s): the period(s) for which the value of the target is desired.\n", + "* forecast horizon: the number of forecast periods\n", + "* lookback: how many past periods (before the forecast origin) the model function depends on. This is the larger of the number of lags and the length of the rolling window.\n", + "* prediction context: `lookback` periods immediately preceding the forecast origin\n", + "\n", + "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/automated-machine-learning/automl-forecasting-function.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Please make sure you have followed the `configuration.ipynb` notebook so that your ML workspace information is saved in the config file."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import logging\n", + "import warnings\n", + "\n", + "from pandas.tseries.frequencies import to_offset\n", + "\n", + "# Squash warning messages for cleaner output in the notebook\n", + "warnings.showwarning = lambda *args, **kwargs: None\n", + "\n", + "np.set_printoptions(precision=4, suppress=True, linewidth=120)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.core\n", + "from azureml.core.workspace import Workspace\n", + "from azureml.core.experiment import Experiment\n", + "from azureml.train.automl import AutoMLConfig\n", + "\n", + "ws = Workspace.from_config()\n", + "\n", + "# choose a name for the run history container in the workspace\n", + "experiment_name = 'automl-forecast-function-demo'\n", + "\n", + "experiment = Experiment(ws, experiment_name)\n", + "\n", + "output = {}\n", + "output['SDK version'] = azureml.core.VERSION\n", + "output['Subscription ID'] = ws.subscription_id\n", + "output['Workspace'] = ws.name\n", + "output['Resource Group'] = ws.resource_group\n", + "output['Location'] = ws.location\n", + "output['Run History Name'] = experiment_name\n", + "pd.set_option('display.max_colwidth', -1)\n", + "outputDf = pd.DataFrame(data = output, index = [''])\n", + "outputDf.T" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data\n", + "For the demonstration purposes we will generate the data artificially and use them for the forecasting." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "TIME_COLUMN_NAME = 'date'\n", + "GRAIN_COLUMN_NAME = 'grain'\n", + "TARGET_COLUMN_NAME = 'y'\n", + "\n", + "def get_timeseries(train_len: int,\n", + " test_len: int,\n", + " time_column_name: str,\n", + " target_column_name: str,\n", + " grain_column_name: str,\n", + " grains: int = 1,\n", + " freq: str = 'H'):\n", + " \"\"\"\n", + " Return the time series of designed length.\n", + "\n", + " :param train_len: The length of training data (one series).\n", + " :type train_len: int\n", + " :param test_len: The length of testing data (one series).\n", + " :type test_len: int\n", + " :param time_column_name: The desired name of a time column.\n", + " :type time_column_name: str\n", + " :param\n", + " :param grains: The number of grains.\n", + " :type grains: int\n", + " :param freq: The frequency string representing pandas offset.\n", + " see https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html\n", + " :type freq: str\n", + " :returns: the tuple of train and test data sets.\n", + " :rtype: tuple\n", + "\n", + " \"\"\"\n", + " data_train = [] # type: List[pd.DataFrame]\n", + " data_test = [] # type: List[pd.DataFrame]\n", + " data_length = train_len + test_len\n", + " for i in range(grains):\n", + " X = pd.DataFrame({\n", + " time_column_name: pd.date_range(start='2000-01-01',\n", + " periods=data_length,\n", + " freq=freq),\n", + " target_column_name: np.arange(data_length).astype(float) + np.random.rand(data_length) + i*5,\n", + " 'ext_predictor': np.asarray(range(42, 42 + data_length)),\n", + " grain_column_name: np.repeat('g{}'.format(i), data_length)\n", + " })\n", + " data_train.append(X[:train_len])\n", + " data_test.append(X[train_len:])\n", + " X_train = pd.concat(data_train)\n", + " y_train = 
X_train.pop(target_column_name).values\n", + " X_test = pd.concat(data_test)\n", + " y_test = X_test.pop(target_column_name).values\n", + " return X_train, y_train, X_test, y_test\n", + "\n", + "n_test_periods = 6\n", + "n_train_periods = 30\n", + "X_train, y_train, X_test, y_test = get_timeseries(train_len=n_train_periods,\n", + " test_len=n_test_periods,\n", + " time_column_name=TIME_COLUMN_NAME,\n", + " target_column_name=TARGET_COLUMN_NAME,\n", + " grain_column_name=GRAIN_COLUMN_NAME,\n", + " grains=2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's see what the training data looks like." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# plot the example time series\n", + "import matplotlib.pyplot as plt\n", + "whole_data = X_train.copy()\n", + "whole_data['y'] = y_train\n", + "for g in whole_data.groupby('grain'): \n", + " plt.plot(g[1]['date'].values, g[1]['y'].values, label=g[0])\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create the configuration and train a forecaster\n", + "First generate the configuration, in which we:\n", + "* Set metadata columns: target, time column and grain column names.\n", + "* Ask for 10 iterations through models, last of which will represent the Ensemble of previous ones.\n", + "* Validate our data using cross validation with rolling window method.\n", + "* Set normalized root mean squared error as a metric to select the best model.\n", + "\n", + "* Finally, we set the task to be forecasting.\n", + "* By default, we apply the lag lead operator and rolling window to the target value i.e. we use the previous values as a predictor for the future ones." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lags = [1,2,3]\n", + "rolling_window_length = 0 # don't do rolling windows\n", + "max_horizon = n_test_periods\n", + "time_series_settings = { \n", + " 'time_column_name': TIME_COLUMN_NAME,\n", + " 'grain_column_names': [ GRAIN_COLUMN_NAME ],\n", + " 'max_horizon': max_horizon,\n", + " 'target_lags': lags\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run the model selection and training process." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.workspace import Workspace\n", + "from azureml.core.experiment import Experiment\n", + "from azureml.train.automl import AutoMLConfig\n", + "\n", + "\n", + "automl_config = AutoMLConfig(task='forecasting',\n", + " debug_log='automl_forecasting_function.log',\n", + " primary_metric='normalized_root_mean_squared_error', \n", + " iterations=10, \n", + " X=X_train,\n", + " y=y_train,\n", + " n_cross_validations=3,\n", + " verbosity = logging.INFO,\n", + " **time_series_settings)\n", + "\n", + "local_run = experiment.submit(automl_config, show_output=True)\n", + "\n", + "# Retrieve the best model to use it further.\n", + "_, fitted_model = local_run.get_output()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Forecasting from the trained model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this section we will review the `forecast` interface for two main scenarios: forecasting right after the training data, and the more complex interface for forecasting when there is a gap (in the time sense) between the training and testing data." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### X_train is directly followed by the X_test\n", + "\n", + "Let's first consider the case when the prediction period immediately follows the training data. This is typical in scenarios where we have the time to retrain the model every time we wish to forecast. Forecasts that are made on a daily or slower cadence typically fall into this category. Retraining the model every time benefits accuracy because the most recent data is often the most informative.\n", + "\n", + "![Forecasting after training](forecast_function_at_train.png)\n", + "\n", + "The `X_test` and `y_query` below, taken together, form the **forecast request**. The two are interpreted as aligned - `y_query` could actually be a column in `X_test`. `NaN`s in `y_query` are the question marks. These will be filled with the forecasts.\n", + "\n", + "When the forecast period immediately follows the training period, the models retain the last few points of data. You can simply fill `y_query` with question marks - the model already has the data for the lookback.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Typical path: X_test is known, forecast all upcoming periods" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The data set contains hourly data, the training set ends at 01/02/2000 at 05:00\n", + "\n", + "# These are predictions we are asking the model to make (does not contain the target column y),\n", + "# for 6 periods beginning with 2000-01-02 06:00, which immediately follows the training data\n", + "X_test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "y_query = np.repeat(np.NaN, X_test.shape[0])\n", + "y_pred_no_gap, xy_nogap = fitted_model.forecast(X_test, y_query)\n", + "\n", + "# xy_nogap contains the predictions in the _automl_target_col column.\n", + "# Those same numbers are output in y_pred_no_gap\n", + "xy_nogap" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Distribution forecasts\n", + "\n", + "Often the figure of interest is not just the point prediction, but the prediction at some quantile of the distribution. 
\n", + "This arises when the forecast is used to control some kind of inventory, for example of grocery items of virtual machines for a cloud service. In such case, the control point is usually something like \"we want the item to be in stock and not run out 99% of the time\". This is called a \"service level\". Here is how you get quantile forecasts." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# specify which quantiles you would like \n", + "fitted_model.quantiles = [0.01, 0.5, 0.95]\n", + "# use forecast_quantiles function, not the forecast() one\n", + "y_pred_quantiles = fitted_model.forecast_quantiles(X_test, y_query)\n", + "\n", + "# it all nicely aligns column-wise\n", + "pd.concat([X_test.reset_index(), pd.DataFrame({'query' : y_query}), y_pred_quantiles], axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Destination-date forecast: \"just do something\"\n", + "\n", + "In some scenarios, the X_test is not known. The forecast is likely to be weak, becaus eit is missing contemporaneous predictors, which we will need to impute. If you still wish to predict forward under the assumption that the last known values will be carried forward, you can forecast out to \"destination date\". The destination date still needs to fit within the maximum horizon from training." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# We will take the destination date as a last date in the test set.\n", + "dest = max(X_test[TIME_COLUMN_NAME])\n", + "y_pred_dest, xy_dest = fitted_model.forecast(forecast_destination=dest)\n", + "\n", + "# This form also shows how we imputed the predictors which were not given. (Not so well! Use with caution!)\n", + "xy_dest" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Forecasting away from training data\n", + "\n", + "Suppose we trained a model, some time passed, and now we want to apply the model without re-training. If the model \"looks back\" -- uses previous values of the target -- then we somehow need to provide those values to the model.\n", + "\n", + "![Forecasting after training](forecast_function_away_from_train.png)\n", + "\n", + "The notion of forecast origin comes into play: the forecast origin is **the last period for which we have seen the target value**. This applies per grain, so each grain can have a different forecast origin. \n", + "\n", + "The part of data before the forecast origin is the **prediction context**. To provide the context values the model needs when it looks back, we pass definite values in `y_test` (aligned with corresponding times in `X_test`)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# generate the same kind of test data we trained on, \n", + "# but now make the train set much longer, so that the test set will be in the future\n", + "X_context, y_context, X_away, y_away = get_timeseries(train_len=42, # train data was 30 steps long\n", + " test_len=4,\n", + " time_column_name=TIME_COLUMN_NAME,\n", + " target_column_name=TARGET_COLUMN_NAME,\n", + " grain_column_name=GRAIN_COLUMN_NAME,\n", + " grains=2)\n", + "\n", + "# end of the data we trained on\n", + "print(X_train.groupby(GRAIN_COLUMN_NAME)[TIME_COLUMN_NAME].max())\n", + "# start of the data we want to predict on\n", + "print(X_away.groupby(GRAIN_COLUMN_NAME)[TIME_COLUMN_NAME].min())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There is a gap of 12 hours between the end of training and the beginning of `X_away`. (It looks like 13 because all timestamps point to the start of the one-hour periods.) Using only `X_away` will fail without adding context data for the model to consume." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try: \n", + " y_query = y_away.copy()\n", + " y_query.fill(np.NaN)\n", + " y_pred_away, xy_away = fitted_model.forecast(X_away, y_query)\n", + " xy_away\n", + "except Exception as e:\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "How should we read that error message? The forecast origin is at the last time the model saw an actual value of `y` (the target). That was at the end of the training data! Because the model received all `NaN` (and not an actual target value), it is attempting to forecast from the end of training data. But the requested forecast periods are past the maximum horizon. We need to provide a definite `y` value to establish the forecast origin.\n", + "\n", + "We will use this helper function to take the required amount of context from the data preceding the testing data. Its definition is intentionally simplified to keep the idea clear." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def make_forecasting_query(fulldata, time_column_name, target_column_name, forecast_origin, horizon, lookback):\n", + "\n", + " \"\"\"\n", + " This function will take the full dataset, and create the query\n", + " to predict all values of the grain from the `forecast_origin`\n", + " forward for the next `horizon` horizons. Context from previous\n", + " `lookback` periods will be included.\n", + "\n", + " \n", + "\n", + " fulldata: pandas.DataFrame a time series dataset. 
Needs to contain X and y.\n", + " time_column_name: string which column (must be in fulldata) is the time axis\n", + " target_column_name: string which column (must be in fulldata) is to be forecast\n", + " forecast_origin: datetime type the last time we (pretend to) have target values \n", + " horizon: timedelta how far forward, in time units (not periods)\n", + " lookback: timedelta how far back does the model look?\n", + "\n", + " Example:\n", + "\n", + "\n", + " ```\n", + "\n", + " forecast_origin = pd.to_datetime('2012-09-01') + pd.DateOffset(days=5) # forecast 5 days after end of training\n", + " print(forecast_origin)\n", + "\n", + " X_query, y_query = make_forecasting_query(data, \n", + " forecast_origin = forecast_origin,\n", + " horizon = pd.DateOffset(days=7), # 7 days into the future\n", + " lookback = pd.DateOffset(days=1), # model has lag 1 period (day)\n", + " )\n", + "\n", + " ```\n", + " \"\"\"\n", + "\n", + " X_past = fulldata[ (fulldata[ time_column_name ] > forecast_origin - lookback) &\n", + " (fulldata[ time_column_name ] <= forecast_origin)\n", + " ]\n", + "\n", + " X_future = fulldata[ (fulldata[ time_column_name ] > forecast_origin) &\n", + " (fulldata[ time_column_name ] <= forecast_origin + horizon)\n", + " ]\n", + "\n", + " y_past = X_past.pop(target_column_name).values.astype(np.float)\n", + " y_future = X_future.pop(target_column_name).values.astype(np.float)\n", + "\n", + " # Now take y_future and turn it into question marks\n", + " y_query = y_future.copy().astype(np.float) # because sometimes life hands you an int\n", + " y_query.fill(np.NaN)\n", + "\n", + "\n", + " print(\"X_past is \" + str(X_past.shape) + \" - shaped\")\n", + " print(\"X_future is \" + str(X_future.shape) + \" - shaped\")\n", + " print(\"y_past is \" + str(y_past.shape) + \" - shaped\")\n", + " print(\"y_query is \" + str(y_query.shape) + \" - shaped\")\n", + "\n", + "\n", + " X_pred = pd.concat([X_past, X_future])\n", + " y_pred = np.concatenate([y_past, y_query])\n", + " return X_pred, y_pred" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's see where the context data ends - it ends, by construction, just before the testing data starts." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(X_context.groupby(GRAIN_COLUMN_NAME)[TIME_COLUMN_NAME].agg(['min','max','count']))\n", + "print( X_away.groupby(GRAIN_COLUMN_NAME)[TIME_COLUMN_NAME].agg(['min','max','count']))\n", + "X_context.tail(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Since the length of the lookback is 3, \n", + "# we need to add 3 periods from the context to the request\n", + "# so that the model has the data it needs\n", + "\n", + "# Put the X and y back together for a while. 
\n", + "# They like each other and it makes them happy.\n", + "X_context[TARGET_COLUMN_NAME] = y_context\n", + "X_away[TARGET_COLUMN_NAME] = y_away\n", + "fulldata = pd.concat([X_context, X_away])\n", + "\n", + "# forecast origin is the last point of data, which is one 1-hr period before test\n", + "forecast_origin = X_away[TIME_COLUMN_NAME].min() - pd.DateOffset(hours=1)\n", + "# it is indeed the last point of the context\n", + "assert forecast_origin == X_context[TIME_COLUMN_NAME].max()\n", + "print(\"Forecast origin: \" + str(forecast_origin))\n", + " \n", + "# the model uses lags and rolling windows to look back in time\n", + "n_lookback_periods = max(max(lags), rolling_window_length)\n", + "lookback = pd.DateOffset(hours=n_lookback_periods)\n", + "\n", + "horizon = pd.DateOffset(hours=max_horizon)\n", + "\n", + "# now make the forecast query from context (refer to figure)\n", + "X_pred, y_pred = make_forecasting_query(fulldata, TIME_COLUMN_NAME, TARGET_COLUMN_NAME,\n", + " forecast_origin, horizon, lookback)\n", + "\n", + "# show the forecast request aligned\n", + "X_show = X_pred.copy()\n", + "X_show[TARGET_COLUMN_NAME] = y_pred\n", + "X_show" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that the forecast origin is at 17:00 for both grains, and periods from 18:00 are to be forecast." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Now everything works\n", + "y_pred_away, xy_away = fitted_model.forecast(X_pred, y_pred)\n", + "\n", + "# show the forecast aligned\n", + "X_show = xy_away.reset_index()\n", + "# without the generated features\n", + "X_show[['date', 'grain', 'ext_predictor', '_automl_target_col']]\n", + "# prediction is in _automl_target_col" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "erwright, nirovins" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/automated-machine-learning/forecasting-high-frequency/automl-forecasting-function.yml b/how-to-use-azureml/automated-machine-learning/forecasting-high-frequency/automl-forecasting-function.yml new file mode 100644 index 00000000..cac7198c --- /dev/null +++ b/how-to-use-azureml/automated-machine-learning/forecasting-high-frequency/automl-forecasting-function.yml @@ -0,0 +1,9 @@ +name: automl-forecasting-function +dependencies: +- pip: + - azureml-sdk + - azureml-train-automl + - azureml-widgets + - pandas_ml + - statsmodels + - matplotlib diff --git a/how-to-use-azureml/automated-machine-learning/forecasting-orange-juice-sales/auto-ml-forecasting-orange-juice-sales.ipynb b/how-to-use-azureml/automated-machine-learning/forecasting-orange-juice-sales/auto-ml-forecasting-orange-juice-sales.ipynb index 6f5ff246..8c89c86e 100644 --- a/how-to-use-azureml/automated-machine-learning/forecasting-orange-juice-sales/auto-ml-forecasting-orange-juice-sales.ipynb +++ b/how-to-use-azureml/automated-machine-learning/forecasting-orange-juice-sales/auto-ml-forecasting-orange-juice-sales.ipynb @@ -805,7 +805,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + 
"version": "3.6.7" } }, "nbformat": 4, diff --git a/how-to-use-azureml/automated-machine-learning/model-explanation-remote-amlcompute/auto-ml-model-explanations-remote-compute.ipynb b/how-to-use-azureml/automated-machine-learning/model-explanation-remote-amlcompute/auto-ml-model-explanations-remote-compute.ipynb index 129cdf89..9f6bf2fd 100644 --- a/how-to-use-azureml/automated-machine-learning/model-explanation-remote-amlcompute/auto-ml-model-explanations-remote-compute.ipynb +++ b/how-to-use-azureml/automated-machine-learning/model-explanation-remote-amlcompute/auto-ml-model-explanations-remote-compute.ipynb @@ -537,7 +537,7 @@ "outputs": [], "source": [ "from azureml.explain.model._internal.explanation_client import ExplanationClient\n", - "from azureml.contrib.explain.model.visualize import ExplanationDashboard\n", + "from azureml.contrib.interpret.visualize import ExplanationDashboard\n", "client = ExplanationClient.from_run(automl_run)\n", "engineered_explanations = client.download_model_explanation(raw=False)\n", "print(engineered_explanations.get_feature_importance_dict())\n", diff --git a/how-to-use-azureml/automated-machine-learning/model-explanation-remote-amlcompute/auto-ml-model-explanations-remote-compute.yml b/how-to-use-azureml/automated-machine-learning/model-explanation-remote-amlcompute/auto-ml-model-explanations-remote-compute.yml index d5733851..7838e77b 100644 --- a/how-to-use-azureml/automated-machine-learning/model-explanation-remote-amlcompute/auto-ml-model-explanations-remote-compute.yml +++ b/how-to-use-azureml/automated-machine-learning/model-explanation-remote-amlcompute/auto-ml-model-explanations-remote-compute.yml @@ -2,9 +2,10 @@ name: auto-ml-model-explanations-remote-compute dependencies: - pip: - azureml-sdk + - interpret - azureml-train-automl - azureml-widgets - matplotlib - pandas_ml - azureml-explain-model - - azureml-contrib-explain-model + - azureml-contrib-interpret diff --git a/how-to-use-azureml/automated-machine-learning/model-explanation/auto-ml-model-explanation.ipynb b/how-to-use-azureml/automated-machine-learning/model-explanation/auto-ml-model-explanation.ipynb index 1e9c85d8..bf520462 100644 --- a/how-to-use-azureml/automated-machine-learning/model-explanation/auto-ml-model-explanation.ipynb +++ b/how-to-use-azureml/automated-machine-learning/model-explanation/auto-ml-model-explanation.ipynb @@ -395,7 +395,7 @@ "source": [ "engineered_explanations = explainer.explain(['local', 'global'], eval_dataset=automl_explainer_setup_obj.X_test_transform)\n", "print(engineered_explanations.get_feature_importance_dict())\n", - "from azureml.contrib.explain.model.visualize import ExplanationDashboard\n", + "from azureml.contrib.interpret.visualize import ExplanationDashboard\n", "ExplanationDashboard(engineered_explanations, automl_explainer_setup_obj.automl_estimator, automl_explainer_setup_obj.X_test_transform)" ] }, @@ -417,7 +417,7 @@ " raw_feature_names=automl_explainer_setup_obj.raw_feature_names,\n", " eval_dataset=automl_explainer_setup_obj.X_test_transform)\n", "print(raw_explanations.get_feature_importance_dict())\n", - "from azureml.contrib.explain.model.visualize import ExplanationDashboard\n", + "from azureml.contrib.interpret.visualize import ExplanationDashboard\n", "ExplanationDashboard(raw_explanations, automl_explainer_setup_obj.automl_pipeline, automl_explainer_setup_obj.X_test_raw)" ] }, @@ -443,7 +443,7 @@ "from azureml.explain.model.scoring.scoring_explainer import TreeScoringExplainer, save\n", "\n", "# Initialize the 
ScoringExplainer\n", - "scoring_explainer = TreeScoringExplainer(explainer._internal_explainer, feature_maps=[automl_explainer_setup_obj.feature_map])\n", + "scoring_explainer = TreeScoringExplainer(explainer.explainer, feature_maps=[automl_explainer_setup_obj.feature_map])\n", "\n", "# Pickle scoring explainer locally\n", "save(scoring_explainer, exist_ok=True)\n", diff --git a/how-to-use-azureml/automated-machine-learning/model-explanation/auto-ml-model-explanation.yml b/how-to-use-azureml/automated-machine-learning/model-explanation/auto-ml-model-explanation.yml index 2d0c7623..dc30fa9f 100644 --- a/how-to-use-azureml/automated-machine-learning/model-explanation/auto-ml-model-explanation.yml +++ b/how-to-use-azureml/automated-machine-learning/model-explanation/auto-ml-model-explanation.yml @@ -2,9 +2,10 @@ name: auto-ml-model-explanation dependencies: - pip: - azureml-sdk + - interpret - azureml-train-automl - azureml-widgets - matplotlib - pandas_ml - azureml-explain-model - - azureml-contrib-explain-model + - azureml-contrib-interpret diff --git a/how-to-use-azureml/automated-machine-learning/regression-concrete-strength/auto-ml-regression-concrete-strength.yml b/how-to-use-azureml/automated-machine-learning/regression-concrete-strength/auto-ml-regression-concrete-strength.yml index 30e42672..4d364e6c 100644 --- a/how-to-use-azureml/automated-machine-learning/regression-concrete-strength/auto-ml-regression-concrete-strength.yml +++ b/how-to-use-azureml/automated-machine-learning/regression-concrete-strength/auto-ml-regression-concrete-strength.yml @@ -2,6 +2,7 @@ name: auto-ml-regression-concrete-strength dependencies: - pip: - azureml-sdk + - interpret - azureml-defaults - azureml-explain-model - azureml-train-automl diff --git a/how-to-use-azureml/automated-machine-learning/regression-hardware-performance/auto-ml-regression-hardware-performance.yml b/how-to-use-azureml/automated-machine-learning/regression-hardware-performance/auto-ml-regression-hardware-performance.yml index 7d1b2aec..7b1435ac 100644 --- a/how-to-use-azureml/automated-machine-learning/regression-hardware-performance/auto-ml-regression-hardware-performance.yml +++ b/how-to-use-azureml/automated-machine-learning/regression-hardware-performance/auto-ml-regression-hardware-performance.yml @@ -2,6 +2,7 @@ name: auto-ml-regression-hardware-performance dependencies: - pip: - azureml-sdk + - interpret - azureml-defaults - azureml-explain-model - azureml-train-automl diff --git a/how-to-use-azureml/automated-machine-learning/remote-amlcompute-with-onnx/auto-ml-remote-amlcompute-with-onnx.yml b/how-to-use-azureml/automated-machine-learning/remote-amlcompute-with-onnx/auto-ml-remote-amlcompute-with-onnx.yml index 22bad59a..7ad75c58 100644 --- a/how-to-use-azureml/automated-machine-learning/remote-amlcompute-with-onnx/auto-ml-remote-amlcompute-with-onnx.yml +++ b/how-to-use-azureml/automated-machine-learning/remote-amlcompute-with-onnx/auto-ml-remote-amlcompute-with-onnx.yml @@ -2,6 +2,7 @@ name: auto-ml-remote-amlcompute-with-onnx dependencies: - pip: - azureml-sdk + - interpret - azureml-defaults - azureml-explain-model - azureml-train-automl diff --git a/how-to-use-azureml/automated-machine-learning/remote-amlcompute/auto-ml-remote-amlcompute.yml b/how-to-use-azureml/automated-machine-learning/remote-amlcompute/auto-ml-remote-amlcompute.yml index 6ec4511a..11f46ce3 100644 --- a/how-to-use-azureml/automated-machine-learning/remote-amlcompute/auto-ml-remote-amlcompute.yml +++ 
b/how-to-use-azureml/automated-machine-learning/remote-amlcompute/auto-ml-remote-amlcompute.yml @@ -2,6 +2,7 @@ name: auto-ml-remote-amlcompute dependencies: - pip: - azureml-sdk + - interpret - azureml-defaults - azureml-explain-model - azureml-train-automl diff --git a/how-to-use-azureml/explain-model/azure-integration/remote-explanation/explain-model-on-amlcompute.ipynb b/how-to-use-azureml/explain-model/azure-integration/remote-explanation/explain-model-on-amlcompute.ipynb index fbec37b1..a139d9d7 100644 --- a/how-to-use-azureml/explain-model/azure-integration/remote-explanation/explain-model-on-amlcompute.ipynb +++ b/how-to-use-azureml/explain-model/azure-integration/remote-explanation/explain-model-on-amlcompute.ipynb @@ -239,8 +239,8 @@ "run_config.environment.python.user_managed_dependencies = False\n", "\n", "azureml_pip_packages = [\n", - " 'azureml-defaults', 'azureml-contrib-explain-model', 'azureml-core', 'azureml-telemetry',\n", - " 'azureml-explain-model', 'sklearn-pandas', 'azureml-dataprep'\n", + " 'azureml-defaults', 'azureml-contrib-interpret', 'azureml-core', 'azureml-telemetry',\n", + " 'azureml-interpret', 'sklearn-pandas', 'azureml-dataprep'\n", "]\n", "\n", "# specify CondaDependencies obj\n", @@ -340,8 +340,8 @@ "run_config.environment.docker.enabled = True\n", "\n", "azureml_pip_packages = [\n", - " 'azureml-defaults', 'azureml-contrib-explain-model', 'azureml-core', 'azureml-telemetry',\n", - " 'azureml-explain-model', 'azureml-dataprep'\n", + " 'azureml-defaults', 'azureml-contrib-interpret', 'azureml-core', 'azureml-telemetry',\n", + " 'azureml-interpret', 'azureml-dataprep'\n", "]\n", "\n", "# specify CondaDependencies obj\n", @@ -451,8 +451,8 @@ "run_config.environment.docker.enabled = True\n", "\n", "azureml_pip_packages = [\n", - " 'azureml-defaults', 'azureml-contrib-explain-model', 'azureml-core', 'azureml-telemetry',\n", - " 'azureml-explain-model', 'azureml-dataprep'\n", + " 'azureml-defaults', 'azureml-contrib-interpret', 'azureml-core', 'azureml-telemetry',\n", + " 'azureml-interpret', 'azureml-dataprep'\n", "]\n", "\n", "\n", @@ -497,7 +497,7 @@ "metadata": {}, "outputs": [], "source": [ - "from azureml.contrib.explain.model.explanation.explanation_client import ExplanationClient\n", + "from azureml.contrib.interpret.explanation.explanation_client import ExplanationClient\n", "\n", "client = ExplanationClient.from_run(run)\n", "# Get the top k (e.g., 4) most important features with their importance values\n", @@ -562,7 +562,7 @@ "metadata": {}, "outputs": [], "source": [ - "from azureml.contrib.explain.model.explanation.explanation_client import ExplanationClient\n", + "from azureml.contrib.interpret.explanation.explanation_client import ExplanationClient\n", "\n", "# Get model explanation data\n", "client = ExplanationClient.from_run(run)\n", @@ -669,7 +669,7 @@ "metadata": {}, "outputs": [], "source": [ - "from azureml.contrib.explain.model.visualize import ExplanationDashboard" + "from azureml.contrib.interpret.visualize import ExplanationDashboard" ] }, { diff --git a/how-to-use-azureml/explain-model/azure-integration/remote-explanation/explain-model-on-amlcompute.yml b/how-to-use-azureml/explain-model/azure-integration/remote-explanation/explain-model-on-amlcompute.yml index 53d58768..bf88ad6c 100644 --- a/how-to-use-azureml/explain-model/azure-integration/remote-explanation/explain-model-on-amlcompute.yml +++ b/how-to-use-azureml/explain-model/azure-integration/remote-explanation/explain-model-on-amlcompute.yml @@ -2,7 +2,8 @@ name: 
explain-model-on-amlcompute dependencies: - pip: - azureml-sdk - - azureml-explain-model - - azureml-contrib-explain-model + - interpret + - azureml-interpret + - azureml-contrib-interpret - sklearn-pandas - azureml-dataprep diff --git a/how-to-use-azureml/explain-model/azure-integration/remote-explanation/train_explain.py b/how-to-use-azureml/explain-model/azure-integration/remote-explanation/train_explain.py index c38839cc..fa7b0451 100644 --- a/how-to-use-azureml/explain-model/azure-integration/remote-explanation/train_explain.py +++ b/how-to-use-azureml/explain-model/azure-integration/remote-explanation/train_explain.py @@ -3,8 +3,8 @@ from sklearn import datasets from sklearn.linear_model import Ridge -from azureml.explain.model.tabular_explainer import TabularExplainer -from azureml.contrib.explain.model.explanation.explanation_client import ExplanationClient +from interpret.ext.blackbox import TabularExplainer +from azureml.contrib.interpret.explanation.explanation_client import ExplanationClient from sklearn.model_selection import train_test_split from azureml.core.run import Run from sklearn.externals import joblib diff --git a/how-to-use-azureml/explain-model/azure-integration/run-history/save-retrieve-explanations-run-history.ipynb b/how-to-use-azureml/explain-model/azure-integration/run-history/save-retrieve-explanations-run-history.ipynb index d8fbce8a..8a20442f 100644 --- a/how-to-use-azureml/explain-model/azure-integration/run-history/save-retrieve-explanations-run-history.ipynb +++ b/how-to-use-azureml/explain-model/azure-integration/run-history/save-retrieve-explanations-run-history.ipynb @@ -91,22 +91,22 @@ "\n", "# Explainers:\n", "# 1. SHAP Tabular Explainer\n", - "from azureml.explain.model.tabular_explainer import TabularExplainer\n", + "from interpret.ext.blackbox import TabularExplainer\n", "\n", "# OR\n", "\n", "# 2. Mimic Explainer\n", - "from azureml.explain.model.mimic.mimic_explainer import MimicExplainer\n", + "from interpret.ext.blackbox import MimicExplainer\n", "# You can use one of the following four interpretable models as a global surrogate to the black box model\n", - "from azureml.explain.model.mimic.models.lightgbm_model import LGBMExplainableModel\n", - "from azureml.explain.model.mimic.models.linear_model import LinearExplainableModel\n", - "from azureml.explain.model.mimic.models.linear_model import SGDExplainableModel\n", - "from azureml.explain.model.mimic.models.tree_model import DecisionTreeExplainableModel\n", + "from interpret.ext.glassbox import LGBMExplainableModel\n", + "from interpret.ext.glassbox import LinearExplainableModel\n", + "from interpret.ext.glassbox import SGDExplainableModel\n", + "from interpret.ext.glassbox import DecisionTreeExplainableModel\n", "\n", "# OR\n", "\n", "# 3. 
PFI Explainer\n", - "from azureml.explain.model.permutation.permutation_importance import PFIExplainer " + "from interpret.ext.blackbox import PFIExplainer " ] }, { @@ -451,8 +451,8 @@ "source": [ "import azureml.core\n", "from azureml.core import Workspace, Experiment, Run\n", - "from azureml.explain.model.tabular_explainer import TabularExplainer\n", - "from azureml.contrib.explain.model.explanation.explanation_client import ExplanationClient\n", + "from interpret.ext.blackbox import TabularExplainer\n", + "from azureml.contrib.interpret.explanation.explanation_client import ExplanationClient\n", "# Check core SDK version number\n", "print(\"SDK version:\", azureml.core.VERSION)" ] @@ -564,7 +564,7 @@ "metadata": {}, "outputs": [], "source": [ - "from azureml.contrib.explain.model.visualize import ExplanationDashboard" + "from azureml.contrib.interpret.visualize import ExplanationDashboard" ] }, { diff --git a/how-to-use-azureml/explain-model/azure-integration/run-history/save-retrieve-explanations-run-history.yml b/how-to-use-azureml/explain-model/azure-integration/run-history/save-retrieve-explanations-run-history.yml index bc244c09..9da4043a 100644 --- a/how-to-use-azureml/explain-model/azure-integration/run-history/save-retrieve-explanations-run-history.yml +++ b/how-to-use-azureml/explain-model/azure-integration/run-history/save-retrieve-explanations-run-history.yml @@ -2,5 +2,6 @@ name: save-retrieve-explanations-run-history dependencies: - pip: - azureml-sdk - - azureml-explain-model - - azureml-contrib-explain-model + - interpret + - azureml-interpret + - azureml-contrib-interpret diff --git a/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-locally-and-deploy.ipynb b/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-locally-and-deploy.ipynb index 403536e7..53b55897 100644 --- a/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-locally-and-deploy.ipynb +++ b/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-locally-and-deploy.ipynb @@ -173,7 +173,7 @@ "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn_pandas import DataFrameMapper\n", "\n", - "from azureml.explain.model.tabular_explainer import TabularExplainer\n", + "from interpret.ext.blackbox import TabularExplainer\n", "\n", "os.makedirs('./outputs', exist_ok=True)\n", "\n", @@ -260,7 +260,7 @@ "metadata": {}, "outputs": [], "source": [ - "from azureml.explain.model.scoring.scoring_explainer import TreeScoringExplainer, save\n", + "from azureml.interpret.scoring.scoring_explainer import TreeScoringExplainer, save\n", "# ScoringExplainer\n", "scoring_explainer = TreeScoringExplainer(tabular_explainer)\n", "# Pickle scoring explainer locally\n", @@ -290,7 +290,7 @@ "metadata": {}, "outputs": [], "source": [ - "from azureml.contrib.explain.model.visualize import ExplanationDashboard" + "from azureml.contrib.interpret.visualize import ExplanationDashboard" ] }, { @@ -321,8 +321,8 @@ "\n", "# WARNING: to install this, g++ needs to be available on the Docker image and is not by default (look at the next cell)\n", "azureml_pip_packages = [\n", - " 'azureml-defaults', 'azureml-contrib-explain-model', 'azureml-core', 'azureml-telemetry',\n", - " 'azureml-explain-model'\n", + " 'azureml-defaults', 'azureml-contrib-interpret', 'azureml-core', 'azureml-telemetry',\n", + " 'azureml-interpret'\n", "]\n", " \n", "\n", diff --git 
a/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-locally-and-deploy.yml b/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-locally-and-deploy.yml index 8338f5fe..067d7168 100644 --- a/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-locally-and-deploy.yml +++ b/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-locally-and-deploy.yml @@ -2,6 +2,7 @@ name: train-explain-model-locally-and-deploy dependencies: - pip: - azureml-sdk - - azureml-explain-model - - azureml-contrib-explain-model + - interpret + - azureml-interpret + - azureml-contrib-interpret - sklearn-pandas diff --git a/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-on-amlcompute-and-deploy.ipynb b/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-on-amlcompute-and-deploy.ipynb index 7138574f..e4e4fc39 100644 --- a/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-on-amlcompute-and-deploy.ipynb +++ b/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-on-amlcompute-and-deploy.ipynb @@ -240,8 +240,8 @@ "run_config.auto_prepare_environment = True\n", "\n", "azureml_pip_packages = [\n", - " 'azureml-defaults', 'azureml-contrib-explain-model', 'azureml-core', 'azureml-telemetry',\n", - " 'azureml-explain-model', 'azureml-dataprep'\n", + " 'azureml-defaults', 'azureml-contrib-interpret', 'azureml-core', 'azureml-telemetry',\n", + " 'azureml-interpret', 'azureml-dataprep'\n", "]\n", " \n", "\n", @@ -321,7 +321,7 @@ "outputs": [], "source": [ "# retrieve global explanation for visualization\n", - "from azureml.contrib.explain.model.explanation.explanation_client import ExplanationClient\n", + "from azureml.contrib.interpret.explanation.explanation_client import ExplanationClient\n", "\n", "# get model explanation data\n", "client = ExplanationClient.from_run(run)\n", @@ -355,7 +355,7 @@ "metadata": {}, "outputs": [], "source": [ - "from azureml.contrib.explain.model.visualize import ExplanationDashboard" + "from azureml.contrib.interpret.visualize import ExplanationDashboard" ] }, { @@ -392,8 +392,8 @@ "\n", "# WARNING: to install this, g++ needs to be available on the Docker image and is not by default (look at the next cell)\n", "azureml_pip_packages = [\n", - " 'azureml-defaults', 'azureml-contrib-explain-model', 'azureml-core', 'azureml-telemetry',\n", - " 'azureml-explain-model'\n", + " 'azureml-defaults', 'azureml-contrib-interpret', 'azureml-core', 'azureml-telemetry',\n", + " 'azureml-interpret'\n", "]\n", " \n", "\n", diff --git a/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-on-amlcompute-and-deploy.yml b/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-on-amlcompute-and-deploy.yml index 5657cbe3..0915508d 100644 --- a/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-on-amlcompute-and-deploy.yml +++ b/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-on-amlcompute-and-deploy.yml @@ -2,7 +2,9 @@ name: train-explain-model-on-amlcompute-and-deploy dependencies: - pip: - azureml-sdk - - azureml-explain-model - - azureml-contrib-explain-model + - interpret + - azureml-interpret + - azureml-contrib-interpret - sklearn-pandas - azureml-dataprep + - azureml-core diff --git 
a/how-to-use-azureml/explain-model/azure-integration/scoring-time/train_explain.py b/how-to-use-azureml/explain-model/azure-integration/scoring-time/train_explain.py index b8fb1bd8..6c62d26e 100644 --- a/how-to-use-azureml/explain-model/azure-integration/scoring-time/train_explain.py +++ b/how-to-use-azureml/explain-model/azure-integration/scoring-time/train_explain.py @@ -14,9 +14,9 @@ from sklearn.linear_model import LogisticRegression from sklearn_pandas import DataFrameMapper from azureml.core.run import Run -from azureml.explain.model.tabular_explainer import TabularExplainer -from azureml.contrib.explain.model.explanation.explanation_client import ExplanationClient -from azureml.explain.model.scoring.scoring_explainer import LinearScoringExplainer, save +from interpret.ext.blackbox import TabularExplainer +from azureml.contrib.interpret.explanation.explanation_client import ExplanationClient +from azureml.interpret.scoring.scoring_explainer import LinearScoringExplainer, save OUTPUT_DIR = './outputs/' os.makedirs(OUTPUT_DIR, exist_ok=True) diff --git a/how-to-use-azureml/explain-model/tabular-data/advanced-feature-transformations-explain-local.ipynb b/how-to-use-azureml/explain-model/tabular-data/advanced-feature-transformations-explain-local.ipynb index 32364bdb..8cafd6e1 100644 --- a/how-to-use-azureml/explain-model/tabular-data/advanced-feature-transformations-explain-local.ipynb +++ b/how-to-use-azureml/explain-model/tabular-data/advanced-feature-transformations-explain-local.ipynb @@ -50,7 +50,7 @@ "\n", "We will showcase raw feature transformations with three tabular data explainers: TabularExplainer (SHAP), MimicExplainer (global surrogate), and PFIExplainer.\n", "\n", - "| ![Interpretability Toolkit Architecture](./img/interpretability-architecture.PNG) |\n", + "| ![Interpretability Toolkit Architecture](./img/interpretability-architecture.png) |\n", "|:--:|\n", "| *Interpretability Toolkit Architecture* |\n", "\n", @@ -93,22 +93,22 @@ "\n", "# Explainers:\n", "# 1. SHAP Tabular Explainer\n", - "from azureml.explain.model.tabular_explainer import TabularExplainer\n", + "from interpret.ext.blackbox import TabularExplainer\n", "\n", "# OR\n", "\n", "# 2. Mimic Explainer\n", - "from azureml.explain.model.mimic.mimic_explainer import MimicExplainer\n", + "from interpret.ext.blackbox import MimicExplainer\n", "# You can use one of the following four interpretable models as a global surrogate to the black box model\n", - "from azureml.explain.model.mimic.models.lightgbm_model import LGBMExplainableModel\n", - "from azureml.explain.model.mimic.models.linear_model import LinearExplainableModel\n", - "from azureml.explain.model.mimic.models.linear_model import SGDExplainableModel\n", - "from azureml.explain.model.mimic.models.tree_model import DecisionTreeExplainableModel\n", + "from interpret.ext.glassbox import LGBMExplainableModel\n", + "from interpret.ext.glassbox import LinearExplainableModel\n", + "from interpret.ext.glassbox import SGDExplainableModel\n", + "from interpret.ext.glassbox import DecisionTreeExplainableModel\n", "\n", "# OR\n", "\n", "# 3. 
PFI Explainer\n", - "from azureml.explain.model.permutation.permutation_importance import PFIExplainer " + "from interpret.ext.blackbox import PFIExplainer " ] }, { @@ -442,7 +442,7 @@ "metadata": {}, "outputs": [], "source": [ - "from azureml.contrib.explain.model.visualize import ExplanationDashboard" + "from azureml.contrib.interpret.visualize import ExplanationDashboard" ] }, { diff --git a/how-to-use-azureml/explain-model/tabular-data/advanced-feature-transformations-explain-local.yml b/how-to-use-azureml/explain-model/tabular-data/advanced-feature-transformations-explain-local.yml index f3ff11fb..ee9cbc2b 100644 --- a/how-to-use-azureml/explain-model/tabular-data/advanced-feature-transformations-explain-local.yml +++ b/how-to-use-azureml/explain-model/tabular-data/advanced-feature-transformations-explain-local.yml @@ -2,6 +2,7 @@ name: advanced-feature-transformations-explain-local dependencies: - pip: - azureml-sdk - - azureml-explain-model - - azureml-contrib-explain-model + - interpret + - azureml-interpret + - azureml-contrib-interpret - sklearn-pandas diff --git a/how-to-use-azureml/explain-model/tabular-data/explain-binary-classification-local.ipynb b/how-to-use-azureml/explain-model/tabular-data/explain-binary-classification-local.ipynb index ef7003fa..39d47f94 100644 --- a/how-to-use-azureml/explain-model/tabular-data/explain-binary-classification-local.ipynb +++ b/how-to-use-azureml/explain-model/tabular-data/explain-binary-classification-local.ipynb @@ -48,7 +48,7 @@ "\n", "We will showcase three tabular data explainers: TabularExplainer (SHAP), MimicExplainer (global surrogate), and PFIExplainer.\n", "\n", - "| ![Interpretability Toolkit Architecture](./img/interpretability-architecture.PNG) |\n", + "| ![Interpretability Toolkit Architecture](./img/interpretability-architecture.png) |\n", "|:--:|\n", "| *Interpretability Toolkit Architecture* |\n", "\n", @@ -86,22 +86,22 @@ "\n", "# Explainers:\n", "# 1. SHAP Tabular Explainer\n", - "from azureml.explain.model.tabular_explainer import TabularExplainer\n", + "from interpret.ext.blackbox import TabularExplainer\n", "\n", "# OR\n", "\n", "# 2. Mimic Explainer\n", - "from azureml.explain.model.mimic.mimic_explainer import MimicExplainer\n", + "from interpret.ext.blackbox import MimicExplainer\n", "# You can use one of the following four interpretable models as a global surrogate to the black box model\n", - "from azureml.explain.model.mimic.models.lightgbm_model import LGBMExplainableModel\n", - "from azureml.explain.model.mimic.models.linear_model import LinearExplainableModel\n", - "from azureml.explain.model.mimic.models.linear_model import SGDExplainableModel\n", - "from azureml.explain.model.mimic.models.tree_model import DecisionTreeExplainableModel\n", + "from interpret.ext.glassbox import LGBMExplainableModel\n", + "from interpret.ext.glassbox import LinearExplainableModel\n", + "from interpret.ext.glassbox import SGDExplainableModel\n", + "from interpret.ext.glassbox import DecisionTreeExplainableModel\n", "\n", "# OR\n", "\n", "# 3. 
PFI Explainer\n", - "from azureml.explain.model.permutation.permutation_importance import PFIExplainer " + "from interpret.ext.blackbox import PFIExplainer " ] }, { @@ -322,7 +322,7 @@ "metadata": {}, "outputs": [], "source": [ - "from azureml.contrib.explain.model.visualize import ExplanationDashboard" + "from azureml.contrib.interpret.visualize import ExplanationDashboard" ] }, { diff --git a/how-to-use-azureml/explain-model/tabular-data/explain-binary-classification-local.yml b/how-to-use-azureml/explain-model/tabular-data/explain-binary-classification-local.yml index 08042837..0a34ef54 100644 --- a/how-to-use-azureml/explain-model/tabular-data/explain-binary-classification-local.yml +++ b/how-to-use-azureml/explain-model/tabular-data/explain-binary-classification-local.yml @@ -2,5 +2,6 @@ name: explain-binary-classification-local dependencies: - pip: - azureml-sdk - - azureml-explain-model - - azureml-contrib-explain-model + - interpret + - azureml-interpret + - azureml-contrib-interpret diff --git a/how-to-use-azureml/explain-model/tabular-data/explain-multiclass-classification-local.ipynb b/how-to-use-azureml/explain-model/tabular-data/explain-multiclass-classification-local.ipynb index a5d7e7f9..9ff417da 100644 --- a/how-to-use-azureml/explain-model/tabular-data/explain-multiclass-classification-local.ipynb +++ b/how-to-use-azureml/explain-model/tabular-data/explain-multiclass-classification-local.ipynb @@ -49,7 +49,7 @@ "\n", "We will showcase three tabular data explainers: TabularExplainer (SHAP), MimicExplainer (global surrogate), and PFIExplainer.\n", "\n", - "| ![Interpretability Toolkit Architecture](./img/interpretability-architecture.PNG) |\n", + "| ![Interpretability Toolkit Architecture](./img/interpretability-architecture.png) |\n", "|:--:|\n", "| *Interpretability Toolkit Architecture* |\n", "\n", @@ -87,22 +87,22 @@ "\n", "# Explainers:\n", "# 1. SHAP Tabular Explainer\n", - "from azureml.explain.model.tabular_explainer import TabularExplainer\n", + "from interpret.ext.blackbox import TabularExplainer\n", "\n", "# OR\n", "\n", "# 2. Mimic Explainer\n", - "from azureml.explain.model.mimic.mimic_explainer import MimicExplainer\n", + "from interpret.ext.blackbox import MimicExplainer\n", "# You can use one of the following four interpretable models as a global surrogate to the black box model\n", - "from azureml.explain.model.mimic.models.lightgbm_model import LGBMExplainableModel\n", - "from azureml.explain.model.mimic.models.linear_model import LinearExplainableModel\n", - "from azureml.explain.model.mimic.models.linear_model import SGDExplainableModel\n", - "from azureml.explain.model.mimic.models.tree_model import DecisionTreeExplainableModel\n", + "from interpret.ext.glassbox import LGBMExplainableModel\n", + "from interpret.ext.glassbox import LinearExplainableModel\n", + "from interpret.ext.glassbox import SGDExplainableModel\n", + "from interpret.ext.glassbox import DecisionTreeExplainableModel\n", "\n", "# OR\n", "\n", "# 3. 
PFI Explainer\n", - "from azureml.explain.model.permutation.permutation_importance import PFIExplainer " + "from interpret.ext.blackbox import PFIExplainer " ] }, { @@ -326,7 +326,7 @@ "metadata": {}, "outputs": [], "source": [ - "from azureml.contrib.explain.model.visualize import ExplanationDashboard" + "from azureml.contrib.interpret.visualize import ExplanationDashboard" ] }, { diff --git a/how-to-use-azureml/explain-model/tabular-data/explain-multiclass-classification-local.yml b/how-to-use-azureml/explain-model/tabular-data/explain-multiclass-classification-local.yml index 98f22a4d..874c2cae 100644 --- a/how-to-use-azureml/explain-model/tabular-data/explain-multiclass-classification-local.yml +++ b/how-to-use-azureml/explain-model/tabular-data/explain-multiclass-classification-local.yml @@ -2,5 +2,6 @@ name: explain-multiclass-classification-local dependencies: - pip: - azureml-sdk - - azureml-explain-model - - azureml-contrib-explain-model + - interpret + - azureml-interpret + - azureml-contrib-interpret diff --git a/how-to-use-azureml/explain-model/tabular-data/explain-regression-local.ipynb b/how-to-use-azureml/explain-model/tabular-data/explain-regression-local.ipynb index 655c21fe..6d423a0e 100644 --- a/how-to-use-azureml/explain-model/tabular-data/explain-regression-local.ipynb +++ b/how-to-use-azureml/explain-model/tabular-data/explain-regression-local.ipynb @@ -48,7 +48,7 @@ "\n", "We will showcase three tabular data explainers: TabularExplainer (SHAP), MimicExplainer (global surrogate), and PFIExplainer.\n", "\n", - "| ![Interpretability Toolkit Architecture](./img/interpretability-architecture.PNG) |\n", + "| ![Interpretability Toolkit Architecture](./img/interpretability-architecture.png) |\n", "|:--:|\n", "| *Interpretability Toolkit Architecture* |\n", "\n", @@ -86,22 +86,22 @@ "\n", "# Explainers:\n", "# 1. SHAP Tabular Explainer\n", - "from azureml.explain.model.tabular_explainer import TabularExplainer\n", + "from interpret.ext.blackbox import TabularExplainer\n", "\n", "# OR\n", "\n", "# 2. Mimic Explainer\n", - "from azureml.explain.model.mimic.mimic_explainer import MimicExplainer\n", + "from interpret.ext.blackbox import MimicExplainer\n", "# You can use one of the following four interpretable models as a global surrogate to the black box model\n", - "from azureml.explain.model.mimic.models.lightgbm_model import LGBMExplainableModel\n", - "from azureml.explain.model.mimic.models.linear_model import LinearExplainableModel\n", - "from azureml.explain.model.mimic.models.linear_model import SGDExplainableModel\n", - "from azureml.explain.model.mimic.models.tree_model import DecisionTreeExplainableModel\n", + "from interpret.ext.glassbox import LGBMExplainableModel\n", + "from interpret.ext.glassbox import LinearExplainableModel\n", + "from interpret.ext.glassbox import SGDExplainableModel\n", + "from interpret.ext.glassbox import DecisionTreeExplainableModel\n", "\n", "# OR\n", "\n", "# 3. 
PFI Explainer\n", - "from azureml.explain.model.permutation.permutation_importance import PFIExplainer " + "from interpret.ext.blackbox import PFIExplainer " ] }, { @@ -315,7 +315,7 @@ "metadata": {}, "outputs": [], "source": [ - "from azureml.contrib.explain.model.visualize import ExplanationDashboard" + "from azureml.contrib.interpret.visualize import ExplanationDashboard" ] }, { diff --git a/how-to-use-azureml/explain-model/tabular-data/explain-regression-local.yml b/how-to-use-azureml/explain-model/tabular-data/explain-regression-local.yml index 6002b9ab..38fb1aba 100644 --- a/how-to-use-azureml/explain-model/tabular-data/explain-regression-local.yml +++ b/how-to-use-azureml/explain-model/tabular-data/explain-regression-local.yml @@ -2,5 +2,6 @@ name: explain-regression-local dependencies: - pip: - azureml-sdk - - azureml-explain-model - - azureml-contrib-explain-model + - interpret + - azureml-interpret + - azureml-contrib-interpret diff --git a/how-to-use-azureml/explain-model/tabular-data/simple-feature-transformations-explain-local.ipynb b/how-to-use-azureml/explain-model/tabular-data/simple-feature-transformations-explain-local.ipynb index 6e4b280f..1239547c 100644 --- a/how-to-use-azureml/explain-model/tabular-data/simple-feature-transformations-explain-local.ipynb +++ b/how-to-use-azureml/explain-model/tabular-data/simple-feature-transformations-explain-local.ipynb @@ -49,7 +49,7 @@ "\n", "We will showcase raw feature transformations with three tabular data explainers: TabularExplainer (SHAP), MimicExplainer (global surrogate), and PFIExplainer.\n", "\n", - "| ![Interpretability Toolkit Architecture](./img/interpretability-architecture.PNG) |\n", + "| ![Interpretability Toolkit Architecture](./img/interpretability-architecture.png) |\n", "|:--:|\n", "| *Interpretability Toolkit Architecture* |\n", "\n", @@ -92,22 +92,22 @@ "\n", "# Explainers:\n", "# 1. SHAP Tabular Explainer\n", - "from azureml.explain.model.tabular_explainer import TabularExplainer\n", + "from interpret.ext.blackbox import TabularExplainer\n", "\n", "# OR\n", "\n", "# 2. Mimic Explainer\n", - "from azureml.explain.model.mimic.mimic_explainer import MimicExplainer\n", + "from interpret.ext.blackbox import MimicExplainer\n", "# You can use one of the following four interpretable models as a global surrogate to the black box model\n", - "from azureml.explain.model.mimic.models.lightgbm_model import LGBMExplainableModel\n", - "from azureml.explain.model.mimic.models.linear_model import LinearExplainableModel\n", - "from azureml.explain.model.mimic.models.linear_model import SGDExplainableModel\n", - "from azureml.explain.model.mimic.models.tree_model import DecisionTreeExplainableModel\n", + "from interpret.ext.glassbox import LGBMExplainableModel\n", + "from interpret.ext.glassbox import LinearExplainableModel\n", + "from interpret.ext.glassbox import SGDExplainableModel\n", + "from interpret.ext.glassbox import DecisionTreeExplainableModel\n", "\n", "# OR\n", "\n", "# 3. 
PFI Explainer\n", - "from azureml.explain.model.permutation.permutation_importance import PFIExplainer " + "from interpret.ext.blackbox import PFIExplainer " ] }, { @@ -224,7 +224,7 @@ "# Append classifier to preprocessing pipeline.\n", "# Now we have a full prediction pipeline.\n", "clf = Pipeline(steps=[('preprocessor', transformations),\n", - " ('classifier', SVC(kernel='linear', C = 1.0, probability=True))])" + " ('classifier', SVC(C = 1.0, probability=True))])" ] }, { @@ -250,7 +250,7 @@ "# Append classifier to preprocessing pipeline.\n", "# Now we have a full prediction pipeline.\n", "clf = Pipeline(steps=[('preprocessor', transformations),\n", - " ('classifier', SVC(kernel='linear', C = 1.0, probability=True))]) \n", + " ('classifier', SVC(C = 1.0, probability=True))]) \n", "\n", "\n", "\n", @@ -450,7 +450,7 @@ "metadata": {}, "outputs": [], "source": [ - "from azureml.contrib.explain.model.visualize import ExplanationDashboard" + "from azureml.contrib.interpret.visualize import ExplanationDashboard" ] }, { diff --git a/how-to-use-azureml/explain-model/tabular-data/simple-feature-transformations-explain-local.yml b/how-to-use-azureml/explain-model/tabular-data/simple-feature-transformations-explain-local.yml index 969e1f52..91579253 100644 --- a/how-to-use-azureml/explain-model/tabular-data/simple-feature-transformations-explain-local.yml +++ b/how-to-use-azureml/explain-model/tabular-data/simple-feature-transformations-explain-local.yml @@ -2,6 +2,7 @@ name: simple-feature-transformations-explain-local dependencies: - pip: - azureml-sdk - - azureml-explain-model - - azureml-contrib-explain-model + - interpret + - azureml-interpret + - azureml-contrib-interpret - sklearn-pandas diff --git a/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-data-transfer.ipynb b/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-data-transfer.ipynb index 2f9b6fb9..116d3353 100644 --- a/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-data-transfer.ipynb +++ b/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-data-transfer.ipynb @@ -19,12 +19,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Azure Machine Learning Pipeline with DataTranferStep\n", - "This notebook is used to demonstrate the use of DataTranferStep in Azure Machine Learning Pipeline.\n", + "# Azure Machine Learning Pipeline with DataTransferStep\n", + "This notebook is used to demonstrate the use of DataTransferStep in an Azure Machine Learning Pipeline.\n", "\n", "In certain cases, you will need to transfer data from one data location to another. For example, your data may be in Azure SQL Database and you may want to move it to Azure Data Lake storage. Or, your data is in an ADLS account and you want to make it available in the Blob storage. The built-in **DataTransferStep** class helps you transfer data in these situations.\n", "\n", - "The below examples show how to move data between an ADLS account, Blob storage, SQL Server, PostgreSQL server. 
\n", + "The below examples show how to move data between different storage types supported in Azure Machine Learning.\n", "\n", "## Data transfer currently supports following storage types:\n", "\n", @@ -99,7 +99,7 @@ "\n", "For background on registering your data store, consult this article:\n", "\n", - "https://docs.microsoft.com/en-us/azure/data-lake-store/data-lake-store-service-to-service-authenticate-using-active-directory\n", + "https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-access-data\n", "\n", "> Please make sure to update the following code examples with appropriate values." ] @@ -108,13 +108,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Azure Blob Storage" + "### Azure Blob Storage\n", + "\n", + "> Since Blob Storage can contain a file and directory with the same name, you can use **source_reference_type** and **destination_reference_type** optional arguments in DataTransferStep constructor to explicitly specify whether you're referring to the file or the directory." ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "datastore-remarks-sample" + ] + }, "outputs": [], "source": [ "from msrest.exceptions import HttpOperationError\n", @@ -146,7 +152,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Azure Data Lake Storage Gen1" + "### Azure Data Lake Storage Gen1\n", + "\n", + "Please consult the following articles for detailed steps on setting up service principal authentication and assigning correct permissions to Data Lake Storage account:\n", + "\n", + "https://docs.microsoft.com/en-us/azure/data-lake-store/data-lake-store-service-to-service-authenticate-using-active-directory\n", + "https://docs.microsoft.com/en-us/azure/data-factory/connector-azure-data-lake-store#use-service-principal-authentication" ] }, { @@ -188,7 +199,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Azure Data Lake Storage Gen2" + "### Azure Data Lake Storage Gen2\n", + "\n", + "Please consult the following article for detailed steps on setting up service principal authentication and assigning correct permissions to Data lake Storage Gen2 account:\n", + "\n", + "https://docs.microsoft.com/en-us/azure/data-factory/connector-azure-data-lake-storage#service-principal-authentication" ] }, { @@ -228,7 +243,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Azure SQL Database" + "### Azure SQL Database\n", + "\n", + "For enabling service principal authentication for an Azure SQL Database, please follow this section in Azure Data Factory documentation: https://docs.microsoft.com/en-us/azure/data-factory/connector-azure-sql-database#service-principal-authentication\n", + "\n", + "> Note: When copying data **to** an Azure SQL Database, data will be _appended_ to an existing table. We also expect the source file to have a header row and the names should exactly match with column names in destination table." 
] }, { @@ -476,7 +495,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ diff --git a/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-how-to-use-estimatorstep.ipynb b/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-how-to-use-estimatorstep.ipynb index a038b9e9..ec4626a1 100644 --- a/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-how-to-use-estimatorstep.ipynb +++ b/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-how-to-use-estimatorstep.ipynb @@ -158,7 +158,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "datareference-remarks-sample" + ] + }, "outputs": [], "source": [ "from azureml.core import Datastore\n", diff --git a/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-showcasing-datapath-and-pipelineparameter.ipynb b/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-showcasing-datapath-and-pipelineparameter.ipynb index 607f24f0..904ae175 100644 --- a/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-showcasing-datapath-and-pipelineparameter.ipynb +++ b/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-showcasing-datapath-and-pipelineparameter.ipynb @@ -185,7 +185,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "datapath-remarks-sample" + ] + }, "outputs": [], "source": [ "def_blob_store = ws.get_default_datastore()\n", diff --git a/how-to-use-azureml/ml-frameworks/pytorch/training/distributed-pytorch-with-nccl-gloo/pytorch_mnist.py b/how-to-use-azureml/ml-frameworks/pytorch/training/distributed-pytorch-with-nccl-gloo/pytorch_mnist.py index e2b982d2..7a9aeb60 100644 --- a/how-to-use-azureml/ml-frameworks/pytorch/training/distributed-pytorch-with-nccl-gloo/pytorch_mnist.py +++ b/how-to-use-azureml/ml-frameworks/pytorch/training/distributed-pytorch-with-nccl-gloo/pytorch_mnist.py @@ -62,7 +62,7 @@ if args.distributed: dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank) -train_dataset = datasets.MNIST('data', train=True, download=True, +train_dataset = datasets.MNIST('data-%d' % args.rank, train=True, download=True, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) diff --git a/how-to-use-azureml/ml-frameworks/tensorflow/deployment/train-hyperparameter-tune-deploy-with-tensorflow/tf_mnist.py b/how-to-use-azureml/ml-frameworks/tensorflow/deployment/train-hyperparameter-tune-deploy-with-tensorflow/tf_mnist.py index 34bb8fa0..32e0b8f2 100644 --- a/how-to-use-azureml/ml-frameworks/tensorflow/deployment/train-hyperparameter-tune-deploy-with-tensorflow/tf_mnist.py +++ b/how-to-use-azureml/ml-frameworks/tensorflow/deployment/train-hyperparameter-tune-deploy-with-tensorflow/tf_mnist.py @@ -10,7 +10,7 @@ import glob from azureml.core import Run from utils import load_data -print("TensorFlow version:", tf.VERSION) +print("TensorFlow version:", tf.__version__) parser = argparse.ArgumentParser() parser.add_argument('--data-folder', type=str, dest='data_folder', help='data folder mounting point') diff --git a/how-to-use-azureml/ml-frameworks/tensorflow/deployment/train-hyperparameter-tune-deploy-with-tensorflow/train-hyperparameter-tune-deploy-with-tensorflow.yml 
b/how-to-use-azureml/ml-frameworks/tensorflow/deployment/train-hyperparameter-tune-deploy-with-tensorflow/train-hyperparameter-tune-deploy-with-tensorflow.yml index 2d50f3c4..387a8597 100644 --- a/how-to-use-azureml/ml-frameworks/tensorflow/deployment/train-hyperparameter-tune-deploy-with-tensorflow/train-hyperparameter-tune-deploy-with-tensorflow.yml +++ b/how-to-use-azureml/ml-frameworks/tensorflow/deployment/train-hyperparameter-tune-deploy-with-tensorflow/train-hyperparameter-tune-deploy-with-tensorflow.yml @@ -1,14 +1,13 @@ name: train-hyperparameter-tune-deploy-with-tensorflow dependencies: - numpy -- tensorflow - matplotlib - pip: - azureml-sdk - azureml-widgets - pandas - keras - - tensorflow-gpu + - tensorflow-gpu==1.13.2 - matplotlib - azureml-dataprep - fuse diff --git a/how-to-use-azureml/ml-frameworks/tensorflow/training/train-tensorflow-resume-training/tf_mnist_with_checkpoint.py b/how-to-use-azureml/ml-frameworks/tensorflow/training/train-tensorflow-resume-training/tf_mnist_with_checkpoint.py index acfd711e..598c4b08 100644 --- a/how-to-use-azureml/ml-frameworks/tensorflow/training/train-tensorflow-resume-training/tf_mnist_with_checkpoint.py +++ b/how-to-use-azureml/ml-frameworks/tensorflow/training/train-tensorflow-resume-training/tf_mnist_with_checkpoint.py @@ -11,7 +11,7 @@ import glob from azureml.core import Run from utils import load_data -print("TensorFlow version:", tf.VERSION) +print("TensorFlow version:", tf.__version__) parser = argparse.ArgumentParser() parser.add_argument('--data-folder', type=str, dest='data_folder', help='data folder mounting point') diff --git a/how-to-use-azureml/ml-frameworks/tensorflow/training/train-tensorflow-resume-training/train-tensorflow-resume-training.yml b/how-to-use-azureml/ml-frameworks/tensorflow/training/train-tensorflow-resume-training/train-tensorflow-resume-training.yml index 1731084f..c58ca83d 100644 --- a/how-to-use-azureml/ml-frameworks/tensorflow/training/train-tensorflow-resume-training/train-tensorflow-resume-training.yml +++ b/how-to-use-azureml/ml-frameworks/tensorflow/training/train-tensorflow-resume-training/train-tensorflow-resume-training.yml @@ -5,7 +5,7 @@ dependencies: - azureml-widgets - pandas - keras - - tensorflow-gpu + - tensorflow-gpu==1.13.2 - matplotlib - azureml-dataprep - fuse diff --git a/how-to-use-azureml/monitor-models/data-drift/azure-ml-datadrift.ipynb b/how-to-use-azureml/monitor-models/data-drift/azure-ml-datadrift.ipynb index a04ae149..dee55323 100644 --- a/how-to-use-azureml/monitor-models/data-drift/azure-ml-datadrift.ipynb +++ b/how-to-use-azureml/monitor-models/data-drift/azure-ml-datadrift.ipynb @@ -33,9 +33,9 @@ "source": [ "## Install the DataDrift package\n", "\n", - "Install the azureml-contrib-datadrift, azureml-opendatasets and lightgbm packages before running this notebook.\n", + "Install the azureml-datadrift, azureml-opendatasets and lightgbm packages before running this notebook.\n", "```\n", - "pip install azureml-contrib-datadrift\n", + "pip install azureml-datadrift\n", "pip install lightgbm\n", "```" ] @@ -61,15 +61,14 @@ "import numpy as np\n", "import pandas as pd\n", "import requests\n", - "from azureml.contrib.datadrift import DataDriftDetector, AlertConfiguration\n", - "from azureml.opendatasets import NoaaIsdWeather\n", - "from azureml.core import Dataset, Workspace, Run\n", + "from azureml.core import Dataset, Workspace\n", "from azureml.core.compute import AksCompute, ComputeTarget\n", "from azureml.core.conda_dependencies import CondaDependencies\n", - "from 
azureml.core.experiment import Experiment\n", "from azureml.core.image import ContainerImage\n", "from azureml.core.model import Model\n", "from azureml.core.webservice import Webservice, AksWebservice\n", + "from azureml.datadrift import DataDriftDetector, AlertConfiguration\n", + "from azureml.opendatasets import NoaaIsdWeather\n", "from azureml.widgets import RunDetails\n", "from sklearn.externals import joblib\n", "from sklearn.model_selection import train_test_split\n" @@ -248,18 +247,16 @@ "metadata": {}, "outputs": [], "source": [ - "dataset_name = \"dataset\"\n", - "name_suffix = datetime.utcnow().strftime(\"%Y-%m-%d-%H-%M-%S\")\n", - "snapshot_name = \"snapshot-{}\".format(name_suffix)\n", - "\n", + "dataset_name = \"training_dataset\"\n", "dstore = ws.get_default_datastore()\n", "dstore.upload(training_dir, \"data/training\", show_progress=True)\n", - "dpath = dstore.path(\"data/training/training.csv\")\n", - "trainingDataset = Dataset.auto_read_files(dpath, include_path=True)\n", - "trainingDataset = trainingDataset.register(workspace=ws, name=dataset_name, description=\"dset\", exist_ok=True)\n", + "\n", + "datastore_path = [(dstore, 'data/training/training.csv')]\n", + "trainingDataset = Dataset.Tabular.from_delimited_files(path=datastore_path)\n", + "trainingDataset = trainingDataset.register(workspace=ws, name=dataset_name, description=\"training\", create_new_version=True)\n", "\n", "datasets = [(Dataset.Scenario.TRAINING, trainingDataset)]\n", - "print(\"dataset registration done.\\n\")\n", + "print(\"Dataset registration done.\\n\")\n", "datasets" ] }, @@ -505,11 +502,9 @@ "outputs": [], "source": [ "# One Hot Encode the scoring dataset to match the training dataset schema\n", - "columns_dict = model.datasets[\"training\"][0].get_profile().columns\n", - "extra_cols = ('Path', 'Column1')\n", - "for k in extra_cols:\n", - " columns_dict.pop(k, None)\n", - "training_columns = list(columns_dict.keys())\n", + "columns = list(model.datasets[\"training\"][0].to_pandas_dataframe().columns)\n", + "extra_cols = ['Path', 'Column1']\n", + "training_columns = [c for c in columns if c not in extra_cols]\n", "\n", "categorical_columns = scoring_df.dtypes == object\n", "categorical_columns = categorical_columns[categorical_columns == True]\n", @@ -641,9 +636,8 @@ "metadata": {}, "outputs": [], "source": [ - "exp = Experiment(ws, datadrift._id)\n", - "dd_run = Run(experiment=exp, run_id=run)\n", - "RunDetails(dd_run).show()" + "child_run = list(run.get_children())[0]\n", + "RunDetails(child_run).show()" ] }, { @@ -659,9 +653,7 @@ "metadata": {}, "outputs": [], "source": [ - "children = list(dd_run.get_children())\n", - "for child in children:\n", - " child.wait_for_completion()\n", + "child_run.wait_for_completion(wait_post_processing=True)\n", "\n", "drift_metrics = datadrift.get_output(start_time=start, end_time=end)\n", "drift_metrics" diff --git a/how-to-use-azureml/monitor-models/data-drift/azure-ml-datadrift.yml b/how-to-use-azureml/monitor-models/data-drift/azure-ml-datadrift.yml index 5d80f7b7..80428f7f 100644 --- a/how-to-use-azureml/monitor-models/data-drift/azure-ml-datadrift.yml +++ b/how-to-use-azureml/monitor-models/data-drift/azure-ml-datadrift.yml @@ -2,7 +2,7 @@ name: azure-ml-datadrift dependencies: - pip: - azureml-sdk - - azureml-contrib-datadrift + - azureml-datadrift - azureml-opendatasets - lightgbm - azureml-widgets diff --git a/how-to-use-azureml/track-and-monitor-experiments/logging-api/logging-api.ipynb 
b/how-to-use-azureml/track-and-monitor-experiments/logging-api/logging-api.ipynb index 7f0f6169..2a23024e 100644 --- a/how-to-use-azureml/track-and-monitor-experiments/logging-api/logging-api.ipynb +++ b/how-to-use-azureml/track-and-monitor-experiments/logging-api/logging-api.ipynb @@ -100,7 +100,7 @@ "\n", "# Check core SDK version number\n", "\n", - "print(\"This notebook was created using SDK version 1.0.65, you are currently running version\", azureml.core.VERSION)" + "print(\"This notebook was created using SDK version 1.0.69, you are currently running version\", azureml.core.VERSION)" ] }, { diff --git a/how-to-use-azureml/track-and-monitor-experiments/tensorboard/tensorboard.yml b/how-to-use-azureml/track-and-monitor-experiments/tensorboard/tensorboard.yml index de683457..3709016d 100644 --- a/how-to-use-azureml/track-and-monitor-experiments/tensorboard/tensorboard.yml +++ b/how-to-use-azureml/track-and-monitor-experiments/tensorboard/tensorboard.yml @@ -3,4 +3,4 @@ dependencies: - pip: - azureml-sdk - azureml-tensorboard - - tensorflow + - tensorflow<2.0.0 diff --git a/how-to-use-azureml/training-with-deep-learning/export-run-history-to-tensorboard/export-run-history-to-tensorboard.yml b/how-to-use-azureml/training-with-deep-learning/export-run-history-to-tensorboard/export-run-history-to-tensorboard.yml index bb1effa4..fe20bc28 100644 --- a/how-to-use-azureml/training-with-deep-learning/export-run-history-to-tensorboard/export-run-history-to-tensorboard.yml +++ b/how-to-use-azureml/training-with-deep-learning/export-run-history-to-tensorboard/export-run-history-to-tensorboard.yml @@ -3,7 +3,7 @@ dependencies: - pip: - azureml-sdk - azureml-tensorboard - - tensorflow + - tensorflow<2.0.0 - tqdm - scipy - sklearn diff --git a/how-to-use-azureml/work-with-data/README.md b/how-to-use-azureml/work-with-data/README.md index cfc265b2..b5ed6340 100644 --- a/how-to-use-azureml/work-with-data/README.md +++ b/how-to-use-azureml/work-with-data/README.md @@ -1,9 +1,20 @@ -# Work With Data Using Azure Machine Learning Service +# Azure Machine Learning datasets (preview) -Azure Machine Learning Datasets (preview) make it easier to access and work with your data. Datasets manage data in various scenarios such as model training and pipeline creation. Using the Azure Machine Learning SDK, you can access underlying storage, explore and prepare data, manage the life cycle of different Dataset definitions, and compare between Datasets used in training and in production. +Azure Machine Learning datasets (preview) let data scientists and machine learning engineers apply data for ML with confidence. By creating a dataset, you create a reference to the data source location, along with a copy of its metadata. The data remains in its existing location, so no extra storage cost is incurred. -- For an example of using Datasets, see the [sample](datasets). -- For advanced data preparation examples, see [dataprep](dataprep). +With Azure Machine Learning datasets, you can: +* **Keep a single copy of data in your storage** referenced by datasets. +* **Easily access data during model training** without worrying about connection strings or data paths. +* **Share data & collaborate** with other users.
-![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/README..png) \ No newline at end of file +## Learn how to use Azure Machine Learning datasets +* [Create and register datasets](https://aka.ms/azureml/howto/createdatasets) +* Use [Datasets in training](datasets-tutorial/train-with-datasets.ipynb) +* Use TabularDatasets in [automated machine learning training](https://aka.ms/automl-dataset) +* Use FileDatasets in [image classification](https://aka.ms/filedataset-samplenotebook) +* Use FileDatasets in [deep learning with hyperparameter tuning](https://aka.ms/filedataset-hyperdrive) +* For existing Dataset users: [Dataset API change notice](dataset-api-change-notice.md) + + +![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/README.png) \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/README.md b/how-to-use-azureml/work-with-data/dataprep/README.md deleted file mode 100644 index a356d134..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/README.md +++ /dev/null @@ -1,300 +0,0 @@ -# Azure Machine Learning Data Prep SDK - -The Azure Machine Learning Data Prep SDK helps data scientists explore, cleanse and transform data for machine learning workflows in any Python environment. - -Key benefits to the SDK: -- Cross-platform functionality. Write with a single SDK and run it on Windows, macOS, or Linux. -- Intelligent transformations powered by AI, including grouping similar values to their canonical form and deriving columns by examples without custom code. -- Capability to work with large, multiple files of different schema. -- Scalability on a single machine by streaming data during processing rather than loading into memory. -- Seamless integration with other Azure Machine Learning services. You can simply pass your prepared data file into `AutoMLConfig` object for automated machine learning training. - -You will find in this repo: -- [Getting Started Tutorial](tutorials/getting-started/getting-started.ipynb) for a quick introduction to the main features of Data Prep SDK. -- [Case Study Notebooks](case-studies/new-york-taxi) that present an end-to-end data preparation tutorial where users start with small dataset, profile data with statistics summary, cleanse and perform feature engineering. All transformation steps are saved in a dataflow object. Users can easily reapply the same steps on the full dataset, and run it on Spark. -- [How-To Guide Notebooks](how-to-guides) for more in-depth sample code at feature level. - -## Installation -Here are the [SDK installation steps](https://aka.ms/aml-data-prep-installation). - -## Documentation -Here is more information on how to use the new Data Prep SDK: -- [SDK overview and API reference docs](http://aka.ms/data-prep-sdk) that show different classes, methods, and function parameters for the SDK. -- [Tutorial: Prep NYC taxi data](https://docs.microsoft.com/azure/machine-learning/service/tutorial-data-prep) for regression modeling and then run automated machine learning to build the model. -- [How to load data](https://docs.microsoft.com/azure/machine-learning/service/how-to-load-data) is an overview guide on how to load data using the Data Prep SDK. -- [How to transform data](https://docs.microsoft.com/azure/machine-learning/service/how-to-transform-data) is an overview guide on how to transform data. 
-- [How to write data](https://docs.microsoft.com/azure/machine-learning/service/how-to-write-data) is an overview guide on how to write data to different storage locations. - -## Support - -If you have any questions or feedback, send us an email at: [askamldataprep@microsoft.com](mailto:askamldataprep@microsoft.com). - -## Release Notes - -### 2019-07-25 (version 1.1.9) -New features -- Added support for reading a file directly from a http or https url. - -Bug fixes and improvements -- Improved error message when attempting to read a Parquet Dataset from a remote source (which is not currently supported). -- Fixed a bug when writing to Parquet file format in ADLS Gen 2, and updating the ADLS Gen 2 container name in the path. - -### 2019-07-09 (version 1.1.8) - -New features -- Dataflow objects can now be iterated over, producing a sequence of records. See documentation for `Dataflow.to_record_iterator`. - -Bug fixes and improvements -- Increased the robustness of DataPrep SDK. -- Improved handling of pandas DataFrames with non-string Column Indexes. -- Improved the performance of `to_pandas_dataframe` in Datasets. -- Fixed a bug where Spark execution of Datasets failed when run in a multi-node environment. - -### 2019-07-01 (version 1.1.7) - -We reverted a change that improved performance, as it was causing issues for some customers using Azure Databricks. If you experienced an issue on Azure Databricks, you can upgrade to version 1.1.7 using one of the methods below: -1. Run this script to upgrade: `%sh /home/ubuntu/databricks/python/bin/pip install azureml-dataprep==1.1.7` -2. Recreate the cluster, which will install the latest Data Prep SDK version. - -### 2019-06-24 (version 1.1.6) - -New features -- Added summary functions for top values (`SummaryFunction.TOPVALUES`) and bottom values (`SummaryFunction.BOTTOMVALUES`). - -Bug fixes and improvements -- Significantly improved the performance of `read_pandas_dataframe`. -- Fixed a bug that would cause `get_profile()` on a Dataflow pointing to binary files to fail. -- Exposed `set_diagnostics_collection()` to allow for programmatic enabling/disabling of the telemetry collection. -- Changed the behavior of `get_profile()`. NaN values are now ignored for Min, Mean, Std, and Sum, which aligns with the behavior of Pandas. - -### 2019-06-10 (version 1.1.5) - -Bug fixes and improvements -- For interpreted datetime values that have a 2-digit year format, the range of valid years has been updated to match Windows May Release. The range has been changed from 1930-2029 to 1950-2049. -- When reading in a file and setting `handleQuotedLineBreaks=True`, `\r` will be treated as a new line. -- Fixed a bug that caused `read_pandas_dataframe` to fail in some cases. -- Improved performance of `get_profile`. -- Improved error messages. - -### 2019-05-28 (version 1.1.4) - -New features -- You can now use the following expression language functions to extract and parse datetime values into new columns. - - `RegEx.extract_record()` extracts datetime elements into a new column. - - `create_datetime()` creates datetime objects from separate datetime elements. -- When calling `get_profile()`, you can now see that quantile columns are labeled as (est.) to clearly indicate that the values are approximations. -- You can now use ** globbing when reading from Azure Blob Storage. - - e.g. 
`dprep.read_csv(path='https://yourblob.blob.core.windows.net/yourcontainer/**/data/*.csv')` - -Bug fixes -- Fixed a bug related to reading a Parquet file from a remote source (Azure Blob). - -### 2019-05-08 (version 1.1.3) - -New features -- Added support to read from a PostgresSQL database, either by calling `read_postgresql` or using a Datastore. - - See examples in how-to guides: - - [Data Ingestion notebook](https://aka.ms/aml-data-prep-ingestion-nb) - - [Datastore notebook](https://aka.ms/aml-data-prep-datastore-nb) - -Bug fixes and improvements -- Fixed issues with column type conversion: - - Now correctly converts a boolean or numeric column to a boolean column. - - Now does not fail when attempting to set a date column to be date type. -- Improved JoinType types and accompanying reference documentation. When joining two dataflows, you can now specify one of these types of join: - - NONE, MATCH, INNER, UNMATCHLEFT, LEFTANTI, LEFTOUTER, UNMATCHRIGHT, RIGHTANTI, RIGHTOUTER, FULLANTI, FULL. -- Improved data type inference to recognize more date formats. - -### 2019-04-17 (version 1.1.2) - -Note: Data Prep Python SDK will no longer install `numpy` and `pandas` packages. See [updated installation instructions](https://aka.ms/aml-data-prep-installation). - -New features -- You can now use the Pivot transform. - - How-to guide: [Pivot notebook](https://aka.ms/aml-data-prep-pivot-nb) -- You can now use regular expressions in native functions. - - Examples: - - `dflow.filter(dprep.RegEx('pattern').is_match(dflow['column_name']))` - - `dflow.assert_value('column_name', dprep.RegEx('pattern').is_match(dprep.value))` -- You can now use `to_upper` and `to_lower` functions in expression language. -- You can now see the number of unique values of each column in a data profile. -- For some of the commonly used reader steps, you can now pass in the `infer_column_types` argument. If it is set to `True`, Data Prep will attempt to detect and automatically convert column types. - - `inference_arguments` is now deprecated. -- You can now call `Dataflow.shape`. - -Bug fixes and improvements -- `keep_columns` now accepts an additional optional argument `validate_column_exists`, which checks if the result of `keep_columns` will contain any columns. -- All reader steps (which read from a file) now accept an additional optional argument `verify_exists`. -- Improved performance of reading from pandas dataframe and getting data profiles. -- Fixed a bug where slicing a single step from a Dataflow failed with a single index. - -### 2019-04-08 (version 1.1.1) - -New features -- You can read multiple Datastore/DataPath/DataReference sources using read_* transforms. -- You can perform the following operations on columns to create a new column: division, floor, modulo, power, length. -- Data Prep is now part of the Azure ML diagnostics suite and will log diagnostic information by default. - - To turn this off, set this environment variable to true: DISABLE_DPREP_LOGGER - -Bug fixes and improvements -- Improved code documentation for commonly used classes and functions. -- Fixed a bug in auto_read_file that failed to read Excel files. -- Added option to overwrite the folder in read_pandas_dataframe. -- Improved performance of dotnetcore2 dependency installation, and added support for Fedora 27/28 and Ubuntu 1804. -- Improved the performance of reading from Azure Blobs. -- Column type detection now supports columns of type Long. 
-- Fixed a bug where some date values were being displayed as timestamps instead of Python datetime objects. -- Fixed a bug where some type counts were being displayed as doubles instead of integers. - -### 2019-03-25 (version 1.1.0) - -Breaking changes -- The concept of the Data Prep Package has been deprecated and is no longer supported. Instead of persisting multiple Dataflows in one Package, you can persist Dataflows individually. - - How-to guide: [Opening and Saving Dataflows notebook](https://aka.ms/aml-data-prep-open-save-dataflows-nb) - -New features -- Data Prep can now recognize columns that match a particular Semantic Type, and split accordingly. The STypes currently supported include: email address, geographic coordinates (latitude & longitude), IPv4 and IPv6 addresses, US phone number, and US zip code. - - How-to guide: [Semantic Types notebook](https://aka.ms/aml-data-prep-semantic-types-nb) -- Data Prep now supports the following operations to generate a resultant column from two numeric columns: subtract, multiply, divide, and modulo. -- You can call `verify_has_data()` on a Dataflow to check whether the Dataflow would produce records if executed. - -Bug fixes and improvements -- You can now specify the number of bins to use in a histogram for numeric column profiles. -- The `read_pandas_dataframe` transform now requires the DataFrame to have string- or byte- typed column names. -- Fixed a bug in the `fill_nulls` transform, where values were not correctly filled in if the column was missing. - -### 2019-03-11 (version 1.0.17) - -New features -- Now supports adding two numeric columns to generate a resultant column using the expression language. - -Bug fixes and improvements -- Improved the documentation and parameter checking for random_split. - -### 2019-02-27 (version 1.0.16) - -Bug fix -- Fixed a Service Principal authentication issue that was caused by an API change. - -### 2019-02-25 (version 1.0.15) - -New features -- Data Prep now supports writing file streams from a dataflow. Also provides the ability to manipulate the file stream names to create new file names. - - How-to guide: [Working With File Streams notebook](https://aka.ms/aml-data-prep-file-stream-nb) - -Bug fixes and improvements -- Improved performance of t-Digest on large data sets. -- Data Prep now supports reading data from a DataPath. -- One hot encoding now works on boolean and numeric columns. -- Other miscellaneous bug fixes. - -### 2019-02-11 (version 1.0.12) - -New features -- Data Prep now supports reading from an Azure SQL database using Datastore. - -Changes -- Significantly improved the memory performance of certain operations on large data. -- `read_pandas_dataframe()` now requires `temp_folder` to be specified. -- The `name` property on `ColumnProfile` has been deprecated - use `column_name` instead. - -### 2019-01-28 (version 1.0.8) - -Bug fixes -- Significantly improved the performance of getting data profiles. -- Fixed minor bugs related to error reporting. - -### 2019-01-14 (version 1.0.7) - -New features -- Datastore improvements (documented in [Datastore how-to-guide](https://aka.ms/aml-data-prep-datastore-nb)) - - Added ability to read from and write to Azure File Share and ADLS Datastores in scale-up. - - When using Datastores, Data Prep now supports using service principal authentication instead of interactive authentication. - - Added support for wasb and wasbs urls. 
- -### 2019-01-09 (version 1.0.6) - -Bug fixes -- Fixed bug with reading from public readable Azure Blob containers on Spark. - -### 2018-12-19 (version 1.0.4) - -New features -- `to_bool` function now allows mismatched values to be converted to Error values. This is the new default mismatch behavior for `to_bool` and `set_column_types`, whereas the previous default behavior was to convert mismatched values to False. -- When calling `to_pandas_dataframe`, there is a new option to interpret null/missing values in numeric columns as NaN. -- Added ability to check the return type of some expressions to ensure type consistency and fail early. -- You can now call `parse_json` to parse values in a column as JSON objects and expand them into multiple columns. - -Bug fixes -- Fixed a bug that crashed `set_column_types` in Python 3.5.2. -- Fixed a bug that crashed when connecting to Datastore using an AML image. - -### 2018-12-07 (version 0.5.3) - -Fixed missing dependency issue for .NET Core2 on Ubuntu 16. - -### 2018-12-03 (version 0.5.2) - -Breaking changes -- `SummaryFunction.N` was renamed to `SummaryFunction.Count`. - -Bug fixes -- Use latest AML Run Token when reading from and writing to datastores on remote runs. Previously, if the AML Run Token is updated in Python, the Data Prep runtime will not be updated with the updated AML Run Token. -- Additional clearer error messages -- to_spark_dataframe() will no longer crash when Spark uses Kryo serialization -- Value Count Inspector can now show more than 1000 unique values -- Random Split no longer fails if the original Dataflow doesn’t have a name - -### 2018-11-19 (version 0.5.0) - -New features -- Created a new DataPrep CLI to execute DataPrep packages and view the data profile for a dataset or dataflow -- Redesigned SetColumnType API to improve usability -- Renamed smart_read_file to auto_read_file -- Now includes skew and kurtosis in the Data Profile -- Can sample with stratified sampling -- Can read from zip files that contain CSV files -- Can split datasets row-wise with Random Split (e.g. into test-train sets) -- Can get all the column data types from a dataflow or a data profile by calling .dtypes -- Can get the row count from a dataflow or a data profile by calling .row_count - -Bug fixes -- Fixed long to double conversion -- Fixed assert after any add column -- Fixed an issue with FuzzyGrouping, where it would not detect groups in some cases -- Fixed sort function to respect multi-column sort order -- Fixed and/or expressions to be similar to how Pandas handles them -- Fixed reading from dbfs path. 
-- Made error messages more understandable -- Now no longer fails when reading on remote compute target using AML token -- Now no longer fails on Linux DSVM -- Now no longer crashes when non-string values are in string predicates -- Now handles assertion errors when Dataflow should fail correctly -- Now supports dbutils mounted storage locations on Azure Databricks - -### 2018-11-05 (version 0.4.0) - -New features -- Type Count added to Data Profile -- Value Count and Histogram is now available -- More percentiles in Data Profile -- The Median is available in Summarize -- Python 3.7 is now supported -- When you save a dataflow that contains datastores to a Data Prep package, the datastore information will be persisted as part of the Data Prep package -- Writing to datastore is now supported - -Bug fixes -- 64bit unsigned integer overflows are now handled properly on Linux -- Fixed incorrect text label for plain text files in smart_read -- String column type now shows up in metrics view -- Type count now is fixed to show ValueKinds mapped to single FieldType instead of individual ones -- Write_to_csv no longer fails when path is provided as a string -- When using Replace, leaving “find” blank will no longer fail - -## Datasets License Information - -IMPORTANT: Please read the notice and find out more about this NYC Taxi and Limousine Commission dataset here: http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml - -IMPORTANT: Please read the notice and find out more about this Chicago Police Department dataset here: https://catalog.data.gov/dataset/crimes-2001-to-present-398a4 - -![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/README.png) diff --git a/how-to-use-azureml/work-with-data/dataprep/case-studies/new-york-taxi/new-york-taxi.ipynb b/how-to-use-azureml/work-with-data/dataprep/case-studies/new-york-taxi/new-york-taxi.ipynb deleted file mode 100644 index 36dfd27a..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/case-studies/new-york-taxi/new-york-taxi.ipynb +++ /dev/null @@ -1,513 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Cleaning up New York Taxi Cab data\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's use DataPrep to clean and featurize the data which can then be used to predict taxi trip duration. We will not use the For Hire Vehicle (FHV) datasets as they are not really taxi rides and they don't provide drop-off time and geo-coordinates." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from IPython.display import display\n", - "from os import path\n", - "from tempfile import mkdtemp\n", - "\n", - "import pandas as pd\n", - "import azureml.dataprep as dprep" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's take a quick peek at yellow cab data and green cab data to see what the data looks like. DataPrep supports globing, so you will notice below that we have added a `*` in the path.\n", - "\n", - "*We are using a small sample of the taxi data for this demo. 
You can find a bigger sample ~6GB by changing \"green-small\" to \"green-sample\" and \"yellow-small\" to \"yellow-sample\" in the paths below.*" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pd.set_option('display.max_columns', None)\n", - "\n", - "cache_location = mkdtemp()\n", - "green_path = \"https://dprepdata.blob.core.windows.net/demo/green-small/*\"\n", - "yellow_path = \"https://dprepdata.blob.core.windows.net/demo/yellow-small/*\"\n", - "# (optional) Download and view a subset of the data: https://dprepdata.blob.core.windows.net/demo/green-small/green_tripdata_2013-08.csv\n", - "\n", - "print(\"Retrieving data from the following two sources:\")\n", - "print(green_path)\n", - "print(yellow_path)\n", - "\n", - "green_df = dprep.read_csv(path=green_path, header=dprep.PromoteHeadersMode.GROUPED)\n", - "yellow_df = dprep.auto_read_file(path=yellow_path)\n", - "\n", - "display(green_df.head(5))\n", - "display(yellow_df.head(5))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Cleanup" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's define some shortcut transforms that will apply to all Dataflows." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "all_columns = dprep.ColumnSelector(term=\".*\", use_regex=True)\n", - "drop_if_all_null = [all_columns, dprep.ColumnRelationship(dprep.ColumnRelationship.ALL)]\n", - "useful_columns = [\n", - " \"cost\", \"distance\"\"distance\", \"dropoff_datetime\", \"dropoff_latitude\", \"dropoff_longitude\",\n", - " \"passengers\", \"pickup_datetime\", \"pickup_latitude\", \"pickup_longitude\", \"store_forward\", \"vendor\"\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's first work with the green taxi data and get it into a good shape that then can be combined with the yellow taxi data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tmp_df = (green_df\n", - " .replace_na(columns=all_columns)\n", - " .drop_nulls(*drop_if_all_null)\n", - " .rename_columns(column_pairs={\n", - " \"VendorID\": \"vendor\",\n", - " \"lpep_pickup_datetime\": \"pickup_datetime\",\n", - " \"Lpep_dropoff_datetime\": \"dropoff_datetime\",\n", - " \"lpep_dropoff_datetime\": \"dropoff_datetime\",\n", - " \"Store_and_fwd_flag\": \"store_forward\",\n", - " \"store_and_fwd_flag\": \"store_forward\",\n", - " \"Pickup_longitude\": \"pickup_longitude\",\n", - " \"Pickup_latitude\": \"pickup_latitude\",\n", - " \"Dropoff_longitude\": \"dropoff_longitude\",\n", - " \"Dropoff_latitude\": \"dropoff_latitude\",\n", - " \"Passenger_count\": \"passengers\",\n", - " \"Fare_amount\": \"cost\",\n", - " \"Trip_distance\": \"distance\"\n", - " })\n", - " .keep_columns(columns=useful_columns))\n", - "tmp_df.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "green_df = tmp_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's do the same thing to yellow taxi data." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tmp_df = (yellow_df\n", - " .replace_na(columns=all_columns)\n", - " .drop_nulls(*drop_if_all_null)\n", - " .rename_columns(column_pairs={\n", - " \"vendor_name\": \"vendor\",\n", - " \"VendorID\": \"vendor\",\n", - " \"vendor_id\": \"vendor\",\n", - " \"Trip_Pickup_DateTime\": \"pickup_datetime\",\n", - " \"tpep_pickup_datetime\": \"pickup_datetime\",\n", - " \"Trip_Dropoff_DateTime\": \"dropoff_datetime\",\n", - " \"tpep_dropoff_datetime\": \"dropoff_datetime\",\n", - " \"store_and_forward\": \"store_forward\",\n", - " \"store_and_fwd_flag\": \"store_forward\",\n", - " \"Start_Lon\": \"pickup_longitude\",\n", - " \"Start_Lat\": \"pickup_latitude\",\n", - " \"End_Lon\": \"dropoff_longitude\",\n", - " \"End_Lat\": \"dropoff_latitude\",\n", - " \"Passenger_Count\": \"passengers\",\n", - " \"passenger_count\": \"passengers\",\n", - " \"Fare_Amt\": \"cost\",\n", - " \"fare_amount\": \"cost\",\n", - " \"Trip_Distance\": \"distance\",\n", - " \"trip_distance\": \"distance\"\n", - " })\n", - " .keep_columns(columns=useful_columns))\n", - "tmp_df.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "yellow_df = tmp_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's now append the rows from the `yellow_df` to `green_df`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "combined_df = green_df.append_rows(dataflows=[yellow_df])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's take a look at the pickup and drop-off coordinates' data profile to see how the data is distributed." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "decimal_type = dprep.TypeConverter(data_type=dprep.FieldType.DECIMAL)\n", - "combined_df = combined_df.set_column_types(type_conversions={\n", - " \"pickup_longitude\": decimal_type,\n", - " \"pickup_latitude\": decimal_type,\n", - " \"dropoff_longitude\": decimal_type,\n", - " \"dropoff_latitude\": decimal_type\n", - "})\n", - "combined_df.keep_columns(columns=[\n", - " \"pickup_longitude\", \"pickup_latitude\", \n", - " \"dropoff_longitude\", \"dropoff_latitude\"\n", - "]).get_profile()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From the data profile, we can see that there are coordinates that are missing and coordinates that are not in New York. Let's filter out coordinates not in the [city border](https://mapmakerapp.com?map=5b60a055a191245990310739f658)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tmp_df = (combined_df\n", - " .drop_nulls(\n", - " columns=[\"pickup_longitude\", \"pickup_latitude\", \"dropoff_longitude\", \"dropoff_latitude\"],\n", - " column_relationship=dprep.ColumnRelationship(dprep.ColumnRelationship.ANY)\n", - " ) \n", - " .filter(dprep.f_and(\n", - " dprep.col(\"pickup_longitude\") <= -73.72,\n", - " dprep.col(\"pickup_longitude\") >= -74.09,\n", - " dprep.col(\"pickup_latitude\") <= 40.88,\n", - " dprep.col(\"pickup_latitude\") >= 40.53,\n", - " dprep.col(\"dropoff_longitude\") <= -73.72,\n", - " dprep.col(\"dropoff_longitude\") >= -74.09,\n", - " dprep.col(\"dropoff_latitude\") <= 40.88,\n", - " dprep.col(\"dropoff_latitude\") >= 40.53\n", - " )))\n", - "tmp_df.keep_columns(columns=[\n", - " \"pickup_longitude\", \"pickup_latitude\", \n", - " \"dropoff_longitude\", \"dropoff_latitude\"\n", - "]).get_profile()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "combined_df = tmp_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's take a look at the data profile for the `store_forward` column." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "combined_df.keep_columns(columns='store_forward').get_profile()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From the data profile of `store_forward` above, we can see that the data is inconsistent and there are missing values. Let's fix them." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "combined_df = combined_df.replace(columns=\"store_forward\", find=\"0\", replace_with=\"N\").fill_nulls(\"store_forward\", \"N\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's now split the pick up and drop off datetimes into a date column and a time column. We will use `split_column_by_example` to perform the split. If the `example` parameter of `split_column_by_example` is omitted, we will automatically try to figure out where to split based on the data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tmp_df = (combined_df\n", - " .split_column_by_example(source_column=\"pickup_datetime\")\n", - " .split_column_by_example(source_column=\"dropoff_datetime\"))\n", - "tmp_df.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "combined_df = tmp_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's rename the columns generated by `split_column_by_example` into meaningful names." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tmp_df = (combined_df\n", - " .rename_columns(column_pairs={\n", - " \"pickup_datetime_1\": \"pickup_date\",\n", - " \"pickup_datetime_2\": \"pickup_time\",\n", - " \"dropoff_datetime_1\": \"dropoff_date\",\n", - " \"dropoff_datetime_2\": \"dropoff_time\"\n", - " }))\n", - "tmp_df.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "combined_df = tmp_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Feature Engineering" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Datetime features" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's split the pickup and drop-off date further into day of week, day of month, and month. For pickup and drop-off time columns, we will split it into hour, minute, and second." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tmp_df = (combined_df\n", - " .derive_column_by_example(\n", - " source_columns=\"pickup_date\", \n", - " new_column_name=\"pickup_weekday\", \n", - " example_data=[(\"2009-01-04\", \"Sunday\"), (\"2013-08-22\", \"Thursday\")]\n", - " )\n", - " .derive_column_by_example(\n", - " source_columns=\"dropoff_date\",\n", - " new_column_name=\"dropoff_weekday\",\n", - " example_data=[(\"2013-08-22\", \"Thursday\"), (\"2013-11-03\", \"Sunday\")]\n", - " )\n", - " .split_column_by_example(source_column=\"pickup_date\")\n", - " .split_column_by_example(source_column=\"pickup_time\")\n", - " .split_column_by_example(source_column=\"dropoff_date\")\n", - " .split_column_by_example(source_column=\"dropoff_time\")\n", - " .split_column_by_example(source_column=\"pickup_time_1\")\n", - " .split_column_by_example(source_column=\"dropoff_time_1\")\n", - " .drop_columns(columns=[\n", - " \"pickup_date\", \"pickup_time\", \"dropoff_date\", \"dropoff_time\", \n", - " \"pickup_date_1\", \"dropoff_date_1\", \"pickup_time_1\", \"dropoff_time_1\"\n", - " ])\n", - " .rename_columns(column_pairs={\n", - " \"pickup_date_2\": \"pickup_month\",\n", - " \"pickup_date_3\": \"pickup_monthday\",\n", - " \"pickup_time_1_1\": \"pickup_hour\",\n", - " \"pickup_time_1_2\": \"pickup_minute\",\n", - " \"pickup_time_2\": \"pickup_second\",\n", - " \"dropoff_date_2\": \"dropoff_month\",\n", - " \"dropoff_date_3\": \"dropoff_monthday\",\n", - " \"dropoff_time_1_1\": \"dropoff_hour\",\n", - " \"dropoff_time_1_2\": \"dropoff_minute\",\n", - " \"dropoff_time_2\": \"dropoff_second\"\n", - " }))\n", - "tmp_df.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "combined_df = tmp_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From the data above, we can see that the pickup and drop-off date and time components produced from the transforms above looks good. Let's drop the `pickup_datetime` and `dropoff_datetime` columns as they are no longer needed." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tmp_df = combined_df.drop_columns(columns=[\"pickup_datetime\", \"dropoff_datetime\"])\n", - "tmp_df.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "combined_df = tmp_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's now save the transformation steps into a DataPrep package so we can use it to to run on spark." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_path = path.join(mkdtemp(), \"new_york_taxi.dprep\")\n", - "combined_df.save(file_path=dflow_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/case-studies/new-york-taxi/new-york-taxi.png)" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/case-studies/new-york-taxi/new-york-taxi_scale-out.ipynb b/how-to-use-azureml/work-with-data/dataprep/case-studies/new-york-taxi/new-york-taxi_scale-out.ipynb deleted file mode 100644 index fd69f736..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/case-studies/new-york-taxi/new-york-taxi_scale-out.ipynb +++ /dev/null @@ -1,135 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Scale-Out Data Preparation\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Once we are done with preparing and featurizing the data locally, we can run the same steps on the full dataset in scale-out mode. The new york taxi cab data is about 300GB in total, which is perfect for scale-out. Let's start by downloading the package we saved earlier to disk. Feel free to run the `new_york_taxi_cab.ipynb` notebook to generate the package yourself, in which case you may comment out the download code and set the `package_path` to where the package is saved." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from tempfile import mkdtemp\n", - "from os import path\n", - "from urllib.request import urlretrieve\n", - "\n", - "dflow_root = mkdtemp()\n", - "dflow_path = path.join(dflow_root, \"new_york_taxi.dprep\")\n", - "print(\"Downloading Dataflow to: {}\".format(dflow_path))\n", - "urlretrieve(\"https://dprepdata.blob.core.windows.net/demo/new_york_taxi_v2.dprep\", dflow_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's load the package we just downloaded." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep\n", - "\n", - "df = dprep.Dataflow.open(dflow_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's replace the datasources with the full dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from uuid import uuid4\n", - "\n", - "other_step = df._get_steps()[7].arguments['dataflows'][0]['anonymousSteps'][0]\n", - "other_step['id'] = str(uuid4())\n", - "other_step['arguments']['path']['target'] = 1\n", - "other_step['arguments']['path']['resourceDetails'][0]['path'] = 'https://wranglewestus.blob.core.windows.net/nyctaxi/yellow_tripdata*'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "green_dsource = dprep.BlobDataSource(\"https://wranglewestus.blob.core.windows.net/nyctaxi/green_tripdata*\")\n", - "df = df.replace_datasource(green_dsource)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Once we have replaced the datasource, we can now run the same steps on the full dataset. We will print the first 5 rows of the spark DataFrame. Since we are running on the full dataset, this might take a little while depending on your spark cluster's size." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "spark_df = df.to_spark_dataframe()\n", - "spark_df.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/case-studies/new-york-taxi/new-york-taxi_scale-out.png)" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. 
Licensed under the MIT License.", - "skip_execute_as_test": true - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/data/10x10-float64-csr.npz b/how-to-use-azureml/work-with-data/dataprep/data/10x10-float64-csr.npz deleted file mode 100644 index 3f7505df5f3d42dbf61c0ca4453faf4026c0c455..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1015 zcmWIWW@Zs#U|`??Vnv3->MSP$fUGG%%*`OekeQc~nVedzmse29$RGd~2Ffsilz_nO zH{!3P1x}s_I2N#W-lV8GF-!86F36jlxoF+u1@orGhXhTTAwEBT$|NqKa_=*meym;M z=_+DNn0`tvvzleawUtYRLC=hhjV(=q?V`c~mLm)SD2_Se(aIMBv}Fwtb3q(aP*MbQ z3rG|MKq^3hY`3(vK^*ghS&vO2fh9@JC|F3u8=U$UC_VhDu7gTPl~CF)EHUDcvN-%wV>;*fCvbw__w(sbtGW z8>WQHrG(RVbtxw*Su471QmNIpDw|L1pk|+WXU5n));a$1KF{y_yZ@f&`3##8<_e?W zatO|b=EJ7oQxDcdp=|czQOKWROiXk%nS(VUg3G2d-G|DtKG?h@H&^dvLfyf*^nrl*F>@&dI+%Xv@5E&Nx++#+LA+4dN{Gk7BfLoXT8xj=O1_$_?*Gd-8#% z19C-m=wUghsWkI|AG>bX%X6#EMsGBof9kTeu?Fieaai0N{>0hY;ZbzQYnoX^eJi(i zPKfjG1xKx2Ja5@7Pp^hFp5uoH3pJi^V&Hc4q_y+e?@DTPAqHY*&@)I`e#})xXE$_~ zt=u*J#>z*dg=yEr*ZcS=mA{^FEm^p+RcJGC5smrr*+KpKyf!!@;vM=)cG}kXO9Y(X z+=R-6>B5$VJE0q+v%d^ul-_^N!U-N^Y*im>m|*Hf~dJ4L;N+NzY2-^(5EgMXnWRFMZ=m9iH3`;>WAbO_+xB+ z1ukD9b57gjf@Avk?`ii-q{(orsSZXO!=DA$#A%vdiH6@zZdiExAA}SaBDQ^A#zm<@ zzdt-?pGaY|>_q!$)b-B8hF(qFWhHBaMUQDMq!re<#V#}N8I?Bg6{1d`@s4dPmdJXr zoo}Dp)!N#8m0Dz!5q)kS`tWzL476L`mAU?V{O^IuP|e?&IR&%6Rp*$FfjcH|&+3pX zghm%~C5b=?O{I~%E?APk6$d5P%4f}ZQ@Q)Ha_^CjmKV=QpKOcV_BhhQ&%Zh44(4_e z-OpnwbDpte-?T$)kL?L-;O`qL73UA$*PmhR;@G_-QP#LJDytxO%|Jt2@YfqNwr^OK ziYx7%X?;`*H`fox*JLj<>Uk-QByT?|F`=br*!?V8cZ5*gr7X13l+oKl+1I3wy$B5t z|4kXQ7=%VTF>t9O)sd0;dSlTL(ROD7t!0@-T94AMP54Xmou3D-#}eoA{US%i=JZb| zMrqt|!mPC?7ew@S+Z^-T9=SWM=I~;A zxmdT9`g({NK^OQsR6r*Ki)u^`t&w-wvo|zo&%T{a*OoAKdLY)+$s44-H;Ke)P9*X} z4Ab3#@t;ukRnJ3P(>g5J57EJw0t6U+ZIJm%_{W+cfz6buZPJULp%ni2cb(F6c6eeE z{D>ZZ;ezv_y4@XAcG5q`OPQW|sUBjfvj%0%{hJ6U)X{9Xe3})*qZ^0+%H5KAYm>Zvu}wgH z!9H^GM-TjhWFqs7*h_}hSns`2*Q%*0u%7>aSU+MZv|q+e{|%NFn?GBNm<$c^WVIPiQFOqh)} zLkF=?FcEwh25=Jjw9qZ07`}8i(A5KbSQbadSBr5>p@h3xB~WQ$gTe#C)B++~DiCp` zNwn2T34E0tF~;I2a;1UUCWX3)DlvFL?qfG7RZul3@M+Z4gng~Kb4zevgP!^T8L5M^gr&+5K$7_=W zSyN2Sspex8dps2;n0xBsQRb>@!&r(br30eWLH6JsvA+sYVWLJlu>}Zr98GD100Ouy z3>DVXa7bIUzz_sv07${<45^!i5N|M)4`?6+#0Py{AV>qXp;SXa27odHoryoS6#+ZZ zlw34m0C?3<2O3W3Cc>3tD8p!g0IrUt<0frCqR@<{6rceM6gtUN7^TTlGZF~mMu`P7 z09-WI0pnvbj$AYbNRi2eG*zR6h}9J@G66u>bR950PzZDrq8x{S96-aSItUpvt(7te z$W(49I^ZwEr$tL3KmoLQhOP|bBZ)xe5TyVDasc(2>FoV!OcAgZqFjamqjGF90h~Cz zTCg6twE$qRr4B?!8UflN%2pr(@-C`wq$bQcLKUGYsh|>6fZey!nZawPU+XI!4dejo zvC%=gGohtY&;U?Twi61WJt+v3j-zB^fE=8f-dPk=l!FypeQP;|@@2HGd}*TU;&5|v Tb@Fg>Tj-5?1m0sd_(}K&3>y?D diff --git a/how-to-use-azureml/work-with-data/dataprep/data/crime.txt b/how-to-use-azureml/work-with-data/dataprep/data/crime.txt deleted file mode 100644 index d6d8b8d7..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/data/crime.txt +++ /dev/null @@ -1,10 +0,0 @@ -10140490 HY329907 7/5/2015 23:50 050XX N NEWLAND AVE 820 THEFT -10139776 HY329265 7/5/2015 23:30 011XX W MORSE AVE 460 BATTERY -10140270 HY329253 7/5/2015 23:20 121XX S FRONT AVE 486 BATTERY -10139885 HY329308 7/5/2015 23:19 051XX W DIVISION ST 610 BURGLARY -10140379 HY329556 7/5/2015 23:00 012XX W LAKE ST 930 MOTOR VEHICLE THEFT -10140868 HY330421 7/5/2015 22:54 118XX S PEORIA ST 1320 CRIMINAL DAMAGE -10139762 HY329232 7/5/2015 22:42 026XX W 37TH PL 1020 ARSON -10139722 HY329228 7/5/2015 22:30 016XX S CENTRAL PARK AVE 1811 NARCOTICS -10139774 HY329209 7/5/2015 22:15 048XX N ASHLAND AVE 1310 CRIMINAL DAMAGE -10139697 HY329177 
7/5/2015 22:10 058XX S ARTESIAN AVE 1320 CRIMINAL DAMAGE diff --git a/how-to-use-azureml/work-with-data/dataprep/data/crime.xlsx b/how-to-use-azureml/work-with-data/dataprep/data/crime.xlsx deleted file mode 100644 index 21f200b4e38cb035371f6a27f7325b6af6302bca..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16109 zcmeHuRdih0uC19l#+WH)cFYWkF~*pgnIXo+%*@OfGcz+Yl__Rs##iY+=XQ6x?;H31 zzI*n^uBtIgb7*TzOH*1}vXbBs=paxaFd!fx#2~XyIM*JaARw?%ARuo*V8GOct*smk ztsJx!U2F{PHR-=tS`cMJfKg_GfC2yipX-0I1$q<*t-2UdT9ps@McR~lIogJTqn-#TUToG-z5w|3jx>t8669L4GG*5&^_@6o{D?@A>r+1M6MFj1@3IFpHPs2DS`T1I*PNLtvzUa<@o;k90BxhRKdq@H&jRJ zwM8dipu7d3kVajPrUGqkX0p#ORQPqh9o4$(h7dP%gjBsdd_|Ec#Y@klE-fSP2= zfG}49phH$2fop2HbGl2?wNX{hmmCx~C$xA`SnpJ$>1L%a z$>)X!Xw(?H4`D}QU0x&>#fRUoS;DBdvlg!Tf=f2Ced)Hj{ zR=biwE+*HA+s5}_dn-H2+B+r&!)*_q&3D&w-@?W&U2Zz;YCV`oN)wqcG8U!!Uj75QUov>w&ixSwWKB!1sQsI-%Sj*H zcNtrlkqcC;7zVR%nJRhp$CSs}(Z*J*9)MQ(d&qnp=i5yI29rK8?;!*K@x_9{+1k!r z&)VAjXOb)VtZtpdf$pho`U<|0=G_P>8zc8kM7^jvm@91zerf=cLIpCVRyx}1{i{cW zxEKRYSW{Y9AVdGorSsT*c;i!zHLkv-Z*Hozw8M`?C|&Y$N`TZTL%k#oGhz|rcbjgU zq~r*ZQ3J45_7(V zb(Na^u*aARMg%Bm=L86CTGP>6El28IGoT`oP&Ang=1YU9rF3iH^LpnK&Psy-B%%{j z;n__Pv$vK>8kX2tcWt2cx0H-GVCg6?173ajGTwRBIcjQf^!I9-AT=ID9y|PY7QR}3 zI$sHVZi(;fh}4Vizvz*7Ay>i4sQh5Tw)4b6^s0d>B|~XJ&W5D0D&uU{nD$@B&oy5! zdxWp&QV26;RVfw~#VE?(fyj6NsD&f4md9(UGbB~PbjRA{gkm2Sr! zb-RCwXP^226J-&X7N~Nh$d-qGp4db^1=jt!o|hg?#BS8~IPY3A^e3xPic8kyf~1Vz zMQ582zT$(n+vpeBAucy6zt%bc zG-D-NfV72v5x>i0a_k9PMQ*}G4jJnu(}>X{=hX&2spBdYWfdMo(^ASzD&u4zJ5XbxB4_bWynpU zsyp{-q3!Q8lTCxu@vjVEZ(?ZZV9)T|2h-0~kgBF>4QN2~ zXzzGN_FvNvLBAFx9n20t?r}^;u7^GOzCjrz2e^oftLfEyeMAx+n!{`SAg>!D{biv0 z$H)&8k)6BpjA7A3x>5E^3&$x;{PGnE67@WaW9|L? z=lT7wul7%*!|m2uQ#B^2-DdVvo>ak(<$#qDttrE*eS<>>(E){ch8S5WZX8tw1^2Z02G#S1P1LmF>GaXwM{6x3^C`iWoNmY~%!BRi zWf4Y;cGBp`m%S}_pQG#4hgGegT81vRbOjGzs1%vapGWrj#b2MylB-&ortSEKtEF!g z9ReH8Ev@LaogLYeER&KCzhZ^SuJCxIq=1|hZe=8qdDA5u&U*WOwv%%r;RS}0IR!Gc^pGkeF0AbZp^FC*O5bwrvZis_X?v%vdSL>qV zXYr`MS8HRr)>kSPac6GH=UuZtxuS!Vmg_`5^70V9&lkSE4%St_0=D3i@4cazs%ngxL>!jtD1X`gYv{(h0_;c4cN#7T9W{>lIEL9zWEL z;?0%od+QrJO4(V+_Pd889wu%q6oy0MahK~8_e+`7nFn&PmFY9}^UX7jR>G=tbL>QW zttT-cK+J0}p}5zutRwVoE&6AAg{34>}JuMI?yOi z?LhldO9sKoP|BDpQp;Il5OwYv8xd5~h4#duFCCg3yLq_@G>5|Wnp+0i){?TpfH{u| zz$`<86Wi#+^Bn=w&@drrXUn1AYJqd~d&yluPsxWwjD%3f9-srG=9rXFCr zb_>a&W5%>ke1?&O=?GRNT-qw;f?DDJU1?%6>eNlmFUy!f9!D}SJvDZa^N zhu(>*q7%1&QG%8Lp=`N!(fqs|#PoHLS%KDq4^T;2>s(P2!ilACOr8GtYPgCpKX7KD zb&tkyVBi{crn6k>8njy-5yU9wq>9`@=xIP^4&x!ex7ravpI2=QWH;Cfi(ucQnl0#( z(cyUMDj2IMbYnOBymdN^#2*jwQ_PjIdQ3aknZbqPf+-ZH(n5SbS$ylP{;gLZb~*uh$>e`6qZb!|P<=OB=`4efw(RHcF^{Ov)cb{f>=Vr0aJ3&ksR z1RTe@BM<~n?TwI^cqk$md&ayVBq;+n%nv+un|1^w;2S8)jOw8ViB7b<>9ka?Q}zB{ zZpl)!%3+ozLZ*vfkHlgap;m5DE<9WNRbenTcY7n9g#0UgtpEznz8&eJ}k zo!F+r_hY_sa^w zyUA_=DQ(A!s5aUL?Rw~nAbSkaNPvqfXr?+Kw6e`^w890b3&c~OTnM9*O64k-lp9G; zESF;pt7C{9ccJ>QP*Yy$2U9<%alX&1i&i(=xRbcaFLBQuGPkq_?Nsdjc8FfTdIJw?D^##ayD1KwmhSJ*~5ctv?_Vcv^)u?5OthlGw z90IRj;i8St>Z;$K(Q{kfz^?JxsHG)aWt?g|(^E&$ z6a8Y`9JleAS~fPVbH)ubd~GIlXZhy&y}rid1}t$0P>&(Dj^xP^@T$*%5_qn+jPFWt zXmH;)8RigGrn8{j3_^nIcK_7lr#WgB;GZ^Zk2BTlsd?0@MnQL$Tk+V#g(1}LD_+zC zdZUn{EX^u;t9mo>V@nO0IF+ZIL$O?ArT4usRu--DENZbtDtGNzf)jcTpsRufzlyaq zz0=~6Ie0Ux)N;q1m2yLd6US!?mmFsRfn5%5A*P!Dv*atdFXIPz@vGa$~P_tkh zEFz2E>p-%uYeZ@V_^R=;OTe;~740_pfns)ASOf+wX0=aNldr;mO0BicK`8@KtX9_) z9jB?b{-|=lEZ^!BG;y+UY5#B z-4vK|6f_u@J4gEl4`#0CmSWDJH9u(oKt7nU`(4^xkiBomDIV@%S!*qxe_GRSQ6c~r zSyj$p_@Ex^;n6RaUau7iAHZj*D)%hJ-sgX7uGUhfuF6-SzIGvl_G#&mmf^xcYXX7S z{xvOva9I85hs%nlP-aCE#kRay#EB#927)TR4^ 
zinBO9ZIO&rwSP#WiK!%(J{IGBq}@KEbSSZ;_kd~}LiT#z#Fc0if)Vs3an!i?RNkby zHjyAaJwY{{XT)^1-B%*NRGafz{Na}7y`h`%v>w}NwIh&rh}q3AEvL4f99S{0h?$MO z3hVhE?Cb+ZAZ%zPUif1~Dx%2iyPjUDXqRDFPKJ=RjU)Pxze0f%`vte*Av|1M30%9K z3(B}2tV@nZJWXe<#ehFsKW^>Wen4fbHrw8xwtWoYYQZXA;pL=;b>Nr$T4tW1E_!iB zmsT)MLQbB&!u;euP(_%(8JNt2)=1Bd9W_{!*qZn3z{hWeEGmNc+B~Dj7ZZH);p4M? z-f7L&mGr8sR7Ui?3)7%ydO`JWf;T!etAcP`cmJA?121`a^g%cRn?G_i2`A@Jbp~B< zJxTBPo)Q9E%CLTCBwo(sq1B;jrP?hlyJYnc??%xN}Us zmD609Hb*7lem|K9YX1jE+MNYuHl$D-7L<}kopjK74Q3$S=mpY^_u!qGxKQ4LK+5qd zg$;xB&?j&<;S6PV($>sH>BW7EutX2aSkkuj$L=t z6bzL6;tP&b56hI!Iox?wF*=XHvC{6E(?>8o#>&JilskX95H7EotU0aR7MNW3a-%;u zN8(S;f$}kJ;Zh{G2C2qHhxwM48i3Kgfa@9Ds@toj{SmeBFshKwO&9Sdj6ml%#<9Ni zP23xjyN0b0rORH8(6^FXpNMfQwMh6z&4%6W5E}+xoh9!Aw4o40r#uC77%yI5ZG?gE*Iz2#7jF`LFxNZg@|q*LppWVgV;?a5M!(<$ku-p<=-$lRiqs$9d!Nj0C_&9#J~lvAtJpL=)+|c1VJi=&FUcQgYrrlKgjfs+C}iS38!sHfpV*;yi}YJ z)58D}A7Oi3ucHCbcFxY%;iy@sn*3xO1pj6npTt=_bSG}|8EF15jH7PLxMnsZyL2_5 zfgjB3N^Hiz4i3)0Z4$f&QIA@G2wJ`m_>dwTHEgmguqgy=FwVRPJy)NharcM4n_Zfr zak-V{AR5!KFhE*rN3joS<`F_}m)L|mfV!ae!O_eC*{6(eqCCYX@tAt-+SZm@`M~wyNn@{_cdm=|~R1=y~N+lbwgM`ma$6)|y*x&o* zF_={vFHpgl*0U?zOGeIc0fcPy9nOp&D}A>pTMj=Ut&DDZtS_kIim3zwl@W0oRq4?9 zykc9f8R|Zh#0NQ-X^WfY zfr)bB9<}avx)j8sJ22niumQq2DwFDjh#^{xeJvO8bl$5LgRy;=7J@QEY@jWnfUW&r zEN*NBp1Z9OD`$t9ryE0{N-;!MF6H|u7NhXG% zWtNq6@b*z4KjAfmjQTTN6P*}WbJu4*(VC8L`Vkmvx$5XSf6^W_I);=xY7|*rXAL>Q zG`#l9^qC&FQJbr~{qi!y(#4kAx8mHc2$_0w{>^8s{*d__gs_k#eZN?Q0v}SUQ=goe z?Jk~QEr(rU9u4+YoY$EUw(^{O8jNTklpq`6UYEYz?wP#j&UF)+emCE&;DI$%cYoCi z%JfG+k*&O`876Mn-El#6S`Za`U=DLI6?`&uh+#o_NxbO#AV=^u{p#8!k!1{C0gGn5 zyZ@Nya}6BW+8Z(pNn#r@1EPrB@ZQtu~h(SfhFWVk4BCsoZbL1I)1$60I$WwqDzjT@jSiIlJ5KH8gTt$4lteoN6e#QPR8KJeTz7hv>XmN zy`i=+v+f(peLU#s6Fd?(>=1U?R1_6K0JpB2N3%rw^f1}ibh`Ocmj}hw^Qj;$D+aID zaLzX)N;^C3*@r;^C6ra7*T;UI;89bKW%F4O*Rh78*M28yVawL0=S^mY_n{}@!x``K z>XlWswgyXx5Ce9QcnNufFOpp^Nkd~^fh~@|x4}EHP})90gMjoA{oL>UtqI~_VrXf| z@Z0^j{>PE(Xb3JldOOwwKZ3pU)BDW`^40Zmi+CC4?ACxu#rPNuE`?*$h@^Od<#O2=Jg(*9z<(`gtlgNBYts&-+ZRx*9w0^gr;>-_;NuiK8@~xG@}CJ z!}&~fZT0SX$zv_q`YUr;5plZ)e|o=KGv~%741=z{@HeXnK7y^$RO51yh~JOo6{J*1 z*tgX}Pnh{YzJbY;iEj+{BFId@R5gT9a<2CP$fse@to_m6=`<)(AxeC{E9dii$tMl2 zV`R>a)6@x4!Ii)HqSrGhRrf@bIm=GOkk0G&a<+dN(9ZkyYVW>VyQ;C4TG9%dTtVSv zHbVRL`59|fyZzlU5Kt1(&ITOe_9WtP*1b~>Yh{K$?yP}mO9Zvmt)0o7XVMC53+@) z-F&f$@Vt>MJgNLcCFHVxs1vp`*LrdC@Y$J7y3|U{Wn)lt&_tB^3B5!K$n7pmrt*gT zVpxUY4q`A`P(DZJltO!@a47!wX5ql?d0yrXsfwVh6O?kFz-e+}I~{U)`Wj)qz~L@K zBHyKp(N)Xx?n3ifUUvghU7#L}o`FT%eZwa7a!2xSsnC-rHetcKPKqlfE zOm+F7PwOkW!xeD-R%7iY<@NVOzGKo$c{*gop&A`~_>r@?3!{}o3IV!sZ z7#hRd^H|FYV{2RQFPM};sB5|RZ$(xlG�Y6u9g?9=Ac@xSQb+-PGTVqR5$h^XU4+z#(9*cn4yYO<71`Lk36xpw#*%W-F|BJgAD|$$5yt8PpyMGD7|QF_tTZ(r{PAs~Yx}kk;JQ zBz?KVCc4Ca&gHYc*6iKuW3kb+7GJsHn0eBrTHp1}wV+v2i@Vp*yVHxkbeWdQZ5HC# zlVQl-W+7(}h?dDvXvRW-ToXm38R!_vUGn=x+T~bMx=q`6{1nVf5fcb%!iFmyslig$ zieFgXW$Nb!IMQHbOq4!;y8IFmHvP%Cv7{-v2p%%}{tJ~xm~etb{+=4|>1yfBq352W zvuXkq<>EB9u9Ok3oFlfJHj)s!ma(taVv@?FgUY1rK|+?=2it8pMoSEgFWWckqOIEj zf=hz_MtHHi`WOW@NiCseaAQdyTw_>@l0aqk7+8oR926FtY+cM8O^aJabLmJ!9je0^ zrx?Ale$aA{#y_g|H6;biFv~RRHR9T_9E2gGnv0T9G%4&Z9 zW)Hi2{tHwi>5Mv7wnBpwj(ED&owIH{4pvowNGz8|%%>qJrl>@>*d97r``&p&XUJr}C3uQ6Hk_0zMAj)g(ICC+eNfrR zqznxqw#;I3XVxJ-Phv6nO{>fbJK^k*J(TH=aegoL!Hh%9egO9t;`a;TnJVf$tLIO2 zc4g-DjYw#-E)p$2W;{Z`I~nORMwq2lTRwFMS69mcYFKDt5M*Z9?kpWYaPXd&QSc9> zji@;a&>l*p-8~E~x6B$rJA3iC+I;0Gkl&LPvQ8kSe3-%=h-K3WS&mT+gIR0gq@a8C zq_TiskyxOMeXHA!qUa-^gykxT_;LT18Ms30aQbGLe$B$ynmkl zWOJKt)DJZzL`r)EtMyX`3|QQBkeP^PM-lbN4UEiWZR>!+NFqd6`=ty1+< zMAwyyNj#pjR#HTyrCBT4Lw2}YyLxpPGjc=E9MwYTI0^2;dfeyCXCfMXooy-)gEo*` zv*x;2;J(w})9*W0F`2(22IlTG7PV*$2dlOH@CFD6 
[GIT binary patch data omitted]
z5Q_RY2m``oyt&KWt~5+-v0#{uZRzgZ(#tEI#U4KHj|F=Z!DvM3o8gW6S)rr*g+Tx* z4W4M2k0*kDX z!RV8?Gr#!fACZ`k>JhVecSPck$d57lsNN}Qy-M-vV?}6w&h-*;+sPvsoZ?ieyULV2 zUW5V2o~7!#i3ATYGOeR8!dQ$e-s6u-ey$slAIV$f&NH%@NgzlA?BI3jd`2_}tq^l*0^9wRf-*^MzviT%qQy3gpuw$i} zzFUH@V)7x}-I6SBj26?6rXt)6nTi-5P0cclkzx9iQgpc4v=*`L`t0aeLXV}lZs^e1d(wm^^%adBJCnr*fS8y1F| zJ+NDz#zm995D(jtWGKW6W(@@LWj2M=Rmj^i5PYJS zOL_Pd&LF&%3PpWjF7~-r|tdFhYSiail pE9a_s&gTzQFRrL$c}HdC;y?ux;3@;v0e@gVxCK8c;eSv${s&hv$<6=( diff --git a/how-to-use-azureml/work-with-data/dataprep/data/secrets.dprep b/how-to-use-azureml/work-with-data/dataprep/data/secrets.dprep deleted file mode 100644 index bf156e3c..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/data/secrets.dprep +++ /dev/null @@ -1,63 +0,0 @@ -{ - "id": "b308e5b8-9b2a-47f8-9d32-0f542b4a34a4", - "name": "read_csv_duplicate_headers", - "blocks": [ - { - "id": "8d9ec228-6a4b-4abf-afb7-65f58dda1581", - "type": "Microsoft.DPrep.GetFilesBlock", - "arguments": { - "path": { - "target": 1, - "resourceDetails": [ - { - "path": "https://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv", - "sas": { - "id": "https://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv", - "secretType": "AzureMLSecret" - }, - "storageAccountName": null, - "storageAccountKey": null - } - ] - } - }, - "isEnabled": true, - "name": null, - "annotation": null - }, - { - "id": "4ad0460f-ec65-47c0-a0a4-44345404a462", - "type": "Microsoft.DPrep.ParseDelimitedBlock", - "arguments": { - "columnHeadersMode": 3, - "fileEncoding": 0, - "handleQuotedLineBreaks": false, - "preview": false, - "separator": ",", - "skipRows": 0, - "skipRowsMode": 0 - }, - "isEnabled": true, - "name": null, - "annotation": null - }, - { - "id": "1a3e11ba-5854-48da-aa47-53af61beb782", - "type": "Microsoft.DPrep.DropColumnsBlock", - "arguments": { - "columns": { - "type": 0, - "details": { - "selectedColumns": [ - "Path" - ] - } - } - }, - "isEnabled": true, - "name": null, - "annotation": null - } - ], - "inspectors": [] -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/data/stream-path.csv b/how-to-use-azureml/work-with-data/dataprep/data/stream-path.csv deleted file mode 100644 index 175f3801..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/data/stream-path.csv +++ /dev/null @@ -1,11 +0,0 @@ -Stream Path -https://dataset.blob.core.windows.net/blobstore/container/2019/01/01/train.csv -https://dataset.blob.core.windows.net/blobstore/container/2019/01/02/train.csv -https://dataset.blob.core.windows.net/blobstore/container/2019/01/03/train.csv -https://dataset.blob.core.windows.net/blobstore/container/2019/01/04/train.csv -https://dataset.blob.core.windows.net/blobstore/container/2019/01/05/train.csv -https://dataset.blob.core.windows.net/blobstore/container/2019/01/06/train.csv -https://dataset.blob.core.windows.net/blobstore/container/2019/01/07/train.csv -https://dataset.blob.core.windows.net/blobstore/container/2019/01/08/train.csv -https://dataset.blob.core.windows.net/blobstore/container/2019/01/09/train.csv -https://dataset.blob.core.windows.net/blobstore/container/2019/01/10/train.csv diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/add-column-using-expression.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/add-column-using-expression.ipynb deleted file mode 100644 index 3fa0e65e..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/add-column-using-expression.ipynb +++ /dev/null @@ -1,360 +0,0 @@ 
-{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/how-to-guides/add-column-using-expression.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Add Column using Expression\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "With Azure ML Data Prep you can add a new column to data with `Dataflow.add_column` by using a Data Prep expression to calculate the value from existing columns. This is similar to using Python to create a [new script column](./custom-python-transforms.ipynb#New-Script-Column) except the Data Prep expressions are more limited and will execute faster. The expressions used are the same as for [filtering rows](./filtering.ipynb#Filtering-rows) and hence have the same functions and operators available.\n", - "
<br>
\n", - "Here we add additional columns. First we get input data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# loading data\n", - "dflow = dprep.auto_read_file('../data/crime-spring.csv')\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### `substring(start, length)`\n", - "Add a new column \"Case Category\" using the `substring(start, length)` expression to extract the prefix from the \"Case Number\" column." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "case_category = dflow.add_column(new_column_name='Case Category',\n", - " prior_column='Case Number',\n", - " expression=dflow['Case Number'].substring(0, 2))\n", - "case_category.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### `substring(start)`\n", - "Add a new column \"Case Id\" using the `substring(start)` expression to extract just the number from \"Case Number\" column and then convert it to numeric." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "case_id = dflow.add_column(new_column_name='Case Id',\n", - " prior_column='Case Number',\n", - " expression=dflow['Case Number'].substring(2))\n", - "case_id = case_id.to_number('Case Id')\n", - "case_id.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### `length()`\n", - "Using the length() expression, add a new numeric column \"Length\", which contains the length of the string in \"Primary Type\"." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_length = dflow.add_column(new_column_name='Length',\n", - " prior_column='Primary Type',\n", - " expression=dflow['Primary Type'].length())\n", - "dflow_length.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### `to_upper()`\n", - "Using the to_upper() expression, add a new numeric column \"Upper Case\", which contains the length of the string in \"Primary Type\"." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_to_upper = dflow.add_column(new_column_name='Upper Case',\n", - " prior_column='Primary Type',\n", - " expression=dflow['Primary Type'].to_upper())\n", - "dflow_to_upper.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### `to_lower()`\n", - "Using the to_lower() expression, add a new numeric column \"Lower Case\", which contains the length of the string in \"Primary Type\"." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_to_lower = dflow.add_column(new_column_name='Lower Case',\n", - " prior_column='Primary Type',\n", - " expression=dflow['Primary Type'].to_lower())\n", - "dflow_to_lower.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### `RegEx.extract_record()`\n", - "Using the `RegEx.extract_record()` expression, add a new record column \"Stream Date Record\", which contains the name capturing groups in the regex with value." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_regex_extract_record = dprep.auto_read_file('../data/stream-path.csv')\n", - "regex = dprep.RegEx('\\/(?\\d{4})\\/(?\\d{2})\\/(?\\d{2})\\/')\n", - "dflow_regex_extract_record = dflow_regex_extract_record.add_column(new_column_name='Stream Date Record',\n", - " prior_column='Stream Path',\n", - " expression=regex.extract_record(dflow_regex_extract_record['Stream Path']))\n", - "dflow_regex_extract_record.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### `create_datetime()`\n", - "Using the `create_datetime()` expression, add a new column \"Stream Date\", which contains datetime values constructed from year, month, day values extracted from a record column \"Stream Date Record\"." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "year = dprep.col('year', dflow_regex_extract_record['Stream Date Record'])\n", - "month = dprep.col('month', dflow_regex_extract_record['Stream Date Record'])\n", - "day = dprep.col('day', dflow_regex_extract_record['Stream Date Record'])\n", - "dflow_create_datetime = dflow_regex_extract_record.add_column(new_column_name='Stream Date',\n", - " prior_column='Stream Date Record',\n", - " expression=dprep.create_datetime(year, month, day))\n", - "dflow_create_datetime.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### `col(column1) + col(column2)`\n", - "Add a new column \"Total\" to show the result of adding the values in the \"FBI Code\" column to the \"Community Area\" column." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_total = dflow.add_column(new_column_name='Total',\n", - " prior_column='FBI Code',\n", - " expression=dflow['Community Area']+dflow['FBI Code'])\n", - "dflow_total.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### `col(column1) - col(column2)`\n", - "Add a new column \"Subtract\" to show the result of subtracting the values in the \"FBI Code\" column from the \"Community Area\" column." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_diff = dflow.add_column(new_column_name='Difference',\n", - " prior_column='FBI Code',\n", - " expression=dflow['Community Area']-dflow['FBI Code'])\n", - "dflow_diff.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### `col(column1) * col(column2)`\n", - "Add a new column \"Product\" to show the result of multiplying the values in the \"FBI Code\" column to the \"Community Area\" column." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_prod = dflow.add_column(new_column_name='Product',\n", - " prior_column='FBI Code',\n", - " expression=dflow['Community Area']*dflow['FBI Code'])\n", - "dflow_prod.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### `col(column1) / col(column2)`\n", - "Add a new column \"True Quotient\" to show the result of true (decimal) division of the values in \"Community Area\" column by the \"FBI Code\" column." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_true_div = dflow.add_column(new_column_name='True Quotient',\n", - " prior_column='FBI Code',\n", - " expression=dflow['Community Area']/dflow['FBI Code'])\n", - "dflow_true_div.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### `col(column1) // col(column2)`\n", - "Add a new column \"Floor Quotient\" to show the result of floor (integer) division of the values in \"Community Area\" column by the \"FBI Code\" column." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_floor_div = dflow.add_column(new_column_name='Floor Quotient',\n", - " prior_column='FBI Code',\n", - " expression=dflow['Community Area']//dflow['FBI Code'])\n", - "dflow_floor_div.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### `col(column1) % col(column2)`\n", - "Add a new column \"Mod\" to show the result of applying the modulo operation on the \"FBI Code\" column and the \"Community Area\" column." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_mod = dflow.add_column(new_column_name='Mod',\n", - " prior_column='FBI Code',\n", - " expression=dflow['Community Area']%dflow['FBI Code'])\n", - "dflow_mod.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### `col(column1) ** col(column2)`\n", - "Add a new column \"Power\" to show the result of applying the exponentiation operation when the base is the \"Community Area\" column and the exponent is \"FBI Code\" column." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_pow = dflow.add_column(new_column_name='Power',\n", - " prior_column='FBI Code',\n", - " expression=dflow['Community Area']**dflow['FBI Code'])\n", - "dflow_pow.head(5)" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/append-columns-and-rows.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/append-columns-and-rows.ipynb deleted file mode 100644 index 51a55e4a..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/append-columns-and-rows.ipynb +++ /dev/null @@ -1,251 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/how-to-guides/append-columns-and-rows.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Append Columns and Rows\n", - "\n", - "Often the data we want does not come in a single dataset: they are coming from different locations, have features that are separated, or are simply not homogeneous. 
Unsurprisingly, we typically want to work with a single dataset at a time.\n", - "\n", - "Azure ML Data Prep allows the concatenation of two or more dataflows by means of column and row appends.\n", - "\n", - "We will demonstrate this by defining a single dataflow that will pull data from multiple datasets.\n", - "\n", - "## Table of Contents\n", - "[append_columns(dataflows)](#append_columns)
\n", - "[append_rows(dataflows)](#append_rows)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## `append_columns(dataflows)`\n", - "We can append data width-wise, which will change some or all existing rows and potentially adding rows (based on an assumption that data in two datasets are aligned on row number).\n", - "\n", - "However we cannot do this if the reference dataflows have clashing schema with the target dataflow. Observe:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.dataprep import auto_read_file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = auto_read_file(path='../data/crime-dirty.csv')\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_chicago = auto_read_file(path='../data/chicago-aldermen-2015.csv')\n", - "dflow_chicago.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.dataprep import ExecutionError\n", - "try:\n", - " dflow_combined_by_column = dflow.append_columns([dflow_chicago])\n", - " dflow_combined_by_column.head(5)\n", - "except ExecutionError:\n", - " print('Cannot append_columns with schema clash!')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As expected, we cannot call `append_columns` with target dataflows that have clashing schema.\n", - "\n", - "We can make the call once we rename or drop the offending columns. In more complex scenarios, we could opt to skip or filter to make rows align before appending columns. Here we will choose to simply drop the clashing column." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_combined_by_column = dflow.append_columns([dflow_chicago.drop_columns(['Ward'])])\n", - "dflow_combined_by_column.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Notice that the resultant schema has more columns in the first N records (N being the number of records in `dataflow` and the extra columns being the width of the schema of our reference dataflow, chicago, minus the `Ward` column). From the N+1th record onwards, we will only have a schema width matching that of the `Ward`-less chicago set.\n", - "\n", - "Why is this? As much as possible, the data from the reference dataflow(s) will be attached to existing rows in the target dataflow. If there are not enough rows in the target dataflow to attach to, we simply append them as new rows.\n", - "\n", - "Note that these are appends, not joins (for joins please reference [Join](join.ipynb)), so the append may not be logically correct, but will take effect as long as there are no schema clashes." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Ward-less data after we skip the first N rows\n", - "dflow_len = dflow.row_count\n", - "dflow_combined_by_column.skip(dflow_len).head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## `append_rows(dataflows)`\n", - "We can append data length-wise, which will only have the effect of adding new rows. 
No existing data will be changed." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.dataprep import auto_read_file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = auto_read_file(path='../data/crime-dirty.csv')\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_spring = auto_read_file(path='../data/crime-spring.csv')\n", - "dflow_spring.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_chicago = auto_read_file(path='../data/chicago-aldermen-2015.csv')\n", - "dflow_chicago.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_combined_by_row = dflow.append_rows([dflow_chicago, dflow_spring])\n", - "dflow_combined_by_row.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Notice that neither schema nor data has changed for the target dataflow.\n", - "\n", - "If we skip ahead, we will see our target dataflows' data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# chicago data\n", - "dflow_len = dflow.row_count\n", - "dflow_combined_by_row.skip(dflow_len).head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# crimes spring data\n", - "dflow_chicago_len = dflow_chicago.row_count\n", - "dflow_combined_by_row.skip(dflow_len + dflow_chicago_len).head(5)" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/assertions.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/assertions.ipynb deleted file mode 100644 index b33a30ed..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/assertions.ipynb +++ /dev/null @@ -1,133 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/how-to-guides/assertions.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Assertions\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Frequently, the data we work with while cleaning and preparing data is just a subset of the total data we will need to work with in production. It is also common to be working on a snapshot of a live dataset that is continuously updated and augmented.\n", - "\n", - "In these cases, some of the assumptions we make as part of our cleaning might turn out to be false. 
Columns that originally only contained numbers within a certain range might actually contain a wider range of values in later executions. These errors often result in either broken pipelines or bad data.\n", - "\n", - "Azure ML Data Prep supports creating assertions on data, which are evaluated as the pipeline is executed. These assertions enable us to verify that our assumptions on the data continue to be accurate and, when not, to handle failures in a clean way." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To demonstrate, we will load a dataset and then add some assertions based on what we can see in the first few rows." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.dataprep import auto_read_file\n", - "\n", - "dflow = auto_read_file('../data/crime-dirty.csv')\n", - "dflow.get_profile()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can see there are latitude and longitude columns present in this dataset. By definition, these are constrained to specific ranges of values. We can assert that this is indeed the case so that if any records come through with invalid values, we detect them." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.dataprep import value\n", - "\n", - "dflow = dflow.assert_value('Latitude', (value <= 90) & (value >= -90), error_code='InvalidLatitude')\n", - "dflow = dflow.assert_value('Longitude', (value <= 180) & (value >= -180), error_code='InvalidLongitude')\n", - "dflow.keep_columns(['Latitude', 'Longitude']).get_profile()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Any assertion failures are represented as Errors in the resulting dataset. From the profile above, you can see that the Error Count for both of these columns is 1. We can use a filter to retrieve the error and see what value caused the assertion to fail." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.dataprep import col\n", - "\n", - "dflow_error = dflow.filter(col('Latitude').is_error())\n", - "error = dflow_error.head(10)['Latitude'][0]\n", - "print(error.originalValue)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Our assertion failed because we were not removing missing values from our data. At this point, we have two options: we can go back and edit our code to avoid this error in the first place or we can resolve it now. In this case, we will just filter these out." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.dataprep import LocalFileOutput\n", - "dflow_clean = dflow.filter(~dflow['Latitude'].is_error())\n", - "dflow_clean.get_profile()" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." 
- }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/auto-read-file.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/auto-read-file.ipynb deleted file mode 100644 index 2a12288c..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/auto-read-file.ipynb +++ /dev/null @@ -1,189 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/how-to-guides/auto-read-file.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Auto Read File\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Data Prep has the ability to load different kinds of text files. The `auto_read_file` entry point can take any text based file (including excel, json and parquet) and auto-detect how to parse the file. It will also attempt to auto-detect the types of each column and apply type transformations to the columns it detects.\n", - "\n", - "The result will be a Dataflow object that has all the steps added that are required to read the given file(s) and convert their columns to the predicted types. No parameters are required beyond the file path or `FileDataSource` object." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_auto = dprep.auto_read_file('../data/crime_multiple_separators.csv')\n", - "dflow_auto.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_auto1 = dprep.auto_read_file('../data/crime.xlsx')\n", - "dflow_auto1.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_auto2 = dprep.auto_read_file('../data/crime.parquet')\n", - "dflow_auto2.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Looking at the data, we can see that there are two empty columns either side of the 'Completed' column.\n", - "If we compare the dataframe to a few rows from the original file:\n", - "```\n", - "ID |CaseNumber| |Completed|\n", - "10140490 |HY329907| |Y|\n", - "10139776 |HY329265| |Y|\n", - "```\n", - "We can see that the `|`'s have disappeared in the dataframe. This is because `|` is a very common separator character in csv files, so `auto_read_file` guessed it was the column separator. For this data we actually want the `|`'s to remain and instead use space as the column separator.\n", - "\n", - "To achieve this we can use `detect_file_format`. It takes a file path or datasource object and gives back a `FileFormatBuilder` which has learnt some information about the supplied data.\n", - "This is what `auto_read_file` is using behind the scenes to 'learn' the contents of the given file and determine how to parse it. With the `FileFormatBuilder` we can take advantage of the intelligent learning aspect of `auto_read_file` but have the chance to modify some of the learnt information." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ffb = dprep.detect_file_format('../data/crime_multiple_separators.csv')\n", - "ffb_2 = dprep.detect_file_format('../data/crime.xlsx')\n", - "ffb_3 = dprep.detect_file_format('../data/crime_fixed_width_file.txt')\n", - "ffb_4 = dprep.detect_file_format('../data/json.json')\n", - "\n", - "print(ffb.file_format)\n", - "print(ffb_2.file_format)\n", - "print(ffb_3.file_format)\n", - "print(type(ffb_4.file_format))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After calling `detect_file_format` we get a `FileFormatBuilder` that has had `learn` called on it. This means the `file_format` attribute will be populated with a `Properties` object, it contains all the information that was learnt about the file. As we can see above different file types have corresponding file_formats detected. \n", - "Continuing with our delimited example we can change any of these values and then call `ffb.to_dataflow()` to create a `Dataflow` that has the steps required to parse the datasource." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ffb.file_format.separator = ' '\n", - "dflow = ffb.to_dataflow()\n", - "df = dflow.to_pandas_dataframe()\n", - "df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The result is our desired dataframe with `|`'s included.\n", - "\n", - "If we refer back to the original data output by `auto_read_file`, the 'ID' column was also detected as numeric and converted to a number data type instead of remaining a string like in the data above.\n", - "We can perform type inference on our new dataflow using the `dataflow.builders` property. This property exposes different builders that can `learn` from a dataflow and `apply` the learning to produce a new dataflow, very similar to the pattern we used above for the `FileFormatBuilder`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ctb = dflow.builders.set_column_types()\n", - "ctb.learn()\n", - "ctb.conversion_candidates" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After learning `ctb.conversion_candidates` has been populated with information about the inferred types for each column, it is possible for there to be multiple candidate types per column, in this example there is only one type for each column.\n", - "\n", - "The candidates look correct, we only want to convert `ID` to be an integer column, so applying this `ColumnTypesBuilder` should result in a Dataflow with our columns converted to their respective types." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_converted = ctb.to_dataflow()\n", - "\n", - "df_converted = dflow_converted.to_pandas_dataframe()\n", - "df_converted" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." 
- }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/cache.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/cache.ipynb deleted file mode 100644 index fd47cf0f..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/cache.ipynb +++ /dev/null @@ -1,194 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/how-to-guides/cache.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Cache\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A Dataflow can be cached as a file on your disk during a local run by calling `dflow_cached = dflow.cache(directory_path)`. Doing this will run all the steps in the Dataflow, `dflow`, and save the cached data to the specified `directory_path`. The returned Dataflow, `dflow_cached`, has a Caching Step added at the end. Any subsequent runs on on the Dataflow `dflow_cached` will reuse the cached data, and the steps before the Caching Step will not be run again.\n", - "\n", - "Caching avoids running transforms multiple times, which can make local runs more efficient. Here are common places to use Caching:\n", - "- after reading data from remote\n", - "- after expensive transforms, such as Sort\n", - "- after transforms that change the shape of data, such as Sampling, Filter and Summarize\n", - "\n", - "Caching Step will be ignored during scale-out run invoked by `to_spark_dataframe()`." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We will start by reading in a dataset and applying some transforms to the Dataflow." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep\n", - "dflow = dprep.read_csv(path='../data/crime-spring.csv')\n", - "dflow = dflow.take_sample(probability=0.2, seed=7)\n", - "dflow = dflow.sort_asc(columns='Primary Type')\n", - "dflow = dflow.keep_columns(['ID', 'Case Number', 'Date', 'Primary Type'])\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, we will choose a directory to store the cached data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "from pathlib import Path\n", - "cache_dir = str(Path(os.getcwd(), 'dataflow-cache'))\n", - "cache_dir" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We will now call `dflow.cache(directory_path)` to cache the Dataflow to your directory." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_cached = dflow.cache(directory_path=cache_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we will check steps in the `dflow_cached` to see that all of the previous steps were cached." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "[s.step_type for s in dflow_cached._get_steps()]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We also check the data stored in the cache directory." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "os.listdir(cache_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Running against `dflow_cached` will reuse the cached data and skip running all of the previous steps again." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_cached.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Adding additional steps to `dflow_cached` will also reuse the cache data and skip running the steps prior to the Cache Step." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_cached_take = dflow_cached.take(10)\n", - "dflow_cached_skip = dflow_cached.skip(10).take(10)\n", - "\n", - "df_cached_take = dflow_cached_take.to_pandas_dataframe()\n", - "df_cached_skip = dflow_cached_skip.to_pandas_dataframe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# shutil.rmtree will then clean up the cached data \n", - "import shutil\n", - "shutil.rmtree(path=cache_dir)" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/column-manipulations.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/column-manipulations.ipynb deleted file mode 100644 index bf1836f9..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/column-manipulations.ipynb +++ /dev/null @@ -1,563 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/how-to-guides/column-manipulations.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Column Manipulations\n", - "\n", - "Azure ML Data Prep has many methods for manipulating columns, including basic CUD operations and several other more complex manipulations.\n", - "\n", - "This notebook will focus primarily on data-agnostic operations. For all other column manipulation operations, we will link to their specific how-to guide.\n", - "\n", - "## Table of Contents\n", - "[ColumnSelector](#ColumnSelector)
\n", - "[add_column](#add_column)
\n", - "[append_columns](#append_columns)
\n", - "[drop_columns](#drop_columns)
\n", - "[duplicate_column](#duplicate_column)
\n", - "[fuzzy_group_column](#fuzzy_group_column)
\n", - "[keep_columns](#keep_columns)
\n", - "[map_column](#map_column)
\n", - "[new_script_column](#new_script_column)
\n", - "[rename_columns](#rename_columns)
\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ColumnSelector\n", - "`ColumnSelector` is a Data Prep class that allows us to select columns by name. The idea is to be able to describe columns generally instead of explicitly, using a search term or regex expression, with various options.\n", - "\n", - "Note that a `ColumnSelector` does not represent the columns they match themselves, but the selector of the described columns. Therefore if we use the same `ColumnSelector` on two different dataflows, we may get different results depending on the columns of each dataflow.\n", - "\n", - "Column manipulations that can utilize `ColumnSelector` will be noted in their respective sections in this book." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.dataprep import auto_read_file\n", - "dflow = auto_read_file(path='../data/crime-dirty.csv')\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "All parameters to a `ColumnSelector` are shown here for completeness. We will use `keep_columns` in our example, which will keep only the columns in the dataflow that we tell it to keep.\n", - "\n", - "In the below example, we match all columns with the letter 'i'. Because we set `ignore_case` to false and `match_whole_word` to false, then any column that contains 'i' or 'I' will be selected." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.dataprep import ColumnSelector\n", - "column_selector = ColumnSelector(term=\"i\",\n", - " use_regex=False,\n", - " ignore_case=True,\n", - " match_whole_word=False,\n", - " invert=False)\n", - "dflow_selected = dflow.keep_columns(column_selector)\n", - "dflow_selected.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If we set `invert` to true, we get the opposite of what we matched earlier." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "column_selector = ColumnSelector(term=\"i\",\n", - " use_regex=False,\n", - " ignore_case=True,\n", - " match_whole_word=False,\n", - " invert=True)\n", - "dflow_selected = dflow.keep_columns(column_selector)\n", - "dflow_selected.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If we change the search term to 'I' and set case sensitivity to true, we get only the handful of columns that contain an upper case 'I'." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "column_selector = ColumnSelector(term=\"I\",\n", - " use_regex=False,\n", - " ignore_case=False,\n", - " match_whole_word=False,\n", - " invert=False)\n", - "dflow_selected = dflow.keep_columns(column_selector)\n", - "dflow_selected.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And if we set `match_whole_word` to true, we get no results at all as there is no column called 'I'." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "column_selector = ColumnSelector(term=\"I\",\n", - " use_regex=False,\n", - " ignore_case=False,\n", - " match_whole_word=True,\n", - " invert=False)\n", - "dflow_selected = dflow.keep_columns(column_selector)\n", - "dflow_selected.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, the `use_regex` flag dictates whether or not to treat the search term as a regex. It can be combined still with the other options.\n", - "\n", - "Here we define all columns that begin with the capital letter 'I'." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "column_selector = ColumnSelector(term=\"I.*\",\n", - " use_regex=True,\n", - " ignore_case=True,\n", - " match_whole_word=True,\n", - " invert=False)\n", - "dflow_selected = dflow.keep_columns(column_selector)\n", - "dflow_selected.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## add_column" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Please see [add-column-using-expression](add-column-using-expression.ipynb)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## append_columns" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Please see [append-columns-and-rows](append-columns-and-rows.ipynb)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## drop_columns" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Data Prep supports dropping columns one or more columns in a single statement. Supports `ColumnSelector`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.dataprep import auto_read_file\n", - "dflow = auto_read_file(path='../data/crime-dirty.csv')\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that there are 22 columns to begin with. We will now drop the 'ID' column and observe that the resulting dataflow contains 21 columns." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_dropped = dflow.drop_columns('ID')\n", - "dflow_dropped.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can also drop more than one column at once by passing a list of column names." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_dropped = dflow_dropped.drop_columns(['IUCR', 'Description'])\n", - "dflow_dropped.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## duplicate_column" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Data Prep supports duplicating columns one or more columns in a single statement.\n", - "\n", - "Duplicated columns are placed to the immediate right of their source column." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.dataprep import auto_read_file\n", - "dflow = auto_read_file(path='../data/crime-dirty.csv')\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We decide which column(s) to duplicate and what the new column name(s) should be with a key value pairing (dictionary)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_dupe = dflow.duplicate_column({'ID': 'ID2', 'IUCR': 'IUCR_Clone'})\n", - "dflow_dupe.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## fuzzy_group_column" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Please see [fuzzy-group](fuzzy-group.ipynb)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## keep_columns" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Data Prep supports keeping one or more columns in a single statement. The resulting dataflow will contain only the column(s) specified; dropping all the other columns. Supports `ColumnSelector`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.dataprep import auto_read_file\n", - "dflow = auto_read_file(path='../data/crime-dirty.csv')\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_keep = dflow.keep_columns(['ID', 'Date', 'Description'])\n", - "dflow_keep.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Similar to `drop_columns`, we can pass a single column name or a list of them." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_keep = dflow_keep.keep_columns('ID')\n", - "dflow_keep.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## map_column" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Data Prep supports string mapping. For a column containing strings, we can provide specific mappings from an original value to a new value, and then produce a new column that contains the mapped values.\n", - "\n", - "The mapped columns are placed to the immediate right of their source column." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.dataprep import auto_read_file\n", - "dflow = auto_read_file(path='../data/crime-dirty.csv')\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.dataprep import ReplacementsValue\n", - "replacements = [ReplacementsValue('THEFT', 'THEFT2'), ReplacementsValue('BATTERY', 'BATTERY!!!')]\n", - "dflow_mapped = dflow.map_column(column='Primary Type', \n", - " new_column_id='Primary Type V2',\n", - " replacements=replacements)\n", - "dflow_mapped.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## new_script_column" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Please see [custom-python-transforms](custom-python-transforms.ipynb)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## rename_columns" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Data Prep supports renaming one or more columns in a single statement." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.dataprep import auto_read_file\n", - "dflow = auto_read_file(path='../data/crime-dirty.csv')\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We decide which column(s) to rename and what the new column name(s) should be with a key value pairing (dictionary)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_renamed = dflow.rename_columns({'ID': 'ID2', 'IUCR': 'IUCR_Clone'})\n", - "dflow_renamed.head(5)" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/column-type-transforms.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/column-type-transforms.ipynb deleted file mode 100644 index bfc4e73f..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/column-type-transforms.ipynb +++ /dev/null @@ -1,473 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/how-to-guides/column-type-transforms.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Column Type Transforms\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When consuming a data set, it is highly useful to know as much as possible about the data. 
Column types can help you understand more about each column, and enable type-specific transformations later. This provides much more insight than treating all data as strings.\n", - "\n", - "In this notebook, you will learn about:\n", - "- [Built-in column types](#types)\n", - "- How to:\n", - " - [Convert to long (integer)](#long)\n", - " - [Convert to double (floating point or decimal number)](#double)\n", - " - [Convert to boolean](#boolean)\n", - " - [Convert to datetime](#datetime)\n", - "- [How to use `ColumnTypesBuilder` to get suggested column types and convert them](#builder)\n", - "- [How to convert column type for multiple columns if types are known](#multiple-columns)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Set up" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dprep.read_csv('../data/crime-winter.csv')\n", - "dflow = dflow.keep_columns(['Case Number', 'Date', 'IUCR', 'Arrest', 'Longitude', 'Latitude'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Built-in column types" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Currently, Data Prep supports the following column types: string, long (integer), double (floating point or decimal number), boolean, and datetime.\n", - "\n", - "In the previous step, a data set was read in as a Dataflow, with only a few interesting columns kept. We will use this Dataflow to explore column types throughout the notebook." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From the first few rows of the Dataflow, you can see that the columns contain different types of data. However, by looking at `dtypes`, you can see that `read_csv()` treats all columns as string columns.\n", - "\n", - "Note that `auto_read_file()` is a data ingestion function that infers column types. Learn more about it [here](./auto-read-file.ipynb)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow.dtypes" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Converting to long (integer)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Suppose the \"IUCR\" column should only contain integers. You can call `to_long` to convert the column type of \"IUCR\" to `FieldType.INTEGER`. If you look at the data profile ([learn more about data profiles](./data-profile.ipynb)), you will see numeric metrics populated for that column such as mean, variance, quantiles, etc. This is helpful for understanding the shape and distribution of numeric data." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_conversion = dflow.to_long('IUCR')\n", - "profile = dflow_conversion.get_profile()\n", - "profile" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Converting to double (floating point or decimal number)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Suppose the \"Latitude\" and \"Longitude\" columns should only contain decimal numbers. You can call `to_number` to convert the column types of \"Latitude\" and \"Longitude\" to `FieldType.DECIMAL`. In the data profile, you will see numeric metrics populated for these columns as well. Note that after converting the column types, you can see that there are missing values in these columns. Metrics like this can be helpful for noticing issues with the data set." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_conversion = dflow_conversion.to_number(['Latitude', 'Longitude'])\n", - "profile = dflow_conversion.get_profile()\n", - "profile" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Converting to boolean" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Suppose the \"Arrest\" column should only contain boolean values. You can call `to_bool` to convert the column type of \"Arrest\" to `FieldType.BOOLEAN`.\n", - "\n", - "The `to_bool` function allows you to specify which values should map to `True` and which values should map to `False`. To do so, you can provide those values in an array as the parameters `true_values` and `false_values`. Additionally, you can specify whether all other values should become `True`, `False`, or an error by using the `mismatch_as` parameter." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_conversion.to_bool('Arrest', \n", - "                         true_values=[1],\n", - "                         false_values=[0],\n", - "                         mismatch_as=dprep.MismatchAsOption.ASERROR).head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the previous conversion, all the values in the \"Arrest\" column became `DataPrepError`, because 'FALSE' didn't match any of the `false_values` or the `true_values`, and all the unmatched values were set to become errors. Let's try the conversion again with different `false_values`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_conversion = dflow_conversion.to_bool('Arrest',\n", - "                                            true_values=['1', 'TRUE'],\n", - "                                            false_values=['0', 'FALSE'],\n", - "                                            mismatch_as=dprep.MismatchAsOption.ASERROR)\n", - "dflow_conversion.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This time, all the string values 'FALSE' have been successfully converted to the boolean value `False`. Take another look at the data profile."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "profile = dflow_conversion.get_profile()\n", - "profile" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Suppose the \"Date\" column should only contain datetime values. You can convert its column type to `FieldType.DateTime` using the `to_datetime` function. Typically, datetime formats can be confusing or inconsistent. Next, we will show you all the tools that can help correctly converting the column to `DateTime`." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the first example, directly call `to_datetime` with only the column name. Data Prep will inspect the data in this column and learn what format should be used for the conversion.\n", - "\n", - "Note that if there is data in the column that cannot be converted to datetime, an Error value will be created in that cell." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_conversion_date = dflow_conversion.to_datetime('Date')\n", - "dflow_conversion_date.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this case, we can see that '1/10/2016 11:00' was converted using the format `%m/%d/%Y %H:%M`.\n", - "\n", - "The data in this column is actually somewhat ambiguous. Should the dates be 'October 1' or 'January 10'? The function `to_datetime` determines that both are possible, but defaults to month-first (US format).\n", - "\n", - "If the data was supposed to be day-first, you can customize the conversion." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_alternate_conversion = dflow_conversion.to_datetime('Date', date_time_formats=['%d/%m/%Y %H:%M'])\n", - "dflow_alternate_conversion.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Using `ColumnTypesBuilder`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Data Prep can help you automatically detect what are the likely column types.\n", - "\n", - "You can call `dflow.builders.set_column_types()` to get a `ColumnTypesBuilder`. Then, calling `learn()` on it will trigger Data Prep to inspect the data in each column. As a result, you can see the suggested column types for each column (conversion candidates)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "builder = dflow.builders.set_column_types()\n", - "builder.learn()\n", - "builder" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this case, Data Prep suggested the correct column types for \"Arrest\", \"Case Number\", \"Latitude\", and \"Longitude\".\n", - "\n", - "However, for \"Date\", it has suggested two possible date formats: month-first, or day-first. The ambiguity must be resolved before you complete the conversion. To use the month-first format, you can call `builder.ambiguous_date_conversions_keep_month_day()`. Otherwise, call `builder.ambiguous_date_conversions_keep_day_month()`. 
Note that if there were multiple datetime columns with ambiguous date conversions, calling one of these functions would apply the resolution to all of them.\n", - "\n", - "If you want to skip all the ambiguous date column conversions instead, you can call `builder.ambiguous_date_conversions_drop()`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "builder.ambiguous_date_conversions_keep_month_day()\n", - "builder.conversion_candidates" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The conversion candidate for \"IUCR\" is currently `FieldType.INTEGER`. If you know that \"IUCR\" should be floating point (called `FieldType.DECIMAL`), you can tweak the builder to change the conversion candidate for that specific column. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "builder.conversion_candidates['IUCR'] = dprep.FieldType.DECIMAL\n", - "builder" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this case, we are happy with \"IUCR\" as `FieldType.INTEGER`, so we set it back. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "builder.conversion_candidates['IUCR'] = dprep.FieldType.INTEGER\n", - "builder" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Once you are happy with the conversion candidates, you can complete the conversion by calling `builder.to_dataflow()`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_conversion_using_builder = builder.to_dataflow()\n", - "dflow_conversion_using_builder.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Convert column types for multiple columns" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you already know the column types, you can simply call `dflow.set_column_types()`. This function allows you to specify multiple columns and the desired column type for each one. Here's how you can convert all five columns at once.\n", - "\n", - "Note that `set_column_types` only supports a subset of column type conversions. For example, we cannot specify the true/false values for a boolean conversion, so the result of this operation is incorrect for the \"Arrest\" column." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_conversion_using_set = dflow.set_column_types({\n", - "    'IUCR': dprep.FieldType.INTEGER,\n", - "    'Latitude': dprep.FieldType.DECIMAL,\n", - "    'Longitude': dprep.FieldType.DECIMAL,\n", - "    'Arrest': dprep.FieldType.BOOLEAN,\n", - "    'Date': (dprep.FieldType.DATE, ['%m/%d/%Y %H:%M']),\n", - "})\n", - "dflow_conversion_using_set.head(5)" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License."
- }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/custom-python-transforms.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/custom-python-transforms.ipynb deleted file mode 100644 index c0653203..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/custom-python-transforms.ipynb +++ /dev/null @@ -1,232 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/how-to-guides/custom-python-transforms.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Custom Python Transforms\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "There will be scenarios when the easiest thing for you to do is just to write some Python code. This SDK provides three extension points that you can use.\n", - "\n", - "1. New Script Column\n", - "2. New Script Filter\n", - "3. Transform Partition\n", - "\n", - "Each of these are supported in both the scale-up and the scale-out runtime. A key advantage of using these extension points is that you don't need to pull all of the data in order to create a dataframe. Your custom python code will be run just like other transforms, at scale, by partition, and typically in parallel." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Initial data prep" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We start by loading crime data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep\n", - "col = dprep.col\n", - "\n", - "dflow = dprep.read_csv(path='../data/crime-spring.csv')\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We trim the dataset down and keep only the columns we are interested in. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dflow.keep_columns(['Case Number','Primary Type', 'Description', 'Latitude', 'Longitude'])\n", - "dflow = dflow.replace_na(columns=['Latitude', 'Longitude'], custom_na_list='')\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We look for null values using a filter. We found some, so now we'll look at a way to fill these missing values." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow.filter(col('Latitude').is_null()).head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Transform Partition" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We want to replace all null values with a 0, so we decide to use a handy pandas function. This code will be run by partition, not on all of the dataset at a time. This means that on a large dataset, this code may run in parallel as the runtime processes the data partition by partition." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pt_dflow = dflow\n", - "\n", - "def transform(df, index):\n", - "    df['Latitude'].fillna('0',inplace=True)\n", - "    df['Longitude'].fillna('0',inplace=True)\n", - "    return df\n", - "\n", - "dflow = pt_dflow.map_partition(fn=transform)\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Transform Partition With File" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Being able to use any Python code to manipulate your data as a pandas DataFrame is extremely useful for complex and specific data operations that Data Prep doesn't handle natively. Unfortunately, code written this way is not very testable, since it just sits inside a string.\n", - "To improve testability and make script writing easier, there is another interface, `transform_partition_with_file`, which takes the path to a Python script that must contain a function matching the 'transform' signature defined above.\n", - "\n", - "The `script_path` argument should be a relative path to ensure Dataflow portability. Here, `map_func.py` contains the same code as in the previous example." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = pt_dflow.transform_partition_with_file('../data/map_func.py')\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## New Script Column" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We want to create a new column that contains both the latitude and longitude. We can achieve this easily using a [Data Prep expression](./add-column-using-expression.ipynb), which executes faster. Alternatively, we can do this with Python code by using the `new_script_column()` method on the Dataflow. Note that we use custom Python code here for demonstration purposes only. In practice, you should prefer Data Prep's native functions and use custom Python code only when the functionality you need is not available in Data Prep. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dflow.new_script_column(new_column_name='coordinates', insert_after='Longitude', script=\"\"\"\n", - "def newvalue(row):\n", - "    return '(' + row['Latitude'] + ', ' + row['Longitude'] + ')'\n", - "\"\"\")\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## New Script Filter" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we want to filter the dataset down to only the crimes that incurred over $300 in loss. We can build a Python expression that returns True if we want to keep the row, and False to drop the row."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dflow.new_script_filter(\"\"\"\n", - "def includerow(row):\n", - " val = row['Description']\n", - " return 'OVER $ 300' in val\n", - "\"\"\")\n", - "dflow.head(5)" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/data-ingestion.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/data-ingestion.ipynb deleted file mode 100644 index 58b6df4a..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/data-ingestion.ipynb +++ /dev/null @@ -1,1210 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/how-to-guides/data-ingestion.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Data Ingestion\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Data Prep has the ability to load different types of input data. You can use auto-reading functionality to detect the type of a file, or directly specify a file type and its parameters." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Table of Contents\n", - "[Read Lines](#lines)
\n", - "[Read CSV](#csv)
\n", - "[Read Compressed CSV](#compressed-csv)
\n", - "[Read Excel](#excel)
\n", - "[Read Fixed Width Files](#fixed-width)
\n", - "[Read Parquet](#parquet)
\n", - "[Read Npz](#npz)
\n", - "[Read Part Files Using Globbing](#globbing)
\n", - "[Read JSON](#json)
\n", - "[Read JSON Lines](#jsonlines)
\n", - "[Read SQL](#sql)
\n", - "[Read PostgreSQL](#postgresql)
\n", - "[Read From Azure Blob](#azure-blob)
\n", - "[Read From ADLS](#adls)
\n", - "[Read From ADLSGen2](#adlsgen2)
\n", - "[Read Pandas DataFrame](#pandas-df)
\n", - "[Read From HTTP/HTTPS Link](#http)
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Read Lines" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "One of the simplest ways to read data using Data Prep is to just read it as text lines." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dprep.read_lines(path='../data/crime.txt')\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "With ingestion done, you can go ahead and start prepping the dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = dflow.to_pandas_dataframe()\n", - "df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Read CSV" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When reading delimited files, the only required parameter is `path`. Other parameters (e.g. separator, encoding, whether to use headers, etc.) are available to modify default behavior.\n", - "In this case, you can read a file by specifying only its location, then retrieve the first 5 rows to evaluate the result." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_duplicate_headers = dprep.read_csv(path='../data/crime_duplicate_headers.csv')\n", - "dflow_duplicate_headers.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From the result, you can see that the delimiter and encoding were correctly detected. Column headers were also detected. However, the first line seems to be a duplicate of the column headers. One of the parameters is a number of lines to skip from the files being read. You can use this to filter out the duplicate line." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_skip_headers = dprep.read_csv(path='../data/crime_duplicate_headers.csv', skip_rows=1)\n", - "dflow_skip_headers.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now the data set contains the correct headers and the extraneous row has been skipped by `read_csv`. Next, look at the data types of the columns." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_skip_headers.dtypes" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Unfortunately, all of the columns came back as strings. This is because, by default, Data Prep will not change the type of the data. Since the data source is a text file, all values are kept as strings. In this case, however, numeric columns should be parsed as numbers. 
To do this, set the `infer_column_types` parameter to `True`, which will trigger type inference to be performed.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_inferred_types = dprep.read_csv(path='../data/crime_duplicate_headers.csv',\n", - " skip_rows=1,\n", - " infer_column_types=True)\n", - "dflow_inferred_types.dtypes" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now several of the columns were correctly detected as numbers and their `FieldType` is Decimal.\n", - "\n", - "With ingestion done, the data set is ready to start preparing." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = dflow_inferred_types.to_pandas_dataframe()\n", - "df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Read Compressed CSV" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Data Prep can also read delimited files compressed in an archive. The `archive_options` parameter specifies the type of archive and glob pattern of entries in the archive.\n", - "\n", - "At this moment, only reading from ZIP archives is supported." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.dataprep import ArchiveOptions, ArchiveType\n", - "\n", - "dflow = dprep.read_csv(path='../data/crime.zip',\n", - " archive_options=ArchiveOptions(archive_type=ArchiveType.ZIP, entry_glob='*10-20.csv'))\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Read Excel" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Data Prep can also load Excel files using the `read_excel` method." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_default_sheet = dprep.read_excel(path='../data/crime.xlsx')\n", - "dflow_default_sheet.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, the first sheet of the Excel document has been loaded. You could achieve the same result by specifying the name of the desired sheet explicitly." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_second_sheet = dprep.read_excel(path='../data/crime.xlsx', sheet_name='Sheet2')\n", - "dflow_second_sheet.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As you can see, the table in the second sheet had headers as well as three empty rows, so you can modify the arguments accordingly." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_skipped_rows = dprep.read_excel(path='../data/crime.xlsx',\n", - " sheet_name='Sheet2',\n", - " use_column_headers=True,\n", - " skip_rows=3)\n", - "dflow_skipped_rows.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = dflow_skipped_rows.to_pandas_dataframe()\n", - "df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can see in the results that the FBI Code column now contains some NaN values where before, when calling head, it didn't. 
By default, `to_pandas_dataframe` attempts to coalesce columns into a single type for better performance and lower memory overhead. This specific column has a mixture of numbers and strings, and the strings were replaced with NaN values.\n", - "\n", - "If you wish to keep the mixed-type column in the Pandas DataFrame, you can set the `extended_types` argument to True when calling `to_pandas_dataframe`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = dflow_skipped_rows.to_pandas_dataframe(extended_types=True)\n", - "df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Read Fixed Width Files" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For fixed-width files, you can specify a list of offsets. The first column is always assumed to start at offset 0." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_fixed_width = dprep.read_fwf('../data/crime.txt', offsets=[8, 17, 26, 33, 56, 58, 74])\n", - "dflow_fixed_width.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Looking at the data, you can see that the first row was used as headers. In this particular case, however, there are no headers in the file, so the first row should be treated as data.\n", - "\n", - "Passing `PromoteHeadersMode.NONE` to the `header` keyword argument avoids header detection and gets the correct data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_no_headers = dprep.read_fwf('../data/crime.txt',\n", - "                                  offsets=[8, 17, 26, 33, 56, 58, 74],\n", - "                                  header=dprep.PromoteHeadersMode.NONE)\n", - "dflow_no_headers.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = dflow_no_headers.to_pandas_dataframe()\n", - "df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Read Parquet" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Data Prep has two different methods for reading data stored as Parquet.\n", - "\n", - "Currently, both methods require the `pyarrow` package to be installed in your Python environment. This can be done via `pip install azureml-dataprep[parquet]`." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Read Parquet File" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For reading single `.parquet` files, or a folder containing only Parquet files, use `read_parquet_file`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dprep.read_parquet_file('../data/crime.parquet')\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Parquet data is explicitly typed, so no type inference is needed."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow.dtypes" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Read Parquet Dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A Parquet Dataset is different from a Parquet file in that it could be a folder containing a number of Parquet files within a complex directory structure. It may have a hierarchical structure that partitions the data by value of a column. These more complex forms of Parquet data are commonly produced by Spark/HIVE.\n", - "\n", - "For these more complex data sets, you can use `read_parquet_dataset`, which uses pyarrow to handle complex Parquet layouts. This will also handle single Parquet files, though these are better read using `read_parquet_file`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dprep.read_parquet_dataset('../data/parquet_dataset')\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The above data was partitioned by the value of the `Arrest` column. It is a boolean column in the original crime0 data set and hence was partitioned by `Arrest=true` and `Arrest=false`.\n", - "\n", - "The directory structure is printed below for clarity." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "for path, dirs, files in os.walk('../data/parquet_dataset'):\n", - " level = path.replace('../data/parquet_dataset', '').count(os.sep)\n", - " indent = ' ' * (level)\n", - " print(indent + os.path.basename(path) + '/')\n", - " fileindent = ' ' * (level + 1)\n", - " for f in files:\n", - " print(fileindent + f)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Read Npz" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For reading `.npz` files use `read_npz_file`.\n", - "\n", - "**Note:** Currently the only supported npz files are those containing CSR Matrixes saved using SciPy's `sparse.save_npz` method." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dprep.read_npz_file('../data/10x10-float64-csr.npz')\n", - "df = dflow.to_pandas_dataframe()\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.dataprep.native import preppy_to_ndarrays\n", - "from pandas.util.testing import assert_frame_equal\n", - "import os\n", - "import pandas\n", - "import glob\n", - "from collections import OrderedDict\n", - "\n", - "paths = [os.path.abspath(file) for file in glob.iglob('./testdata/npz-10x10-csr/part-*', recursive=False)]\n", - "paths.sort()\n", - "dataset = preppy_to_ndarrays(paths)\n", - "expected_df = pandas.DataFrame.from_dict(OrderedDict(dataset))\n", - "assert_frame_equal(expected_df, df)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Read Part Files Using Globbing" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Data Prep supports globbing, which allows you to read partitioned files (or any other type of files) in a folder. 
Globbing is supported by all of the read transformations that take in file paths, such as `read_csv`, `read_lines`, etc. By specifying `../data/crime_partfiles/part-*` as the path, we will read all files starting with `part-` in the `crime_partfiles` folder and return them in one Dataflow. [`auto_read_file`](./auto-read-file.ipynb) will detect the column types of your part files and parse them automatically." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_partfiles = dprep.auto_read_file(path='../data/crime_partfiles/part-*')\n", - "dflow_partfiles.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Read JSON" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Data Prep can also load JSON files." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_json = dprep.read_json(path='../data/json.json')\n", - "dflow_json.head(15)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When you use `read_json`, Data Prep will attempt to extract data from the file into a table. You can also control the file encoding Data Prep should use, as well as whether Data Prep should flatten nested JSON arrays.\n", - "\n", - "Choosing the option to flatten nested arrays could result in a much larger number of rows." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_flat_arrays = dprep.read_json(path='../data/json.json', flatten_nested_arrays=True)\n", - "dflow_flat_arrays.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Read JSON Lines Files\n", - "\n", - "In addition to JSON objects, Data Prep can also read files consisting of JSON lines." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_json_lines = dprep.read_json_lines('../data/crime.jsonl')\n", - "dflow_json_lines.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Read SQL" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Data Prep can also fetch data from SQL servers. Currently, only Microsoft SQL Server is supported.\n", - "\n", - "To read data from a SQL server, first create a data source object that contains the connection information." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "secret = dprep.register_secret(value=\"dpr3pTestU$er\", id=\"dprepTestUser\")\n", - "ds = dprep.MSSQLDataSource(server_name=\"dprep-sql-test.database.windows.net\",\n", - "                           database_name=\"dprep-sql-test\",\n", - "                           user_name=\"dprepTestUser\",\n", - "                           password=secret)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As you can see, the password parameter of `MSSQLDataSource` accepts a Secret object. You can get a Secret object in two ways:\n", - "1. Register the secret and its value with the execution engine.\n", - "2. 
Create the secret with just an id (useful if the secret value was already registered in the execution environment).\n", - "\n", - "Now that you have created a data source object, you can proceed to read data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dprep.read_sql(ds, \"SELECT top 100 * FROM [SalesLT].[Product]\")\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = dflow.to_pandas_dataframe(extended_types=True)\n", - "df.dtypes" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Read PostgreSQL" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Data Prep can also fetch data from Azure PostgreSQL servers.\n", - "\n", - "To read data from a PostgreSQL server, first create a data source object that contains the connection information." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "secret = dprep.register_secret(value=\"dpr3pTestU$er\", id=\"dprepPostgresqlUser\")\n", - "ds = dprep.PostgreSQLDataSource(server_name=\"dprep-postgresql-test.postgres.database.azure.com\",\n", - " database_name=\"dprep-postgresql-testdb\",\n", - " user_name=\"dprepPostgresqlReadOnlyUser@dprep-postgresql-test\",\n", - " password=secret)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As you can see, the password parameter of `PostgreSQLDataSource` accepts a Secret object as well.\n", - "Now that you have created a PostgreSQL data source object, you can proceed to read data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dprep.read_postgresql(ds, \"SELECT * FROM public.people\")\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow.dtypes" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Read from Azure Blob" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can read files stored in public Azure Blob by directly passing a file url. To read file from a protected Blob, pass SAS (Shared Access Signature) URI with both resource URI and SAS token in the path." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dprep.read_csv(path='https://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv', skip_rows=1)\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Read from ADLS" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Data Prep currently supports both ADLS and ADLSGen2. There are two ways the Data Prep API can acquire the necessary OAuth token to access Azure DataLake Storage:\n", - "1. Retrieve the access token from a recent login session of the user's [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest) login.\n", - "2. Use a ServicePrincipal (SP) and a certificate as a secret." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Using Access Token from a recent Azure CLI session" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "On your local machine, run the following command:\n", - "```\n", - "az login\n", - "```\n", - "If your user account is a member of more than one Azure tenant, you need to specify the tenant, either in the AAD url hostname form '.onmicrosoft.com' or the tenantId GUID. The latter can be retrieved as follows:\n", - "```\n", - "az account show --query tenantId\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```python\n", - "dflow = read_csv(path = DataLakeDataSource(path='adl://dpreptestfiles.azuredatalakestore.net/crime-spring.csv', tenant='microsoft.onmicrosoft.com'))\n", - "head = dflow.head(5)\n", - "head\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create a ServicePrincipal via Azure CLI" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A ServicePrincipal and the corresponding certificate can be created via [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest).\n", - "This particular SP is configured as Reader, with its scope reduced to just the ADLS account 'dpreptestfiles'.\n", - "```\n", - "az account set --subscription \"Data Wrangling development\"\n", - "az ad sp create-for-rbac -n \"SP-ADLS-dpreptestfiles\" --create-cert --role reader --scopes /subscriptions/35f16a99-532a-4a47-9e93-00305f6c40f2/resourceGroups/dpreptestfiles/providers/Microsoft.DataLakeStore/accounts/dpreptestfiles\n", - "```\n", - "This command emits the appId and the path to the certificate file (usually in the home folder). The .crt file contains both the public certificate and the private key in PEM format.\n", - "\n", - "Extract the thumbprint with:\n", - "```\n", - "openssl x509 -in adls-dpreptestfiles.crt -noout -fingerprint\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Configure ADLS Account for ServicePrincipal" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To configure the ACL (Access Control List) for the ADLS filesystem, use the objectId of the user or, here, ServicePrincipal:\n", - "```\n", - "az ad sp show --id \"fbc406bf-f7c2-410d-bc26-8b08e4dab1aa\" --query objectId\n", - "```\n", - "Configure both Read and Execute access for the ADLS file system. Since the underlying HDFS ACL model doesn't support inheritance, folders and files need to be ACL-ed individually. 
Please double-check that the app also has permission to access the hierarchical containers.\n", - "```\n", - "az dls fs access set-entry --account dpreptestfiles --acl-spec \"user:999a21ef-75aa-4538-b325-249285672204:r-x\" --path /\n", - "az dls fs access set-entry --account dpreptestfiles --acl-spec \"user:999a21ef-75aa-4538-b325-249285672204:r--\" --path /crime-spring.csv\n", - "```\n", - "\n", - "References:\n", - "- [az ad sp](https://docs.microsoft.com/en-us/cli/azure/ad/sp?view=azure-cli-latest)\n", - "- [az dls fs access](https://docs.microsoft.com/en-us/cli/azure/dls/fs/access?view=azure-cli-latest)\n", - "- [ACL model for ADLS](https://github.com/MicrosoftDocs/azure-docs/blob/master/articles/data-lake-store/data-lake-store-access-control.md)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "certThumbprint = '84:60:1E:BE:AA:FF:90:A0:7A:73:38:80:F7:D0:12:44:98:70:9C:3A'\n", - "certificate = ''\n", - "with open('../data/adls-dpreptestfiles.crt', 'rt', encoding='utf-8') as crtFile:\n", - "    certificate = crtFile.read()\n", - "\n", - "servicePrincipalAppId = \"fbc406bf-f7c2-410d-bc26-8b08e4dab1aa\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Acquire an OAuth Access Token" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Use the adal package (installed via `pip install adal`) to create an authentication context on the MSFT tenant and acquire an OAuth access token. Note that for ADLS, the `resource` in the token request must be for 'datalake.azure.net', which is different from most other Azure resources." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import adal\n", - "from azureml.dataprep.api.datasources import DataLakeDataSource\n", - "\n", - "ctx = adal.AuthenticationContext('https://login.microsoftonline.com/microsoft.onmicrosoft.com')\n", - "token = ctx.acquire_token_with_client_certificate('https://datalake.azure.net/', servicePrincipalAppId, certificate, certThumbprint)\n", - "dflow = dprep.read_csv(path = DataLakeDataSource(path='adl://dpreptestfiles.azuredatalakestore.net/crime-spring.csv', accessToken=token['accessToken']))\n", - "dflow.to_pandas_dataframe().head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Read from ADLSGen2" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Please refer to the [Read from ADLS](#adls) section above for details on how to register a ServicePrincipal and obtain an OAuth access token." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Configure ADLSGen2 Account for ServicePrincipal" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "certThumbprint = '23:66:84:6B:3A:14:9E:B1:17:CA:EE:E3:BB:2C:21:2D:20:B0:DF:F2'\n", - "certificate = ''\n", - "with open('../data/ADLSgen2-datapreptest.crt', 'rt', encoding='utf-8') as crtFile:\n", - "    certificate = crtFile.read()\n", - "\n", - "servicePrincipalAppId = \"127a58c3-f307-46a1-969e-a6b63da3f411\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Acquire an OAuth Access Token for ADLSGen2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], -
"source": [ - "import adal\n", - "from azureml.dataprep.api.datasources import ADLSGen2\n", - "\n", - "ctx = adal.AuthenticationContext('https://login.microsoftonline.com/72f988bf-86f1-41af-91ab-2d7cd011db47')\n", - "token = ctx.acquire_token_with_client_certificate('https://storage.azure.com/', servicePrincipalAppId, certificate, certThumbprint)\n", - "dflow = dprep.read_csv(path = ADLSGen2(path='https://adlsgen2datapreptest.dfs.core.windows.net/datapreptest/people.csv', accessToken=token['accessToken']))\n", - "dflow.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Reading from ADLSGen2 using the ABFS uri syntax is also supported by Data Prep." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dprep.read_csv(path = ADLSGen2(path='abfss://adlsgen2datapreptest.dfs.core.windows.net/datapreptest/people.csv', accessToken=token['accessToken']))\n", - "dflow.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Read Pandas DataFrame" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "There are situations where you may already have some data in the form of a pandas DataFrame.\n", - "The steps taken to get to this DataFrame may be non-trivial or not easy to convert to Data Prep Steps. The `read_pandas_dataframe` reader can take a DataFrame and use it as the data source for a Dataflow.\n", - "\n", - "You can pass in a path to a directory (that doesn't exist yet) for Data Prep to store the contents of the DataFrame; otherwise, a temporary directory will be made in the system's temp folder. The files written to this directory will be named `part-00000` and so on; they are written out in Data Prep's internal row-based file format." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dprep.read_excel(path='../data/crime.xlsx')\n", - "dflow = dflow.drop_columns(columns=['Column1'])\n", - "df = dflow.to_pandas_dataframe()\n", - "df.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After loading in the data you can now do `read_pandas_dataframe`. If you only need to consume the Dataflow created from the current environment, you can read the DataFrame in memory." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_df = dprep.read_pandas_dataframe(df, in_memory=True)\n", - "dflow_df.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "However, if you intend to use this Dataflow past the end of your current Python session (such as by saving the Dataflow to a file), you can provide a cache directory where the contents of the DataFrame will be stored so they can be retrieved later." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import shutil\n", - "cache_dir = 'dflow_df'\n", - "shutil.rmtree(cache_dir, ignore_errors=True)\n", - "dflow_df = dprep.read_pandas_dataframe(df, cache_dir)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_df.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Read from HTTP/HTTPS Link" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can pass in an HTTP/HTTPS path when loading remote data source." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dprep.read_csv(dprep.HttpDataSource('https://dprepdata.blob.core.windows.net/test/Sample-Spreadsheet-10-rows.csv'))\n", - "dflow.head(5)" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/data-profile.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/data-profile.ipynb deleted file mode 100644 index 97b42ee1..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/data-profile.ipynb +++ /dev/null @@ -1,179 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/how-to-guides/data-profile.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Data Profile\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A DataProfile collects summary statistics on each column of the data produced by a Dataflow. This can be used to:\n", - "- Understand the input data.\n", - "- Determine which columns might need further preparation.\n", - "- Verify that data preparation operations produced the desired result.\n", - "\n", - "`Dataflow.get_profile()` executes the Dataflow, calculates profile information, and returns a newly constructed DataProfile." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep\n", - "\n", - "dflow = dprep.auto_read_file('../data/crime-spring.csv')\n", - "\n", - "profile = dflow.get_profile()\n", - "profile" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A DataProfile contains a collection of ColumnProfiles, indexed by column name. Each ColumnProfile has attributes for the calculated column statistics. For non-numeric columns, profiles include only basic statistics like min, max, and error count. For numeric columns, profiles also include statistical moments and estimated quantiles." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "profile.columns['Beat']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can also extract and filter data from profiles by using list and dict comprehensions." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "variances = [c.variance for c in profile.columns.values() if c.variance]\n", - "variances" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "column_types = {c.name: c.type for c in profile.columns.values()}\n", - "column_types" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If a column has fewer than a thousand unique values, its ColumnProfile contains a summary of values with their respective counts." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "profile.columns['Primary Type'].value_counts" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Numeric ColumnProfiles include an estimated histogram of the data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "profile.columns['District'].histogram" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To configure the number of bins in the histogram, you can pass an integer as the `number_of_histogram_bins` parameter." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "profile_more_bins = dflow.get_profile(number_of_histogram_bins=5)\n", - "profile_more_bins.columns['District'].histogram" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For columns containing data of mixed types, the ColumnProfile also provides counts of each type." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "profile.columns['X Coordinate'].type_counts" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." 
- }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/datastore.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/datastore.ipynb deleted file mode 100644 index 76bc0ed4..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/datastore.ipynb +++ /dev/null @@ -1,246 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/how-to-guides/datastore.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Reading from and Writing to Datastores" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A datastore is a reference that points to an Azure storage service like a blob container for example. It belongs to a workspace and a workspace can have many datastores.\n", - "\n", - "A data path points to a path on the underlying Azure storage service the datastore references. For example, given a datastore named `blob` that points to an Azure blob container, a data path can point to `/test/data/titanic.csv` in the blob container." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Read data from Datastore" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Data Prep supports reading data from a `Datastore` or a `DataPath` or a `DataReference`. \n", - "\n", - "Passing in a datastore into all the `read_*` methods of Data Prep will result in reading everything in the underlying Azure storage service. To read a specific folder or file in the underlying storage, you have to pass in a data reference." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core import Workspace, Datastore\n", - "from azureml.data.datapath import DataPath\n", - "\n", - "import azureml.dataprep as dprep" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, get or create a workspace. Feel free to replace `subscription_id`, `resource_group`, and `workspace_name` with other values." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "subscription_id = '35f16a99-532a-4a47-9e93-00305f6c40f2'\n", - "resource_group = 'DataStoreTest'\n", - "workspace_name = 'dataprep-centraleuap'\n", - "\n", - "workspace = Workspace(subscription_id=subscription_id, resource_group=resource_group, workspace_name=workspace_name)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "workspace.datastores" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can now read a crime data set from the datastore. If you are using your own workspace, the `crime0-10.csv` will not be there by default. You will have to upload the data to the datastore yourself." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "datastore = Datastore(workspace=workspace, name='dataprep_blob')\n", - "dflow = dprep.read_csv(path=datastore.path('crime0-10.csv'))\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can also read from an Azure SQL database. 
To do that, you will first get an Azure SQL database datastore instance and pass it to Data Prep for reading." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "datastore = Datastore(workspace=workspace, name='test_sql')\n", - "dflow_sql = dprep.read_sql(data_source=datastore, query='SELECT * FROM team')\n", - "dflow_sql.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can also read from a PostgreSQL database. To do that, you will first get a PostgreSQL database datastore instance and pass it to Data Prep for reading." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "datastore = Datastore(workspace=workspace, name='postgre_test')\n", - "dflow_sql = dprep.read_postgresql(data_source=datastore, query='SELECT * FROM public.people')\n", - "dflow_sql.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Write data to Datastore" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can also write a dataflow to a datastore. The code below will write the file you read in earlier to the folder in the datastore." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dest_datastore = Datastore(workspace, 'dataprep_blob_key')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow.write_to_csv(directory_path=dest_datastore.path('output/crime0-10')).run_local()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now you can read all the files in the `dataprep_adls` datastore which references an Azure Data Lake store." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "datastore = Datastore(workspace=workspace, name='dataprep_adls')\n", - "dflow_adls = dprep.read_csv(path=DataPath(datastore, path_on_datastore='/input/crime0-10.csv'))\n", - "dflow_adls.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now you can read all the files in the `dataprep_adlsgen2` datastore which references an ADLSGen2 Storage account." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# read a file from ADLSGen2\n", - "datastore = Datastore(workspace=workspace, name='adlsgen2')\n", - "dflow_adlsgen2 = dprep.read_csv(path=DataPath(datastore, path_on_datastore='/testfolder/peopletest.csv'))\n", - "dflow_adlsgen2.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# read all files from ADLSGen2 directory\n", - "datastore = Datastore(workspace=workspace, name='adlsgen2')\n", - "dflow_adlsgen2 = dprep.read_csv(path=DataPath(datastore, path_on_datastore='/testfolder/testdir'))\n", - "dflow_adlsgen2.head()" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." 
- }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/derive-column-by-example.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/derive-column-by-example.ipynb deleted file mode 100644 index 5d1db4ee..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/derive-column-by-example.ipynb +++ /dev/null @@ -1,187 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/how-to-guides/derive-column-by-example.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Derive Column By Example\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "One of the more advanced tools in Data Prep is the ability to derive columns by providing examples of desired results and letting Data Prep generate code to achieve the intended derivation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dprep.read_csv(path = '../data/crime-spring.csv')\n", - "df = dflow.head(5)\n", - "df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As you can see, this is a fairly simple file, but let's assume that we need to be able to join this with a dataset where date and time come in a format 'Apr 4, 2016 | 10PM-12AM'.\n", - "\n", - "Let's wrangle the data into the shape we need." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "builder = dflow.builders.derive_column_by_example(source_columns = ['Date'], new_column_name = 'date_timerange')\n", - "builder.add_example(source_data = df.iloc[0], example_value = 'Apr 4, 2016 10PM-12AM')\n", - "builder.preview() # will preview top 10 rows" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The code above first creates a builder for the derived column by providing an array of source columns to consider ('DATE') and name for the new column to be added.\n", - "\n", - "Then, we provide the first example by passing in the first row (index 0) of the DataFrame printed above and giving an expected value for the derived column.\n", - "\n", - "Finally, we call `builder.preview()` and observe the derived column next to the source column." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Everything looks good here. However, we just noticed that it's not quite what we wanted. We forgot to separate date and time range by '|' to generate the format we need.\n", - "\n", - "To fix that, we will add another example. This time, instead of passing in a row from the preview, we just construct a dictionary of column name to value for the source_data parameter." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "builder.add_example(source_data = {'Date': '4/15/2016 10:00'}, example_value = 'Apr 15, 2016 | 10AM-12PM')\n", - "builder.preview()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This clearly had negative effects, as now the only rows that have any values in derived column are the ones that match exactly with the examples we have provided.\n", - "\n", - "Let's look at the examples:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "examples = builder.list_examples()\n", - "examples" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here we can see that we have provided inconsistent examples. To fix the issue, we need to replace the first example with a correct one (including '|' between date and time).\n", - "\n", - "We can achieve this by deleting examples that are incorrect (by either passing in example_row from examples DataFrame, or by just passing in example_id value) and then adding new modified examples back." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "builder.delete_example(example_id = -1)\n", - "builder.add_example(examples.iloc[0], 'Apr 4, 2016 | 10PM-12AM')\n", - "builder.preview()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now this looks correct and we can finally call to_dataflow() on the builder, which would return a dataflow with the desired derived columns added." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = builder.to_dataflow()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = dflow.to_pandas_dataframe()\n", - "df" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/external-references.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/external-references.ipynb deleted file mode 100644 index 579d9087..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/external-references.ipynb +++ /dev/null @@ -1,118 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/how-to-guides/external-references.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# External References\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In addition to opening existing Dataflows in code and modifying them, it is also possible to create and persist Dataflows that reference another Dataflow that has been persisted to a .dprep file. 
In this case, executing this Dataflow will load and execute the referenced Dataflow dynamically, and then execute the steps in the referencing Dataflow." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To demonstrate, we will create a Dataflow that loads and transforms some data. After that, we will persist this Dataflow to disk. To learn more about saving and opening .dprep files, see: [Opening and Saving Dataflows](./open-save-dataflows.ipynb)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep\n", - "import tempfile\n", - "import os\n", - "\n", - "dflow = dprep.auto_read_file('../data/crime.txt')\n", - "dflow = dflow.drop_errors(['Column7', 'Column8', 'Column9'], dprep.ColumnRelationship.ANY)\n", - "dflow_path = os.path.join(tempfile.gettempdir(), 'package.dprep')\n", - "dflow.save(dflow_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we have a .dprep file, we can create a new Dataflow that references it." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_new = dprep.Dataflow.reference(dprep.ExternalReference(dflow_path))\n", - "dflow_new.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When executed, the new Dataflow returns the same results as the one we saved to the .dprep file. Since this reference is resolved on execution, updating the referenced Dataflow results in the changes being visible when re-executing the referencing Dataflow." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dflow.take(5)\n", - "dflow.save(dflow_path)\n", - "\n", - "dflow_new.head(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As we can see, even though we did not modify `dflow_new`, it now returns only 5 records, as the referenced Dataflow was updated with the result from `dflow.take(5)`." - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/filtering.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/filtering.ipynb deleted file mode 100644 index 545fd3ca..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/filtering.ipynb +++ /dev/null @@ -1,220 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/how-to-guides/filtering.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Filtering\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Azure ML Data Prep has the ability to filter out columns or rows using `Dataflow.drop_columns` or `Dataflow.filter`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# initial set up\n", - "import azureml.dataprep as dprep\n", - "from datetime import datetime\n", - "dflow = dprep.read_csv(path='../data/crime-spring.csv')\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Filtering columns" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To filter columns, use `Dataflow.drop_columns`. This method takes a list of columns to drop or a more complex argument called `ColumnSelector`." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Filtering columns with list of strings" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this example, `drop_columns` takes a list of strings. Each string should exactly match the desired column to drop." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dflow.drop_columns(['ID', 'Location Description', 'Ward', 'Community Area', 'FBI Code'])\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Filtering columns with regex" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Alternatively, a `ColumnSelector` can be used to drop columns that match a regex expression. In this example, we drop all the columns that match the expression `Column*|.*longitud|.*latitude`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dflow.drop_columns(dprep.ColumnSelector('Column*|.*longitud|.*latitude', True, True))\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Filtering rows" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To filter rows, use `DataFlow.filter`. This method takes an `Expression` as an argument, and returns a new dataflow with the rows in which the expression evaluates to `True`. Expressions are built by indexing the `Dataflow` with a column name (`dataflow['myColumn']`) and regular operators (`>`, `<`, `>=`, `<=`, `==`, `!=`)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Filtering rows with simple expressions" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Index into the Dataflow specifying the column name as a string argument `dataflow['column_name']` and in combination with one of the following standard operators `>, <, >=, <=, ==, !=`, build an expression such as `dataflow['District'] > 9`. 
Finally, pass the built expression into the `Dataflow.filter` function.\n", - "\n", - "In this example, `dataflow.filter(dataflow['District'] > 9)` returns a new dataflow with the rows in which the value of \"District\" is greater than 9.\n", - "\n", - "*Note that \"District\" is first converted to numeric, which allows us to build an expression comparing it against other numeric values.*" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dflow.to_number(['District'])\n", - "dflow = dflow.filter(dflow['District'] > 9)\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Filtering rows with complex expressions" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To filter using complex expressions, combine two or more simple expressions with the operators `&`, `|`, and `~`. Please note that the precedence of these operators is lower than that of the comparison operators; therefore, you'll need to use parentheses to group clauses together. \n", - "\n", - "In this example, `Dataflow.filter` returns a new dataflow with the rows in which \"Primary Type\" equals 'DECEPTIVE PRACTICE' and \"District\" is greater than or equal to 10." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dflow.to_number(['District'])\n", - "dflow = dflow.filter((dflow['Primary Type'] == 'DECEPTIVE PRACTICE') & (dflow['District'] >= 10))\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It is also possible to filter rows by combining more than one expression into a nested expression.\n", - "\n", - "*Note that `'Date'` and `'Updated On'` are first converted to datetime, which allows us to build an expression comparing them against other datetime values.*" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dflow.to_datetime(['Date', 'Updated On'], ['%Y-%m-%d %H:%M:%S'])\n", - "dflow = dflow.to_number(['District', 'Y Coordinate'])\n", - "comparison_date = datetime(2016,4,13)\n", - "dflow = dflow.filter(\n", - " ((dflow['Date'] > comparison_date) | (dflow['Updated On'] > comparison_date))\n", - " | ((dflow['Y Coordinate'] > 1900000) & (dflow['District'] > 10.0)))\n", - "dflow.head(5)" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/fuzzy-group.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/fuzzy-group.ipynb deleted file mode 100644 index 53f309a7..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/fuzzy-group.ipynb +++ /dev/null @@ -1,211 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/how-to-guides/fuzzy-group.png)" - ] - }, - { - "cell_type": "markdown",
- "metadata": {}, - "source": [ - "# Fuzzy Grouping\n", - "\n", - "Unprepared data often represents the same entity with multiple values; examples include different spellings, varying capitalizations, and abbreviations. This is common when working with data gathered from multiple sources or through human input. One way to canonicalize and reconcile these variants is to use Data Prep's fuzzy_group_column (also known as \"text clustering\") functionality.\n", - "\n", - "Data Prep inspects a column to determine clusters of similar values. A new column is added in which clustered values are replaced with the canonical value of its cluster, thus significantly reducing the number of distinct values. You can control the degree of similarity required for values to be clustered together, override canonical form, and set clusters if automatic clustering did not provide the desired results.\n", - "\n", - "Let's explore the capabilities of `fuzzy_group_column` by first reading in a dataset and inspecting it." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dprep.read_json(path='../data/json.json')\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As you can see above, the column `inspections.business.city` contains several forms of the city name \"San Francisco\".\n", - "Let's add a column with values replaced by the automatically detected canonical form. To do so call fuzzy_group_column() on an existing Dataflow:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_clean = dflow.fuzzy_group_column(source_column='inspections.business.city',\n", - " new_column_name='city_grouped',\n", - " similarity_threshold=0.8,\n", - " similarity_score_column_name='similarity_score')\n", - "dflow_clean.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The arguments `source_column` and `new_column_name` are required, whereas the others are optional.\n", - "If `similarity_threshold` is provided, it will be used to control the required similarity level for the values to be grouped together.\n", - "If `similarity_score_column_name` is provided, a second new column will be added to show similarity score between every pair of original and canonical values.\n", - "\n", - "In the resulting data set, you can see that all the different variations of representing \"San Francisco\" in the data were normalized to the same string, \"San Francisco\".\n", - "\n", - "But what if you want more control over what gets grouped, what doesn't, and what the canonical value should be? 
\n", - "\n", - "To get more control over grouping, canonical values, and exceptions, you need to use the `FuzzyGroupBuilder` class.\n", - "Let's see what it has to offer below:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "builder = dflow.builders.fuzzy_group_column(source_column='inspections.business.city',\n", - " new_column_name='city_grouped',\n", - " similarity_threshold=0.8,\n", - " similarity_score_column_name='similarity_score')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# calling learn() to get fuzzy groups\n", - "builder.learn()\n", - "builder.groups" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here you can see that `fuzzy_group_column` detected one group with four values that all map to \"San Francisco\" as the canonical value.\n", - "You can see the effects of changing the similarity threshold next:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "builder.similarity_threshold = 0.9\n", - "builder.learn()\n", - "builder.groups" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that you are using a similarity threshold of `0.9`, two distinct groups of values are generated.\n", - "\n", - "Let's tweak some of the detected groups before completing the builder and getting back the Dataflow with the resulting fuzzy grouped column." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "builder.similarity_threshold = 0.8\n", - "builder.learn()\n", - "groups = builder.groups\n", - "groups" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# change the canonical value for the first group\n", - "groups[0]['canonicalValue'] = 'SANFRAN'\n", - "duplicates = groups[0]['duplicates']\n", - "# remove the last duplicate value from the cluster\n", - "duplicates = duplicates[:-1]\n", - "# assign modified duplicate array back\n", - "groups[0]['duplicates'] = duplicates\n", - "# assign modified groups back to builder\n", - "builder.groups = groups\n", - "builder.groups" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, the canonical value is modified to be used for the single fuzzy group and removed 'S.F.' from this group's duplicates list.\n", - "\n", - "You can mutate the copy of the `groups` list from the builder (be careful to keep the structure of objects inside this list). After getting the desired groups in the list, you can update the builder with it.\n", - "\n", - "Now you can get a dataflow with the FuzzyGroup step in it." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_clean = builder.to_dataflow()\n", - "\n", - "df = dflow_clean.to_pandas_dataframe()\n", - "df" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." 
- }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/impute-missing-values.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/impute-missing-values.ipynb deleted file mode 100644 index a1b9e46e..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/impute-missing-values.ipynb +++ /dev/null @@ -1,147 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/how-to-guides/impute-missing-values.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Impute missing values\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Azure ML Data Prep has the ability to impute missing values in specified columns. In this case, we will attempt to impute the missing _Latitude_ and _Longitude_ values in the input data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# loading input data\n", - "dflow = dprep.read_csv(path= '../data/crime-spring.csv')\n", - "dflow = dflow.keep_columns(['ID', 'Arrest', 'Latitude', 'Longitude'])\n", - "dflow = dflow.to_number(['Latitude', 'Longitude'])\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The third record from input data has _Latitude_ and _Longitude_ missing. To impute those missing values, we can use `ImputeMissingValuesBuilder` to learn a fixed program which imputes the columns with either a calculated `MIN`, `MAX` or `MEAN` value or a `CUSTOM` value. When `group_by_columns` is specified, missing values will be imputed by group with `MIN`, `MAX` and `MEAN` calculated per group." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Firstly, let us quickly see check the `MEAN` value of _Latitude_ column." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_mean = dflow.summarize(group_by_columns=['Arrest'],\n", - " summary_columns=[dprep.SummaryColumnsValue(column_id='Latitude',\n", - " summary_column_name='Latitude_MEAN',\n", - " summary_function=dprep.SummaryFunction.MEAN)])\n", - "dflow_mean = dflow_mean.filter(dprep.col('Arrest') == 'FALSE')\n", - "dflow_mean.head(1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `MEAN` value of _Latitude_ looks good. So we will impute _Latitude_ with it. As for `Longitude`, we will impute it using `42` based on external knowledge." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# impute with MEAN\n", - "impute_mean = dprep.ImputeColumnArguments(column_id='Latitude',\n", - " impute_function=dprep.ReplaceValueFunction.MEAN)\n", - "# impute with custom value 42\n", - "impute_custom = dprep.ImputeColumnArguments(column_id='Longitude',\n", - " custom_impute_value=42)\n", - "# get instance of ImputeMissingValuesBuilder\n", - "impute_builder = dflow.builders.impute_missing_values(impute_columns=[impute_mean, impute_custom],\n", - " group_by_columns=['Arrest'])\n", - "# call learn() to learn a fixed program to impute missing values\n", - "impute_builder.learn()\n", - "# call to_dataflow() to get a dataflow with impute step added\n", - "dflow_imputed = impute_builder.to_dataflow()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# check impute result\n", - "dflow_imputed.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As shown in the result above, the missing _Latitude_ has been imputed with the `MEAN` value of the `Arrest=='false'` group, and the missing _Longitude_ has been imputed with `42`." - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/join.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/join.ipynb deleted file mode 100644 index 2f8c2e47..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/join.ipynb +++ /dev/null @@ -1,265 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/how-to-guides/join.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Join\n", - "\n", - "In Data Prep you can easily join two Dataflows." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, get the left side of the data into a shape that is ready for the join."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# get the first Dataflow and derive desired key column\n", - "dflow_left = dprep.read_csv(path='https://dpreptestfiles.blob.core.windows.net/testfiles/BostonWeather.csv')\n", - "dflow_left = dflow_left.derive_column_by_example(source_columns='DATE', new_column_name='date_timerange',\n", - " example_data=[('11/11/2015 0:54', 'Nov 11, 2015 | 12AM-2AM'),\n", - " ('2/1/2015 0:54', 'Feb 1, 2015 | 12AM-2AM'),\n", - " ('1/29/2015 20:54', 'Jan 29, 2015 | 8PM-10PM')])\n", - "dflow_left = dflow_left.drop_columns(['DATE'])\n", - "\n", - "# convert types and summarize data\n", - "dflow_left = dflow_left.set_column_types(type_conversions={'HOURLYDRYBULBTEMPF': dprep.TypeConverter(dprep.FieldType.DECIMAL)})\n", - "dflow_left = dflow_left.filter(expression=~dflow_left['HOURLYDRYBULBTEMPF'].is_error())\n", - "dflow_left = dflow_left.summarize(group_by_columns=['date_timerange'],summary_columns=[dprep.SummaryColumnsValue('HOURLYDRYBULBTEMPF', dprep.api.engineapi.typedefinitions.SummaryFunction.MEAN, 'HOURLYDRYBULBTEMPF_Mean')] )\n", - "\n", - "# cache the result so the steps above are not executed every time we pull on the data\n", - "import os\n", - "from pathlib import Path\n", - "cache_dir = str(Path(os.getcwd(), 'dataflow-cache'))\n", - "dflow_left.cache(directory_path=cache_dir)\n", - "dflow_left.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now let's prepare the data for the right side of the join." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# get the second Dataflow and desired key column\n", - "dflow_right = dprep.read_csv(path='https://dpreptestfiles.blob.core.windows.net/bike-share/*-hubway-tripdata.csv')\n", - "dflow_right = dflow_right.keep_columns(['starttime', 'start station id'])\n", - "dflow_right = dflow_right.derive_column_by_example(source_columns='starttime', new_column_name='l_date_timerange',\n", - " example_data=[('2015-01-01 00:21:44', 'Jan 1, 2015 | 12AM-2AM')])\n", - "dflow_right = dflow_right.drop_columns('starttime')\n", - "\n", - "# cache the results\n", - "dflow_right.cache(directory_path=cache_dir)\n", - "dflow_right.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "There are three ways you can join two Dataflows in Data Prep:\n", - "1. Create a `JoinBuilder` object for interactive join configuration.\n", - "2. Call ```join()``` on one of the Dataflows and pass in the other along with all other arguments.\n", - "3. Call ```Dataflow.join()``` method and pass in two Dataflows along with all other arguments.\n", - "\n", - "We will explore the builder object as it simplifies the determination of correct arguments. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# construct a builder for joining dataflow_l with dataflow_r\n", - "join_builder = dflow_left.builders.join(right_dataflow=dflow_right, left_column_prefix='l', right_column_prefix='r')\n", - "\n", - "join_builder" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "So far the builder has no properties set except default values.\n", - "From here you can set each of the options and preview its effect on the join result or use Data Prep to determine some of them.\n", - "\n", - "Let's start with determining appropriate column prefixes for left and right side of the join and lists of columns that would not conflict and therefore don't need to be prefixed." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "join_builder.detect_column_info()\n", - "join_builder" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can see that Data Prep has performed a pull on both Dataflows to determine the column names in them. Given that `dataflow_r` already had a column starting with `l_` new prefix got generated which would not collide with any column names that are already present.\n", - "Additionally columns in each Dataflow that won't conflict during join would remain unprefixed.\n", - "This apprach to column naming is crucial for join robustness to schema changes in the data. Let's say that at some time in future the data consumed by left Dataflow will also have `l_date_timerange` column in it.\n", - "Configured as above the join will still run as expected and the new column will be prefixed with `l2_` ensuring that ig column `l_date_timerange` was consumed by some other future transformation it remains unaffected.\n", - "\n", - "Note: `KEY_generated` is appended to both lists and is reserved for Data Prep use in case Autojoin is performed.\n", - "\n", - "### Autojoin\n", - "Autojoin is a Data prep feature that determines suitable join arguments given data on both sides. In some cases Autojoin can even derive a key column from a number of available columns in the data.\n", - "Here is how you can use Autojoin:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# generate join suggestions\n", - "join_builder.generate_suggested_join()\n", - "\n", - "# list generated suggestions\n", - "join_builder.list_join_suggestions()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now let's select the first suggestion and preview the result of the join." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# apply first suggestion\n", - "join_builder.apply_suggestion(0)\n", - "\n", - "join_builder.preview(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, get our new joined Dataflow." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_autojoined = join_builder.to_dataflow().drop_columns(['l_date_timerange'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Joining two Dataflows without pulling the data\n", - "\n", - "If you don't want to pull on data and know what join should look like, you can always use the join method on the Dataflow." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_joined = dprep.Dataflow.join(left_dataflow=dflow_left,\n", - " right_dataflow=dflow_right,\n", - " join_key_pairs=[('date_timerange', 'l_date_timerange')],\n", - " left_column_prefix='l2_',\n", - " right_column_prefix='r_')\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_joined.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_joined = dflow_joined.filter(expression=dflow_joined['r_start station id'] == '67')\n", - "df = dflow_joined.to_pandas_dataframe()\n", - "df" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.5" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/label-encoder.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/label-encoder.ipynb deleted file mode 100644 index bc7b78c1..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/label-encoder.ipynb +++ /dev/null @@ -1,168 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/how-to-guides/label-encoder.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Label Encoder\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Data Prep has the ability to encode labels with values between 0 and (number of classes - 1) using `label_encode`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep\n", - "from datetime import datetime\n", - "dflow = dprep.read_csv(path='../data/crime-spring.csv')\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To use `label_encode` from a Dataflow, simply specify the source column and the new column name. `label_encode` will figure out all the distinct values or classes in the source column, and it will return a new Dataflow with a new column containing the labels." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dflow.label_encode(source_column='Primary Type', new_column_name='Primary Type Label')\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To have more control over the encoded labels, create a builder with `dataflow.builders.label_encode`.\n", - "The builder allows you to preview and modify the encoded labels before generating a new Dataflow with the results. \n", - "To get started, create a builder object with `dataflow.builders.label_encode` specifying the source column and the new column name. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "builder = dflow.builders.label_encode(source_column='Location Description', new_column_name='Location Description Label')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To generate the encoded labels, call the `learn` method on the builder object:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "builder.learn()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To check the result, access the generated labels through the property `encoded_labels`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "builder.encoded_labels" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To modify the generated results, just assign a new value to `encoded_labels`. The following example adds a missing label not found in the sample data. `builder.encoded_labels` is saved into a variable `encoded_labels`, modified, and assigned back to `builder.encoded_labels`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "encoded_labels = builder.encoded_labels\n", - "encoded_labels['TOWNHOUSE'] = 6\n", - "\n", - "builder.encoded_labels = encoded_labels\n", - "builder.encoded_labels" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Once the desired results are achieved, call `builder.to_dataflow` to get the new Dataflow with the encoded labels." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dataflow = builder.to_dataflow()\n", - "dataflow.head(5)" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/min-max-scaler.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/min-max-scaler.ipynb deleted file mode 100644 index a7e5fd65..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/min-max-scaler.ipynb +++ /dev/null @@ -1,239 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/how-to-guides/min-max-scaler.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Min-Max Scaler\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The min-max scaler scales all values in a column to a desired range (typically [0, 1]). This is also known as feature scaling or unity-based normalization. 
Min-max scaling is commonly used to normalize numeric columns in a data set for machine learning algorithms." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, load a data set containing information about crime in Chicago. Keep only a few columns." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dprep.read_csv('../data/crime-spring.csv')\n", - "dflow = dflow.keep_columns(columns=['ID', 'District', 'FBI Code'])\n", - "dflow = dflow.to_number(columns=['District', 'FBI Code'])\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Using `get_profile()`, you can see the shape of the numeric columns such as the minimum, maximum, count, and number of error values." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow.get_profile()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To apply min-max scaling, call the function `min_max_scaler` on the Dataflow and specify the column name. This will trigger a full data scan over the column to determine the min and max values and perform the scaling. Note that the min and max values of the column are preserved at this point. If the same dataflow steps are performed over a different dataset, the min-max scaler must be re-executed." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_district = dflow.min_max_scale(column='District')\n", - "dflow_district.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Look at the data profile to see that the \"District\" column is now scaled; the min is 0 and the max is 1. Any error values and missing values from the source column are preserved." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_district.get_profile()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can also specify a custom range for the scaling. Instead of [0, 1], let's choose [-10, 10]." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_district_range = dflow.min_max_scale(column='District', range_min=-10, range_max=10)\n", - "dflow_district_range.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In some cases, you may want to manually provide the min and max of the data in the source column. For example, you may want to avoid a full data scan because the dataset is large and we already know the min and max. You can provide the known min and max to the `min_max_scaler` function. The column will be scaled using the provided values. For example, if you want to scale the `FBI Code` column with 6 (`data_min`) becoming 0 (`range_min`), the program will scan the data to get `data_max`, which will become 1 (`range_max`)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_fbi = dflow.min_max_scale(column='FBI Code', data_min=6)\n", - "dflow_fbi.get_profile()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Using a Min-Max Scaler builder" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For more flexibility when constructing the arguments for the min-max scaling, you can use a Min-Max Scaler builder." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "builder = dflow.builders.min_max_scale(column='District')\n", - "builder" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Calling `builder.learn()` will trigger a full data scan to see what `data_min` and `data_max` are. You can choose whether to use these values or set custom values." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "builder.learn()\n", - "builder" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you want to provide custom values for any of the arguments, you can update the builder object." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "builder.range_max = 10\n", - "builder.data_min = 6\n", - "builder" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When you are satisfied with the arguments, you will call `builder.to_dataflow()` to get the result. Note that the min and max values of the source column is preserved by the builder at this point. If you need to get the true `data_min` and `data_max` values again, you will need to set those arguments on the builder to `None` and then call `builder.learn()` again." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_builder = builder.to_dataflow()\n", - "dflow_builder.head(5)" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/one-hot-encoder.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/one-hot-encoder.ipynb deleted file mode 100644 index 72918540..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/one-hot-encoder.ipynb +++ /dev/null @@ -1,179 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/how-to-guides/one-hot-encoder.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# One Hot Encoder\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Azure ML Data Prep has the ability to perform one hot encoding on a selected column using `one_hot_encode`. 
The result Dataflow will have a new binary column for each categorical label encountered in the selected column." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep\n", - "dflow = dprep.read_csv(path='../data/crime-spring.csv')\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To use `one_hot_encode` from a Dataflow, simply specify the source column. `one_hot_encode` will figure out all the distinct values or categorical labels in the source column using the current data, and it will return a new Dataflow with a new binary column for each categorical label. Note that the categorical labels are remembered in the Dataflow step." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_result = dflow.one_hot_encode(source_column='Location Description')\n", - "dflow_result.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "By default, all the new columns will use the `source_column` name as a prefix. However, if you would like to specify your own prefix, simply pass a `prefix` string as a second parameter." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_result = dflow.one_hot_encode(source_column='Location Description', prefix='LOCATION_')\n", - "dflow_result.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To have more control over the categorical labels, create a builder using `dataflow.builders.one_hot_encode`. The builder allows to preview and modify the categorical labels before generating a new Dataflow with the results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "builder = dflow.builders.one_hot_encode(source_column='Location Description', prefix='LOCATION_')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To generate the categorical labels, call the `learn` method on the builder object:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "builder.learn()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To preview the categorical labels, simply access them through the property `categorical_labels` on the builder object:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "builder.categorical_labels" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To modify the generated `categorical_labels`, assign a new value to `categorical_labels` or modify the existing one. The following example adds a missing label not found on the sample data to `categorical_labels`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "builder.categorical_labels.append('TOWNHOUSE')\n", - "builder.categorical_labels" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Once the desired results are achieved, call `builder.to_dataflow` to get the new Dataflow with the encoded labels." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_result = builder.to_dataflow()\n", - "dflow_result.head(5)" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/open-save-dataflows.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/open-save-dataflows.ipynb deleted file mode 100644 index 8b17de4e..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/open-save-dataflows.ipynb +++ /dev/null @@ -1,184 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/how-to-guides/open-save-dataflows.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Opening and Saving Dataflows\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Once you have built a Dataflow, you can save it to a `.dprep` file. This persists all of the information in your Dataflow including steps you've added, examples and programs from by-example steps, computed aggregations, etc.\n", - "\n", - "You can also open `.dprep` files to access any Dataflows you have previously persisted." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Open\n", - "\n", - "Use the `open()` method of the Dataflow class to load existing `.dprep` files." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "dflow_path = os.path.join(os.getcwd(), '..', 'data', 'crime.dprep')\n", - "print(dflow_path)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.dataprep import Dataflow" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = Dataflow.open(dflow_path)\n", - "head = dflow.head(5)\n", - "head" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Edit\n", - "\n", - "After a Dataflow is loaded, it can be further edited as needed. In this example, a filter is added." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.dataprep import col" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dflow.filter(col('Description') != 'SIMPLE')\n", - "head = dflow.head(5)\n", - "head" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save\n", - "\n", - "Use the `save()` method of the Dataflow class to write out the `.dprep` file." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import tempfile\n", - "temp_dir = tempfile._get_default_tempdir()\n", - "temp_file_name = next(tempfile._get_candidate_names())\n", - "temp_dflow_path = os.path.join(temp_dir, temp_file_name + '.dprep')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow.save(temp_dflow_path)\n", - "temp_dflow_path" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Round-trip\n", - "\n", - "This illustrates the ability to load the edited Dataflow back in and use it, in this case to get a pandas DataFrame." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_to_open = Dataflow.open(temp_dflow_path)\n", - "df = dflow_to_open.to_pandas_dataframe()\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if os.path.isfile(temp_dflow_path):\n", - " os.remove(temp_dflow_path)" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/quantile-transformation.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/quantile-transformation.ipynb deleted file mode 100644 index 883bc5c8..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/quantile-transformation.ipynb +++ /dev/null @@ -1,91 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/how-to-guides/quantile-transformation.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Quantile Transformation\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "DataPrep has the ability to perform quantile transformation to a numeric column. This transformation can transform the data into a normal or uniform distribution. Values bigger than the learnt boundaries will simply be clipped to the learnt boundaries when applying quantile transformation.\n", - "\n", - "Let's load a sample of the median income of california households in different suburbs from the 1990 census data. From the data profile, we can see that the minimum value and maximum value is 0.9946 and 15 respectively." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep\n", - "\n", - "dflow = dprep.read_csv(path='../data/median_income.csv').set_column_types(type_conversions={\n", - " 'median_income': dprep.TypeConverter(dprep.FieldType.DECIMAL)\n", - "})\n", - "dflow.get_profile()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's now apply quantile transformation to `median_income` and see how that affects the data. We will apply quantile transformation twice, one that maps the data to a Uniform(0, 1) distribution, one that maps it to a Normal(0, 1) distribution.\n", - "\n", - "From the data profile, we can see that the min and max of the uniform median income is strictly between 0 and 1 and the mean and standard deviation of the normal median income is close to 0 and 1 respectively.\n", - "\n", - "*Note: for normal distribution, we will clip the values at the ends as the 0th percentile and the 100th percentile are -Inf and Inf respectively.*" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dflow.quantile_transform(source_column='median_income', new_column='median_income_uniform', quantiles_count=5)\n", - "dflow = dflow.quantile_transform(source_column='median_income', new_column='median_income_normal', \n", - " quantiles_count=5, output_distribution=\"Normal\")\n", - "dflow.get_profile()" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/random-split.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/random-split.ipynb deleted file mode 100644 index 7b8817de..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/random-split.ipynb +++ /dev/null @@ -1,170 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/how-to-guides/random-split.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Random Split\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Azure ML Data Prep provides the functionality of splitting a data set into two. When training a machine learning model, it is often desirable to train the model on a subset of data, then validate the model on a different subset." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `random_split(percentage, seed=None)` function in Data Prep takes in a Dataflow and randomly splitting it into two distinct subsets (approximately by the percentage specified)." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `seed` parameter is optional. If a seed is not provided, a stable one is generated, ensuring that the results for a specific Dataflow remain consistent. Different calls to `random_split` will receive different seeds." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To demonstrate, you can go through the following example. First, you can read the first 10,000 lines from a file. Since the contents of the file don't matter, just the first two columns can be used for a simple example." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dprep.read_csv(path='https://dpreptestfiles.blob.core.windows.net/testfiles/crime0.csv').take(10000)\n", - "dflow = dflow.keep_columns(['ID', 'Date'])\n", - "profile = dflow.get_profile()\n", - "print('Row count: %d' % (profile.columns['ID'].count))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, you can call `random_split` with the percentage set to 10% (the actual split ratio will be an approximation of `percentage`). You can take a look at the row count of the first returned Dataflow. You should see that `dflow_test` has approximately 1,000 rows (10% of 10,000)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "(dflow_test, dflow_train) = dflow.random_split(percentage=0.1)\n", - "profile_test = dflow_test.get_profile()\n", - "print('Row count of \"test\": %d' % (profile_test.columns['ID'].count))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now you can take a look at the row count of the second returned Dataflow. The row count of `dflow_test` and `dflow_train` sums exactly to 10,000, because `random_split` results in two subsets that make up the original Dataflow." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "profile_train = dflow_train.get_profile()\n", - "print('Row count of \"train\": %d' % (profile_train.columns['ID'].count))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To specify a fixed seed, simply provide it to the `random_split` function." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "(dflow_test, dflow_train) = dflow.random_split(percentage=0.1, seed=12345)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Multi-Split" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In addition to the random split demonstrated above, it is also possible to split a single Dataflow into multiple Dataflows, each containing a random exclusive subset of the overall data." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "total_dflow = dprep.read_csv(path='../data/crime-spring.csv')\n", - "subset_dflows = total_dflow.multi_split(4, seed=2) # Split in 4 parts, each part contains a random 25% of the data\n", - "print([dflow.row_count for dflow in subset_dflows])" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/replace-datasource-replace-reference.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/replace-datasource-replace-reference.ipynb deleted file mode 100644 index e8d62acf..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/replace-datasource-replace-reference.ipynb +++ /dev/null @@ -1,130 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/how-to-guides/replace-datasource-replace-reference.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Replace DataSource Reference\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A common practice when performing DataPrep is to build up a script or set of cleaning operations on a smaller example file locally. This is quicker and easier than dealing with large amounts of data initially.\n", - "\n", - "After building a Dataflow that performs the desired steps, it's time to run it against the larger dataset, which may be stored in the cloud, or even locally just in a different file. This is where we can use `Dataflow.replace_datasource` to get a Dataflow identical to the one built on the small data, but referencing the newly specified DataSource." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep\n", - "\n", - "dflow = dprep.read_csv('../data/crime-spring.csv')\n", - "df = dflow.to_pandas_dataframe()\n", - "df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here we have the first 10 rows of a dataset called 'Crime'. The original dataset is over 100MB (admittedly not that large of a dataset but this is just an example).\n", - "\n", - "We'll perform a few cleaning operations." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_dropped = dflow.drop_columns(['Location', 'Updated On', 'X Coordinate', 'Y Coordinate', 'Description'])\n", - "sctb = dflow_dropped.builders.set_column_types()\n", - "sctb.learn(inference_arguments=dprep.InferenceArguments(day_first=False))\n", - "dflow_typed = sctb.to_dataflow()\n", - "dflow_typed.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we have a Dataflow with all our desired steps, we're ready to run against the 'full' dataset stored in Azure Blob.\n", - "All we need to do is pass the BlobDataSource into `replace_datasource` and we'll get back an identical Dataflow with the new DataSource substituted in." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_replaced = dflow_typed.replace_datasource(dprep.BlobDataSource('https://dpreptestfiles.blob.core.windows.net/testfiles/crime0.csv'))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "'replaced_dflow' will now pull data from the 168MB (729734 rows) version of Crime0.csv stored in Azure Blob!\n", - "\n", - "NOTE: Dataflows can also be created by referencing a different Dataflow. Instead of using `replace_datasource`, there is a corresponding `replace_reference` method.\n", - "\n", - "We should be careful now since pulling all that data down and putting it in a pandas dataframe isn't an ideal way to inspect the result of our Dataflow. So instead, to see that our steps are being applied to all the new data, we can add a `take_sample` step, which will select records at random (based on a given probability) to be returned.\n", - "\n", - "The probability below takes the ~730000 rows down to a more inspectable ~73, though the number will vary each time `to_pandas_dataframe()` is run, since they are being randomly selected based on the probability." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_random_sample= dflow_replaced.take_sample(probability=0.0001)\n", - "sample = dflow_random_sample.to_pandas_dataframe()\n", - "sample" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." 
- }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/replace-fill-error.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/replace-fill-error.ipynb deleted file mode 100644 index 04dad995..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/replace-fill-error.ipynb +++ /dev/null @@ -1,239 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/how-to-guides/replace-fill-error.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Replace, Fill, Error\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can use the methods in this notebook to change values in your dataset.\n", - "\n", - "* replace - use this method to replace a value with another value. You can also use this to replace null with a value, or a value with null\n", - "* error - use this method to replace a value with an error.\n", - "* fill_nulls - this method lets you fill all nulls in a column with a certain value.\n", - "* fill_errors - this method lets you fill all errors in a column with a certain value." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dprep.read_csv('../data/crime-spring.csv')\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dflow.to_datetime('Date', ['%m/%d/%Y %H:%M'])\n", - "dflow = dflow.to_number(['IUCR', 'District', 'FBI Code'])\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Replace " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### String\n", - "Use `replace` to swap a string value with another string value." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dflow.replace('Primary Type', 'THEFT', 'STOLEN')\n", - "head = dflow.head(5)\n", - "head" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Use `replace` to remove a certain string value from the column, replacing it with null. Note that Pandas shows null values as None." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dflow.replace('Primary Type', 'DECEPTIVE PRACTICE', None)\n", - "head = dflow.head(5)\n", - "head" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Numeric\n", - "Use `replace` to swap a numeric value with another numeric value." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dflow.replace('District', 5, 1)\n", - "head = dflow.head(5)\n", - "head" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Date\n", - "Use `replace` to swap in a new Date for an existing Date in the data." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from datetime import datetime, timezone\n", - "dflow = dflow.replace('Date', \n", - " datetime(2016, 4, 15, 9, 0, tzinfo=timezone.utc), \n", - " datetime(2018, 7, 4, 0, 0, tzinfo=timezone.utc))\n", - "head = dflow.head(5)\n", - "head" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Error \n", - "\n", - "The `error` method lets you create Error values. You can pass to this function the value that you want to find, along with the Error code to use in any Errors created." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dflow.error('IUCR', 890, 'Invalid value')\n", - "head = dflow.head(5)\n", - "head" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Fill Nulls \n", - "\n", - "Use the `fill_nulls` method to replace all null values in columns with another value. This is similar to Panda's fillna() method." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dflow.fill_nulls('Primary Type', 'N/A')\n", - "head = dflow.head(5)\n", - "head" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Fill Errors \n", - "\n", - "Use the `fill_errors` method to replace all error values in columns with another value." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dflow.fill_errors('IUCR', -1)\n", - "head = dflow.head(5)\n", - "head" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/secrets.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/secrets.ipynb deleted file mode 100644 index c77169c9..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/secrets.ipynb +++ /dev/null @@ -1,140 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/how-to-guides/secrets.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Providing Secrets\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Currently, secrets are only persisted for the lifetime of the engine process. Even if the dataflow is saved to a file, the secrets are not persisted in the dprep file. If you started a new session (i.e. start a new engine process), loaded a dataflow and wanted to run it, you will need to call `use_secrets` to register the required secrets to use during execution, otherwise the execution will fail as the required secrets are not available.\n", - "\n", - "In this notebook, we will:\n", - "1. Loading a previously saved dataflow\n", - "2. 
Call `get_missing_secrets` to determine the missing secrets\n", - "3. Call `use_secrets` and pass in the missing secrets to register it with the engine for this session\n", - "4. Call `head` to see the a preview of the data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep\n", - "\n", - "import os" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's load the previously saved dataflow." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dprep.Dataflow.open(file_path='../data/secrets.dprep')\n", - "dflow" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can call `get_missing_secrets` to see which required secrets are missing in the engine." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow.get_missing_secrets()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can now read the secrets from an environment variable, put it in a secret dictionary, and call `use_secrets` with the secrets. This will register the secrets in the engine so you don't need to provide them again in this session.\n", - "\n", - "_Note: It is a bad practice to have secrets in files that will be checked into source control._" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sas = os.environ['SCENARIOS_SECRETS']\n", - "secrets = {\n", - " 'https://dpreptestfiles.blob.core.windows.net/testfiles/read_csv_duplicate_headers.csv': sas\n", - "}\n", - "dflow.use_secrets(secrets=secrets)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can now call `head` without passing in `secrets` and the engine will successfully execute. Here is a preview of the data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow.head(5)" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/semantic-types.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/semantic-types.ipynb deleted file mode 100644 index 266353df..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/semantic-types.ipynb +++ /dev/null @@ -1,164 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/how-to-guides/semantic-types.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Semantic Types\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Some string values can be recognized as semantic types. 
For example, email addresses, US zip codes or IP addresses have specific formats that can be recognized, and then split in specific ways.\n", - "\n", - "When getting a DataProfile you can optionally ask to collect counts of values recognized as semantic types. [`Dataflow.get_profile()`](./data-profile.ipynb) executes the Dataflow, calculates profile information, and returns a newly constructed DataProfile. Semantic type counts can be included in the data profile by calling `get_profile` with the `include_stype_counts` argument set to true.\n", - "\n", - "The `stype_counts` property of the DataProfile will then include entries for columns where some semantic types were recognized for some values." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep\n", - "dflow = dprep.read_json(path='../data/json.json')\n", - "\n", - "profile = dflow.get_profile(include_stype_counts=True)\n", - "\n", - "print(\"row count: \" + str(profile.row_count))\n", - "profile.stype_counts" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To see all the supported semantic types, you can examine the `SType` enumeration. More types will be added over time." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "[t.name for t in dprep.SType]\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can filter the found semantic types down to just those where all non-empty values matched. The `DataProfile.stype_counts` gives a list of semantic type counts for each column, where at least some matches were found. Those lists are in desecending order of count, so here we consider only the first in each list, as that will be the one with the highest count of values that match.\n", - "\n", - "In this example, the column `inspections.business.postal_code` looks to be a US zip code." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "stypes_counts = profile.stype_counts\n", - "all_match = [\n", - " (column, stypes_counts[column][0].stype)\n", - " for column in stypes_counts\n", - " if profile.row_count - profile.columns[column].empty_count == stypes_counts[column][0].count\n", - "]\n", - "all_match" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can use semantic types to compute new columns. The new columns are the values split up into elements, or canonicalized.\n", - "\n", - "Here we reduce our data down to just the `postal` column so we can better see what a `split_stype` operation can do." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_postal = dflow.keep_columns(['inspections.business.postal_code']).rename_columns({'inspections.business.postal_code': 'postal'})\n", - "dflow_postal.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "With `SType.ZipCode`, values are split into their basic five digit zip code and the plus-four add-on of the Zip+4 format." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_split = dflow_postal.split_stype('postal', dprep.SType.ZIPCODE)\n", - "dflow_split.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`split_stype` also allows you to specify the fields of the stype to use and the name of the new columns. For example, if you just needed to strip the plus four from our zip codes, you could use this." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_no_plus4 = dflow_postal.split_stype('postal', dprep.SType.ZIPCODE, ['zip'], ['zipNoPlus4'])\n", - "dflow_no_plus4.head(5)" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/split-column-by-example.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/split-column-by-example.ipynb deleted file mode 100644 index 02c74746..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/split-column-by-example.ipynb +++ /dev/null @@ -1,220 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/how-to-guides/split-column-by-example.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Split column by example\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "DataPrep also offers you a way to easily split a column into multiple columns.\n", - "The SplitColumnByExampleBuilder class lets you generate a proper split program that will work even when the cases are not trivial, like in example below." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dprep.read_lines(path='../data/crime.txt')\n", - "df = dflow.head(10)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df['Line'].iloc[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As you can see above, you can't split this particular file by space character as it will create too many columns.\n", - "That's where split_column_by_example could be quite useful." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "builder = dflow.builders.split_column_by_example('Line', keep_delimiters=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "builder.preview()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Couple things to take note of here. No examples were given, and yet DataPrep was able to generate quite reasonable split program. \n", - "We have passed keep_delimiters=True so we can see all the data split into columns. In practice, though, delimiters are rarely useful, so let's exclude them." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "builder.keep_delimiters = False\n", - "builder.preview()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This looks pretty good already, except that one case number is split into 2 columns. Taking the first row as an example, we want to keep case number as \"HY329907\" instead of \"HY\" and \"329907\" seperately. \n", - "If we request generation of suggested examples we will get a list of examples that require input." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "suggestions = builder.generate_suggested_examples()\n", - "suggestions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "suggestions.iloc[0]['Line']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Having retrieved source value we can now provide an example of desired split.\n", - "Notice that we chose not to split date and time but rather keep them together in one column." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "builder.add_example(example=(suggestions['Line'].iloc[0], ['10140490','HY329907','7/5/2015 23:50','050XX N NEWLAND AVE','820','THEFT']))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "builder.preview()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As we can see from the preview, some of the crime types (`Line_6`) do not show up as expected. Let's try to add one more example. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "builder.add_example(example=(df['Line'].iloc[1],['10139776','HY329265','7/5/2015 23:30','011XX W MORSE AVE','460','BATTERY']))\n", - "builder.preview()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This looks just like what we need. Let's get a dataflow with splited columns and drop original column." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = builder.to_dataflow()\n", - "dflow = dflow.drop_columns(['Line'])\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we have successfully split the data into useful columns through examples." 
- ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/subsetting-sampling.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/subsetting-sampling.ipynb deleted file mode 100644 index 046edbae..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/subsetting-sampling.ipynb +++ /dev/null @@ -1,240 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/how-to-guides/subsetting-sampling.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Sampling and Subsetting\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Once a Dataflow has been created, it is possible to act on only a subset of the records contained in it. This can help when working with very large datasets or when only a portion of the records is truly relevant." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Head\n", - "\n", - "The `head` method will take the number of records specified, run them through the transformations in the Dataflow, and then return the result as a Pandas dataframe." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep\n", - "\n", - "dflow = dprep.read_csv('../data/crime_duplicate_headers.csv')\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Take\n", - "\n", - "The `take` method adds a step to the Dataflow that will keep the number of records specified (counting from the beginning) and drop the rest. Unlike `head`, which does not modify the Dataflow, all operations applied on a Dataflow on which `take` has been applied will affect only the records kept." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_top_five = dflow.take(5)\n", - "dflow_top_five.to_pandas_dataframe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Skip\n", - "\n", - "It is also possible to skip a certain number of records in a Dataflow, such that transformations are only applied after a specific point. Depending on the underlying data source, a Dataflow with a `skip` step might still have to scan through the data in order to skip past the records." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_skip_top_one = dflow_top_five.skip(1)\n", - "dflow_skip_top_one.to_pandas_dataframe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Take Sample\n", - "\n", - "In addition to taking records from the top of the dataset, it's also possible to take a random sample of the dataset. 
This is done through the `take_sample(probability, seed=None)` method. This method will scan through all of the records available in the Dataflow and include them based on the probability specified. The `seed` parameter is optional. If a seed is not provided, a stable one is generated, ensuring that the results for a specific Dataflow remain consistent. Different calls to `take_sample` will receive different seeds." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_sampled = dflow.take_sample(0.1)\n", - "dflow_sampled.to_pandas_dataframe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`skip`, `take`, and `take_sample` can all be combined. With this, we can achieve behaviors like getting a random 10% sample fo the middle N records of a dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "seed = 1\n", - "dflow_nested_sample = dflow.skip(1).take(5).take_sample(0.5, seed)\n", - "dflow_nested_sample.to_pandas_dataframe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Take Stratified Sample\n", - "Besides sampling all by a probability, we also have stratified sampling, provided the strata and strata weights, the probability to sample each stratum with.\n", - "This is done through the `take_stratified_sample(columns, fractions, seed=None)` method.\n", - "For all records, we will group each record by the columns specified to stratify, and based on the stratum x weight information in `fractions`, include said record.\n", - "\n", - "Seed behavior is same as in `take_sample`.\n", - "\n", - "If a stratum is not specified or the record cannot be grouped by said stratum, we default the weight to sample by to 0 (it will not be included).\n", - "\n", - "The order of `fractions` must match the order of `columns`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fractions = {}\n", - "fractions[('ASSAULT',)] = 0.5\n", - "fractions[('BATTERY',)] = 0.2\n", - "fractions[('ARSON',)] = 0.5\n", - "fractions[('THEFT',)] = 1.0\n", - "\n", - "columns = ['Primary Type']\n", - "\n", - "single_strata_sample = dflow.take_stratified_sample(columns=columns, fractions = fractions, seed = 42)\n", - "single_strata_sample.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Stratified sampling on multiple columns is also supported." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fractions = {}\n", - "fractions[('ASSAULT', '560')] = 0.5\n", - "fractions[('BATTERY', '460')] = 0.2\n", - "fractions[('ARSON', '1020')] = 0.5\n", - "fractions[('THEFT', '820')] = 1.0\n", - "\n", - "columns = ['Primary Type', 'IUCR']\n", - "\n", - "multi_strata_sample = dflow.take_stratified_sample(columns=columns, fractions = fractions, seed = 42)\n", - "multi_strata_sample.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Selecting Partitions\n", - "\n", - "The data produced by Dataflows is processed in partitions. How different data sources and formats are partitioned is guaranteed to be stable for a specific execution mode and version of azureml.dataprep. Usually, these partitions should not be interacted directly and instead higher-level APIs should be leveraged. 
In certain advanced scenarios, however, it can be useful to create a Dataflow that contains only a subset of the partitions of another. The `select_partitions` method can help accomplish this." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "partition_count = dflow.get_partition_count()\n", - "# We'll keep only even-numbered partitions\n", - "desired_partitions = [p for p in range(0, partition_count) if p % 2 == 0]\n", - "subset_dflow = dflow.select_partitions(desired_partitions)\n", - "\n", - "subset_dflow.to_pandas_dataframe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Caching\n", - "It is usually a good idea to cache the sampled Dataflow for later uses.\n", - "\n", - "See [here](cache.ipynb) for more details about caching." - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/summarize.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/summarize.ipynb deleted file mode 100644 index 56a37bee..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/summarize.ipynb +++ /dev/null @@ -1,590 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/how-to-guides/summarize.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Summarize\n", - "\n", - "Azure ML Data Prep can help summarize your data by providing you a synopsis based on aggregates over specific columns.\n", - "\n", - "## Table of Contents\n", - "[Overview](#overview)
\n", - "[Summmary Functions](#summary)
\n", - "* [SummaryFunction.MIN](#min)
\n", - "* [SummaryFunction.MAX](#max)
\n", - "* [SummaryFunction.MEAN](#mean)
\n", - "* [SummaryFunction.MEDIAN](#median)
\n", - "* [SummaryFunction.VAR](#var)
\n", - "* [SummaryFunction.SD](#sd)
\n", - "* [SummaryFunction.COUNT](#count)
\n", - "* [SummaryFunction.SUM](#sum)
\n", - "* [SummaryFunction.SKEWNESS](#skewness)
\n", - "* [SummaryFunction.KURTOSIS](#kurtosis)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Overview\n", - "Before we drill down into each aggregate function, let us observe `summarize` end to end.\n", - "\n", - "We will start by reading some data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep\n", - "dflow = dprep.auto_read_file(path='../data/crime-dirty.csv')\n", - "dflow.head(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next we count (`SummaryFunction.COUNT`) the number of rows with column ID with non-null values grouped by Primary Type." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_summarize = dflow.summarize(\n", - " summary_columns=[\n", - " dprep.SummaryColumnsValue(\n", - " column_id='ID',\n", - " summary_column_name='Primary Type ID Counts', \n", - " summary_function=dprep.SummaryFunction.COUNT)],\n", - " group_by_columns=['Primary Type'])\n", - "dflow_summarize.head(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If we choose to not group by anything, we will instead get a single record over the entire dataset. Here we will get the number of rows that have the column ID with non-null values." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_summarize_nogroup = dflow.summarize(\n", - " summary_columns=[\n", - " dprep.SummaryColumnsValue(\n", - " column_id='ID',\n", - " summary_column_name='ID Count', \n", - " summary_function=dprep.SummaryFunction.COUNT)])\n", - "dflow_summarize_nogroup.head(1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Conversely, we can group by multiple columns." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_summarize_2group = dflow.summarize(\n", - " summary_columns=[\n", - " dprep.SummaryColumnsValue(\n", - " column_id='ID',\n", - " summary_column_name='Primary Type & Location Description ID Counts', \n", - " summary_function=dprep.SummaryFunction.COUNT)],\n", - " group_by_columns=['Primary Type', 'Location Description'])\n", - "dflow_summarize_2group.head(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In a similar vein, we can compute multiple aggregates in a single summary. Each aggregate function is independent and it is possible to aggregate the same column multiple times." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_summarize_multi_agg = dflow.summarize(\n", - " summary_columns=[\n", - " dprep.SummaryColumnsValue(\n", - " column_id='ID',\n", - " summary_column_name='Primary Type ID Counts', \n", - " summary_function=dprep.SummaryFunction.COUNT),\n", - " dprep.SummaryColumnsValue(\n", - " column_id='ID',\n", - " summary_column_name='Primary Type Min ID', \n", - " summary_function=dprep.SummaryFunction.MIN),\n", - " dprep.SummaryColumnsValue(\n", - " column_id='Date',\n", - " summary_column_name='Primary Type Max Date', \n", - " summary_function=dprep.SummaryFunction.MAX)],\n", - " group_by_columns=['Primary Type'])\n", - "dflow_summarize_multi_agg.head(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If we wanted this summary data back into our original data set, we can make use of `join_back` and optionally `join_back_columns_prefix` for easy naming distinctions. Summary columns will be added to the end. `group_by_columns` is not necessary for using `join_back`, however the behavior will be more like an append instead of a join." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_summarize_join = dflow.summarize(\n", - " summary_columns=[\n", - " dprep.SummaryColumnsValue(\n", - " column_id='ID',\n", - " summary_column_name='Primary Type ID Counts', \n", - " summary_function=dprep.SummaryFunction.COUNT)],\n", - " group_by_columns=['Primary Type'],\n", - " join_back=True,\n", - " join_back_columns_prefix='New_')\n", - "dflow_summarize_join.head(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Summary Functions\n", - "Here we will go over all the possible aggregates in Data Prep.\n", - "The most up to date set of functions can be found by enumerating the `SummaryFunction` enum." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep\n", - "[x.name for x in dprep.SummaryFunction]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### SummaryFunction.MIN\n", - "Data Prep can aggregate and find the minimum value of a column." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep\n", - "dflow = dprep.auto_read_file(path='../data/crime-dirty.csv')\n", - "dflow_min = dflow.summarize(\n", - " summary_columns=[\n", - " dprep.SummaryColumnsValue(\n", - " column_id='Date',\n", - " summary_column_name='Primary Type Min Date', \n", - " summary_function=dprep.SummaryFunction.MIN)],\n", - " group_by_columns=['Primary Type'])\n", - "dflow_min.head(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### SummaryFunction.MAX\n", - "Data Prep can find the maximum value of a column." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep\n", - "dflow = dprep.auto_read_file(path='../data/crime-dirty.csv')\n", - "dflow_min = dflow.summarize(\n", - " summary_columns=[\n", - " dprep.SummaryColumnsValue(\n", - " column_id='Date',\n", - " summary_column_name='Primary Type Max Date', \n", - " summary_function=dprep.SummaryFunction.MAX)],\n", - " group_by_columns=['Primary Type'])\n", - "dflow_min.head(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### SummaryFunction.MEAN\n", - "Data Prep can find the statistical mean of a column." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep\n", - "dflow = dprep.auto_read_file(path='../data/crime-dirty.csv')\n", - "dflow_min = dflow.summarize(\n", - " summary_columns=[\n", - " dprep.SummaryColumnsValue(\n", - " column_id='Latitude',\n", - " summary_column_name='Primary Type Latitude Mean', \n", - " summary_function=dprep.SummaryFunction.MEAN)],\n", - " group_by_columns=['Primary Type'])\n", - "dflow_min.head(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### SummaryFunction.MEDIAN\n", - "Data Prep can find the median value of a column." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep\n", - "dflow = dprep.auto_read_file(path='../data/crime-dirty.csv')\n", - "dflow_min = dflow.summarize(\n", - " summary_columns=[\n", - " dprep.SummaryColumnsValue(\n", - " column_id='Latitude',\n", - " summary_column_name='Primary Type Latitude Median', \n", - " summary_function=dprep.SummaryFunction.MEDIAN)],\n", - " group_by_columns=['Primary Type'])\n", - "dflow_min.head(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### SummaryFunction.VAR\n", - "Data Prep can find the statistical variance of a column. We will need more than one data point to calculate this, otherwise we will be unable to give results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep\n", - "dflow = dprep.auto_read_file(path='../data/crime-dirty.csv')\n", - "dflow_min = dflow.summarize(\n", - " summary_columns=[\n", - " dprep.SummaryColumnsValue(\n", - " column_id='Latitude',\n", - " summary_column_name='Primary Type Latitude Variance', \n", - " summary_function=dprep.SummaryFunction.VAR)],\n", - " group_by_columns=['Primary Type'])\n", - "dflow_min.head(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that despite there being two cases of BATTERY, one of them is missing geographical location, thus only CRIMINAL DAMAGE can yield variance information. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### SummaryFunction.SD\n", - "Data Prep can find the standard deviation of a column. We will need more than one data point to calculate this, otherwise we will be unable to give results." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep\n", - "dflow = dprep.auto_read_file(path='../data/crime-dirty.csv')\n", - "dflow_min = dflow.summarize(\n", - " summary_columns=[\n", - " dprep.SummaryColumnsValue(\n", - " column_id='Latitude',\n", - " summary_column_name='Primary Type Latitude Standard Deviation', \n", - " summary_function=dprep.SummaryFunction.SD)],\n", - " group_by_columns=['Primary Type'])\n", - "dflow_min.head(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Similar to when we calculate variance, despite there being two cases of BATTERY, one of them is missing geographical location, thus only CRIMINAL DAMAGE can yield variance information. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### SummaryFunction.COUNT\n", - "Data Prep can count the number of rows that have a column with non-null values." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep\n", - "dflow = dprep.auto_read_file(path='../data/crime-dirty.csv')\n", - "dflow_min = dflow.summarize(\n", - " summary_columns=[\n", - " dprep.SummaryColumnsValue(\n", - " column_id='Latitude',\n", - " summary_column_name='Primary Type Latitude Count', \n", - " summary_function=dprep.SummaryFunction.COUNT)],\n", - " group_by_columns=['Primary Type'])\n", - "dflow_min.head(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that despite there being two cases of BATTERY, one of them is missing geographical location, thus when we group by Primary Type, we only get a count of one for Latitude." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### SummaryFunction.SUM\n", - "Data Prep can aggregate and sum the values of a column. Our dataset does not have many numerical facts, but here we sum IDs grouped by Primary Type." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep\n", - "dflow = dprep.auto_read_file(path='../data/crime-dirty.csv')\n", - "dflow_min = dflow.summarize(\n", - " summary_columns=[\n", - " dprep.SummaryColumnsValue(\n", - " column_id='ID',\n", - " summary_column_name='Primary Type ID Sum', \n", - " summary_function=dprep.SummaryFunction.SUM)],\n", - " group_by_columns=['Primary Type'])\n", - "dflow_min.head(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### SummaryFunction.SKEWNESS\n", - "Data Prep can calculate the skewness of data in a column. We will need more than one data point to calculate this, otherwise we will be unable to give results." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep\n", - "dflow = dprep.auto_read_file(path='../data/crime-dirty.csv')\n", - "dflow_min = dflow.summarize(\n", - " summary_columns=[\n", - " dprep.SummaryColumnsValue(\n", - " column_id='Latitude',\n", - " summary_column_name='Primary Type Latitude Skewness', \n", - " summary_function=dprep.SummaryFunction.SKEWNESS)],\n", - " group_by_columns=['Primary Type'])\n", - "dflow_min.head(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### SummaryFunction.KURTOSIS\n", - "Data Prep can calculate the kurtosis of data in a column. We will need more than one data point to calculate this, otherwise we will be unable to give results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep\n", - "dflow = dprep.auto_read_file(path='../data/crime-dirty.csv')\n", - "dflow_min = dflow.summarize(\n", - " summary_columns=[\n", - " dprep.SummaryColumnsValue(\n", - " column_id='Latitude',\n", - " summary_column_name='Primary Type Latitude Kurtosis', \n", - " summary_function=dprep.SummaryFunction.KURTOSIS)],\n", - " group_by_columns=['Primary Type'])\n", - "dflow_min.head(10)" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/working-with-file-streams.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/working-with-file-streams.ipynb deleted file mode 100644 index fb30ae5e..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/working-with-file-streams.ipynb +++ /dev/null @@ -1,212 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/how-to-guides/working-with-file-streams.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Working With File Streams\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In addition to loading and parsing tabular data (see [here](./data-ingestion.ipynb) for more details), Data Prep also supports a variety of operations on raw file streams. \n", - "\n", - "File streams are usually created by calling `Dataflow.get_files`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dprep.Dataflow.get_files(path='../data/*.csv')\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The result of this operation is a Dataflow with a single column named \"Path\". This column contains values of type `StreamInfo`, each of which represents a different file matched by the search pattern specified when calling `get_files`. The string representation of a `StreamInfo` follows this pattern:\n", - "\n", - "StreamInfo(_Location_://_ResourceIdentifier_\\[_Arguments_\\])\n", - "\n", - "Location is the type of storage where the stream is located (e.g. Azure Blob, Local, or ADLS); ResouceIdentifier is the name of the file within that storage, such as a file path; and Arguments is a list of arguments required to load and read the file.\n", - "\n", - "On their own, `StreamInfo` objects are not particularly useful; however, you can use them as input to other functions." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Retrieving File Names\n", - "\n", - "In the example above, we matched a set of CSV files by using a search pattern and got back a column with several `StreamInfo` objects, each representing a different file. Now, we will extract the file path and name for each of these values into a new string column." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dflow.add_column(expression=dprep.get_stream_name(dflow['Path']),\n", - " new_column_name='FilePath',\n", - " prior_column='Path')\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `get_stream_name` function will return the full name of the file referenced by a `StreamInfo`. In the case of a local file, this will be an absolute path. From here, you can use the `derive_column_by_example` method to extract just the file name." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "first_file_path = dflow.head(1)['FilePath'][0]\n", - "first_file_name = os.path.basename(first_file_path)\n", - "dflow = dflow.derive_column_by_example(new_column_name='FileName',\n", - " source_columns=['FilePath'],\n", - " example_data=(first_file_path, first_file_name))\n", - "dflow = dflow.drop_columns(['FilePath'])\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Writing Streams\n", - "\n", - "Whenever you have a column containing `StreamInfo` objects, it's possible to write these out to any of the locations Data Prep supports. You can do this by calling `Dataflow.write_streams`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow.write_streams(streams_column='Path', base_path=dprep.LocalFileOutput('./test_out/')).run_local()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `base_path` parameter specifies the location the files will be written to. By default, the name of the file will be the resource identifier of the stream with any invalid characters replaced by `_`. In the case of streams referencing local files, this would be the full path of the original file. 
You can also specify the desired file names by referencing a column containing them:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow.write_streams(streams_column='Path',\n", - " base_path=dprep.LocalFileOutput('./test_out/'),\n", - " file_names_column='FileName').run_local()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Using this functionality, you can transfer files from any source to any destination supported by Data Prep. In addition, since the streams are just values in the Dataflow, you can use all of the functionality available.\n", - "\n", - "Here, for example, we will write out only the files that start with the prefix \"crime-\". The resulting file names will have the prefix stripped and will be written to a folder named \"crime\"." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "prefix = 'crime-'\n", - "dflow = dflow.filter(dflow['FileName'].starts_with(prefix))\n", - "dflow = dflow.add_column(expression=dflow['FileName'].substring(len(prefix)),\n", - " new_column_name='CleanName',\n", - " prior_column='FileName')\n", - "dflow.write_streams(streams_column='Path',\n", - " base_path=dprep.LocalFileOutput('./test_out/crime/'),\n", - " file_names_column='CleanName').run_local()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Converting Data Into Streams\n", - "\n", - "Tabular data can be easily converted into a series of streams containing the data expressed in a binary or text format. These streams can then be written out using the capabilities outlined above. The number of resulting streams will depend on the number of partitions in the input data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tabular_dflow = dprep.auto_read_file('../data/crime-full.csv')\n", - "streams_dflow = tabular_dflow.to_parquet_streams()\n", - "streams_dflow.head(1)" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/writing-data.ipynb b/how-to-use-azureml/work-with-data/dataprep/how-to-guides/writing-data.ipynb deleted file mode 100644 index bfbe3865..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/how-to-guides/writing-data.ipynb +++ /dev/null @@ -1,183 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/how-to-guides/writing-data.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Writing Data\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It is possible to write out the data at any point in a Dataflow. 
These writes are added as steps to the resulting Dataflow and will be executed every time the Dataflow is executed. Since there are no limitations to how many write steps there are in a pipeline, this makes it easy to write out intermediate results for troubleshooting or to be picked up by other pipelines.\n", - "\n", - "It is important to note that the execution of each write results in a full pull of the data in the Dataflow. For example, a Dataflow with three write steps will read and process every record in the dataset three times." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Writing to Files" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Data can be written to files in any of our supported locations (Local File System, Azure Blob Storage, and Azure Data Lake Storage). In order to parallelize the write, the data is written to multiple partition files. A sentinel file named SUCCESS is also output once the write has completed. This makes it possible to identify when an intermediate write has completed without having to wait for the whole pipeline to complete.\n", - "\n", - "> When running a Dataflow in Spark, attempting to execute a write to an existing folder will fail. It is important to ensure the folder is empty or use a different target location per execution.\n", - "\n", - "The following file formats are currently supported:\n", - "- Delimited Files (CSV, TSV, etc.)\n", - "- Parquet Files" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We'll start by loading data into a Dataflow which will be re-used with different formats." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow = dprep.auto_read_file('../data/crime.txt')\n", - "dflow = dflow.to_number('Column2')\n", - "dflow.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Delimited Files" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here we create a dataflow with a write step.\n", - "\n", - "This operation is lazy until we invoke `run_local` (or any operation that forces execution like `to_pandas_dataframe`), only then will we execute the write operation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_write = dflow.write_to_csv(directory_path=dprep.LocalFileOutput('./test_out/'))\n", - "\n", - "dflow_write.run_local()\n", - "\n", - "dflow_written_files = dprep.read_csv('./test_out/part-*')\n", - "dflow_written_files.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The data we wrote out contains several errors in the numeric columns due to numbers that we were unable to parse. When written out to CSV, these are replaced with the string \"ERROR\" by default. We can parameterize this as part of our write call. In the same vein, it is also possible to set what string to use to represent null values." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_write_errors = dflow.write_to_csv(directory_path=dprep.LocalFileOutput('./test_out/'), \n", - " error='BadData',\n", - " na='NA')\n", - "dflow_write_errors.run_local()\n", - "dflow_written = dprep.read_csv('./test_out/part-*')\n", - "dflow_written.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Parquet Files" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Similar to `write_to_csv`, `write_to_parquet` returns a new Dataflow with a Write Parquet Step which hasn't been executed yet.\n", - "\n", - "Then we run the Dataflow with `run_local`, which executes the write operation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dflow_write_parquet = dflow.write_to_parquet(directory_path=dprep.LocalFileOutput('./test_parquet_out/'),\n", - " error='MiscreantData')\n", - "\n", - "dflow_write_parquet.run_local()\n", - "\n", - "dflow_written_parquet = dprep.read_parquet_file('./test_parquet_out/part-*')\n", - "dflow_written_parquet.head(5)" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/dataprep/tutorials/getting-started/getting-started.ipynb b/how-to-use-azureml/work-with-data/dataprep/tutorials/getting-started/getting-started.ipynb deleted file mode 100644 index 73514661..00000000 --- a/how-to-use-azureml/work-with-data/dataprep/tutorials/getting-started/getting-started.ipynb +++ /dev/null @@ -1,433 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Getting started with Azure ML Data Prep SDK\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "Wonder how you can make the most of the Azure ML Data Prep SDK? In this \"Getting Started\" guide, we'll demonstrate how to do your normal data wrangling with this SDK and showcase a few highlights that make this SDK shine. 
Using a sample of this [Kaggle crime dataset](https://www.kaggle.com/currie32/crimes-in-chicago/home) as an example, we'll cover how to:\n", - "\n", - "* [Read in data](#Read)\n", - "* [Profile your data](#Profile)\n", - "* [Append rows](#Append)\n", - "* [Apply common data science transforms](#Data-science-transforms)\n", - " * [Summarize](#Summarize)\n", - " * [Join](#Join)\n", - " * [Filter](#Filter)\n", - " * [Replace](#Replace)\n", - "* [Consume your cleaned dataset](#Consume)\n", - "* [Explore advanced features](#Explore)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep\n", - "\n", - "# Paths for datasets\n", - "file_crime_dirty = '../../data/crime-dirty.csv'\n", - "file_crime_spring = '../../data/crime-spring.csv'\n", - "file_crime_winter = '../../data/crime-winter.csv'\n", - "file_aldermen = '../../data/chicago-aldermen-2015.csv'\n", - "\n", - "# Seed\n", - "RAND_SEED = 7251" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Read in data\n", - "\n", - "Azure ML Data Prep supports many different file reading formats (i.e. CSV, Excel, Parquet) and the ability to infer column types automatically. To see how powerful the `auto_read_file` capability is, let's take a peek at the `dirty-crime.csv`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dprep.read_csv(path=file_crime_dirty).head(7)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A common occurrence in many datasets is to have a column of values with commas; in our case, the last column represents location in the form of longitude-latitude pair. The default CSV reader interprets this comma as a delimiter and thus splits the data into two columns. Furthermore, it incorrectly reads in the header as the column name. Normally, we would need to `skip` the header and specify the delimiter as `|`, but our `auto_read_file` eliminates that work:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "crime_dirty = dprep.auto_read_file(path=file_crime_dirty)\n", - "\n", - "crime_dirty.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "__Advanced features:__ if you'd like to specify the file type and adjust how you want to read files in, you can see the list of our specialized file readers and how to use them [here](../../how-to-guides/data-ingestion.ipynb)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Profile your data\n", - "\n", - "Let's understand what our data looks like. Azure ML Data Prep facilitates this process by offering data profiles that help us glimpse into column types and column summary statistics. Notice that our auto file reader automatically guessed the column type:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "crime_dirty.get_profile()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Append rows\n", - "\n", - "What if your data is split across multiple files? We support the ability to append multiple datasets column-wise and row-wise. 
Here, we demonstrate how you can coalesce datasets row-wise:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Datasets with the same schema as crime_dirty\n", - "crime_winter = dprep.auto_read_file(path=file_crime_winter)\n", - "crime_spring = dprep.auto_read_file(path=file_crime_spring)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "crime = (crime_dirty.append_rows(dataflows=[crime_winter, crime_spring]))\n", - "\n", - "crime.take_sample(probability=0.25, seed=RAND_SEED).head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "__Advanced features:__ you can learn how to append column-wise and how to deal with appending data with different schemas [here](../../how-to-guides/append-columns-and-rows.ipynb)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Apply common data science transforms\n", - "\n", - "Azure ML Data Prep supports almost all common data science transforms found in other industry-standard data science libraries. Here, we'll explore the ability to `summarize`, `join`, `filter`, and `replace`. \n", - "\n", - "__Advanced features:__\n", - "* We also provide \"smart\" transforms not found in pandas that use machine learning to [derive new columns](../../how-to-guides/derive-column-by-example.ipynb), [split columns](../../how-to-guides/split-column-by-example.ipynb), and [fuzzy grouping](../../how-to-guides/fuzzy-group.ipynb).\n", - "* Finally, we also help featurize your dataset to prepare it for machine learning; learn more about our featurizers like [one-hot encoder](../../how-to-guides/one-hot-encoder.ipynb), [label encoder](../../how-to-guides/label-encoder.ipynb), [min-max scaler](../../how-to-guides/min-max-scaler.ipynb), and [random (train-test) split](../../how-to-guides/random-split.ipynb).\n", - "* Our complete list of example Notebooks for transforms can be found in our [How-to Guides](../../how-to-guides)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Summarize\n", - "\n", - "Let's see which wards had the most crimes in our sample dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "crime_summary = (crime\n", - " .summarize(\n", - " summary_columns=[\n", - " dprep.SummaryColumnsValue(\n", - " column_id='ID', \n", - " summary_column_name='total_ward_crimes', \n", - " summary_function=dprep.SummaryFunction.COUNT\n", - " )\n", - " ],\n", - " group_by_columns=['Ward']\n", - " )\n", - ")\n", - "\n", - "(crime_summary\n", - " .sort(sort_order=[('total_ward_crimes', True)])\n", - " .head(5)\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Join\n", - "\n", - "Let's annotate each observation with more information about the ward where the crime occurred. 
Let's do so by joining `crime` with a dataset which lists the current aldermen for each ward:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "aldermen = dprep.auto_read_file(path=file_aldermen)\n", - "\n", - "aldermen.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "crime.join(\n", - " left_dataflow=crime,\n", - " right_dataflow=aldermen,\n", - " join_key_pairs=[\n", - " ('Ward', 'Ward')\n", - " ]\n", - ").head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "__Advanced features:__ [Learn more](../../how-to-guides/join.ipynb) about how you can do all variants of `join`, like inner-, left-, right-, anti-, and semi-joins." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Filter\n", - "\n", - "Let's look at theft crimes:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "theft = crime.filter(crime['Primary Type'] == 'THEFT')\n", - "\n", - "theft.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Replace\n", - "\n", - "Notice that our `theft` dataset has empty strings in column `Location`. Let's replace those with a missing value:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "theft_replaced = (theft\n", - " .replace_na(\n", - " columns=['Location'], \n", - " use_empty_string_as_na=True\n", - " )\n", - ")\n", - "\n", - "theft_replaced.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "__Advanced features:__ [Learn more](../../how-to-guides/replace-fill-error.ipynb) about more advanced `replace` and `fill` capabilities." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Consume your cleaned dataset\n", - "\n", - "Azure ML Data Prep allows you to \"choose your own adventure\" once you're done wrangling. You can:\n", - "\n", - "1. Write to a pandas dataframe\n", - "2. Execute on Spark\n", - "3. Consume directly in Azure Machine Learning models\n", - "\n", - "In this quickstart guide, we'll show how you can export to a pandas dataframe.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "theft_replaced.to_pandas_dataframe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Explore advanced features\n", - "\n", - "Congratulations on finishing your introduction to the Azure ML Data Prep SDK! If you'd like more detailed tutorials on how to construct machine learning datasets or dive deeper into all of its functionality, you can find more information in our detailed notebooks [here](https://github.com/Microsoft/PendletonDocs). 
There, we cover topics including how to:\n", - "\n", - "* [Cache your Dataflow to speed up your iterations](../../how-to-guides/cache.ipynb)\n", - "* [Add your custom Python transforms](../../how-to-guides/custom-python-transforms.ipynb)\n", - "* [Impute missing values](../../how-to-guides/impute-missing-values.ipynb)\n", - "* [Sample your data](../../how-to-guides/subsetting-sampling.ipynb)\n", - "* [Reference and link between Dataflows](../../how-to-guides/join.ipynb)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataprep/tutorials/getting-started/getting-started.png)" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "sihhu" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.2" - }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the MIT License." - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/datasets/dataset-api-change-notice.md b/how-to-use-azureml/work-with-data/dataset-api-change-notice.md similarity index 97% rename from how-to-use-azureml/work-with-data/datasets/dataset-api-change-notice.md rename to how-to-use-azureml/work-with-data/dataset-api-change-notice.md index 3d1b2683..46fdb8d8 100644 --- a/how-to-use-azureml/work-with-data/datasets/dataset-api-change-notice.md +++ b/how-to-use-azureml/work-with-data/dataset-api-change-notice.md @@ -52,4 +52,7 @@ new_ds.register(workspace, 'new_ds_name') ``` ## How to provide feedback? -If you have any feedback about our product, or if there is any missing capability that is essential for you to use new Dataset API, please email us at [AskAzureMLData@microsoft.com](mailto:AskAzureMLData@microsoft.com). \ No newline at end of file +If you have any feedback about our product, or if there is any missing capability that is essential for you to use new Dataset API, please email us at [AskAzureMLData@microsoft.com](mailto:AskAzureMLData@microsoft.com). 
+ + +![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/dataset-api-change-notice.png) \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/datasets/datasets-tutorial/tabular-timeseries-dataset-filtering.ipynb b/how-to-use-azureml/work-with-data/datasets-tutorial/tabular-timeseries-dataset-filtering.ipynb similarity index 96% rename from how-to-use-azureml/work-with-data/datasets/datasets-tutorial/tabular-timeseries-dataset-filtering.ipynb rename to how-to-use-azureml/work-with-data/datasets-tutorial/tabular-timeseries-dataset-filtering.ipynb index 0672e5d5..447c75d1 100644 --- a/how-to-use-azureml/work-with-data/datasets/datasets-tutorial/tabular-timeseries-dataset-filtering.ipynb +++ b/how-to-use-azureml/work-with-data/datasets-tutorial/tabular-timeseries-dataset-filtering.ipynb @@ -510,16 +510,32 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/datasets/datasets-tutorial/datasets-tutorial.png)" + "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/datasets-tutorial/datasets-tutorial.png)" ] } ], "metadata": { "authors": [ { - "name": "copeters" + "name": "ylxiong" } ], + "category": "tutorial", + "compute": [ + "local" + ], + "datasets": [ + "NOAA" + ], + "deployment": [ + "None" + ], + "exclude_from_index": false, + "framework": [ + "Azure ML" + ], + "friendly_name": "Filtering data using Tabular Timeseries Dataset related API", + "index_order": 1, "kernelspec": { "display_name": "Python 3.6", "language": "python", @@ -537,7 +553,15 @@ "pygments_lexer": "ipython3", "version": "3.6.8" }, - "notice": "Copyright (c) Microsoft Corporation. All rights reserved.
Licensed under the MIT License.", + "star_tag": [ + "featured" + ], + "tags": [ + "Dataset", + "Tabular Timeseries" + ], + "task": "Filtering" }, "nbformat": 4, "nbformat_minor": 2 diff --git a/how-to-use-azureml/work-with-data/datasets-tutorial/train-dataset/iris.csv b/how-to-use-azureml/work-with-data/datasets-tutorial/train-dataset/iris.csv new file mode 100644 index 00000000..21ae1963 --- /dev/null +++ b/how-to-use-azureml/work-with-data/datasets-tutorial/train-dataset/iris.csv @@ -0,0 +1,151 @@ +sepal_length,sepal_width,petal_length,petal_width,species +5.1,3.5,1.4,0.2,Iris-setosa +4.9,3,1.4,0.2,Iris-setosa +4.7,3.2,1.3,0.2,Iris-setosa +4.6,3.1,1.5,0.2,Iris-setosa +5,3.6,1.4,0.2,Iris-setosa +5.4,3.9,1.7,0.4,Iris-setosa +4.6,3.4,1.4,0.3,Iris-setosa +5,3.4,1.5,0.2,Iris-setosa +4.4,2.9,1.4,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +5.4,3.7,1.5,0.2,Iris-setosa +4.8,3.4,1.6,0.2,Iris-setosa +4.8,3,1.4,0.1,Iris-setosa +4.3,3,1.1,0.1,Iris-setosa +5.8,4,1.2,0.2,Iris-setosa +5.7,4.4,1.5,0.4,Iris-setosa +5.4,3.9,1.3,0.4,Iris-setosa +5.1,3.5,1.4,0.3,Iris-setosa +5.7,3.8,1.7,0.3,Iris-setosa +5.1,3.8,1.5,0.3,Iris-setosa +5.4,3.4,1.7,0.2,Iris-setosa +5.1,3.7,1.5,0.4,Iris-setosa +4.6,3.6,1,0.2,Iris-setosa +5.1,3.3,1.7,0.5,Iris-setosa +4.8,3.4,1.9,0.2,Iris-setosa +5,3,1.6,0.2,Iris-setosa +5,3.4,1.6,0.4,Iris-setosa +5.2,3.5,1.5,0.2,Iris-setosa +5.2,3.4,1.4,0.2,Iris-setosa +4.7,3.2,1.6,0.2,Iris-setosa +4.8,3.1,1.6,0.2,Iris-setosa +5.4,3.4,1.5,0.4,Iris-setosa +5.2,4.1,1.5,0.1,Iris-setosa +5.5,4.2,1.4,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +5,3.2,1.2,0.2,Iris-setosa +5.5,3.5,1.3,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +4.4,3,1.3,0.2,Iris-setosa +5.1,3.4,1.5,0.2,Iris-setosa +5,3.5,1.3,0.3,Iris-setosa +4.5,2.3,1.3,0.3,Iris-setosa +4.4,3.2,1.3,0.2,Iris-setosa +5,3.5,1.6,0.6,Iris-setosa +5.1,3.8,1.9,0.4,Iris-setosa +4.8,3,1.4,0.3,Iris-setosa +5.1,3.8,1.6,0.2,Iris-setosa +4.6,3.2,1.4,0.2,Iris-setosa +5.3,3.7,1.5,0.2,Iris-setosa +5,3.3,1.4,0.2,Iris-setosa +7,3.2,4.7,1.4,Iris-versicolor +6.4,3.2,4.5,1.5,Iris-versicolor +6.9,3.1,4.9,1.5,Iris-versicolor +5.5,2.3,4,1.3,Iris-versicolor +6.5,2.8,4.6,1.5,Iris-versicolor +5.7,2.8,4.5,1.3,Iris-versicolor +6.3,3.3,4.7,1.6,Iris-versicolor +4.9,2.4,3.3,1,Iris-versicolor +6.6,2.9,4.6,1.3,Iris-versicolor +5.2,2.7,3.9,1.4,Iris-versicolor +5,2,3.5,1,Iris-versicolor +5.9,3,4.2,1.5,Iris-versicolor +6,2.2,4,1,Iris-versicolor +6.1,2.9,4.7,1.4,Iris-versicolor +5.6,2.9,3.6,1.3,Iris-versicolor +6.7,3.1,4.4,1.4,Iris-versicolor +5.6,3,4.5,1.5,Iris-versicolor +5.8,2.7,4.1,1,Iris-versicolor +6.2,2.2,4.5,1.5,Iris-versicolor +5.6,2.5,3.9,1.1,Iris-versicolor +5.9,3.2,4.8,1.8,Iris-versicolor +6.1,2.8,4,1.3,Iris-versicolor +6.3,2.5,4.9,1.5,Iris-versicolor +6.1,2.8,4.7,1.2,Iris-versicolor +6.4,2.9,4.3,1.3,Iris-versicolor +6.6,3,4.4,1.4,Iris-versicolor +6.8,2.8,4.8,1.4,Iris-versicolor +6.7,3,5,1.7,Iris-versicolor +6,2.9,4.5,1.5,Iris-versicolor +5.7,2.6,3.5,1,Iris-versicolor +5.5,2.4,3.8,1.1,Iris-versicolor +5.5,2.4,3.7,1,Iris-versicolor +5.8,2.7,3.9,1.2,Iris-versicolor +6,2.7,5.1,1.6,Iris-versicolor +5.4,3,4.5,1.5,Iris-versicolor +6,3.4,4.5,1.6,Iris-versicolor +6.7,3.1,4.7,1.5,Iris-versicolor +6.3,2.3,4.4,1.3,Iris-versicolor +5.6,3,4.1,1.3,Iris-versicolor +5.5,2.5,4,1.3,Iris-versicolor +5.5,2.6,4.4,1.2,Iris-versicolor +6.1,3,4.6,1.4,Iris-versicolor +5.8,2.6,4,1.2,Iris-versicolor +5,2.3,3.3,1,Iris-versicolor +5.6,2.7,4.2,1.3,Iris-versicolor +5.7,3,4.2,1.2,Iris-versicolor +5.7,2.9,4.2,1.3,Iris-versicolor +6.2,2.9,4.3,1.3,Iris-versicolor +5.1,2.5,3,1.1,Iris-versicolor 
+5.7,2.8,4.1,1.3,Iris-versicolor +6.3,3.3,6,2.5,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +7.1,3,5.9,2.1,Iris-virginica +6.3,2.9,5.6,1.8,Iris-virginica +6.5,3,5.8,2.2,Iris-virginica +7.6,3,6.6,2.1,Iris-virginica +4.9,2.5,4.5,1.7,Iris-virginica +7.3,2.9,6.3,1.8,Iris-virginica +6.7,2.5,5.8,1.8,Iris-virginica +7.2,3.6,6.1,2.5,Iris-virginica +6.5,3.2,5.1,2,Iris-virginica +6.4,2.7,5.3,1.9,Iris-virginica +6.8,3,5.5,2.1,Iris-virginica +5.7,2.5,5,2,Iris-virginica +5.8,2.8,5.1,2.4,Iris-virginica +6.4,3.2,5.3,2.3,Iris-virginica +6.5,3,5.5,1.8,Iris-virginica +7.7,3.8,6.7,2.2,Iris-virginica +7.7,2.6,6.9,2.3,Iris-virginica +6,2.2,5,1.5,Iris-virginica +6.9,3.2,5.7,2.3,Iris-virginica +5.6,2.8,4.9,2,Iris-virginica +7.7,2.8,6.7,2,Iris-virginica +6.3,2.7,4.9,1.8,Iris-virginica +6.7,3.3,5.7,2.1,Iris-virginica +7.2,3.2,6,1.8,Iris-virginica +6.2,2.8,4.8,1.8,Iris-virginica +6.1,3,4.9,1.8,Iris-virginica +6.4,2.8,5.6,2.1,Iris-virginica +7.2,3,5.8,1.6,Iris-virginica +7.4,2.8,6.1,1.9,Iris-virginica +7.9,3.8,6.4,2,Iris-virginica +6.4,2.8,5.6,2.2,Iris-virginica +6.3,2.8,5.1,1.5,Iris-virginica +6.1,2.6,5.6,1.4,Iris-virginica +7.7,3,6.1,2.3,Iris-virginica +6.3,3.4,5.6,2.4,Iris-virginica +6.4,3.1,5.5,1.8,Iris-virginica +6,3,4.8,1.8,Iris-virginica +6.9,3.1,5.4,2.1,Iris-virginica +6.7,3.1,5.6,2.4,Iris-virginica +6.9,3.1,5.1,2.3,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +6.8,3.2,5.9,2.3,Iris-virginica +6.7,3.3,5.7,2.5,Iris-virginica +6.7,3,5.2,2.3,Iris-virginica +6.3,2.5,5,1.9,Iris-virginica +6.5,3,5.2,2,Iris-virginica +6.2,3.4,5.4,2.3,Iris-virginica +5.9,3,5.1,1.8,Iris-virginica diff --git a/how-to-use-azureml/work-with-data/datasets/datasets-tutorial/train-with-datasets.ipynb b/how-to-use-azureml/work-with-data/datasets-tutorial/train-with-datasets.ipynb similarity index 93% rename from how-to-use-azureml/work-with-data/datasets/datasets-tutorial/train-with-datasets.ipynb rename to how-to-use-azureml/work-with-data/datasets-tutorial/train-with-datasets.ipynb index 42b183e7..c8b24c5d 100644 --- a/how-to-use-azureml/work-with-data/datasets/datasets-tutorial/train-with-datasets.ipynb +++ b/how-to-use-azureml/work-with-data/datasets-tutorial/train-with-datasets.ipynb @@ -13,7 +13,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/work-with-data/datasets/datasets-tutorial/train-with-datasets.png)" + "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/work-with-data/datasets-tutorial/train-with-datasets.png)" ] }, { @@ -154,7 +154,7 @@ "### Create a TabularDataset\n", "By creating a dataset, you create a reference to the data source location. If you applied any subsetting transformations to the dataset, they will be stored in the dataset as well. The data remains in its existing location, so no extra storage cost is incurred. \n", "\n", - "Every workspace comes with a default [datastore](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-access-data) (and you can register more) which is backed by the Azure blob storage account associated with the workspace. We can use it to transfer data from local to the cloud, and create Dataset from it.We will now upload the [Titanic data](./train-dataset/Titanic.csv) to the default datastore (blob) within your workspace." 
+ "Every workspace comes with a default [datastore](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-access-data) (and you can register more) which is backed by the Azure blob storage account associated with the workspace. We can use it to transfer data from local to the cloud, and create Dataset from it. We will now upload the [Iris data](./train-dataset/Iris.csv) to the default datastore (blob) within your workspace." ] }, { @@ -164,7 +164,7 @@ "outputs": [], "source": [ "datastore = ws.get_default_datastore()\n", - "datastore.upload_files(files = ['./train-dataset/Titanic.csv'],\n", + "datastore.upload_files(files = ['./train-dataset/iris.csv'],\n", " target_path = 'train-dataset/tabular/',\n", " overwrite = True,\n", " show_progress = True)" @@ -180,11 +180,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "dataset-remarks-tabular-sample" + ] + }, "outputs": [], "source": [ "from azureml.core import Dataset\n", - "dataset = Dataset.Tabular.from_delimited_files(path = [(datastore, 'train-dataset/tabular/Titanic.csv')])\n", + "dataset = Dataset.Tabular.from_delimited_files(path = [(datastore, 'train-dataset/tabular/iris.csv')])\n", "\n", "# preview the first 3 rows of the dataset\n", "dataset.take(3).to_pandas_dataframe()" @@ -214,7 +218,7 @@ "metadata": {}, "outputs": [], "source": [ - "%%writefile $script_folder/train_titanic.py\n", + "%%writefile $script_folder/train_iris.py\n", "\n", "import os\n", "\n", @@ -225,15 +229,16 @@ "\n", "run = Run.get_context()\n", "# get input dataset by name\n", - "dataset = run.input_datasets['titanic']\n", + "dataset = run.input_datasets['iris']\n", "\n", "df = dataset.to_pandas_dataframe()\n", "\n", - "x_col = ['Pclass', 'Sex', 'SibSp', 'Parch']\n", - "y_col = ['Survived']\n", + "x_col = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']\n", + "y_col = ['species']\n", "x_df = df.loc[:, x_col]\n", "y_df = df.loc[:, y_col]\n", "\n", + "#dividing X,y into train and test data\n", "x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, random_state=223)\n", "\n", "data = {'train': {'X': x_train, 'y': y_train},\n", @@ -275,7 +280,9 @@ "from azureml.core.conda_dependencies import CondaDependencies\n", "\n", "conda_env = Environment('conda-env')\n", - "conda_env.python.conda_dependencies = CondaDependencies.create(pip_packages=['azureml-sdk','azureml-dataprep[pandas,fuse]>=1.1.','scikit-learn'])" + "conda_env.python.conda_dependencies = CondaDependencies.create(pip_packages=['azureml-sdk',\n", + " 'azureml-dataprep[pandas,fuse]',\n", + " 'scikit-learn'])" ] }, { @@ -288,7 +295,7 @@ "* The directory that contains your scripts. All the files in this directory are uploaded into the cluster nodes for execution. \n", "* The training script name, train_titanic.py\n", "* The input Dataset for training\n", - "* The compute target. In this case you will use the AmlCompute you created\n", + "* The compute target. 
In this case you will use the AmlCompute you created\n", "* The environment definition for the experiment" ] }, @@ -301,9 +308,9 @@ "from azureml.train.estimator import Estimator\n", "\n", "est = Estimator(source_directory=script_folder, \n", - " entry_script='train_titanic.py', \n", + " entry_script='train_iris.py', \n", " # pass dataset object as an input with name 'titanic'\n", - " inputs=[dataset.as_named_input('titanic')],\n", + " inputs=[dataset.as_named_input('iris')],\n", " compute_target=compute_target,\n", " environment_definition= conda_env) " ] @@ -398,7 +405,7 @@ "metadata": {}, "outputs": [], "source": [ - "from azureml.core.dataset import Dataset\n", + "from azureml.core import Dataset\n", "\n", "dataset = Dataset.File.from_files(path = [(datastore, 'diabetes/')])\n", "\n", @@ -597,6 +604,23 @@ "name": "sihhu" } ], + "category": "tutorial", + "compute": [ + "remote" + ], + "datasets": [ + "Iris", + "Daibetes" + ], + "deployment": [ + "None" + ], + "exclude_from_index": false, + "framework": [ + "Azure ML" + ], + "friendly_name": "Train with Datasets (Tabular and File)", + "index_order": 1, "kernelspec": { "display_name": "Python 3.6", "language": "python", @@ -613,7 +637,14 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" - } + }, + "star_tag": [ + "featured" + ], + "tags": [ + "Dataset" + ], + "task": "Filtering" }, "nbformat": 4, "nbformat_minor": 2 diff --git a/how-to-use-azureml/work-with-data/datasets/README.md b/how-to-use-azureml/work-with-data/datasets/README.md deleted file mode 100644 index 2a4adb53..00000000 --- a/how-to-use-azureml/work-with-data/datasets/README.md +++ /dev/null @@ -1,22 +0,0 @@ -# Azure Machine Learning datasets (preview) - -Azure Machine Learning datasets (preview) let data scientists and machine learning engineers apply data for ML with confidence. By creating a dataset, you create a reference to the data source location, along with a copy of its metadata. The data remains in its existing location, so no extra storage cost is incurred. - -With Azure Machine Learning datasets, you can: - -* **Keep a single copy of data in your storage** referenced by datasets. - -* **Easily access data during model training** without worrying about connection string or data path. - -* **Share data & collaborate** with other users. 
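To make "share data & collaborate" concrete: once a dataset has been created from the default datastore (as in the train-with-datasets tutorial above), it can be registered with the workspace so colleagues can retrieve it by name. A minimal sketch, assuming a local workspace config and the iris.csv path uploaded in that tutorial; the registration name is illustrative:

```python
from azureml.core import Workspace, Dataset

ws = Workspace.from_config()
datastore = ws.get_default_datastore()

# reference the uploaded file; the data stays in the datastore, nothing is copied
dataset = Dataset.Tabular.from_delimited_files(
    path=[(datastore, 'train-dataset/tabular/iris.csv')])

# register the dataset so other users of the workspace can look it up by name
dataset = dataset.register(workspace=ws, name='iris-tabular')
```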
- -## Learn how to use Azure Machine Learning datasets: -* [Create and register datasets](https://aka.ms/azureml/howto/createdatasets) -* Use [Datasets in training](datasets-tutorial/train-with-datasets.ipynb) -* Use TabularDatasets in [automated machine learning training](https://aka.ms/automl-dataset) -* Use FileDatasets in [image classification](https://aka.ms/filedataset-samplenotebook) -* Use FileDatasets in [deep learning with hyperparameter tuning](https://aka.ms/filedataset-hyperdrive) -* For existing Dataset users: [Dataset API change notice](dataset-api-change-notice.md) - - -![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/work-with-data/datasets/README.png) \ No newline at end of file diff --git a/how-to-use-azureml/work-with-data/datasets/datasets-tutorial/train-dataset/Titanic.csv b/how-to-use-azureml/work-with-data/datasets/datasets-tutorial/train-dataset/Titanic.csv deleted file mode 100644 index 50801331..00000000 --- a/how-to-use-azureml/work-with-data/datasets/datasets-tutorial/train-dataset/Titanic.csv +++ /dev/null @@ -1,892 +0,0 @@ -PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked -1,0,3,"Braund, Mr. Owen Harris",0,22,1,0,A/5 21171,7.25,,S -2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",1,38,1,0,PC 17599,71.2833,C85,C -3,1,3,"Heikkinen, Miss. Laina",1,26,0,0,STON/O2. 3101282,7.925,,S -4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35,1,0,113803,53.1,C123,S -5,0,3,"Allen, Mr. William Henry",0,35,0,0,373450,8.05,,S -6,0,3,"Moran, Mr. James",0,,0,0,330877,8.4583,,Q -7,0,1,"McCarthy, Mr. Timothy J",0,54,0,0,17463,51.8625,E46,S -8,0,3,"Palsson, Master. Gosta Leonard",0,2,3,1,349909,21.075,,S -9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",1,27,0,2,347742,11.1333,,S -10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",1,14,1,0,237736,30.0708,,C -11,1,3,"Sandstrom, Miss. Marguerite Rut",1,4,1,1,PP 9549,16.7,G6,S -12,1,1,"Bonnell, Miss. Elizabeth",1,58,0,0,113783,26.55,C103,S -13,0,3,"Saundercock, Mr. William Henry",0,20,0,0,A/5. 2151,8.05,,S -14,0,3,"Andersson, Mr. Anders Johan",0,39,1,5,347082,31.275,,S -15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",1,14,0,0,350406,7.8542,,S -16,1,2,"Hewlett, Mrs. (Mary D Kingcome) ",1,55,0,0,248706,16,,S -17,0,3,"Rice, Master. Eugene",0,2,4,1,382652,29.125,,Q -18,1,2,"Williams, Mr. Charles Eugene",0,,0,0,244373,13,,S -19,0,3,"Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)",1,31,1,0,345763,18,,S -20,1,3,"Masselmani, Mrs. Fatima",1,,0,0,2649,7.225,,C -21,0,2,"Fynney, Mr. Joseph J",0,35,0,0,239865,26,,S -22,1,2,"Beesley, Mr. Lawrence",0,34,0,0,248698,13,D56,S -23,1,3,"McGowan, Miss. Anna ""Annie""",1,15,0,0,330923,8.0292,,Q -24,1,1,"Sloper, Mr. William Thompson",0,28,0,0,113788,35.5,A6,S -25,0,3,"Palsson, Miss. Torborg Danira",1,8,3,1,349909,21.075,,S -26,1,3,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)",1,38,1,5,347077,31.3875,,S -27,0,3,"Emir, Mr. Farred Chehab",0,,0,0,2631,7.225,,C -28,0,1,"Fortune, Mr. Charles Alexander",0,19,3,2,19950,263,C23 C25 C27,S -29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",1,,0,0,330959,7.8792,,Q -30,0,3,"Todoroff, Mr. Lalio",0,,0,0,349216,7.8958,,S -31,0,1,"Uruchurtu, Don. Manuel E",0,40,0,0,PC 17601,27.7208,,C -32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",1,,1,0,PC 17569,146.5208,B78,C -33,1,3,"Glynn, Miss. Mary Agatha",1,,0,0,335677,7.75,,Q -34,0,2,"Wheadon, Mr. Edward H",0,66,0,0,C.A. 24579,10.5,,S -35,0,1,"Meyer, Mr. 
Edgar Joseph",0,28,1,0,PC 17604,82.1708,,C -36,0,1,"Holverson, Mr. Alexander Oskar",0,42,1,0,113789,52,,S -37,1,3,"Mamee, Mr. Hanna",0,,0,0,2677,7.2292,,C -38,0,3,"Cann, Mr. Ernest Charles",0,21,0,0,A./5. 2152,8.05,,S -39,0,3,"Vander Planke, Miss. Augusta Maria",1,18,2,0,345764,18,,S -40,1,3,"Nicola-Yarred, Miss. Jamila",1,14,1,0,2651,11.2417,,C -41,0,3,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",1,40,1,0,7546,9.475,,S -42,0,2,"Turpin, Mrs. William John Robert (Dorothy Ann Wonnacott)",1,27,1,0,11668,21,,S -43,0,3,"Kraeff, Mr. Theodor",0,,0,0,349253,7.8958,,C -44,1,2,"Laroche, Miss. Simonne Marie Anne Andree",1,3,1,2,SC/Paris 2123,41.5792,,C -45,1,3,"Devaney, Miss. Margaret Delia",1,19,0,0,330958,7.8792,,Q -46,0,3,"Rogers, Mr. William John",0,,0,0,S.C./A.4. 23567,8.05,,S -47,0,3,"Lennon, Mr. Denis",0,,1,0,370371,15.5,,Q -48,1,3,"O'Driscoll, Miss. Bridget",1,,0,0,14311,7.75,,Q -49,0,3,"Samaan, Mr. Youssef",0,,2,0,2662,21.6792,,C -50,0,3,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)",1,18,1,0,349237,17.8,,S -51,0,3,"Panula, Master. Juha Niilo",0,7,4,1,3101295,39.6875,,S -52,0,3,"Nosworthy, Mr. Richard Cater",0,21,0,0,A/4. 39886,7.8,,S -53,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",1,49,1,0,PC 17572,76.7292,D33,C -54,1,2,"Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkinson)",1,29,1,0,2926,26,,S -55,0,1,"Ostby, Mr. Engelhart Cornelius",0,65,0,1,113509,61.9792,B30,C -56,1,1,"Woolner, Mr. Hugh",0,,0,0,19947,35.5,C52,S -57,1,2,"Rugg, Miss. Emily",1,21,0,0,C.A. 31026,10.5,,S -58,0,3,"Novel, Mr. Mansouer",0,28.5,0,0,2697,7.2292,,C -59,1,2,"West, Miss. Constance Mirium",1,5,1,2,C.A. 34651,27.75,,S -60,0,3,"Goodwin, Master. William Frederick",0,11,5,2,CA 2144,46.9,,S -61,0,3,"Sirayanian, Mr. Orsen",0,22,0,0,2669,7.2292,,C -62,1,1,"Icard, Miss. Amelie",1,38,0,0,113572,80,B28, -63,0,1,"Harris, Mr. Henry Birkhardt",0,45,1,0,36973,83.475,C83,S -64,0,3,"Skoog, Master. Harald",0,4,3,2,347088,27.9,,S -65,0,1,"Stewart, Mr. Albert A",0,,0,0,PC 17605,27.7208,,C -66,1,3,"Moubarek, Master. Gerios",0,,1,1,2661,15.2458,,C -67,1,2,"Nye, Mrs. (Elizabeth Ramell)",1,29,0,0,C.A. 29395,10.5,F33,S -68,0,3,"Crease, Mr. Ernest James",0,19,0,0,S.P. 3464,8.1583,,S -69,1,3,"Andersson, Miss. Erna Alexandra",1,17,4,2,3101281,7.925,,S -70,0,3,"Kink, Mr. Vincenz",0,26,2,0,315151,8.6625,,S -71,0,2,"Jenkin, Mr. Stephen Curnow",0,32,0,0,C.A. 33111,10.5,,S -72,0,3,"Goodwin, Miss. Lillian Amy",1,16,5,2,CA 2144,46.9,,S -73,0,2,"Hood, Mr. Ambrose Jr",0,21,0,0,S.O.C. 14879,73.5,,S -74,0,3,"Chronopoulos, Mr. Apostolos",0,26,1,0,2680,14.4542,,C -75,1,3,"Bing, Mr. Lee",0,32,0,0,1601,56.4958,,S -76,0,3,"Moen, Mr. Sigurd Hansen",0,25,0,0,348123,7.65,F G73,S -77,0,3,"Staneff, Mr. Ivan",0,,0,0,349208,7.8958,,S -78,0,3,"Moutal, Mr. Rahamin Haim",0,,0,0,374746,8.05,,S -79,1,2,"Caldwell, Master. Alden Gates",0,0.83,0,2,248738,29,,S -80,1,3,"Dowdell, Miss. Elizabeth",1,30,0,0,364516,12.475,,S -81,0,3,"Waelens, Mr. Achille",0,22,0,0,345767,9,,S -82,1,3,"Sheerlinck, Mr. Jan Baptist",0,29,0,0,345779,9.5,,S -83,1,3,"McDermott, Miss. Brigdet Delia",1,,0,0,330932,7.7875,,Q -84,0,1,"Carrau, Mr. Francisco M",0,28,0,0,113059,47.1,,S -85,1,2,"Ilett, Miss. Bertha",1,17,0,0,SO/C 14885,10.5,,S -86,1,3,"Backstrom, Mrs. Karl Alfred (Maria Mathilda Gustafsson)",1,33,3,0,3101278,15.85,,S -87,0,3,"Ford, Mr. William Neal",0,16,1,3,W./C. 6608,34.375,,S -88,0,3,"Slocovski, Mr. Selman Francis",0,,0,0,SOTON/OQ 392086,8.05,,S -89,1,1,"Fortune, Miss. Mabel Helen",1,23,3,2,19950,263,C23 C25 C27,S -90,0,3,"Celotti, Mr. 
Francesco",0,24,0,0,343275,8.05,,S -91,0,3,"Christmann, Mr. Emil",0,29,0,0,343276,8.05,,S -92,0,3,"Andreasson, Mr. Paul Edvin",0,20,0,0,347466,7.8542,,S -93,0,1,"Chaffee, Mr. Herbert Fuller",0,46,1,0,W.E.P. 5734,61.175,E31,S -94,0,3,"Dean, Mr. Bertram Frank",0,26,1,2,C.A. 2315,20.575,,S -95,0,3,"Coxon, Mr. Daniel",0,59,0,0,364500,7.25,,S -96,0,3,"Shorney, Mr. Charles Joseph",0,,0,0,374910,8.05,,S -97,0,1,"Goldschmidt, Mr. George B",0,71,0,0,PC 17754,34.6542,A5,C -98,1,1,"Greenfield, Mr. William Bertram",0,23,0,1,PC 17759,63.3583,D10 D12,C -99,1,2,"Doling, Mrs. John T (Ada Julia Bone)",1,34,0,1,231919,23,,S -100,0,2,"Kantor, Mr. Sinai",0,34,1,0,244367,26,,S -101,0,3,"Petranec, Miss. Matilda",1,28,0,0,349245,7.8958,,S -102,0,3,"Petroff, Mr. Pastcho (""Pentcho"")",0,,0,0,349215,7.8958,,S -103,0,1,"White, Mr. Richard Frasar",0,21,0,1,35281,77.2875,D26,S -104,0,3,"Johansson, Mr. Gustaf Joel",0,33,0,0,7540,8.6542,,S -105,0,3,"Gustafsson, Mr. Anders Vilhelm",0,37,2,0,3101276,7.925,,S -106,0,3,"Mionoff, Mr. Stoytcho",0,28,0,0,349207,7.8958,,S -107,1,3,"Salkjelsvik, Miss. Anna Kristine",1,21,0,0,343120,7.65,,S -108,1,3,"Moss, Mr. Albert Johan",0,,0,0,312991,7.775,,S -109,0,3,"Rekic, Mr. Tido",0,38,0,0,349249,7.8958,,S -110,1,3,"Moran, Miss. Bertha",1,,1,0,371110,24.15,,Q -111,0,1,"Porter, Mr. Walter Chamberlain",0,47,0,0,110465,52,C110,S -112,0,3,"Zabour, Miss. Hileni",1,14.5,1,0,2665,14.4542,,C -113,0,3,"Barton, Mr. David John",0,22,0,0,324669,8.05,,S -114,0,3,"Jussila, Miss. Katriina",1,20,1,0,4136,9.825,,S -115,0,3,"Attalah, Miss. Malake",1,17,0,0,2627,14.4583,,C -116,0,3,"Pekoniemi, Mr. Edvard",0,21,0,0,STON/O 2. 3101294,7.925,,S -117,0,3,"Connors, Mr. Patrick",0,70.5,0,0,370369,7.75,,Q -118,0,2,"Turpin, Mr. William John Robert",0,29,1,0,11668,21,,S -119,0,1,"Baxter, Mr. Quigg Edmond",0,24,0,1,PC 17558,247.5208,B58 B60,C -120,0,3,"Andersson, Miss. Ellis Anna Maria",1,2,4,2,347082,31.275,,S -121,0,2,"Hickman, Mr. Stanley George",0,21,2,0,S.O.C. 14879,73.5,,S -122,0,3,"Moore, Mr. Leonard Charles",0,,0,0,A4. 54510,8.05,,S -123,0,2,"Nasser, Mr. Nicholas",0,32.5,1,0,237736,30.0708,,C -124,1,2,"Webber, Miss. Susan",1,32.5,0,0,27267,13,E101,S -125,0,1,"White, Mr. Percival Wayland",0,54,0,1,35281,77.2875,D26,S -126,1,3,"Nicola-Yarred, Master. Elias",0,12,1,0,2651,11.2417,,C -127,0,3,"McMahon, Mr. Martin",0,,0,0,370372,7.75,,Q -128,1,3,"Madsen, Mr. Fridtjof Arne",0,24,0,0,C 17369,7.1417,,S -129,1,3,"Peter, Miss. Anna",1,,1,1,2668,22.3583,F E69,C -130,0,3,"Ekstrom, Mr. Johan",0,45,0,0,347061,6.975,,S -131,0,3,"Drazenoic, Mr. Jozef",0,33,0,0,349241,7.8958,,C -132,0,3,"Coelho, Mr. Domingos Fernandeo",0,20,0,0,SOTON/O.Q. 3101307,7.05,,S -133,0,3,"Robins, Mrs. Alexander A (Grace Charity Laury)",1,47,1,0,A/5. 3337,14.5,,S -134,1,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",1,29,1,0,228414,26,,S -135,0,2,"Sobey, Mr. Samuel James Hayden",0,25,0,0,C.A. 29178,13,,S -136,0,2,"Richard, Mr. Emile",0,23,0,0,SC/PARIS 2133,15.0458,,C -137,1,1,"Newsom, Miss. Helen Monypeny",1,19,0,2,11752,26.2833,D47,S -138,0,1,"Futrelle, Mr. Jacques Heath",0,37,1,0,113803,53.1,C123,S -139,0,3,"Osen, Mr. Olaf Elon",0,16,0,0,7534,9.2167,,S -140,0,1,"Giglio, Mr. Victor",0,24,0,0,PC 17593,79.2,B86,C -141,0,3,"Boulos, Mrs. Joseph (Sultana)",1,,0,2,2678,15.2458,,C -142,1,3,"Nysten, Miss. Anna Sofia",1,22,0,0,347081,7.75,,S -143,1,3,"Hakkarainen, Mrs. Pekka Pietari (Elin Matilda Dolck)",1,24,1,0,STON/O2. 3101279,15.85,,S -144,0,3,"Burke, Mr. Jeremiah",0,19,0,0,365222,6.75,,Q -145,0,2,"Andrew, Mr. 
Edgardo Samuel",0,18,0,0,231945,11.5,,S -146,0,2,"Nicholls, Mr. Joseph Charles",0,19,1,1,C.A. 33112,36.75,,S -147,1,3,"Andersson, Mr. August Edvard (""Wennerstrom"")",0,27,0,0,350043,7.7958,,S -148,0,3,"Ford, Miss. Robina Maggie ""Ruby""",1,9,2,2,W./C. 6608,34.375,,S -149,0,2,"Navratil, Mr. Michel (""Louis M Hoffman"")",0,36.5,0,2,230080,26,F2,S -150,0,2,"Byles, Rev. Thomas Roussel Davids",0,42,0,0,244310,13,,S -151,0,2,"Bateman, Rev. Robert James",0,51,0,0,S.O.P. 1166,12.525,,S -152,1,1,"Pears, Mrs. Thomas (Edith Wearne)",1,22,1,0,113776,66.6,C2,S -153,0,3,"Meo, Mr. Alfonzo",0,55.5,0,0,A.5. 11206,8.05,,S -154,0,3,"van Billiard, Mr. Austin Blyler",0,40.5,0,2,A/5. 851,14.5,,S -155,0,3,"Olsen, Mr. Ole Martin",0,,0,0,Fa 265302,7.3125,,S -156,0,1,"Williams, Mr. Charles Duane",0,51,0,1,PC 17597,61.3792,,C -157,1,3,"Gilnagh, Miss. Katherine ""Katie""",1,16,0,0,35851,7.7333,,Q -158,0,3,"Corn, Mr. Harry",0,30,0,0,SOTON/OQ 392090,8.05,,S -159,0,3,"Smiljanic, Mr. Mile",0,,0,0,315037,8.6625,,S -160,0,3,"Sage, Master. Thomas Henry",0,,8,2,CA. 2343,69.55,,S -161,0,3,"Cribb, Mr. John Hatfield",0,44,0,1,371362,16.1,,S -162,1,2,"Watt, Mrs. James (Elizabeth ""Bessie"" Inglis Milne)",1,40,0,0,C.A. 33595,15.75,,S -163,0,3,"Bengtsson, Mr. John Viktor",0,26,0,0,347068,7.775,,S -164,0,3,"Calic, Mr. Jovo",0,17,0,0,315093,8.6625,,S -165,0,3,"Panula, Master. Eino Viljami",0,1,4,1,3101295,39.6875,,S -166,1,3,"Goldsmith, Master. Frank John William ""Frankie""",0,9,0,2,363291,20.525,,S -167,1,1,"Chibnall, Mrs. (Edith Martha Bowerman)",1,,0,1,113505,55,E33,S -168,0,3,"Skoog, Mrs. William (Anna Bernhardina Karlsson)",1,45,1,4,347088,27.9,,S -169,0,1,"Baumann, Mr. John D",0,,0,0,PC 17318,25.925,,S -170,0,3,"Ling, Mr. Lee",0,28,0,0,1601,56.4958,,S -171,0,1,"Van der hoef, Mr. Wyckoff",0,61,0,0,111240,33.5,B19,S -172,0,3,"Rice, Master. Arthur",0,4,4,1,382652,29.125,,Q -173,1,3,"Johnson, Miss. Eleanor Ileen",1,1,1,1,347742,11.1333,,S -174,0,3,"Sivola, Mr. Antti Wilhelm",0,21,0,0,STON/O 2. 3101280,7.925,,S -175,0,1,"Smith, Mr. James Clinch",0,56,0,0,17764,30.6958,A7,C -176,0,3,"Klasen, Mr. Klas Albin",0,18,1,1,350404,7.8542,,S -177,0,3,"Lefebre, Master. Henry Forbes",0,,3,1,4133,25.4667,,S -178,0,1,"Isham, Miss. Ann Elizabeth",1,50,0,0,PC 17595,28.7125,C49,C -179,0,2,"Hale, Mr. Reginald",0,30,0,0,250653,13,,S -180,0,3,"Leonard, Mr. Lionel",0,36,0,0,LINE,0,,S -181,0,3,"Sage, Miss. Constance Gladys",1,,8,2,CA. 2343,69.55,,S -182,0,2,"Pernot, Mr. Rene",0,,0,0,SC/PARIS 2131,15.05,,C -183,0,3,"Asplund, Master. Clarence Gustaf Hugo",0,9,4,2,347077,31.3875,,S -184,1,2,"Becker, Master. Richard F",0,1,2,1,230136,39,F4,S -185,1,3,"Kink-Heilmann, Miss. Luise Gretchen",1,4,0,2,315153,22.025,,S -186,0,1,"Rood, Mr. Hugh Roscoe",0,,0,0,113767,50,A32,S -187,1,3,"O'Brien, Mrs. Thomas (Johanna ""Hannah"" Godfrey)",1,,1,0,370365,15.5,,Q -188,1,1,"Romaine, Mr. Charles Hallace (""Mr C Rolmane"")",0,45,0,0,111428,26.55,,S -189,0,3,"Bourke, Mr. John",0,40,1,1,364849,15.5,,Q -190,0,3,"Turcin, Mr. Stjepan",0,36,0,0,349247,7.8958,,S -191,1,2,"Pinsky, Mrs. (Rosa)",1,32,0,0,234604,13,,S -192,0,2,"Carbines, Mr. William",0,19,0,0,28424,13,,S -193,1,3,"Andersen-Jensen, Miss. Carla Christine Nielsine",1,19,1,0,350046,7.8542,,S -194,1,2,"Navratil, Master. Michel M",0,3,1,1,230080,26,F2,S -195,1,1,"Brown, Mrs. James Joseph (Margaret Tobin)",1,44,0,0,PC 17610,27.7208,B4,C -196,1,1,"Lurette, Miss. Elise",1,58,0,0,PC 17569,146.5208,B80,C -197,0,3,"Mernagh, Mr. Robert",0,,0,0,368703,7.75,,Q -198,0,3,"Olsen, Mr. 
Karl Siegwart Andreas",0,42,0,1,4579,8.4042,,S -199,1,3,"Madigan, Miss. Margaret ""Maggie""",1,,0,0,370370,7.75,,Q -200,0,2,"Yrois, Miss. Henriette (""Mrs Harbeck"")",1,24,0,0,248747,13,,S -201,0,3,"Vande Walle, Mr. Nestor Cyriel",0,28,0,0,345770,9.5,,S -202,0,3,"Sage, Mr. Frederick",0,,8,2,CA. 2343,69.55,,S -203,0,3,"Johanson, Mr. Jakob Alfred",0,34,0,0,3101264,6.4958,,S -204,0,3,"Youseff, Mr. Gerious",0,45.5,0,0,2628,7.225,,C -205,1,3,"Cohen, Mr. Gurshon ""Gus""",0,18,0,0,A/5 3540,8.05,,S -206,0,3,"Strom, Miss. Telma Matilda",1,2,0,1,347054,10.4625,G6,S -207,0,3,"Backstrom, Mr. Karl Alfred",0,32,1,0,3101278,15.85,,S -208,1,3,"Albimona, Mr. Nassef Cassem",0,26,0,0,2699,18.7875,,C -209,1,3,"Carr, Miss. Helen ""Ellen""",1,16,0,0,367231,7.75,,Q -210,1,1,"Blank, Mr. Henry",0,40,0,0,112277,31,A31,C -211,0,3,"Ali, Mr. Ahmed",0,24,0,0,SOTON/O.Q. 3101311,7.05,,S -212,1,2,"Cameron, Miss. Clear Annie",1,35,0,0,F.C.C. 13528,21,,S -213,0,3,"Perkin, Mr. John Henry",0,22,0,0,A/5 21174,7.25,,S -214,0,2,"Givard, Mr. Hans Kristensen",0,30,0,0,250646,13,,S -215,0,3,"Kiernan, Mr. Philip",0,,1,0,367229,7.75,,Q -216,1,1,"Newell, Miss. Madeleine",1,31,1,0,35273,113.275,D36,C -217,1,3,"Honkanen, Miss. Eliina",1,27,0,0,STON/O2. 3101283,7.925,,S -218,0,2,"Jacobsohn, Mr. Sidney Samuel",0,42,1,0,243847,27,,S -219,1,1,"Bazzani, Miss. Albina",1,32,0,0,11813,76.2917,D15,C -220,0,2,"Harris, Mr. Walter",0,30,0,0,W/C 14208,10.5,,S -221,1,3,"Sunderland, Mr. Victor Francis",0,16,0,0,SOTON/OQ 392089,8.05,,S -222,0,2,"Bracken, Mr. James H",0,27,0,0,220367,13,,S -223,0,3,"Green, Mr. George Henry",0,51,0,0,21440,8.05,,S -224,0,3,"Nenkoff, Mr. Christo",0,,0,0,349234,7.8958,,S -225,1,1,"Hoyt, Mr. Frederick Maxfield",0,38,1,0,19943,90,C93,S -226,0,3,"Berglund, Mr. Karl Ivar Sven",0,22,0,0,PP 4348,9.35,,S -227,1,2,"Mellors, Mr. William John",0,19,0,0,SW/PP 751,10.5,,S -228,0,3,"Lovell, Mr. John Hall (""Henry"")",0,20.5,0,0,A/5 21173,7.25,,S -229,0,2,"Fahlstrom, Mr. Arne Jonas",0,18,0,0,236171,13,,S -230,0,3,"Lefebre, Miss. Mathilde",1,,3,1,4133,25.4667,,S -231,1,1,"Harris, Mrs. Henry Birkhardt (Irene Wallach)",1,35,1,0,36973,83.475,C83,S -232,0,3,"Larsson, Mr. Bengt Edvin",0,29,0,0,347067,7.775,,S -233,0,2,"Sjostedt, Mr. Ernst Adolf",0,59,0,0,237442,13.5,,S -234,1,3,"Asplund, Miss. Lillian Gertrud",1,5,4,2,347077,31.3875,,S -235,0,2,"Leyson, Mr. Robert William Norman",0,24,0,0,C.A. 29566,10.5,,S -236,0,3,"Harknett, Miss. Alice Phoebe",1,,0,0,W./C. 6609,7.55,,S -237,0,2,"Hold, Mr. Stephen",0,44,1,0,26707,26,,S -238,1,2,"Collyer, Miss. Marjorie ""Lottie""",1,8,0,2,C.A. 31921,26.25,,S -239,0,2,"Pengelly, Mr. Frederick William",0,19,0,0,28665,10.5,,S -240,0,2,"Hunt, Mr. George Henry",0,33,0,0,SCO/W 1585,12.275,,S -241,0,3,"Zabour, Miss. Thamine",1,,1,0,2665,14.4542,,C -242,1,3,"Murphy, Miss. Katherine ""Kate""",1,,1,0,367230,15.5,,Q -243,0,2,"Coleridge, Mr. Reginald Charles",0,29,0,0,W./C. 14263,10.5,,S -244,0,3,"Maenpaa, Mr. Matti Alexanteri",0,22,0,0,STON/O 2. 3101275,7.125,,S -245,0,3,"Attalah, Mr. Sleiman",0,30,0,0,2694,7.225,,C -246,0,1,"Minahan, Dr. William Edward",0,44,2,0,19928,90,C78,Q -247,0,3,"Lindahl, Miss. Agda Thorilda Viktoria",1,25,0,0,347071,7.775,,S -248,1,2,"Hamalainen, Mrs. William (Anna)",1,24,0,2,250649,14.5,,S -249,1,1,"Beckwith, Mr. Richard Leonard",0,37,1,1,11751,52.5542,D35,S -250,0,2,"Carter, Rev. Ernest Courtenay",0,54,1,0,244252,26,,S -251,0,3,"Reed, Mr. James George",0,,0,0,362316,7.25,,S -252,0,3,"Strom, Mrs. Wilhelm (Elna Matilda Persson)",1,29,1,1,347054,10.4625,G6,S -253,0,1,"Stead, Mr. 
William Thomas",0,62,0,0,113514,26.55,C87,S -254,0,3,"Lobb, Mr. William Arthur",0,30,1,0,A/5. 3336,16.1,,S -255,0,3,"Rosblom, Mrs. Viktor (Helena Wilhelmina)",1,41,0,2,370129,20.2125,,S -256,1,3,"Touma, Mrs. Darwis (Hanne Youssef Razi)",1,29,0,2,2650,15.2458,,C -257,1,1,"Thorne, Mrs. Gertrude Maybelle",1,,0,0,PC 17585,79.2,,C -258,1,1,"Cherry, Miss. Gladys",1,30,0,0,110152,86.5,B77,S -259,1,1,"Ward, Miss. Anna",1,35,0,0,PC 17755,512.3292,,C -260,1,2,"Parrish, Mrs. (Lutie Davis)",1,50,0,1,230433,26,,S -261,0,3,"Smith, Mr. Thomas",0,,0,0,384461,7.75,,Q -262,1,3,"Asplund, Master. Edvin Rojj Felix",0,3,4,2,347077,31.3875,,S -263,0,1,"Taussig, Mr. Emil",0,52,1,1,110413,79.65,E67,S -264,0,1,"Harrison, Mr. William",0,40,0,0,112059,0,B94,S -265,0,3,"Henry, Miss. Delia",1,,0,0,382649,7.75,,Q -266,0,2,"Reeves, Mr. David",0,36,0,0,C.A. 17248,10.5,,S -267,0,3,"Panula, Mr. Ernesti Arvid",0,16,4,1,3101295,39.6875,,S -268,1,3,"Persson, Mr. Ernst Ulrik",0,25,1,0,347083,7.775,,S -269,1,1,"Graham, Mrs. William Thompson (Edith Junkins)",1,58,0,1,PC 17582,153.4625,C125,S -270,1,1,"Bissette, Miss. Amelia",1,35,0,0,PC 17760,135.6333,C99,S -271,0,1,"Cairns, Mr. Alexander",0,,0,0,113798,31,,S -272,1,3,"Tornquist, Mr. William Henry",0,25,0,0,LINE,0,,S -273,1,2,"Mellinger, Mrs. (Elizabeth Anne Maidment)",1,41,0,1,250644,19.5,,S -274,0,1,"Natsch, Mr. Charles H",0,37,0,1,PC 17596,29.7,C118,C -275,1,3,"Healy, Miss. Hanora ""Nora""",1,,0,0,370375,7.75,,Q -276,1,1,"Andrews, Miss. Kornelia Theodosia",1,63,1,0,13502,77.9583,D7,S -277,0,3,"Lindblom, Miss. Augusta Charlotta",1,45,0,0,347073,7.75,,S -278,0,2,"Parkes, Mr. Francis ""Frank""",0,,0,0,239853,0,,S -279,0,3,"Rice, Master. Eric",0,7,4,1,382652,29.125,,Q -280,1,3,"Abbott, Mrs. Stanton (Rosa Hunt)",1,35,1,1,C.A. 2673,20.25,,S -281,0,3,"Duane, Mr. Frank",0,65,0,0,336439,7.75,,Q -282,0,3,"Olsson, Mr. Nils Johan Goransson",0,28,0,0,347464,7.8542,,S -283,0,3,"de Pelsmaeker, Mr. Alfons",0,16,0,0,345778,9.5,,S -284,1,3,"Dorking, Mr. Edward Arthur",0,19,0,0,A/5. 10482,8.05,,S -285,0,1,"Smith, Mr. Richard William",0,,0,0,113056,26,A19,S -286,0,3,"Stankovic, Mr. Ivan",0,33,0,0,349239,8.6625,,C -287,1,3,"de Mulder, Mr. Theodore",0,30,0,0,345774,9.5,,S -288,0,3,"Naidenoff, Mr. Penko",0,22,0,0,349206,7.8958,,S -289,1,2,"Hosono, Mr. Masabumi",0,42,0,0,237798,13,,S -290,1,3,"Connolly, Miss. Kate",1,22,0,0,370373,7.75,,Q -291,1,1,"Barber, Miss. Ellen ""Nellie""",1,26,0,0,19877,78.85,,S -292,1,1,"Bishop, Mrs. Dickinson H (Helen Walton)",1,19,1,0,11967,91.0792,B49,C -293,0,2,"Levy, Mr. Rene Jacques",0,36,0,0,SC/Paris 2163,12.875,D,C -294,0,3,"Haas, Miss. Aloisia",1,24,0,0,349236,8.85,,S -295,0,3,"Mineff, Mr. Ivan",0,24,0,0,349233,7.8958,,S -296,0,1,"Lewy, Mr. Ervin G",0,,0,0,PC 17612,27.7208,,C -297,0,3,"Hanna, Mr. Mansour",0,23.5,0,0,2693,7.2292,,C -298,0,1,"Allison, Miss. Helen Loraine",1,2,1,2,113781,151.55,C22 C26,S -299,1,1,"Saalfeld, Mr. Adolphe",0,,0,0,19988,30.5,C106,S -300,1,1,"Baxter, Mrs. James (Helene DeLaudeniere Chaput)",1,50,0,1,PC 17558,247.5208,B58 B60,C -301,1,3,"Kelly, Miss. Anna Katherine ""Annie Kate""",1,,0,0,9234,7.75,,Q -302,1,3,"McCoy, Mr. Bernard",0,,2,0,367226,23.25,,Q -303,0,3,"Johnson, Mr. William Cahoone Jr",0,19,0,0,LINE,0,,S -304,1,2,"Keane, Miss. Nora A",1,,0,0,226593,12.35,E101,Q -305,0,3,"Williams, Mr. Howard Hugh ""Harry""",0,,0,0,A/5 2466,8.05,,S -306,1,1,"Allison, Master. Hudson Trevor",0,0.92,1,2,113781,151.55,C22 C26,S -307,1,1,"Fleming, Miss. Margaret",1,,0,0,17421,110.8833,,C -308,1,1,"Penasco y Castellana, Mrs. 
Victor de Satode (Maria Josefa Perez de Soto y Vallejo)",1,17,1,0,PC 17758,108.9,C65,C -309,0,2,"Abelson, Mr. Samuel",0,30,1,0,P/PP 3381,24,,C -310,1,1,"Francatelli, Miss. Laura Mabel",1,30,0,0,PC 17485,56.9292,E36,C -311,1,1,"Hays, Miss. Margaret Bechstein",1,24,0,0,11767,83.1583,C54,C -312,1,1,"Ryerson, Miss. Emily Borie",1,18,2,2,PC 17608,262.375,B57 B59 B63 B66,C -313,0,2,"Lahtinen, Mrs. William (Anna Sylfven)",1,26,1,1,250651,26,,S -314,0,3,"Hendekovic, Mr. Ignjac",0,28,0,0,349243,7.8958,,S -315,0,2,"Hart, Mr. Benjamin",0,43,1,1,F.C.C. 13529,26.25,,S -316,1,3,"Nilsson, Miss. Helmina Josefina",1,26,0,0,347470,7.8542,,S -317,1,2,"Kantor, Mrs. Sinai (Miriam Sternin)",1,24,1,0,244367,26,,S -318,0,2,"Moraweck, Dr. Ernest",0,54,0,0,29011,14,,S -319,1,1,"Wick, Miss. Mary Natalie",1,31,0,2,36928,164.8667,C7,S -320,1,1,"Spedden, Mrs. Frederic Oakley (Margaretta Corning Stone)",1,40,1,1,16966,134.5,E34,C -321,0,3,"Dennis, Mr. Samuel",0,22,0,0,A/5 21172,7.25,,S -322,0,3,"Danoff, Mr. Yoto",0,27,0,0,349219,7.8958,,S -323,1,2,"Slayter, Miss. Hilda Mary",1,30,0,0,234818,12.35,,Q -324,1,2,"Caldwell, Mrs. Albert Francis (Sylvia Mae Harbaugh)",1,22,1,1,248738,29,,S -325,0,3,"Sage, Mr. George John Jr",0,,8,2,CA. 2343,69.55,,S -326,1,1,"Young, Miss. Marie Grice",1,36,0,0,PC 17760,135.6333,C32,C -327,0,3,"Nysveen, Mr. Johan Hansen",0,61,0,0,345364,6.2375,,S -328,1,2,"Ball, Mrs. (Ada E Hall)",1,36,0,0,28551,13,D,S -329,1,3,"Goldsmith, Mrs. Frank John (Emily Alice Brown)",1,31,1,1,363291,20.525,,S -330,1,1,"Hippach, Miss. Jean Gertrude",1,16,0,1,111361,57.9792,B18,C -331,1,3,"McCoy, Miss. Agnes",1,,2,0,367226,23.25,,Q -332,0,1,"Partner, Mr. Austen",0,45.5,0,0,113043,28.5,C124,S -333,0,1,"Graham, Mr. George Edward",0,38,0,1,PC 17582,153.4625,C91,S -334,0,3,"Vander Planke, Mr. Leo Edmondus",0,16,2,0,345764,18,,S -335,1,1,"Frauenthal, Mrs. Henry William (Clara Heinsheimer)",1,,1,0,PC 17611,133.65,,S -336,0,3,"Denkoff, Mr. Mitto",0,,0,0,349225,7.8958,,S -337,0,1,"Pears, Mr. Thomas Clinton",0,29,1,0,113776,66.6,C2,S -338,1,1,"Burns, Miss. Elizabeth Margaret",1,41,0,0,16966,134.5,E40,C -339,1,3,"Dahl, Mr. Karl Edwart",0,45,0,0,7598,8.05,,S -340,0,1,"Blackwell, Mr. Stephen Weart",0,45,0,0,113784,35.5,T,S -341,1,2,"Navratil, Master. Edmond Roger",0,2,1,1,230080,26,F2,S -342,1,1,"Fortune, Miss. Alice Elizabeth",1,24,3,2,19950,263,C23 C25 C27,S -343,0,2,"Collander, Mr. Erik Gustaf",0,28,0,0,248740,13,,S -344,0,2,"Sedgwick, Mr. Charles Frederick Waddington",0,25,0,0,244361,13,,S -345,0,2,"Fox, Mr. Stanley Hubert",0,36,0,0,229236,13,,S -346,1,2,"Brown, Miss. Amelia ""Mildred""",1,24,0,0,248733,13,F33,S -347,1,2,"Smith, Miss. Marion Elsie",1,40,0,0,31418,13,,S -348,1,3,"Davison, Mrs. Thomas Henry (Mary E Finck)",1,,1,0,386525,16.1,,S -349,1,3,"Coutts, Master. William Loch ""William""",0,3,1,1,C.A. 37671,15.9,,S -350,0,3,"Dimic, Mr. Jovan",0,42,0,0,315088,8.6625,,S -351,0,3,"Odahl, Mr. Nils Martin",0,23,0,0,7267,9.225,,S -352,0,1,"Williams-Lambert, Mr. Fletcher Fellows",0,,0,0,113510,35,C128,S -353,0,3,"Elias, Mr. Tannous",0,15,1,1,2695,7.2292,,C -354,0,3,"Arnold-Franchi, Mr. Josef",0,25,1,0,349237,17.8,,S -355,0,3,"Yousif, Mr. Wazli",0,,0,0,2647,7.225,,C -356,0,3,"Vanden Steen, Mr. Leo Peter",0,28,0,0,345783,9.5,,S -357,1,1,"Bowerman, Miss. Elsie Edith",1,22,0,1,113505,55,E33,S -358,0,2,"Funk, Miss. Annie Clemmer",1,38,0,0,237671,13,,S -359,1,3,"McGovern, Miss. Mary",1,,0,0,330931,7.8792,,Q -360,1,3,"Mockler, Miss. Helen Mary ""Ellie""",1,,0,0,330980,7.8792,,Q -361,0,3,"Skoog, Mr. 
Wilhelm",0,40,1,4,347088,27.9,,S -362,0,2,"del Carlo, Mr. Sebastiano",0,29,1,0,SC/PARIS 2167,27.7208,,C -363,0,3,"Barbara, Mrs. (Catherine David)",1,45,0,1,2691,14.4542,,C -364,0,3,"Asim, Mr. Adola",0,35,0,0,SOTON/O.Q. 3101310,7.05,,S -365,0,3,"O'Brien, Mr. Thomas",0,,1,0,370365,15.5,,Q -366,0,3,"Adahl, Mr. Mauritz Nils Martin",0,30,0,0,C 7076,7.25,,S -367,1,1,"Warren, Mrs. Frank Manley (Anna Sophia Atkinson)",1,60,1,0,110813,75.25,D37,C -368,1,3,"Moussa, Mrs. (Mantoura Boulos)",1,,0,0,2626,7.2292,,C -369,1,3,"Jermyn, Miss. Annie",1,,0,0,14313,7.75,,Q -370,1,1,"Aubart, Mme. Leontine Pauline",1,24,0,0,PC 17477,69.3,B35,C -371,1,1,"Harder, Mr. George Achilles",0,25,1,0,11765,55.4417,E50,C -372,0,3,"Wiklund, Mr. Jakob Alfred",0,18,1,0,3101267,6.4958,,S -373,0,3,"Beavan, Mr. William Thomas",0,19,0,0,323951,8.05,,S -374,0,1,"Ringhini, Mr. Sante",0,22,0,0,PC 17760,135.6333,,C -375,0,3,"Palsson, Miss. Stina Viola",1,3,3,1,349909,21.075,,S -376,1,1,"Meyer, Mrs. Edgar Joseph (Leila Saks)",1,,1,0,PC 17604,82.1708,,C -377,1,3,"Landergren, Miss. Aurora Adelia",1,22,0,0,C 7077,7.25,,S -378,0,1,"Widener, Mr. Harry Elkins",0,27,0,2,113503,211.5,C82,C -379,0,3,"Betros, Mr. Tannous",0,20,0,0,2648,4.0125,,C -380,0,3,"Gustafsson, Mr. Karl Gideon",0,19,0,0,347069,7.775,,S -381,1,1,"Bidois, Miss. Rosalie",1,42,0,0,PC 17757,227.525,,C -382,1,3,"Nakid, Miss. Maria (""Mary"")",1,1,0,2,2653,15.7417,,C -383,0,3,"Tikkanen, Mr. Juho",0,32,0,0,STON/O 2. 3101293,7.925,,S -384,1,1,"Holverson, Mrs. Alexander Oskar (Mary Aline Towner)",1,35,1,0,113789,52,,S -385,0,3,"Plotcharsky, Mr. Vasil",0,,0,0,349227,7.8958,,S -386,0,2,"Davies, Mr. Charles Henry",0,18,0,0,S.O.C. 14879,73.5,,S -387,0,3,"Goodwin, Master. Sidney Leonard",0,1,5,2,CA 2144,46.9,,S -388,1,2,"Buss, Miss. Kate",1,36,0,0,27849,13,,S -389,0,3,"Sadlier, Mr. Matthew",0,,0,0,367655,7.7292,,Q -390,1,2,"Lehmann, Miss. Bertha",1,17,0,0,SC 1748,12,,C -391,1,1,"Carter, Mr. William Ernest",0,36,1,2,113760,120,B96 B98,S -392,1,3,"Jansson, Mr. Carl Olof",0,21,0,0,350034,7.7958,,S -393,0,3,"Gustafsson, Mr. Johan Birger",0,28,2,0,3101277,7.925,,S -394,1,1,"Newell, Miss. Marjorie",1,23,1,0,35273,113.275,D36,C -395,1,3,"Sandstrom, Mrs. Hjalmar (Agnes Charlotta Bengtsson)",1,24,0,2,PP 9549,16.7,G6,S -396,0,3,"Johansson, Mr. Erik",0,22,0,0,350052,7.7958,,S -397,0,3,"Olsson, Miss. Elina",1,31,0,0,350407,7.8542,,S -398,0,2,"McKane, Mr. Peter David",0,46,0,0,28403,26,,S -399,0,2,"Pain, Dr. Alfred",0,23,0,0,244278,10.5,,S -400,1,2,"Trout, Mrs. William H (Jessie L)",1,28,0,0,240929,12.65,,S -401,1,3,"Niskanen, Mr. Juha",0,39,0,0,STON/O 2. 3101289,7.925,,S -402,0,3,"Adams, Mr. John",0,26,0,0,341826,8.05,,S -403,0,3,"Jussila, Miss. Mari Aina",1,21,1,0,4137,9.825,,S -404,0,3,"Hakkarainen, Mr. Pekka Pietari",0,28,1,0,STON/O2. 3101279,15.85,,S -405,0,3,"Oreskovic, Miss. Marija",1,20,0,0,315096,8.6625,,S -406,0,2,"Gale, Mr. Shadrach",0,34,1,0,28664,21,,S -407,0,3,"Widegren, Mr. Carl/Charles Peter",0,51,0,0,347064,7.75,,S -408,1,2,"Richards, Master. William Rowe",0,3,1,1,29106,18.75,,S -409,0,3,"Birkeland, Mr. Hans Martin Monsen",0,21,0,0,312992,7.775,,S -410,0,3,"Lefebre, Miss. Ida",1,,3,1,4133,25.4667,,S -411,0,3,"Sdycoff, Mr. Todor",0,,0,0,349222,7.8958,,S -412,0,3,"Hart, Mr. Henry",0,,0,0,394140,6.8583,,Q -413,1,1,"Minahan, Miss. Daisy E",1,33,1,0,19928,90,C78,Q -414,0,2,"Cunningham, Mr. Alfred Fleming",0,,0,0,239853,0,,S -415,1,3,"Sundman, Mr. Johan Julian",0,44,0,0,STON/O 2. 3101269,7.925,,S -416,0,3,"Meek, Mrs. Thomas (Annie Louise Rowley)",1,,0,0,343095,8.05,,S -417,1,2,"Drew, Mrs. 
James Vivian (Lulu Thorne Christian)",1,34,1,1,28220,32.5,,S -418,1,2,"Silven, Miss. Lyyli Karoliina",1,18,0,2,250652,13,,S -419,0,2,"Matthews, Mr. William John",0,30,0,0,28228,13,,S -420,0,3,"Van Impe, Miss. Catharina",1,10,0,2,345773,24.15,,S -421,0,3,"Gheorgheff, Mr. Stanio",0,,0,0,349254,7.8958,,C -422,0,3,"Charters, Mr. David",0,21,0,0,A/5. 13032,7.7333,,Q -423,0,3,"Zimmerman, Mr. Leo",0,29,0,0,315082,7.875,,S -424,0,3,"Danbom, Mrs. Ernst Gilbert (Anna Sigrid Maria Brogren)",1,28,1,1,347080,14.4,,S -425,0,3,"Rosblom, Mr. Viktor Richard",0,18,1,1,370129,20.2125,,S -426,0,3,"Wiseman, Mr. Phillippe",0,,0,0,A/4. 34244,7.25,,S -427,1,2,"Clarke, Mrs. Charles V (Ada Maria Winfield)",1,28,1,0,2003,26,,S -428,1,2,"Phillips, Miss. Kate Florence (""Mrs Kate Louise Phillips Marshall"")",1,19,0,0,250655,26,,S -429,0,3,"Flynn, Mr. James",0,,0,0,364851,7.75,,Q -430,1,3,"Pickard, Mr. Berk (Berk Trembisky)",0,32,0,0,SOTON/O.Q. 392078,8.05,E10,S -431,1,1,"Bjornstrom-Steffansson, Mr. Mauritz Hakan",0,28,0,0,110564,26.55,C52,S -432,1,3,"Thorneycroft, Mrs. Percival (Florence Kate White)",1,,1,0,376564,16.1,,S -433,1,2,"Louch, Mrs. Charles Alexander (Alice Adelaide Slow)",1,42,1,0,SC/AH 3085,26,,S -434,0,3,"Kallio, Mr. Nikolai Erland",0,17,0,0,STON/O 2. 3101274,7.125,,S -435,0,1,"Silvey, Mr. William Baird",0,50,1,0,13507,55.9,E44,S -436,1,1,"Carter, Miss. Lucile Polk",1,14,1,2,113760,120,B96 B98,S -437,0,3,"Ford, Miss. Doolina Margaret ""Daisy""",1,21,2,2,W./C. 6608,34.375,,S -438,1,2,"Richards, Mrs. Sidney (Emily Hocking)",1,24,2,3,29106,18.75,,S -439,0,1,"Fortune, Mr. Mark",0,64,1,4,19950,263,C23 C25 C27,S -440,0,2,"Kvillner, Mr. Johan Henrik Johannesson",0,31,0,0,C.A. 18723,10.5,,S -441,1,2,"Hart, Mrs. Benjamin (Esther Ada Bloomfield)",1,45,1,1,F.C.C. 13529,26.25,,S -442,0,3,"Hampe, Mr. Leon",0,20,0,0,345769,9.5,,S -443,0,3,"Petterson, Mr. Johan Emil",0,25,1,0,347076,7.775,,S -444,1,2,"Reynaldo, Ms. Encarnacion",1,28,0,0,230434,13,,S -445,1,3,"Johannesen-Bratthammer, Mr. Bernt",0,,0,0,65306,8.1125,,S -446,1,1,"Dodge, Master. Washington",0,4,0,2,33638,81.8583,A34,S -447,1,2,"Mellinger, Miss. Madeleine Violet",1,13,0,1,250644,19.5,,S -448,1,1,"Seward, Mr. Frederic Kimber",0,34,0,0,113794,26.55,,S -449,1,3,"Baclini, Miss. Marie Catherine",1,5,2,1,2666,19.2583,,C -450,1,1,"Peuchen, Major. Arthur Godfrey",0,52,0,0,113786,30.5,C104,S -451,0,2,"West, Mr. Edwy Arthur",0,36,1,2,C.A. 34651,27.75,,S -452,0,3,"Hagland, Mr. Ingvald Olai Olsen",0,,1,0,65303,19.9667,,S -453,0,1,"Foreman, Mr. Benjamin Laventall",0,30,0,0,113051,27.75,C111,C -454,1,1,"Goldenberg, Mr. Samuel L",0,49,1,0,17453,89.1042,C92,C -455,0,3,"Peduzzi, Mr. Joseph",0,,0,0,A/5 2817,8.05,,S -456,1,3,"Jalsevac, Mr. Ivan",0,29,0,0,349240,7.8958,,C -457,0,1,"Millet, Mr. Francis Davis",0,65,0,0,13509,26.55,E38,S -458,1,1,"Kenyon, Mrs. Frederick R (Marion)",1,,1,0,17464,51.8625,D21,S -459,1,2,"Toomey, Miss. Ellen",1,50,0,0,F.C.C. 13531,10.5,,S -460,0,3,"O'Connor, Mr. Maurice",0,,0,0,371060,7.75,,Q -461,1,1,"Anderson, Mr. Harry",0,48,0,0,19952,26.55,E12,S -462,0,3,"Morley, Mr. William",0,34,0,0,364506,8.05,,S -463,0,1,"Gee, Mr. Arthur H",0,47,0,0,111320,38.5,E63,S -464,0,2,"Milling, Mr. Jacob Christian",0,48,0,0,234360,13,,S -465,0,3,"Maisner, Mr. Simon",0,,0,0,A/S 2816,8.05,,S -466,0,3,"Goncalves, Mr. Manuel Estanslas",0,38,0,0,SOTON/O.Q. 3101306,7.05,,S -467,0,2,"Campbell, Mr. William",0,,0,0,239853,0,,S -468,0,1,"Smart, Mr. John Montgomery",0,56,0,0,113792,26.55,,S -469,0,3,"Scanlan, Mr. James",0,,0,0,36209,7.725,,Q -470,1,3,"Baclini, Miss. 
Helene Barbara",1,0.75,2,1,2666,19.2583,,C -471,0,3,"Keefe, Mr. Arthur",0,,0,0,323592,7.25,,S -472,0,3,"Cacic, Mr. Luka",0,38,0,0,315089,8.6625,,S -473,1,2,"West, Mrs. Edwy Arthur (Ada Mary Worth)",1,33,1,2,C.A. 34651,27.75,,S -474,1,2,"Jerwan, Mrs. Amin S (Marie Marthe Thuillard)",1,23,0,0,SC/AH Basle 541,13.7917,D,C -475,0,3,"Strandberg, Miss. Ida Sofia",1,22,0,0,7553,9.8375,,S -476,0,1,"Clifford, Mr. George Quincy",0,,0,0,110465,52,A14,S -477,0,2,"Renouf, Mr. Peter Henry",0,34,1,0,31027,21,,S -478,0,3,"Braund, Mr. Lewis Richard",0,29,1,0,3460,7.0458,,S -479,0,3,"Karlsson, Mr. Nils August",0,22,0,0,350060,7.5208,,S -480,1,3,"Hirvonen, Miss. Hildur E",1,2,0,1,3101298,12.2875,,S -481,0,3,"Goodwin, Master. Harold Victor",0,9,5,2,CA 2144,46.9,,S -482,0,2,"Frost, Mr. Anthony Wood ""Archie""",0,,0,0,239854,0,,S -483,0,3,"Rouse, Mr. Richard Henry",0,50,0,0,A/5 3594,8.05,,S -484,1,3,"Turkula, Mrs. (Hedwig)",1,63,0,0,4134,9.5875,,S -485,1,1,"Bishop, Mr. Dickinson H",0,25,1,0,11967,91.0792,B49,C -486,0,3,"Lefebre, Miss. Jeannie",1,,3,1,4133,25.4667,,S -487,1,1,"Hoyt, Mrs. Frederick Maxfield (Jane Anne Forby)",1,35,1,0,19943,90,C93,S -488,0,1,"Kent, Mr. Edward Austin",0,58,0,0,11771,29.7,B37,C -489,0,3,"Somerton, Mr. Francis William",0,30,0,0,A.5. 18509,8.05,,S -490,1,3,"Coutts, Master. Eden Leslie ""Neville""",0,9,1,1,C.A. 37671,15.9,,S -491,0,3,"Hagland, Mr. Konrad Mathias Reiersen",0,,1,0,65304,19.9667,,S -492,0,3,"Windelov, Mr. Einar",0,21,0,0,SOTON/OQ 3101317,7.25,,S -493,0,1,"Molson, Mr. Harry Markland",0,55,0,0,113787,30.5,C30,S -494,0,1,"Artagaveytia, Mr. Ramon",0,71,0,0,PC 17609,49.5042,,C -495,0,3,"Stanley, Mr. Edward Roland",0,21,0,0,A/4 45380,8.05,,S -496,0,3,"Yousseff, Mr. Gerious",0,,0,0,2627,14.4583,,C -497,1,1,"Eustis, Miss. Elizabeth Mussey",1,54,1,0,36947,78.2667,D20,C -498,0,3,"Shellard, Mr. Frederick William",0,,0,0,C.A. 6212,15.1,,S -499,0,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",1,25,1,2,113781,151.55,C22 C26,S -500,0,3,"Svensson, Mr. Olof",0,24,0,0,350035,7.7958,,S -501,0,3,"Calic, Mr. Petar",0,17,0,0,315086,8.6625,,S -502,0,3,"Canavan, Miss. Mary",1,21,0,0,364846,7.75,,Q -503,0,3,"O'Sullivan, Miss. Bridget Mary",1,,0,0,330909,7.6292,,Q -504,0,3,"Laitinen, Miss. Kristina Sofia",1,37,0,0,4135,9.5875,,S -505,1,1,"Maioni, Miss. Roberta",1,16,0,0,110152,86.5,B79,S -506,0,1,"Penasco y Castellana, Mr. Victor de Satode",0,18,1,0,PC 17758,108.9,C65,C -507,1,2,"Quick, Mrs. Frederick Charles (Jane Richards)",1,33,0,2,26360,26,,S -508,1,1,"Bradley, Mr. George (""George Arthur Brayton"")",0,,0,0,111427,26.55,,S -509,0,3,"Olsen, Mr. Henry Margido",0,28,0,0,C 4001,22.525,,S -510,1,3,"Lang, Mr. Fang",0,26,0,0,1601,56.4958,,S -511,1,3,"Daly, Mr. Eugene Patrick",0,29,0,0,382651,7.75,,Q -512,0,3,"Webber, Mr. James",0,,0,0,SOTON/OQ 3101316,8.05,,S -513,1,1,"McGough, Mr. James Robert",0,36,0,0,PC 17473,26.2875,E25,S -514,1,1,"Rothschild, Mrs. Martin (Elizabeth L. Barrett)",1,54,1,0,PC 17603,59.4,,C -515,0,3,"Coleff, Mr. Satio",0,24,0,0,349209,7.4958,,S -516,0,1,"Walker, Mr. William Anderson",0,47,0,0,36967,34.0208,D46,S -517,1,2,"Lemore, Mrs. (Amelia Milley)",1,34,0,0,C.A. 34260,10.5,F33,S -518,0,3,"Ryan, Mr. Patrick",0,,0,0,371110,24.15,,Q -519,1,2,"Angle, Mrs. William A (Florence ""Mary"" Agnes Hughes)",1,36,1,0,226875,26,,S -520,0,3,"Pavlovic, Mr. Stefo",0,32,0,0,349242,7.8958,,S -521,1,1,"Perreault, Miss. Anne",1,30,0,0,12749,93.5,B73,S -522,0,3,"Vovk, Mr. Janko",0,22,0,0,349252,7.8958,,S -523,0,3,"Lahoud, Mr. Sarkis",0,,0,0,2624,7.225,,C -524,1,1,"Hippach, Mrs. 
Louis Albert (Ida Sophia Fischer)",1,44,0,1,111361,57.9792,B18,C -525,0,3,"Kassem, Mr. Fared",0,,0,0,2700,7.2292,,C -526,0,3,"Farrell, Mr. James",0,40.5,0,0,367232,7.75,,Q -527,1,2,"Ridsdale, Miss. Lucy",1,50,0,0,W./C. 14258,10.5,,S -528,0,1,"Farthing, Mr. John",0,,0,0,PC 17483,221.7792,C95,S -529,0,3,"Salonen, Mr. Johan Werner",0,39,0,0,3101296,7.925,,S -530,0,2,"Hocking, Mr. Richard George",0,23,2,1,29104,11.5,,S -531,1,2,"Quick, Miss. Phyllis May",1,2,1,1,26360,26,,S -532,0,3,"Toufik, Mr. Nakli",0,,0,0,2641,7.2292,,C -533,0,3,"Elias, Mr. Joseph Jr",0,17,1,1,2690,7.2292,,C -534,1,3,"Peter, Mrs. Catherine (Catherine Rizk)",1,,0,2,2668,22.3583,,C -535,0,3,"Cacic, Miss. Marija",1,30,0,0,315084,8.6625,,S -536,1,2,"Hart, Miss. Eva Miriam",1,7,0,2,F.C.C. 13529,26.25,,S -537,0,1,"Butt, Major. Archibald Willingham",0,45,0,0,113050,26.55,B38,S -538,1,1,"LeRoy, Miss. Bertha",1,30,0,0,PC 17761,106.425,,C -539,0,3,"Risien, Mr. Samuel Beard",0,,0,0,364498,14.5,,S -540,1,1,"Frolicher, Miss. Hedwig Margaritha",1,22,0,2,13568,49.5,B39,C -541,1,1,"Crosby, Miss. Harriet R",1,36,0,2,WE/P 5735,71,B22,S -542,0,3,"Andersson, Miss. Ingeborg Constanzia",1,9,4,2,347082,31.275,,S -543,0,3,"Andersson, Miss. Sigrid Elisabeth",1,11,4,2,347082,31.275,,S -544,1,2,"Beane, Mr. Edward",0,32,1,0,2908,26,,S -545,0,1,"Douglas, Mr. Walter Donald",0,50,1,0,PC 17761,106.425,C86,C -546,0,1,"Nicholson, Mr. Arthur Ernest",0,64,0,0,693,26,,S -547,1,2,"Beane, Mrs. Edward (Ethel Clarke)",1,19,1,0,2908,26,,S -548,1,2,"Padro y Manent, Mr. Julian",0,,0,0,SC/PARIS 2146,13.8625,,C -549,0,3,"Goldsmith, Mr. Frank John",0,33,1,1,363291,20.525,,S -550,1,2,"Davies, Master. John Morgan Jr",0,8,1,1,C.A. 33112,36.75,,S -551,1,1,"Thayer, Mr. John Borland Jr",0,17,0,2,17421,110.8833,C70,C -552,0,2,"Sharp, Mr. Percival James R",0,27,0,0,244358,26,,S -553,0,3,"O'Brien, Mr. Timothy",0,,0,0,330979,7.8292,,Q -554,1,3,"Leeni, Mr. Fahim (""Philip Zenni"")",0,22,0,0,2620,7.225,,C -555,1,3,"Ohman, Miss. Velin",1,22,0,0,347085,7.775,,S -556,0,1,"Wright, Mr. George",0,62,0,0,113807,26.55,,S -557,1,1,"Duff Gordon, Lady. (Lucille Christiana Sutherland) (""Mrs Morgan"")",1,48,1,0,11755,39.6,A16,C -558,0,1,"Robbins, Mr. Victor",0,,0,0,PC 17757,227.525,,C -559,1,1,"Taussig, Mrs. Emil (Tillie Mandelbaum)",1,39,1,1,110413,79.65,E67,S -560,1,3,"de Messemaeker, Mrs. Guillaume Joseph (Emma)",1,36,1,0,345572,17.4,,S -561,0,3,"Morrow, Mr. Thomas Rowan",0,,0,0,372622,7.75,,Q -562,0,3,"Sivic, Mr. Husein",0,40,0,0,349251,7.8958,,S -563,0,2,"Norman, Mr. Robert Douglas",0,28,0,0,218629,13.5,,S -564,0,3,"Simmons, Mr. John",0,,0,0,SOTON/OQ 392082,8.05,,S -565,0,3,"Meanwell, Miss. (Marion Ogden)",1,,0,0,SOTON/O.Q. 392087,8.05,,S -566,0,3,"Davies, Mr. Alfred J",0,24,2,0,A/4 48871,24.15,,S -567,0,3,"Stoytcheff, Mr. Ilia",0,19,0,0,349205,7.8958,,S -568,0,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",1,29,0,4,349909,21.075,,S -569,0,3,"Doharr, Mr. Tannous",0,,0,0,2686,7.2292,,C -570,1,3,"Jonsson, Mr. Carl",0,32,0,0,350417,7.8542,,S -571,1,2,"Harris, Mr. George",0,62,0,0,S.W./PP 752,10.5,,S -572,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",1,53,2,0,11769,51.4792,C101,S -573,1,1,"Flynn, Mr. John Irwin (""Irving"")",0,36,0,0,PC 17474,26.3875,E25,S -574,1,3,"Kelly, Miss. Mary",1,,0,0,14312,7.75,,Q -575,0,3,"Rush, Mr. Alfred George John",0,16,0,0,A/4. 20589,8.05,,S -576,0,3,"Patchett, Mr. George",0,19,0,0,358585,14.5,,S -577,1,2,"Garside, Miss. Ethel",1,34,0,0,243880,13,,S -578,1,1,"Silvey, Mrs. William Baird (Alice Munger)",1,39,1,0,13507,55.9,E44,S -579,0,3,"Caram, Mrs. 
Joseph (Maria Elias)",1,,1,0,2689,14.4583,,C -580,1,3,"Jussila, Mr. Eiriik",0,32,0,0,STON/O 2. 3101286,7.925,,S -581,1,2,"Christy, Miss. Julie Rachel",1,25,1,1,237789,30,,S -582,1,1,"Thayer, Mrs. John Borland (Marian Longstreth Morris)",1,39,1,1,17421,110.8833,C68,C -583,0,2,"Downton, Mr. William James",0,54,0,0,28403,26,,S -584,0,1,"Ross, Mr. John Hugo",0,36,0,0,13049,40.125,A10,C -585,0,3,"Paulner, Mr. Uscher",0,,0,0,3411,8.7125,,C -586,1,1,"Taussig, Miss. Ruth",1,18,0,2,110413,79.65,E68,S -587,0,2,"Jarvis, Mr. John Denzil",0,47,0,0,237565,15,,S -588,1,1,"Frolicher-Stehli, Mr. Maxmillian",0,60,1,1,13567,79.2,B41,C -589,0,3,"Gilinski, Mr. Eliezer",0,22,0,0,14973,8.05,,S -590,0,3,"Murdlin, Mr. Joseph",0,,0,0,A./5. 3235,8.05,,S -591,0,3,"Rintamaki, Mr. Matti",0,35,0,0,STON/O 2. 3101273,7.125,,S -592,1,1,"Stephenson, Mrs. Walter Bertram (Martha Eustis)",1,52,1,0,36947,78.2667,D20,C -593,0,3,"Elsbury, Mr. William James",0,47,0,0,A/5 3902,7.25,,S -594,0,3,"Bourke, Miss. Mary",1,,0,2,364848,7.75,,Q -595,0,2,"Chapman, Mr. John Henry",0,37,1,0,SC/AH 29037,26,,S -596,0,3,"Van Impe, Mr. Jean Baptiste",0,36,1,1,345773,24.15,,S -597,1,2,"Leitch, Miss. Jessie Wills",1,,0,0,248727,33,,S -598,0,3,"Johnson, Mr. Alfred",0,49,0,0,LINE,0,,S -599,0,3,"Boulos, Mr. Hanna",0,,0,0,2664,7.225,,C -600,1,1,"Duff Gordon, Sir. Cosmo Edmund (""Mr Morgan"")",0,49,1,0,PC 17485,56.9292,A20,C -601,1,2,"Jacobsohn, Mrs. Sidney Samuel (Amy Frances Christy)",1,24,2,1,243847,27,,S -602,0,3,"Slabenoff, Mr. Petco",0,,0,0,349214,7.8958,,S -603,0,1,"Harrington, Mr. Charles H",0,,0,0,113796,42.4,,S -604,0,3,"Torber, Mr. Ernst William",0,44,0,0,364511,8.05,,S -605,1,1,"Homer, Mr. Harry (""Mr E Haven"")",0,35,0,0,111426,26.55,,C -606,0,3,"Lindell, Mr. Edvard Bengtsson",0,36,1,0,349910,15.55,,S -607,0,3,"Karaic, Mr. Milan",0,30,0,0,349246,7.8958,,S -608,1,1,"Daniel, Mr. Robert Williams",0,27,0,0,113804,30.5,,S -609,1,2,"Laroche, Mrs. Joseph (Juliette Marie Louise Lafargue)",1,22,1,2,SC/Paris 2123,41.5792,,C -610,1,1,"Shutes, Miss. Elizabeth W",1,40,0,0,PC 17582,153.4625,C125,S -611,0,3,"Andersson, Mrs. Anders Johan (Alfrida Konstantia Brogren)",1,39,1,5,347082,31.275,,S -612,0,3,"Jardin, Mr. Jose Neto",0,,0,0,SOTON/O.Q. 3101305,7.05,,S -613,1,3,"Murphy, Miss. Margaret Jane",1,,1,0,367230,15.5,,Q -614,0,3,"Horgan, Mr. John",0,,0,0,370377,7.75,,Q -615,0,3,"Brocklebank, Mr. William Alfred",0,35,0,0,364512,8.05,,S -616,1,2,"Herman, Miss. Alice",1,24,1,2,220845,65,,S -617,0,3,"Danbom, Mr. Ernst Gilbert",0,34,1,1,347080,14.4,,S -618,0,3,"Lobb, Mrs. William Arthur (Cordelia K Stanlick)",1,26,1,0,A/5. 3336,16.1,,S -619,1,2,"Becker, Miss. Marion Louise",1,4,2,1,230136,39,F4,S -620,0,2,"Gavey, Mr. Lawrence",0,26,0,0,31028,10.5,,S -621,0,3,"Yasbeck, Mr. Antoni",0,27,1,0,2659,14.4542,,C -622,1,1,"Kimball, Mr. Edwin Nelson Jr",0,42,1,0,11753,52.5542,D19,S -623,1,3,"Nakid, Mr. Sahid",0,20,1,1,2653,15.7417,,C -624,0,3,"Hansen, Mr. Henry Damsgaard",0,21,0,0,350029,7.8542,,S -625,0,3,"Bowen, Mr. David John ""Dai""",0,21,0,0,54636,16.1,,S -626,0,1,"Sutton, Mr. Frederick",0,61,0,0,36963,32.3208,D50,S -627,0,2,"Kirkland, Rev. Charles Leonard",0,57,0,0,219533,12.35,,Q -628,1,1,"Longley, Miss. Gretchen Fiske",1,21,0,0,13502,77.9583,D9,S -629,0,3,"Bostandyeff, Mr. Guentcho",0,26,0,0,349224,7.8958,,S -630,0,3,"O'Connell, Mr. Patrick D",0,,0,0,334912,7.7333,,Q -631,1,1,"Barkworth, Mr. Algernon Henry Wilson",0,80,0,0,27042,30,A23,S -632,0,3,"Lundahl, Mr. Johan Svensson",0,51,0,0,347743,7.0542,,S -633,1,1,"Stahelin-Maeglin, Dr. 
Max",0,32,0,0,13214,30.5,B50,C -634,0,1,"Parr, Mr. William Henry Marsh",0,,0,0,112052,0,,S -635,0,3,"Skoog, Miss. Mabel",1,9,3,2,347088,27.9,,S -636,1,2,"Davis, Miss. Mary",1,28,0,0,237668,13,,S -637,0,3,"Leinonen, Mr. Antti Gustaf",0,32,0,0,STON/O 2. 3101292,7.925,,S -638,0,2,"Collyer, Mr. Harvey",0,31,1,1,C.A. 31921,26.25,,S -639,0,3,"Panula, Mrs. Juha (Maria Emilia Ojala)",1,41,0,5,3101295,39.6875,,S -640,0,3,"Thorneycroft, Mr. Percival",0,,1,0,376564,16.1,,S -641,0,3,"Jensen, Mr. Hans Peder",0,20,0,0,350050,7.8542,,S -642,1,1,"Sagesser, Mlle. Emma",1,24,0,0,PC 17477,69.3,B35,C -643,0,3,"Skoog, Miss. Margit Elizabeth",1,2,3,2,347088,27.9,,S -644,1,3,"Foo, Mr. Choong",0,,0,0,1601,56.4958,,S -645,1,3,"Baclini, Miss. Eugenie",1,0.75,2,1,2666,19.2583,,C -646,1,1,"Harper, Mr. Henry Sleeper",0,48,1,0,PC 17572,76.7292,D33,C -647,0,3,"Cor, Mr. Liudevit",0,19,0,0,349231,7.8958,,S -648,1,1,"Simonius-Blumer, Col. Oberst Alfons",0,56,0,0,13213,35.5,A26,C -649,0,3,"Willey, Mr. Edward",0,,0,0,S.O./P.P. 751,7.55,,S -650,1,3,"Stanley, Miss. Amy Zillah Elsie",1,23,0,0,CA. 2314,7.55,,S -651,0,3,"Mitkoff, Mr. Mito",0,,0,0,349221,7.8958,,S -652,1,2,"Doling, Miss. Elsie",1,18,0,1,231919,23,,S -653,0,3,"Kalvik, Mr. Johannes Halvorsen",0,21,0,0,8475,8.4333,,S -654,1,3,"O'Leary, Miss. Hanora ""Norah""",1,,0,0,330919,7.8292,,Q -655,0,3,"Hegarty, Miss. Hanora ""Nora""",1,18,0,0,365226,6.75,,Q -656,0,2,"Hickman, Mr. Leonard Mark",0,24,2,0,S.O.C. 14879,73.5,,S -657,0,3,"Radeff, Mr. Alexander",0,,0,0,349223,7.8958,,S -658,0,3,"Bourke, Mrs. John (Catherine)",1,32,1,1,364849,15.5,,Q -659,0,2,"Eitemiller, Mr. George Floyd",0,23,0,0,29751,13,,S -660,0,1,"Newell, Mr. Arthur Webster",0,58,0,2,35273,113.275,D48,C -661,1,1,"Frauenthal, Dr. Henry William",0,50,2,0,PC 17611,133.65,,S -662,0,3,"Badt, Mr. Mohamed",0,40,0,0,2623,7.225,,C -663,0,1,"Colley, Mr. Edward Pomeroy",0,47,0,0,5727,25.5875,E58,S -664,0,3,"Coleff, Mr. Peju",0,36,0,0,349210,7.4958,,S -665,1,3,"Lindqvist, Mr. Eino William",0,20,1,0,STON/O 2. 3101285,7.925,,S -666,0,2,"Hickman, Mr. Lewis",0,32,2,0,S.O.C. 14879,73.5,,S -667,0,2,"Butler, Mr. Reginald Fenton",0,25,0,0,234686,13,,S -668,0,3,"Rommetvedt, Mr. Knud Paust",0,,0,0,312993,7.775,,S -669,0,3,"Cook, Mr. Jacob",0,43,0,0,A/5 3536,8.05,,S -670,1,1,"Taylor, Mrs. Elmer Zebley (Juliet Cummins Wright)",1,,1,0,19996,52,C126,S -671,1,2,"Brown, Mrs. Thomas William Solomon (Elizabeth Catherine Ford)",1,40,1,1,29750,39,,S -672,0,1,"Davidson, Mr. Thornton",0,31,1,0,F.C. 12750,52,B71,S -673,0,2,"Mitchell, Mr. Henry Michael",0,70,0,0,C.A. 24580,10.5,,S -674,1,2,"Wilhelms, Mr. Charles",0,31,0,0,244270,13,,S -675,0,2,"Watson, Mr. Ennis Hastings",0,,0,0,239856,0,,S -676,0,3,"Edvardsson, Mr. Gustaf Hjalmar",0,18,0,0,349912,7.775,,S -677,0,3,"Sawyer, Mr. Frederick Charles",0,24.5,0,0,342826,8.05,,S -678,1,3,"Turja, Miss. Anna Sofia",1,18,0,0,4138,9.8417,,S -679,0,3,"Goodwin, Mrs. Frederick (Augusta Tyler)",1,43,1,6,CA 2144,46.9,,S -680,1,1,"Cardeza, Mr. Thomas Drake Martinez",0,36,0,1,PC 17755,512.3292,B51 B53 B55,C -681,0,3,"Peters, Miss. Katie",1,,0,0,330935,8.1375,,Q -682,1,1,"Hassab, Mr. Hammad",0,27,0,0,PC 17572,76.7292,D49,C -683,0,3,"Olsvigen, Mr. Thor Anderson",0,20,0,0,6563,9.225,,S -684,0,3,"Goodwin, Mr. Charles Edward",0,14,5,2,CA 2144,46.9,,S -685,0,2,"Brown, Mr. Thomas William Solomon",0,60,1,1,29750,39,,S -686,0,2,"Laroche, Mr. Joseph Philippe Lemercier",0,25,1,2,SC/Paris 2123,41.5792,,C -687,0,3,"Panula, Mr. Jaako Arnold",0,14,4,1,3101295,39.6875,,S -688,0,3,"Dakic, Mr. 
Branko",0,19,0,0,349228,10.1708,,S -689,0,3,"Fischer, Mr. Eberhard Thelander",0,18,0,0,350036,7.7958,,S -690,1,1,"Madill, Miss. Georgette Alexandra",1,15,0,1,24160,211.3375,B5,S -691,1,1,"Dick, Mr. Albert Adrian",0,31,1,0,17474,57,B20,S -692,1,3,"Karun, Miss. Manca",1,4,0,1,349256,13.4167,,C -693,1,3,"Lam, Mr. Ali",0,,0,0,1601,56.4958,,S -694,0,3,"Saad, Mr. Khalil",0,25,0,0,2672,7.225,,C -695,0,1,"Weir, Col. John",0,60,0,0,113800,26.55,,S -696,0,2,"Chapman, Mr. Charles Henry",0,52,0,0,248731,13.5,,S -697,0,3,"Kelly, Mr. James",0,44,0,0,363592,8.05,,S -698,1,3,"Mullens, Miss. Katherine ""Katie""",1,,0,0,35852,7.7333,,Q -699,0,1,"Thayer, Mr. John Borland",0,49,1,1,17421,110.8833,C68,C -700,0,3,"Humblen, Mr. Adolf Mathias Nicolai Olsen",0,42,0,0,348121,7.65,F G63,S -701,1,1,"Astor, Mrs. John Jacob (Madeleine Talmadge Force)",1,18,1,0,PC 17757,227.525,C62 C64,C -702,1,1,"Silverthorne, Mr. Spencer Victor",0,35,0,0,PC 17475,26.2875,E24,S -703,0,3,"Barbara, Miss. Saiide",1,18,0,1,2691,14.4542,,C -704,0,3,"Gallagher, Mr. Martin",0,25,0,0,36864,7.7417,,Q -705,0,3,"Hansen, Mr. Henrik Juul",0,26,1,0,350025,7.8542,,S -706,0,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",0,39,0,0,250655,26,,S -707,1,2,"Kelly, Mrs. Florence ""Fannie""",1,45,0,0,223596,13.5,,S -708,1,1,"Calderhead, Mr. Edward Pennington",0,42,0,0,PC 17476,26.2875,E24,S -709,1,1,"Cleaver, Miss. Alice",1,22,0,0,113781,151.55,,S -710,1,3,"Moubarek, Master. Halim Gonios (""William George"")",0,,1,1,2661,15.2458,,C -711,1,1,"Mayne, Mlle. Berthe Antonine (""Mrs de Villiers"")",1,24,0,0,PC 17482,49.5042,C90,C -712,0,1,"Klaber, Mr. Herman",0,,0,0,113028,26.55,C124,S -713,1,1,"Taylor, Mr. Elmer Zebley",0,48,1,0,19996,52,C126,S -714,0,3,"Larsson, Mr. August Viktor",0,29,0,0,7545,9.4833,,S -715,0,2,"Greenberg, Mr. Samuel",0,52,0,0,250647,13,,S -716,0,3,"Soholt, Mr. Peter Andreas Lauritz Andersen",0,19,0,0,348124,7.65,F G73,S -717,1,1,"Endres, Miss. Caroline Louise",1,38,0,0,PC 17757,227.525,C45,C -718,1,2,"Troutt, Miss. Edwina Celia ""Winnie""",1,27,0,0,34218,10.5,E101,S -719,0,3,"McEvoy, Mr. Michael",0,,0,0,36568,15.5,,Q -720,0,3,"Johnson, Mr. Malkolm Joackim",0,33,0,0,347062,7.775,,S -721,1,2,"Harper, Miss. Annie Jessie ""Nina""",1,6,0,1,248727,33,,S -722,0,3,"Jensen, Mr. Svend Lauritz",0,17,1,0,350048,7.0542,,S -723,0,2,"Gillespie, Mr. William Henry",0,34,0,0,12233,13,,S -724,0,2,"Hodges, Mr. Henry Price",0,50,0,0,250643,13,,S -725,1,1,"Chambers, Mr. Norman Campbell",0,27,1,0,113806,53.1,E8,S -726,0,3,"Oreskovic, Mr. Luka",0,20,0,0,315094,8.6625,,S -727,1,2,"Renouf, Mrs. Peter Henry (Lillian Jefferys)",1,30,3,0,31027,21,,S -728,1,3,"Mannion, Miss. Margareth",1,,0,0,36866,7.7375,,Q -729,0,2,"Bryhl, Mr. Kurt Arnold Gottfrid",0,25,1,0,236853,26,,S -730,0,3,"Ilmakangas, Miss. Pieta Sofia",1,25,1,0,STON/O2. 3101271,7.925,,S -731,1,1,"Allen, Miss. Elisabeth Walton",1,29,0,0,24160,211.3375,B5,S -732,0,3,"Hassan, Mr. Houssein G N",0,11,0,0,2699,18.7875,,C -733,0,2,"Knight, Mr. Robert J",0,,0,0,239855,0,,S -734,0,2,"Berriman, Mr. William John",0,23,0,0,28425,13,,S -735,0,2,"Troupiansky, Mr. Moses Aaron",0,23,0,0,233639,13,,S -736,0,3,"Williams, Mr. Leslie",0,28.5,0,0,54636,16.1,,S -737,0,3,"Ford, Mrs. Edward (Margaret Ann Watson)",1,48,1,3,W./C. 6608,34.375,,S -738,1,1,"Lesurer, Mr. Gustave J",0,35,0,0,PC 17755,512.3292,B101,C -739,0,3,"Ivanoff, Mr. Kanio",0,,0,0,349201,7.8958,,S -740,0,3,"Nankoff, Mr. Minko",0,,0,0,349218,7.8958,,S -741,1,1,"Hawksford, Mr. Walter James",0,,0,0,16988,30,D45,S -742,0,1,"Cavendish, Mr. 
Tyrell William",0,36,1,0,19877,78.85,C46,S -743,1,1,"Ryerson, Miss. Susan Parker ""Suzette""",1,21,2,2,PC 17608,262.375,B57 B59 B63 B66,C -744,0,3,"McNamee, Mr. Neal",0,24,1,0,376566,16.1,,S -745,1,3,"Stranden, Mr. Juho",0,31,0,0,STON/O 2. 3101288,7.925,,S -746,0,1,"Crosby, Capt. Edward Gifford",0,70,1,1,WE/P 5735,71,B22,S -747,0,3,"Abbott, Mr. Rossmore Edward",0,16,1,1,C.A. 2673,20.25,,S -748,1,2,"Sinkkonen, Miss. Anna",1,30,0,0,250648,13,,S -749,0,1,"Marvin, Mr. Daniel Warner",0,19,1,0,113773,53.1,D30,S -750,0,3,"Connaghton, Mr. Michael",0,31,0,0,335097,7.75,,Q -751,1,2,"Wells, Miss. Joan",1,4,1,1,29103,23,,S -752,1,3,"Moor, Master. Meier",0,6,0,1,392096,12.475,E121,S -753,0,3,"Vande Velde, Mr. Johannes Joseph",0,33,0,0,345780,9.5,,S -754,0,3,"Jonkoff, Mr. Lalio",0,23,0,0,349204,7.8958,,S -755,1,2,"Herman, Mrs. Samuel (Jane Laver)",1,48,1,2,220845,65,,S -756,1,2,"Hamalainen, Master. Viljo",0,0.67,1,1,250649,14.5,,S -757,0,3,"Carlsson, Mr. August Sigfrid",0,28,0,0,350042,7.7958,,S -758,0,2,"Bailey, Mr. Percy Andrew",0,18,0,0,29108,11.5,,S -759,0,3,"Theobald, Mr. Thomas Leonard",0,34,0,0,363294,8.05,,S -760,1,1,"Rothes, the Countess. of (Lucy Noel Martha Dyer-Edwards)",1,33,0,0,110152,86.5,B77,S -761,0,3,"Garfirth, Mr. John",0,,0,0,358585,14.5,,S -762,0,3,"Nirva, Mr. Iisakki Antino Aijo",0,41,0,0,SOTON/O2 3101272,7.125,,S -763,1,3,"Barah, Mr. Hanna Assi",0,20,0,0,2663,7.2292,,C -764,1,1,"Carter, Mrs. William Ernest (Lucile Polk)",1,36,1,2,113760,120,B96 B98,S -765,0,3,"Eklund, Mr. Hans Linus",0,16,0,0,347074,7.775,,S -766,1,1,"Hogeboom, Mrs. John C (Anna Andrews)",1,51,1,0,13502,77.9583,D11,S -767,0,1,"Brewe, Dr. Arthur Jackson",0,,0,0,112379,39.6,,C -768,0,3,"Mangan, Miss. Mary",1,30.5,0,0,364850,7.75,,Q -769,0,3,"Moran, Mr. Daniel J",0,,1,0,371110,24.15,,Q -770,0,3,"Gronnestad, Mr. Daniel Danielsen",0,32,0,0,8471,8.3625,,S -771,0,3,"Lievens, Mr. Rene Aime",0,24,0,0,345781,9.5,,S -772,0,3,"Jensen, Mr. Niels Peder",0,48,0,0,350047,7.8542,,S -773,0,2,"Mack, Mrs. (Mary)",1,57,0,0,S.O./P.P. 3,10.5,E77,S -774,0,3,"Elias, Mr. Dibo",0,,0,0,2674,7.225,,C -775,1,2,"Hocking, Mrs. Elizabeth (Eliza Needs)",1,54,1,3,29105,23,,S -776,0,3,"Myhrman, Mr. Pehr Fabian Oliver Malkolm",0,18,0,0,347078,7.75,,S -777,0,3,"Tobin, Mr. Roger",0,,0,0,383121,7.75,F38,Q -778,1,3,"Emanuel, Miss. Virginia Ethel",1,5,0,0,364516,12.475,,S -779,0,3,"Kilgannon, Mr. Thomas J",0,,0,0,36865,7.7375,,Q -780,1,1,"Robert, Mrs. Edward Scott (Elisabeth Walton McMillan)",1,43,0,1,24160,211.3375,B3,S -781,1,3,"Ayoub, Miss. Banoura",1,13,0,0,2687,7.2292,,C -782,1,1,"Dick, Mrs. Albert Adrian (Vera Gillespie)",1,17,1,0,17474,57,B20,S -783,0,1,"Long, Mr. Milton Clyde",0,29,0,0,113501,30,D6,S -784,0,3,"Johnston, Mr. Andrew G",0,,1,2,W./C. 6607,23.45,,S -785,0,3,"Ali, Mr. William",0,25,0,0,SOTON/O.Q. 3101312,7.05,,S -786,0,3,"Harmer, Mr. Abraham (David Lishin)",0,25,0,0,374887,7.25,,S -787,1,3,"Sjoblom, Miss. Anna Sofia",1,18,0,0,3101265,7.4958,,S -788,0,3,"Rice, Master. George Hugh",0,8,4,1,382652,29.125,,Q -789,1,3,"Dean, Master. Bertram Vere",0,1,1,2,C.A. 2315,20.575,,S -790,0,1,"Guggenheim, Mr. Benjamin",0,46,0,0,PC 17593,79.2,B82 B84,C -791,0,3,"Keane, Mr. Andrew ""Andy""",0,,0,0,12460,7.75,,Q -792,0,2,"Gaskell, Mr. Alfred",0,16,0,0,239865,26,,S -793,0,3,"Sage, Miss. Stella Anna",1,,8,2,CA. 2343,69.55,,S -794,0,1,"Hoyt, Mr. William Fisher",0,,0,0,PC 17600,30.6958,,C -795,0,3,"Dantcheff, Mr. Ristiu",0,25,0,0,349203,7.8958,,S -796,0,2,"Otter, Mr. Richard",0,39,0,0,28213,13,,S -797,1,1,"Leader, Dr. 
Alice (Farnham)",1,49,0,0,17465,25.9292,D17,S -798,1,3,"Osman, Mrs. Mara",1,31,0,0,349244,8.6833,,S -799,0,3,"Ibrahim Shawah, Mr. Yousseff",0,30,0,0,2685,7.2292,,C -800,0,3,"Van Impe, Mrs. Jean Baptiste (Rosalie Paula Govaert)",1,30,1,1,345773,24.15,,S -801,0,2,"Ponesell, Mr. Martin",0,34,0,0,250647,13,,S -802,1,2,"Collyer, Mrs. Harvey (Charlotte Annie Tate)",1,31,1,1,C.A. 31921,26.25,,S -803,1,1,"Carter, Master. William Thornton II",0,11,1,2,113760,120,B96 B98,S -804,1,3,"Thomas, Master. Assad Alexander",0,0.42,0,1,2625,8.5167,,C -805,1,3,"Hedman, Mr. Oskar Arvid",0,27,0,0,347089,6.975,,S -806,0,3,"Johansson, Mr. Karl Johan",0,31,0,0,347063,7.775,,S -807,0,1,"Andrews, Mr. Thomas Jr",0,39,0,0,112050,0,A36,S -808,0,3,"Pettersson, Miss. Ellen Natalia",1,18,0,0,347087,7.775,,S -809,0,2,"Meyer, Mr. August",0,39,0,0,248723,13,,S -810,1,1,"Chambers, Mrs. Norman Campbell (Bertha Griggs)",1,33,1,0,113806,53.1,E8,S -811,0,3,"Alexander, Mr. William",0,26,0,0,3474,7.8875,,S -812,0,3,"Lester, Mr. James",0,39,0,0,A/4 48871,24.15,,S -813,0,2,"Slemen, Mr. Richard James",0,35,0,0,28206,10.5,,S -814,0,3,"Andersson, Miss. Ebba Iris Alfrida",1,6,4,2,347082,31.275,,S -815,0,3,"Tomlin, Mr. Ernest Portage",0,30.5,0,0,364499,8.05,,S -816,0,1,"Fry, Mr. Richard",0,,0,0,112058,0,B102,S -817,0,3,"Heininen, Miss. Wendla Maria",1,23,0,0,STON/O2. 3101290,7.925,,S -818,0,2,"Mallet, Mr. Albert",0,31,1,1,S.C./PARIS 2079,37.0042,,C -819,0,3,"Holm, Mr. John Fredrik Alexander",0,43,0,0,C 7075,6.45,,S -820,0,3,"Skoog, Master. Karl Thorsten",0,10,3,2,347088,27.9,,S -821,1,1,"Hays, Mrs. Charles Melville (Clara Jennings Gregg)",1,52,1,1,12749,93.5,B69,S -822,1,3,"Lulic, Mr. Nikola",0,27,0,0,315098,8.6625,,S -823,0,1,"Reuchlin, Jonkheer. John George",0,38,0,0,19972,0,,S -824,1,3,"Moor, Mrs. (Beila)",1,27,0,1,392096,12.475,E121,S -825,0,3,"Panula, Master. Urho Abraham",0,2,4,1,3101295,39.6875,,S -826,0,3,"Flynn, Mr. John",0,,0,0,368323,6.95,,Q -827,0,3,"Lam, Mr. Len",0,,0,0,1601,56.4958,,S -828,1,2,"Mallet, Master. Andre",0,1,0,2,S.C./PARIS 2079,37.0042,,C -829,1,3,"McCormack, Mr. Thomas Joseph",0,,0,0,367228,7.75,,Q -830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",1,62,0,0,113572,80,B28, -831,1,3,"Yasbeck, Mrs. Antoni (Selini Alexander)",1,15,1,0,2659,14.4542,,C -832,1,2,"Richards, Master. George Sibley",0,0.83,1,1,29106,18.75,,S -833,0,3,"Saad, Mr. Amin",0,,0,0,2671,7.2292,,C -834,0,3,"Augustsson, Mr. Albert",0,23,0,0,347468,7.8542,,S -835,0,3,"Allum, Mr. Owen George",0,18,0,0,2223,8.3,,S -836,1,1,"Compton, Miss. Sara Rebecca",1,39,1,1,PC 17756,83.1583,E49,C -837,0,3,"Pasic, Mr. Jakob",0,21,0,0,315097,8.6625,,S -838,0,3,"Sirota, Mr. Maurice",0,,0,0,392092,8.05,,S -839,1,3,"Chip, Mr. Chang",0,32,0,0,1601,56.4958,,S -840,1,1,"Marechal, Mr. Pierre",0,,0,0,11774,29.7,C47,C -841,0,3,"Alhomaki, Mr. Ilmari Rudolf",0,20,0,0,SOTON/O2 3101287,7.925,,S -842,0,2,"Mudd, Mr. Thomas Charles",0,16,0,0,S.O./P.P. 3,10.5,,S -843,1,1,"Serepeca, Miss. Augusta",1,30,0,0,113798,31,,C -844,0,3,"Lemberopolous, Mr. Peter L",0,34.5,0,0,2683,6.4375,,C -845,0,3,"Culumovic, Mr. Jeso",0,17,0,0,315090,8.6625,,S -846,0,3,"Abbing, Mr. Anthony",0,42,0,0,C.A. 5547,7.55,,S -847,0,3,"Sage, Mr. Douglas Bullen",0,,8,2,CA. 2343,69.55,,S -848,0,3,"Markoff, Mr. Marin",0,35,0,0,349213,7.8958,,C -849,0,2,"Harper, Rev. John",0,28,0,1,248727,33,,S -850,1,1,"Goldenberg, Mrs. Samuel L (Edwiga Grabowska)",1,,1,0,17453,89.1042,C92,C -851,0,3,"Andersson, Master. Sigvard Harald Elias",0,4,4,2,347082,31.275,,S -852,0,3,"Svensson, Mr. 
Johan",0,74,0,0,347060,7.775,,S -853,0,3,"Boulos, Miss. Nourelain",1,9,1,1,2678,15.2458,,C -854,1,1,"Lines, Miss. Mary Conover",1,16,0,1,PC 17592,39.4,D28,S -855,0,2,"Carter, Mrs. Ernest Courtenay (Lilian Hughes)",1,44,1,0,244252,26,,S -856,1,3,"Aks, Mrs. Sam (Leah Rosen)",1,18,0,1,392091,9.35,,S -857,1,1,"Wick, Mrs. George Dennick (Mary Hitchcock)",1,45,1,1,36928,164.8667,,S -858,1,1,"Daly, Mr. Peter Denis ",0,51,0,0,113055,26.55,E17,S -859,1,3,"Baclini, Mrs. Solomon (Latifa Qurban)",1,24,0,3,2666,19.2583,,C -860,0,3,"Razi, Mr. Raihed",0,,0,0,2629,7.2292,,C -861,0,3,"Hansen, Mr. Claus Peter",0,41,2,0,350026,14.1083,,S -862,0,2,"Giles, Mr. Frederick Edward",0,21,1,0,28134,11.5,,S -863,1,1,"Swift, Mrs. Frederick Joel (Margaret Welles Barron)",1,48,0,0,17466,25.9292,D17,S -864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",1,,8,2,CA. 2343,69.55,,S -865,0,2,"Gill, Mr. John William",0,24,0,0,233866,13,,S -866,1,2,"Bystrom, Mrs. (Karolina)",1,42,0,0,236852,13,,S -867,1,2,"Duran y More, Miss. Asuncion",1,27,1,0,SC/PARIS 2149,13.8583,,C -868,0,1,"Roebling, Mr. Washington Augustus II",0,31,0,0,PC 17590,50.4958,A24,S -869,0,3,"van Melkebeke, Mr. Philemon",0,,0,0,345777,9.5,,S -870,1,3,"Johnson, Master. Harold Theodor",0,4,1,1,347742,11.1333,,S -871,0,3,"Balkic, Mr. Cerin",0,26,0,0,349248,7.8958,,S -872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",1,47,1,1,11751,52.5542,D35,S -873,0,1,"Carlsson, Mr. Frans Olof",0,33,0,0,695,5,B51 B53 B55,S -874,0,3,"Vander Cruyssen, Mr. Victor",0,47,0,0,345765,9,,S -875,1,2,"Abelson, Mrs. Samuel (Hannah Wizosky)",1,28,1,0,P/PP 3381,24,,C -876,1,3,"Najib, Miss. Adele Kiamie ""Jane""",1,15,0,0,2667,7.225,,C -877,0,3,"Gustafsson, Mr. Alfred Ossian",0,20,0,0,7534,9.8458,,S -878,0,3,"Petroff, Mr. Nedelio",0,19,0,0,349212,7.8958,,S -879,0,3,"Laleff, Mr. Kristo",0,,0,0,349217,7.8958,,S -880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",1,56,0,1,11767,83.1583,C50,C -881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",1,25,0,1,230433,26,,S -882,0,3,"Markun, Mr. Johann",0,33,0,0,349257,7.8958,,S -883,0,3,"Dahlberg, Miss. Gerda Ulrika",1,22,0,0,7552,10.5167,,S -884,0,2,"Banfield, Mr. Frederick James",0,28,0,0,C.A./SOTON 34068,10.5,,S -885,0,3,"Sutehall, Mr. Henry Jr",0,25,0,0,SOTON/OQ 392076,7.05,,S -886,0,3,"Rice, Mrs. William (Margaret Norton)",1,39,0,5,382652,29.125,,Q -887,0,2,"Montvila, Rev. Juozas",0,27,0,0,211536,13,,S -888,1,1,"Graham, Miss. Margaret Edith",1,19,0,0,112053,30,B42,S -889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,,1,2,W./C. 6607,23.45,,S -890,1,1,"Behr, Mr. Karl Howell",0,26,0,0,111369,30,C148,C -891,0,3,"Dooley, Mr. 
Patrick",0,32,0,0,370376,7.75,,Q diff --git a/index.md b/index.md index e4a6ef60..485668c9 100644 --- a/index.md +++ b/index.md @@ -10,6 +10,7 @@ Machine Learning notebook samples and encourage efficient retrieval of topics an |Title| Task | Dataset | Training Compute | Deployment Target | ML Framework | Tags | |:----|:-----|:-------:|:----------------:|:-----------------:|:------------:|:------------:| | [Using Azure ML environments](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/training/using-environments/using-environments.ipynb) | Creating and registering environments | None | Local | None | None | None | + | [Estimators in AML with hyperparameter tuning](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/training-with-deep-learning/how-to-use-estimator/how-to-use-estimator.ipynb) | Use the Estimator pattern in Azure Machine Learning SDK | None | AML Compute | None | None | None | @@ -17,22 +18,42 @@ Machine Learning notebook samples and encourage efficient retrieval of topics an |Title| Task | Dataset | Training Compute | Deployment Target | ML Framework | Tags | |:----|:-----|:-------:|:----------------:|:-----------------:|:------------:|:------------:| -| [Using a notebook for training and deploying](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/training/train-within-notebook/train-within-notebook.ipynb) | Training and deploying a model from a notebook | Diabetes | Local | Azure Container Instance | None | None | +| [](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/training/train-within-notebook/train-within-notebook.ipynb) | Training and deploying a model from a notebook | Diabetes | Local | Azure Container Instance | None | None | + +| :star:[Filtering data using Tabular Timeseiries Dataset related API](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/datasets-tutorial/tabular-timeseries-dataset-filtering.ipynb) | Filtering | NOAA | local | None | Azure ML | Dataset, Tabular Timeseries | + +| :star:[Train with Datasets (Tabular and File)](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/datasets-tutorial/train-with-datasets.ipynb) | Filtering | Iris, Daibetes | remote | None | Azure ML | Dataset | + | [Use MLflow with Azure Machine Learning for training and deployment](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/track-and-monitor-experiments/using-mlflow/train-deploy-pytorch/train-and-deploy-pytorch.ipynb) | Use MLflow with Azure Machine Learning to train and deploy Pa yTorch image classifier model | MNIST | AML Compute | Azure Container Instance | PyTorch | None | + | :star:[Azure Machine Learning Pipeline with DataTranferStep](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-data-transfer.ipynb) | Demonstrates the use of DataTranferStep | Custom | ADF | None | Azure ML | None | + | [Getting Started with Azure Machine Learning Pipelines](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-getting-started.ipynb) | Getting Started notebook for ANML Pipelines | Custom | AML Compute | None | Azure ML | None | + | [Azure Machine Learning Pipeline with 
AzureBatchStep](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-how-to-use-azurebatch-to-run-a-windows-executable.ipynb) | Demonstrates the use of AzureBatchStep | Custom | Azure Batch | None | Azure ML | None | + | [Azure Machine Learning Pipeline with EstimatorStep](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-how-to-use-estimatorstep.ipynb) | Demonstrates the use of EstimatorStep | Custom | AML Compute | None | Azure ML | None | + | :star:[How to use ModuleStep with AML Pipelines](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-how-to-use-modulestep.ipynb) | Demonstrates the use of ModuleStep | Custom | AML Compute | None | Azure ML | None | + | :star:[How to use Pipeline Drafts to create a Published Pipeline](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-how-to-use-pipeline-drafts.ipynb) | Demonstrates the use of Pipeline Drafts | Custom | AML Compute | None | Azure ML | None | + | :star:[Azure Machine Learning Pipeline with HyperDriveStep](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-parameter-tuning-with-hyperdrive.ipynb) | Demonstrates the use of HyperDriveStep | Custom | AML Compute | None | Azure ML | None | + | :star:[How to Publish a Pipeline and Invoke the REST endpoint](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-publish-and-run-using-rest-endpoint.ipynb) | Demonstrates the use of Published Pipelines | Custom | AML Compute | None | Azure ML | None | + | :star:[How to Setup a Schedule for a Published Pipeline](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-setup-schedule-for-a-published-pipeline.ipynb) | Demonstrates the use of Schedules for Published Pipelines | Custom | AML Compute | None | Azure ML | None | + | [How to setup a versioned Pipeline Endpoint](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-setup-versioned-pipeline-endpoints.ipynb) | Demonstrates the use of PipelineEndpoint to run a specific version of the Published Pipeline | Custom | AML Compute | None | Azure ML | None | + | :star:[How to use DataPath as a PipelineParameter](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-showcasing-datapath-and-pipelineparameter.ipynb) | Demonstrates the use of DataPath as a PipelineParameter | Custom | AML Compute | None | Azure ML | None | + | [How to use AdlaStep with AML Pipelines](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-use-adla-as-compute-target.ipynb) | Demonstrates the use of AdlaStep | Custom | Azure Data Lake Analytics | None | Azure ML | None | + | :star:[How to use DatabricksStep with AML 
Pipelines](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-use-databricks-as-compute-target.ipynb) | Demonstrates the use of DatabricksStep | Custom | Azure Databricks | None | Azure ML, Azure Databricks | None | + | :star:[How to use AutoMLStep with AML Pipelines](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-with-automated-machine-learning-step.ipynb) | Demonstrates the use of AutoMLStep | Custom | AML Compute | None | Automated Machine Learning | None | + | :star:[Azure Machine Learning Pipelines with Data Dependency](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-with-data-dependency-steps.ipynb) | Demonstrates how to construct a Pipeline with data dependency between steps | Custom | AML Compute | None | Azure ML | None | @@ -41,24 +62,43 @@ Machine Learning notebook samples and encourage efficient retrieval of topics an |Title| Task | Dataset | Training Compute | Deployment Target | ML Framework | Tags | |:----|:-----|:-------:|:----------------:|:-----------------:|:------------:|:------------:| | [Train a model with hyperparameter tuning](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/chainer/deployment/train-hyperparameter-tune-deploy-with-chainer/train-hyperparameter-tune-deploy-with-chainer.ipynb) | Train a Convolutional Neural Network (CNN) | MNIST | AML Compute | Azure Container Instance | Chainer | None | + | [Distributed Training with Chainer](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/chainer/training/distributed-chainer/distributed-chainer.ipynb) | Use the Chainer estimator to perform distributed training | MNIST | AML Compute | None | Chainer | None | + | [Training with hyperparameter tuning using PyTorch](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/pytorch/deployment/train-hyperparameter-tune-deploy-with-pytorch/train-hyperparameter-tune-deploy-with-pytorch.ipynb) | Train an image classification model using transfer learning with the PyTorch estimator | ImageNet | AML Compute | Azure Container Instance | PyTorch | None | + | [Distributed PyTorch](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/pytorch/training/distributed-pytorch-with-horovod/distributed-pytorch-with-horovod.ipynb) | Train a model using distributed training via Horovod | MNIST | AML Compute | None | PyTorch | None | + | [Distributed training with PyTorch](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/pytorch/training/distributed-pytorch-with-nccl-gloo/distributed-pytorch-with-nccl-gloo.ipynb) | Train a model using distributed training via NCCL/Gloo | MNIST | AML Compute | None | PyTorch | None | + | [Training and hyperparameter tuning with Scikit-learn](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/scikit-learn/training/train-hyperparameter-tune-deploy-with-sklearn/train-hyperparameter-tune-deploy-with-sklearn.ipynb) | Train a support vector machine (SVM) to perform classification | Iris | AML Compute | None | Scikit-learn | None | + | [Training and hyperparameter tuning using the TensorFlow 
estimator](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/tensorflow/deployment/train-hyperparameter-tune-deploy-with-tensorflow/train-hyperparameter-tune-deploy-with-tensorflow.ipynb) | Train a deep neural network | MNIST | AML Compute | Azure Container Instance | TensorFlow | None | + | [Distributed training using TensorFlow with Horovod](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/tensorflow/training/distributed-tensorflow-with-horovod/distributed-tensorflow-with-horovod.ipynb) | Use the TensorFlow estimator to train a word2vec model | None | AML Compute | None | TensorFlow | None | + | [Distributed TensorFlow with parameter server](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/tensorflow/training/distributed-tensorflow-with-parameter-server/distributed-tensorflow-with-parameter-server.ipynb) | Use the TensorFlow estimator to train a model using distributed training | MNIST | AML Compute | None | TensorFlow | None | + | [Resuming a model](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/tensorflow/training/train-tensorflow-resume-training/train-tensorflow-resume-training.ipynb) | Resume a model in TensorFlow from a previously submitted run | MNIST | AML Compute | None | TensorFlow | None | + | [Training in Spark](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/training/train-in-spark/train-in-spark.ipynb) | Submitting a run on a Spark cluster | None | HDI cluster | None | PySpark | None | + | [Train on Azure Machine Learning Compute](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/training/train-on-amlcompute/train-on-amlcompute.ipynb) | Submit an Azure Machine Learning Compute run | Diabetes | AML Compute | None | None | None | + | [Train on local compute](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/training/train-on-local/train-on-local.ipynb) | Train a model locally | Diabetes | Local | None | None | None | + | [Train in a remote Linux virtual machine](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/training/train-on-remote-vm/train-on-remote-vm.ipynb) | Configure and execute a run | Diabetes | Data Science Virtual Machine | None | None | None | + | [Using Tensorboard](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/training-with-deep-learning/export-run-history-to-tensorboard/export-run-history-to-tensorboard.ipynb) | Export the run history as Tensorboard logs | None | None | None | TensorFlow | None | + | [Train a DNN using hyperparameter tuning and deploying with Keras](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/training-with-deep-learning/train-hyperparameter-tune-deploy-with-keras/train-hyperparameter-tune-deploy-with-keras.ipynb) | Create a multi-class classifier | MNIST | AML Compute | Azure Container Instance | TensorFlow | None | + | [Managing your training runs](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/track-and-monitor-experiments/manage-runs/manage-runs.ipynb) | Monitor and complete runs | None | Local | None | None | None | + | [Tensorboard integration with run history](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/track-and-monitor-experiments/tensorboard/tensorboard.ipynb) | Run a 
TensorFlow job and view its Tensorboard output live | None | Local, DSVM, AML Compute | None | TensorFlow | None | + | [Use MLflow with AML for a local training run](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/track-and-monitor-experiments/using-mlflow/train-local/train-local.ipynb) | Use MLflow tracking APIs together with Azure Machine Learning for storing your metrics and artifacts | Diabetes | Local | None | None | None | + | [Use MLflow with AML for a remote training run](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/track-and-monitor-experiments/using-mlflow/train-remote/train-remote.ipynb) | Use MLflow tracking APIs together with AML for storing your metrics and artifacts | Diabetes | AML Compute | None | None | None | @@ -75,114 +115,151 @@ Machine Learning notebook samples and encourage efficient retrieval of topics an ## Other Notebooks |Title| Task | Dataset | Training Compute | Deployment Target | ML Framework | Tags | |:----|:-----|:-------:|:----------------:|:-----------------:|:------------:|:------------:| -| [Logging APIs](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/track-and-monitor-experiments/logging-api/logging-api.ipynb) | Logging APIs and analyzing results | None | None | None | None | None | | [configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) | | | | | | | + | [azure-ml-with-nvidia-rapids](https://github.com/Azure/MachineLearningNotebooks/blob/master//contrib/RAPIDS/azure-ml-with-nvidia-rapids.ipynb) | | | | | | | + | [auto-ml-classification](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/classification/auto-ml-classification.ipynb) | | | | | | | + | [auto-ml-classification-bank-marketing](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/classification-bank-marketing/auto-ml-classification-bank-marketing.ipynb) | | | | | | | + | [auto-ml-classification-credit-card-fraud](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/classification-credit-card-fraud/auto-ml-classification-credit-card-fraud.ipynb) | | | | | | | + | [auto-ml-classification-with-deployment](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/classification-with-deployment/auto-ml-classification-with-deployment.ipynb) | | | | | | | + | [auto-ml-classification-with-onnx](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/classification-with-onnx/auto-ml-classification-with-onnx.ipynb) | | | | | | | + | [auto-ml-classification-with-whitelisting](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/classification-with-whitelisting/auto-ml-classification-with-whitelisting.ipynb) | | | | | | | + | [auto-ml-dataset](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/dataset/auto-ml-dataset.ipynb) | | | | | | | + | [auto-ml-dataset-remote-execution](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/dataset-remote-execution/auto-ml-dataset-remote-execution.ipynb) | | | | | | | + | 
[auto-ml-exploring-previous-runs](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/exploring-previous-runs/auto-ml-exploring-previous-runs.ipynb) | | | | | | | + | [auto-ml-forecasting-bike-share](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/forecasting-bike-share/auto-ml-forecasting-bike-share.ipynb) | | | | | | | + | [auto-ml-forecasting-energy-demand](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/forecasting-energy-demand/auto-ml-forecasting-energy-demand.ipynb) | | | | | | | + +| [automl-forecasting-function](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/forecasting-high-frequency/automl-forecasting-function.ipynb) | | | | | | | + | [auto-ml-forecasting-orange-juice-sales](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/forecasting-orange-juice-sales/auto-ml-forecasting-orange-juice-sales.ipynb) | | | | | | | + | [auto-ml-missing-data-blacklist-early-termination](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/missing-data-blacklist-early-termination/auto-ml-missing-data-blacklist-early-termination.ipynb) | | | | | | | + | [auto-ml-model-explanation](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/model-explanation/auto-ml-model-explanation.ipynb) | | | | | | | + | [auto-ml-model-explanations-remote-compute](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/model-explanation-remote-amlcompute/auto-ml-model-explanations-remote-compute.ipynb) | | | | | | | + | [auto-ml-regression](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/regression/auto-ml-regression.ipynb) | | | | | | | + | [auto-ml-regression-concrete-strength](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/regression-concrete-strength/auto-ml-regression-concrete-strength.ipynb) | | | | | | | + | [auto-ml-regression-hardware-performance](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/regression-hardware-performance/auto-ml-regression-hardware-performance.ipynb) | | | | | | | + | [auto-ml-remote-amlcompute](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/remote-amlcompute/auto-ml-remote-amlcompute.ipynb) | | | | | | | + | [auto-ml-remote-amlcompute-with-onnx](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/remote-amlcompute-with-onnx/auto-ml-remote-amlcompute-with-onnx.ipynb) | | | | | | | + | [auto-ml-sample-weight](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/sample-weight/auto-ml-sample-weight.ipynb) | | | | | | | + | [auto-ml-sparse-data-train-test-split](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/sparse-data-train-test-split/auto-ml-sparse-data-train-test-split.ipynb) | | | | | | | + | 
[auto-ml-sql-energy-demand](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/sql-server/energy-demand/auto-ml-sql-energy-demand.ipynb) | | | | | | | + | [auto-ml-sql-setup](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/sql-server/setup/auto-ml-sql-setup.ipynb) | | | | | | | + | [auto-ml-subsampling-local](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/subsampling/auto-ml-subsampling-local.ipynb) | | | | | | | + | [build-model-run-history-03](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/azure-databricks/amlsdk/build-model-run-history-03.ipynb) | | | | | | | + | [deploy-to-aci-04](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/azure-databricks/amlsdk/deploy-to-aci-04.ipynb) | | | | | | | + | [deploy-to-aks-05](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/azure-databricks/amlsdk/deploy-to-aks-05.ipynb) | | | | | | | + | [ingest-data-02](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/azure-databricks/amlsdk/ingest-data-02.ipynb) | | | | | | | + | [installation-and-configuration-01](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/azure-databricks/amlsdk/installation-and-configuration-01.ipynb) | | | | | | | + | [automl-databricks-local-01](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/azure-databricks/automl/automl-databricks-local-01.ipynb) | | | | | | | + | [automl-databricks-local-with-deployment](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/azure-databricks/automl/automl-databricks-local-with-deployment.ipynb) | | | | | | | + | [aml-pipelines-use-databricks-as-compute-target](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/azure-databricks/databricks-as-remote-compute-target/aml-pipelines-use-databricks-as-compute-target.ipynb) | | | | | | | + | [accelerated-models-object-detection](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/accelerated-models/accelerated-models-object-detection.ipynb) | | | | | | | + | [accelerated-models-quickstart](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/accelerated-models/accelerated-models-quickstart.ipynb) | | | | | | | + | [accelerated-models-training](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/accelerated-models/accelerated-models-training.ipynb) | | | | | | | + | [model-register-and-deploy](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/deploy-to-cloud/model-register-and-deploy.ipynb) | | | | | | | + | [register-model-deploy-local-advanced](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/deploy-to-local/register-model-deploy-local-advanced.ipynb) | | | | | | | + | [register-model-deploy-local](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/deploy-to-local/register-model-deploy-local.ipynb) | | | | | | | + | 
[enable-app-insights-in-production-service](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/enable-app-insights-in-production-service/enable-app-insights-in-production-service.ipynb) | | | | | | | + | [enable-data-collection-for-models-in-aks](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/enable-data-collection-for-models-in-aks/enable-data-collection-for-models-in-aks.ipynb) | | | | | | | + | [onnx-convert-aml-deploy-tinyyolo](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/onnx/onnx-convert-aml-deploy-tinyyolo.ipynb) | | | | | | | + | [onnx-inference-facial-expression-recognition-deploy](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/onnx/onnx-inference-facial-expression-recognition-deploy.ipynb) | | | | | | | + | [onnx-inference-mnist-deploy](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/onnx/onnx-inference-mnist-deploy.ipynb) | | | | | | | + | [onnx-modelzoo-aml-deploy-resnet50](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/onnx/onnx-modelzoo-aml-deploy-resnet50.ipynb) | | | | | | | + | [onnx-train-pytorch-aml-deploy-mnist](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/onnx/onnx-train-pytorch-aml-deploy-mnist.ipynb) | | | | | | | + | [production-deploy-to-aks](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/production-deploy-to-aks/production-deploy-to-aks.ipynb) | | | | | | | + | [register-model-create-image-deploy-service](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/deployment/register-model-create-image-deploy-service/register-model-create-image-deploy-service.ipynb) | | | | | | | + | [explain-model-on-amlcompute](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/explain-model/azure-integration/remote-explanation/explain-model-on-amlcompute.ipynb) | | | | | | | + | [save-retrieve-explanations-run-history](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/explain-model/azure-integration/run-history/save-retrieve-explanations-run-history.ipynb) | | | | | | | + | [train-explain-model-locally-and-deploy](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-locally-and-deploy.ipynb) | | | | | | | + | [train-explain-model-on-amlcompute-and-deploy](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-on-amlcompute-and-deploy.ipynb) | | | | | | | + | [advanced-feature-transformations-explain-local](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/explain-model/tabular-data/advanced-feature-transformations-explain-local.ipynb) | | | | | | | + | [explain-binary-classification-local](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/explain-model/tabular-data/explain-binary-classification-local.ipynb) | | | | | | | + | [explain-multiclass-classification-local](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/explain-model/tabular-data/explain-multiclass-classification-local.ipynb) | | | | | | | + | 
[explain-regression-local](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/explain-model/tabular-data/explain-regression-local.ipynb) | | | | | | | + | [simple-feature-transformations-explain-local](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/explain-model/tabular-data/simple-feature-transformations-explain-local.ipynb) | | | | | | | + | [nyc-taxi-data-regression-model-building](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/nyc-taxi-data-regression-model-building.ipynb) | | | | | | | + | [pipeline-batch-scoring](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/pipeline-batch-scoring/pipeline-batch-scoring.ipynb) | | | | | | | + | [pipeline-style-transfer](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/pipeline-style-transfer/pipeline-style-transfer.ipynb) | | | | | | | + | [authentication-in-azureml](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/manage-azureml-service/authentication-in-azureml/authentication-in-azureml.ipynb) | | | | | | | + | [azure-ml-datadrift](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/monitor-models/data-drift/azure-ml-datadrift.ipynb) | | | | | | | + +| [Logging APIs](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/track-and-monitor-experiments/logging-api/logging-api.ipynb) | Logging APIs and analyzing results | | None | None | None | None | + | [distributed-cntk-with-custom-docker](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/training-with-deep-learning/distributed-cntk-with-custom-docker/distributed-cntk-with-custom-docker.ipynb) | | | | | | | + | [notebook_example](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/training-with-deep-learning/how-to-use-estimator/notebook_example.ipynb) | | | | | | | -| [new-york-taxi](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/case-studies/new-york-taxi/new-york-taxi.ipynb) | | | | | | | -| [new-york-taxi_scale-out](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/case-studies/new-york-taxi/new-york-taxi_scale-out.ipynb) | | | | | | | -| [add-column-using-expression](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/how-to-guides/add-column-using-expression.ipynb) | | | | | | | -| [append-columns-and-rows](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/how-to-guides/append-columns-and-rows.ipynb) | | | | | | | -| [assertions](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/how-to-guides/assertions.ipynb) | | | | | | | -| [auto-read-file](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/how-to-guides/auto-read-file.ipynb) | | | | | | | -| [cache](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/how-to-guides/cache.ipynb) | | | | | | | -| 
[column-manipulations](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/how-to-guides/column-manipulations.ipynb) | | | | | | | -| [column-type-transforms](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/how-to-guides/column-type-transforms.ipynb) | | | | | | | -| [custom-python-transforms](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/how-to-guides/custom-python-transforms.ipynb) | | | | | | | -| [data-ingestion](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/how-to-guides/data-ingestion.ipynb) | | | | | | | -| [data-profile](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/how-to-guides/data-profile.ipynb) | | | | | | | -| [datastore](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/how-to-guides/datastore.ipynb) | | | | | | | -| [derive-column-by-example](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/how-to-guides/derive-column-by-example.ipynb) | | | | | | | -| [external-references](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/how-to-guides/external-references.ipynb) | | | | | | | -| [filtering](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/how-to-guides/filtering.ipynb) | | | | | | | -| [fuzzy-group](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/how-to-guides/fuzzy-group.ipynb) | | | | | | | -| [impute-missing-values](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/how-to-guides/impute-missing-values.ipynb) | | | | | | | -| [join](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/how-to-guides/join.ipynb) | | | | | | | -| [label-encoder](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/how-to-guides/label-encoder.ipynb) | | | | | | | -| [min-max-scaler](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/how-to-guides/min-max-scaler.ipynb) | | | | | | | -| [one-hot-encoder](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/how-to-guides/one-hot-encoder.ipynb) | | | | | | | -| [open-save-dataflows](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/how-to-guides/open-save-dataflows.ipynb) | | | | | | | -| [quantile-transformation](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/how-to-guides/quantile-transformation.ipynb) | | | | | | | -| [random-split](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/how-to-guides/random-split.ipynb) | | | | | | | -| [replace-datasource-replace-reference](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/how-to-guides/replace-datasource-replace-reference.ipynb) | | | | | | | -| 
[replace-fill-error](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/how-to-guides/replace-fill-error.ipynb) | | | | | | | -| [secrets](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/how-to-guides/secrets.ipynb) | | | | | | | -| [semantic-types](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/how-to-guides/semantic-types.ipynb) | | | | | | | -| [split-column-by-example](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/how-to-guides/split-column-by-example.ipynb) | | | | | | | -| [subsetting-sampling](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/how-to-guides/subsetting-sampling.ipynb) | | | | | | | -| [summarize](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/how-to-guides/summarize.ipynb) | | | | | | | -| [working-with-file-streams](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/how-to-guides/working-with-file-streams.ipynb) | | | | | | | -| [writing-data](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/how-to-guides/writing-data.ipynb) | | | | | | | -| [getting-started](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/dataprep/tutorials/getting-started/getting-started.ipynb) | | | | | | | -| [tabular-timeseries-dataset-filtering](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/datasets/datasets-tutorial/tabular-timeseries-dataset-filtering.ipynb) | | | | | | | -| [train-with-datasets](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/work-with-data/datasets/datasets-tutorial/train-with-datasets.ipynb) | | | | | | | + | [configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master//setup-environment/configuration.ipynb) | | | | | | | + | [img-classification-part1-training](https://github.com/Azure/MachineLearningNotebooks/blob/master//tutorials/img-classification-part1-training.ipynb) | | | | | | | + | [img-classification-part2-deploy](https://github.com/Azure/MachineLearningNotebooks/blob/master//tutorials/img-classification-part2-deploy.ipynb) | | | | | | | + | [regression-automated-ml](https://github.com/Azure/MachineLearningNotebooks/blob/master//tutorials/regression-automated-ml.ipynb) | | | | | | | + | [tutorial-1st-experiment-sdk-train](https://github.com/Azure/MachineLearningNotebooks/blob/master//tutorials/tutorial-1st-experiment-sdk-train.ipynb) | | | | | | | + | [tutorial-pipeline-batch-scoring-classification](https://github.com/Azure/MachineLearningNotebooks/blob/master//tutorials/tutorial-pipeline-batch-scoring-classification.ipynb) | | | | | | | diff --git a/setup-environment/configuration.ipynb b/setup-environment/configuration.ipynb index 7a9c6570..4d6d84f2 100644 --- a/setup-environment/configuration.ipynb +++ b/setup-environment/configuration.ipynb @@ -102,7 +102,7 @@ "source": [ "import azureml.core\n", "\n", - "print(\"This notebook was created using version 1.0.65 of the Azure ML SDK\")\n", + "print(\"This notebook was created using version 1.0.69 of the Azure ML SDK\")\n", "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")" ] 
}, diff --git a/tutorials/img-classification-part1-training.ipynb b/tutorials/img-classification-part1-training.ipynb index 87c60300..d6c4e62e 100644 --- a/tutorials/img-classification-part1-training.ipynb +++ b/tutorials/img-classification-part1-training.ipynb @@ -262,7 +262,8 @@ "execution_count": null, "metadata": { "tags": [ - "use datastore" + "use datastore", + "dataset-remarks-file-sample" ] }, "outputs": [],