From bf937f9d4857c5882fb9344f7629e82becadae79 Mon Sep 17 00:00:00 2001 From: Peshotan Irani Date: Mon, 23 Nov 2020 15:17:05 -0800 Subject: [PATCH] docs(clean) : add documentation for clean_url --- docs/source/user_guide/clean/clean_url.ipynb | 1784 +++++++++++++++++ .../user_guide/clean/introduction.ipynb | 12 +- 2 files changed, 1794 insertions(+), 2 deletions(-) create mode 100644 docs/source/user_guide/clean/clean_url.ipynb diff --git a/docs/source/user_guide/clean/clean_url.ipynb b/docs/source/user_guide/clean/clean_url.ipynb new file mode 100644 index 000000000..11c3506a8 --- /dev/null +++ b/docs/source/user_guide/clean/clean_url.ipynb @@ -0,0 +1,1784 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# `clean_url()`: Clean and validate urls" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introduction\n", + "\n", + "The function `clean_url()` cleans a column containing urls, and extracts the important parameters including cleaned path, queries, scheme, etc. The function `validate_url()` validates either a single url or a column of urls, returning True if the value is valid, and False otherwise.\n", + "\n", + "The `clean_url()` function extracts all the important features of the url and creates an additional column containing key-value pairs of the parameters. It extracts the following features:\n", + "\n", + "* scheme (string)\n", + "* host (string) \n", + "* cleaned path (string)\n", + "* queries (key-value pairs)\n", + "\n", + "Remove Auth tokens: Sometimes we would like to remove certain sensitive information that is often contained in a url, e.g. access tokens, user information, etc. `clean_url()` provides us with an option to remove this information with the `remove_auth` parameter. 
The usage of all parameters is explained in depth in the sections below.\n", + "\n", + "Invalid parsing is handled with the `errors` parameter:\n", + "\n", + "* \"coerce\" (default), then invalid parsing will be set as NaN\n", + "* \"ignore\", then invalid parsing will return the input\n", + "* \"raise\", then invalid parsing will raise an exception\n", + "\n", + "After cleaning, a **report** is printed that provides the following information:\n", + "\n", + "* How many values were cleaned (the value must be transformed)\n", + "* How many values could not be parsed\n", + "* If `remove_auth` is specified, then displays how many queries were removed from how many rows\n", + "* And the data summary: how many values are in the correct format, and how many values are null\n", + "\n", + "The following sections demonstrate the functionality of `clean_url()` and `validate_url()`. " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
messy_url
0random text which is not a url
1http://www.facebookee.com/otherpath?auth=faceb...
2https://www.sfu.ca/ficticiouspath?auth=samplet...
3notaurl
4NaN
5None
6https://www.sfu.ca/ficticiouspath?auth=samplet...
7
8{'not_a_url': True}
92345678
10345345345
11https://www.sfu.ca/ficticiouspath?auth=samplet...
12https://www.sfu.ca/ficticiouspath?auth=samplet...
\n", + "
" + ], + "text/plain": [ + " messy_url\n", + "0 random text which is not a url\n", + "1 http://www.facebookee.com/otherpath?auth=faceb...\n", + "2 https://www.sfu.ca/ficticiouspath?auth=samplet...\n", + "3 notaurl\n", + "4 NaN\n", + "5 None\n", + "6 https://www.sfu.ca/ficticiouspath?auth=samplet...\n", + "7 \n", + "8 {'not_a_url': True}\n", + "9 2345678\n", + "10 345345345\n", + "11 https://www.sfu.ca/ficticiouspath?auth=samplet...\n", + "12 https://www.sfu.ca/ficticiouspath?auth=samplet..." + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "df = pd.DataFrame({\"messy_url\":\n", + " [\"random text which is not a url\",\n", + " \"http://www.facebookee.com/otherpath?auth=facebookeeauth&token=iwusdkc&not_token=hiThere&another_token=12323423\",\n", + " \"https://www.sfu.ca/ficticiouspath?auth=sampletoken1&studentid=1234&loc=van\", \n", + " \"notaurl\", \n", + " np.nan,\n", + " None, \n", + " \"https://www.sfu.ca/ficticiouspath?auth=sampletoken2&studentid=1230&loc=bur\", \n", + " \"\", \n", + " {\"not_a_url\" : True}, \n", + " \"2345678\", \n", + " 345345345,\n", + " \"https://www.sfu.ca/ficticiouspath?auth=sampletoken3&studentid=1231&loc=sur\",\n", + " \"https://www.sfu.ca/ficticiouspath?auth=sampletoken1&studentid=1232&loc=van\",\n", + " ]\n", + " })\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. default: `clean_url()`\n", + "By default, the parameters take the following values: `inplace = False`, `split = False`, `remove_auth = False`, `report = True`, `errors = \"coerce\"`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Url Cleaning report:\n", + " 5 values parsed (38.46 %)\n", + " 8 values unable to be parsed (61.54 %), set to NaN \n", + "Result contains parsed key-value pairs for 5 (38.46 %) rows (stored in column `messy_url_details`) and 8 null values(61.54 %).\n", + " \n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
messy_urlmessy_url_details
0random text which is not a urlNaN
1http://www.facebookee.com/otherpath?auth=faceb...{'scheme': 'http', 'host': 'www.facebookee.com...
2https://www.sfu.ca/ficticiouspath?auth=samplet...{'scheme': 'https', 'host': 'www.sfu.ca', 'mes...
3notaurlNaN
4NaNNaN
5NoneNaN
6https://www.sfu.ca/ficticiouspath?auth=samplet...{'scheme': 'https', 'host': 'www.sfu.ca', 'mes...
7NaN
8{'not_a_url': True}NaN
92345678NaN
10345345345NaN
11https://www.sfu.ca/ficticiouspath?auth=samplet...{'scheme': 'https', 'host': 'www.sfu.ca', 'mes...
12https://www.sfu.ca/ficticiouspath?auth=samplet...{'scheme': 'https', 'host': 'www.sfu.ca', 'mes...
\n", + "
" + ], + "text/plain": [ + " messy_url \\\n", + "0 random text which is not a url \n", + "1 http://www.facebookee.com/otherpath?auth=faceb... \n", + "2 https://www.sfu.ca/ficticiouspath?auth=samplet... \n", + "3 notaurl \n", + "4 NaN \n", + "5 None \n", + "6 https://www.sfu.ca/ficticiouspath?auth=samplet... \n", + "7 \n", + "8 {'not_a_url': True} \n", + "9 2345678 \n", + "10 345345345 \n", + "11 https://www.sfu.ca/ficticiouspath?auth=samplet... \n", + "12 https://www.sfu.ca/ficticiouspath?auth=samplet... \n", + "\n", + " messy_url_details \n", + "0 NaN \n", + "1 {'scheme': 'http', 'host': 'www.facebookee.com... \n", + "2 {'scheme': 'https', 'host': 'www.sfu.ca', 'mes... \n", + "3 NaN \n", + "4 NaN \n", + "5 NaN \n", + "6 {'scheme': 'https', 'host': 'www.sfu.ca', 'mes... \n", + "7 NaN \n", + "8 NaN \n", + "9 NaN \n", + "10 NaN \n", + "11 {'scheme': 'https', 'host': 'www.sfu.ca', 'mes... \n", + "12 {'scheme': 'https', 'host': 'www.sfu.ca', 'mes... " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from dataprep.clean import clean_url\n", + "df_default = clean_url(df, column=\"messy_url\")\n", + "df_default" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that a new column, `messy_url_details`, is created in the new dataframe `df_default`. Its name follows the convention `original_column_name`**_details** (`messy_url_details` in our case).\n", + "\n", + "Now let us see what one of the cells in `messy_url_details` looks like." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'scheme': 'http',\n", + " 'host': 'www.facebookee.com',\n", + " 'messy_url_clean': 'http://www.facebookee.com/otherpath',\n", + " 'queries': {'auth': 'facebookeeauth',\n", + " 'token': 'iwusdkc',\n", + " 'not_token': 'hiThere',\n", + " 'another_token': '12323423'}}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_default[\"messy_url_details\"][1]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. `remove_auth` parameter\n", + "\n", + "Sometimes we need to remove sensitive information when parsing a url. We can do this in the `clean_url()` function by setting the `remove_auth` parameter to `True`, or we can specify a list of parameters to be removed. Hence `remove_auth` can be a `boolean` value or a list of strings.\n", + "\n", + "When `remove_auth` is set to the boolean value of `True`, `clean_url()` looks for auth tokens based on the default list of token names (provided below) and removes them. When `remove_auth` is set to a list of strings, it takes the union of the user-provided list and the default list to create a new set of token names to be removed.\n", + "\n", + "`default_list = {\n", + " \"access_token\",\n", + " \"auth_key\",\n", + " \"auth\",\n", + " \"password\",\n", + " \"username\",\n", + " \"login\",\n", + " \"token\",\n", + " \"passcode\",\n", + " \"access-token\",\n", + " \"auth-key\",\n", + " \"authentication\",\n", + " \"authentication-key\",\n", + "}\n", + "`\n", + "\n", + "Let's have a look at the same dataframe and the two scenarios described above (by looking at the second row)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### a. 
`remove_auth = True` (boolean)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Url Cleaning report:\n", + " 5 values parsed (38.46 %)\n", + " 8 values unable to be parsed (61.54 %), set to NaN \n", + "Removed 6 auth queries from 5 rows\n", + "Result contains parsed key-value pairs for 5 (38.46 %) rows (stored in column `messy_url_details`) and 8 null values(61.54 %).\n", + " \n" + ] + }, + { + "data": { + "text/plain": [ + "{'scheme': 'http',\n", + " 'host': 'www.facebookee.com',\n", + " 'messy_url_clean': 'http://www.facebookee.com/otherpath',\n", + " 'queries': {'not_token': 'hiThere', 'another_token': '12323423'}}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_remove_auth_boolean = clean_url(df, column=\"messy_url\", remove_auth=True)\n", + "df_remove_auth_boolean[\"messy_url_details\"][1] " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we can see queries `auth` & `token` were removed from the result but `not_token` and `another_token` were included, this is because `auth` and `token` were specified in `default_list`. Also notice the **additional line** giving the stats on how many queries were removed from how many rows." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### b. 
`remove_auth` = list of strings" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Url Cleaning report:\n", + " 5 values parsed (38.46 %)\n", + " 8 values unable to be parsed (61.54 %), set to NaN \n", + "Removed 7 auth queries from 5 rows\n", + "Result contains parsed key-value pairs for 5 (38.46 %) rows (stored in column `messy_url_details`) and 8 null values(61.54 %).\n", + " \n" + ] + }, + { + "data": { + "text/plain": [ + "{'scheme': 'http',\n", + " 'host': 'www.facebookee.com',\n", + " 'messy_url_clean': 'http://www.facebookee.com/otherpath',\n", + " 'queries': {'not_token': 'hiThere'}}" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_remove_auth_list = clean_url(df, column=\"messy_url\", remove_auth=[\"another_token\"])\n", + "df_remove_auth_list[\"messy_url_details\"][1] " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we can see, the queries `auth`, `token` and `another_token` were removed but `not_token` was kept in the result. This is because the union of `default_list` and the user-defined list was used as the combined list of query names to remove." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. `split` Parameter\n", + "The `split` parameter adds individual columns containing all the extracted features to the given DataFrame." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Url Cleaning report:\n", + " 5 values parsed (38.46 %)\n", + " 8 values unable to be parsed (61.54 %), set to NaN \n", + "Result contains parsed values for 5(38.46 %) rows and 8 null values(61.54 %).\n", + " \n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
messy_urlschemehostmessy_url_cleanqueries
0random text which is not a urlNaNNaNNaNNaN
1http://www.facebookee.com/otherpath?auth=faceb...httpwww.facebookee.comhttp://www.facebookee.com/otherpath{'auth': 'facebookeeauth', 'token': 'iwusdkc',...
2https://www.sfu.ca/ficticiouspath?auth=samplet...httpswww.sfu.cahttps://www.sfu.ca/ficticiouspath{'auth': 'sampletoken1', 'studentid': '1234', ...
3notaurlNaNNaNNaNNaN
4NaNNaNNaNNaNNaN
5NoneNaNNaNNaNNaN
6https://www.sfu.ca/ficticiouspath?auth=samplet...httpswww.sfu.cahttps://www.sfu.ca/ficticiouspath{'auth': 'sampletoken2', 'studentid': '1230', ...
7NaNNaNNaNNaN
8{'not_a_url': True}NaNNaNNaNNaN
92345678NaNNaNNaNNaN
10345345345NaNNaNNaNNaN
11https://www.sfu.ca/ficticiouspath?auth=samplet...httpswww.sfu.cahttps://www.sfu.ca/ficticiouspath{'auth': 'sampletoken3', 'studentid': '1231', ...
12https://www.sfu.ca/ficticiouspath?auth=samplet...httpswww.sfu.cahttps://www.sfu.ca/ficticiouspath{'auth': 'sampletoken1', 'studentid': '1232', ...
\n", + "
" + ], + "text/plain": [ + " messy_url scheme \\\n", + "0 random text which is not a url NaN \n", + "1 http://www.facebookee.com/otherpath?auth=faceb... http \n", + "2 https://www.sfu.ca/ficticiouspath?auth=samplet... https \n", + "3 notaurl NaN \n", + "4 NaN NaN \n", + "5 None NaN \n", + "6 https://www.sfu.ca/ficticiouspath?auth=samplet... https \n", + "7 NaN \n", + "8 {'not_a_url': True} NaN \n", + "9 2345678 NaN \n", + "10 345345345 NaN \n", + "11 https://www.sfu.ca/ficticiouspath?auth=samplet... https \n", + "12 https://www.sfu.ca/ficticiouspath?auth=samplet... https \n", + "\n", + " host messy_url_clean \\\n", + "0 NaN NaN \n", + "1 www.facebookee.com http://www.facebookee.com/otherpath \n", + "2 www.sfu.ca https://www.sfu.ca/ficticiouspath \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "5 NaN NaN \n", + "6 www.sfu.ca https://www.sfu.ca/ficticiouspath \n", + "7 NaN NaN \n", + "8 NaN NaN \n", + "9 NaN NaN \n", + "10 NaN NaN \n", + "11 www.sfu.ca https://www.sfu.ca/ficticiouspath \n", + "12 www.sfu.ca https://www.sfu.ca/ficticiouspath \n", + "\n", + " queries \n", + "0 NaN \n", + "1 {'auth': 'facebookeeauth', 'token': 'iwusdkc',... \n", + "2 {'auth': 'sampletoken1', 'studentid': '1234', ... \n", + "3 NaN \n", + "4 NaN \n", + "5 NaN \n", + "6 {'auth': 'sampletoken2', 'studentid': '1230', ... \n", + "7 NaN \n", + "8 NaN \n", + "9 NaN \n", + "10 NaN \n", + "11 {'auth': 'sampletoken3', 'studentid': '1231', ... \n", + "12 {'auth': 'sampletoken1', 'studentid': '1232', ... " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_remove_split = clean_url(df, column=\"messy_url\", split=True)\n", + "df_remove_split" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. `inplace` parameter\n", + "The `inplace` parameter replaces the original column with `original_column_name`_details." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Url Cleaning report:\n", + " 5 values parsed (38.46 %)\n", + " 8 values unable to be parsed (61.54 %), set to NaN \n", + "Result contains parsed key-value pairs for 5 (38.46 %) rows (stored in column `messy_url_details`) and 8 null values(61.54 %).\n", + " \n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
messy_url_details
0NaN
1{'scheme': 'http', 'host': 'www.facebookee.com...
2{'scheme': 'https', 'host': 'www.sfu.ca', 'mes...
3NaN
4NaN
5NaN
6{'scheme': 'https', 'host': 'www.sfu.ca', 'mes...
7NaN
8NaN
9NaN
10NaN
11{'scheme': 'https', 'host': 'www.sfu.ca', 'mes...
12{'scheme': 'https', 'host': 'www.sfu.ca', 'mes...
\n", + "
" + ], + "text/plain": [ + " messy_url_details\n", + "0 NaN\n", + "1 {'scheme': 'http', 'host': 'www.facebookee.com...\n", + "2 {'scheme': 'https', 'host': 'www.sfu.ca', 'mes...\n", + "3 NaN\n", + "4 NaN\n", + "5 NaN\n", + "6 {'scheme': 'https', 'host': 'www.sfu.ca', 'mes...\n", + "7 NaN\n", + "8 NaN\n", + "9 NaN\n", + "10 NaN\n", + "11 {'scheme': 'https', 'host': 'www.sfu.ca', 'mes...\n", + "12 {'scheme': 'https', 'host': 'www.sfu.ca', 'mes..." + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_remove_inplace = clean_url(df, column=\"messy_url\", inplace=True)\n", + "df_remove_inplace" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. `split` and `inplace`\n", + "Replaces the original column with other columns based on the split parameters." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Url Cleaning report:\n", + " 5 values parsed (38.46 %)\n", + " 8 values unable to be parsed (61.54 %), set to NaN \n", + "Result contains parsed values for 5(38.46 %) rows and 8 null values(61.54 %).\n", + " \n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
schemehostmessy_url_cleanqueries
0NaNNaNNaNNaN
1httpwww.facebookee.comhttp://www.facebookee.com/otherpath{'auth': 'facebookeeauth', 'token': 'iwusdkc',...
2httpswww.sfu.cahttps://www.sfu.ca/ficticiouspath{'auth': 'sampletoken1', 'studentid': '1234', ...
3NaNNaNNaNNaN
4NaNNaNNaNNaN
5NaNNaNNaNNaN
6httpswww.sfu.cahttps://www.sfu.ca/ficticiouspath{'auth': 'sampletoken2', 'studentid': '1230', ...
7NaNNaNNaNNaN
8NaNNaNNaNNaN
9NaNNaNNaNNaN
10NaNNaNNaNNaN
11httpswww.sfu.cahttps://www.sfu.ca/ficticiouspath{'auth': 'sampletoken3', 'studentid': '1231', ...
12httpswww.sfu.cahttps://www.sfu.ca/ficticiouspath{'auth': 'sampletoken1', 'studentid': '1232', ...
\n", + "
" + ], + "text/plain": [ + " scheme host messy_url_clean \\\n", + "0 NaN NaN NaN \n", + "1 http www.facebookee.com http://www.facebookee.com/otherpath \n", + "2 https www.sfu.ca https://www.sfu.ca/ficticiouspath \n", + "3 NaN NaN NaN \n", + "4 NaN NaN NaN \n", + "5 NaN NaN NaN \n", + "6 https www.sfu.ca https://www.sfu.ca/ficticiouspath \n", + "7 NaN NaN NaN \n", + "8 NaN NaN NaN \n", + "9 NaN NaN NaN \n", + "10 NaN NaN NaN \n", + "11 https www.sfu.ca https://www.sfu.ca/ficticiouspath \n", + "12 https www.sfu.ca https://www.sfu.ca/ficticiouspath \n", + "\n", + " queries \n", + "0 NaN \n", + "1 {'auth': 'facebookeeauth', 'token': 'iwusdkc',... \n", + "2 {'auth': 'sampletoken1', 'studentid': '1234', ... \n", + "3 NaN \n", + "4 NaN \n", + "5 NaN \n", + "6 {'auth': 'sampletoken2', 'studentid': '1230', ... \n", + "7 NaN \n", + "8 NaN \n", + "9 NaN \n", + "10 NaN \n", + "11 {'auth': 'sampletoken3', 'studentid': '1231', ... \n", + "12 {'auth': 'sampletoken1', 'studentid': '1232', ... " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_remove_inplace_split = clean_url(df, column=\"messy_url\", inplace=True, split=True)\n", + "df_remove_inplace_split" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. `errors` parameter\n", + "* \"coerce\" (default), then invalid parsing will be set as NaN\n", + "* \"ignore\", then invalid parsing will return the input\n", + "* \"raise\", then invalid parsing will raise an exception" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### a. \"coerce\" (default)\n", + "This is the default value of the parameters, this sets the invalid parsing to NaN." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Url Cleaning report:\n", + " 5 values parsed (38.46 %)\n", + " 8 values unable to be parsed (61.54 %), set to NaN \n", + "Result contains parsed key-value pairs for 5 (38.46 %) rows (stored in column `messy_url_details`) and 8 null values(61.54 %).\n", + " \n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
messy_urlmessy_url_details
0random text which is not a urlNaN
1http://www.facebookee.com/otherpath?auth=faceb...{'scheme': 'http', 'host': 'www.facebookee.com...
2https://www.sfu.ca/ficticiouspath?auth=samplet...{'scheme': 'https', 'host': 'www.sfu.ca', 'mes...
3notaurlNaN
4NaNNaN
5NoneNaN
6https://www.sfu.ca/ficticiouspath?auth=samplet...{'scheme': 'https', 'host': 'www.sfu.ca', 'mes...
7NaN
8{'not_a_url': True}NaN
92345678NaN
10345345345NaN
11https://www.sfu.ca/ficticiouspath?auth=samplet...{'scheme': 'https', 'host': 'www.sfu.ca', 'mes...
12https://www.sfu.ca/ficticiouspath?auth=samplet...{'scheme': 'https', 'host': 'www.sfu.ca', 'mes...
\n", + "
" + ], + "text/plain": [ + " messy_url \\\n", + "0 random text which is not a url \n", + "1 http://www.facebookee.com/otherpath?auth=faceb... \n", + "2 https://www.sfu.ca/ficticiouspath?auth=samplet... \n", + "3 notaurl \n", + "4 NaN \n", + "5 None \n", + "6 https://www.sfu.ca/ficticiouspath?auth=samplet... \n", + "7 \n", + "8 {'not_a_url': True} \n", + "9 2345678 \n", + "10 345345345 \n", + "11 https://www.sfu.ca/ficticiouspath?auth=samplet... \n", + "12 https://www.sfu.ca/ficticiouspath?auth=samplet... \n", + "\n", + " messy_url_details \n", + "0 NaN \n", + "1 {'scheme': 'http', 'host': 'www.facebookee.com... \n", + "2 {'scheme': 'https', 'host': 'www.sfu.ca', 'mes... \n", + "3 NaN \n", + "4 NaN \n", + "5 NaN \n", + "6 {'scheme': 'https', 'host': 'www.sfu.ca', 'mes... \n", + "7 NaN \n", + "8 NaN \n", + "9 NaN \n", + "10 NaN \n", + "11 {'scheme': 'https', 'host': 'www.sfu.ca', 'mes... \n", + "12 {'scheme': 'https', 'host': 'www.sfu.ca', 'mes... " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_remove_errors_default = clean_url(df, column=\"messy_url\")\n", + "df_remove_errors_default" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### b. \"ignore\"\n", + "This sets the value of invalid parsing as the input." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Url Cleaning report:\n", + " 5 values parsed (38.46 %)\n", + " 8 values unable to be parsed (61.54 %), set to their original values \n", + "Result contains parsed key-value pairs for 5 (38.46 %) rows (stored in column `messy_url_details`) and 8 null / not parsable values(61.54 %).\n", + " \n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
messy_urlmessy_url_details
0random text which is not a urlrandom text which is not a url
1http://www.facebookee.com/otherpath?auth=faceb...{'scheme': 'http', 'host': 'www.facebookee.com...
2https://www.sfu.ca/ficticiouspath?auth=samplet...{'scheme': 'https', 'host': 'www.sfu.ca', 'mes...
3notaurlnotaurl
4NaNNaN
5NoneNone
6https://www.sfu.ca/ficticiouspath?auth=samplet...{'scheme': 'https', 'host': 'www.sfu.ca', 'mes...
7
8{'not_a_url': True}{'not_a_url': True}
923456782345678
10345345345345345345
11https://www.sfu.ca/ficticiouspath?auth=samplet...{'scheme': 'https', 'host': 'www.sfu.ca', 'mes...
12https://www.sfu.ca/ficticiouspath?auth=samplet...{'scheme': 'https', 'host': 'www.sfu.ca', 'mes...
\n", + "
" + ], + "text/plain": [ + " messy_url \\\n", + "0 random text which is not a url \n", + "1 http://www.facebookee.com/otherpath?auth=faceb... \n", + "2 https://www.sfu.ca/ficticiouspath?auth=samplet... \n", + "3 notaurl \n", + "4 NaN \n", + "5 None \n", + "6 https://www.sfu.ca/ficticiouspath?auth=samplet... \n", + "7 \n", + "8 {'not_a_url': True} \n", + "9 2345678 \n", + "10 345345345 \n", + "11 https://www.sfu.ca/ficticiouspath?auth=samplet... \n", + "12 https://www.sfu.ca/ficticiouspath?auth=samplet... \n", + "\n", + " messy_url_details \n", + "0 random text which is not a url \n", + "1 {'scheme': 'http', 'host': 'www.facebookee.com... \n", + "2 {'scheme': 'https', 'host': 'www.sfu.ca', 'mes... \n", + "3 notaurl \n", + "4 NaN \n", + "5 None \n", + "6 {'scheme': 'https', 'host': 'www.sfu.ca', 'mes... \n", + "7 \n", + "8 {'not_a_url': True} \n", + "9 2345678 \n", + "10 345345345 \n", + "11 {'scheme': 'https', 'host': 'www.sfu.ca', 'mes... \n", + "12 {'scheme': 'https', 'host': 'www.sfu.ca', 'mes... " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_remove_errors_ignore = clean_url(df, column=\"messy_url\", errors=\"ignore\")\n", + "df_remove_errors_ignore " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### c. \"raise\"\n", + "This will raise a value error when it encounters an invalid parsing value." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "Unable to parse value random text which is not a url", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf_remove_errors_raise\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mclean_url\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumn\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"messy_url\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"raise\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mdf_remove_errors_raise\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/dataprep/dataprep/clean/clean_url.py\u001b[0m in \u001b[0;36mclean_url\u001b[0;34m(df, column, inplace, split, remove_auth, report, errors)\u001b[0m\n\u001b[1;32m 110\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mformat_url\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolumn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msplit\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mremove_auth\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmeta\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmeta\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 111\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 112\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mnrows\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdask\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompute\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 113\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 114\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/dask/base.py\u001b[0m in \u001b[0;36mcompute\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 445\u001b[0m \u001b[0mpostcomputes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__dask_postcompute__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 446\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 447\u001b[0;31m \u001b[0mresults\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mschedule\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdsk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkeys\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 448\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mrepack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ma\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresults\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mpostcomputes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 449\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/dask/threaded.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(dsk, result, cache, num_workers, pool, **kwargs)\u001b[0m\n\u001b[1;32m 82\u001b[0m \u001b[0mget_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0m_thread_get_id\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 83\u001b[0m \u001b[0mpack_exception\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpack_exception\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 84\u001b[0;31m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 85\u001b[0m )\n\u001b[1;32m 86\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/dask/local.py\u001b[0m in \u001b[0;36mget_async\u001b[0;34m(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)\u001b[0m\n\u001b[1;32m 484\u001b[0m \u001b[0m_execute_task\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtask\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# Re-execute locally\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 485\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 486\u001b[0;31m \u001b[0mraise_exception\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 487\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mworker_id\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mloads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mres_info\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 488\u001b[0m \u001b[0mstate\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"cache\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/dask/local.py\u001b[0m in \u001b[0;36mreraise\u001b[0;34m(exc, tb)\u001b[0m\n\u001b[1;32m 314\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__traceback__\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mtb\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 315\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwith_traceback\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 316\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 317\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 318\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/dask/local.py\u001b[0m in \u001b[0;36mexecute_task\u001b[0;34m(key, task_info, dumps, loads, get_id, pack_exception)\u001b[0m\n\u001b[1;32m 220\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 221\u001b[0m \u001b[0mtask\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mloads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtask_info\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 222\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0m_execute_task\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtask\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 223\u001b[0m \u001b[0mid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_id\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 224\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdumps\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/dask/core.py\u001b[0m in \u001b[0;36m_execute_task\u001b[0;34m(arg, cache, dsk)\u001b[0m\n\u001b[1;32m 119\u001b[0m \u001b[0;31m# temporaries by their reference count and can execute certain\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 120\u001b[0m \u001b[0;31m# operations in-place.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 121\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_execute_task\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcache\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0ma\u001b[0m \u001b[0;32min\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 122\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mishashable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 123\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + 
"\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/dask/optimization.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 1020\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minkeys\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1021\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Expected %d args, got %d\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minkeys\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1022\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdsk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minkeys\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1023\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1024\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__reduce__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + 
"\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/dask/core.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(dsk, out, cache)\u001b[0m\n\u001b[1;32m 149\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtoposort\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdsk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 150\u001b[0m \u001b[0mtask\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdsk\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 151\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_execute_task\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtask\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcache\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 152\u001b[0m \u001b[0mcache\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 153\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_execute_task\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcache\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/dask/core.py\u001b[0m in \u001b[0;36m_execute_task\u001b[0;34m(arg, cache, dsk)\u001b[0m\n\u001b[1;32m 119\u001b[0m \u001b[0;31m# temporaries by their reference count and can execute certain\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 120\u001b[0m \u001b[0;31m# operations in-place.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 121\u001b[0;31m \u001b[0;32mreturn\u001b[0m 
\u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_execute_task\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcache\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0ma\u001b[0m \u001b[0;32min\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 122\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mishashable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 123\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/dask/utils.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(func, args, kwargs)\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 31\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 32\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + 
"\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/dask/dataframe/core.py\u001b[0m in \u001b[0;36mapply_and_enforce\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 5256\u001b[0m \u001b[0mfunc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"_func\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5257\u001b[0m \u001b[0mmeta\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"_meta\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5258\u001b[0;31m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5259\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_dataframe_like\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mis_series_like\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mis_index_like\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5260\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/dask/utils.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, obj, *args, **kwargs)\u001b[0m\n\u001b[1;32m 893\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 894\u001b[0m \u001b[0;32mdef\u001b[0m 
\u001b[0m__call__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 895\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 896\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 897\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__reduce__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, func, axis, raw, result_type, args, **kwds)\u001b[0m\n\u001b[1;32m 6876\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6877\u001b[0m )\n\u001b[0;32m-> 6878\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mop\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_result\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6879\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6880\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mapplymap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;34m\"DataFrame\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 
+ "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pandas/core/apply.py\u001b[0m in \u001b[0;36mget_result\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 184\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_raw\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 185\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 186\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_standard\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 187\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 188\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mapply_empty_result\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pandas/core/apply.py\u001b[0m in \u001b[0;36mapply_standard\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 294\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 295\u001b[0m result = libreduction.compute_reduction(\n\u001b[0;32m--> 296\u001b[0;31m \u001b[0mvalues\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdummy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdummy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlabels\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 297\u001b[0m )\n\u001b[1;32m 298\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mValueError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + 
"\u001b[0;32mpandas/_libs/reduction.pyx\u001b[0m in \u001b[0;36mpandas._libs.reduction.compute_reduction\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32mpandas/_libs/reduction.pyx\u001b[0m in \u001b[0;36mpandas._libs.reduction.Reducer.get_result\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pandas/core/apply.py\u001b[0m in \u001b[0;36mf\u001b[0;34m(x)\u001b[0m\n\u001b[1;32m 111\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 113\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 114\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/dataprep/dataprep/clean/clean_url.py\u001b[0m in \u001b[0;36mformat_url\u001b[0;34m(row, column, split, remove_auth, errors)\u001b[0m\n\u001b[1;32m 133\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 134\u001b[0m val_dict = get_url_params(\n\u001b[0;32m--> 135\u001b[0;31m \u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcolumn\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msplit\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mremove_auth\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mremove_auth\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumn\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcolumn\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 136\u001b[0m )\n\u001b[1;32m 137\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34mf\"{column}_details\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mval_dict\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/dataprep/dataprep/clean/clean_url.py\u001b[0m in \u001b[0;36mget_url_params\u001b[0;34m(url, column, split, remove_auth, errors)\u001b[0m\n\u001b[1;32m 152\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 153\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0merrors\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"raise\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 154\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Unable to parse value {url}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 155\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0murl\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0merrors\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"ignore\"\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnan\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 156\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mValueError\u001b[0m: Unable to parse value random text which is not a url" + ] + } + ], + "source": [ + "df_remove_errors_raise = clean_url(df, column=\"messy_url\", errors=\"raise\")\n", + "df_remove_errors_raise " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. Report Parameter\n", + "The `report` parameter defaults to `True`. When it is set to `False`, the report summarizing the cleaning operations is not printed." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
messy_urlmessy_url_details
0random text which is not a urlNaN
1http://www.facebookee.com/otherpath?auth=faceb...{'scheme': 'http', 'host': 'www.facebookee.com...
2https://www.sfu.ca/ficticiouspath?auth=samplet...{'scheme': 'https', 'host': 'www.sfu.ca', 'mes...
3notaurlNaN
4NaNNaN
5NoneNaN
6https://www.sfu.ca/ficticiouspath?auth=samplet...{'scheme': 'https', 'host': 'www.sfu.ca', 'mes...
7NaN
8{'not_a_url': True}NaN
92345678NaN
10345345345NaN
11https://www.sfu.ca/ficticiouspath?auth=samplet...{'scheme': 'https', 'host': 'www.sfu.ca', 'mes...
12https://www.sfu.ca/ficticiouspath?auth=samplet...{'scheme': 'https', 'host': 'www.sfu.ca', 'mes...
\n", + "
" + ], + "text/plain": [ + " messy_url \\\n", + "0 random text which is not a url \n", + "1 http://www.facebookee.com/otherpath?auth=faceb... \n", + "2 https://www.sfu.ca/ficticiouspath?auth=samplet... \n", + "3 notaurl \n", + "4 NaN \n", + "5 None \n", + "6 https://www.sfu.ca/ficticiouspath?auth=samplet... \n", + "7 \n", + "8 {'not_a_url': True} \n", + "9 2345678 \n", + "10 345345345 \n", + "11 https://www.sfu.ca/ficticiouspath?auth=samplet... \n", + "12 https://www.sfu.ca/ficticiouspath?auth=samplet... \n", + "\n", + " messy_url_details \n", + "0 NaN \n", + "1 {'scheme': 'http', 'host': 'www.facebookee.com... \n", + "2 {'scheme': 'https', 'host': 'www.sfu.ca', 'mes... \n", + "3 NaN \n", + "4 NaN \n", + "5 NaN \n", + "6 {'scheme': 'https', 'host': 'www.sfu.ca', 'mes... \n", + "7 NaN \n", + "8 NaN \n", + "9 NaN \n", + "10 NaN \n", + "11 {'scheme': 'https', 'host': 'www.sfu.ca', 'mes... \n", + "12 {'scheme': 'https', 'host': 'www.sfu.ca', 'mes... " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_remove_auth_boolean = clean_url(df, column=\"messy_url\", remove_auth=True, report=False)\n", + "df_remove_auth_boolean" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 8. `validate_url()`\n", + "`validate_url()` returns `True` when the input is a valid URL and `False` otherwise. It accepts either a single value or a column of values."
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "False\n", + "False\n", + "True\n", + "True\n" + ] + } + ], + "source": [ + "from dataprep.clean import validate_url\n", + "print(validate_url({\"not_a_url\" : True}))\n", + "print(validate_url(2346789))\n", + "print(validate_url(\"https://www.sfu.ca/ficticiouspath?auth=sampletoken3&studentid=1231&loc=sur\"))\n", + "print(validate_url(\"http://www.facebookee.com/otherpath?auth=facebookeeauth&token=iwusdkc&nottoken=hiThere&another_token=12323423\"))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
messy_urlvalidate_url
0random text which is not a urlFalse
1http://www.facebookee.com/otherpath?auth=faceb...True
2https://www.sfu.ca/ficticiouspath?auth=samplet...True
3notaurlFalse
4NaNFalse
5NoneFalse
6https://www.sfu.ca/ficticiouspath?auth=samplet...True
7False
8{'not_a_url': True}False
92345678False
10345345345False
11https://www.sfu.ca/ficticiouspath?auth=samplet...True
12https://www.sfu.ca/ficticiouspath?auth=samplet...True
\n", + "
" + ], + "text/plain": [ + " messy_url validate_url\n", + "0 random text which is not a url False\n", + "1 http://www.facebookee.com/otherpath?auth=faceb... True\n", + "2 https://www.sfu.ca/ficticiouspath?auth=samplet... True\n", + "3 notaurl False\n", + "4 NaN False\n", + "5 None False\n", + "6 https://www.sfu.ca/ficticiouspath?auth=samplet... True\n", + "7 False\n", + "8 {'not_a_url': True} False\n", + "9 2345678 False\n", + "10 345345345 False\n", + "11 https://www.sfu.ca/ficticiouspath?auth=samplet... True\n", + "12 https://www.sfu.ca/ficticiouspath?auth=samplet... True" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame({\"messy_url\":\n", + " [\"random text which is not a url\",\n", + " \"http://www.facebookee.com/otherpath?auth=facebookeeauth&token=iwusdkc&nottoken=hiThere&another_token=12323423\",\n", + " \"https://www.sfu.ca/ficticiouspath?auth=sampletoken1&studentid=1234&loc=van\", \n", + " \"notaurl\", \n", + " np.nan,\n", + " None, \n", + " \"https://www.sfu.ca/ficticiouspath?auth=sampletoken2&studentid=1230&loc=bur\", \n", + " \"\", \n", + " {\"not_a_url\" : True}, \n", + " \"2345678\", \n", + " 345345345,\n", + " \"https://www.sfu.ca/ficticiouspath?auth=sampletoken3&studentid=1231&loc=sur\",\n", + " \"https://www.sfu.ca/ficticiouspath?auth=sampletoken1&studentid=1232&loc=van\",\n", + " ]\n", + " })\n", + "\n", + "df[\"validate_url\"] = validate_url(df[\"messy_url\"])\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat":
4, + "nbformat_minor": 4 +} diff --git a/docs/source/user_guide/clean/introduction.ipynb b/docs/source/user_guide/clean/introduction.ipynb index f3d5a19ec..9028239c1 100644 --- a/docs/source/user_guide/clean/introduction.ipynb +++ b/docs/source/user_guide/clean/introduction.ipynb @@ -27,8 +27,16 @@ " * [clean_lat_long(): geographic coordinates](clean_lat_long.ipynb)\n", " * [clean_email(): email addresses](clean_email.ipynb)\n", " * [clean_country(): countries](clean_country.ipynb)\n", - " * [clean_phone(): phone numbers](clean_phone.ipynb)" + " * [clean_phone(): phone numbers](clean_phone.ipynb)\n", + " * [clean_url(): URLs](clean_url.ipynb)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -48,7 +56,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.6.5" } }, "nbformat": 4,