From 65436b06a5cc6d3bf83e4e5d48abadc6e53792dc Mon Sep 17 00:00:00 2001 From: atol Date: Thu, 22 Apr 2021 17:15:49 -0700 Subject: [PATCH] docs(clean): add documentation for clean_text --- dataprep/clean/clean_text.py | 7 +- docs/source/user_guide/clean/clean_text.ipynb | 1147 +++++++++++++++++ .../user_guide/clean/introduction.ipynb | 3 +- 3 files changed, 1155 insertions(+), 2 deletions(-) create mode 100644 docs/source/user_guide/clean/clean_text.ipynb diff --git a/dataprep/clean/clean_text.py b/dataprep/clean/clean_text.py index b96c7ad9e..8dd5bf68f 100644 --- a/dataprep/clean/clean_text.py +++ b/dataprep/clean/clean_text.py @@ -37,6 +37,8 @@ def clean_text( """ Clean text data in a DataFrame column. + Read more in the :ref:`User Guide `. + Parameters ---------- df @@ -45,7 +47,8 @@ def clean_text( The name of the column containing text data. pipeline A list of cleaning functions to be applied to the column. If None, - use the default pipeline. + use the default pipeline. See the :ref:`User Guide ` + for more information on customizing the pipeline. (default: None) stopwords @@ -81,6 +84,8 @@ def default_text_pipeline() -> List[Dict[str, Any]]: Return a list of dictionaries representing the functions in the default pipeline. Use as a template for creating a custom pipeline. + Read more in the :ref:`User Guide `. + Examples -------- >>> default_text_pipeline() diff --git a/docs/source/user_guide/clean/clean_text.ipynb b/docs/source/user_guide/clean/clean_text.ipynb new file mode 100644 index 000000000..e5ef81f93 --- /dev/null +++ b/docs/source/user_guide/clean/clean_text.ipynb @@ -0,0 +1,1147 @@ +{ + "cells": [ + { + "cell_type": "raw", + "metadata": { + "raw_mimetype": "text/restructuredtext" + }, + "source": [ + ".. _clean_text_user_guide:\n", + "\n", + "Text\n", + "====" + ] + }, + { + "cell_type": "raw", + "metadata": { + "raw_mimetype": "text/restructuredtext" + }, + "source": [ + "Introduction\n", + "------------\n", + "\n", + "The function :func:`clean_text() ` cleans text data in a DataFrame column." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using a default or customized pipeline, the function performs a series of cleaning operations on the data.\n", + "\n", + "The following sections demonstrate the functionality of `clean_text()`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### An example dirty dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "pd.set_option(\"display.max_colwidth\", None)\n", + "\n", + "df = pd.DataFrame(\n", + " {\n", + " \"text\": [\n", + " \"'ZZZZZ!' If IMDb would allow one-word reviews, that's what mine would be.\",\n", + " \"The cast played Shakespeare.

Shakespeare lost.\",\n", + " \"Simon of the Desert (Simón del desierto) is a 1965 film directed by Luis Buñuel.\",\n", + " \"[SPOILERS]\\nI don't think I've seen a film this bad before {acting, script, effects (!), etc...}\",\n", + " \"Cannes 1968:\\tA video essay\",\n", + " \"Recap thread for @RottenTomatoes excellent panel, hosted by @ErikDavis with @FilmFatale_NYC and @AshCrossan.\",\n", + " \"#GameOfThrones: Season 8 is #Rotten at 54% on the #Tomatometer. But does it deserve to be?\",\n", + " \"Come join and share your thoughts on this week's episode: https://twitter.com/i/spaces/1fake2URL3\",\n", + " 123,\n", + " np.nan,\n", + " \"NULL\",\n", + " ]\n", + " }\n", + ")\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Default `clean_text()`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The default pipeline for the `clean_text()` function is the following:" + ] + }, + { + "cell_type": "raw", + "metadata": { + "raw_mimetype": "text/restructuredtext" + }, + "source": [ + "1. :ref:`fillna `: Replace all null values with `NaN`.\n", + "2. :ref:`lowercase `: Convert all characters to lowercase.\n", + "3. :ref:`remove_digits `: Remove numbers.\n", + "4. :ref:`remove_html ` Remove HTML tags.\n", + "5. :ref:`remove_urls `: Remove URLs.\n", + "6. :ref:`remove_punctuation `: Remove punctuation marks.\n", + "7. :ref:`remove_accents `: Remove accent marks.\n", + "8. :ref:`remove_stopwords `: Remove stopwords.\n", + "9. :ref:`remove_whitespace `: Remove extra spaces, and tabs and newlines." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dataprep.clean import clean_text\n", + "clean_text(df, \"text\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By default, the stopwords removed are the set of words in NLTK's English stopwords. To remove a different set of words, pass the set into the `stopwords` parameter." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dataprep.clean import clean_text\n", + "clean_text(df, \"text\", stopwords={\"imdb\", \"film\"})" + ] + }, + { + "cell_type": "raw", + "metadata": { + "raw_mimetype": "text/restructuredtext" + }, + "source": [ + ".. _clean_text_custom_pipeline:\n", + "\n", + "2. Custom pipeline\n", + "------------------" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Users can pass in a custom pipeline to `clean_text()` using the `pipeline` parameter." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [\n", + " {\"operator\": \"lowercase\"},\n", + " {\"operator\": \"remove_digits\"},\n", + " {\"operator\": \"remove_whitespace\"},\n", + "]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Users can also define and pass in their own functions using the `pipeline` parameter." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "\n", + "def split(text: str) -> str:\n", + " return str(text).split()\n", + "\n", + "def replace_z(text: str, value: str) -> str:\n", + " return re.sub(r\"z\", value, str(text), flags=re.I)\n", + "\n", + "custom_pipeline = [\n", + " {\"operator\": \"lowercase\"},\n", + " {\"operator\": \"remove_digits\"},\n", + " {\"operator\": split},\n", + " {\"operator\": replace_z, \"parameters\": {\"value\": \"*\"}},\n", + " {\"operator\": \"remove_whitespace\"},\n", + "]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In general, custom pipelines can be defined using the form:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [\n", + " {\n", + " \"operator\": \"\",\n", + " \"parameters\": {\"\": \"\"},\n", + " }\n", + "]" + ] + }, + { + "cell_type": "raw", + "metadata": { + "raw_mimetype": "text/restructuredtext" + }, + "source": [ + "To get the default pipeline in the form of a list, call :func:`default_text_pipeline() `." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This can be used as a template to build a list of cleaning operations to be passed into the `pipeline` parameter." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dataprep.clean import default_text_pipeline\n", + "default_text_pipeline()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Built-in functions\n", + "\n", + "This section demonstrates the built-in cleaning operations which can be called using the `pipeline` parameter.\n", + "\n", + "`clean_text()` assumes the DataFrame column contains text data. As such, any `int` values will be cast to `str` after applying a cleaning function." + ] + }, + { + "cell_type": "raw", + "metadata": { + "raw_mimetype": "text/restructuredtext" + }, + "source": [ + ".. _fillna:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### fillna\n", + "\n", + "By default, `fillna` replaces all null values with `NaN`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [{\"operator\": \"fillna\"}]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To specify a specific value to replace null values, use the `value` parameter." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [{\"operator\": \"fillna\", \"parameters\": {\"value\": \"\"}}]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "raw_mimetype": "text/restructuredtext" + }, + "source": [ + ".. _lowercase:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### lowercase\n", + "\n", + "Convert all characters to lowercase." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [{\"operator\": \"lowercase\"}]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### sentence_case\n", + "\n", + "Convert the first character of the string to uppercase and all remaining characters to lowercase." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [{\"operator\": \"sentence_case\"}]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### title_case\n", + "\n", + "Convert the first character of each word to uppercase and the remaining words to lowercase." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [{\"operator\": \"title_case\"}]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### uppercase\n", + "\n", + "Convert all characters to uppercase." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [{\"operator\": \"uppercase\"}]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "raw_mimetype": "text/restructuredtext" + }, + "source": [ + ".. _remove_accents:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### remove_accents\n", + "\n", + "Remove accents (diacritic marks) from the text." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [{\"operator\": \"remove_accents\"}]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### remove_bracketed\n", + "\n", + "Remove text between brackets.\n", + "\n", + "The style of the brackets can be specified using the `brackets` parameter:\n", + "\n", + "* \"angle\": `<>`\n", + "* \"curly\": `{}`\n", + "* \"round\": `()`\n", + "* \"square\": `[]`\n", + "\n", + "By default, the `inclusive` parameter is set to True and the brackets are removed along with the text in between." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [\n", + " {\"operator\": \"remove_bracketed\", \"parameters\": {\"brackets\": \"round\"}}\n", + "]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To remove the text but keep the brackets, set `inclusive` to False." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [\n", + " {\n", + " \"operator\": \"remove_bracketed\",\n", + " \"parameters\": {\"brackets\": \"round\", \"inclusive\": False},\n", + " }\n", + "]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `brackets` parameter can also take in a set, which allows multiple bracket styles to be specified at a time." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [\n", + " {\n", + " \"operator\": \"remove_bracketed\",\n", + " \"parameters\": {\"brackets\": {\"angle\", \"curly\", \"round\", \"square\"}},\n", + " }\n", + "]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "raw_mimetype": "text/restructuredtext" + }, + "source": [ + ".. _remove_digits:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### remove_digits\n", + "\n", + "Remove all digits." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [{\"operator\": \"remove_digits\"}]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "raw_mimetype": "text/restructuredtext" + }, + "source": [ + ".. _remove_html:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### remove_html\n", + "\n", + "Remove HTML tags, including the non-breaking space ` `." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "custom_pipeline = [{\"operator\": \"remove_html\"}]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### remove_prefixed\n", + "\n", + "Remove substrings that start with the prefix(es) specified in the `prefix` parameter. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "custom_pipeline = [{\"operator\": \"remove_prefixed\", \"parameters\": {\"prefix\": \"#\"}}]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To specify multiple prefixes, pass in a set of the prefixes to the `prefix` parameter." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [\n", + " {\"operator\": \"remove_prefixed\", \"parameters\": {\"prefix\": {\"#\", \"@\"}}}\n", + "]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "raw_mimetype": "text/restructuredtext" + }, + "source": [ + ".. _remove_punctuation:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### remove_puncutation\n", + "\n", + "Remove all punctuation marks defined in Python's `string.punctuation`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [{\"operator\": \"remove_punctuation\"}]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "raw_mimetype": "text/restructuredtext" + }, + "source": [ + ".. _remove_stopwords:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### remove_stopwords\n", + "\n", + "Remove common words. By default, the set of stopwords to remove is NLTK's English stopwords." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [{\"operator\": \"remove_stopwords\"}]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use a custom set of words, pass the set into the `stopwords` parameter." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [\n", + " {\"operator\": \"remove_stopwords\", \"parameters\": {\"stopwords\": {\"imdb\", \"film\"}}}\n", + "]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively, expand upon the default set of stopwords by importing `dataprep.assets.english_stopwords` and adding custom words." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dataprep.assets.english_stopwords import english_stopwords\n", + "custom_stopwords = english_stopwords.copy()\n", + "custom_stopwords.add(\"imdb\")\n", + "custom_stopwords.add(\"film\")\n", + "\n", + "custom_pipeline = [\n", + " {\n", + " \"operator\": \"remove_stopwords\",\n", + " \"parameters\": {\"stopwords\": custom_stopwords},\n", + " }\n", + "]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "raw_mimetype": "text/restructuredtext" + }, + "source": [ + ".. _remove_urls:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### remove_urls\n", + "\n", + "Remove URLs. Substrings that start with \"http\" or \"www\" are considered URLs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [{\"operator\": \"remove_urls\"}]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "raw_mimetype": "text/restructuredtext" + }, + "source": [ + ".. _remove_whitespace:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### remove_whitespace\n", + "\n", + "Remove extra spaces (two or more) along with tabs and newlines. Leading and trailing spaces are also removed." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [{\"operator\": \"remove_whitespace\"}]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### replace_bracketed\n", + "\n", + "Replace text between brackets with the `value`.\n", + "\n", + "The style of the brackets can be specified using the `brackets` parameter:\n", + "\n", + "* \"angle\": `<>`\n", + "* \"curly\": `{}`\n", + "* \"round\": `()`\n", + "* \"square\": `[]`\n", + "\n", + "By default, the `inclusive` parameter is set to True and the brackets are also replaced by the `value` along with the text in between." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [\n", + " {\n", + " \"operator\": \"replace_bracketed\",\n", + " \"parameters\": {\"brackets\": \"square\", \"value\": \"**SPOILERS**\"},\n", + " }\n", + "]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To replace the text, but keep the brackets, set `inclusive` to False." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [\n", + " {\n", + " \"operator\": \"replace_bracketed\",\n", + " \"parameters\": {\n", + " \"brackets\": \"square\",\n", + " \"value\": \"**SPOILERS**\",\n", + " \"inclusive\": False,\n", + " },\n", + " }\n", + "]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `brackets` parameter can also take in a set, which allows multiple bracket styles to be specified at a time." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [\n", + " {\n", + " \"operator\": \"replace_bracketed\",\n", + " \"parameters\": {\n", + " \"brackets\": {\"angle\", \"curly\", \"round\", \"square\"},\n", + " \"value\": \"\",\n", + " },\n", + " }\n", + "]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To assign different replacement values to different bracket styles, chain together `replace_bracketed` operations." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [\n", + " {\n", + " \"operator\": \"replace_bracketed\",\n", + " \"parameters\": {\n", + " \"brackets\": \"square\",\n", + " \"value\": \"**SPOILERS**\",\n", + " },\n", + " },\n", + " {\n", + " \"operator\": \"replace_bracketed\",\n", + " \"parameters\": {\n", + " \"brackets\": \"curly\",\n", + " \"value\": \"in every aspect.\",\n", + " },\n", + " },\n", + "]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### replace_digits\n", + "\n", + "Replace all digits with the `value`. By default, the `block` parameter is set to True and only blocks of digits, i.e. tokens composed solely of numbers, are removed." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [{\"operator\": \"replace_digits\", \"parameters\": {\"value\": \"X\"}}]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To replace all digits appearing in the text, set `block` to False." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [\n", + " {\"operator\": \"replace_digits\", \"parameters\": {\"value\": \"X\", \"block\": False}}\n", + "]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### replace_prefixed\n", + "\n", + "Replace all substrings that start with the prefix(es) specified in the `prefix` parameter with the `value`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [\n", + " {\n", + " \"operator\": \"replace_prefixed\",\n", + " \"parameters\": {\"prefix\": \"#\", \"value\": \"\"},\n", + " }\n", + "]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To replace substrings of different prefixes with the same `value`, pass in a set of the prefixes to the `prefix` parameter." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [\n", + " {\n", + " \"operator\": \"replace_prefixed\",\n", + " \"parameters\": {\"prefix\": {\"#\", \"@\"}, \"value\": \"\"},\n", + " }\n", + "]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To replace different prefixed substrings with different values, chain together `replace_prefixed` operations." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [\n", + " {\n", + " \"operator\": \"replace_prefixed\",\n", + " \"parameters\": {\"prefix\": \"#\", \"value\": \"\"},\n", + " },\n", + " {\n", + " \"operator\": \"replace_prefixed\",\n", + " \"parameters\": {\"prefix\": \"@\", \"value\": \"\"},\n", + " },\n", + "]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### replace_punctuation\n", + "\n", + "Replace all punctuation marks defined in `string.punctuation` with the `value`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [\n", + " {\"operator\": \"replace_punctuation\", \"parameters\": {\"value\": \"\"}}\n", + "]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### replace_stopwords\n", + "\n", + "Replace common words with the `value`. By default, the set of stopwords to replace is NLTK's English stopwords." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [{\"operator\": \"replace_stopwords\", \"parameters\": {\"value\": \"\"}}]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use a custom set of words, pass the set into the `stopwords` parameter." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [\n", + " {\n", + " \"operator\": \"replace_stopwords\",\n", + " \"parameters\": {\"stopwords\": {\"imdb\", \"film\"}, \"value\": \"\"},\n", + " }\n", + "]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively, expand upon the default set of stopwords by importing `dataprep.assets.english_stopwords` and adding custom words." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dataprep.assets.english_stopwords import english_stopwords\n", + "custom_stopwords = english_stopwords.copy()\n", + "custom_stopwords.add(\"imdb\")\n", + "custom_stopwords.add(\"film\")\n", + "\n", + "custom_pipeline = [\n", + " {\n", + " \"operator\": \"replace_stopwords\",\n", + " \"parameters\": {\n", + " \"stopwords\": custom_stopwords,\n", + " \"value\": \"\"\n", + " },\n", + " }\n", + "]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### replace_text\n", + "\n", + "Replace a sequence of characters with another according to the mapping specified in the `value` parameter. By default, `block` is set to True and only blocks of text, i.e. tokens composed solely of the specified sequence of characters, are replaced." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [\n", + " {\n", + " \"operator\": \"replace_text\",\n", + " \"parameters\": {\"value\": {\"imdb\": \"Netflix\", \"film\": \"movie\"}},\n", + " }\n", + "]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To replace the sequence of characters wherever they appear in the text, set `block` to False." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [\n", + " {\n", + " \"operator\": \"replace_text\",\n", + " \"parameters\": {\"value\": {\"imdb\": \"Netflix\", \"film\": \"movie\"}, \"block\": False},\n", + " }\n", + "]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### replace_urls\n", + "\n", + "Replace URLs with the `value`. Substrings that start with \"http\" or \"www\" are considered URLs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "custom_pipeline = [{\"operator\": \"replace_urls\", \"parameters\": {\"value\": \"\"}}]\n", + "clean_text(df, \"text\", pipeline=custom_pipeline)" + ] + } + ], + "metadata": { + "celltoolbar": "Raw Cell Format", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/source/user_guide/clean/introduction.ipynb b/docs/source/user_guide/clean/introduction.ipynb index c77315995..1f5d236d0 100644 --- a/docs/source/user_guide/clean/introduction.ipynb +++ b/docs/source/user_guide/clean/introduction.ipynb @@ -38,6 +38,7 @@ " * [Geographic Goordinates](clean_lat_long.ipynb)\n", " * [IP Addresses](clean_ip.ipynb)\n", " * [Phone Numbers](clean_phone.ipynb)\n", + " * [Text](clean_text.ipynb)\n", " * [URLs](clean_url.ipynb)\n", " * [US Street Addresses](clean_address.ipynb)" ] @@ -60,7 +61,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.8.5" } }, "nbformat": 4,