From 65436b06a5cc6d3bf83e4e5d48abadc6e53792dc Mon Sep 17 00:00:00 2001
From: atol <alicelinlc@gmail.com>
Date: Thu, 22 Apr 2021 17:15:49 -0700
Subject: [PATCH] docs(clean): add documentation for clean_text

---
 dataprep/clean/clean_text.py                  |    7 +-
 docs/source/user_guide/clean/clean_text.ipynb | 1147 +++++++++++++++++
 .../user_guide/clean/introduction.ipynb       |    3 +-
 3 files changed, 1155 insertions(+), 2 deletions(-)
 create mode 100644 docs/source/user_guide/clean/clean_text.ipynb
diff --git a/dataprep/clean/clean_text.py b/dataprep/clean/clean_text.py
index b96c7ad9e..8dd5bf68f 100644
--- a/dataprep/clean/clean_text.py
+++ b/dataprep/clean/clean_text.py
@@ -37,6 +37,8 @@ def clean_text(
     """
     Clean text data in a DataFrame column.
 
+    Read more in the :ref:`User Guide <clean_text_user_guide>`.
+
     Parameters
     ----------
         df
@@ -45,7 +47,8 @@ def clean_text(
             The name of the column containing text data.
         pipeline
             A list of cleaning functions to be applied to the column. If None,
-            use the default pipeline.
+            use the default pipeline. See the :ref:`User Guide <clean_text_custom_pipeline>`
+            for more information on customizing the pipeline.
 
             (default: None)
         stopwords
@@ -81,6 +84,8 @@ def default_text_pipeline() -> List[Dict[str, Any]]:
     Return a list of dictionaries representing the functions in the default pipeline.
     Use as a template for creating a custom pipeline.
 
+    Read more in the :ref:`User Guide <clean_text_user_guide>`.
+
     Examples
     --------
     >>> default_text_pipeline()
diff --git a/docs/source/user_guide/clean/clean_text.ipynb b/docs/source/user_guide/clean/clean_text.ipynb
new file mode 100644
index 000000000..e5ef81f93
--- /dev/null
+++ b/docs/source/user_guide/clean/clean_text.ipynb
@@ -0,0 +1,1147 @@
+{
+ "cells": [
+  {
+   "cell_type": "raw",
+   "metadata": {
+    "raw_mimetype": "text/restructuredtext"
+   },
+   "source": [
+    ".. _clean_text_user_guide:\n",
+    "\n",
+    "Text\n",
+    "===="
+   ]
+  },
+  {
+   "cell_type": "raw",
+   "metadata": {
+    "raw_mimetype": "text/restructuredtext"
+   },
+   "source": [
+    "Introduction\n",
+    "------------\n",
+    "\n",
+    "The function :func:`clean_text() <dataprep.clean.clean_text.clean_text>` cleans text data in a DataFrame column."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Using a default or customized pipeline, the function performs a series of cleaning operations on the data.\n",
+    "\n",
+    "The following sections demonstrate the functionality of `clean_text()`."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### An example dirty dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "pd.set_option(\"display.max_colwidth\", None)\n",
+    "\n",
+    "df = pd.DataFrame(\n",
+    "    {\n",
+    "        \"text\": [\n",
+    "            \"'ZZZZZ!' If IMDb would allow one-word reviews, that's what mine would be.\",\n",
+    "            \"The cast played Shakespeare.<br /><br />Shakespeare lost.\",\n",
+    "            \"Simon of the Desert (Simón del desierto) is a 1965 film directed by Luis Buñuel.\",\n",
+    "            \"[SPOILERS]\\nI don't think I've seen a film this bad before {acting, script, effects (!), etc...}\",\n",
+    "            \"<a href='/festivals/cannes-1968-a-video-essay'>Cannes 1968:\\tA video essay</a>\",\n",
+    "            \"Recap thread for @RottenTomatoes excellent panel, hosted by @ErikDavis with @FilmFatale_NYC and @AshCrossan.\",\n",
+    "            \"#GameOfThrones: Season 8 is #Rotten at 54% on the #Tomatometer.  But does it deserve to be?\",\n",
+    "            \"Come join and share your thoughts on this week's episode: https://twitter.com/i/spaces/1fake2URL3\",\n",
+    "            123,\n",
+    "            np.nan,\n",
+    "            \"NULL\",\n",
+    "        ]\n",
+    "    }\n",
+    ")\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Default `clean_text()`"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The default pipeline for the `clean_text()` function is the following:"
+   ]
+  },
+  {
+   "cell_type": "raw",
+   "metadata": {
+    "raw_mimetype": "text/restructuredtext"
+   },
+   "source": [
+    "1. :ref:`fillna <fillna>`: Replace all null values with `NaN`.\n",
+    "2. :ref:`lowercase <lowercase>`: Convert all characters to lowercase.\n",
+    "3. :ref:`remove_digits <remove_digits>`: Remove numbers.\n",
+    "4. :ref:`remove_html <remove_html>` Remove HTML tags.\n",
+    "5. :ref:`remove_urls <remove_urls>`: Remove URLs.\n",
+    "6. :ref:`remove_punctuation <remove_punctuation>`: Remove punctuation marks.\n",
+    "7. :ref:`remove_accents <remove_accents>`: Remove accent marks.\n",
+    "8. :ref:`remove_stopwords <remove_stopwords>`: Remove stopwords.\n",
+    "9. :ref:`remove_whitespace <remove_whitespace>`: Remove extra spaces, and tabs and newlines."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dataprep.clean import clean_text\n",
+    "clean_text(df, \"text\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "By default, the stopwords removed are the set of words in NLTK's English stopwords. To remove a different set of words, pass the set into the `stopwords` parameter."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dataprep.clean import clean_text\n",
+    "clean_text(df, \"text\", stopwords={\"imdb\", \"film\"})"
+   ]
+  },
+  {
+   "cell_type": "raw",
+   "metadata": {
+    "raw_mimetype": "text/restructuredtext"
+   },
+   "source": [
+    ".. _clean_text_custom_pipeline:\n",
+    "\n",
+    "2. Custom pipeline\n",
+    "------------------"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Users can pass in a custom pipeline to `clean_text()` using the `pipeline` parameter."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [\n",
+    "    {\"operator\": \"lowercase\"},\n",
+    "    {\"operator\": \"remove_digits\"},\n",
+    "    {\"operator\": \"remove_whitespace\"},\n",
+    "]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Users can also define and pass in their own functions using the `pipeline` parameter."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "\n",
+    "def split(text: str) -> str:\n",
+    "    return str(text).split()\n",
+    "\n",
+    "def replace_z(text: str, value: str) -> str:\n",
+    "    return re.sub(r\"z\", value, str(text), flags=re.I)\n",
+    "\n",
+    "custom_pipeline = [\n",
+    "    {\"operator\": \"lowercase\"},\n",
+    "    {\"operator\": \"remove_digits\"},\n",
+    "    {\"operator\": split},\n",
+    "    {\"operator\": replace_z, \"parameters\": {\"value\": \"*\"}},\n",
+    "    {\"operator\": \"remove_whitespace\"},\n",
+    "]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In general, custom pipelines can be defined using the form:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [\n",
+    "    {\n",
+    "        \"operator\": \"<operator_name>\",\n",
+    "        \"parameters\": {\"<parameter_name>\": \"<parameter_value>\"},\n",
+    "    }\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "raw",
+   "metadata": {
+    "raw_mimetype": "text/restructuredtext"
+   },
+   "source": [
+    "To get the default pipeline in the form of a list, call :func:`default_text_pipeline() <dataprep.clean.clean_text.default_text_pipeline>`."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This can be used as a template to build a list of cleaning operations to be passed into the `pipeline` parameter."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dataprep.clean import default_text_pipeline\n",
+    "default_text_pipeline()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. Built-in functions\n",
+    "\n",
+    "This section demonstrates the built-in cleaning operations which can be called using the `pipeline` parameter.\n",
+    "\n",
+    "`clean_text()` assumes the DataFrame column contains text data. As such, any `int` values will be cast to `str` after applying a cleaning function."
+   ]
+  },
+  {
+   "cell_type": "raw",
+   "metadata": {
+    "raw_mimetype": "text/restructuredtext"
+   },
+   "source": [
+    ".. _fillna:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### fillna\n",
+    "\n",
+    "By default, `fillna` replaces all null values with `NaN`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [{\"operator\": \"fillna\"}]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To specify a specific value to replace null values, use the `value` parameter."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [{\"operator\": \"fillna\", \"parameters\": {\"value\": \"<NAN>\"}}]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "raw",
+   "metadata": {
+    "raw_mimetype": "text/restructuredtext"
+   },
+   "source": [
+    ".. _lowercase:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### lowercase\n",
+    "\n",
+    "Convert all characters to lowercase."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [{\"operator\": \"lowercase\"}]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### sentence_case\n",
+    "\n",
+    "Convert the first character of the string to uppercase and all remaining characters to lowercase."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [{\"operator\": \"sentence_case\"}]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### title_case\n",
+    "\n",
+    "Convert the first character of each word to uppercase and the remaining words to lowercase."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [{\"operator\": \"title_case\"}]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### uppercase\n",
+    "\n",
+    "Convert all characters to uppercase."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [{\"operator\": \"uppercase\"}]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "raw",
+   "metadata": {
+    "raw_mimetype": "text/restructuredtext"
+   },
+   "source": [
+    ".. _remove_accents:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### remove_accents\n",
+    "\n",
+    "Remove accents (diacritic marks) from the text."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [{\"operator\": \"remove_accents\"}]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### remove_bracketed\n",
+    "\n",
+    "Remove text between brackets.\n",
+    "\n",
+    "The style of the brackets can be specified using the `brackets` parameter:\n",
+    "\n",
+    "* \"angle\": `<>`\n",
+    "* \"curly\": `{}`\n",
+    "* \"round\": `()`\n",
+    "* \"square\": `[]`\n",
+    "\n",
+    "By default, the `inclusive` parameter is set to True and the brackets are removed along with the text in between."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [\n",
+    "    {\"operator\": \"remove_bracketed\", \"parameters\": {\"brackets\": \"round\"}}\n",
+    "]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To remove the text but keep the brackets, set `inclusive` to False."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [\n",
+    "    {\n",
+    "        \"operator\": \"remove_bracketed\",\n",
+    "        \"parameters\": {\"brackets\": \"round\", \"inclusive\": False},\n",
+    "    }\n",
+    "]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The `brackets` parameter can also take in a set, which allows multiple bracket styles to be specified at a time."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [\n",
+    "    {\n",
+    "        \"operator\": \"remove_bracketed\",\n",
+    "        \"parameters\": {\"brackets\": {\"angle\", \"curly\", \"round\", \"square\"}},\n",
+    "    }\n",
+    "]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "raw",
+   "metadata": {
+    "raw_mimetype": "text/restructuredtext"
+   },
+   "source": [
+    ".. _remove_digits:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### remove_digits\n",
+    "\n",
+    "Remove all digits."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [{\"operator\": \"remove_digits\"}]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "raw",
+   "metadata": {
+    "raw_mimetype": "text/restructuredtext"
+   },
+   "source": [
+    ".. _remove_html:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### remove_html\n",
+    "\n",
+    "Remove HTML tags, including the non-breaking space `&nbsp;`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [{\"operator\": \"remove_html\"}]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### remove_prefixed\n",
+    "\n",
+    "Remove substrings that start with the prefix(es) specified in the `prefix` parameter. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [{\"operator\": \"remove_prefixed\", \"parameters\": {\"prefix\": \"#\"}}]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To specify multiple prefixes, pass in a set of the prefixes to the `prefix` parameter."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [\n",
+    "    {\"operator\": \"remove_prefixed\", \"parameters\": {\"prefix\": {\"#\", \"@\"}}}\n",
+    "]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "raw",
+   "metadata": {
+    "raw_mimetype": "text/restructuredtext"
+   },
+   "source": [
+    ".. _remove_punctuation:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### remove_puncutation\n",
+    "\n",
+    "Remove all punctuation marks defined in Python's `string.punctuation`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [{\"operator\": \"remove_punctuation\"}]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "raw",
+   "metadata": {
+    "raw_mimetype": "text/restructuredtext"
+   },
+   "source": [
+    ".. _remove_stopwords:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### remove_stopwords\n",
+    "\n",
+    "Remove common words. By default, the set of stopwords to remove is NLTK's English stopwords."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [{\"operator\": \"remove_stopwords\"}]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To use a custom set of words, pass the set into the `stopwords` parameter."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [\n",
+    "    {\"operator\": \"remove_stopwords\", \"parameters\": {\"stopwords\": {\"imdb\", \"film\"}}}\n",
+    "]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Alternatively, expand upon the default set of stopwords by importing `dataprep.assets.english_stopwords` and adding custom words."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dataprep.assets.english_stopwords import english_stopwords\n",
+    "custom_stopwords = english_stopwords.copy()\n",
+    "custom_stopwords.add(\"imdb\")\n",
+    "custom_stopwords.add(\"film\")\n",
+    "\n",
+    "custom_pipeline = [\n",
+    "    {\n",
+    "        \"operator\": \"remove_stopwords\",\n",
+    "        \"parameters\": {\"stopwords\": custom_stopwords},\n",
+    "    }\n",
+    "]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "raw",
+   "metadata": {
+    "raw_mimetype": "text/restructuredtext"
+   },
+   "source": [
+    ".. _remove_urls:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### remove_urls\n",
+    "\n",
+    "Remove URLs. Substrings that start with \"http\" or \"www\" are considered URLs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [{\"operator\": \"remove_urls\"}]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "raw",
+   "metadata": {
+    "raw_mimetype": "text/restructuredtext"
+   },
+   "source": [
+    ".. _remove_whitespace:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### remove_whitespace\n",
+    "\n",
+    "Remove extra spaces (two or more) along with tabs and newlines. Leading and trailing spaces are also removed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [{\"operator\": \"remove_whitespace\"}]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### replace_bracketed\n",
+    "\n",
+    "Replace text between brackets with the `value`.\n",
+    "\n",
+    "The style of the brackets can be specified using the `brackets` parameter:\n",
+    "\n",
+    "* \"angle\": `<>`\n",
+    "* \"curly\": `{}`\n",
+    "* \"round\": `()`\n",
+    "* \"square\": `[]`\n",
+    "\n",
+    "By default, the `inclusive` parameter is set to True and the brackets are also replaced by the `value` along with the text in between."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [\n",
+    "    {\n",
+    "        \"operator\": \"replace_bracketed\",\n",
+    "        \"parameters\": {\"brackets\": \"square\", \"value\": \"**SPOILERS**\"},\n",
+    "    }\n",
+    "]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To replace the text, but keep the brackets, set `inclusive` to False."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [\n",
+    "    {\n",
+    "        \"operator\": \"replace_bracketed\",\n",
+    "        \"parameters\": {\n",
+    "            \"brackets\": \"square\",\n",
+    "            \"value\": \"**SPOILERS**\",\n",
+    "            \"inclusive\": False,\n",
+    "        },\n",
+    "    }\n",
+    "]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The `brackets` parameter can also take in a set, which allows multiple bracket styles to be specified at a time."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [\n",
+    "    {\n",
+    "        \"operator\": \"replace_bracketed\",\n",
+    "        \"parameters\": {\n",
+    "            \"brackets\": {\"angle\", \"curly\", \"round\", \"square\"},\n",
+    "            \"value\": \"<REDACTED>\",\n",
+    "        },\n",
+    "    }\n",
+    "]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To assign different replacement values to different bracket styles, chain together `replace_bracketed` operations."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [\n",
+    "    {\n",
+    "        \"operator\": \"replace_bracketed\",\n",
+    "        \"parameters\": {\n",
+    "            \"brackets\": \"square\",\n",
+    "            \"value\": \"**SPOILERS**\",\n",
+    "        },\n",
+    "    },\n",
+    "    {\n",
+    "        \"operator\": \"replace_bracketed\",\n",
+    "        \"parameters\": {\n",
+    "            \"brackets\": \"curly\",\n",
+    "            \"value\": \"in every aspect.\",\n",
+    "        },\n",
+    "    },\n",
+    "]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### replace_digits\n",
+    "\n",
+    "Replace all digits with the `value`. By default, the `block` parameter is set to True and only blocks of digits, i.e. tokens composed solely of numbers, are removed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [{\"operator\": \"replace_digits\", \"parameters\": {\"value\": \"X\"}}]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To replace all digits appearing in the text, set `block` to False."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [\n",
+    "    {\"operator\": \"replace_digits\", \"parameters\": {\"value\": \"X\", \"block\": False}}\n",
+    "]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### replace_prefixed\n",
+    "\n",
+    "Replace all substrings that start with the prefix(es) specified in the `prefix` parameter with the `value`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [\n",
+    "    {\n",
+    "        \"operator\": \"replace_prefixed\",\n",
+    "        \"parameters\": {\"prefix\": \"#\", \"value\": \"<HASHTAG>\"},\n",
+    "    }\n",
+    "]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To replace substrings of different prefixes with the same `value`, pass in a set of the prefixes to the `prefix` parameter."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [\n",
+    "    {\n",
+    "        \"operator\": \"replace_prefixed\",\n",
+    "        \"parameters\": {\"prefix\": {\"#\", \"@\"}, \"value\": \"<TAG>\"},\n",
+    "    }\n",
+    "]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To replace different prefixed substrings with different values, chain together `replace_prefixed` operations."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [\n",
+    "    {\n",
+    "        \"operator\": \"replace_prefixed\",\n",
+    "        \"parameters\": {\"prefix\": \"#\", \"value\": \"<HASHTAG>\"},\n",
+    "    },\n",
+    "    {\n",
+    "        \"operator\": \"replace_prefixed\",\n",
+    "        \"parameters\": {\"prefix\": \"@\", \"value\": \"<MENTION>\"},\n",
+    "    },\n",
+    "]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### replace_punctuation\n",
+    "\n",
+    "Replace all punctuation marks defined in `string.punctuation` with the `value`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [\n",
+    "    {\"operator\": \"replace_punctuation\", \"parameters\": {\"value\": \"<PUNC>\"}}\n",
+    "]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### replace_stopwords\n",
+    "\n",
+    "Replace common words with the `value`. By default, the set of stopwords to replace is NLTK's English stopwords."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [{\"operator\": \"replace_stopwords\", \"parameters\": {\"value\": \"<S>\"}}]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To use a custom set of words, pass the set into the `stopwords` parameter."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [\n",
+    "    {\n",
+    "        \"operator\": \"replace_stopwords\",\n",
+    "        \"parameters\": {\"stopwords\": {\"imdb\", \"film\"}, \"value\": \"<S>\"},\n",
+    "    }\n",
+    "]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Alternatively, expand upon the default set of stopwords by importing `dataprep.assets.english_stopwords` and adding custom words."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dataprep.assets.english_stopwords import english_stopwords\n",
+    "custom_stopwords = english_stopwords.copy()\n",
+    "custom_stopwords.add(\"imdb\")\n",
+    "custom_stopwords.add(\"film\")\n",
+    "\n",
+    "custom_pipeline = [\n",
+    "    {\n",
+    "        \"operator\": \"replace_stopwords\",\n",
+    "        \"parameters\": {\n",
+    "            \"stopwords\": custom_stopwords,\n",
+    "            \"value\": \"<S>\"\n",
+    "        },\n",
+    "    }\n",
+    "]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### replace_text\n",
+    "\n",
+    "Replace a sequence of characters with another according to the mapping specified in the `value` parameter. By default, `block` is set to True and only blocks of text, i.e. tokens composed solely of the specified sequence of characters, are replaced."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [\n",
+    "    {\n",
+    "        \"operator\": \"replace_text\",\n",
+    "        \"parameters\": {\"value\": {\"imdb\": \"Netflix\", \"film\": \"movie\"}},\n",
+    "    }\n",
+    "]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To replace the sequence of characters wherever they appear in the text, set `block` to False."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [\n",
+    "    {\n",
+    "        \"operator\": \"replace_text\",\n",
+    "        \"parameters\": {\"value\": {\"imdb\": \"Netflix\", \"film\": \"movie\"}, \"block\": False},\n",
+    "    }\n",
+    "]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### replace_urls\n",
+    "\n",
+    "Replace URLs with the `value`. Substrings that start with \"http\" or \"www\" are considered URLs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_pipeline = [{\"operator\": \"replace_urls\", \"parameters\": {\"value\": \"<URL>\"}}]\n",
+    "clean_text(df, \"text\", pipeline=custom_pipeline)"
+   ]
+  }
+ ],
+ "metadata": {
+  "celltoolbar": "Raw Cell Format",
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/docs/source/user_guide/clean/introduction.ipynb b/docs/source/user_guide/clean/introduction.ipynb
index c77315995..1f5d236d0 100644
--- a/docs/source/user_guide/clean/introduction.ipynb
+++ b/docs/source/user_guide/clean/introduction.ipynb
@@ -38,6 +38,7 @@
     " * [Geographic Goordinates](clean_lat_long.ipynb)\n",
     " * [IP Addresses](clean_ip.ipynb)\n",
     " * [Phone Numbers](clean_phone.ipynb)\n",
+    " * [Text](clean_text.ipynb)\n",
     " * [URLs](clean_url.ipynb)\n",
     " * [US Street Addresses](clean_address.ipynb)"
    ]
@@ -60,7 +61,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.3"
+   "version": "3.8.5"
   }
  },
  "nbformat": 4,