From 90f162efb6313ba6d1b5f6311e2c83835c4d1bdd Mon Sep 17 00:00:00 2001
From: Ankit Dangi
Date: Sun, 8 Dec 2024 22:01:03 -0800
Subject: [PATCH] text-splitters: add pydocstyle linting (#28127)

As seen in #23188, turned on Google-style docstrings by enabling `pydocstyle`
linting in the `text-splitters` package. Each resulting linting error was
addressed in one of several ways: resolved, suppressed, or ignored, and
missing docstrings were added.

Fixes one of the checklist items from #25154, similar to #25939 in the `core`
package.

Ran `make format`, `make lint` and `make test` from the root of the
`text-splitters` package to ensure no issues were found.

---------

Co-authored-by: Erick Friis
---
 .../langchain_text_splitters/__init__.py      |  1 -
 .../langchain_text_splitters/base.py          | 15 ++++++
 .../langchain_text_splitters/character.py     | 28 ++++++++++
 .../langchain_text_splitters/html.py          | 53 +++++++++++++++----
 .../langchain_text_splitters/json.py          | 39 +++++++++++---
 .../langchain_text_splitters/markdown.py      | 44 ++++++++++++---
 .../sentence_transformers.py                  | 25 +++++++++
 .../langchain_text_splitters/spacy.py         |  1 -
 libs/text-splitters/pyproject.toml            | 15 +++++-
 9 files changed, 194 insertions(+), 27 deletions(-)

diff --git a/libs/text-splitters/langchain_text_splitters/__init__.py b/libs/text-splitters/langchain_text_splitters/__init__.py
index 58ad7b0e4c585..65af087fdd85f 100644
--- a/libs/text-splitters/langchain_text_splitters/__init__.py
+++ b/libs/text-splitters/langchain_text_splitters/__init__.py
@@ -1,6 +1,5 @@
 """**Text Splitters** are classes for splitting text.
 
-
 **Class hierarchy:**
 
 .. code-block::
diff --git a/libs/text-splitters/langchain_text_splitters/base.py b/libs/text-splitters/langchain_text_splitters/base.py
index 0e0a49c182da7..10dd6903ba172 100644
--- a/libs/text-splitters/langchain_text_splitters/base.py
+++ b/libs/text-splitters/langchain_text_splitters/base.py
@@ -249,6 +249,21 @@ def __init__(
         self._disallowed_special = disallowed_special
 
     def split_text(self, text: str) -> List[str]:
+        """Splits the input text into smaller chunks based on tokenization.
+
+        This method uses a custom tokenizer configuration to encode the input text
+        into tokens, processes the tokens in chunks of a specified size with overlap,
+        and decodes them back into text chunks. The splitting is performed using the
+        `split_text_on_tokens` function.
+
+        Args:
+            text (str): The input text to be split into smaller chunks.
+
+        Returns:
+            List[str]: A list of text chunks, where each chunk is derived from a portion
+            of the input text based on the tokenization and chunking rules.
+        """
+
         def _encode(_text: str) -> List[int]:
             return self._tokenizer.encode(
                 _text,
diff --git a/libs/text-splitters/langchain_text_splitters/character.py b/libs/text-splitters/langchain_text_splitters/character.py
index f65c38869d394..a2918bd27f0ac 100644
--- a/libs/text-splitters/langchain_text_splitters/character.py
+++ b/libs/text-splitters/langchain_text_splitters/character.py
@@ -115,17 +115,45 @@ def _split_text(self, text: str, separators: List[str]) -> List[str]:
         return final_chunks
 
     def split_text(self, text: str) -> List[str]:
+        """Split the input text into smaller chunks based on predefined separators.
+
+        Args:
+            text (str): The input text to be split.
+
+        Returns:
+            List[str]: A list of text chunks obtained after splitting.
+        """
         return self._split_text(text, self._separators)
 
     @classmethod
     def from_language(
         cls, language: Language, **kwargs: Any
     ) -> RecursiveCharacterTextSplitter:
+        """Return an instance of this class based on a specific language.
+
+        This method initializes the text splitter with language-specific separators.
+
+        Args:
+            language (Language): The language to configure the text splitter for.
+            **kwargs (Any): Additional keyword arguments to customize the splitter.
+
+        Returns:
+            RecursiveCharacterTextSplitter: An instance of the text splitter configured
+            for the specified language.
+        """
         separators = cls.get_separators_for_language(language)
         return cls(separators=separators, is_separator_regex=True, **kwargs)
 
     @staticmethod
     def get_separators_for_language(language: Language) -> List[str]:
+        """Retrieve a list of separators specific to the given language.
+
+        Args:
+            language (Language): The language for which to get the separators.
+
+        Returns:
+            List[str]: A list of separators appropriate for the specified language.
+        """
         if language == Language.C or language == Language.CPP:
             return [
                 # Split along class definitions
diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py
index cdbea7f724b53..241c0981f58f5 100644
--- a/libs/text-splitters/langchain_text_splitters/html.py
+++ b/libs/text-splitters/langchain_text_splitters/html.py
@@ -21,8 +21,8 @@ class ElementType(TypedDict):
 
 
 class HTMLHeaderTextSplitter:
-    """
-    Splitting HTML files based on specified headers.
+    """Splitting HTML files based on specified headers.
+
     Requires lxml package.
     """
 
@@ -46,7 +46,7 @@ def __init__(
     def aggregate_elements_to_chunks(
         self, elements: List[ElementType]
     ) -> List[Document]:
-        """Combine elements with common metadata into chunks
+        """Combine elements with common metadata into chunks.
 
         Args:
             elements: HTML element content with associated identifying info and metadata
@@ -72,7 +72,7 @@ def aggregate_elements_to_chunks(
         ]
 
     def split_text_from_url(self, url: str, **kwargs: Any) -> List[Document]:
-        """Split HTML from web URL
+        """Split HTML from web URL.
 
         Args:
             url: web URL
@@ -83,7 +83,7 @@
         return self.split_text_from_file(BytesIO(r.content))
 
     def split_text(self, text: str) -> List[Document]:
-        """Split HTML text string
+        """Split HTML text string.
 
         Args:
             text: HTML text
@@ -91,7 +91,7 @@
         return self.split_text_from_file(StringIO(text))
 
     def split_text_from_file(self, file: Any) -> List[Document]:
-        """Split HTML file
+        """Split HTML file.
 
         Args:
             file: HTML file
@@ -166,8 +166,8 @@
 
 
 class HTMLSectionSplitter:
-    """
-    Splitting HTML files based on specified tag and font sizes.
+    """Splitting HTML files based on specified tag and font sizes.
+
     Requires lxml package.
     """
 
@@ -186,6 +186,8 @@ def __init__(
             xslt_path: path to xslt file for document transformation.
                 Uses a default if not passed.
                 Needed for html contents that using different format and layouts.
+            **kwargs (Any): Additional optional arguments for customizations.
+
         """
         self.headers_to_split_on = dict(headers_to_split_on)
 
@@ -210,7 +212,7 @@ def split_documents(self, documents: Iterable[Document]) -> List[Document]:
         return text_splitter.split_documents(results)
 
     def split_text(self, text: str) -> List[Document]:
-        """Split HTML text string
+        """Split HTML text string.
 
         Args:
             text: HTML text
@@ -236,6 +238,23 @@ def create_documents(
         return documents
 
     def split_html_by_headers(self, html_doc: str) -> List[Dict[str, Optional[str]]]:
+        """Split an HTML document into sections based on specified header tags.
+
+        This method uses BeautifulSoup to parse the HTML content and divides it into
+        sections based on headers defined in `headers_to_split_on`. Each section
+        contains the header text, content under the header, and the tag name.
+
+        Args:
+            html_doc (str): The HTML document to be split into sections.
+
+        Returns:
+            List[Dict[str, Optional[str]]]: A list of dictionaries representing
+            sections.
+                Each dictionary contains:
+                    - 'header': The header text or a default title for the first section.
+                    - 'content': The content under the header.
+                    - 'tag_name': The name of the header tag (e.g., "h1", "h2").
+        """
         try:
             from bs4 import BeautifulSoup, PageElement  # type: ignore[import-untyped]
         except ImportError as e:
@@ -259,7 +278,7 @@ def split_html_by_headers(self, html_doc: str) -> List[Dict[str, Optional[str]]]
                 section_content: List = []
             else:
                 current_header = header_element.text.strip()
-                current_header_tag = header_element.name
+                current_header_tag = header_element.name  # type: ignore[attr-defined]
                 section_content = []
             for element in header_element.next_elements:
                 if i + 1 < len(headers) and element == headers[i + 1]:
@@ -280,6 +299,18 @@ def split_html_by_headers(self, html_doc: str) -> List[Dict[str, Optional[str]]]
         return sections
 
     def convert_possible_tags_to_header(self, html_content: str) -> str:
+        """Convert specific HTML tags to headers using an XSLT transformation.
+
+        This method uses an XSLT file to transform the HTML content, converting
+        certain tags into headers for easier parsing. If no XSLT path is provided,
+        the HTML content is returned unchanged.
+
+        Args:
+            html_content (str): The HTML content to be transformed.
+
+        Returns:
+            str: The transformed HTML content as a string.
+        """
         if self.xslt_path is None:
             return html_content
 
@@ -299,7 +330,7 @@ def convert_possible_tags_to_header(self, html_content: str) -> str:
         return str(result)
 
     def split_text_from_file(self, file: Any) -> List[Document]:
-        """Split HTML file
+        """Split HTML file.
 
         Args:
             file: HTML file
diff --git a/libs/text-splitters/langchain_text_splitters/json.py b/libs/text-splitters/langchain_text_splitters/json.py
index c83d8b2a42880..c58174dd8b33a 100644
--- a/libs/text-splitters/langchain_text_splitters/json.py
+++ b/libs/text-splitters/langchain_text_splitters/json.py
@@ -8,9 +8,38 @@
 
 
 class RecursiveJsonSplitter:
+    """Splits JSON data into smaller, structured chunks while preserving hierarchy.
+
+    This class provides methods to split JSON data into smaller dictionaries or
+    JSON-formatted strings based on configurable maximum and minimum chunk sizes.
+    It supports nested JSON structures, optionally converts lists into dictionaries
+    for better chunking, and allows the creation of document objects for further use.
+
+    Attributes:
+        max_chunk_size (int): The maximum size for each chunk. Defaults to 2000.
+        min_chunk_size (int): The minimum size for each chunk, derived from
+            `max_chunk_size` if not explicitly provided.
+    """
+
     def __init__(
         self, max_chunk_size: int = 2000, min_chunk_size: Optional[int] = None
     ):
+        """Initialize the chunk size configuration for text processing.
+
+        This constructor sets up the maximum and minimum chunk sizes, ensuring that
+        the `min_chunk_size` defaults to a value slightly smaller than the
+        `max_chunk_size` if not explicitly provided.
+
+        Args:
+            max_chunk_size (int): The maximum size for a chunk. Defaults to 2000.
+            min_chunk_size (Optional[int]): The minimum size for a chunk. If None,
+                defaults to the maximum chunk size minus 200, with a lower bound of 50.
+
+        Attributes:
+            max_chunk_size (int): The configured maximum size for each chunk.
+            min_chunk_size (int): The configured minimum size for each chunk, derived
+                from `max_chunk_size` if not explicitly provided.
+        """
         super().__init__()
         self.max_chunk_size = max_chunk_size
         self.min_chunk_size = (
@@ -51,9 +80,7 @@ def _json_split(
         current_path: Optional[List[str]] = None,
         chunks: Optional[List[Dict]] = None,
     ) -> List[Dict]:
-        """
-        Split json into maximum size dictionaries while preserving structure.
-        """
+        """Split json into maximum size dictionaries while preserving structure."""
         current_path = current_path or []
         chunks = chunks if chunks is not None else [{}]
         if isinstance(data, dict):
@@ -83,8 +110,7 @@ def split_json(
         json_data: Dict[str, Any],
         convert_lists: bool = False,
     ) -> List[Dict]:
-        """Splits JSON into a list of JSON chunks"""
-
+        """Splits JSON into a list of JSON chunks."""
         if convert_lists:
             chunks = self._json_split(self._list_to_dict_preprocessing(json_data))
         else:
@@ -101,8 +127,7 @@ def split_text(
         convert_lists: bool = False,
         ensure_ascii: bool = True,
     ) -> List[str]:
-        """Splits JSON into a list of JSON formatted strings"""
-
+        """Splits JSON into a list of JSON formatted strings."""
         chunks = self.split_json(json_data=json_data, convert_lists=convert_lists)
 
         # Convert to string
diff --git a/libs/text-splitters/langchain_text_splitters/markdown.py b/libs/text-splitters/langchain_text_splitters/markdown.py
index fdcd010f50d43..34c7d2197d238 100644
--- a/libs/text-splitters/langchain_text_splitters/markdown.py
+++ b/libs/text-splitters/langchain_text_splitters/markdown.py
@@ -45,7 +45,8 @@ def __init__(
         self.strip_headers = strip_headers
 
     def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[Document]:
-        """Combine lines with common metadata into chunks
+        """Combine lines with common metadata into chunks.
+
         Args:
             lines: Line of text / associated header metadata
         """
@@ -87,10 +88,11 @@ def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[Document]:
         ]
 
     def split_text(self, text: str) -> List[Document]:
-        """Split markdown file
-        Args:
-            text: Markdown file"""
+        """Split markdown file.
 
+        Args:
+            text: Markdown file
+        """
         # Split the input text by newline character ("\n").
         lines = text.split("\n")
         # Final output
@@ -225,8 +227,7 @@ class HeaderType(TypedDict):
 
 
 class ExperimentalMarkdownSyntaxTextSplitter:
-    """
-    An experimental text splitter for handling Markdown syntax.
+    """An experimental text splitter for handling Markdown syntax.
 
     This splitter aims to retain the exact whitespace of the original text while
     extracting structured metadata, such as headers. It is a re-implementation of the
@@ -280,6 +281,22 @@ def __init__(
         return_each_line: bool = False,
         strip_headers: bool = True,
     ):
+        """Initialize the text splitter with header splitting and formatting options.
+
+        This constructor sets up the required configuration for splitting text into
+        chunks based on specified headers and formatting preferences.
+
+        Args:
+            headers_to_split_on (Union[List[Tuple[str, str]], None]):
+                A list of tuples, where each tuple contains a header tag (e.g., "h1")
+                and its corresponding metadata key. If None, default headers are used.
+            return_each_line (bool):
+                Whether to return each line as an individual chunk.
+                Defaults to False, which aggregates lines into larger chunks.
+            strip_headers (bool):
+                Whether to exclude headers from the resulting chunks.
+                Defaults to True.
+        """
         self.chunks: List[Document] = []
         self.current_chunk = Document(page_content="")
         self.current_header_stack: List[Tuple[int, str]] = []
@@ -292,6 +309,21 @@ def __init__(
         self.return_each_line = return_each_line
 
     def split_text(self, text: str) -> List[Document]:
+        """Split the input text into structured chunks.
+
+        This method processes the input text line by line, identifying and handling
+        specific patterns such as headers, code blocks, and horizontal rules to
+        split it into structured chunks based on headers, code blocks, and
+        horizontal rules.
+
+        Args:
+            text (str): The input text to be split into chunks.
+
+        Returns:
+            List[Document]: A list of `Document` objects representing the structured
+                chunks of the input text. If `return_each_line` is enabled, each line
+                is returned as a separate `Document`.
+        """
         raw_lines = text.splitlines(keepends=True)
 
         while raw_lines:
diff --git a/libs/text-splitters/langchain_text_splitters/sentence_transformers.py b/libs/text-splitters/langchain_text_splitters/sentence_transformers.py
index beb314d810d9e..3b19c5edc594d 100644
--- a/libs/text-splitters/langchain_text_splitters/sentence_transformers.py
+++ b/libs/text-splitters/langchain_text_splitters/sentence_transformers.py
@@ -51,6 +51,20 @@ def _initialize_chunk_configuration(
         )
 
     def split_text(self, text: str) -> List[str]:
+        """Splits the input text into smaller components by splitting text on tokens.
+
+        This method encodes the input text using a private `_encode` method, then
+        strips the start and stop token IDs from the encoded result. It returns the
+        processed segments as a list of strings.
+
+        Args:
+            text (str): The input text to be split.
+
+        Returns:
+            List[str]: A list of string components derived from the input text after
+            encoding and processing.
+        """
+
         def encode_strip_start_and_stop_token_ids(text: str) -> List[int]:
             return self._encode(text)[1:-1]
 
@@ -64,6 +78,17 @@ def encode_strip_start_and_stop_token_ids(text: str) -> List[int]:
         return split_text_on_tokens(text=text, tokenizer=tokenizer)
 
     def count_tokens(self, *, text: str) -> int:
+        """Counts the number of tokens in the given text.
+
+        This method encodes the input text using a private `_encode` method and
+        calculates the total number of tokens in the encoded result.
+
+        Args:
+            text (str): The input text for which the token count is calculated.
+
+        Returns:
+            int: The number of tokens in the encoded text.
+        """
         return len(self._encode(text))
 
     _max_length_equal_32_bit_integer: int = 2**32
diff --git a/libs/text-splitters/langchain_text_splitters/spacy.py b/libs/text-splitters/langchain_text_splitters/spacy.py
index 447a3e429600c..a15e8b00418a0 100644
--- a/libs/text-splitters/langchain_text_splitters/spacy.py
+++ b/libs/text-splitters/langchain_text_splitters/spacy.py
@@ -8,7 +8,6 @@
 class SpacyTextSplitter(TextSplitter):
     """Splitting text using Spacy package.
 
-
     Per default, Spacy's `en_core_web_sm` model is used and
     its default max_length is 1000000 (it is the length of maximum character
     this model takes which can be increased for large files). For a faster, but
diff --git a/libs/text-splitters/pyproject.toml b/libs/text-splitters/pyproject.toml
index c4c8a7535860e..53f8993809454 100644
--- a/libs/text-splitters/pyproject.toml
+++ b/libs/text-splitters/pyproject.toml
@@ -26,7 +26,20 @@ python = ">=3.9,<4.0"
 langchain-core = "^0.3.15"
 
 [tool.ruff.lint]
-select = [ "E", "F", "I", "T201",]
+select = [
+    "E",  # pycodestyle
+    "F",  # Pyflakes
+    "I",  # isort
+    "T201",  # print
+    "D",  # pydocstyle
+]
+ignore = ["D100"]  # ignore missing module docstring
+
+[tool.ruff.lint.pydocstyle]
+convention = "google"
+
+[tool.ruff.lint.per-file-ignores]
+"tests/**" = ["D"]  # ignore docstring checks for tests
 
 [tool.coverage.run]
 omit = [ "tests/*",]
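
Note (illustrative, not part of the diff above): with the `D` rules selected and
`convention = "google"` in the pyproject.toml change, ruff expects docstrings
shaped roughly like the following minimal sketch. The `greet` function and its
parameters are hypothetical and exist only to show the layout the checks enforce.

    def greet(name: str, excited: bool = False) -> str:
        """Return a one-line greeting for the given name.

        Args:
            name (str): Name to include in the greeting. (hypothetical example)
            excited (bool): Whether to end the greeting with an exclamation mark.
                Defaults to False.

        Returns:
            str: The formatted greeting.
        """
        suffix = "!" if excited else "."
        return f"Hello, {name}{suffix}"

Under this configuration, missing or malformed docstrings are flagged everywhere
except under `tests/**`, and missing module docstrings (`D100`) remain ignored.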