text-splitters: add pydocstyle linting (#28127)
As seen in #23188, enabled Google-style docstring checking by turning on
`pydocstyle` linting in the `text-splitters` package. Each resulting
linting error was addressed individually: some were ignored or suppressed,
others were resolved, and missing docstrings were added.

Fixes one of the checklist items from #25154, similar to #25939 in the
`core` package. Ran `make format`, `make lint`, and `make test` from the
root of the `text-splitters` package to confirm that no issues were found.

---------

Co-authored-by: Erick Friis <[email protected]>
dangiankit and efriis authored Dec 9, 2024
1 parent b53f07b commit 90f162e
Showing 9 changed files with 194 additions and 27 deletions.
1 change: 0 additions & 1 deletion libs/text-splitters/langchain_text_splitters/__init__.py
@@ -1,6 +1,5 @@
"""**Text Splitters** are classes for splitting text.
**Class hierarchy:**
.. code-block::
15 changes: 15 additions & 0 deletions libs/text-splitters/langchain_text_splitters/base.py
@@ -249,6 +249,21 @@ def __init__(
self._disallowed_special = disallowed_special

def split_text(self, text: str) -> List[str]:
"""Splits the input text into smaller chunks based on tokenization.
This method uses a custom tokenizer configuration to encode the input text
into tokens, processes the tokens in chunks of a specified size with overlap,
and decodes them back into text chunks. The splitting is performed using the
`split_text_on_tokens` function.
Args:
text (str): The input text to be split into smaller chunks.
Returns:
List[str]: A list of text chunks, where each chunk is derived from a portion
of the input text based on the tokenization and chunking rules.
"""

def _encode(_text: str) -> List[int]:
return self._tokenizer.encode(
_text,
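The `split_text` docstring above describes an encode → windowed slicing → decode pipeline. A minimal sketch of that token-window logic, assuming a toy one-character-per-token "tokenizer" (the names here are illustrative stand-ins, not the library's actual `split_text_on_tokens` implementation):

```python
from typing import Callable, List


def split_on_tokens(
    text: str,
    encode: Callable[[str], List[int]],
    decode: Callable[[List[int]], str],
    tokens_per_chunk: int,
    chunk_overlap: int,
) -> List[str]:
    """Encode text, walk the token list in overlapping windows, decode each."""
    input_ids = encode(text)
    splits: List[str] = []
    start = 0
    while start < len(input_ids):
        window = input_ids[start : start + tokens_per_chunk]
        splits.append(decode(window))
        # Advance so that `chunk_overlap` tokens are shared between chunks.
        start += tokens_per_chunk - chunk_overlap
    return splits


# Toy "tokenizer": one token per character.
chunks = split_on_tokens(
    "abcdefghij",
    encode=lambda s: [ord(c) for c in s],
    decode=lambda ids: "".join(chr(i) for i in ids),
    tokens_per_chunk=4,
    chunk_overlap=1,
)
print(chunks)  # ['abcd', 'defg', 'ghij', 'j']
```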
28 changes: 28 additions & 0 deletions libs/text-splitters/langchain_text_splitters/character.py
@@ -115,17 +115,45 @@ def _split_text(self, text: str, separators: List[str]) -> List[str]:
return final_chunks

def split_text(self, text: str) -> List[str]:
"""Split the input text into smaller chunks based on predefined separators.
Args:
text (str): The input text to be split.
Returns:
List[str]: A list of text chunks obtained after splitting.
"""
return self._split_text(text, self._separators)

@classmethod
def from_language(
cls, language: Language, **kwargs: Any
) -> RecursiveCharacterTextSplitter:
"""Return an instance of this class based on a specific language.
This method initializes the text splitter with language-specific separators.
Args:
language (Language): The language to configure the text splitter for.
**kwargs (Any): Additional keyword arguments to customize the splitter.
Returns:
RecursiveCharacterTextSplitter: An instance of the text splitter configured
for the specified language.
"""
separators = cls.get_separators_for_language(language)
return cls(separators=separators, is_separator_regex=True, **kwargs)

@staticmethod
def get_separators_for_language(language: Language) -> List[str]:
"""Retrieve a list of separators specific to the given language.
Args:
language (Language): The language for which to get the separators.
Returns:
List[str]: A list of separators appropriate for the specified language.
"""
if language == Language.C or language == Language.CPP:
return [
# Split along class definitions
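For reference, the `from_language` classmethod documented above is typically used like this; a small sketch against the public `langchain_text_splitters` API (chunk sizes and sample code are arbitrary):

```python
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

# Build a splitter preconfigured with Python-specific separators
# (class/def boundaries, blank lines, etc.).
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=60, chunk_overlap=0
)

code = "def hello():\n    print('hello')\n\nclass Greeter:\n    pass\n"
for chunk in python_splitter.split_text(code):
    print(repr(chunk))
```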
53 changes: 42 additions & 11 deletions libs/text-splitters/langchain_text_splitters/html.py
@@ -21,8 +21,8 @@ class ElementType(TypedDict):


class HTMLHeaderTextSplitter:
"""
Splitting HTML files based on specified headers.
"""Splitting HTML files based on specified headers.
Requires lxml package.
"""

@@ -46,7 +46,7 @@ def __init__(
def aggregate_elements_to_chunks(
self, elements: List[ElementType]
) -> List[Document]:
"""Combine elements with common metadata into chunks
"""Combine elements with common metadata into chunks.
Args:
elements: HTML element content with associated identifying info and metadata
@@ -72,7 +72,7 @@ def aggregate_elements_to_chunks(
]

def split_text_from_url(self, url: str, **kwargs: Any) -> List[Document]:
"""Split HTML from web URL
"""Split HTML from web URL.
Args:
url: web URL
@@ -83,15 +83,15 @@ def split_text_from_url(self, url: str, **kwargs: Any) -> List[Document]:
return self.split_text_from_file(BytesIO(r.content))

def split_text(self, text: str) -> List[Document]:
"""Split HTML text string
"""Split HTML text string.
Args:
text: HTML text
"""
return self.split_text_from_file(StringIO(text))

def split_text_from_file(self, file: Any) -> List[Document]:
"""Split HTML file
"""Split HTML file.
Args:
file: HTML file
@@ -166,8 +166,8 @@ def split_text_from_file(self, file: Any) -> List[Document]:


class HTMLSectionSplitter:
"""
Splitting HTML files based on specified tag and font sizes.
"""Splitting HTML files based on specified tag and font sizes.
Requires lxml package.
"""

@@ -186,6 +186,8 @@ def __init__(
xslt_path: path to xslt file for document transformation.
Uses a default if not passed.
Needed for HTML contents that use different formats and layouts.
**kwargs (Any): Additional optional arguments for customizations.
"""
self.headers_to_split_on = dict(headers_to_split_on)

@@ -210,7 +212,7 @@ def split_documents(self, documents: Iterable[Document]) -> List[Document]:
return text_splitter.split_documents(results)

def split_text(self, text: str) -> List[Document]:
"""Split HTML text string
"""Split HTML text string.
Args:
text: HTML text
@@ -236,6 +238,23 @@ def create_documents(
return documents

def split_html_by_headers(self, html_doc: str) -> List[Dict[str, Optional[str]]]:
"""Split an HTML document into sections based on specified header tags.
This method uses BeautifulSoup to parse the HTML content and divides it into
sections based on headers defined in `headers_to_split_on`. Each section
contains the header text, content under the header, and the tag name.
Args:
html_doc (str): The HTML document to be split into sections.
Returns:
List[Dict[str, Optional[str]]]: A list of dictionaries representing
sections.
Each dictionary contains:
- 'header': The header text or a default title for the first section.
- 'content': The content under the header.
- 'tag_name': The name of the header tag (e.g., "h1", "h2").
"""
try:
from bs4 import BeautifulSoup, PageElement # type: ignore[import-untyped]
except ImportError as e:
@@ -259,7 +278,7 @@ def split_html_by_headers(self, html_doc: str) -> List[Dict[str, Optional[str]]]
section_content: List = []
else:
current_header = header_element.text.strip()
current_header_tag = header_element.name  # type: ignore[attr-defined]
section_content = []
for element in header_element.next_elements:
if i + 1 < len(headers) and element == headers[i + 1]:
@@ -280,6 +299,18 @@ def split_html_by_headers(self, html_doc: str) -> List[Dict[str, Optional[str]]]
return sections

def convert_possible_tags_to_header(self, html_content: str) -> str:
"""Convert specific HTML tags to headers using an XSLT transformation.
This method uses an XSLT file to transform the HTML content, converting
certain tags into headers for easier parsing. If no XSLT path is provided,
the HTML content is returned unchanged.
Args:
html_content (str): The HTML content to be transformed.
Returns:
str: The transformed HTML content as a string.
"""
if self.xslt_path is None:
return html_content

@@ -299,7 +330,7 @@ def convert_possible_tags_to_header(self, html_content: str) -> str:
return str(result)

def split_text_from_file(self, file: Any) -> List[Document]:
"""Split HTML file
"""Split HTML file.
Args:
file: HTML file
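As a usage sketch for the header-based HTML splitting documented in this file (sample HTML and header mapping invented; requires the lxml package per the class docstring):

```python
from langchain_text_splitters import HTMLHeaderTextSplitter

splitter = HTMLHeaderTextSplitter(
    headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")]
)
html = (
    "<html><body>"
    "<h1>Intro</h1><p>Hello.</p>"
    "<h2>Details</h2><p>More text.</p>"
    "</body></html>"
)
# Each returned Document carries its enclosing header(s) as metadata.
for doc in splitter.split_text(html):
    print(doc.metadata, "->", doc.page_content)
```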
39 changes: 32 additions & 7 deletions libs/text-splitters/langchain_text_splitters/json.py
@@ -8,9 +8,38 @@


class RecursiveJsonSplitter:
"""Splits JSON data into smaller, structured chunks while preserving hierarchy.
This class provides methods to split JSON data into smaller dictionaries or
JSON-formatted strings based on configurable maximum and minimum chunk sizes.
It supports nested JSON structures, optionally converts lists into dictionaries
for better chunking, and allows the creation of document objects for further use.
Attributes:
max_chunk_size (int): The maximum size for each chunk. Defaults to 2000.
min_chunk_size (int): The minimum size for each chunk, derived from
`max_chunk_size` if not explicitly provided.
"""

def __init__(
self, max_chunk_size: int = 2000, min_chunk_size: Optional[int] = None
):
"""Initialize the chunk size configuration for text processing.
This constructor sets up the maximum and minimum chunk sizes, ensuring that
the `min_chunk_size` defaults to a value slightly smaller than the
`max_chunk_size` if not explicitly provided.
Args:
max_chunk_size (int): The maximum size for a chunk. Defaults to 2000.
min_chunk_size (Optional[int]): The minimum size for a chunk. If None,
defaults to the maximum chunk size minus 200, with a lower bound of 50.
Attributes:
max_chunk_size (int): The configured maximum size for each chunk.
min_chunk_size (int): The configured minimum size for each chunk, derived
from `max_chunk_size` if not explicitly provided.
"""
super().__init__()
self.max_chunk_size = max_chunk_size
self.min_chunk_size = (
@@ -51,9 +80,7 @@ def _json_split(
current_path: Optional[List[str]] = None,
chunks: Optional[List[Dict]] = None,
) -> List[Dict]:
"""
Split json into maximum size dictionaries while preserving structure.
"""
"""Split json into maximum size dictionaries while preserving structure."""
current_path = current_path or []
chunks = chunks if chunks is not None else [{}]
if isinstance(data, dict):
@@ -83,8 +110,7 @@ def split_json(
json_data: Dict[str, Any],
convert_lists: bool = False,
) -> List[Dict]:
"""Splits JSON into a list of JSON chunks"""

"""Splits JSON into a list of JSON chunks."""
if convert_lists:
chunks = self._json_split(self._list_to_dict_preprocessing(json_data))
else:
@@ -101,8 +127,7 @@ def split_text(
convert_lists: bool = False,
ensure_ascii: bool = True,
) -> List[str]:
"""Splits JSON into a list of JSON formatted strings"""

"""Splits JSON into a list of JSON formatted strings."""
chunks = self.split_json(json_data=json_data, convert_lists=convert_lists)

# Convert to string
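A short usage sketch for `RecursiveJsonSplitter` (sample data invented). Note that with the defaults described above, `min_chunk_size` works out to max(2000 - 200, 50) = 1800:

```python
from langchain_text_splitters import RecursiveJsonSplitter

json_data = {
    "company": {
        "name": "Acme",
        "locations": {"hq": "Berlin", "lab": "Zurich"},
        "teams": {"eng": {"size": 40}, "ops": {"size": 12}},
    }
}

splitter = RecursiveJsonSplitter(max_chunk_size=60)
# split_json returns dict chunks; split_text returns JSON-formatted strings.
for chunk in splitter.split_text(json_data=json_data):
    print(chunk)
```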
44 changes: 38 additions & 6 deletions libs/text-splitters/langchain_text_splitters/markdown.py
@@ -45,7 +45,8 @@ def __init__(
self.strip_headers = strip_headers

def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[Document]:
"""Combine lines with common metadata into chunks
"""Combine lines with common metadata into chunks.
Args:
lines: Line of text / associated header metadata
"""
@@ -87,10 +88,11 @@ def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[Document]:
]

def split_text(self, text: str) -> List[Document]:
"""Split markdown file
Args:
text: Markdown file"""
"""Split markdown file.
Args:
text: Markdown file
"""
# Split the input text by newline character ("\n").
lines = text.split("\n")
# Final output
@@ -225,8 +227,7 @@ class HeaderType(TypedDict):


class ExperimentalMarkdownSyntaxTextSplitter:
"""
An experimental text splitter for handling Markdown syntax.
"""An experimental text splitter for handling Markdown syntax.
This splitter aims to retain the exact whitespace of the original text while
extracting structured metadata, such as headers. It is a re-implementation of the
@@ -280,6 +281,22 @@ def __init__(
return_each_line: bool = False,
strip_headers: bool = True,
):
"""Initialize the text splitter with header splitting and formatting options.
This constructor sets up the required configuration for splitting text into
chunks based on specified headers and formatting preferences.
Args:
headers_to_split_on (Union[List[Tuple[str, str]], None]):
A list of tuples, where each tuple contains a header tag (e.g., "h1")
and its corresponding metadata key. If None, default headers are used.
return_each_line (bool):
Whether to return each line as an individual chunk.
Defaults to False, which aggregates lines into larger chunks.
strip_headers (bool):
Whether to exclude headers from the resulting chunks.
Defaults to True.
"""
self.chunks: List[Document] = []
self.current_chunk = Document(page_content="")
self.current_header_stack: List[Tuple[int, str]] = []
@@ -292,6 +309,21 @@ def __init__(
self.return_each_line = return_each_line

def split_text(self, text: str) -> List[Document]:
"""Split the input text into structured chunks.
This method processes the input text line by line, identifying and handling
specific patterns such as headers, code blocks, and horizontal rules to
split it into structured chunks based on headers, code blocks, and
horizontal rules.
Args:
text (str): The input text to be split into chunks.
Returns:
List[Document]: A list of `Document` objects representing the structured
chunks of the input text. If `return_each_line` is enabled, each line
is returned as a separate `Document`.
"""
raw_lines = text.splitlines(keepends=True)

while raw_lines:
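Finally, a usage sketch for header-based Markdown splitting as configured by the options documented above, using the stable `MarkdownHeaderTextSplitter` (whose header-metadata behavior the experimental splitter re-implements; sample Markdown invented):

```python
from langchain_text_splitters import MarkdownHeaderTextSplitter

markdown = "# Title\n\nIntro text.\n\n## Section\n\nBody text.\n"
splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[("#", "Header 1"), ("##", "Header 2")]
)
# Header text is captured into metadata under the keys given above.
for doc in splitter.split_text(markdown):
    print(doc.metadata, "->", doc.page_content)
```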