From 90f162efb6313ba6d1b5f6311e2c83835c4d1bdd Mon Sep 17 00:00:00 2001
From: Ankit Dangi
Date: Sun, 8 Dec 2024 22:01:03 -0800
Subject: [PATCH] text-splitters: add pydocstyle linting (#28127)

As seen in #23188, turned on Google-style docstrings by enabling `pydocstyle`
linting in the `text-splitters` package. Each resulting linting error was
addressed in one of several ways: resolved, suppressed, or ignored, and
missing docstrings were added.

Fixes one of the checklist items from #25154, similar to #25939 in the `core`
package.

Ran `make format`, `make lint` and `make test` from the root of the
`text-splitters` package to ensure no issues were found.

---------

Co-authored-by: Erick Friis
---
 .../langchain_text_splitters/__init__.py      |  1 -
 .../langchain_text_splitters/base.py          | 15 ++++++
 .../langchain_text_splitters/character.py     | 28 ++++++++++
 .../langchain_text_splitters/html.py          | 53 +++++++++++++++----
 .../langchain_text_splitters/json.py          | 39 +++++++++++---
 .../langchain_text_splitters/markdown.py      | 44 ++++++++++++---
 .../sentence_transformers.py                  | 25 +++++++++
 .../langchain_text_splitters/spacy.py         |  1 -
 libs/text-splitters/pyproject.toml            | 15 +++++-
 9 files changed, 194 insertions(+), 27 deletions(-)

diff --git a/libs/text-splitters/langchain_text_splitters/__init__.py b/libs/text-splitters/langchain_text_splitters/__init__.py
index 58ad7b0e4c585..65af087fdd85f 100644
--- a/libs/text-splitters/langchain_text_splitters/__init__.py
+++ b/libs/text-splitters/langchain_text_splitters/__init__.py
@@ -1,6 +1,5 @@
 """**Text Splitters** are classes for splitting text.
 
-
 **Class hierarchy:**
 
 .. code-block::
diff --git a/libs/text-splitters/langchain_text_splitters/base.py b/libs/text-splitters/langchain_text_splitters/base.py
index 0e0a49c182da7..10dd6903ba172 100644
--- a/libs/text-splitters/langchain_text_splitters/base.py
+++ b/libs/text-splitters/langchain_text_splitters/base.py
@@ -249,6 +249,21 @@ def __init__(
         self._disallowed_special = disallowed_special
 
     def split_text(self, text: str) -> List[str]:
+        """Splits the input text into smaller chunks based on tokenization.
+
+        This method uses a custom tokenizer configuration to encode the input text
+        into tokens, processes the tokens in chunks of a specified size with overlap,
+        and decodes them back into text chunks. The splitting is performed using the
+        `split_text_on_tokens` function.
+
+        Args:
+            text (str): The input text to be split into smaller chunks.
+
+        Returns:
+            List[str]: A list of text chunks, where each chunk is derived from a portion
+            of the input text based on the tokenization and chunking rules.
+        """
+
         def _encode(_text: str) -> List[int]:
             return self._tokenizer.encode(
                 _text,
diff --git a/libs/text-splitters/langchain_text_splitters/character.py b/libs/text-splitters/langchain_text_splitters/character.py
index f65c38869d394..a2918bd27f0ac 100644
--- a/libs/text-splitters/langchain_text_splitters/character.py
+++ b/libs/text-splitters/langchain_text_splitters/character.py
@@ -115,17 +115,45 @@ def _split_text(self, text: str, separators: List[str]) -> List[str]:
         return final_chunks
 
     def split_text(self, text: str) -> List[str]:
+        """Split the input text into smaller chunks based on predefined separators.
+
+        Args:
+            text (str): The input text to be split.
+
+        Returns:
+            List[str]: A list of text chunks obtained after splitting.
+        """
         return self._split_text(text, self._separators)
 
     @classmethod
     def from_language(
         cls, language: Language, **kwargs: Any
     ) -> RecursiveCharacterTextSplitter:
+        """Return an instance of this class based on a specific language.
+
+        This method initializes the text splitter with language-specific separators.
+
+        Args:
+            language (Language): The language to configure the text splitter for.
+            **kwargs (Any): Additional keyword arguments to customize the splitter.
+
+        Returns:
+            RecursiveCharacterTextSplitter: An instance of the text splitter configured
+            for the specified language.
+        """
         separators = cls.get_separators_for_language(language)
         return cls(separators=separators, is_separator_regex=True, **kwargs)
 
     @staticmethod
     def get_separators_for_language(language: Language) -> List[str]:
+        """Retrieve a list of separators specific to the given language.
+
+        Args:
+            language (Language): The language for which to get the separators.
+
+        Returns:
+            List[str]: A list of separators appropriate for the specified language.
+        """
         if language == Language.C or language == Language.CPP:
             return [
                 # Split along class definitions
diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py
index cdbea7f724b53..241c0981f58f5 100644
--- a/libs/text-splitters/langchain_text_splitters/html.py
+++ b/libs/text-splitters/langchain_text_splitters/html.py
@@ -21,8 +21,8 @@ class ElementType(TypedDict):
 
 
 class HTMLHeaderTextSplitter:
-    """
-    Splitting HTML files based on specified headers.
+    """Splitting HTML files based on specified headers.
+
     Requires lxml package.
     """
 
@@ -46,7 +46,7 @@ def __init__(
     def aggregate_elements_to_chunks(
         self, elements: List[ElementType]
     ) -> List[Document]:
-        """Combine elements with common metadata into chunks
+        """Combine elements with common metadata into chunks.
 
         Args:
             elements: HTML element content with associated identifying info and metadata
@@ -72,7 +72,7 @@ def aggregate_elements_to_chunks(
         ]
 
     def split_text_from_url(self, url: str, **kwargs: Any) -> List[Document]:
-        """Split HTML from web URL
+        """Split HTML from web URL.
 
         Args:
             url: web URL
@@ -83,7 +83,7 @@
         return self.split_text_from_file(BytesIO(r.content))
 
     def split_text(self, text: str) -> List[Document]:
-        """Split HTML text string
+        """Split HTML text string.
 
         Args:
             text: HTML text
@@ -91,7 +91,7 @@
         return self.split_text_from_file(StringIO(text))
 
     def split_text_from_file(self, file: Any) -> List[Document]:
-        """Split HTML file
+        """Split HTML file.
 
         Args:
             file: HTML file
@@ -166,8 +166,8 @@
 
 
 class HTMLSectionSplitter:
-    """
-    Splitting HTML files based on specified tag and font sizes.
+    """Splitting HTML files based on specified tag and font sizes.
+
     Requires lxml package.
     """
 
@@ -186,6 +186,8 @@ def __init__(
             xslt_path: path to xslt file for document transformation.
                 Uses a default if not passed.
                 Needed for html contents that using different format and layouts.
+            **kwargs (Any): Additional optional arguments for customizations.
+
         """
         self.headers_to_split_on = dict(headers_to_split_on)
 
@@ -210,7 +212,7 @@ def split_documents(self, documents: Iterable[Document]) -> List[Document]:
         return text_splitter.split_documents(results)
 
     def split_text(self, text: str) -> List[Document]:
-        """Split HTML text string
+        """Split HTML text string.
 
         Args:
             text: HTML text
@@ -236,6 +238,23 @@ def create_documents(
         return documents
 
     def split_html_by_headers(self, html_doc: str) -> List[Dict[str, Optional[str]]]:
+        """Split an HTML document into sections based on specified header tags.
+
+        This method uses BeautifulSoup to parse the HTML content and divides it into
+        sections based on headers defined in `headers_to_split_on`. Each section
+        contains the header text, content under the header, and the tag name.
+
+        Args:
+            html_doc (str): The HTML document to be split into sections.
+
+        Returns:
+            List[Dict[str, Optional[str]]]: A list of dictionaries representing
+            sections.
+                Each dictionary contains:
+                    - 'header': The header text or a default title for the first section.
+                    - 'content': The content under the header.
+                    - 'tag_name': The name of the header tag (e.g., "h1", "h2").
+        """
         try:
             from bs4 import BeautifulSoup, PageElement  # type: ignore[import-untyped]
         except ImportError as e:
@@ -259,7 +278,7 @@ def split_html_by_headers(self, html_doc: str) -> List[Dict[str, Optional[str]]]
                 section_content: List = []
             else:
                 current_header = header_element.text.strip()
-                current_header_tag = header_element.name
+                current_header_tag = header_element.name  # type: ignore[attr-defined]
                 section_content = []
             for element in header_element.next_elements:
                 if i + 1 < len(headers) and element == headers[i + 1]:
@@ -280,6 +299,18 @@ def split_html_by_headers(self, html_doc: str) -> List[Dict[str, Optional[str]]]
         return sections
 
     def convert_possible_tags_to_header(self, html_content: str) -> str:
+        """Convert specific HTML tags to headers using an XSLT transformation.
+
+        This method uses an XSLT file to transform the HTML content, converting
+        certain tags into headers for easier parsing. If no XSLT path is provided,
+        the HTML content is returned unchanged.
+
+        Args:
+            html_content (str): The HTML content to be transformed.
+
+        Returns:
+            str: The transformed HTML content as a string.
+        """
         if self.xslt_path is None:
             return html_content
 
@@ -299,7 +330,7 @@ def convert_possible_tags_to_header(self, html_content: str) -> str:
         return str(result)
 
     def split_text_from_file(self, file: Any) -> List[Document]:
-        """Split HTML file
+        """Split HTML file.
 
         Args:
             file: HTML file
diff --git a/libs/text-splitters/langchain_text_splitters/json.py b/libs/text-splitters/langchain_text_splitters/json.py
index c83d8b2a42880..c58174dd8b33a 100644
--- a/libs/text-splitters/langchain_text_splitters/json.py
+++ b/libs/text-splitters/langchain_text_splitters/json.py
@@ -8,9 +8,38 @@
 
 
 class RecursiveJsonSplitter:
+    """Splits JSON data into smaller, structured chunks while preserving hierarchy.
+
+    This class provides methods to split JSON data into smaller dictionaries or
+    JSON-formatted strings based on configurable maximum and minimum chunk sizes.
+    It supports nested JSON structures, optionally converts lists into dictionaries
+    for better chunking, and allows the creation of document objects for further use.
+
+    Attributes:
+        max_chunk_size (int): The maximum size for each chunk. Defaults to 2000.
+        min_chunk_size (int): The minimum size for each chunk, derived from
+            `max_chunk_size` if not explicitly provided.
+    """
+
     def __init__(
         self, max_chunk_size: int = 2000, min_chunk_size: Optional[int] = None
     ):
+        """Initialize the chunk size configuration for text processing.
+
+        This constructor sets up the maximum and minimum chunk sizes, ensuring that
+        the `min_chunk_size` defaults to a value slightly smaller than the
+        `max_chunk_size` if not explicitly provided.
+
+        Args:
+            max_chunk_size (int): The maximum size for a chunk. Defaults to 2000.
+            min_chunk_size (Optional[int]): The minimum size for a chunk. If None,
+                defaults to the maximum chunk size minus 200, with a lower bound of 50.
+
+        Attributes:
+            max_chunk_size (int): The configured maximum size for each chunk.
+            min_chunk_size (int): The configured minimum size for each chunk, derived
+                from `max_chunk_size` if not explicitly provided.
+        """
         super().__init__()
         self.max_chunk_size = max_chunk_size
         self.min_chunk_size = (
@@ -51,9 +80,7 @@ def _json_split(
         current_path: Optional[List[str]] = None,
         chunks: Optional[List[Dict]] = None,
     ) -> List[Dict]:
-        """
-        Split json into maximum size dictionaries while preserving structure.
-        """
+        """Split json into maximum size dictionaries while preserving structure."""
         current_path = current_path or []
         chunks = chunks if chunks is not None else [{}]
         if isinstance(data, dict):
@@ -83,8 +110,7 @@ def split_json(
         json_data: Dict[str, Any],
         convert_lists: bool = False,
     ) -> List[Dict]:
-        """Splits JSON into a list of JSON chunks"""
-
+        """Splits JSON into a list of JSON chunks."""
         if convert_lists:
             chunks = self._json_split(self._list_to_dict_preprocessing(json_data))
         else:
@@ -101,8 +127,7 @@ def split_text(
         convert_lists: bool = False,
         ensure_ascii: bool = True,
     ) -> List[str]:
-        """Splits JSON into a list of JSON formatted strings"""
-
+        """Splits JSON into a list of JSON formatted strings."""
         chunks = self.split_json(json_data=json_data, convert_lists=convert_lists)
 
         # Convert to string
diff --git a/libs/text-splitters/langchain_text_splitters/markdown.py b/libs/text-splitters/langchain_text_splitters/markdown.py
index fdcd010f50d43..34c7d2197d238 100644
--- a/libs/text-splitters/langchain_text_splitters/markdown.py
+++ b/libs/text-splitters/langchain_text_splitters/markdown.py
@@ -45,7 +45,8 @@ def __init__(
         self.strip_headers = strip_headers
 
     def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[Document]:
-        """Combine lines with common metadata into chunks
+        """Combine lines with common metadata into chunks.
+
         Args:
             lines: Line of text / associated header metadata
         """
@@ -87,10 +88,11 @@ def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[Document]:
         ]
 
     def split_text(self, text: str) -> List[Document]:
-        """Split markdown file
-        Args:
-            text: Markdown file"""
+        """Split markdown file.
 
+        Args:
+            text: Markdown file
+        """
         # Split the input text by newline character ("\n").
         lines = text.split("\n")
         # Final output
@@ -225,8 +227,7 @@ class HeaderType(TypedDict):
 
 
 class ExperimentalMarkdownSyntaxTextSplitter:
-    """
-    An experimental text splitter for handling Markdown syntax.
+    """An experimental text splitter for handling Markdown syntax.
 
     This splitter aims to retain the exact whitespace of the original text while
     extracting structured metadata, such as headers. It is a re-implementation of the
@@ -280,6 +281,22 @@ def __init__(
         return_each_line: bool = False,
         strip_headers: bool = True,
     ):
+        """Initialize the text splitter with header splitting and formatting options.
+
+        This constructor sets up the required configuration for splitting text into
+        chunks based on specified headers and formatting preferences.
+
+        Args:
+            headers_to_split_on (Union[List[Tuple[str, str]], None]):
+                A list of tuples, where each tuple contains a header tag (e.g., "h1")
+                and its corresponding metadata key. If None, default headers are used.
+            return_each_line (bool):
+                Whether to return each line as an individual chunk.
+                Defaults to False, which aggregates lines into larger chunks.
+            strip_headers (bool):
+                Whether to exclude headers from the resulting chunks.
+                Defaults to True.
+        """
         self.chunks: List[Document] = []
         self.current_chunk = Document(page_content="")
         self.current_header_stack: List[Tuple[int, str]] = []
@@ -292,6 +309,21 @@ def __init__(
         self.return_each_line = return_each_line
 
     def split_text(self, text: str) -> List[Document]:
+        """Split the input text into structured chunks.
+
+        This method processes the input text line by line, identifying and handling
+        specific patterns such as headers, code blocks, and horizontal rules to
+        split it into structured chunks based on headers, code blocks, and
+        horizontal rules.
+
+        Args:
+            text (str): The input text to be split into chunks.
+
+        Returns:
+            List[Document]: A list of `Document` objects representing the structured
+                chunks of the input text. If `return_each_line` is enabled, each line
+                is returned as a separate `Document`.
+        """
         raw_lines = text.splitlines(keepends=True)
 
         while raw_lines:
diff --git a/libs/text-splitters/langchain_text_splitters/sentence_transformers.py b/libs/text-splitters/langchain_text_splitters/sentence_transformers.py
index beb314d810d9e..3b19c5edc594d 100644
--- a/libs/text-splitters/langchain_text_splitters/sentence_transformers.py
+++ b/libs/text-splitters/langchain_text_splitters/sentence_transformers.py
@@ -51,6 +51,20 @@ def _initialize_chunk_configuration(
         )
 
     def split_text(self, text: str) -> List[str]:
+        """Splits the input text into smaller components by splitting text on tokens.
+
+        This method encodes the input text using a private `_encode` method, then
+        strips the start and stop token IDs from the encoded result. It returns the
+        processed segments as a list of strings.
+
+        Args:
+            text (str): The input text to be split.
+
+        Returns:
+            List[str]: A list of string components derived from the input text after
+            encoding and processing.
+        """
+
         def encode_strip_start_and_stop_token_ids(text: str) -> List[int]:
             return self._encode(text)[1:-1]
 
@@ -64,6 +78,17 @@ def encode_strip_start_and_stop_token_ids(text: str) -> List[int]:
         return split_text_on_tokens(text=text, tokenizer=tokenizer)
 
     def count_tokens(self, *, text: str) -> int:
+        """Counts the number of tokens in the given text.
+
+        This method encodes the input text using a private `_encode` method and
+        calculates the total number of tokens in the encoded result.
+
+        Args:
+            text (str): The input text for which the token count is calculated.
+
+        Returns:
+            int: The number of tokens in the encoded text.
+        """
         return len(self._encode(text))
 
     _max_length_equal_32_bit_integer: int = 2**32
diff --git a/libs/text-splitters/langchain_text_splitters/spacy.py b/libs/text-splitters/langchain_text_splitters/spacy.py
index 447a3e429600c..a15e8b00418a0 100644
--- a/libs/text-splitters/langchain_text_splitters/spacy.py
+++ b/libs/text-splitters/langchain_text_splitters/spacy.py
@@ -8,7 +8,6 @@
 class SpacyTextSplitter(TextSplitter):
     """Splitting text using Spacy package.
 
-
     Per default, Spacy's `en_core_web_sm` model is used and
     its default max_length is 1000000 (it is the length of maximum character
     this model takes which can be increased for large files). For a faster, but
diff --git a/libs/text-splitters/pyproject.toml b/libs/text-splitters/pyproject.toml
index c4c8a7535860e..53f8993809454 100644
--- a/libs/text-splitters/pyproject.toml
+++ b/libs/text-splitters/pyproject.toml
@@ -26,7 +26,20 @@ python = ">=3.9,<4.0"
 langchain-core = "^0.3.15"
 
 [tool.ruff.lint]
-select = [ "E", "F", "I", "T201",]
+select = [
+    "E",  # pycodestyle
+    "F",  # Pyflakes
+    "I",  # isort
+    "T201",  # print
+    "D",  # pydocstyle
+]
+ignore = ["D100"]  # ignore missing module docstring
+
+[tool.ruff.lint.pydocstyle]
+convention = "google"
+
+[tool.ruff.lint.per-file-ignores]
+"tests/**" = ["D"]  # ignore docstring checks for tests
 
 [tool.coverage.run]
 omit = [ "tests/*",]
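
Note (illustrative, not part of the diff above): with the `D` rules selected and
`convention = "google"` in the pyproject.toml change, ruff expects docstrings
shaped roughly like the following minimal sketch. The `greet` function and its
parameters are hypothetical and exist only to show the layout the checks enforce.

    def greet(name: str, excited: bool = False) -> str:
        """Return a one-line greeting for the given name.

        Args:
            name (str): Name to include in the greeting. (hypothetical example)
            excited (bool): Whether to end the greeting with an exclamation mark.
                Defaults to False.

        Returns:
            str: The formatted greeting.
        """
        suffix = "!" if excited else "."
        return f"Hello, {name}{suffix}"

Under this configuration, missing or malformed docstrings are flagged everywhere
except under `tests/**`, and missing module docstrings (`D100`) remain ignored.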