Skip to content

Commit

Permalink
wip: fixing all tests after changes
Browse files Browse the repository at this point in the history
  • Loading branch information
davidsbatista committed Dec 12, 2024
1 parent d66afd5 commit 5bcf709
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 64 deletions.
4 changes: 1 addition & 3 deletions haystack/components/preprocessors/recursive_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,18 +128,16 @@ def _chunk_text(self, text: str) -> List[str]:

for curr_separator in self.separators: # type: ignore # the caller already checked that separators is not None
if curr_separator == "sentence":
# using the custom NLTK-based sentence tokenizer
sentence_with_spans = self.nltk_tokenizer.split_sentences(text)
splits = [sentence["sentence"] for sentence in sentence_with_spans]
else:
# using the separator as a regex
escaped_separator = re.escape(curr_separator)
escaped_separator = (
f"({escaped_separator})" # wrap the separator in a group to include it in the splits
)
splits = re.split(escaped_separator, text)

# merge every two consecutive splits (i.e., the ext and the separator after it)
# merge every two consecutive splits, i.e.: the text and the separator after it
splits = [
"".join([splits[i], splits[i + 1]]) if i < len(splits) - 1 else splits[i]
for i in range(0, len(splits), 2)
Expand Down
105 changes: 44 additions & 61 deletions test/components/preprocessors/test_recursive_splitter.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pytest
from pytest import LogCaptureFixture

from haystack import Document, Pipeline
from haystack.components.preprocessors.recursive_splitter import RecursiveDocumentSplitter
Expand Down Expand Up @@ -72,72 +73,54 @@ def test_chunk_text_by_period():
assert chunks[2] == " And one more."


def test_recursive_splitter_empty_documents():
def test_recursive_splitter_multiple_new_lines():
    """Runs of consecutive newlines are kept attached to the chunks around the split points."""
    document = Document(content="This is a test.\n\n\nAnother test.\n\n\n\nFinal test.")
    chunker = RecursiveDocumentSplitter(split_length=20, separators=["\n\n", "\n"])
    produced = chunker.run([document])["documents"]
    expected = ["This is a test.\n\n", "\nAnother test.\n\n", "\n\nFinal test."]
    # Compare the first three chunks against the expected contents in one shot.
    assert [chunk.content for chunk in produced[:3]] == expected


def test_recursive_splitter_empty_documents(caplog: LogCaptureFixture):
    """A document with empty content produces no chunks at all."""
    chunker = RecursiveDocumentSplitter(split_length=20, split_overlap=0, separators=["."])
    output = chunker.run([Document(content="")])
    assert not output["documents"]


def test_recursive_splitter_generate_empty_chunks():
    """Splitting on newlines does not emit empty chunks for runs of separators."""
    source_text = "This is a test.\n\n\nAnother test.\n\n\n\nFinal test."
    chunker = RecursiveDocumentSplitter(split_length=15, separators=["\n\n", "\n"])
    produced = chunker.run([Document(content=source_text)])["documents"]
    expected = ["This is a test.", "\nAnother test.", "Final test."]
    # Compare the first three chunks against the expected contents in one shot.
    assert [chunk.content for chunk in produced[:3]] == expected


# def test_chunk_text_using_nltk_sentence():
# """
# This test includes abbreviations that are not handled by the simple sentence tokenizer based on "." and
# requires a more sophisticated sentence tokenizer like the one provided by NLTK.
# """
#
# splitter = RecursiveDocumentSplitter(split_length=400, split_overlap=0, separators=["\n\n", "\n", ".", " "])
# text = """Artificial intelligence (AI) - Introduction
#
# AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems.
# AI technology is widely used throughout industry, government, and science. Some high-profile applications include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); interacting via human speech (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., ChatGPT and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go).""" # noqa: E501
#
# chunks = splitter._chunk_text(text)
# assert len(chunks) == 4
# assert chunks[0] == "Artificial intelligence (AI) - Introduction\n\n"
# assert (
# chunks[1]
# == "AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems.\n"
# ) # noqa: E501
# assert chunks[2] == "AI technology is widely used throughout industry, government, and science." # noqa: E501
# assert (
# chunks[3]
# == "Some high-profile applications include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); interacting via human speech (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., ChatGPT and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go)."
# ) # noqa: E501


# def test_recursive_chunker_with_multiple_separators_recursive():
# splitter = RecursiveDocumentSplitter(split_length=260, split_overlap=0, separators=["\n\n", "\n", "sentence", " "])
# text = """Artificial intelligence (AI) - Introduction
#
# AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems.
# AI technology is widely used throughout industry, government, and science. Some high-profile applications include advanced web search engines; recommendation systems; interacting via human speech; autonomous vehicles; generative and creative tools; and superhuman play and analysis in strategy games.""" # noqa: E501
#
# doc = Document(content=text)
# doc_chunks = splitter.run([doc])
# doc_chunks = doc_chunks["documents"]
#
# assert len(doc_chunks) == 4
# assert doc_chunks[0].content == "Artificial intelligence (AI) - Introduction\n\n"
# assert (
# doc_chunks[1].content
# == "AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems.\n"
# )
# assert doc_chunks[2].content == "AI technology is widely used throughout industry, government, and science."
# assert (
# doc_chunks[3].content
# == " Some high-profile applications include advanced web search engines; recommendation systems; interacting via human speech; autonomous vehicles; generative and creative tools; and superhuman play and analysis in strategy games."
# )
assert "has an empty content. Skipping this document." in caplog.text


def test_recursive_splitter_using_custom_sentence_tokenizer():
    """
    Exercises the "sentence" separator: the text contains abbreviations (e.g., "e.g.") that a simple
    "."-based tokenizer would split incorrectly, so the NLTK-based sentence tokenizer must be used.
    """
    text = """Artificial intelligence (AI) - Introduction
AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems.
AI technology is widely used throughout industry, government, and science. Some high-profile applications include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); interacting via human speech (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., ChatGPT and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go).""" # noqa: E501

    chunker = RecursiveDocumentSplitter(split_length=400, split_overlap=0, separators=["\n\n", "\n", "sentence", " "])
    produced = chunker.run([Document(content=text)])["documents"]

    expected = [
        "Artificial intelligence (AI) - Introduction\n\n",
        "AI, in its broadest sense, is intelligence exhibited by machines, particularly computer systems.\n",  # noqa: E501
        "AI technology is widely used throughout industry, government, and science.",  # noqa: E501
        "Some high-profile applications include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); interacting via human speech (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., ChatGPT and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go).",  # noqa: E501
    ]
    assert len(produced) == len(expected)
    for chunk, wanted in zip(produced, expected):
        assert chunk.content == wanted


def test_recursive_splitter_custom_sentence_tokenizer_document_and_overlap():
    """Placeholder: should cover the custom sentence tokenizer combined with a non-zero split_overlap."""
    # TODO: implement this test — currently a stub left as work-in-progress.
    pass


def test_recursive_chunker_split_document_with_overlap():
Expand Down

0 comments on commit 5bcf709

Please sign in to comment.