From ccb0ed1a2afdf8bbdfa6708907248e39fd2b6369 Mon Sep 17 00:00:00 2001 From: jyong Date: Thu, 7 Mar 2024 18:19:50 +0800 Subject: [PATCH] fix overlap and splitter optimization --- api/core/rag/index_processor/index_processor_base.py | 4 ++-- api/core/splitter/text_splitter.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/api/core/rag/index_processor/index_processor_base.py b/api/core/rag/index_processor/index_processor_base.py index fcb06e5c84653..509a1a189b2e2 100644 --- a/api/core/rag/index_processor/index_processor_base.py +++ b/api/core/rag/index_processor/index_processor_base.py @@ -52,7 +52,7 @@ def _get_splitter(self, processing_rule: dict, character_splitter = FixedRecursiveCharacterTextSplitter.from_encoder( chunk_size=segmentation["max_tokens"], - chunk_overlap=0, + chunk_overlap=segmentation.get('chunk_overlap', 0), fixed_separator=separator, separators=["\n\n", "。", ".", " ", ""], embedding_model_instance=embedding_model_instance @@ -61,7 +61,7 @@ def _get_splitter(self, processing_rule: dict, # Automatic segmentation character_splitter = EnhanceRecursiveCharacterTextSplitter.from_encoder( chunk_size=DatasetProcessRule.AUTOMATIC_RULES['segmentation']['max_tokens'], - chunk_overlap=0, + chunk_overlap=DatasetProcessRule.AUTOMATIC_RULES['segmentation']['chunk_overlap'], separators=["\n\n", "。", ".", " ", ""], embedding_model_instance=embedding_model_instance ) diff --git a/api/core/splitter/text_splitter.py b/api/core/splitter/text_splitter.py index e3d43c0658144..5eeb237a960ea 100644 --- a/api/core/splitter/text_splitter.py +++ b/api/core/splitter/text_splitter.py @@ -30,7 +30,7 @@ def _split_text_with_regex( if separator: if keep_separator: # The parentheses in the pattern keep the delimiters in the result. - _splits = re.split(f"({separator})", text) + _splits = re.split(f"({re.escape(separator)})", text) splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)] if len(_splits) % 2 == 0: splits += _splits[-1:] @@ -94,7 +94,7 @@ def create_documents( documents.append(new_doc) return documents - def split_documents(self, documents: Iterable[Document]) -> list[Document]: + def split_documents(self, documents: Iterable[Document] ) -> list[Document]: """Split documents.""" texts, metadatas = [], [] for doc in documents: