From bdaadd79f9aed8a8de3eb54231f63d531bc223c4 Mon Sep 17 00:00:00 2001
From: Johannes Wesch
Date: Tue, 6 Feb 2024 09:28:03 +0100
Subject: [PATCH] added break for recursive summary when the number of partial
 summaries doesn't change

---
 src/intelligence_layer/core/chunk.py | 4 +---
 tests/core/test_chunk.py             | 5 +++--
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/intelligence_layer/core/chunk.py b/src/intelligence_layer/core/chunk.py
index 0bdd6b566..92b3d8c2f 100644
--- a/src/intelligence_layer/core/chunk.py
+++ b/src/intelligence_layer/core/chunk.py
@@ -96,9 +96,7 @@ def __init__(
                 overlap_length_tokens, max_tokens_per_chunk
             )
         )
-        self.chunk_task = ChunkTask(
-            client, model, overlap_length_tokens // 2
-        )
+        self.chunk_task = ChunkTask(client, model, overlap_length_tokens // 2)
         self.tokenizer = client.tokenizer(model)
         self.max_tokens_per_chunk = max_tokens_per_chunk
         self.overlap_length_tokens = overlap_length_tokens
diff --git a/tests/core/test_chunk.py b/tests/core/test_chunk.py
index b2c6ca1bc..888e5d912 100644
--- a/tests/core/test_chunk.py
+++ b/tests/core/test_chunk.py
@@ -37,14 +37,15 @@ def test_overlapped_chunking(
     print(first)
 
     assert (
-        len(first) <= MAX_TOKENS + 2
+        len(first)
+        <= MAX_TOKENS + 2
         # `+2` because re-tokenizing the chunk can add a few extra tokens at
         # the beginning or end of each chunk. This is a hack.
     )
 
     next = output_tokenized[chunk_index + 1].tokens
     found = False
-    for offset in range(len(next)-OVERLAP//2):
+    for offset in range(len(next) - OVERLAP // 2):
         if first[-OVERLAP // 2 :] != next[offset : offset + OVERLAP // 2]:
             continue
         found = True
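
The hunks above only carry formatting changes, so the break described in the
subject line is not visible here. Below is a minimal sketch of that kind of
convergence guard in a recursive summarization loop; `recursive_summary`,
`chunk_text`, and `summarize_chunk` are hypothetical names for illustration,
not the intelligence_layer API.

from typing import Callable, List


def recursive_summary(
    text: str,
    chunk_text: Callable[[str], List[str]],
    summarize_chunk: Callable[[str], str],
) -> str:
    # Hypothetical sketch, not the actual intelligence_layer implementation.
    # Summarize each chunk, then repeatedly re-chunk and re-summarize the
    # concatenated partial summaries until a single summary remains.
    partial_summaries = [summarize_chunk(chunk) for chunk in chunk_text(text)]
    while len(partial_summaries) > 1:
        previous_count = len(partial_summaries)
        merged = "\n".join(partial_summaries)
        partial_summaries = [
            summarize_chunk(chunk) for chunk in chunk_text(merged)
        ]
        # Break once a pass no longer reduces the number of partial
        # summaries; without this guard the loop could run forever on
        # input the model cannot compress any further.
        if len(partial_summaries) >= previous_count:
            break
    return "\n".join(partial_summaries)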