From c233ecc061ec9c564494853d9acbf6ee8b52a305 Mon Sep 17 00:00:00 2001
From: Felix Fehse
Date: Mon, 5 Feb 2024 14:56:27 +0100
Subject: [PATCH 1/5] IL-239 added ChunkOverlapTask

---
 src/intelligence_layer/core/chunk.py          | 38 +++++++++++++++++++
 .../steerable_long_context_summarize.py       |  7 +---
 tests/core/test_chunk.py                      |  0
 3 files changed, 40 insertions(+), 5 deletions(-)
 create mode 100644 tests/core/test_chunk.py

diff --git a/src/intelligence_layer/core/chunk.py b/src/intelligence_layer/core/chunk.py
index 73a1fb3cc..70e1a5538 100644
--- a/src/intelligence_layer/core/chunk.py
+++ b/src/intelligence_layer/core/chunk.py
@@ -66,3 +66,41 @@ def do_run(self, input: ChunkInput, task_span: TaskSpan) -> ChunkOutput:
             for t in self._splitter.chunks(input.text, self._max_tokens_per_chunk)
         ]
         return ChunkOutput(chunks=chunks)
+
+
+class ChunkOverlapTask(Task[ChunkInput, ChunkOutput]):
+    """Splits a longer text into smaller text chunks, where every chunk overlaps
+    with the previous chunk by `overlap_length_tokens` tokens.
+
+    Provide a text of any length and chunk it into smaller pieces using a
+    tokenizer that is available within the Aleph Alpha client.
+
+    Args:
+        client: Aleph Alpha client instance for running model related API calls.
+        model: A valid Aleph Alpha model name.
+        max_tokens_per_chunk: The maximum number of tokens to fit into one chunk.
+        overlap_length_tokens: The number of tokens every chunk overlaps with the previous chunk.
+    """
+
+    def __init__(
+        self,
+        client: AlephAlphaClientProtocol,
+        model: str,
+        max_tokens_per_chunk: int,
+        overlap_length_tokens: int
+    ):
+        super().__init__()
+        self.chunk_task = ChunkTask(client, model, max_tokens_per_chunk-overlap_length_tokens)
+        self.tokenizer = client.tokenizer(model)
+        self.overlap_length_tokens = overlap_length_tokens
+
+    def do_run(self, input: ChunkInput, task_span: TaskSpan) -> ChunkOutput:
+        chunks = self.chunk_task.run(input, task_span).chunks
+        token_chunks = self.tokenizer.encode_batch(chunks)
+        chunk_tokens = [token_chunks[0].tokens] + [
+            token_chunks[i].tokens[-self.overlap_length_tokens:].append(token_chunks[i+1].tokens)
+            for i in range(len(token_chunks)-1)
+        ]
+        decoded_chunks = self.tokenizer.decode_batch(chunk_tokens)
+        return ChunkOutput(chunks=decoded_chunks)
+
diff --git a/src/intelligence_layer/use_cases/summarize/steerable_long_context_summarize.py b/src/intelligence_layer/use_cases/summarize/steerable_long_context_summarize.py
index 5d96af700..8cba09b7f 100644
--- a/src/intelligence_layer/use_cases/summarize/steerable_long_context_summarize.py
+++ b/src/intelligence_layer/use_cases/summarize/steerable_long_context_summarize.py
@@ -31,14 +31,11 @@ class SteerableLongContextSummarize(
 
     Args:
         client: Aleph Alpha client instance for running model related API calls.
-        few_shot_configs: A mapping of valid `Language` to `FewShotConfig` for each
-            supported language.
-        model: A valid Aleph Alpha model name.
        max_generated_tokens: The maximum number of tokens per sub-summary.
         max_tokens_per_chunk: The maximum number of tokens per chunk that the long text
             is divided into.
-        allowed_languages: List of languages to which the language detection is limited (ISO619).
-        fallback_language: The default language of the output.
+        model: A valid Aleph Alpha model name.
+        instruction_configs: A mapping of `Language` to the instruction prompt used for that language.
     """

     def __init__(
diff --git a/tests/core/test_chunk.py b/tests/core/test_chunk.py
new file mode 100644
index 000000000..e69de29bb
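[Illustration, not part of the patch series: a minimal usage sketch of the task added
above, showing the API as the docstring intends it. `Client` and the token placeholder
are assumptions, and the parameter values are arbitrary; note that PATCH 2 below
corrects the chunk merging inside `do_run`.]

    from aleph_alpha_client import Client

    from intelligence_layer.core import InMemoryTracer
    from intelligence_layer.core.chunk import ChunkInput, ChunkOverlapTask

    # Assumption: a valid Aleph Alpha API token is available.
    client = Client(token="AA_TOKEN")
    task = ChunkOverlapTask(
        client,
        model="luminous-base",
        max_tokens_per_chunk=128,
        overlap_length_tokens=16,
    )
    # Consecutive output chunks share their boundary tokens.
    output = task.run(ChunkInput(text="<some long text>"), InMemoryTracer())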
""" def __init__( diff --git a/tests/core/test_chunk.py b/tests/core/test_chunk.py new file mode 100644 index 000000000..e69de29bb From e9f8cb62af58e772e27657afc1ba6f770ca56d64 Mon Sep 17 00:00:00 2001 From: Felix Fehse Date: Mon, 5 Feb 2024 16:38:55 +0100 Subject: [PATCH 2/5] IL-239 added `test_chunk.py` --- src/intelligence_layer/core/chunk.py | 28 +++++++++++----- tests/core/test_chunk.py | 50 ++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 9 deletions(-) diff --git a/src/intelligence_layer/core/chunk.py b/src/intelligence_layer/core/chunk.py index 70e1a5538..22bf2646e 100644 --- a/src/intelligence_layer/core/chunk.py +++ b/src/intelligence_layer/core/chunk.py @@ -87,20 +87,30 @@ def __init__( client: AlephAlphaClientProtocol, model: str, max_tokens_per_chunk: int, - overlap_length_tokens: int + overlap_length_tokens: int, ): super().__init__() - self.chunk_task = ChunkTask(client, model, max_tokens_per_chunk-overlap_length_tokens) + if overlap_length_tokens >= max_tokens_per_chunk: + raise RuntimeError( + "Cannot choose an overlap ({}) longer than the chunk ({})".format( + overlap_length_tokens, max_tokens_per_chunk + ) + ) + self.chunk_task = ChunkTask( + client, model, max_tokens_per_chunk - overlap_length_tokens + ) self.tokenizer = client.tokenizer(model) self.overlap_length_tokens = overlap_length_tokens def do_run(self, input: ChunkInput, task_span: TaskSpan) -> ChunkOutput: chunks = self.chunk_task.run(input, task_span).chunks - token_chunks = self.tokenizer.encode_batch(chunks) - chunk_tokens = [token_chunks[0].tokens] + [ - token_chunks[i].tokens[-self.overlap_length_tokens:].append(token_chunks[i+1].tokens) - for i in range(len(token_chunks)-1) - ] - decoded_chunks = self.tokenizer.decode_batch(chunk_tokens) - return ChunkOutput(chunks=decoded_chunks) + id_chunks = self.tokenizer.encode_batch(chunks) + chunk_ids = [id_chunks[0].ids] + for i in range(len(id_chunks) - 1): + chunk_ids.append( + chunk_ids[i][-self.overlap_length_tokens :] + id_chunks[i + 1].ids + ) + + decoded_chunks = self.tokenizer.decode_batch(chunk_ids) + return ChunkOutput(chunks=decoded_chunks) diff --git a/tests/core/test_chunk.py b/tests/core/test_chunk.py index e69de29bb..04f278f24 100644 --- a/tests/core/test_chunk.py +++ b/tests/core/test_chunk.py @@ -0,0 +1,50 @@ +from pytest import fixture + +from intelligence_layer.connectors import AlephAlphaClientProtocol +from intelligence_layer.core import InMemoryTracer +from intelligence_layer.core.chunk import ChunkInput, ChunkOverlapTask + + +@fixture +def some_large_text() -> str: + return """ + The Williamsburgh Savings Bank Tower, also known as One Hanson Place, is a skyscraper in the Fort Greene neighborhood of Brooklyn in New York City. Located at the northeast corner of Ashland Place and Hanson Place near Downtown Brooklyn, the tower was designed by Halsey, McCormack & Helmer and constructed from 1927 to 1929 as the new headquarters for the Williamsburgh Savings Bank. At 41 stories and 512 feet (156 m) tall, the Williamsburgh Savings Bank Tower was the tallest building in Brooklyn until 2009. + + The Williamsburgh Savings Bank was originally headquartered in Williamsburg, Brooklyn; its officers decided to construct a new skyscraper headquarters near Downtown Brooklyn in the mid-1920s. The bank occupied the lowest floors when the building opened on April 1, 1929, while the remaining stories were rented as offices. By the late 20th century, dentists' offices occupied much of the structure. 
From fdffb7a94af0d3673fa37a0ad5980ff478b74d25 Mon Sep 17 00:00:00 2001
From: Felix Fehse
Date: Mon, 5 Feb 2024 17:13:00 +0100
Subject: [PATCH 3/5] IL-239 add `overlap_length_tokens` to `SteerableLongContextSummarize`

---
 .../steerable_long_context_summarize.py      | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/intelligence_layer/use_cases/summarize/steerable_long_context_summarize.py b/src/intelligence_layer/use_cases/summarize/steerable_long_context_summarize.py
index 8cba09b7f..c34c496ce 100644
--- a/src/intelligence_layer/use_cases/summarize/steerable_long_context_summarize.py
+++ b/src/intelligence_layer/use_cases/summarize/steerable_long_context_summarize.py
@@ -2,6 +2,7 @@
 
 from intelligence_layer.connectors import AlephAlphaClientProtocol
 from intelligence_layer.core import ChunkInput, ChunkTask, Task, TaskSpan
+from intelligence_layer.core.chunk import ChunkOutput, ChunkOverlapTask
 from intelligence_layer.core.detect_language import Language
 from intelligence_layer.use_cases.summarize.steerable_single_chunk_summarize import (
     SteerableSingleChunkSummarize,
@@ -43,6 +44,7 @@ def __init__(
         client: AlephAlphaClientProtocol,
         max_generated_tokens: int,
         max_tokens_per_chunk: int,
+        overlap_length_tokens: int = 0,
         model: str = "luminous-base-control",
         instruction_configs: Mapping[Language, str] = INSTRUCTION_CONFIGS,
     ) -> None:
@@ -50,9 +52,18 @@ def __init__(
         self._summarize = SteerableSingleChunkSummarize(
             client, model, max_generated_tokens, instruction_configs
         )
-        self._chunk_task = ChunkTask(
-            client, model=model, max_tokens_per_chunk=max_tokens_per_chunk
-        )
+        self._chunk_task: Task[ChunkInput, ChunkOutput]
+        if overlap_length_tokens == 0:
+            self._chunk_task = ChunkTask(
+                client, model=model, max_tokens_per_chunk=max_tokens_per_chunk
+            )
+        else:
+            self._chunk_task = ChunkOverlapTask(
+                client,
+                model=model,
+                max_tokens_per_chunk=max_tokens_per_chunk,
+                overlap_length_tokens=overlap_length_tokens,
+            )

     def do_run(
         self, input: LongContextSummarizeInput, task_span: TaskSpan
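[Illustration, not part of the patch series: how a caller opts into overlapping chunks
after this patch. The client construction, the token placeholder, and all numbers are
assumptions; with the default `overlap_length_tokens=0` the behaviour is unchanged.]

    from aleph_alpha_client import Client

    from intelligence_layer.core import NoOpTracer
    from intelligence_layer.use_cases import LongContextSummarizeInput
    from intelligence_layer.use_cases.summarize.steerable_long_context_summarize import (
        SteerableLongContextSummarize,
    )

    client = Client(token="AA_TOKEN")  # assumption: valid API token
    summarize = SteerableLongContextSummarize(
        client,
        max_generated_tokens=128,
        max_tokens_per_chunk=512,
        overlap_length_tokens=64,  # non-zero selects ChunkOverlapTask
    )
    output = summarize.run(
        LongContextSummarizeInput(text="<some long document>"), NoOpTracer()
    )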
From 128e8cdcb4b2fc6942b098a28d447dc7f7b127bc Mon Sep 17 00:00:00 2001
From: Felix Fehse
Date: Mon, 5 Feb 2024 18:20:10 +0100
Subject: [PATCH 4/5] IL-239 new chunking for overlap with half-overlap size

---
 src/intelligence_layer/core/chunk.py | 24 ++++++++++++++++++------
 tests/core/test_chunk.py             | 15 +++++++++++----
 2 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/src/intelligence_layer/core/chunk.py b/src/intelligence_layer/core/chunk.py
index 22bf2646e..0bdd6b566 100644
--- a/src/intelligence_layer/core/chunk.py
+++ b/src/intelligence_layer/core/chunk.py
@@ -97,9 +97,10 @@ def __init__(
             )
         )
         self.chunk_task = ChunkTask(
-            client, model, max_tokens_per_chunk - overlap_length_tokens
+            client, model, overlap_length_tokens // 2
         )
         self.tokenizer = client.tokenizer(model)
+        self.max_tokens_per_chunk = max_tokens_per_chunk
         self.overlap_length_tokens = overlap_length_tokens
@@ -107,10 +108,21 @@ def do_run(self, input: ChunkInput, task_span: TaskSpan) -> ChunkOutput:
         chunks = self.chunk_task.run(input, task_span).chunks
         id_chunks = self.tokenizer.encode_batch(chunks)
         chunk_ids = [id_chunks[0].ids]
-        for i in range(len(id_chunks) - 1):
-            chunk_ids.append(
-                chunk_ids[i][-self.overlap_length_tokens :] + id_chunks[i + 1].ids
-            )
-
+        current_chunk = chunk_ids[0]
+        last_overlap = [chunk_ids[0]]
+        for chunk in id_chunks[1:]:
+            if len(chunk.ids) + len(current_chunk) <= self.max_tokens_per_chunk:
+                current_chunk.extend(chunk.ids)
+            else:
+                current_chunk = sum(last_overlap, []) + chunk.ids
+                chunk_ids.append(current_chunk)
+
+            last_overlap.append(chunk.ids)
+            total_length = len(sum(last_overlap, []))
+            while total_length > self.overlap_length_tokens:
+                total_length -= len(last_overlap[0])
+                last_overlap = last_overlap[1:]
+
+        print(chunk_ids)
         decoded_chunks = self.tokenizer.decode_batch(chunk_ids)
         return ChunkOutput(chunks=decoded_chunks)
diff --git a/tests/core/test_chunk.py b/tests/core/test_chunk.py
index 04f278f24..b2c6ca1bc 100644
--- a/tests/core/test_chunk.py
+++ b/tests/core/test_chunk.py
@@ -18,8 +18,8 @@ def test_overlapped_chunking(
     client: AlephAlphaClientProtocol, some_large_text: str
 ) -> None:
     MODEL = "luminous-base"
-    OVERLAP = 4
-    MAX_TOKENS = 10
+    OVERLAP = 8
+    MAX_TOKENS = 16
 
     tracer = InMemoryTracer()
     task = ChunkOverlapTask(
@@ -34,17 +34,24 @@ def test_overlapped_chunking(
     output_tokenized = tokenizer.encode_batch(output.chunks)
     for chunk_index in range(len(output_tokenized) - 1):
         first = output_tokenized[chunk_index].tokens
+        print(first)
 
         assert (
             len(first) <= MAX_TOKENS + 2
-        ) # `+2` because re-tokenizing the chunk can add a few extra tokens at the beginning or end of each chunk. This is a hack.
+            # `+2` because re-tokenizing the chunk can add a few extra tokens at
+            # the beginning or end of each chunk. This is a hack.
+        )
         next = output_tokenized[chunk_index + 1].tokens
 
         found = False
-        for offset in range(OVERLAP):
+        for offset in range(len(next)-OVERLAP//2):
             if first[-OVERLAP // 2 :] != next[offset : offset + OVERLAP // 2]:
                 continue
             found = True
             break
 
+        if not found:
+            print("first = ", first)
+            print("next = ", next)
+
         assert found
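[Illustration, not part of the patch series: the new merge logic from `do_run` above,
mirrored on plain lists so it can be traced without a client or tokenizer. The helper
name and the toy ids are made up; the input is assumed to be pre-chunked into pieces
of roughly `overlap // 2` tokens, as the patched `__init__` arranges.]

    def merge_with_overlap(
        id_chunks: list[list[int]], max_tokens_per_chunk: int, overlap_length_tokens: int
    ) -> list[list[int]]:
        chunk_ids = [list(id_chunks[0])]
        current_chunk = chunk_ids[0]
        last_overlap = [chunk_ids[0]]
        for ids in id_chunks[1:]:
            if len(ids) + len(current_chunk) <= max_tokens_per_chunk:
                # Still room in the current chunk: grow it in place.
                current_chunk.extend(ids)
            else:
                # Start a new chunk, seeded with the retained overlap pieces.
                current_chunk = sum(last_overlap, []) + ids
                chunk_ids.append(current_chunk)
            # Slide the overlap window and trim it to the configured length.
            last_overlap.append(ids)
            total_length = len(sum(last_overlap, []))
            while total_length > overlap_length_tokens:
                total_length -= len(last_overlap[0])
                last_overlap = last_overlap[1:]
        return chunk_ids

    pieces = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
    print(merge_with_overlap(pieces, max_tokens_per_chunk=6, overlap_length_tokens=4))
    # -> [[1, 2, 3, 4, 5, 6], [3, 4, 5, 6, 7, 8], [5, 6, 7, 8, 9, 10]]
    # Every chunk stays within 6 tokens, and consecutive chunks share 4 tokens.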
) next = output_tokenized[chunk_index + 1].tokens found = False - for offset in range(len(next)-OVERLAP//2): + for offset in range(len(next) - OVERLAP // 2): if first[-OVERLAP // 2 :] != next[offset : offset + OVERLAP // 2]: continue found = True diff --git a/tests/use_cases/summarize/test_recursive_summarize.py b/tests/use_cases/summarize/test_recursive_summarize.py index b0e95df1e..b1fd7412f 100644 --- a/tests/use_cases/summarize/test_recursive_summarize.py +++ b/tests/use_cases/summarize/test_recursive_summarize.py @@ -4,12 +4,18 @@ from aleph_alpha_client import Client, CompletionRequest, CompletionResponse from pytest import fixture +from intelligence_layer.connectors.limited_concurrency_client import ( + AlephAlphaClientProtocol, +) from intelligence_layer.core import NoOpTracer from intelligence_layer.use_cases import ( LongContextHighCompressionSummarize, LongContextSummarizeInput, RecursiveSummarize, ) +from intelligence_layer.use_cases.summarize.steerable_long_context_summarize import ( + SteerableLongContextSummarize, +) class RecursiveCountingClient(Client): @@ -52,6 +58,20 @@ def test_recursive_summarize_stops_when_hitting_max_tokens( assert "new orleans" in output.summary.lower() +def test_recursive_summarize_stops_when_num_partial_summaries_stays_same( + client: AlephAlphaClientProtocol, +) -> None: + max_tokens = None + slcs = SteerableLongContextSummarize( + client, model="luminous-base", max_generated_tokens=75, max_tokens_per_chunk=145 + ) + input = LongContextSummarizeInput(text=short_text, max_tokens=max_tokens) + task = RecursiveSummarize(slcs) + output = task.run(input, NoOpTracer()) + + assert output.generated_tokens > 145 + + def test_recursive_summarize_stops_after_one_chunk( recursive_counting_client: RecursiveCountingClient, ) -> None: