Summary evaluator #90

Merged
merged 8 commits on Nov 14, 2023
Changes from all commits
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -32,5 +32,5 @@ repos:
rev: v2.2.4
hooks:
- id: codespell
args: ["-L", "newyorker,te,responde,ist,als,oder,technik,sie"]
args: ["-L", "newyorker,te,responde,ist,als,oder,technik,sie,rouge"]
exclude: '^(poetry\.lock|trace-viewer/.*|tests/connectors/retrievers/test_document_index_retriever\.py|src/intelligence_layer/use_cases/qa/multiple_chunk_qa.py|src/intelligence_layer/use_cases/summarize/.*|tests/connectors/retrievers/test_document_index_retriever\.py|src/intelligence_layer/use_cases/classify/keyword_extract.py|tests/use_cases/summarize/test_single_chunk_few_shot_summarize.py)$'
184 changes: 162 additions & 22 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions pyproject.toml
@@ -27,6 +27,9 @@ jupyter = "^1.0.0"
requests = "^2.31.0"
pytest-xdist = "^3.3.1"
langdetect = "^1.0.9"
nltk = "^3.8.1"
pycountry = "^22.3.5"
rouge = "^1.0.1"

[tool.poetry.group.dev.dependencies]
mypy = "^1.6.1"
2 changes: 1 addition & 1 deletion src/intelligence_layer/core/__init__.py
@@ -14,7 +14,7 @@
Language,
)
from .echo import EchoInput, EchoOutput, EchoTask
from .evaluator import Dataset, Evaluation, Example
from .evaluator import AggregatedEvaluation, Dataset, Evaluation, Evaluator, Example
from .explain import Explain, ExplainInput, ExplainOutput
from .prompt_template import (
Cursor,
17 changes: 13 additions & 4 deletions src/intelligence_layer/core/detect_language.py
@@ -1,6 +1,8 @@
from typing import Mapping, NewType, Optional, Sequence, TypeVar
from dataclasses import dataclass
from typing import Mapping, Optional, Sequence, TypeVar

from langdetect import detect_langs # type: ignore
from pycountry import languages # type: ignore
from pydantic import BaseModel

from intelligence_layer.core.task import Task
@@ -11,8 +13,15 @@ class LanguageNotSupportedError(ValueError):
"""Raised in case language in the input is not compatible with the languages supported in the task"""


Language = NewType("Language", str)
"""A language identified by its `ISO 639-1 code <https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes>`_."""
@dataclass(frozen=True)
class Language:
"""A language identified by its `ISO 639-1 code <https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes>`_."""

iso_639_1: str

def get_name(self) -> Optional[str]:
language = languages.get(alpha_2=self.iso_639_1)
return language.name if language else None


Config = TypeVar("Config")
@@ -22,7 +31,7 @@ def language_config(language: Language, configs: Mapping[Language, Config]) -> C
config = configs.get(language)
if config is None:
raise LanguageNotSupportedError(
f"{language} not in ({', '.join(configs.keys())})"
f"{language.iso_639_1} not in ({', '.join(lang.iso_639_1 for lang in configs.keys())})"
)
return config

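To make the refactor above concrete, here is a minimal sketch of how the new frozen `Language` dataclass and the shared `language_config` helper fit together. The instruction strings and the `fr` lookup are made-up placeholders, not values taken from this PR:

```python
from intelligence_layer.core.detect_language import (
    Language,
    LanguageNotSupportedError,
    language_config,
)

# Made-up per-language prompt snippets; the real configs live inside the tasks.
instructions = {
    Language("en"): "Summarize the text.",
    Language("de"): "Fasse den Text zusammen.",
}

english = Language("en")
print(english.get_name())                      # "English", resolved via pycountry
print(language_config(english, instructions))  # "Summarize the text."

try:
    language_config(Language("fr"), instructions)
except LanguageNotSupportedError as error:
    print(error)                               # "fr not in (en, de)"
```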
85 changes: 81 additions & 4 deletions src/intelligence_layer/core/evaluator.py
@@ -1,14 +1,21 @@
from abc import ABC, abstractmethod
from concurrent.futures import ThreadPoolExecutor
from typing import Generic, Optional, Sequence, TypeVar
from dataclasses import dataclass
from typing import Generic, Mapping, Optional, Sequence, TypeVar
from uuid import uuid4

import nltk # type: ignore
from nltk.tokenize import RegexpTokenizer # type: ignore
from nltk.translate.bleu_score import sentence_bleu # type: ignore
from pydantic import BaseModel, Field
from rouge import Rouge # type: ignore
from tqdm import tqdm

from intelligence_layer.core.task import Input
from intelligence_layer.core.tracer import PydanticSerializable, Tracer

nltk.download("punkt")

ExpectedOutput = TypeVar("ExpectedOutput", bound=PydanticSerializable)
Evaluation = TypeVar("Evaluation", bound=PydanticSerializable)
AggregatedEvaluation = TypeVar("AggregatedEvaluation", bound=PydanticSerializable)
@@ -71,7 +78,7 @@ def evaluate(
tracer: Tracer used for tracing of tasks.
expected_output: Output that is expected from the task run with the supplied input.
Returns:
Evaluation: interface of the metrics that come from the evaluated task.
Interface of the metrics that come from the evaluated task.
"""
pass

@@ -87,7 +94,7 @@ def evaluate_dataset(
dataset: Dataset that will be used to evaluate a task.
tracer: tracer used for tracing.
Returns:
AggregatedEvaluation: The aggregated results of an evaluation run with a dataset.
The aggregated results of an evaluation run with a dataset.
"""
with ThreadPoolExecutor(max_workers=10) as executor:
evaluations = list(
@@ -116,6 +123,76 @@ def aggregate(self, evaluations: Sequence[Evaluation]) -> AggregatedEvaluation:
Args:
evaluations: The results from running `evaluate_dataset` with a task.
Returns:
AggregatedEvaluation: The aggregated results of an evaluation run with a dataset.
The aggregated results of an evaluation run with a dataset.
"""
pass


def tokenize(input: str) -> Sequence[str]:
"""Splits a string into a list of words.

Removes non-alphanumeric characters and lowercases the given text.

Args:
input: String to split.
Returns:
List of words.
"""
tokenizer = RegexpTokenizer(r"\w+")
tokens = tokenizer.tokenize(input.lower())
assert isinstance(tokens, list)
return tokens


def calculate_bleu(hypothesis: str, reference: str) -> float:
"""Calculates the BLEU-score for the given hypothesis and reference.

In the summarization use-case the BLEU-score roughly corresponds to the precision of the generated summary with regard to the expected summary.

Args:
hypothesis: The generation to be evaluated.
reference: The baseline for the evaluation.

Returns:
BLEU-score, a float between 0 and 1, where 1 means a perfect match and 0 means no overlap.
"""
hypothesis_tokens = tokenize(hypothesis)
reference_tokens = tokenize(reference)
bleu_score = sentence_bleu(
references=[reference_tokens], hypothesis=hypothesis_tokens
)
return bleu_score if isinstance(bleu_score, float) else 0.0


@dataclass
class RougeScores:
precision: float
recall: float
f1: float

@classmethod
def from_rouge_results(cls, rouge_results: Mapping[str, float]) -> "RougeScores":
return cls(
precision=rouge_results["p"],
recall=rouge_results["r"],
f1=rouge_results["f"],
)


def calculate_rouge(hypothesis: str, reference: str) -> RougeScores:
"""Calculates the ROUGE-score for the hypothesis and reference.

In the summarization use-case the ROUGE-score roughly corresponds to the recall of the generated summary with regard to the expected summary.

Args:
hypothesis: The generation to be evaluated.
reference: The baseline for the evaluation.

Returns:
ROUGE-score, which contains precision, recall and f1 metrics, all floats between 0 and 1, where 1 means a perfect match and 0 means no overlap.
"""
hypothesis = " ".join(tokenize(hypothesis))
reference = " ".join(tokenize(reference))
rouge = Rouge()
rouge_scores = rouge.get_scores(hypothesis, reference)[0]["rouge-2"]
return RougeScores.from_rouge_results(rouge_scores)
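As a rough, non-authoritative illustration of the metric helpers added here, the snippet below calls `tokenize`, `calculate_bleu` and `calculate_rouge` directly on two made-up strings. Exact scores depend on the nltk and rouge versions and on the very short inputs, so treat the values as indicative only:

```python
from intelligence_layer.core.evaluator import calculate_bleu, calculate_rouge, tokenize

hypothesis = "The cat sat on the mat."
reference = "A cat was sitting on the mat."

print(tokenize(hypothesis))      # ['the', 'cat', 'sat', 'on', 'the', 'mat']

bleu = calculate_bleu(hypothesis, reference)    # precision-flavoured, 0.0 .. 1.0
rouge = calculate_rouge(hypothesis, reference)  # RougeScores(precision=..., recall=..., f1=...)
print(bleu, rouge.recall)
```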
8 changes: 2 additions & 6 deletions src/intelligence_layer/use_cases/classify/keyword_extract.py
@@ -10,7 +10,7 @@
FewShotExample,
FewShotInput,
)
from intelligence_layer.core.detect_language import Language, LanguageNotSupportedError
from intelligence_layer.core.detect_language import Language, language_config
from intelligence_layer.core.task import Task
from intelligence_layer.core.tracer import TaskSpan

@@ -158,11 +158,7 @@ def __init__(
def do_run(
self, input: KeywordExtractInput, task_span: TaskSpan
) -> KeywordExtractOutput:
config = self._few_shot_configs.get(input.language)
if config is None:
raise LanguageNotSupportedError(
f"{input.language} not in ({', '.join(self._few_shot_configs.keys())})"
)
config = language_config(input.language, self._few_shot_configs)
result = self._few_shot.run(
FewShotInput(
few_shot_config=config,
8 changes: 2 additions & 6 deletions src/intelligence_layer/use_cases/qa/single_chunk_qa.py
@@ -6,7 +6,7 @@

from intelligence_layer.core.chunk import Chunk
from intelligence_layer.core.complete import Instruct, InstructInput, PromptOutput
from intelligence_layer.core.detect_language import Language, LanguageNotSupportedError
from intelligence_layer.core.detect_language import Language, language_config
from intelligence_layer.core.prompt_template import PromptWithMetadata
from intelligence_layer.core.task import Task
from intelligence_layer.core.text_highlight import TextHighlight, TextHighlightInput
@@ -109,11 +109,7 @@ def __init__(
def do_run(
self, input: SingleChunkQaInput, task_span: TaskSpan
) -> SingleChunkQaOutput:
instruction_text = self._instruction_config.get(input.language)
if not instruction_text:
raise LanguageNotSupportedError(
f"{input.language} not in ({', '.join(self._instruction_config.keys())})"
)
instruction_text = language_config(input.language, self._instruction_config)

output = self._generate_answer(
Template(instruction_text).render(
119 changes: 118 additions & 1 deletion src/intelligence_layer/use_cases/summarize/summarize.py
@@ -1,9 +1,13 @@
from typing import Sequence
from statistics import mean
from typing import Sequence, Union

from pydantic import BaseModel

from intelligence_layer.core.chunk import Chunk
from intelligence_layer.core.detect_language import Language
from intelligence_layer.core.evaluator import Evaluator, calculate_bleu, calculate_rouge
from intelligence_layer.core.task import Task
from intelligence_layer.core.tracer import Tracer


class LongContextSummarizeInput(BaseModel):
@@ -53,3 +57,116 @@ class SingleChunkSummarizeOutput(BaseModel):
"""

summary: str


class SummarizeEvaluation(BaseModel):
"""The evaluation of a summarization run.

Attributes:
bleu: roughly corresponds to precision
rouge: roughly corresponds to recall
output: The actual output from the task run
"""

bleu: float
rouge: float
output: Union[SingleChunkSummarizeOutput, LongContextSummarizeOutput]


class AggregatedSummarizeEvaluation(BaseModel):
"""The aggregated evaluation of a summarization implementation against a dataset.

Attributes:
aggregate_bleu: average over BLEU-scores
aggregate_rouge: average over ROUGE-scores
evaluations: The actual evaluations
"""

aggregate_bleu: float
aggregate_rouge: float
evaluations: Sequence[SummarizeEvaluation]


class SingleChunkSummarizeEvaluator(
Evaluator[
SingleChunkSummarizeInput,
str,
SummarizeEvaluation,
AggregatedSummarizeEvaluation,
]
):
def __init__(
self, task: Task[SingleChunkSummarizeInput, SingleChunkSummarizeOutput]
) -> None:
self.task = task

def evaluate(
self,
input: SingleChunkSummarizeInput,
tracer: Tracer,
expected_output: str,
) -> SummarizeEvaluation:
summary = self.task.run(input, tracer)
bleu_score = calculate_bleu(summary.summary, expected_output)
rouge_score = calculate_rouge(summary.summary, expected_output)

return SummarizeEvaluation(
bleu=bleu_score, rouge=rouge_score.recall, output=summary
)

def aggregate(
self, evaluations: Sequence[SummarizeEvaluation]
) -> AggregatedSummarizeEvaluation:
if len(evaluations) != 0:
bleu_avg = mean(eval.bleu for eval in evaluations)
rouge_avg = mean(eval.rouge for eval in evaluations)
else:
bleu_avg = 0.0
rouge_avg = 0.0
return AggregatedSummarizeEvaluation(
aggregate_bleu=bleu_avg, aggregate_rouge=rouge_avg, evaluations=evaluations
)


class LongContextSummarizeEvaluator(
Evaluator[
LongContextSummarizeInput,
str,
SummarizeEvaluation,
AggregatedSummarizeEvaluation,
]
):
def __init__(
self, task: Task[LongContextSummarizeInput, LongContextSummarizeOutput]
) -> None:
self.task = task

def evaluate(
self,
input: LongContextSummarizeInput,
tracer: Tracer,
expected_output: str,
) -> SummarizeEvaluation:
output = self.task.run(input, tracer)
joint_summary = " ".join(
partial_summary.summary for partial_summary in output.partial_summaries
)
bleu_score = calculate_bleu(joint_summary, expected_output)
rouge_score = calculate_rouge(joint_summary, expected_output)

return SummarizeEvaluation(
bleu=bleu_score, rouge=rouge_score.recall, output=output
)

def aggregate(
self, evaluations: Sequence[SummarizeEvaluation]
) -> AggregatedSummarizeEvaluation:
if len(evaluations) != 0:
bleu_avg = mean(eval.bleu for eval in evaluations)
rouge_avg = mean(eval.rouge for eval in evaluations)
else:
bleu_avg = 0.0
rouge_avg = 0.0
return AggregatedSummarizeEvaluation(
aggregate_bleu=bleu_avg, aggregate_rouge=rouge_avg, evaluations=evaluations
)
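For the new summarize evaluators, here is a hedged sketch of how the aggregation step is expected to behave on hand-built `SummarizeEvaluation` objects. The import path follows the file layout in this PR, and `_DummyTask` is a stand-in for a real `Task[SingleChunkSummarizeInput, SingleChunkSummarizeOutput]`, not something this PR provides:

```python
from statistics import mean

from intelligence_layer.use_cases.summarize.summarize import (
    SingleChunkSummarizeEvaluator,
    SingleChunkSummarizeOutput,
    SummarizeEvaluation,
)


class _DummyTask:
    """Placeholder task; only needed because the evaluator stores a task on construction."""

    def run(self, input, tracer):  # matches the run(...) shape used in evaluate()
        return SingleChunkSummarizeOutput(summary="dummy summary")


evaluations = [
    SummarizeEvaluation(bleu=0.4, rouge=0.5, output=SingleChunkSummarizeOutput(summary="first")),
    SummarizeEvaluation(bleu=0.6, rouge=0.7, output=SingleChunkSummarizeOutput(summary="second")),
]

evaluator = SingleChunkSummarizeEvaluator(_DummyTask())  # type: ignore[arg-type]
aggregated = evaluator.aggregate(evaluations)

# aggregate() averages BLEU and ROUGE over all evaluations and falls back to 0.0 for an empty list.
assert aggregated.aggregate_bleu == mean([0.4, 0.6])
assert aggregated.aggregate_rouge == mean([0.5, 0.7])
```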
2 changes: 1 addition & 1 deletion tests/core/test_detect_language.py
@@ -16,4 +16,4 @@ def test_detect_language_returns_correct_language() -> None:
tracer = NoOpTracer()
output = task.run(input, tracer)

assert output.best_fit == "en"
assert output.best_fit == Language("en")