Summary evaluator #90

Merged
merged 8 commits on Nov 14, 2023
Changes from all commits
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -32,5 +32,5 @@ repos:
rev: v2.2.4
hooks:
- id: codespell
args: ["-L", "newyorker,te,responde,ist,als,oder,technik,sie"]
args: ["-L", "newyorker,te,responde,ist,als,oder,technik,sie,rouge"]
exclude: '^(poetry\.lock|trace-viewer/.*|tests/connectors/retrievers/test_document_index_retriever\.py|src/intelligence_layer/use_cases/qa/multiple_chunk_qa.py|src/intelligence_layer/use_cases/summarize/.*|tests/connectors/retrievers/test_document_index_retriever\.py|src/intelligence_layer/use_cases/classify/keyword_extract.py|tests/use_cases/summarize/test_single_chunk_few_shot_summarize.py)$'
184 changes: 162 additions & 22 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions pyproject.toml
@@ -27,6 +27,9 @@ jupyter = "^1.0.0"
requests = "^2.31.0"
pytest-xdist = "^3.3.1"
langdetect = "^1.0.9"
nltk = "^3.8.1"
pycountry = "^22.3.5"
rouge = "^1.0.1"

[tool.poetry.group.dev.dependencies]
mypy = "^1.6.1"
2 changes: 1 addition & 1 deletion src/intelligence_layer/core/__init__.py
@@ -14,7 +14,7 @@
Language,
)
from .echo import EchoInput, EchoOutput, EchoTask
from .evaluator import Dataset, Evaluation, Example
from .evaluator import AggregatedEvaluation, Dataset, Evaluation, Evaluator, Example
from .explain import Explain, ExplainInput, ExplainOutput
from .prompt_template import (
Cursor,
17 changes: 13 additions & 4 deletions src/intelligence_layer/core/detect_language.py
@@ -1,6 +1,8 @@
from typing import Mapping, NewType, Optional, Sequence, TypeVar
from dataclasses import dataclass
from typing import Mapping, Optional, Sequence, TypeVar

from langdetect import detect_langs # type: ignore
from pycountry import languages # type: ignore
from pydantic import BaseModel

from intelligence_layer.core.task import Task
@@ -11,8 +13,15 @@ class LanguageNotSupportedError(ValueError):
"""Raised in case language in the input is not compatible with the languages supported in the task"""


Language = NewType("Language", str)
"""A language identified by its `ISO 639-1 code <https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes>`_."""
@dataclass(frozen=True)
class Language:
"""A language identified by its `ISO 639-1 code <https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes>`_."""

iso_639_1: str

def get_name(self) -> Optional[str]:
language = languages.get(alpha_2=self.iso_639_1)
return language.name if language else None


Config = TypeVar("Config")
@@ -22,7 +31,7 @@ def language_config(language: Language, configs: Mapping[Language, Config]) -> C
config = configs.get(language)
if config is None:
raise LanguageNotSupportedError(
f"{language} not in ({', '.join(configs.keys())})"
f"{language.iso_639_1} not in ({', '.join(lang.iso_639_1 for lang in configs.keys())})"
)
return config

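To make the refactor above concrete, here is a minimal sketch of how the new frozen `Language` dataclass and the shared `language_config` helper fit together. The instruction strings and the `fr` lookup are made-up placeholders, not values taken from this PR:

```python
from intelligence_layer.core.detect_language import (
    Language,
    LanguageNotSupportedError,
    language_config,
)

# Made-up per-language prompt snippets; the real configs live inside the tasks.
instructions = {
    Language("en"): "Summarize the text.",
    Language("de"): "Fasse den Text zusammen.",
}

english = Language("en")
print(english.get_name())                      # "English", resolved via pycountry
print(language_config(english, instructions))  # "Summarize the text."

try:
    language_config(Language("fr"), instructions)
except LanguageNotSupportedError as error:
    print(error)                               # "fr not in (en, de)"
```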
85 changes: 81 additions & 4 deletions src/intelligence_layer/core/evaluator.py
@@ -1,14 +1,21 @@
from abc import ABC, abstractmethod
from concurrent.futures import ThreadPoolExecutor
from typing import Generic, Optional, Sequence, TypeVar
from dataclasses import dataclass
from typing import Generic, Mapping, Optional, Sequence, TypeVar
from uuid import uuid4

import nltk # type: ignore
from nltk.tokenize import RegexpTokenizer # type: ignore
from nltk.translate.bleu_score import sentence_bleu # type: ignore
from pydantic import BaseModel, Field
from rouge import Rouge # type: ignore
from tqdm import tqdm

from intelligence_layer.core.task import Input
from intelligence_layer.core.tracer import PydanticSerializable, Tracer

nltk.download("punkt")

ExpectedOutput = TypeVar("ExpectedOutput", bound=PydanticSerializable)
Evaluation = TypeVar("Evaluation", bound=PydanticSerializable)
AggregatedEvaluation = TypeVar("AggregatedEvaluation", bound=PydanticSerializable)
@@ -71,7 +78,7 @@ def evaluate(
tracer: Tracer used for tracing of tasks.
expected_output: Output that is expected from the task run with the supplied input.
Returns:
Evaluation: interface of the metrics that come from the evaluated task.
Interface of the metrics that come from the evaluated task.
"""
pass

@@ -87,7 +94,7 @@ def evaluate_dataset(
dataset: Dataset that will be used to evaluate a task.
tracer: tracer used for tracing.
Returns:
AggregatedEvaluation: The aggregated results of an evaluation run with a dataset.
The aggregated results of an evaluation run with a dataset.
"""
with ThreadPoolExecutor(max_workers=10) as executor:
evaluations = list(
@@ -116,6 +123,76 @@ def aggregate(self, evaluations: Sequence[Evaluation]) -> AggregatedEvaluation:
Args:
evaluations: The results from running `evaluate_dataset` with a task.
Returns:
AggregatedEvaluation: The aggregated results of an evaluation run with a dataset.
The aggregated results of an evaluation run with a dataset.
"""
pass


def tokenize(input: str) -> Sequence[str]:
"""Splits a string into a list of words.

Removes non-alphanumeric characters and lowercases the given text.

Args:
input: String to split.
Returns:
List of words.
"""
tokenizer = RegexpTokenizer(r"\w+")
tokens = tokenizer.tokenize(input.lower())
assert isinstance(tokens, list)
return tokens


def calculate_bleu(hypothesis: str, reference: str) -> float:
"""Calculates the BLEU-score for the given hypothesis and reference.

In the summarization use-case the BLEU-score roughly corresponds to the precision of the generated summary with regard to the expected summary.

Args:
hypothesis: The generation to be evaluated.
reference: The baseline for the evaluation.

Returns:
BLEU-score, a float between 0 and 1, where 1 means a perfect match and 0 means no overlap.
"""
hypothesis_tokens = tokenize(hypothesis)
reference_tokens = tokenize(reference)
bleu_score = sentence_bleu(
references=[reference_tokens], hypothesis=hypothesis_tokens
)
return bleu_score if isinstance(bleu_score, float) else 0.0


@dataclass
class RougeScores:
precision: float
recall: float
f1: float

@classmethod
def from_rouge_results(cls, rouge_results: Mapping[str, float]) -> "RougeScores":
return cls(
precision=rouge_results["p"],
recall=rouge_results["r"],
f1=rouge_results["f"],
)


def calculate_rouge(hypothesis: str, reference: str) -> RougeScores:
"""Calculates the ROUGE-score for the hypothesis and reference.

In the summarization use-case the ROUGE-score roughly corresponds to the recall of the generated summary with regard to the expected summary.

Args:
hypothesis: The generation to be evaluated.
reference: The baseline for the evaluation.

Returns:
ROUGE-score, which contains precision, recall and f1 metrics, all floats between 0 and 1, where 1 means a perfect match and 0 means no overlap.
"""
hypothesis = " ".join(tokenize(hypothesis))
reference = " ".join(tokenize(reference))
rouge = Rouge()
rouge_scores = rouge.get_scores(hypothesis, reference)[0]["rouge-2"]
return RougeScores.from_rouge_results(rouge_scores)
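As a rough, non-authoritative illustration of the metric helpers added here, the snippet below calls `tokenize`, `calculate_bleu` and `calculate_rouge` directly on two made-up strings. Exact scores depend on the nltk and rouge versions and on the very short inputs, so treat the values as indicative only:

```python
from intelligence_layer.core.evaluator import calculate_bleu, calculate_rouge, tokenize

hypothesis = "The cat sat on the mat."
reference = "A cat was sitting on the mat."

print(tokenize(hypothesis))      # ['the', 'cat', 'sat', 'on', 'the', 'mat']

bleu = calculate_bleu(hypothesis, reference)    # precision-flavoured, 0.0 .. 1.0
rouge = calculate_rouge(hypothesis, reference)  # RougeScores(precision=..., recall=..., f1=...)
print(bleu, rouge.recall)
```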
8 changes: 2 additions & 6 deletions src/intelligence_layer/use_cases/classify/keyword_extract.py
@@ -10,7 +10,7 @@
FewShotExample,
FewShotInput,
)
from intelligence_layer.core.detect_language import Language, LanguageNotSupportedError
from intelligence_layer.core.detect_language import Language, language_config
from intelligence_layer.core.task import Task
from intelligence_layer.core.tracer import TaskSpan

@@ -158,11 +158,7 @@ def __init__(
def do_run(
self, input: KeywordExtractInput, task_span: TaskSpan
) -> KeywordExtractOutput:
config = self._few_shot_configs.get(input.language)
if config is None:
raise LanguageNotSupportedError(
f"{input.language} not in ({', '.join(self._few_shot_configs.keys())})"
)
config = language_config(input.language, self._few_shot_configs)
result = self._few_shot.run(
FewShotInput(
few_shot_config=config,
8 changes: 2 additions & 6 deletions src/intelligence_layer/use_cases/qa/single_chunk_qa.py
@@ -6,7 +6,7 @@

from intelligence_layer.core.chunk import Chunk
from intelligence_layer.core.complete import Instruct, InstructInput, PromptOutput
from intelligence_layer.core.detect_language import Language, LanguageNotSupportedError
from intelligence_layer.core.detect_language import Language, language_config
from intelligence_layer.core.prompt_template import PromptWithMetadata
from intelligence_layer.core.task import Task
from intelligence_layer.core.text_highlight import TextHighlight, TextHighlightInput
@@ -109,11 +109,7 @@ def __init__(
def do_run(
self, input: SingleChunkQaInput, task_span: TaskSpan
) -> SingleChunkQaOutput:
instruction_text = self._instruction_config.get(input.language)
if not instruction_text:
raise LanguageNotSupportedError(
f"{input.language} not in ({', '.join(self._instruction_config.keys())})"
)
instruction_text = language_config(input.language, self._instruction_config)

output = self._generate_answer(
Template(instruction_text).render(
119 changes: 118 additions & 1 deletion src/intelligence_layer/use_cases/summarize/summarize.py
@@ -1,9 +1,13 @@
from typing import Sequence
from statistics import mean
from typing import Sequence, Union

from pydantic import BaseModel

from intelligence_layer.core.chunk import Chunk
from intelligence_layer.core.detect_language import Language
from intelligence_layer.core.evaluator import Evaluator, calculate_bleu, calculate_rouge
from intelligence_layer.core.task import Task
from intelligence_layer.core.tracer import Tracer


class LongContextSummarizeInput(BaseModel):
@@ -53,3 +57,116 @@ class SingleChunkSummarizeOutput(BaseModel):
"""

summary: str


class SummarizeEvaluation(BaseModel):
"""The evaluation of a summarization run.

Attributes:
bleu: roughly corresponds to precision
rouge: roughly corresponds to recall
output: The actual output from the task run
"""

bleu: float
rouge: float
output: Union[SingleChunkSummarizeOutput, LongContextSummarizeOutput]


class AggregatedSummarizeEvaluation(BaseModel):
"""The aggregated evaluation of a summarization implementation against a dataset.

Attributes:
aggregate_bleu: average over BLEU-scores
aggregate_rouge: average over ROUGE-scores
evaluations: The actual evaluations
"""

aggregate_bleu: float
aggregate_rouge: float
evaluations: Sequence[SummarizeEvaluation]


class SingleChunkSummarizeEvaluator(
Evaluator[
SingleChunkSummarizeInput,
str,
SummarizeEvaluation,
AggregatedSummarizeEvaluation,
]
):
def __init__(
self, task: Task[SingleChunkSummarizeInput, SingleChunkSummarizeOutput]
) -> None:
self.task = task

def evaluate(
self,
input: SingleChunkSummarizeInput,
tracer: Tracer,
expected_output: str,
) -> SummarizeEvaluation:
summary = self.task.run(input, tracer)
bleu_score = calculate_bleu(summary.summary, expected_output)
rouge_score = calculate_rouge(summary.summary, expected_output)

return SummarizeEvaluation(
bleu=bleu_score, rouge=rouge_score.recall, output=summary
)

def aggregate(
self, evaluations: Sequence[SummarizeEvaluation]
) -> AggregatedSummarizeEvaluation:
if len(evaluations) != 0:
bleu_avg = mean(eval.bleu for eval in evaluations)
rouge_avg = mean(eval.rouge for eval in evaluations)
else:
bleu_avg = 0.0
rouge_avg = 0.0
return AggregatedSummarizeEvaluation(
aggregate_bleu=bleu_avg, aggregate_rouge=rouge_avg, evaluations=evaluations
)


class LongContextSummarizeEvaluator(
Evaluator[
LongContextSummarizeInput,
str,
SummarizeEvaluation,
AggregatedSummarizeEvaluation,
]
):
def __init__(
self, task: Task[LongContextSummarizeInput, LongContextSummarizeOutput]
) -> None:
self.task = task

def evaluate(
self,
input: LongContextSummarizeInput,
tracer: Tracer,
expected_output: str,
) -> SummarizeEvaluation:
output = self.task.run(input, tracer)
joint_summary = " ".join(
partial_summary.summary for partial_summary in output.partial_summaries
)
bleu_score = calculate_bleu(joint_summary, expected_output)
rouge_score = calculate_rouge(joint_summary, expected_output)

return SummarizeEvaluation(
bleu=bleu_score, rouge=rouge_score.recall, output=output
)

def aggregate(
self, evaluations: Sequence[SummarizeEvaluation]
) -> AggregatedSummarizeEvaluation:
if len(evaluations) != 0:
bleu_avg = mean(eval.bleu for eval in evaluations)
rouge_avg = mean(eval.rouge for eval in evaluations)
else:
bleu_avg = 0.0
rouge_avg = 0.0
return AggregatedSummarizeEvaluation(
aggregate_bleu=bleu_avg, aggregate_rouge=rouge_avg, evaluations=evaluations
)
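For the new summarize evaluators, here is a hedged sketch of how the aggregation step is expected to behave on hand-built `SummarizeEvaluation` objects. The import path follows the file layout in this PR, and `_DummyTask` is a stand-in for a real `Task[SingleChunkSummarizeInput, SingleChunkSummarizeOutput]`, not something this PR provides:

```python
from statistics import mean

from intelligence_layer.use_cases.summarize.summarize import (
    SingleChunkSummarizeEvaluator,
    SingleChunkSummarizeOutput,
    SummarizeEvaluation,
)


class _DummyTask:
    """Placeholder task; only needed because the evaluator stores a task on construction."""

    def run(self, input, tracer):  # matches the run(...) shape used in evaluate()
        return SingleChunkSummarizeOutput(summary="dummy summary")


evaluations = [
    SummarizeEvaluation(bleu=0.4, rouge=0.5, output=SingleChunkSummarizeOutput(summary="first")),
    SummarizeEvaluation(bleu=0.6, rouge=0.7, output=SingleChunkSummarizeOutput(summary="second")),
]

evaluator = SingleChunkSummarizeEvaluator(_DummyTask())  # type: ignore[arg-type]
aggregated = evaluator.aggregate(evaluations)

# aggregate() averages BLEU and ROUGE over all evaluations and falls back to 0.0 for an empty list.
assert aggregated.aggregate_bleu == mean([0.4, 0.6])
assert aggregated.aggregate_rouge == mean([0.5, 0.7])
```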
2 changes: 1 addition & 1 deletion tests/core/test_detect_language.py
@@ -16,4 +16,4 @@ def test_detect_language_returns_correct_language() -> None:
tracer = NoOpTracer()
output = task.run(input, tracer)

assert output.best_fit == "en"
assert output.best_fit == Language("en")