More RAG and Summarization Metrics #701

Merged: 44 commits, Aug 29, 2024

Changes from all commits

Commits (44)
523bf5b
change context to contexts where appropriate, minor improvements to t…
Jul 30, 2024
b1c5794
add unit tests for Contexts
Jul 30, 2024
244a171
use openai and mistral client names in integration tests to improve c…
Jul 30, 2024
24e4a50
change contexts to context_list in most places, standardization of fo…
Jul 30, 2024
7aa1151
reorganize external tests
Jul 30, 2024
d0bd4bc
add bias to external integration tests
Jul 31, 2024
e6f373f
add context relevance and faithfulness to external integration tests
Jul 31, 2024
a845b2b
add hallucination to external integration tests
Jul 31, 2024
f833458
add toxicity to external integration tests
Jul 31, 2024
74831aa
fix hallucination functional tests and external integration tests
Jul 31, 2024
6b1fc7d
slight improvement to hallucination external integration test
Jul 31, 2024
e36e355
Merge branch 'main' into improve_text_gen_instructions_and_tests
Jul 31, 2024
74a2c7a
minor cleanup and additions
Jul 31, 2024
37e7821
create version of llm instructions with and without analysis
Aug 2, 2024
e2a702e
Merge branch 'main' into improve_text_gen_instructions_and_tests
Aug 2, 2024
b778059
grammar check context and contexts
Aug 2, 2024
b632ac4
more grammer context vs contexts
Aug 2, 2024
b2952d6
minor name changes and comment changes
Aug 3, 2024
be540b8
minor name changes and comment changes part 2
Aug 3, 2024
9b4dec0
move BadValueInTestLLMClientsError to test_llm_clients.py
Aug 5, 2024
5544aa1
docstrings for llm instructions
Aug 5, 2024
51412a5
Merge branch 'main' into improve_text_gen_instructions_and_tests
Aug 5, 2024
ceba785
rename migrations after merge with main
Aug 5, 2024
4b70278
reorganize _compute_text_generation_metrics to prepare for more RAG m…
Aug 5, 2024
d6907c6
add AnswerCorrectness with tests
Aug 6, 2024
241537c
merge with main
Aug 15, 2024
ae8164c
merge with main
Aug 16, 2024
3ff96de
add ContextPrecision with tests
Aug 16, 2024
0f0bff3
add ContextRecall with tests
Aug 21, 2024
1ee343e
Merge branch 'main' into more_rag_and_summarization_metrics
Aug 21, 2024
a15ad81
external tests for AnswerCorrectness, ContextPrecision, ContextRecall…
Aug 21, 2024
a3937a0
Merge branch 'main' into more_rag_and_summarization_metrics
Aug 22, 2024
7770c33
add AnswerCorrectness, ContextPrecision, ContextRecall and ContextRel…
Aug 22, 2024
dfc7552
rework coherence to be summarization specific
Aug 22, 2024
d300dd6
example notebook and docs updated for coherence
Aug 22, 2024
f6e9d21
Merge branch 'main' into more_rag_and_summarization_metrics
Aug 22, 2024
002f56d
review - update code coverage, docs
Aug 26, 2024
e04337e
Merge branch 'main' into more_rag_and_summarization_metrics
Aug 26, 2024
4abc884
Improve clarity of summarization docs
Aug 26, 2024
168a7ae
rename Coherence to SummaryCoherence, metric docs additions
Aug 27, 2024
ee8db6e
rename context_list to ordered_context_list for ContextPrecision
Aug 27, 2024
dca76ea
rework ContextPrecision to aggregate over ground truths differently
Aug 27, 2024
0c9f859
Merge branch 'main' into more_rag_and_summarization_metrics
Aug 27, 2024
bafb1c2
Merge branch 'main' into more_rag_and_summarization_metrics
Aug 29, 2024
632 changes: 519 additions & 113 deletions api/tests/functional-tests/backend/core/test_llm_clients.py

Large diffs are not rendered by default.
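
The test_llm_clients.py diff is collapsed here. As a rough illustration of the pattern such functional tests usually follow, the sketch below patches in a fake client that either returns a canned response or raises BadValueInTestLLMClientsError (the exception named in the commit log). Apart from that exception name, the class and method names are hypothetical placeholders, not the identifiers actually used in this file.

import pytest


class BadValueInTestLLMClientsError(Exception):
    """Stand-in definition; raised when the fake client is told to misbehave."""


class FakeLLMClient:
    """Hypothetical stand-in for a wrapped OpenAI/Mistral client used in tests."""

    def __init__(self, canned_response: str):
        self.canned_response = canned_response

    def generate(self, prompt: str) -> str:
        # Return the canned response, or raise if it is empty.
        if not self.canned_response:
            raise BadValueInTestLLMClientsError("empty canned response")
        return self.canned_response


def test_fake_client_returns_canned_response():
    client = FakeLLMClient(canned_response='{"answer_relevance": 0.9}')
    assert client.generate("rate this answer") == '{"answer_relevance": 0.9}'


def test_fake_client_raises_on_bad_value():
    client = FakeLLMClient(canned_response="")
    with pytest.raises(BadValueInTestLLMClientsError):
        client.generate("rate this answer")
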

424 changes: 368 additions & 56 deletions api/tests/functional-tests/backend/metrics/test_text_generation.py

Large diffs are not rendered by default.
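
This diff is also collapsed. As an assumed sketch only (not the actual contents of test_text_generation.py): the functional tests exercise the backend's metric computation, reorganized in _compute_text_generation_metrics per the commit log, against mocked LLM clients and check that each LLM-guided RAG metric comes back as a score in [0, 1]. The helper below is a placeholder for that computation.

def fake_compute_metric(metric_name: str) -> float:
    # Placeholder standing in for a score produced through a mocked LLM client.
    canned_scores = {
        "AnswerCorrectness": 0.5,
        "ContextPrecision": 0.75,
        "ContextRecall": 1.0,
        "Faithfulness": 0.8,
    }
    return canned_scores[metric_name]


def test_rag_metric_values_are_in_unit_interval():
    for name in ["AnswerCorrectness", "ContextPrecision", "ContextRecall", "Faithfulness"]:
        value = fake_compute_metric(name)
        assert 0.0 <= value <= 1.0, f"{name} out of range"
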

20 changes: 9 additions & 11 deletions api/tests/unit-tests/schemas/test_evaluation.py
@@ -59,10 +59,12 @@ def test_EvaluationParameters(llm_api_params):
schemas.EvaluationParameters(
task_type=enums.TaskType.TEXT_GENERATION,
metrics_to_return=[
MetricType.AnswerCorrectness,
MetricType.AnswerRelevance,
MetricType.Bias,
MetricType.BLEU,
MetricType.Coherence,
MetricType.ContextPrecision,
MetricType.ContextRecall,
MetricType.ContextRelevance,
MetricType.Faithfulness,
MetricType.Hallucination,
@@ -76,10 +78,12 @@ def test_EvaluationParameters(llm_api_params):
schemas.EvaluationParameters(
task_type=enums.TaskType.TEXT_GENERATION,
metrics_to_return=[
MetricType.AnswerCorrectness,
MetricType.AnswerRelevance,
MetricType.Bias,
MetricType.BLEU,
MetricType.Coherence,
MetricType.ContextPrecision,
MetricType.ContextRecall,
MetricType.ContextRelevance,
MetricType.Faithfulness,
MetricType.Hallucination,
@@ -167,15 +171,13 @@ def test_EvaluationParameters(llm_api_params):
)

# If any llm-guided metrics are requested, then llm_api_params must be provided.
# Purposely did a subset of metrics_to_return, to increase test variation.
with pytest.raises(ValidationError):
schemas.EvaluationParameters(
task_type=enums.TaskType.TEXT_GENERATION,
metrics_to_return=[
MetricType.AnswerRelevance,
MetricType.Bias,
MetricType.BLEU,
MetricType.Coherence,
MetricType.ContextRelevance,
MetricType.Faithfulness,
MetricType.Hallucination,
MetricType.ROUGE,
@@ -195,19 +197,15 @@ def test_EvaluationParameters(llm_api_params):
bleu_weights=[1.1, 0.3, -0.5, 0.1],
)

# BLEU weights must sum to 1.
# BLEU weights must sum to 1. metrics_to_return here are all metrics applicable to summarization.
with pytest.raises(ValidationError):
schemas.EvaluationParameters(
task_type=enums.TaskType.TEXT_GENERATION,
metrics_to_return=[
MetricType.AnswerRelevance,
MetricType.Bias,
MetricType.BLEU,
MetricType.Coherence,
MetricType.ContextRelevance,
MetricType.Faithfulness,
MetricType.Hallucination,
MetricType.ROUGE,
MetricType.SummaryCoherence,
MetricType.Toxicity,
],
llm_api_params=llm_api_params,
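
For contrast with the failure cases above, a configuration along the following lines should validate: the LLM-guided metrics are accompanied by llm_api_params, and the BLEU weights are non-negative and sum to 1. This is a sketch reusing the names shown in this diff; llm_api_params stands for the fixture whose contents are not shown here.

schemas.EvaluationParameters(
    task_type=enums.TaskType.TEXT_GENERATION,
    metrics_to_return=[
        MetricType.AnswerCorrectness,
        MetricType.BLEU,
        MetricType.ContextPrecision,
        MetricType.ContextRecall,
    ],
    llm_api_params=llm_api_params,  # required because LLM-guided metrics are requested
    bleu_weights=[0.25, 0.25, 0.25, 0.25],  # non-negative and sums to 1
)
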
160 changes: 146 additions & 14 deletions api/tests/unit-tests/schemas/test_metrics.py
@@ -434,6 +434,50 @@ def test_DetailedPrecisionRecallCurve():
}


def test_AnswerCorrectnessMetric():
metric = schemas.AnswerCorrectnessMetric(
value=0.52,
parameters={
"dataset_uid": "01",
"dataset_name": "test_dataset",
"prediction": "some prediction",
},
)

with pytest.raises(ValidationError):
schemas.AnswerCorrectnessMetric(
value=None, # type: ignore
parameters={
"dataset_uid": "01",
"dataset_name": "test_dataset",
"prediction": "some prediction",
},
)

with pytest.raises(ValidationError):
schemas.AnswerCorrectnessMetric(
value={"key": 0.3}, # type: ignore
parameters={
"dataset_uid": "01",
"dataset_name": "test_dataset",
"prediction": "some prediction",
},
)

with pytest.raises(ValidationError):
schemas.AnswerCorrectnessMetric(
value=0.0, # type: ignore
parameters="not a valid parameter", # type: ignore
)

assert all(
[
key in ["value", "type", "evaluation_id", "parameters"]
for key in metric.db_mapping(evaluation_id=1)
]
)


def test_AnswerRelevanceMetric():
metric = schemas.AnswerRelevanceMetric(
value=0.421,
@@ -581,49 +625,83 @@ def test_BLEUMetric():
)


def test_CoherenceMetric():
metric = schemas.CoherenceMetric(
value=3,
def test_ContextPrecisionMetric():
metric = schemas.ContextPrecisionMetric(
value=0.873,
parameters={
"dataset_uid": "01",
"dataset_name": "test_dataset",
"prediction": "some prediction",
"context_list": ["context1", "context2"],
},
)

with pytest.raises(ValidationError):
schemas.CoherenceMetric(
schemas.ContextPrecisionMetric(
value=None, # type: ignore
parameters={
"dataset_uid": "01",
"dataset_name": "test_dataset",
"prediction": "some prediction",
"context_list": ["context1", "context2"],
},
)

with pytest.raises(ValidationError):
schemas.CoherenceMetric(
value=2.5, # type: ignore
schemas.ContextPrecisionMetric(
value={"key": 0.222}, # type: ignore
parameters={
"dataset_uid": "01",
"dataset_name": "test_dataset",
"prediction": "some prediction",
"context_list": ["context1", "context2"],
},
)

with pytest.raises(ValidationError):
schemas.CoherenceMetric(
value={"key": 4}, # type: ignore
schemas.ContextPrecisionMetric(
value=0.501, # type: ignore
parameters="not a valid parameter", # type: ignore
)

assert all(
[
key in ["value", "type", "evaluation_id", "parameters"]
for key in metric.db_mapping(evaluation_id=1)
]
)


def test_ContextRecallMetric():
metric = schemas.ContextRecallMetric(
value=0.8,
parameters={
"dataset_uid": "01",
"dataset_name": "test_dataset",
"context_list": ["context1", "context2"],
},
)

with pytest.raises(ValidationError):
schemas.ContextRecallMetric(
value="value", # type: ignore
parameters={
"dataset_uid": "01",
"dataset_name": "test_dataset",
"prediction": "some prediction",
"context_list": ["context1", "context2"],
},
)

with pytest.raises(ValidationError):
schemas.CoherenceMetric(
value=5, # type: ignore
schemas.ContextRecallMetric(
value={"key": 0.5}, # type: ignore
parameters={
"dataset_uid": "01",
"dataset_name": "test_dataset",
"context_list": ["context1", "context2"],
},
)

with pytest.raises(ValidationError):
schemas.ContextRecallMetric(
value=0.6, # type: ignore
parameters="not a valid parameter", # type: ignore
)

@@ -838,6 +916,60 @@ def test_ROUGEMetric():
)


def test_SummaryCoherenceMetric():
metric = schemas.SummaryCoherenceMetric(
value=3,
parameters={
"dataset_uid": "01",
"dataset_name": "test_dataset",
"prediction": "some summary",
},
)

with pytest.raises(ValidationError):
schemas.SummaryCoherenceMetric(
value=None, # type: ignore
parameters={
"dataset_uid": "01",
"dataset_name": "test_dataset",
"prediction": "some summary",
},
)

with pytest.raises(ValidationError):
schemas.SummaryCoherenceMetric(
value=2.5, # type: ignore
parameters={
"dataset_uid": "01",
"dataset_name": "test_dataset",
"prediction": "some summary",
},
)

with pytest.raises(ValidationError):
schemas.SummaryCoherenceMetric(
value={"key": 4}, # type: ignore
parameters={
"dataset_uid": "01",
"dataset_name": "test_dataset",
"prediction": "some summary",
},
)

with pytest.raises(ValidationError):
schemas.SummaryCoherenceMetric(
value=5, # type: ignore
parameters="not a valid parameter", # type: ignore
)

assert all(
[
key in ["value", "type", "evaluation_id", "parameters"]
for key in metric.db_mapping(evaluation_id=1)
]
)


def test_ToxicityMetric():
metric = schemas.ToxicityMetric(
value=0.4,
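
Taken together, these schema tests pin down a common shape for the new metrics: a required numeric value (a float for the RAG metrics, an integer for SummaryCoherence), a parameters dict, and a db_mapping(evaluation_id) helper whose keys are limited to value, type, evaluation_id, and parameters. The pydantic sketch below illustrates that shape; it is an assumption for illustration, not the repository's actual schema code.

from pydantic import BaseModel, StrictFloat, StrictInt


class AnswerCorrectnessMetricSketch(BaseModel):
    value: StrictFloat  # 0.52 passes; None or a dict raises ValidationError
    parameters: dict    # a plain string such as "not a valid parameter" raises

    def db_mapping(self, evaluation_id: int) -> dict:
        # Only these four keys, matching the assertion in the tests above.
        return {
            "value": self.value,
            "type": "AnswerCorrectness",
            "evaluation_id": evaluation_id,
            "parameters": self.parameters,
        }


class SummaryCoherenceMetricSketch(BaseModel):
    value: StrictInt    # 3 passes; 2.5 raises ValidationError
    parameters: dict

Mirroring the tests, AnswerCorrectnessMetricSketch(value=0.52, parameters={"dataset_uid": "01", "dataset_name": "test_dataset", "prediction": "some prediction"}).db_mapping(evaluation_id=1) yields exactly those four keys.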