Incorrect Evaluation Metrics Produced #187

NikhielRahulSingh · 2024-11-01T11:49:34Z

def beir_evaluation():
    """Reproduce the report: query '0' with identical qrels and results.

    Both mappings assign the value 1 to documents '0' through '9' under
    query id '0'; the metrics at k=10 are then printed.
    """
    doc_ids = [str(n) for n in range(10)]

    # Two separate dict objects on purpose: ground truth and system output.
    actual_contexts_dict = {'0': {doc_id: 1 for doc_id in doc_ids}}
    results_dict = {'0': {doc_id: 1 for doc_id in doc_ids}}

    # Evaluate retrieval metrics
    ndcg, map_score, recall, precision = EvaluateRetrieval.evaluate(
        actual_contexts_dict, results_dict, k_values=[10]
    )
    mrr = EvaluateRetrieval.evaluate_custom(
        actual_contexts_dict, results_dict, [10], metric="mrr"
    )

    # Print evaluation results
    print(f"\nRecall@10   : {recall['Recall@10']:.2f}")
    print(f"Precision@10: {precision['P@10']:.2f}")
    print(f"\nNDCG@10     : {ndcg['NDCG@10']:.2f}")
    print(f"MAP@10      : {map_score['MAP@10']:.2f}")
    print(f"MRR@10      : {mrr['MRR@10']:.2f}")

Recall@10 : 0.90
Precision@10: 0.90

NDCG@10 : 0.94
MAP@10 : 0.90
MRR@10 : 1.00

This is clearly incorrect, as every metric should be 1.00. Can this please be resolved?

The text was updated successfully, but these errors were encountered:

NikhielRahulSingh · 2024-11-01T13:38:07Z

from beir.retrieval.evaluation import EvaluateRetrieval

def beir_evaluation():
    """Print retrieval metrics at k=10 for a two-query example.

    Query '0' covers documents '0'..'9' and query '1' covers '10'..'19'.
    In the ground truth every document's value is its own numeric id, so
    document '0' carries the value 0. In the results, every document of
    query '0' is scored 0, while query '1' mirrors the ground truth.

    NOTE(review): query '0' contains a document with the same id '0'; a
    later comment in this thread points at the ``ignore_identical_ids``
    parameter of ``EvaluateRetrieval.evaluate`` for this situation --
    presumably that is why P@10 is reported below 1.00; confirm against
    the BEIR source.
    """
    first_ids = range(10)
    second_ids = range(10, 20)

    actual_contexts_dict = {
        '0': {str(n): n for n in first_ids},
        '1': {str(n): n for n in second_ids},
    }
    results_dict = {
        '0': {str(n): 0 for n in first_ids},
        '1': {str(n): n for n in second_ids},
    }

    # Evaluate retrieval metrics
    ndcg, map_score, recall, precision = EvaluateRetrieval.evaluate(
        actual_contexts_dict, results_dict, k_values=[10]
    )
    mrr = EvaluateRetrieval.evaluate_custom(
        actual_contexts_dict, results_dict, [10], metric="mrr"
    )

    # Print evaluation results
    print(f"\nRecall@10   : {recall['Recall@10']:.2f}")
    print(f"Precision@10: {precision['P@10']:.2f}")
    print(f"\nNDCG@10     : {ndcg['NDCG@10']:.2f}")
    print(f"MAP@10      : {map_score['MAP@10']:.2f}")
    print(f"MRR@10      : {mrr['MRR@10']:.2f}")


# Run the evaluation
beir_evaluation()

Recall@10 : 1.00
Precision@10: 0.95

NDCG@10 : 1.00
MAP@10 : 1.00
MRR@10 : 1.00

Why is Precision@10 not 1.00?

shivareddy0117 · 2024-11-16T17:49:38Z

def beir_evaluation():
    """Evaluate query '0' with identical qrels and results at k=10.

    Same data as the original report, but ``evaluate`` is called with
    ``ignore_identical_ids=False``.
    """
    doc_ids = [str(n) for n in range(10)]

    # Two separate dict objects on purpose: ground truth and system output.
    actual_contexts_dict = {'0': {doc_id: 1 for doc_id in doc_ids}}
    results_dict = {'0': {doc_id: 1 for doc_id in doc_ids}}

    # Evaluate retrieval metrics.
    # Set ignore_identical_ids to False so that all documents are considered,
    # or use different ids for queries and documents.
    ndcg, map_score, recall, precision = EvaluateRetrieval.evaluate(
        actual_contexts_dict, results_dict, k_values=[10], ignore_identical_ids=False
    )
    mrr = EvaluateRetrieval.evaluate_custom(
        actual_contexts_dict, results_dict, [10], metric="mrr"
    )

    # Print evaluation results
    print(f"\nRecall@10   : {recall['Recall@10']:.2f}")
    print(f"Precision@10: {precision['P@10']:.2f}")
    print(f"\nNDCG@10     : {ndcg['NDCG@10']:.2f}")
    print(f"MAP@10      : {map_score['MAP@10']:.2f}")
    print(f"MRR@10      : {mrr['MRR@10']:.2f}")

Please make sure to set the `ignore_identical_ids` parameter to `False` so that all documents are considered, or use different IDs for queries and documents.

Please go through this code for a better understanding:
https://github.com/beir-cellar/beir/blob/main/beir/retrieval/evaluation.py

SighingSnow · 2024-12-24T01:37:59Z

BEIR uses pytrec_eval for evaluation. Maybe a solution can be found in the pytrec_eval issues.

NikhielRahulSingh · 2025-01-19T18:39:34Z

# Benchmark document ids, one list per query: 1-10, 11-20 and 21-30.
y = [list(range(first, first + 10)) for first in (1, 11, 21)]

# Retrieved document ids, one list per query; each row holds the same ids
# as the matching row of y, in a different order.
y_hat = [
    [8, 1, 4, 9, 7, 3, 5, 6, 10, 2],
    [15, 13, 18, 11, 20, 14, 17, 16, 19, 12],
    [27, 28, 23, 21, 29, 30, 22, 25, 26, 24],
]

evaluator = Beir_Evaluator(y)
evaluator.evaluate_retrieval(y_hat)

Output
Recall@10 : 1.00
Precision@10: 1.00
NDCG@10 : 1.00
MAP@10 : 1.00
MRR@10 : 1.00

from beir.retrieval.evaluation import EvaluateRetrieval

class Beir_Evaluator():
    """Score ranked retrieval output against benchmark lists using BEIR.

    ``benchmarks`` is a list with one entry per query; each entry is the
    list of document ids that are relevant for that query. The position of
    an entry in the outer list is used as the query id.
    """

    def __init__(self, benchmarks):
        self.benchmarks = benchmarks

    def _get_beir_format(self, results, ranked=False):
        """Convert per-query id lists into BEIR's nested-dict format.

        Args:
            results: Iterable with one iterable of document ids per query.
            ranked: When False (benchmark data) every listed document gets
                the integer relevance 1. When True (system output) each
                document gets a float score that decreases with its
                position, so the order of the list is the ranking that is
                evaluated.

        Returns:
            ``{query_id: {doc_id: value}}`` with string keys, where
            ``query_id`` is the position of the list in ``results``.
        """
        formatted = {}
        for query_index, doc_ids in enumerate(results):
            doc_ids = list(doc_ids)
            total = len(doc_ids)
            values = {}
            for rank, doc_id in enumerate(doc_ids):
                # The value must not be derived from the document id itself:
                # an id of 0 would be stored as "not relevant", and a
                # retrieved list would be ranked by id instead of by the
                # order the caller supplied.
                value = float(total - rank) if ranked else 1
                # setdefault keeps the best (earliest) position of a
                # repeated id.
                values.setdefault(str(doc_id), value)
            formatted[str(query_index)] = values
        return formatted

    def evaluate_retrieval(self, retrieved_contexts, k_vals=None):
        """Print and return Recall, Precision, NDCG, MAP and MRR at each k.

        Args:
            retrieved_contexts: One list of retrieved document ids per
                query, best match first, aligned with ``self.benchmarks``.
            k_vals: Cut-off values to report; defaults to ``[10]``.

        Returns:
            Dict with the keys ``"ndcg"``, ``"map"``, ``"recall"``,
            ``"precision"`` and ``"mrr"`` holding the metric dicts
            produced by BEIR.
        """
        # Avoid a shared mutable default argument.
        k_vals = [10] if k_vals is None else list(k_vals)

        actual_contexts = self._get_beir_format(self.benchmarks)
        retrieved_contexts = self._get_beir_format(retrieved_contexts, ranked=True)

        # Query ids are synthetic list positions, so a document id that
        # happens to equal one is a coincidence, not a self-match; keep it.
        ndcg, map_score, recall, precision = EvaluateRetrieval.evaluate(
            actual_contexts, retrieved_contexts, k_values=k_vals,
            ignore_identical_ids=False,
        )
        mrr = EvaluateRetrieval.evaluate_custom(actual_contexts, retrieved_contexts, k_vals, metric="mrr")

        # Print evaluation results
        for k in k_vals:
            print(f"\n{k}:")
            print(f"Recall@{k}   : {recall[f'Recall@{k}']:.2f}")
            print(f"Precision@{k}: {precision[f'P@{k}']:.2f}")
            print(f"NDCG@{k}     : {ndcg[f'NDCG@{k}']:.2f}")
            print(f"MAP@{k}      : {map_score[f'MAP@{k}']:.2f}")
            print(f"MRR@{k}      : {mrr[f'MRR@{k}']:.2f}")

        return {
            "ndcg": ndcg,
            "map": map_score,
            "recall": recall,
            "precision": precision,
            "mrr": mrr,
        }

Would this be a good approach, and is there any benefit in creating a PR?

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Incorrect Evaluation Metrics Produced #187

Incorrect Evaluation Metrics Produced #187

NikhielRahulSingh commented Nov 1, 2024

NikhielRahulSingh commented Nov 1, 2024

shivareddy0117 commented Nov 16, 2024 •

edited

Loading

Please make sure to set the `ignore_identical_ids` parameter to `False` so that all documents are considered, or use different IDs for queries and documents.

SighingSnow commented Dec 24, 2024

NikhielRahulSingh commented Jan 19, 2025

Incorrect Evaluation Metrics Produced #187

Incorrect Evaluation Metrics Produced #187

Comments

NikhielRahulSingh commented Nov 1, 2024

NikhielRahulSingh commented Nov 1, 2024

shivareddy0117 commented Nov 16, 2024 • edited Loading

Please make sure to set the `ignore_identical_ids` parameter to `False` so that all documents are considered, or use different IDs for queries and documents.

SighingSnow commented Dec 24, 2024

NikhielRahulSingh commented Jan 19, 2025

shivareddy0117 commented Nov 16, 2024 •

edited

Loading