trec_eval.c
/*
Copyright (c) 2008 - Chris Buckley.
Permission is granted for use and modification of this file for
research, non-commercial purposes.
*/
static char *VersionID = VERSIONID;
static char *help_message =
"trec_eval [-h] [-q] [-m measure[.params] [-c] [-n] [-l <num>]\n\
[-D debug_level] [-N <num>] [-M <num>] [-R rel_format] [-T results_format]\n\
rel_info_file results_file \n\
\n\
Calculate and print various evaluation measures, evaluating the results \n\
in results_file against the relevance info in rel_info_file. \n\
\n\
There are a fair number of options; only the lower-case ones are normally \n\
used. \n\
--help:\n\
-h: Print full help message and exit. Full help message will include\n\
descriptions for any measures designated by a '-m' parameter, and\n\
input file format descriptions for any rel_info_format given by '-R'\n\
and any top results_format given by '-T'.\n\
Thus to see all info about preference measures use\n\
trec_eval -h -m all_prefs -R prefs -T trec_results \n\
--version:\n\
-v: Print version of trec_eval and exit.\n\
--query_eval_wanted:\n\
-q: In addition to summary evaluation, give evaluation for each query/topic\n\
--measure measure_name[.measure_params]:\n\
-m measure: Add 'measure' to the lists of measures to calculate and print.\n\
If 'measure' contains a '.', then the name of the measure is everything\n\
preceding the period, and everything to the right of the period is\n\
assumed to be a list of parameters for the measure, separated by ','. \n\
There can be multiple occurrences of the -m flag.\n\
'measure' can also be a nickname for a set of measures. Current \n\
nicknames include \n\
'official': the main measures often used by TREC\n\
'all_trec': all measures calculated with the standard TREC\n\
results and rel_info format files.\n\
'set': subset of all_trec that calculates unranked values.\n\
'prefs': Measures not in all_trec that calculate preference measures.\n\
--complete_rel_info_wanted:\n\
-c: Average over the complete set of queries in the relevance judgements \n\
instead of the queries in the intersection of relevance judgements \n\
and results. Missing queries will contribute a value of 0 to all \n\
evaluation measures (which may or may not be reasonable for a \n\
particular evaluation measure, but is reasonable for standard TREC \n\
measures.) Default is off.\n\
--level_for_rel num:\n\
-l<num>: Num indicates the minimum relevance judgement value needed for \n\
a document to be called relevant. Used if rel_info_file contains \n\
relevance judged on a multi-relevance scale. Default is 1. \n\
--nosummary:\n\
-n: No summary evaluation will be printed\n\
--Debug_level num:\n\
-D <num>: Debug level. 1 and 2 used for measures, 3 and 4 for merging\n\
rel_info and results, 5 and 6 for input. Currently, num can be of the\n\
form <num>.<qid> and only qid will be evaluated with debug info printed.\n\
Default is 0.\n\
--Number_docs_in_coll num:\n\
-N <num>: Number of docs in collection. Default is MAX_LONG. \n\
--Max_retrieved_per_topic num:\n\
-M <num>: Max number of docs per topic to use in evaluation (discard rest). \n\
Default is MAX_LONG.\n\
--Judged_docs_only:\n\
-J: Calculate all values only over the judged (either relevant or \n\
nonrelevant) documents. All unjudged documents are removed from the \n\
retrieved set before any calculations (possibly leaving an empty set). \n\
DO NOT USE, unless you really know what you're doing - very easy to get \n\
reasonable looking numbers in a file that you will later forget were \n\
calculated with the -J flag. \n\
--Rel_info_format format:\n\
-R format: The rel_info file is assumed to be in format 'format'. Current\n\
values for 'format' include 'qrels', 'prefs', 'qrels_prefs'. Note not\n\
all measures can be calculated with all formats.\n\
--Results_format format:\n\
-T format: the top results_file is assumed to be in format 'format'. Current\n\
values for 'format' include 'trec_results'. Note not all measures can be\n\
calculated with all formats.\n\
--Zscore Zmean_file:\n\
-Z Zmean_file: Instead of printing the raw score for each measure, print\n\
a Z score instead. The score printed will be the deviation from the mean\n\
of the raw score, expressed in standard deviations, where the mean and\n\
standard deviation for each measure and query are found in Zmean_file.\n\
If the mean is not in Zmean_file for a measure and query, -1000000 is printed.\n\
Zmean_file format is ascii lines of form \n\
qid measure_name mean std_dev\n\
\n\
\n\
Standard evaluation procedure:\n\
For each of the standard TREC measures requested, a ranked list of\n\
relevance judgements is created corresponding to each ranked retrieved doc.\n\
A rel judgement is set to -1 if the document was not in the pool (not in \n\
rel_info_file) or -2 if the document was in the pool but unjudged (some \n\
measures (infAP) allow the pool to be sampled instead of judged fully). \n\
Otherwise it is set to the value in rel_info_file. \n\
Most measures, but not all, will treat -1 or -2 the same as 0, \n\
namely nonrelevant. Note that relevance_level is used to \n\
determine if the document is relevant during score calculations. \n\
Queries for which there is no relevance information are ignored. \n\
Warning: queries for which there are relevant docs but no retrieved docs \n\
are also ignored by default. This allows systems to evaluate over subsets \n\
of the relevant docs, but means if a system improperly retrieves no docs, \n\
it will not be detected. Use the -c flag to avoid this behavior. \n\
";
#include "common.h"
#include "sysfunc.h"
#include "trec_eval.h"
#include "functions.h"
#ifdef MDEBUG
#include "mcheck.h"
#endif /* MDEBUG */
static char *usage = "Usage: trec_eval [-h] [-q] {-m measure}* trec_rel_file trec_top_file\n\
-h: Give full help information, including other options\n\
-q: In addition to summary evaluation, give evaluation for each query\n\
-m: calculate and print measures indicated by 'measure'\n\
('-m all_qrels' prints all qrels measures, '-m official' is default)\n";
extern int te_num_trec_measures;
extern TREC_MEAS *te_trec_measures[];
extern int te_num_trec_measure_nicknames;
extern TREC_MEASURE_NICKNAMES te_trec_measure_nicknames[];
extern int te_num_rel_info_format;
extern REL_INFO_FILE_FORMAT te_rel_info_format[];
extern int te_num_results_format;
extern RESULTS_FILE_FORMAT te_results_format[];
extern int te_num_form_inter_procs;
extern RESULTS_FILE_FORMAT te_form_inter_procs[];
static int mark_measure (EPI *epi, char *optarg);
static int trec_eval_help(EPI *epi);
static void get_debug_level_query (EPI *epi, char *optarg);
static int cleanup (EPI *epi);
int
main (int argc, char *argv[])
{
char *trec_results_file;
ALL_RESULTS all_results;
char *trec_rel_info_file;
ALL_REL_INFO all_rel_info;
char *zscores_file= NULL;
ALL_ZSCORES all_zscores;
EPI epi; /* Eval parameter info */
TREC_EVAL accum_eval;
TREC_EVAL q_eval;
long i,j,m;
int c;
long help_wanted = 0;
long measure_marked_flag = 0;
#ifdef MDEBUG
/* Turn on memory debugging if environment variable MALLOC_TRACE is
defined as an output file for reports.
mcheck_check_all() will dynamically check */
mcheck(NULL);
mtrace();
#endif /* MDEBUG */
/* Initialize static info before getting program optional args */
epi.query_flag = 0;
epi.average_complete_flag = 0;
epi.judged_docs_only_flag = 0;
epi.summary_flag = 1;
epi.relation_flag = 1;
epi.debug_level = 0;
epi.debug_query = NULL;
epi.num_docs_in_coll = 0;
epi.relevance_level = 1;
epi.max_num_docs_per_topic = MAXLONG;
epi.rel_info_format = "qrels";
epi.results_format = "trec_results";
epi.zscore_flag = 0;
if (NULL == (epi.meas_arg = Malloc (argc+1, MEAS_ARG)))
exit (1);
epi.meas_arg[0].measure_name = NULL;
/* Get command line options */
while (1) {
int option_index = 0;
static struct option long_options[] = {
{"help", 0, 0, 'h'},
{"version", 0, 0, 'v'},
{"query_eval_wanted", 0, 0, 'q'},
{"measure", 1, 0, 'm'},
{"complete_rel_info_wanted", 0, 0, 'c'},
{"level_for_rel", 1, 0, 'l'},
{"nosummary", 0,0,'n'},
{"Debug_level", 1, 0, 'D'},
{"Judged_docs_only", 0, 0, 'J'},
{"Number_docs_in_coll", 1, 0, 'N'},
{"Max_retrieved_per_topic", 1, 0, 'M'},
{"Rel_info_format", 1, 0, 'R'},
{"Results_format", 1, 0, 'T'},
{"Output_old_results_format", 1, 0, 'o'},
{"Zscore", 1, 0, 'Z'},
{0, 0, 0, 0},
};
c = getopt_long (argc, argv, "hvqm:cl:nD:JN:M:R:T:oZ:",
long_options, &option_index);
if (c == -1)
break;
switch (c) {
case 'h':
help_wanted++;
break;
case 'v':
fprintf (stderr, "trec_eval version %s\n", VersionID);
exit (0);
case 'q':
epi.query_flag++;
break;
case 'm':
/* Mark measure(s) indicated by optarg to be done */
if (UNDEF == mark_measure (&epi, optarg)) {
fprintf (stderr, "trec_eval: illegal measure '%s'\n", optarg);
exit (1);
}
measure_marked_flag++;
break;
case 'c':
epi.average_complete_flag++;
break;
case 'l':
epi.relevance_level = atol (optarg);
break;
case 'n':
epi.summary_flag = 0;
break;
case 'D':
get_debug_level_query (&epi, optarg);
break;
case 'J':
epi.judged_docs_only_flag++;
break;
case 'N':
epi.num_docs_in_coll = atol (optarg);
break;
case 'M':
epi.max_num_docs_per_topic = atol (optarg);
break;
case 'R':
epi.rel_info_format = optarg;
break;
case 'T':
epi.results_format = optarg;
break;
case 'o':
/* Obsolete, no longer supported */
epi.relation_flag = 0;
break;
case 'Z':
epi.zscore_flag++;
zscores_file = optarg;
break;
case '?':
default:
(void) fputs (usage,stderr);
exit (1);
}
}
if (help_wanted) {
if (UNDEF == trec_eval_help(&epi))
return (UNDEF);
exit (0);
}
if (optind + 2 != argc ) {
(void) fputs (usage,stderr);
exit (1);
}
trec_rel_info_file = argv[optind++];
trec_results_file = argv[optind++];
/* Find and get qrels and ranked results information for all queries from
the input text files */
for (i = 0; i < te_num_rel_info_format; i++) {
if (0 == strcmp (epi.rel_info_format, te_rel_info_format[i].name)) {
if (UNDEF == te_rel_info_format[i].get_file (&epi,
trec_rel_info_file,
&all_rel_info)) {
fprintf (stderr, "trec_eval: Quit in file '%s'\n",
trec_rel_info_file);
exit (2);
}
break;
}
}
if (i >= te_num_rel_info_format) {
fprintf (stderr, "trec_eval: Illegal rel_format '%s'\n",
epi.rel_info_format);
exit (2);
}
for (i = 0; i < te_num_results_format; i++) {
if (0 == strcmp (epi.results_format, te_results_format[i].name)) {
if (UNDEF == te_results_format[i].get_file (&epi,
trec_results_file,
&all_results)) {
fprintf (stderr, "trec_eval: Quit in file '%s'\n",
trec_results_file);
exit (2);
}
break;
}
}
if (i >= te_num_results_format) {
fprintf (stderr, "trec_eval: Illegal retrieval results format '%s'\n",
epi.results_format);
exit (2);
}
if (epi.zscore_flag) {
if (UNDEF == te_get_zscores (&epi, zscores_file, &all_zscores))
return (UNDEF);
}
/* Initialize all marked measures (possibly using command line info) */
if (0 == measure_marked_flag) {
/* If no measures designated on command line, first mark "official" */
if (UNDEF == mark_measure (&epi, "official")) {
fprintf (stderr, "trec_eval: illegal measure 'official'\n");
exit (1);
}
}
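/* Start the summary evaluation record (qid "all") with no queries and an
empty value list; the init_meas call for each marked measure below is
expected to reserve that measure's entries in accum_eval.values */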
accum_eval = (TREC_EVAL) {"all", 0, NULL, 0, 0};
for (m = 0; m < te_num_trec_measures; m++) {
if (MEASURE_MARKED(te_trec_measures[m])) {
if (UNDEF == te_trec_measures[m]->init_meas (&epi,
te_trec_measures[m],
&accum_eval)) {
fprintf (stderr, "trec_eval: Cannot initialize measure '%s'\n",
te_trec_measures[m]->name);
exit (2);
}
}
}
/* Reserve space and initialize q_eval to be copy of accum_eval */
if (NULL == (q_eval.values = Malloc (accum_eval.num_values,
TREC_EVAL_VALUE)))
exit (3);
(void) memcpy (q_eval.values, accum_eval.values,
accum_eval.num_values * sizeof (TREC_EVAL_VALUE));
q_eval.num_values = accum_eval.num_values;
q_eval.num_queries = 0;
/* For each topic which has both qrels and top results information,
calculate, possibly print (if query_flag), and accumulate
evaluation measures. */
for (i = 0; i < all_results.num_q_results; i++) {
/* If debugging a particular query, then skip all others */
if (epi.debug_query &&
strcmp (epi.debug_query, all_results.results[i].qid))
continue;
/* Find rel info for this query (skip if no rel info) */
for (j = 0; j < all_rel_info.num_q_rels; j++) {
if (0 == strcmp (all_results.results[i].qid,
all_rel_info.rel_info[j].qid))
break;
}
if (j >= all_rel_info.num_q_rels)
continue;
/* zero out all measures for new query */
for (m = 0; m < q_eval.num_values; m++)
q_eval.values[m].value = 0;
q_eval.qid = all_results.results[i].qid;
/* Calculate all measure scores */
for (m = 0; m < te_num_trec_measures; m++) {
if (MEASURE_REQUESTED(te_trec_measures[m])) {
if (UNDEF == te_trec_measures[m]->calc_meas (&epi,
&all_rel_info.rel_info[j],
&all_results.results[i],
te_trec_measures[m],
&q_eval)) {
fprintf (stderr,"trec_eval: Can't calculate measure '%s'\n",
te_trec_measures[m]->name);
exit (4);
}
}
}
/* Convert values to zscores if requested */
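/* Per the -Z help text above, the value printed becomes
(raw_score - mean) / std_dev, using the mean and std_dev listed in
Zmean_file for this query and measure; the arithmetic is done by
te_convert_to_zscore */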
if (epi.zscore_flag) {
if (UNDEF == te_convert_to_zscore (&all_zscores, &q_eval))
return (UNDEF);
}
/* Add this topic's values to the accumulated values, and possibly print */
for (m = 0; m < te_num_trec_measures; m++) {
if (MEASURE_REQUESTED(te_trec_measures[m])) {
if (UNDEF == te_trec_measures[m]->acc_meas (&epi,
te_trec_measures[m],
&q_eval,
&accum_eval)) {
fprintf(stderr,"trec_eval: Can't accumulate measure '%s'\n",
te_trec_measures[m]->name);
exit (5);
}
if (epi.query_flag &&
UNDEF == te_trec_measures[m]->print_single_meas (&epi,
te_trec_measures[m],
&q_eval)) {
fprintf(stderr,
"trec_eval: Can't print query measure '%s'\n",
te_trec_measures[m]->name);
exit (6);
}
}
}
accum_eval.num_queries++;
}
if (accum_eval.num_queries == 0) {
fprintf (stderr,
"trec_eval: No queries with both results and relevance info\n");
exit (7);
}
/* Calculate final averages, and print (if desired) */
/* Note that averages may depend on the entire rel_info data if
epi.average_complete_flag is set */
for (m = 0; m < te_num_trec_measures; m++) {
if (MEASURE_REQUESTED(te_trec_measures[m])) {
if (UNDEF == te_trec_measures[m]->calc_avg_meas
(&epi, te_trec_measures[m],
&all_rel_info, &accum_eval) ||
UNDEF == te_trec_measures[m]->print_final_and_cleanup_meas
(&epi, te_trec_measures[m], &accum_eval)) {
fprintf (stderr,"trec_eval: Can't print measure '%s'\n",
te_trec_measures[m]->name);
exit (8);
}
}
}
if (UNDEF == cleanup (&epi)) {
fprintf (stderr,"trec_eval: cleanup failed\n");
exit (10);
}
Free (q_eval.values);
Free (accum_eval.values);
Free (epi.meas_arg);
exit (0);
}
static int
add_meas_arg_info (EPI *epi, char *meas, char *param)
{
long i;
/* Guaranteed space since malloc'd argc+1 entries and can't be more
than one entry per command line argument */
/* Find non-NULL entry */
i = 0;
while (epi->meas_arg[i].measure_name) i++;
epi->meas_arg[i].measure_name = meas;
epi->meas_arg[i].parameters = param;
/* Ensure the measure was given a non-empty parameter list (the measure
itself is marked for calculation by the caller) */
if (*param == '\0') {
fprintf (stderr, "trec_eval: improper measure in parameter '%s'\n",
epi->meas_arg[i].measure_name);
return (UNDEF);
}
epi->meas_arg[i+1].measure_name = NULL;
return (1);
}
static int
mark_single_measure (char *optarg)
{
long i;
for (i = 0; i < te_num_trec_measures; i++) {
if (0 == strcmp (optarg, te_trec_measures[i]->name)) {
te_trec_measures[i]->eval_index = -2;
break;
}
}
if (i >= te_num_trec_measures)
return (UNDEF);
return (1);
}
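/* Mark the measure (or measure nickname) named by optarg. A '.' separates
the measure name from a comma-separated parameter list; e.g. an argument
such as "-m P.5,10,20" (an illustrative value) records measure "P" with
parameters "5,10,20" via add_meas_arg_info before marking */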
static int
mark_measure (EPI *epi, char *optarg)
{
long i;
char *ptr;
ptr = optarg;
while (*ptr && *ptr != '.') ptr++;
if (*ptr == '.') {
*ptr++ = '\0';
if (UNDEF == add_meas_arg_info (epi, optarg, ptr))
return (UNDEF);
}
for (i = 0; i < te_num_trec_measure_nicknames; i++) {
if (0 == strcmp (optarg, te_trec_measure_nicknames[i].name)) {
/* Have found nickname. Mark all real names it refers to */
char **name = te_trec_measure_nicknames[i].name_list;
while (*name) {
if (UNDEF == mark_single_measure (*name))
return (UNDEF);
name++;
}
return (1);
}
}
/* optarg did not match any nickname, mark measure directly */
return (mark_single_measure (optarg));
}
static int
trec_eval_help(EPI *epi)
{
long m, f;
long m_marked = 0;
printf ("%s\n-----------------------\n", help_message);
for (f = 0; f < te_num_results_format; f++) {
if (0 == strcmp (te_results_format[f].name, epi->results_format))
break;
}
if (f < te_num_results_format)
printf ("%s\n-----------------------\n",
te_results_format[f].explanation);
for (f = 0; f < te_num_rel_info_format; f++) {
if (0 == strcmp (te_rel_info_format[f].name, epi->rel_info_format))
break;
}
if (f < te_num_rel_info_format)
printf ("%s\n-----------------------\n",
te_rel_info_format[f].explanation);
printf ("Individual measure documentation for requested measures\n");
for (m = 0; m < te_num_trec_measures; m++) {
if (MEASURE_MARKED(te_trec_measures[m])) {
m_marked = 1;
printf ("%s\n%s",
te_trec_measures[m]->name,
te_trec_measures[m]->explanation);
}
}
if (! m_marked)
printf ("-- No measures indicated.\n Request measure documentation using <-m measure> on command line\n");
return (1);
}
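/* Parse the -D argument, which is either "<num>" or "<num>.<qid>"; e.g.
"-D 3.401" (qid chosen for illustration) sets debug_level to 3 and limits
debug output to query "401" */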
static void
get_debug_level_query ( EPI *epi, char *optarg)
{
char *ptr;
for (ptr = optarg; *ptr && *ptr != '.'; ptr++)
;
if (*ptr) {
*ptr++ = '\0';
epi->debug_query = ptr;
}
epi->debug_level = atol (optarg);
}
static int
cleanup (EPI *epi)
{
long i;
for (i = 0; i < te_num_rel_info_format; i++) {
if (0 == strcmp (epi->rel_info_format, te_rel_info_format[i].name)) {
if (UNDEF == te_rel_info_format[i].cleanup())
return (UNDEF);
break;
}
}
for (i = 0; i < te_num_results_format; i++) {
if (0 == strcmp (epi->results_format, te_results_format[i].name)) {
if (UNDEF == te_results_format[i].cleanup ())
return (UNDEF);
break;
}
}
for (i = 0; i < te_num_form_inter_procs; i++) {
if (UNDEF == te_form_inter_procs[i].cleanup ())
return (UNDEF);
}
if (epi->zscore_flag) {
if (UNDEF == te_get_zscores_cleanup())
return (UNDEF);
}
return (1);
}