From 1ab16c1fde95b7bf79733baa289a08da4dd53549 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 4 Jan 2022 14:32:47 +0100 Subject: [PATCH 01/16] Import seqio related code from promptsource --- requirements.txt | 1 + setup.py | 5 + t0/seqio_tasks/__init__.py | 3 + t0/seqio_tasks/experiment_D4.csv | 242 ++++++++++++++++++ t0/seqio_tasks/tasks.py | 421 +++++++++++++++++++++++++++++++ t0/seqio_tasks/utils.py | 77 ++++++ 6 files changed, 749 insertions(+) create mode 100644 t0/seqio_tasks/__init__.py create mode 100644 t0/seqio_tasks/experiment_D4.csv create mode 100644 t0/seqio_tasks/tasks.py create mode 100644 t0/seqio_tasks/utils.py diff --git a/requirements.txt b/requirements.txt index 1d2805a..eadc3ca 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ transformers datasets jinja2 torch +seqio sentencepiece protobuf scikit-learn \ No newline at end of file diff --git a/setup.py b/setup.py index 1e65553..f577325 100644 --- a/setup.py +++ b/setup.py @@ -22,4 +22,9 @@ packages=find_packages(), license="Apache Software License 2.0", long_description=readme, + package_data={ + "": [ + "seqio_tasks/experiment_D4.csv", + ] + } ) \ No newline at end of file diff --git a/t0/seqio_tasks/__init__.py b/t0/seqio_tasks/__init__.py new file mode 100644 index 0000000..8a29711 --- /dev/null +++ b/t0/seqio_tasks/__init__.py @@ -0,0 +1,3 @@ +"""Tools for loading prompted tasks in seqio.""" + +from t0.seqio_tasks import tasks, utils diff --git a/t0/seqio_tasks/experiment_D4.csv b/t0/seqio_tasks/experiment_D4.csv new file mode 100644 index 0000000..71c8216 --- /dev/null +++ b/t0/seqio_tasks/experiment_D4.csv @@ -0,0 +1,242 @@ +HF_name,subset,task_by_convention,format,comment,seed_paper,september_check,do_train,do_eval,train_size,adjusted_train_size,D3_do_train,D3_do_eval,D3_adjusted_train_size,metric,multiple correct answer,Paper link,non_linguistic_knowledge,skip,Imported Task Name,imported category,input_length,_human_skill,Domain,Reference +crows_pairs,,bias_and_fairness,,test set only; authors themselves acknowledge some problems,Eval WG,,,TRUE,,,,,,,,,,,,,,,, +jigsaw_toxicity_pred,,bias_and_fairness,,current https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data ; want https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification,Eval WG,,,TRUE,,,,,,,,,,,,,,,, +super_glue,axg,bias_and_fairness,cls,test set only,Eval WG,,,TRUE,,,,,,,,,,,,,,,, +winogender,,bias_and_fairness,cls,also as axg in super_glue,Eval WG,,,TRUE,,,,,,,,,,,,,,,, +wino_bias,type1_anti,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,, +wino_bias,type2_anti,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,, +wino_bias,type1_pro,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,, +wino_bias,type2_pro,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,, +super_glue,wsc.fixed,coreference,cls,,,,,TRUE,554,0,TRUE,TRUE,554,accuracy,,https://arxiv.org/pdf/1905.00537.pdf,,,superglue-wsc,cls/other,single sentence,knowledge-? reading comprehension,,Levesque et al. 2012 +winograd_wsc,wsc273,coreference,ext,,GPT,,,TRUE,0,0,,,0,accuracy,,https://www.aaai.org/ocs/index.php/KR/KR12/paper/download/4492/4924,,,,,,,,Levesque et al. 2012 +winogrande,winogrande_xl,coreference,ext,,GPT,TRUE,,TRUE,40398,0,,,0,accuracy,,https://arxiv.org/pdf/1907.10641.pdf,,,WinoGrande,qa/multiple-choice qa,,knowledge-? reading comprehension,,Sakaguchi et al. 
2020 +winogrande,winogrande_debiased,coreference,ext,"""debiased"" = adversarially filtered",GPT,TRUE,,TRUE,9248,0,,,0,accuracy,,https://arxiv.org/pdf/1907.10641.pdf,,,WinoGrande,qa/multiple-choice qa,,knowledge-? reading comprehension,,Sakaguchi et al. 2020 +glue,cola,grammatical_acceptability,cls,includes semantic acceptability too; to be replaced by blimp,,,,TRUE,8551,0,,TRUE,0,accuracy;matthews_corrcoef,,https://arxiv.org/pdf/1805.12471.pdf,,,glue-cola,cls/other,single sentence,,,Warstadt et al. 2019 +super_glue,cb,NLI,cls,"""for multi-class F1 we compute the unweighted average of the F1 per class.""",,TRUE,,TRUE,250,0,,TRUE,0,mean_multiclass_f1;accuracy,,https://semanticsarchive.net/Archive/Tg3ZGI2M/Marneffe.pdf,,,superglue-cb,cls/nli,sentence pair,knowledge-neutral inference,,de Marneffe et al. 2019 +super_glue,rte,NLI,cls,,,TRUE,,TRUE,2490,0,,TRUE,0,accuracy,,https://arxiv.org/pdf/1905.00537.pdf,,,superglue-rte,cls/nli,sentence pair,knowledge modest inference,,Dagan et al. 2005; Bar-Haim et al. 2006 Giampiccolo et al. 2007; Bentivogli et al. 2009 +anli,,NLI,cls,"In addition to accuracy, paper also evaluates on range of relaxed/strict and matched/unmatched settings and reports F scores for different answers",,,,TRUE,162865,0,,TRUE,0,accuracy,,https://arxiv.org/abs/1910.14599,,,anli,cls/nli,sentence pair,knowledge modest inference,,Nie et al. 2020 +hans,,NLI,cls,,,TRUE,,TRUE,0,0,,TRUE,0,accuracy,,https://arxiv.org/pdf/1902.01007.pdf,,,,,sentence pair,syntax?,,McCoy et al. 2019 +super_glue,axb,NLI,cls,test set only,,TRUE,,TRUE,0,0,,,,,,,,,,,,,, +glue,mrpc,paraphrase,cls,,,,TRUE,TRUE,3668,3668,TRUE,TRUE,3668,accuracy;f1_score,,https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/I05-50025B15D.pdf,,,glue-mrpc,cls/paraphrase,,paraphrase,,Dolan and Brockett 2005 +glue,qqp,paraphrase,cls,,,,TRUE,TRUE,363846,363846,TRUE,,363846,accuracy;f1_score,,https://aclanthology.org/I05-5002.pdf,,,glue-qqp,cls/paraphrase,,,,(link) +paws,labeled_final,paraphrase,cls,,,,TRUE,,49401,49401,TRUE,,49401,,,,,,paws,cls/paraphrase,,,,Zhang et al. 2019 +ai2_arc,ARC-Challenge,QA_closed_book,cls,,GPT,,,TRUE,1119,0,TRUE,,1119,"accuracy_with_tie : For each question, a system receives 1 point if it +chooses the correct answer and 1/k if it reports a k-way tie +(i.e., chooses multiple answers) that includes the correct answer.",,https://arxiv.org/pdf/1803.05457.pdf,mid-intensive,,ARC (chal.),qa/multiple-choice qa,,nontrivial_comprehension,,Clark et al. 2018 +ai2_arc,ARC-Easy,QA_closed_book,cls,,GPT,,,TRUE,2251,0,TRUE,,2251,"accuracy_with_tie: For each question, a system receives 1 point if it +chooses the correct answer and 1/k if it reports a k-way tie +(i.e., chooses multiple answers) that includes the correct answer.",,https://arxiv.org/pdf/1803.05457.pdf,mid-intensive,,ARC (easy),Multiple choice,,,, +nq_open,,QA_closed_book,gen,,GPT,TRUE,,TRUE,87925,0,,TRUE,0,kilt-exact_match;average_accuracy_accross_answers,TRUE,https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00276/43518/Natural-Questions-A-Benchmark-for-Question,intensive,,Natural Questions (open domain),,,trivia,, +kilt_tasks,hotpotqa,QA_closed_book,gen,recast as closed-book due to input length,self,,TRUE,,88869,88869,,,,,,,,,kilt hotpotqa,qa/closed-book qa,,encyclopedia; multi-hop QA,,Yang et al. 
2018 +trivia_qa,unfiltered,QA_closed_book,gen,,GPT,TRUE,,TRUE,87622,0,TRUE,,87622,exact_match;f1_over_words => wikipedia aliases are considered valid answers,TRUE,https://arxiv.org/pdf/1705.03551.pdf,intensive,,Trivia QA,,,,, +web_questions,,QA_closed_book,gen,"""supposed to be answerable by Freebase"" Check corpora deduplication with freebaseqa.",GPT,,,TRUE,3778,0,TRUE,,3778,accuracy : they don't mention how they normalize across multiple correct answers,TRUE,https://aclanthology.org/D13-1160.pdf,intensive,,web questions,qa/closed-book qa,,,,Berant et al. 2013 +wiki_qa,,QA_closed_book,cls,,CrossFit,,TRUE,,20360,20360,,,,,,https://aclanthology.org/D15-1237.pdf,,,wiki qa,cls/other,,,,Yang et al. 2015 +adversarial_qa,dbidaf,QA_extractive,ext,,,TRUE,TRUE,,10000,10000,TRUE,,10000,,,https://aclanthology.org/2020.tacl-1.43/,,,adversarialqa,qa/machine reading comprehension,,,,Bartolo et al. 2020 +adversarial_qa,dbert,QA_extractive,ext,,,TRUE,TRUE,,10000,10000,TRUE,,10000,,,,,,,,,,, +adversarial_qa,droberta,QA_extractive,ext,,,TRUE,TRUE,,10000,10000,TRUE,,10000,,,,,,,,,,, +coqa,,QA_extractive,ext,GPT-easy,GPT,,,TRUE,7199,,,,,"macro_average_f1: for computing a model’s performance, each individual prediction is compared +against n human answers resulting in n F1 scores, +the maximum of which is chosen as the prediction’s +F1.For each question, we average out F1 across +these n sets, both for humans and models. In our +final evaluation, we use n = 4 human answers for +every question (the original answer and 3 additionally collected answers). The articles a, an and the +and punctuations are excluded in evaluation.",from the paper it seems it could contain multiple answers but the datasets has only one answer per question,https://arxiv.org/pdf/1808.07042.pdf,,,,,,,, +duorc,SelfRC,QA_extractive,ext,,TaskEmbed;CrossFit,,TRUE,,60721,60721,,,,,,https://duorc.github.io/,,,DuoRC,qa/machine reading comprehension,,,Wikipedia/IMDB crowd,Saha et al. 2018 +duorc,ParaphraseRC,QA_extractive,ext,,TaskEmbed;CrossFit,,TRUE,,69524,69524,,,,,,https://arxiv.org/pdf/1804.07927.pdf,,,DuoRC,paraphrased QA,,,,Saha et al. 2018 +ropes,,QA_extractive,ext,,,TRUE,TRUE,,10924,10924,TRUE,,10924,,,,modest,,ropes,Extractive QA,,cause_and_effect;nontrivial_comprehension,,Lin et al. 2019 +squad_v2,,QA_extractive,ext,,GPT,,,TRUE,130319,0,TRUE,,130319,exact_match;f1_score,TRUE,https://arxiv.org/pdf/1806.03822.pdf,,,SQuAD 2.0,Extractive QA,,,,Rajpurkar et al. 2018 +super_glue,record,QA_extractive,ext,,,TRUE,,TRUE,100730,0,TRUE,TRUE,100730,max_token_level_f1;exact_match,TRUE,https://arxiv.org/pdf/1810.12885.pdf,,,superglue-record,qa/machine reading comprehension,,knowledge-? reading comprehension,,Zhang et al. 2018 +qa_srl,,QA_extractive,ext,"need non-naive metric (""If the predicted word is contained inside the annotated answer span it is considered a correct prediction.""); v2 not in HF https://aclanthology.org/P18-1191.pdf",Eval WG,,,TRUE,6414,0,TRUE,TRUE,6414,accuracy,TRUE,https://dada.cs.washington.edu/qasrl/#page-top,neutral,,qa srl,other,,semantic role,,He et al. 2015 +quac,,QA_extractive,ext,,GPT,,,TRUE,11567,,,,,"average_maximum_f1;HEQ-Q;HEQ-D: To make oracle human and system performance comparable, +given n references, we report the average of the +maximum F1 computed from each n − 1 subset +with respect to the heldout reference.",TRUE,https://arxiv.org/pdf/1808.07036.pdf,,,,,,dialogue,, +quoref,,QA_extractive,ext,,,TRUE,TRUE,,19399,19399,TRUE,,19399,,,https://aclanthology.org/D19-1606.pdf,,,Quoref,Extractive QA,,,,Dasigi et al. 
2019 +tydiqa,,QA_extractive,ext,,Eval WG,,TRUE,,9211,9211,,,,,,,,,,,,,, +drop,,QA_generative,gen,"nontrivial math; try history_690, it's pretty hard even when I have domain knowledge",GPT,TRUE,,TRUE,,,,,,exact_match; macro_average_f1,TRUE,https://aclanthology.org/N19-1246.pdf,,,DROP ,multi-hop quantitative reasoning; Abstractive QA,,numerical,Wikipedia crowd,Dua et al. 2019 +cos_e,v1.11,QA_multiple_choice,cls,"same as commonsense_qa but with (poorly sourced) human explanations; questionable ""commonsense"" lots of world knowledge",Vania,TRUE,TRUE,,9741,9741,TRUE,,9741,,,,,,cos e,other/generate explanation,,,,Rajani et al. 2019 +cosmos_qa,,QA_multiple_choice,cls,,,TRUE,TRUE,,25262,25262,TRUE,,25262,,,,,,cosmos qa,qa/multiple-choice qa,,,,Huang et al. 2019 +dream,,QA_multiple_choice,cls,,,TRUE,TRUE,,6116,6116,TRUE,,6116,,,,,,dream,qa/multiple-choice qa,,,,Sun et al. 2019 +openbookqa,main,QA_multiple_choice,cls,interesting combo of pragmatics + scientific reasoning,GPT,,,TRUE,4957,0,TRUE,TRUE,4957,"accuracy_with_tie : For each question, a system receives 1 point if it +chooses the correct answer and 1/k if it reports a k-way tie +(i.e., chooses multiple answers) that includes the correct answer.",,https://aclanthology.org/D18-1260.pdf,modest,,openbookqa,qa/multiple-choice qa,,pragmatics,,Mihaylov et al. 2018 +qasc,,QA_multiple_choice,cls,,,TRUE,TRUE,,8134,8134,TRUE,,8134,,,,given?,,qasc,qa/multiple-choice qa,,,,Khot et al. 2020 +quail,,QA_multiple_choice,cls,,,TRUE,TRUE,,10246,10246,TRUE,,10246,,,,,,quail,qa/multiple-choice qa,,,,Rogers et al. 2020 +quarel,,QA_multiple_choice,cls,,CrossFit,,TRUE,,1941,1941,,,,,,,,,quarel,qa/multiple-choice qa,,logical form,,Tafjord et al. 2019a +quartz,,QA_multiple_choice,cls,,,TRUE,TRUE,,2696,2696,TRUE,,2696,,,https://aclanthology.org/D19-1608.pdf,given?,,quartz-with knowledge,qa/multiple-choice qa,,,,Tafjord et al. 2019b +race,high,QA_multiple_choice,cls,GPT-hard,GPT,,,TRUE,62445,0,TRUE,TRUE,62445,accuracy,,https://arxiv.org/pdf/1704.04683.pdff,neutral,,race-high,qa/multiple-choice qa,,knowledge-neutral reading comprehension,,Lai et al. 2017 +race,middle,QA_multiple_choice,cls,"revisit: define as comprehension, paragraph level?",GPT,,,TRUE,25421,0,TRUE,TRUE,25421,accuracy,,https://arxiv.org/pdf/1704.04683.pdf,neutral,,race-middle,qa/multiple-choice qa,,knowledge-neutral reading comprehension,,Lai et al. 2017 +sciq,,QA_multiple_choice,cls,,,TRUE,TRUE,,11679,11679,TRUE,,11679,,,,,,sciq,qa/multiple-choice qa,,,,Welbl et al. 2017 +social_i_qa,,QA_multiple_choice,cls,metric differ by prompt: 4-way classification cast as binary ,,TRUE,TRUE,TRUE,33410,33410,TRUE,TRUE,33410,accuracy,,https://arxiv.org/pdf/1904.09728.pdf,,,SIQA,qa/multiple-choice qa,,cultural knowledge,,Sap et al. 2019 +super_glue,boolq,QA_multiple_choice,cls,,,TRUE,,TRUE,9427,0,TRUE,TRUE,9427,accuracy,,https://arxiv.org/pdf/1905.10044.pdf,neutral?,,superglue-boolq,,,knowledge-? reading comprehension,, +super_glue,copa,QA_multiple_choice,cls,,,TRUE,,TRUE,400,0,TRUE,TRUE,400,accuracy,,http://commonsensereasoning.org/2011/papers/Roemmele.pdf,modest,,superglue-copa,qa/multiple-choice qa,,causal cognition,,Gordon et al. 2012 +super_glue,multirc,QA_multiple_choice,cls,F1 over all answer options. See paper p. 259 for defintion,,TRUE,,TRUE,27243,0,TRUE,TRUE,27243,f1_over_all_options;exact_match,,https://aclanthology.org/N18-1023.pdf,neutral?,,superglue-multirc,qa/multiple-choice qa,,knowledge-? reading comprehension,,Khashabi et al. 
2018 +wiki_hop,original,QA_multiple_choice,cls,,,TRUE,TRUE,,43738,43738,TRUE,,43738,,,https://transacl.org/ojs/index.php/tacl/article/viewFile/1325/299,,,WikiHop (Welbl et al. 2018),multi-hop QA,,,Wikipedia KB, +wiqa,,QA_multiple_choice,cls,,,TRUE,TRUE,,29808,29808,TRUE,,29808,,,,,,wiqa,qa/multiple-choice qa,,cause_and_effect,,Tandon et al. 2019 +circa,,QA_multiple_choice,cls,revisit: problematic prompts,,,,TRUE,34268,0,,TRUE,0,mean_multiclass_f1;accuracy,,https://arxiv.org/pdf/2010.03450.pdf,,,circa,cls/other,,pragmatics,,Louis et al. 2020 +mc_taco,,QA_multiple_choice,cls,no train set; variable number of answer_chocies; eval in paper is over set of possible candidates;,,,,TRUE,0,0,,TRUE,0,exact_match; f1_score,,https://arxiv.org/pdf/1909.03065.pdf,,,mc taco,qa/binary,,temporal cognition,,Zhou et al. 2019 +piqa,,QA_multiple_choice,cls,revisit: not just other,GPT,,,TRUE,16113,0,TRUE,,16113,accuracy,,https://arxiv.org/pdf/1911.11641.pdf,,,PIQA,Multiple choice,,physical_cognition,,Bisk et al. 2020 +amazon_polarity,,sentiment,cls,,,TRUE,TRUE,,3600000,500000,TRUE,,500000,,,https://cs.stanford.edu/people/jure/pubs/reviews-recsys13.pdf,,,amazon polarity,cls/sentiment analysis,,,,McAuley and Leskovec 2013 +app_reviews,,sentiment,cls,,,TRUE,TRUE,,288065,288065,TRUE,,288065,,,,,,app reviews,other/regression,,,,Missing +imdb,,sentiment,cls,,,TRUE,TRUE,,25000,25000,TRUE,,25000,,,,,,imdb,cls/sentiment analysis,,no dev set,,Maas et al. 2011 +rotten_tomatoes,,sentiment,cls,,,TRUE,TRUE,,8530,8530,TRUE,,8530,,,,,,rotten tomatoes,cls/sentiment analysis,,,,Pang and Lee 2005 +yelp_review_full,,sentiment,cls,no dev set,,TRUE,TRUE,,650000,500000,TRUE,,500000,,,,,,yelp review full,other/regression,,,,Zhang et al. 2015; (link) +lambada,,story_completion,gen,revisit: story or cloze or coref? trivial cloze prompt; training set is just unlabeled corpora; GPT task,GPT,,,TRUE,0,0,,TRUE,0,accuracy;perplexity;median_rank,,https://arxiv.org/pdf/1606.06031.pdf,,,,,,,, +craffel/openai_lambada,,story_completion,gen,revisit: story or cloze or coref? trivial cloze prompt; training set is just unlabeled corpora; GPT task,GPT,,,TRUE,0,0,,TRUE,0,accuracy;perplexity;median_rank,,https://arxiv.org/pdf/1606.06031.pdf,,,,,,,, +story_cloze,2016,story_completion,cls,todo: custom loading; swag like?,GPT,,,TRUE,,0,,TRUE,0,accuracy,,https://arxiv.org/pdf/1604.01696.pdf,,,,,,,, +hellaswag,,story_completion,cls,,GPT,,,TRUE,39905,0,TRUE,,39905,accuracy,,https://arxiv.org/pdf/1905.07830.pdf,,,hellaswag,qa/multiple-choice qa,,,,Zellers et al. 2019 +common_gen,,structure_to_text,gen,,,TRUE,TRUE,,67389,67389,TRUE,,67389,,,,,,common gen,other,,,,Lin et al. 2020b +wiki_bio,,structure_to_text,gen,,,TRUE,TRUE,,582659,500000,TRUE,,500000,,,,,,wiki bio,cg/other,,,,Lebret et al. 2016 +cnn_dailymail,3.0.0,summarization,gen,,,TRUE,TRUE,,287113,287113,TRUE,,287113,,,,,,,,,,, +gigaword,,summarization,gen,,,TRUE,TRUE,,3803957,500000,TRUE,,500000,,,,,,gigaword,cg/summarization,,,,Napoles et al. 2012 +multi_news,,summarization,gen,,CrossFit,,TRUE,,44972,44972,,,,,,,,,multi news,cg/summarization,,,,Fabbri et al. 2019 +samsum,,summarization,gen,,CrossFit,,TRUE,,14732,14732,,,,,,,,,samsum,cg/summarization,,,,Gliwa et al. 2019 +xsum,,summarization,gen,,,TRUE,TRUE,TRUE,204045,204045,TRUE,TRUE,204045,rouge,,https://arxiv.org/pdf/1808.08745.pdf,,,xsum,cg/summarization,,,,Narayan et al. 
2018 +ag_news,,topic_classification,cls,,,TRUE,TRUE,,120000,120000,TRUE,,120000,,,http://groups.di.unipi.it/~gulli/AG_corpus_of_news_articles.html,,,ag news,cls/topic,,,,Gulli (link) +dbpedia_14,,topic_classification,cls,,,TRUE,TRUE,,560000,500000,TRUE,,500000,,,https://svn.aksw.org/papers/2013/SWJ_DBpedia/public.pdf,,,dbpedia 14,cls/topic,,,,Lehmann et al. 2015 +trec,,topic_classification,cls,,,TRUE,TRUE,,5452,5452,TRUE,,5452,,,https://trec.nist.gov/data/qa.html,,,trec,cls/other,,,,Li and Roth 2002; Hovy et al. 2001 +super_glue,wic,word_sense_disambiguation,cls,,,TRUE,,TRUE,5428,0,TRUE,TRUE,5428,accuracy,,https://arxiv.org/pdf/1808.09121.pdf,,,superglue-wic,cls/other,,lexical_knowledge,,Pilehvar and Camacho-Collados 2019 +Staging Area,,,,,,,,,,,,,,,,,,,,,,,, +Would Include but not in HF or some other practical limitations,,,,,,,,,,,,,,,,,,,,,,,, +definite_pronoun_resolution,,coreference,,todo: download error,,,,,,,,,,,,,,,definite pronoun resolution,other,,,,Rahman and Ng 2012 +jeopardy,,closed-book qa,gen,sporadic download error,CrossFit,,,,,,,,,,,,,promptsource download error,jeopardy,qa/closed-book qa,,,,(link) +blimp,,,cls,no prompts yet; collapse subsets,,,,,,0,,,0,,,,,,,,,,, +Hendrycks et al. 2021,,,,https://arxiv.org/abs/2009.03300v3,,,,,,,,,,,,,,,,,,,, +Multi-Turn Dialogue Reasoning,,,,https://aclanthology.org/2020.acl-main.130.pdf,Vania,,,,7088,,,,,,,,,,,,,,, +Argument Reasoning Comprehension Task,,,,https://aclanthology.org/N18-1175.pdf,Vania,,,,1211,,,,,,,,,,,,,,, +MCScript,,,,https://aclanthology.org/L18-1564.pdf,Vania,,,,14191,,,,,,,,,,,,,,, +narrativeqa,,,,very long input sequence,,,,,,,,,,,,,,skip for experiment D3: very long input sequence,NarQA,Abstractive QA,,,, +newsqa,,,,download error,TaskEmbed,,,,,,,,,,,,,promptsource download error,NewsQA,Extractive QA,,,,Trischler et al. 2017 +eli5,,,,dataset split error,CrossFit,,,,,,,,,,,https://facebookresearch.github.io/ELI5/explore.html,,skip: HF datasets error the split field is used for subsets,eli5-askh,qa/long-form qa,,possibly knowledge-neutral,,Fan et al. 2019 +Maybe Reconsider,,,,,,,,,,,,,,,,,,,,,,,, +zest,,,,its original task is quite complex (need to provide a decision function); should be held-out eval only,self,,,,,,,,,,,,,,,,,,, +swag,,story_completion,cls,revisit whether this should be considered as a variant of NLI,,,,,73546,0,TRUE,,73546,,,,,,swag,qa/multiple-choice qa,,,,Zellers et al. 2018 +codah,codah,story_completion,cls,a variant of swag revisit whether this should be considered as a variant of NLI,,,,,2776,0,TRUE,,2776,,,,,,codah,qa/multiple-choice qa,,,,Chen et al. 2019 +wiki_auto,,,,revisit: lots of duplicate simplified text; novel generative task could be very challenging,CrossFit,,,,,,,,,,,,,no prompt yet,wiki auto,cls/other,,text simplification,,Jiang et al. 2020 +proto_qa,,,gen,"generate prototypical concepts, kinda niche format with multiple correct answers",CrossFit,,,,,,,,,,,,,no prompt yet,proto qa,other,,,,Boratko et al. 2020 +empathetic_dialogues,,,,generation? classification?,CrossFit,,,,,,,,,,,https://arxiv.org/pdf/1811.00207.pdf,,no prompt yet,empathetic dialogues,cg/dialogue,,,,Rashkin et al. 2019 +qed,,,,uses held-out Natural Questions,,,,,,,,,,,,,,,,,,,, +kilt_tasks,aidayago2,,,,,,,,,,,,,,,,,no prompt yet,kilt ay2,other/entity linking,,encyclopedia,,Hoffart et al. 2011 +kilt_tasks,wow,,,,,,,,,,,,,,,,,no prompt yet,kilt wow,cg/dialogue,,encyclopedia,,Dinan et al. 2019 +lama,conceptnet,,,,,,,,,,,,,,,,,no prompt yet,lama-conceptnet,qa/closed-book qa,,encyclopedia,,Petroni et al. 
2019 2020 +lama,google_re,,,,,,,,,,,,,,,,,no prompt yet,lama-google re,qa/closed-book qa,,encyclopedia,,Petroni et al. 2019 2020 +lama,squad,,,,,,,,,,,,,,,,,no prompt yet,lama-squad,qa/closed-book qa,,encyclopedia,,Petroni et al. 2019 2020 +lama,trex,,,,,,,,,,,,,,,,,no prompt yet,lama-trex,qa/closed-book qa,,encyclopedia,,Petroni et al. 2019 2020 +limit,,physical cognition,,,,,,,,,,,,,,https://aclanthology.org/2020.findings-emnlp.88.pdf,,label errors in dataset itself? also no validation set otherwise well motivated by semantic theories,limit,other,,physical semantic repr.,,Manotas et al. 2020 +kilt_tasks,fever,,,revisit whether this should be considered as a variant of NLI,,,,,,,,,,,,,,temporary skip: prompts available in non-benchmark standalone dataset,kilt fever,cls/fact checking,,encyclopedia,,Thorne et al. 2018 +Skipped,,,,,,,,,,,,,,,,,,,,,,,, +fever,v2.0,closed-book qa/fact checking,,also in KILT,,,,,,,,,,,,,,skip: awkward prompts as closed-book qa,FEVER,,,,, +hotpot_qa,distractor,,,also in KILT,,,,,,,,,,,,,,skip for experiment D3: very long input sequence,Hotpot QA,,,,, +hotpot_qa,fullwiki,,,also in KILT,,,,,,,,,,,,,,skip for experiment D3: very long input sequence,Hotpot QA,,,,, +emo,,sentiment,cls,skip: offensive and ungrammatical text,,merged,,,30160,0,TRUE,TRUE,30160,precision;recall;F1,,https://aclanthology.org/S19-2005.pdf,,skip: offensive and ungrammatical text,emo,cls/emotion,,,,Chatterjee et al. 2019 +freebase_qa,,QA_closed_book,gen,"need to be held out because web_questions is ""supposed to be answerable by Freebase""",,,,,20358,0,TRUE,,20358,,,,intensive,,freebase qa,qa/closed-book qa,,,,Jiang et al. 2019 +aqua_rat,,,,,,,,,,,,,,,,https://arxiv.org/abs/1705.04146,,skip: nontrivial math,aqua rat,qa/multiple-choice qa,,nontrivial math,,Ling et al. 2017 +math_qa,,,,,,,,,,,,,,,,,,skip: nontrivial math,math qa,qa/multiple-choice qa,,nontrivial math,,Amini et al. 2019 +numer_sense,,,,,,,,,,,,,,,,,,skip: closed-book trivia ,numer sense,qa/closed-book qa,,numerical knowledge,,Lin et al. 2020a +squad_adversarial,,,,,,,,,,,,,,,,,,validation set only,,,,,, +squadshifts,,,,,,,,,,,,,,,,,,test set only,,,,,, +sms_spam,,,,,,,,,,,,,,,,,,skip: unclean corpus and likely harmful content,sms spam,cls/other,,,,Almeida et al. 2011 +search_qa,,,,,,,,,,,,,,,,,,skip: seems like a very unclean corpus,search qa,qa/closed-book qa,,,,Dunn et al. 2017 +kilt_tasks,trex,,,,,,,,,,,,,,,,,skip: non-natural language,kilt trex,qa/closed-book qa,,encyclopedia,,Elsahar et al. 2018 +kilt_tasks,structured_zeroshot,,,,,,,,,,,,,,,,,skip: non-natural language,kilt zsre,qa/closed-book qa,,encyclopedia,,Levy et al. 2017 +spider,,,,,,,,,,,,,,,,,,skip: non-natural language,spider,cg/other,,,,Yu et al. 2018 +wikisql,,,,,,,,,,,,,,,,,,skip: non-natural language,wikisql,cg/other,,,,Zhong et al. 2017 +com_qa,,,,,CrossFit,,,,,,,,,,,https://arxiv.org/pdf/1809.09528.pdf,,skip: non-human language: URL,ComQA (Abujabal et al. 2019),factoid QA w/ paraphrases,,,snippets WikiAnswers, +climate_fever,,,,revisit whether this should be considered as a variant of NLI,,,,,,,,,,,,,,skip: no train set,climate fever,cls/fact checking,,,,Diggelmann et al. 2020 +art,,,,,,,,,,,,,,,,https://arxiv.org/pdf/1908.05739.pdf,,skip: NLI reserved for generalization studies (although this one is not a traditionally defined NLI),art (abductive nli),other,,,,Bhagavatula et al. 2020 +glue,mnli,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,glue-mnli,cls/nli,,,,Williams et al. 
2018 +glue,qnli,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,glue-qnli,cls/nli,,,,Rajpurkar et al. 2016 +glue,rte,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,glue-rte,cls/nli,,,,Dagan et al. 2005; Bar-Haim et al. 2006 Giampiccolo et al. 2007; Bentivogli et al. 2009 +glue,wnli,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,glue-wnli,cls/nli,,,,Levesque et al. 2012 +,,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,scitail,cls/nli,,,,Khot et al. 2018 +,,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,sick,cls/nli,,,,Marelli et al. 2014 +,,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,SNLI (Bowman et al. 2015),NLI,,,misc., +aeslc,,,,summarization by email subject line,,,,,,,,,,,,https://arxiv.org/abs/1906.03497,,skip: niche task,aeslc,cg/summarization,,generation,,Zhang and Tetreault 2019 +onestop_english,,,,,,,,,,,,,,,,https://aclanthology.org/W18-0535.pdf,,skip: niche task: classify curriculum diffculty,onestop english,cls/other,,,,Vajjala and Luˇci´c 2018 +mocha,,,,,,,,,,,,,,,,,,skip: model generated text,mocha,other/regression,,,,Chen et al. 2020a +commonsense_qa,,,,duplicate with cos_e,Vania,,,,9741,,,,,,,https://arxiv.org/pdf/1811.00937.pdf,,,Commonsense QA,qa/multiple-choice qa,,,,Talmor et al. 2019 +,,,,,,,,,,,,,,,,,,skip: maybe harmful content from Twitter,emotion,cls/emotion,,,,Saravia et al. 2018 +,,,,the authors themselves seem to have renounced their own work,,,,,,,,,,,,https://github.com/nyu-mll/crows-pairs,,skip: harmful content,crows pairs,other,,,,Nangia et al. 2020 +,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-directed vs generalized,cls/hate speech detection,,,,Mollas et al. 2020 +,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-disability,cls/hate speech detection,,,,Mollas et al. 2020 +,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-gender,cls/hate speech detection,,,,Mollas et al. 2020 +,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-national origin,cls/hate speech detection,,,,Mollas et al. 2020 +,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-race,cls/hate speech detection,,,,Mollas et al. 2020 +,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-religion,cls/hate speech detection,,,,Mollas et al. 2020 +,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-sexual orientation,cls/hate speech detection,,,,Mollas et al. 2020 +,,,,,,,,,,,,,,,,,,skip: harmful content,hate speech offensive,cls/hate speech detection,,,,Davidson et al. 2017 +,,,,,,,,,,,,,,,,,,skip: harmful content,hate speech18,cls/hate speech detection,,,,de Gibert et al. 2018 +,,,,,,,,,,,,,,,,,,skip: harmful content,hatexplain,cls/hate speech detection,,,,Mathew et al. 2020 +,,,,,,,,,,,,,,,,,,skip: harmful content,reddit tifu-title,cg/summarization,,,,Kim et al. 2019 +,,,,,,,,,,,,,,,,,,skip: harmful content,reddit tifu-tldr,cg/summarization,,,,Kim et al. 2019 +,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-emoji,cls/emotion,,,,Barbieri et al. 2020 +,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-emotion,cls/emotion,,,,Barbieri et al. 2020 +,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-hate,cls/emotion,,,,Barbieri et al. 2020 +,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-irony,cls/emotion,,,,Barbieri et al. 2020 +,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-offensive,cls/emotion,,,,Barbieri et al. 2020 +,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-sentiment,cls/emotion,,,,Barbieri et al. 
2020 +,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-stance abortion,cls/emotion,,,,Barbieri et al. 2020 +,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-stance atheism,cls/emotion,,,,Barbieri et al. 2020 +,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-stance climate,cls/emotion,,,,Barbieri et al. 2020 +,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-stance feminist,cls/emotion,,,,Barbieri et al. 2020 +,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-stance hillary,cls/emotion,,,,Barbieri et al. 2020 +,,,,,,,,,,,,,,,,,,skip: harmful content,tweet qa,qa/machine reading comprehension,,,,Xiong et al. 2019 +yelp_polarity,,,,,,,,,,,,,,,,,,skip: duplicate with yelp_review_full,yelp polarity,cls/sentiment analysis,,,,Zhang et al. 2015; (link) +quora,,,,,,,,,,,,,,,,https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs,,skip: duplicate under GLUE,QQP,paraphrase identification,,,social QA,Iyer et al. 2017 +squad,,,,,,,,,,,,,,,,,,skip: duplicate under Squad 2.0,SQuAD 1.1,Extractive QA,,,, +yahoo_answers_topics,,,,,,,,,,,,,,,,,,skip for early experiments: unclean corpus,yahoo answers topics,cls/topic,,,,(link) +tab_fact,,,,,,,,,,,,,,,,,,skip for early experiments: tabular data,tab fact,cls/fact checking,,,,Chen et al. 2020b +,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-anaphor gender agreement,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020 +,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-anaphor number agreement,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020 +,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-determiner noun agreement with adj irregular 1,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020 +,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-ellipsis n bar 1,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020 +,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-ellipsis n bar 2,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020 +,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-existential there quantifiers 1,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020 +,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-irregular past participle adjectives,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020 +,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-sentential negation npi licensor present,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020 +,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-sentential negation npi scope,other/linguistic phenomenon,,syntax,,Warstadt et al. 
2020 +,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-wh questions object gap,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020 +poem_sentiment,,,,,,,,,,,,,,,,,,skip for early experiments: poetry domain,poem sentiment,cls/sentiment analysis,,creativity,,Sheng and Uthus 2020 +acronym_identification,,,,,,,,,,,,,,,,https://arxiv.org/pdf/2010.14678.pdf,,skip for early experiments: niche/hard task,acronym identification,other,,,,Pouran Ben Veyseh et al. 2020 +google_wellformed_query,,,,revisit whether to exclude fine-grain regression tasks,,,,,,,,,,,,,,skip for early experiments: niche/hard task,google wellformed query,cls/other,,,,Faruqui and Das 2018 +liar,,,,revisit whether to exclude fine-grain regression tasks,,,,,,,,,,,,,,skip for early experiments: niche/hard task,liar,cls/fact checking,,,,Wang 2017 +,,,,,,,,,,,,,,,,,,skip for early experiments: niche/hard task,break-QDMR-high-level,other,,semantic representation,,Wolfson et al. 2020 +,,,,,,,,,,,,,,,,,,skip for early experiments: niche/hard task,crawl domain,other,,,,Zhang et al. 2020 +discovery,discovery,,,,,,,,,,,,,,,,,skip for early experiments: niche task no cannonical answer,discovery,cls/other,,generative-ish,,Sileo et al. 2019 +wiki_split,,,,,,,,,,,,,,,,,,skip for early experiments: niche task,wiki split,cg/other,,,,Botha et al. 2018 +,,,,,,,,,,,,,,,,,,skip for early experiments: multilingual,aslg pc12,other,,,,Othman and Jemni 2012 +,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,CCG (Hockenmaier and Steedman 2007),CCG supertagging,,syntax,Penn Treebank, +,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,Chunk (Tjong Kim Sang and Buchholz 2000),syntactic chunking,,syntax,Penn Treebank, +,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,Conj (Ficler and Goldberg 2016),conjunct identification,,syntax,Penn Treebank, +,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,GED (Yannakoudakis et al. 2011),grammatical error detection,,syntax,misc., +,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,GGParent (Liu et al. 2019a),syntactic tagging,,syntax,Penn Treebank, +,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,GParent (Liu et al. 2019a),syntactic tagging,,syntax,Penn Treebank, +,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,NER (Tjong Kim Sang and De Meulder 2003),named entity recognition,,,news, +,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,Parent (Liu et al. 2019a),syntactic tagging,,syntax; constituency,Penn Treebank, +,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,POS-EWT (Silveira et al. 2014),part-of-speech tagging,,syntax,Web Treebank, +,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,POS-PTB (Marcus et al. 
1993),part-of-speech tagging,,syntax,Penn Treebank, +,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,ST (Bjerva et al. 2016),semantic tagging,,,Groningen Meaning Bank, +financial_phrasebank,,,,,,,,,,,,,,,,,,skip for early experiments: financial domain,financial phrasebank,cls/sentiment analysis,,,,Malo et al. 2014 +health_fact,,,,,,,,,,,,,,,,,,skip for early experiments: biomedical domain,health fact,cls/fact checking,,,,Kotonya and Toni 2020 +,,,,,,,,,,,,,,,,http://www.sciencedirect.com/science/article/pii/S1532046412000615,,skip for early experiments: biomedical domain,ade corpus v2-classification,cls/other,,,,Gurulingappa et al. 2012 +,,,,,,,,,,,,,,,,,,skip for early experiments: biomedical domain,ade corpus v2-dosage,other/slot filling,,,,Gurulingappa et al. 2012 +,,,,,,,,,,,,,,,,,,skip for early experiments: biomedical domain,ade corpus v2-effect,other/slot filling,,,,Gurulingappa et al. 2012 +,,,,,,,,,,,,,,,,,,skip for early experiments: biomedical domain,biomrc,qa/machine reading comprehension,,,,Pappas et al. 2020 +,,,,,,,,,,,,,,,,,,skip for early experiments: biomedical domain,medical questions pairs,cls/paraphrase,,,,McCreery et al. 2020 +scicite,,,,,,,,,,,,,,,,,,skip for early experiments: academic domain + niche/hard task,scicite,cls/other,,,,Cohan et al. 2019 +,,,,,,,,,,,,,,,,,,skip for early experiments: abstract semantic representations,break-QDMR,other,,logical form,,Wolfson et al. 2020 +,,,,,,,,,,,,,,,,,,skip for early experiments: abstract semantic representations,e2e nlg cleaned,other,,,,Duˇsek et al. 2020 2019 +glue,sst2,,,,,,,,,,,,,,,,,revisit: very short and often ill-formed movie reviews,glue-sst2,cls/sentiment analysis,,,,Socher et al. 2013 +glue,stsb,fine-grain regression,,,,,,,,,,,,,,,,revisit whether to exclude fine-grain regression tasks,glue-stsb,semantic similarity,,,misc., +,,,,,,,,,,,,,,,,,,double check: subset missing from HF datasets,squad-no context,qa/closed-book qa,,,,Rajpurkar et al. 2016 +,,,,,,,,,,,,,,,,,,double check: subset missing from HF datasets,squad-with context,qa/machine reading comprehension,,,,Rajpurkar et al. 2016 +,,,,contrast sets,,,,,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,double check: missing from HF datasets,BoolQ-CS,Binary yes/no,,,, +,,,,,,,,,,,,,,,,https://aclanthology.org/C16-1236.pdf,,double check: missing from HF datasets,CQ (Bao et al. 2016),knowledge-based QA,,,snippets web queries/KB, +,,,,contrast sets,,,,,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,double check: missing from HF datasets,DROP-CS,Abstractive QA,,,, +,,,,,,,,,,,,,,,,https://aclanthology.org/D13-1020.pdf,,double check: missing from HF datasets,MCTest,Multiple choice,,,, +,,,,,,,,,,,,,,,,,,double check: missing from HF datasets,MRPC (Dolan and Brockett 2005),paraphrase identification,,,news, +,,,,"""naturally perturbed"" version of BoolQ",,,,,,,,,,,,https://arxiv.org/pdf/2004.04849.pdf,,double check: missing from HF datasets,NP-BoolQ,Binary yes/no,,,, +,,,,,,,,,,,,,,,,https://aclanthology.org/D19-1608.pdf,,double check: missing from HF datasets,quartz-no knowledge,qa/multiple-choice qa,,,,Tafjord et al. 
2019b
+,,,,contrast sets,,,,,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,double check: missing from HF datasets,Quoref-CS,Extractive QA,,,,
+,,,,contrast sets,,,,,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,double check: missing from HF datasets,ROPES-CS,Extractive QA,,,,
diff --git a/t0/seqio_tasks/tasks.py b/t0/seqio_tasks/tasks.py
new file mode 100644
index 0000000..2cec8f5
--- /dev/null
+++ b/t0/seqio_tasks/tasks.py
@@ -0,0 +1,421 @@
+import csv
+import functools
+from typing import Dict, List, Optional, Tuple
+
+import datasets
+import pkg_resources
+import seqio
+import t5
+import tensorflow as tf
+from t5.data.glue_utils import get_glue_metric, get_super_glue_metric
+from t5.evaluation import metrics as mt
+
+import promptsource.templates
+from t0.seqio_tasks import utils
+
+
+GET_METRICS = {
+    "BLEU": mt.bleu,
+    "ROUGE": mt.rouge,
+    "Span Squad": mt.span_squad,
+    "Squad": mt.squad,
+    "Trivia QA": mt.trivia_qa,
+    "Accuracy": mt.accuracy,
+    "Sequence Accuracy": mt.sequence_accuracy,
+    "Pearson Correlation": mt.pearson_corrcoef,
+    "Spearman Correlation": mt.spearman_corrcoef,
+    "MultiRC": mt.multirc_f1_over_all_answers,
+    "AUC": mt.auc,
+    "COQA F1": mt.coqa_f1,
+    "Edit Distance": mt.edit_distance,
+    # "Mean Reciprocal Rank": mt.accuracy,  # NOTE not in T5?
+    "Other": mt.accuracy,
+    # Missing support for mean_multiclass_f1 etc. which need a num_classes parameter
+}
+
+MAX_EXAMPLES_PER_DATASET = 500_000
+
+
+def strip_whitespace(output_or_target, example=None, is_target=False):
+    """Cached tasks from promptsource all have a leading space on the ground-truth targets."""
+    return output_or_target.strip()
+
+
+def maybe_get_class_id_postprocessor(template):
+    if template.get_fixed_answer_choices_list():
+
+        def postprocess_fn(output_or_target, example=None, is_target=False):
+            output_or_target = strip_whitespace(output_or_target)
+            return t5.data.postprocessors.string_label_to_class_id(
+                output_or_target, label_classes=template.get_fixed_answer_choices_list()
+            )
+
+        return postprocess_fn
+
+    else:
+        return strip_whitespace
+
+
+def get_tf_dataset(split, shuffle_files, seed, dataset_name, subset_name, template, split_mapping):
+    # HF datasets does not support file-level shuffling
+    del shuffle_files, seed
+    dataset = datasets.load_dataset(dataset_name, subset_name)
+    dataset = dataset[split_mapping[split]]
+    dataset = utils.apply_template(dataset, template)
+    return utils.hf_dataset_to_tf_dataset(dataset)
+
+
+def add_task(dataset_name, subset_name, template_name, task_name=None, split_mapping=None):
+    template = all_templates.get_dataset(dataset_name, subset_name)[template_name]
+    task_name = task_name or utils.get_task_name(dataset_name, subset_name, template_name)
+
+    if dataset_name == "glue":
+        metrics = get_glue_metric(subset_name)
+    elif dataset_name == "super_glue":
+        if subset_name in ("wsc.fixed", "multirc"):
+            # TODO: WSC and MultiRC need special pre/postprocessing
+            metrics = [mt.accuracy]
+        else:
+            metrics = get_super_glue_metric(subset_name)
+    else:
+        # TODO what if metric is null?
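+        # Metric names are declared per template in promptsource
+        # (template.metadata.metrics) and mapped to T5 metric functions via
+        # GET_METRICS above; a name missing from that mapping raises a KeyError.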
+        metrics = [GET_METRICS[m] for m in template.metadata.metrics]
+
+    dataset_splits = utils.get_dataset_splits(dataset_name, subset_name)
+    split_mapping = split_mapping or {k: k for k in dataset_splits.keys()}
+
+    dataset_fn = functools.partial(
+        get_tf_dataset,
+        seed=None,
+        dataset_name=dataset_name,
+        subset_name=subset_name,
+        template=template,
+        split_mapping=split_mapping,
+    )
+    data_source = seqio.FunctionDataSource(
+        dataset_fn,
+        splits=list(split_mapping.keys()),
+        num_input_examples={s: dataset_splits[split_mapping[s]].num_examples for s in split_mapping.keys()},
+    )
+    output_features = {
+        "inputs": seqio.Feature(t5.data.get_default_vocabulary(), add_eos=False, dtype=tf.int32),
+        "targets": seqio.Feature(t5.data.get_default_vocabulary(), add_eos=True, dtype=tf.int32),
+    }
+    preprocessors = [
+        seqio.preprocessors.tokenize,
+        seqio.preprocessors.append_eos,
+        seqio.CacheDatasetPlaceholder(required=False),
+    ]
+
+    # Add train and normal eval tasks
+    seqio.TaskRegistry.add(
+        task_name,
+        data_source,
+        preprocessors=preprocessors,
+        output_features=output_features,
+        metric_fns=metrics,
+        postprocess_fn=maybe_get_class_id_postprocessor(template),
+    )
+
+    # Add rank classification eval task
+    if template.answer_choices:
+        rank_classification_preprocessor = functools.partial(
+            t5.data.preprocessors.rank_classification,
+            inputs_fn=lambda ex: tf.fill((len(ex["answer_choices"]),), ex["inputs"]),
+            targets_fn=lambda ex: ex["answer_choices"],
+            is_correct_fn=lambda ex: tf.equal(ex["answer_choices"], tf.strings.strip(ex["targets"])),
+            weight_fn=lambda ex: 1.0,
+        )
+
+        fixed_choices = template.get_fixed_answer_choices_list()
+        num_classes = len(fixed_choices) if fixed_choices else None
+        seqio.TaskRegistry.add(
+            task_name + "_score_eval",
+            data_source,
+            preprocessors=[rank_classification_preprocessor] + preprocessors,
+            output_features=output_features,
+            metric_fns=[functools.partial(t5.evaluation.metrics.rank_classification, num_classes=num_classes)],
+            postprocess_fn=t5.data.postprocessors.rank_classification,
+        )
+
+
+dataset_subset_tuple = Tuple[str, Optional[str]]
+d4_train: List[dataset_subset_tuple] = []
+d4_eval: List[dataset_subset_tuple] = []
+d3_train_gpt: List[dataset_subset_tuple] = []
+d3_train_sglue: List[dataset_subset_tuple] = []
+bias_fairness_eval: List[dataset_subset_tuple] = []
+gsheet: Dict[dataset_subset_tuple, Dict] = {}
+experiment_path = pkg_resources.resource_filename(__name__, "experiment_D4.csv")
+with open(experiment_path) as exp_file:
+    reader = csv.DictReader(exp_file)
+    for row in reader:
+        if row["skip"]:
+            continue
+        if row["subset"] == "":
+            row["subset"] = None  # to match promptsource.Template object
+        dataset_subset = (row["HF_name"], row["subset"])
+        if row["do_train"] == "TRUE":
+            d4_train.append(dataset_subset)
+        if row["do_eval"] == "TRUE":
+            d4_eval.append(dataset_subset)
+        if row["D3_do_train"] == "TRUE" and "GPT" in row["seed_paper"]:
+            d3_train_gpt.append(dataset_subset)
+        if row["D3_do_train"] == "TRUE" and row["HF_name"] == "super_glue":
+            d3_train_sglue.append(dataset_subset)
+        if (
+            row["do_eval"] == "TRUE"
+            and row["task_by_convention"] == "bias_and_fairness"
+            and row["HF_name"] != "winogender"
+        ):
+            bias_fairness_eval.append(dataset_subset)
+        gsheet[dataset_subset] = row
+all_datasets = d4_train + d4_eval + d3_train_gpt + d3_train_sglue + bias_fairness_eval
+
+all_templates = promptsource.templates.TemplateCollection()
+all_templates.remove("anli")  # Need to special-case ANLI due to weird split conventions
+
+# 3
stages of training/ablation: D4 -> GPT -> SuperGLUE +d4_train_mixture: List[str] = [] # strings are dataset_subset_template +gpt_train_mixture: List[str] = [] +sglue_train_mixture: List[str] = [] +d4_eval_mixture: List[str] = [] +bias_fairness_eval_mixture: List[str] = [] +mixture_cap: Dict[str, int] = {} +single_original_task: Dict[Tuple[str, str], str] = {} +all_original_tasks: List[str] = [] +for dataset_name, subset_name in all_templates.keys: + if (dataset_name, subset_name) not in all_datasets: + all_templates.remove(dataset_name, subset_name) + continue + + dataset = all_templates.get_dataset(dataset_name, subset_name) + num_templates = len(dataset.all_template_names) + train_size = gsheet[(dataset_name, subset_name)]["train_size"] + if train_size == "": + train_size = 0 + else: + train_size = int(train_size) + if train_size > MAX_EXAMPLES_PER_DATASET: + cap = MAX_EXAMPLES_PER_DATASET // num_templates + else: + cap = train_size + for template_name in dataset.all_template_names: + add_task(dataset_name, subset_name, template_name) + + template = dataset[template_name] + + task_name = utils.get_task_name(dataset_name, subset_name, template_name) + + if (dataset_name, subset_name) not in single_original_task and template.metadata.original_task: + single_original_task[(dataset_name, subset_name)] = task_name + + if template.metadata.original_task: + all_original_tasks.append(task_name) + + if (dataset_name, subset_name) in d4_train: + d4_train_mixture.append(task_name) + mixture_cap[task_name] = cap + if (dataset_name, subset_name) in d3_train_gpt: + gpt_train_mixture.append(task_name) + mixture_cap[task_name] = cap + if (dataset_name, subset_name) in d3_train_sglue: + sglue_train_mixture.append(task_name) + mixture_cap[task_name] = cap + if (dataset_name, subset_name) in d4_eval: + if template.metadata.original_task: + d4_eval_mixture.append(task_name) + # TODO use template.metadata.answer_choices here for rank eval + if (dataset_name, subset_name) in bias_fairness_eval: + bias_fairness_eval_mixture.append(task_name) + +# Special case for ANLI, which has weirdly-named splits and rounds that should be subsets +dataset_name, subset_name = ("anli", None) +dataset = all_templates.get_dataset(dataset_name, subset_name) +for anli_round in ("r1", "r2", "r3"): + for template_name in all_templates.get_dataset(dataset_name, subset_name).all_template_names: + task_name = utils.get_task_name(dataset_name, subset_name, template_name) + f"_{anli_round}" + split_mapping = { + "train": f"train_{anli_round}", + "validation": f"dev_{anli_round}", + "test": f"test_{anli_round}", + } + add_task(dataset_name, subset_name, template_name, task_name, split_mapping) + + template = dataset[template_name] + if template.metadata.original_task: + d4_eval_mixture.append(task_name) # TODO or add to ANLI special mixture + # TODO use template.metadata.answer_choices here for rank eval + + +TASK_BLACKLIST = [ + # Tasks which often tokenize to > 1024 tokens currently + "hotpot_qa_distractor_Generate_Explanations", + "hotpot_qa_fullwiki_Generate_Explanations", + "hotpot_qa_distractor_Generate_Answer_and_Explanations", + "hotpot_qa_fullwiki_Generate_Answer_and_Explanations", + "hotpot_qa_fullwiki_Generate_Answer", + "hotpot_qa_distractor_Generate_Answer", + "hotpot_qa_distractor_Generate_Title_2", + "hotpot_qa_fullwiki_Generate_Title_2", + "hotpot_qa_fullwiki_Generate_Title_1", + "hotpot_qa_distractor_Generate_Title_1", + "hotpot_qa_distractor_Generate_Question", + "hotpot_qa_fullwiki_Generate_Question", + 
"tab_fact_tab_fact_tab_fact_3", + "tab_fact_tab_fact_tab_fact_2", + "tab_fact_tab_fact_tab_fact_1", + "tab_fact_tab_fact_tab_fact_7", + "tab_fact_tab_fact_tab_fact_4", + "tab_fact_tab_fact_tab_fact_5", + "tab_fact_tab_fact_tab_fact_6", + "wiki_hop_masked_Choose_Best_Object_Candidate", + "wiki_hop_masked_Indirect_Question_about_Birthplace_Citizenship_Place_of_Death", + "narrativeqa_Template_05", + "ecthr_cases_alleged_violation_prediction_silver_rationales", + # Tasks with broken cached files + "gigaword_summarize_", +] + +# Tasks that failed caching (won't try to fix them for now) - remove when we are done +D4_TRAIN_SCORE_EVAL_TASK_BLACKLIST = [ + "amazon_polarity_Is_this_product_review_positive_score_eval", + "amazon_polarity_Is_this_review_negative_score_eval", + "amazon_polarity_Is_this_review_score_eval", + "amazon_polarity_User_recommend_this_product_score_eval", + "amazon_polarity_convey_negative_or_positive_sentiment_score_eval", + "amazon_polarity_flattering_or_not_score_eval", + "amazon_polarity_negative_or_positive_tone_score_eval", + "amazon_polarity_user_satisfied_score_eval", + "amazon_polarity_would_you_buy_score_eval", + "dbpedia_14_given_a_choice_of_categories__score_eval", + "dbpedia_14_given_list_what_category_does_the_paragraph_belong_to_score_eval", + "dbpedia_14_pick_one_category_for_the_following_text_score_eval", + "wiki_hop_original_choose_best_object_affirmative_1_score_eval", + "wiki_hop_original_choose_best_object_affirmative_2_score_eval", + "wiki_hop_original_choose_best_object_affirmative_3_score_eval", + "wiki_hop_original_choose_best_object_interrogative_1_score_eval", + "wiki_hop_original_choose_best_object_interrogative_2_score_eval", +] + +seqio.MixtureRegistry.add( + "d4_train", + [task for task in d4_train_mixture if task not in TASK_BLACKLIST], + default_rate=lambda t: mixture_cap[t.name], +) + +# seqio.MixtureRegistry.add( +# "gpt_train", +# [task for task in gpt_train_mixture if task not in TASK_BLACKLIST], +# default_rate=lambda t: mixture_cap[t.name], +# ) +# +# seqio.MixtureRegistry.add( +# "sglue_train", +# [task for task in sglue_train_mixture if task not in TASK_BLACKLIST], +# default_rate=lambda t: mixture_cap[t.name], +# ) + +seqio.MixtureRegistry.add( + "d4_gpt_train", + [task for task in d4_train_mixture + gpt_train_mixture if task not in TASK_BLACKLIST], + default_rate=lambda t: mixture_cap[t.name], +) + +seqio.MixtureRegistry.add( + "d4_gpt_sglue_train", + [task for task in d4_train_mixture + gpt_train_mixture + sglue_train_mixture if task not in TASK_BLACKLIST], + default_rate=lambda t: mixture_cap[t.name], +) + +# seqio.MixtureRegistry.add( +# "d4_eval", +# [task for task in d4_eval_mixture if task not in TASK_BLACKLIST], +# default_rate=functools.partial(seqio.mixing_rate_num_examples, maximum=500_000), +# ) # eval mixture does not need to be capped + + +seqio.MixtureRegistry.add( + "d4_score_eval", + [ + task + for task in seqio.TaskRegistry.names() + if task.endswith("_score_eval") + and task.split("_score_eval")[0] in d4_eval_mixture + and task.split("_score_eval")[0] not in TASK_BLACKLIST + ], + default_rate=functools.partial(seqio.mixing_rate_num_examples, maximum=500_000), +) + +# # Train tasks we don't care about evaluating on +# D4_TRAIN_SKIP_EVAL = [ +# "paws_labeled_final", +# "adversarial_qa_dbidaf", +# "adversarial_qa_dbert", +# "duorc_ParaphraseRC", +# "dream", +# "amazon_polarity", +# "app_reviews", +# "imdb", +# "wiki_bio", +# "gigaword", +# "multi_news", +# "samsum", +# "dbpedia_14", +# "trec", +# ] + +# 
seqio.MixtureRegistry.add( +# "d4_train_eval", +# [ +# task +# for task in d4_train_mixture +# if task not in TASK_BLACKLIST +# and not any([skip in task for skip in D4_TRAIN_SKIP_EVAL]) +# and task in all_original_tasks +# ], +# default_rate=lambda t: mixture_cap[t.name], +# ) +# +# seqio.MixtureRegistry.add( +# "d4_train_score_eval", +# [ +# task +# for task in seqio.TaskRegistry.names() +# if task.endswith("_score_eval") +# and task.split("_score_eval")[0] in d4_train_mixture +# and task.split("_score_eval")[0] not in TASK_BLACKLIST +# and task not in D4_TRAIN_SCORE_EVAL_TASK_BLACKLIST +# and not any([skip in task for skip in D4_TRAIN_SKIP_EVAL]) +# and task.split("_score_eval")[0] in all_original_tasks +# ], +# default_rate=functools.partial(seqio.mixing_rate_num_examples, maximum=500_000), +# ) + +seqio.MixtureRegistry.add( + "d4_train_one_og_prompt", + [task for task in single_original_task.values() if task in d4_train_mixture and task not in TASK_BLACKLIST], + default_rate=lambda t: mixture_cap[t.name], +) + +seqio.MixtureRegistry.add( + "d4_train_all_og_prompts", + [task for task in all_original_tasks if task in d4_train_mixture and task not in TASK_BLACKLIST], + default_rate=lambda t: mixture_cap[t.name], +) + +# seqio.MixtureRegistry.add( +# "bias_fairness_eval", +# bias_fairness_eval_mixture, +# default_rate=functools.partial(seqio.mixing_rate_num_examples, maximum=500_000), +# ) + +seqio.MixtureRegistry.add( + "bias_fairness_eval_score_eval", + [ + task + for task in seqio.TaskRegistry.names() + if task.endswith("_score_eval") and task.split("_score_eval")[0] in bias_fairness_eval_mixture + ], + default_rate=functools.partial(seqio.mixing_rate_num_examples, maximum=500_000), +) diff --git a/t0/seqio_tasks/utils.py b/t0/seqio_tasks/utils.py new file mode 100644 index 0000000..1b4df95 --- /dev/null +++ b/t0/seqio_tasks/utils.py @@ -0,0 +1,77 @@ +import re + +import datasets +import tensorflow as tf + +import promptsource.utils + + +def feature_to_spec(feature, length=False): + if isinstance(feature, datasets.ClassLabel): + return tf.TensorSpec(shape=() if not length else (None if length == -1 else length,), dtype=tf.int64) + elif isinstance(feature, datasets.Value): + return tf.TensorSpec( + shape=() if not length else (None if length == -1 else length,), dtype=getattr(tf.dtypes, feature.dtype) + ) + elif hasattr(feature, "dtype") and hasattr(feature, "shape"): + return tf.TensorSpec(shape=feature.shape, dtype=feature.dtype) + elif isinstance(feature, datasets.Sequence): + return feature_to_spec(feature.feature, length=feature.length) + elif isinstance(feature, list): + return [feature_to_spec(f, length=length) for f in feature] + elif isinstance(feature, dict): + return {k: feature_to_spec(v, length=length) for k, v in feature.items()} + else: + raise ValueError(f"Unparseable feature type {type(feature)}") + + +def hf_dataset_to_tf_dataset(dataset): + return tf.data.Dataset.from_generator( + dataset.__iter__, output_signature={k: feature_to_spec(v) for k, v in dataset.features.items()} + ) + + +def apply_template(dataset, template): + def map_fn(ex): + ex = promptsource.utils.removeHyphen(ex) + inputs_and_targets = template.apply(ex) + answer_choices = template.get_answer_choices_list(ex) + if len(inputs_and_targets) == 2: + inputs, targets = inputs_and_targets + if targets == "": + ex = {"inputs": inputs, "targets": ""} + else: + ex = {"inputs": inputs, "targets": targets} + # When template results in an empty example, template.apply returns [""] + # Also, if the template gets 
split wrong, len can be > 2 + # We will filter these out later + else: + ex = {"inputs": "", "targets": ""} + + if answer_choices: + ex["answer_choices"] = answer_choices + + return ex + + def filter_fn(ex): + return len(ex["inputs"]) > 0 and len(ex["targets"]) > 0 + + original_columns = dataset.column_names + dataset = dataset.map(map_fn).filter(filter_fn) + # map keeps original columns, remove them + return dataset.remove_columns(set(original_columns) - {"inputs", "targets", "answer_choices"}) + + +def get_dataset_splits(dataset_name, subset_name=None): + info = datasets.get_dataset_infos(dataset_name) + subset_name = subset_name or list(info.keys())[0] + return info[subset_name].splits + + +def task_clean(text): + # Clean the text according to allowed characters for a task name + return re.sub(r"[^\w\d\._]+", "_", text) + + +def get_task_name(dataset_name, subset_name, template_name): + return task_clean(dataset_name + (f"_{subset_name}_" if subset_name is not None else "_") + template_name) From 23eb5f50893ab6ce0224c377d73a4279aeba21f0 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 4 Jan 2022 17:23:09 +0100 Subject: [PATCH 02/16] Mark all datasets that we need to keep as done=True --- t0/seqio_tasks/experiment_D4.csv | 457 +++++++++++++++---------------- t0/seqio_tasks/tasks.py | 58 ++-- 2 files changed, 256 insertions(+), 259 deletions(-) diff --git a/t0/seqio_tasks/experiment_D4.csv b/t0/seqio_tasks/experiment_D4.csv index 71c8216..206ba10 100644 --- a/t0/seqio_tasks/experiment_D4.csv +++ b/t0/seqio_tasks/experiment_D4.csv @@ -1,40 +1,39 @@ -HF_name,subset,task_by_convention,format,comment,seed_paper,september_check,do_train,do_eval,train_size,adjusted_train_size,D3_do_train,D3_do_eval,D3_adjusted_train_size,metric,multiple correct answer,Paper link,non_linguistic_knowledge,skip,Imported Task Name,imported category,input_length,_human_skill,Domain,Reference -crows_pairs,,bias_and_fairness,,test set only; authors themselves acknowledge some problems,Eval WG,,,TRUE,,,,,,,,,,,,,,,, -jigsaw_toxicity_pred,,bias_and_fairness,,current https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data ; want https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification,Eval WG,,,TRUE,,,,,,,,,,,,,,,, -super_glue,axg,bias_and_fairness,cls,test set only,Eval WG,,,TRUE,,,,,,,,,,,,,,,, -winogender,,bias_and_fairness,cls,also as axg in super_glue,Eval WG,,,TRUE,,,,,,,,,,,,,,,, -wino_bias,type1_anti,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,, -wino_bias,type2_anti,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,, -wino_bias,type1_pro,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,, -wino_bias,type2_pro,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,, -super_glue,wsc.fixed,coreference,cls,,,,,TRUE,554,0,TRUE,TRUE,554,accuracy,,https://arxiv.org/pdf/1905.00537.pdf,,,superglue-wsc,cls/other,single sentence,knowledge-? reading comprehension,,Levesque et al. 2012 -winograd_wsc,wsc273,coreference,ext,,GPT,,,TRUE,0,0,,,0,accuracy,,https://www.aaai.org/ocs/index.php/KR/KR12/paper/download/4492/4924,,,,,,,,Levesque et al. 2012 -winogrande,winogrande_xl,coreference,ext,,GPT,TRUE,,TRUE,40398,0,,,0,accuracy,,https://arxiv.org/pdf/1907.10641.pdf,,,WinoGrande,qa/multiple-choice qa,,knowledge-? reading comprehension,,Sakaguchi et al. 
2020 -winogrande,winogrande_debiased,coreference,ext,"""debiased"" = adversarially filtered",GPT,TRUE,,TRUE,9248,0,,,0,accuracy,,https://arxiv.org/pdf/1907.10641.pdf,,,WinoGrande,qa/multiple-choice qa,,knowledge-? reading comprehension,,Sakaguchi et al. 2020 -glue,cola,grammatical_acceptability,cls,includes semantic acceptability too; to be replaced by blimp,,,,TRUE,8551,0,,TRUE,0,accuracy;matthews_corrcoef,,https://arxiv.org/pdf/1805.12471.pdf,,,glue-cola,cls/other,single sentence,,,Warstadt et al. 2019 -super_glue,cb,NLI,cls,"""for multi-class F1 we compute the unweighted average of the F1 per class.""",,TRUE,,TRUE,250,0,,TRUE,0,mean_multiclass_f1;accuracy,,https://semanticsarchive.net/Archive/Tg3ZGI2M/Marneffe.pdf,,,superglue-cb,cls/nli,sentence pair,knowledge-neutral inference,,de Marneffe et al. 2019 -super_glue,rte,NLI,cls,,,TRUE,,TRUE,2490,0,,TRUE,0,accuracy,,https://arxiv.org/pdf/1905.00537.pdf,,,superglue-rte,cls/nli,sentence pair,knowledge modest inference,,Dagan et al. 2005; Bar-Haim et al. 2006 Giampiccolo et al. 2007; Bentivogli et al. 2009 -anli,,NLI,cls,"In addition to accuracy, paper also evaluates on range of relaxed/strict and matched/unmatched settings and reports F scores for different answers",,,,TRUE,162865,0,,TRUE,0,accuracy,,https://arxiv.org/abs/1910.14599,,,anli,cls/nli,sentence pair,knowledge modest inference,,Nie et al. 2020 -hans,,NLI,cls,,,TRUE,,TRUE,0,0,,TRUE,0,accuracy,,https://arxiv.org/pdf/1902.01007.pdf,,,,,sentence pair,syntax?,,McCoy et al. 2019 -super_glue,axb,NLI,cls,test set only,,TRUE,,TRUE,0,0,,,,,,,,,,,,,, -glue,mrpc,paraphrase,cls,,,,TRUE,TRUE,3668,3668,TRUE,TRUE,3668,accuracy;f1_score,,https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/I05-50025B15D.pdf,,,glue-mrpc,cls/paraphrase,,paraphrase,,Dolan and Brockett 2005 -glue,qqp,paraphrase,cls,,,,TRUE,TRUE,363846,363846,TRUE,,363846,accuracy;f1_score,,https://aclanthology.org/I05-5002.pdf,,,glue-qqp,cls/paraphrase,,,,(link) -paws,labeled_final,paraphrase,cls,,,,TRUE,,49401,49401,TRUE,,49401,,,,,,paws,cls/paraphrase,,,,Zhang et al. 2019 -ai2_arc,ARC-Challenge,QA_closed_book,cls,,GPT,,,TRUE,1119,0,TRUE,,1119,"accuracy_with_tie : For each question, a system receives 1 point if it +HF_name,subset,task_by_convention,format,comment,seed_paper,do_train,do_eval,train_size,adjusted_train_size,metric,multiple correct answer,Paper link,non_linguistic_knowledge,Imported Task Name,imported category,input_length,_human_skill,Domain,Reference,done +crows_pairs,,bias_and_fairness,,test set only; authors themselves acknowledge some problems,Eval WG,,TRUE,,,,,,,,,,,,,TRUE +jigsaw_toxicity_pred,,bias_and_fairness,,current https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data ; want https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification,Eval WG,,TRUE,,,,,,,,,,,,,TRUE +super_glue,axg,bias_and_fairness,cls,test set only,Eval WG,,TRUE,,,,,,,,,,,,,TRUE +winogender,,bias_and_fairness,cls,also as axg in super_glue,Eval WG,,TRUE,,,,,,,,,,,,, +wino_bias,type1_anti,bias_and_fairness,cls,,Eval WG,,TRUE,,,,,,,,,,,,,TRUE +wino_bias,type2_anti,bias_and_fairness,cls,,Eval WG,,TRUE,,,,,,,,,,,,,TRUE +wino_bias,type1_pro,bias_and_fairness,cls,,Eval WG,,TRUE,,,,,,,,,,,,,TRUE +wino_bias,type2_pro,bias_and_fairness,cls,,Eval WG,,TRUE,,,,,,,,,,,,,TRUE +super_glue,wsc.fixed,coreference,cls,,,SGLUE,TRUE,554,0,accuracy,,https://arxiv.org/pdf/1905.00537.pdf,,superglue-wsc,cls/other,single sentence,knowledge-? reading comprehension,,Levesque et al. 
2012,TRUE +winograd_wsc,wsc273,coreference,ext,,GPT,,TRUE,0,0,accuracy,,https://www.aaai.org/ocs/index.php/KR/KR12/paper/download/4492/4924,,,,,,,Levesque et al. 2012, +winogrande,winogrande_xl,coreference,ext,,GPT,,TRUE,40398,0,accuracy,,https://arxiv.org/pdf/1907.10641.pdf,,WinoGrande,qa/multiple-choice qa,,knowledge-? reading comprehension,,Sakaguchi et al. 2020,TRUE +glue,cola,grammatical_acceptability,cls,includes semantic acceptability too; to be replaced by blimp,,,TRUE,8551,0,accuracy;matthews_corrcoef,,https://arxiv.org/pdf/1805.12471.pdf,,glue-cola,cls/other,single sentence,,,Warstadt et al. 2019, +super_glue,cb,NLI,cls,"""for multi-class F1 we compute the unweighted average of the F1 per class.""",,,TRUE,250,0,mean_multiclass_f1;accuracy,,https://semanticsarchive.net/Archive/Tg3ZGI2M/Marneffe.pdf,,superglue-cb,cls/nli,sentence pair,knowledge-neutral inference,,de Marneffe et al. 2019,TRUE +super_glue,rte,NLI,cls,,,,TRUE,2490,0,accuracy,,https://arxiv.org/pdf/1905.00537.pdf,,superglue-rte,cls/nli,sentence pair,knowledge modest inference,,Dagan et al. 2005; Bar-Haim et al. 2006 Giampiccolo et al. 2007; Bentivogli et al. 2009,TRUE +anli,,NLI,cls,"In addition to accuracy, paper also evaluates on range of relaxed/strict and matched/unmatched settings and reports F scores for different answers",,,TRUE,162865,0,accuracy,,https://arxiv.org/abs/1910.14599,,anli,cls/nli,sentence pair,knowledge modest inference,,Nie et al. 2020,TRUE +hans,,NLI,cls,,,,TRUE,0,0,accuracy,,https://arxiv.org/pdf/1902.01007.pdf,,,,sentence pair,syntax?,,McCoy et al. 2019, +super_glue,axb,NLI,cls,test set only,,,TRUE,0,0,,,,,,,,,,, +glue,mrpc,paraphrase,cls,,,BASE,,3668,3668,accuracy;f1_score,,https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/I05-50025B15D.pdf,,glue-mrpc,cls/paraphrase,,paraphrase,,Dolan and Brockett 2005,TRUE +glue,qqp,paraphrase,cls,,,BASE,,363846,363846,accuracy;f1_score,,https://aclanthology.org/I05-5002.pdf,,glue-qqp,cls/paraphrase,,,,(link),TRUE +paws,labeled_final,paraphrase,cls,,,BASE,,49401,49401,,,,,paws,cls/paraphrase,,,,Zhang et al. 2019,TRUE +ai2_arc,ARC-Challenge,QA_closed_book,cls,,GPT,GPT_EVAL,,1119,0,"accuracy_with_tie : For each question, a system receives 1 point if it chooses the correct answer and 1/k if it reports a k-way tie -(i.e., chooses multiple answers) that includes the correct answer.",,https://arxiv.org/pdf/1803.05457.pdf,mid-intensive,,ARC (chal.),qa/multiple-choice qa,,nontrivial_comprehension,,Clark et al. 2018 -ai2_arc,ARC-Easy,QA_closed_book,cls,,GPT,,,TRUE,2251,0,TRUE,,2251,"accuracy_with_tie: For each question, a system receives 1 point if it +(i.e., chooses multiple answers) that includes the correct answer.",,https://arxiv.org/pdf/1803.05457.pdf,mid-intensive,ARC (chal.),qa/multiple-choice qa,,nontrivial_comprehension,,Clark et al. 
2018,TRUE +ai2_arc,ARC-Easy,QA_closed_book,cls,,GPT,GPT_EVAL,,2251,0,"accuracy_with_tie: For each question, a system receives 1 point if it chooses the correct answer and 1/k if it reports a k-way tie -(i.e., chooses multiple answers) that includes the correct answer.",,https://arxiv.org/pdf/1803.05457.pdf,mid-intensive,,ARC (easy),Multiple choice,,,, -nq_open,,QA_closed_book,gen,,GPT,TRUE,,TRUE,87925,0,,TRUE,0,kilt-exact_match;average_accuracy_accross_answers,TRUE,https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00276/43518/Natural-Questions-A-Benchmark-for-Question,intensive,,Natural Questions (open domain),,,trivia,, -kilt_tasks,hotpotqa,QA_closed_book,gen,recast as closed-book due to input length,self,,TRUE,,88869,88869,,,,,,,,,kilt hotpotqa,qa/closed-book qa,,encyclopedia; multi-hop QA,,Yang et al. 2018 -trivia_qa,unfiltered,QA_closed_book,gen,,GPT,TRUE,,TRUE,87622,0,TRUE,,87622,exact_match;f1_over_words => wikipedia aliases are considered valid answers,TRUE,https://arxiv.org/pdf/1705.03551.pdf,intensive,,Trivia QA,,,,, -web_questions,,QA_closed_book,gen,"""supposed to be answerable by Freebase"" Check corpora deduplication with freebaseqa.",GPT,,,TRUE,3778,0,TRUE,,3778,accuracy : they don't mention how they normalize across multiple correct answers,TRUE,https://aclanthology.org/D13-1160.pdf,intensive,,web questions,qa/closed-book qa,,,,Berant et al. 2013 -wiki_qa,,QA_closed_book,cls,,CrossFit,,TRUE,,20360,20360,,,,,,https://aclanthology.org/D15-1237.pdf,,,wiki qa,cls/other,,,,Yang et al. 2015 -adversarial_qa,dbidaf,QA_extractive,ext,,,TRUE,TRUE,,10000,10000,TRUE,,10000,,,https://aclanthology.org/2020.tacl-1.43/,,,adversarialqa,qa/machine reading comprehension,,,,Bartolo et al. 2020 -adversarial_qa,dbert,QA_extractive,ext,,,TRUE,TRUE,,10000,10000,TRUE,,10000,,,,,,,,,,, -adversarial_qa,droberta,QA_extractive,ext,,,TRUE,TRUE,,10000,10000,TRUE,,10000,,,,,,,,,,, -coqa,,QA_extractive,ext,GPT-easy,GPT,,,TRUE,7199,,,,,"macro_average_f1: for computing a model’s performance, each individual prediction is compared +(i.e., chooses multiple answers) that includes the correct answer.",,https://arxiv.org/pdf/1803.05457.pdf,mid-intensive,ARC (easy),Multiple choice,,,,,TRUE +nq_open,,QA_closed_book,gen,,GPT,,TRUE,87925,0,kilt-exact_match;average_accuracy_accross_answers,TRUE,https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00276/43518/Natural-Questions-A-Benchmark-for-Question,intensive,Natural Questions (open domain),,,trivia,,, +kilt_tasks,hotpotqa,QA_closed_book,gen,recast as closed-book due to input length,self,BASE,,88869,88869,,,,,kilt hotpotqa,qa/closed-book qa,,encyclopedia; multi-hop QA,,Yang et al. 2018,TRUE +trivia_qa,unfiltered,QA_closed_book,gen,,GPT,GPT_EVAL,,87622,0,exact_match;f1_over_words => wikipedia aliases are considered valid answers,TRUE,https://arxiv.org/pdf/1705.03551.pdf,intensive,Trivia QA,,,,,,TRUE +web_questions,,QA_closed_book,gen,"""supposed to be answerable by Freebase"" Check corpora deduplication with freebaseqa.",GPT,GPT_EVAL,,3778,0,accuracy : they don't mention how they normalize across multiple correct answers,TRUE,https://aclanthology.org/D13-1160.pdf,intensive,web questions,qa/closed-book qa,,,,Berant et al. 2013,TRUE +wiki_qa,,QA_closed_book,cls,,CrossFit,BASE,,20360,20360,,,https://aclanthology.org/D15-1237.pdf,,wiki qa,cls/other,,,,Yang et al. 2015,TRUE +adversarial_qa,dbidaf,QA_extractive,ext,,,BASE,,10000,10000,,,https://aclanthology.org/2020.tacl-1.43/,,adversarialqa,qa/machine reading comprehension,,,,Bartolo et al. 
2020,TRUE +adversarial_qa,dbert,QA_extractive,ext,,,BASE,,10000,10000,,,,,,,,,,,TRUE +adversarial_qa,droberta,QA_extractive,ext,,,BASE,,10000,10000,,,,,,,,,,,TRUE +coqa,,QA_extractive,ext,GPT-easy,GPT,,TRUE,7199,,"macro_average_f1: for computing a model’s performance, each individual prediction is compared against n human answers resulting in n F1 scores, the maximum of which is chosen as the prediction’s F1.For each question, we average out F1 across @@ -42,201 +41,199 @@ these n sets, both for humans and models. In our final evaluation, we use n = 4 human answers for every question (the original answer and 3 additionally collected answers). The articles a, an and the and punctuations are excluded in evaluation.",from the paper it seems it could contain multiple answers but the datasets has only one answer per question,https://arxiv.org/pdf/1808.07042.pdf,,,,,,,, -duorc,SelfRC,QA_extractive,ext,,TaskEmbed;CrossFit,,TRUE,,60721,60721,,,,,,https://duorc.github.io/,,,DuoRC,qa/machine reading comprehension,,,Wikipedia/IMDB crowd,Saha et al. 2018 -duorc,ParaphraseRC,QA_extractive,ext,,TaskEmbed;CrossFit,,TRUE,,69524,69524,,,,,,https://arxiv.org/pdf/1804.07927.pdf,,,DuoRC,paraphrased QA,,,,Saha et al. 2018 -ropes,,QA_extractive,ext,,,TRUE,TRUE,,10924,10924,TRUE,,10924,,,,modest,,ropes,Extractive QA,,cause_and_effect;nontrivial_comprehension,,Lin et al. 2019 -squad_v2,,QA_extractive,ext,,GPT,,,TRUE,130319,0,TRUE,,130319,exact_match;f1_score,TRUE,https://arxiv.org/pdf/1806.03822.pdf,,,SQuAD 2.0,Extractive QA,,,,Rajpurkar et al. 2018 -super_glue,record,QA_extractive,ext,,,TRUE,,TRUE,100730,0,TRUE,TRUE,100730,max_token_level_f1;exact_match,TRUE,https://arxiv.org/pdf/1810.12885.pdf,,,superglue-record,qa/machine reading comprehension,,knowledge-? reading comprehension,,Zhang et al. 2018 -qa_srl,,QA_extractive,ext,"need non-naive metric (""If the predicted word is contained inside the annotated answer span it is considered a correct prediction.""); v2 not in HF https://aclanthology.org/P18-1191.pdf",Eval WG,,,TRUE,6414,0,TRUE,TRUE,6414,accuracy,TRUE,https://dada.cs.washington.edu/qasrl/#page-top,neutral,,qa srl,other,,semantic role,,He et al. 2015 -quac,,QA_extractive,ext,,GPT,,,TRUE,11567,,,,,"average_maximum_f1;HEQ-Q;HEQ-D: To make oracle human and system performance comparable, +duorc,SelfRC,QA_extractive,ext,,TaskEmbed;CrossFit,BASE,,60721,60721,,,https://duorc.github.io/,,DuoRC,qa/machine reading comprehension,,,Wikipedia/IMDB crowd,Saha et al. 2018,TRUE +duorc,ParaphraseRC,QA_extractive,ext,,TaskEmbed;CrossFit,BASE,,69524,69524,,,https://arxiv.org/pdf/1804.07927.pdf,,DuoRC,paraphrased QA,,,,Saha et al. 2018,TRUE +ropes,,QA_extractive,ext,,,BASE,,10924,10924,,,,modest,ropes,Extractive QA,,cause_and_effect;nontrivial_comprehension,,Lin et al. 2019,TRUE +squad_v2,,QA_extractive,ext,,GPT,GPT_EVAL,,130319,0,exact_match;f1_score,TRUE,https://arxiv.org/pdf/1806.03822.pdf,,SQuAD 2.0,Extractive QA,,,,Rajpurkar et al. 2018,TRUE +super_glue,record,QA_extractive,ext,,,SGLUE,,100730,0,max_token_level_f1;exact_match,TRUE,https://arxiv.org/pdf/1810.12885.pdf,,superglue-record,qa/machine reading comprehension,,knowledge-? reading comprehension,,Zhang et al. 2018,TRUE +qa_srl,,QA_extractive,ext,"need non-naive metric (""If the predicted word is contained inside the annotated answer span it is considered a correct prediction.""); v2 not in HF https://aclanthology.org/P18-1191.pdf",Eval WG,,TRUE,6414,0,accuracy,TRUE,https://dada.cs.washington.edu/qasrl/#page-top,neutral,qa srl,other,,semantic role,,He et al. 
2015, +quac,,QA_extractive,ext,,GPT,,TRUE,11567,,"average_maximum_f1;HEQ-Q;HEQ-D: To make oracle human and system performance comparable, given n references, we report the average of the maximum F1 computed from each n − 1 subset -with respect to the heldout reference.",TRUE,https://arxiv.org/pdf/1808.07036.pdf,,,,,,dialogue,, -quoref,,QA_extractive,ext,,,TRUE,TRUE,,19399,19399,TRUE,,19399,,,https://aclanthology.org/D19-1606.pdf,,,Quoref,Extractive QA,,,,Dasigi et al. 2019 -tydiqa,,QA_extractive,ext,,Eval WG,,TRUE,,9211,9211,,,,,,,,,,,,,, -drop,,QA_generative,gen,"nontrivial math; try history_690, it's pretty hard even when I have domain knowledge",GPT,TRUE,,TRUE,,,,,,exact_match; macro_average_f1,TRUE,https://aclanthology.org/N19-1246.pdf,,,DROP ,multi-hop quantitative reasoning; Abstractive QA,,numerical,Wikipedia crowd,Dua et al. 2019 -cos_e,v1.11,QA_multiple_choice,cls,"same as commonsense_qa but with (poorly sourced) human explanations; questionable ""commonsense"" lots of world knowledge",Vania,TRUE,TRUE,,9741,9741,TRUE,,9741,,,,,,cos e,other/generate explanation,,,,Rajani et al. 2019 -cosmos_qa,,QA_multiple_choice,cls,,,TRUE,TRUE,,25262,25262,TRUE,,25262,,,,,,cosmos qa,qa/multiple-choice qa,,,,Huang et al. 2019 -dream,,QA_multiple_choice,cls,,,TRUE,TRUE,,6116,6116,TRUE,,6116,,,,,,dream,qa/multiple-choice qa,,,,Sun et al. 2019 -openbookqa,main,QA_multiple_choice,cls,interesting combo of pragmatics + scientific reasoning,GPT,,,TRUE,4957,0,TRUE,TRUE,4957,"accuracy_with_tie : For each question, a system receives 1 point if it +with respect to the heldout reference.",TRUE,https://arxiv.org/pdf/1808.07036.pdf,,,,,dialogue,,, +quoref,,QA_extractive,ext,,,BASE,,19399,19399,,,https://aclanthology.org/D19-1606.pdf,,Quoref,Extractive QA,,,,Dasigi et al. 2019,TRUE +drop,,QA_generative,gen,"nontrivial math; try history_690, it's pretty hard even when I have domain knowledge",GPT,,TRUE,,,exact_match; macro_average_f1,TRUE,https://aclanthology.org/N19-1246.pdf,,DROP ,multi-hop quantitative reasoning; Abstractive QA,,numerical,Wikipedia crowd,Dua et al. 2019, +cos_e,v1.11,QA_multiple_choice,cls,"same as commonsense_qa but with (poorly sourced) human explanations; questionable ""commonsense"" lots of world knowledge",Vania,BASE,,9741,9741,,,,,cos e,other/generate explanation,,,,Rajani et al. 2019,TRUE +cosmos_qa,,QA_multiple_choice,cls,,,BASE,,25262,25262,,,,,cosmos qa,qa/multiple-choice qa,,,,Huang et al. 2019,TRUE +dream,,QA_multiple_choice,cls,,,BASE,,6116,6116,,,,,dream,qa/multiple-choice qa,,,,Sun et al. 2019,TRUE +openbookqa,main,QA_multiple_choice,cls,interesting combo of pragmatics + scientific reasoning,GPT,GPT_EVAL,,4957,0,"accuracy_with_tie : For each question, a system receives 1 point if it chooses the correct answer and 1/k if it reports a k-way tie -(i.e., chooses multiple answers) that includes the correct answer.",,https://aclanthology.org/D18-1260.pdf,modest,,openbookqa,qa/multiple-choice qa,,pragmatics,,Mihaylov et al. 2018 -qasc,,QA_multiple_choice,cls,,,TRUE,TRUE,,8134,8134,TRUE,,8134,,,,given?,,qasc,qa/multiple-choice qa,,,,Khot et al. 2020 -quail,,QA_multiple_choice,cls,,,TRUE,TRUE,,10246,10246,TRUE,,10246,,,,,,quail,qa/multiple-choice qa,,,,Rogers et al. 2020 -quarel,,QA_multiple_choice,cls,,CrossFit,,TRUE,,1941,1941,,,,,,,,,quarel,qa/multiple-choice qa,,logical form,,Tafjord et al. 2019a -quartz,,QA_multiple_choice,cls,,,TRUE,TRUE,,2696,2696,TRUE,,2696,,,https://aclanthology.org/D19-1608.pdf,given?,,quartz-with knowledge,qa/multiple-choice qa,,,,Tafjord et al. 
2019b -race,high,QA_multiple_choice,cls,GPT-hard,GPT,,,TRUE,62445,0,TRUE,TRUE,62445,accuracy,,https://arxiv.org/pdf/1704.04683.pdff,neutral,,race-high,qa/multiple-choice qa,,knowledge-neutral reading comprehension,,Lai et al. 2017 -race,middle,QA_multiple_choice,cls,"revisit: define as comprehension, paragraph level?",GPT,,,TRUE,25421,0,TRUE,TRUE,25421,accuracy,,https://arxiv.org/pdf/1704.04683.pdf,neutral,,race-middle,qa/multiple-choice qa,,knowledge-neutral reading comprehension,,Lai et al. 2017 -sciq,,QA_multiple_choice,cls,,,TRUE,TRUE,,11679,11679,TRUE,,11679,,,,,,sciq,qa/multiple-choice qa,,,,Welbl et al. 2017 -social_i_qa,,QA_multiple_choice,cls,metric differ by prompt: 4-way classification cast as binary ,,TRUE,TRUE,TRUE,33410,33410,TRUE,TRUE,33410,accuracy,,https://arxiv.org/pdf/1904.09728.pdf,,,SIQA,qa/multiple-choice qa,,cultural knowledge,,Sap et al. 2019 -super_glue,boolq,QA_multiple_choice,cls,,,TRUE,,TRUE,9427,0,TRUE,TRUE,9427,accuracy,,https://arxiv.org/pdf/1905.10044.pdf,neutral?,,superglue-boolq,,,knowledge-? reading comprehension,, -super_glue,copa,QA_multiple_choice,cls,,,TRUE,,TRUE,400,0,TRUE,TRUE,400,accuracy,,http://commonsensereasoning.org/2011/papers/Roemmele.pdf,modest,,superglue-copa,qa/multiple-choice qa,,causal cognition,,Gordon et al. 2012 -super_glue,multirc,QA_multiple_choice,cls,F1 over all answer options. See paper p. 259 for defintion,,TRUE,,TRUE,27243,0,TRUE,TRUE,27243,f1_over_all_options;exact_match,,https://aclanthology.org/N18-1023.pdf,neutral?,,superglue-multirc,qa/multiple-choice qa,,knowledge-? reading comprehension,,Khashabi et al. 2018 -wiki_hop,original,QA_multiple_choice,cls,,,TRUE,TRUE,,43738,43738,TRUE,,43738,,,https://transacl.org/ojs/index.php/tacl/article/viewFile/1325/299,,,WikiHop (Welbl et al. 2018),multi-hop QA,,,Wikipedia KB, -wiqa,,QA_multiple_choice,cls,,,TRUE,TRUE,,29808,29808,TRUE,,29808,,,,,,wiqa,qa/multiple-choice qa,,cause_and_effect,,Tandon et al. 2019 -circa,,QA_multiple_choice,cls,revisit: problematic prompts,,,,TRUE,34268,0,,TRUE,0,mean_multiclass_f1;accuracy,,https://arxiv.org/pdf/2010.03450.pdf,,,circa,cls/other,,pragmatics,,Louis et al. 2020 -mc_taco,,QA_multiple_choice,cls,no train set; variable number of answer_chocies; eval in paper is over set of possible candidates;,,,,TRUE,0,0,,TRUE,0,exact_match; f1_score,,https://arxiv.org/pdf/1909.03065.pdf,,,mc taco,qa/binary,,temporal cognition,,Zhou et al. 2019 -piqa,,QA_multiple_choice,cls,revisit: not just other,GPT,,,TRUE,16113,0,TRUE,,16113,accuracy,,https://arxiv.org/pdf/1911.11641.pdf,,,PIQA,Multiple choice,,physical_cognition,,Bisk et al. 2020 -amazon_polarity,,sentiment,cls,,,TRUE,TRUE,,3600000,500000,TRUE,,500000,,,https://cs.stanford.edu/people/jure/pubs/reviews-recsys13.pdf,,,amazon polarity,cls/sentiment analysis,,,,McAuley and Leskovec 2013 -app_reviews,,sentiment,cls,,,TRUE,TRUE,,288065,288065,TRUE,,288065,,,,,,app reviews,other/regression,,,,Missing -imdb,,sentiment,cls,,,TRUE,TRUE,,25000,25000,TRUE,,25000,,,,,,imdb,cls/sentiment analysis,,no dev set,,Maas et al. 2011 -rotten_tomatoes,,sentiment,cls,,,TRUE,TRUE,,8530,8530,TRUE,,8530,,,,,,rotten tomatoes,cls/sentiment analysis,,,,Pang and Lee 2005 -yelp_review_full,,sentiment,cls,no dev set,,TRUE,TRUE,,650000,500000,TRUE,,500000,,,,,,yelp review full,other/regression,,,,Zhang et al. 2015; (link) -lambada,,story_completion,gen,revisit: story or cloze or coref? 
trivial cloze prompt; training set is just unlabeled corpora; GPT task,GPT,,,TRUE,0,0,,TRUE,0,accuracy;perplexity;median_rank,,https://arxiv.org/pdf/1606.06031.pdf,,,,,,,, -craffel/openai_lambada,,story_completion,gen,revisit: story or cloze or coref? trivial cloze prompt; training set is just unlabeled corpora; GPT task,GPT,,,TRUE,0,0,,TRUE,0,accuracy;perplexity;median_rank,,https://arxiv.org/pdf/1606.06031.pdf,,,,,,,, -story_cloze,2016,story_completion,cls,todo: custom loading; swag like?,GPT,,,TRUE,,0,,TRUE,0,accuracy,,https://arxiv.org/pdf/1604.01696.pdf,,,,,,,, -hellaswag,,story_completion,cls,,GPT,,,TRUE,39905,0,TRUE,,39905,accuracy,,https://arxiv.org/pdf/1905.07830.pdf,,,hellaswag,qa/multiple-choice qa,,,,Zellers et al. 2019 -common_gen,,structure_to_text,gen,,,TRUE,TRUE,,67389,67389,TRUE,,67389,,,,,,common gen,other,,,,Lin et al. 2020b -wiki_bio,,structure_to_text,gen,,,TRUE,TRUE,,582659,500000,TRUE,,500000,,,,,,wiki bio,cg/other,,,,Lebret et al. 2016 -cnn_dailymail,3.0.0,summarization,gen,,,TRUE,TRUE,,287113,287113,TRUE,,287113,,,,,,,,,,, -gigaword,,summarization,gen,,,TRUE,TRUE,,3803957,500000,TRUE,,500000,,,,,,gigaword,cg/summarization,,,,Napoles et al. 2012 -multi_news,,summarization,gen,,CrossFit,,TRUE,,44972,44972,,,,,,,,,multi news,cg/summarization,,,,Fabbri et al. 2019 -samsum,,summarization,gen,,CrossFit,,TRUE,,14732,14732,,,,,,,,,samsum,cg/summarization,,,,Gliwa et al. 2019 -xsum,,summarization,gen,,,TRUE,TRUE,TRUE,204045,204045,TRUE,TRUE,204045,rouge,,https://arxiv.org/pdf/1808.08745.pdf,,,xsum,cg/summarization,,,,Narayan et al. 2018 -ag_news,,topic_classification,cls,,,TRUE,TRUE,,120000,120000,TRUE,,120000,,,http://groups.di.unipi.it/~gulli/AG_corpus_of_news_articles.html,,,ag news,cls/topic,,,,Gulli (link) -dbpedia_14,,topic_classification,cls,,,TRUE,TRUE,,560000,500000,TRUE,,500000,,,https://svn.aksw.org/papers/2013/SWJ_DBpedia/public.pdf,,,dbpedia 14,cls/topic,,,,Lehmann et al. 2015 -trec,,topic_classification,cls,,,TRUE,TRUE,,5452,5452,TRUE,,5452,,,https://trec.nist.gov/data/qa.html,,,trec,cls/other,,,,Li and Roth 2002; Hovy et al. 2001 -super_glue,wic,word_sense_disambiguation,cls,,,TRUE,,TRUE,5428,0,TRUE,TRUE,5428,accuracy,,https://arxiv.org/pdf/1808.09121.pdf,,,superglue-wic,cls/other,,lexical_knowledge,,Pilehvar and Camacho-Collados 2019 -Staging Area,,,,,,,,,,,,,,,,,,,,,,,, -Would Include but not in HF or some other practical limitations,,,,,,,,,,,,,,,,,,,,,,,, -definite_pronoun_resolution,,coreference,,todo: download error,,,,,,,,,,,,,,,definite pronoun resolution,other,,,,Rahman and Ng 2012 -jeopardy,,closed-book qa,gen,sporadic download error,CrossFit,,,,,,,,,,,,,promptsource download error,jeopardy,qa/closed-book qa,,,,(link) -blimp,,,cls,no prompts yet; collapse subsets,,,,,,0,,,0,,,,,,,,,,, -Hendrycks et al. 2021,,,,https://arxiv.org/abs/2009.03300v3,,,,,,,,,,,,,,,,,,,, -Multi-Turn Dialogue Reasoning,,,,https://aclanthology.org/2020.acl-main.130.pdf,Vania,,,,7088,,,,,,,,,,,,,,, -Argument Reasoning Comprehension Task,,,,https://aclanthology.org/N18-1175.pdf,Vania,,,,1211,,,,,,,,,,,,,,, -MCScript,,,,https://aclanthology.org/L18-1564.pdf,Vania,,,,14191,,,,,,,,,,,,,,, -narrativeqa,,,,very long input sequence,,,,,,,,,,,,,,skip for experiment D3: very long input sequence,NarQA,Abstractive QA,,,, -newsqa,,,,download error,TaskEmbed,,,,,,,,,,,,,promptsource download error,NewsQA,Extractive QA,,,,Trischler et al. 
2017 -eli5,,,,dataset split error,CrossFit,,,,,,,,,,,https://facebookresearch.github.io/ELI5/explore.html,,skip: HF datasets error the split field is used for subsets,eli5-askh,qa/long-form qa,,possibly knowledge-neutral,,Fan et al. 2019 -Maybe Reconsider,,,,,,,,,,,,,,,,,,,,,,,, -zest,,,,its original task is quite complex (need to provide a decision function); should be held-out eval only,self,,,,,,,,,,,,,,,,,,, -swag,,story_completion,cls,revisit whether this should be considered as a variant of NLI,,,,,73546,0,TRUE,,73546,,,,,,swag,qa/multiple-choice qa,,,,Zellers et al. 2018 -codah,codah,story_completion,cls,a variant of swag revisit whether this should be considered as a variant of NLI,,,,,2776,0,TRUE,,2776,,,,,,codah,qa/multiple-choice qa,,,,Chen et al. 2019 -wiki_auto,,,,revisit: lots of duplicate simplified text; novel generative task could be very challenging,CrossFit,,,,,,,,,,,,,no prompt yet,wiki auto,cls/other,,text simplification,,Jiang et al. 2020 -proto_qa,,,gen,"generate prototypical concepts, kinda niche format with multiple correct answers",CrossFit,,,,,,,,,,,,,no prompt yet,proto qa,other,,,,Boratko et al. 2020 -empathetic_dialogues,,,,generation? classification?,CrossFit,,,,,,,,,,,https://arxiv.org/pdf/1811.00207.pdf,,no prompt yet,empathetic dialogues,cg/dialogue,,,,Rashkin et al. 2019 -qed,,,,uses held-out Natural Questions,,,,,,,,,,,,,,,,,,,, -kilt_tasks,aidayago2,,,,,,,,,,,,,,,,,no prompt yet,kilt ay2,other/entity linking,,encyclopedia,,Hoffart et al. 2011 -kilt_tasks,wow,,,,,,,,,,,,,,,,,no prompt yet,kilt wow,cg/dialogue,,encyclopedia,,Dinan et al. 2019 -lama,conceptnet,,,,,,,,,,,,,,,,,no prompt yet,lama-conceptnet,qa/closed-book qa,,encyclopedia,,Petroni et al. 2019 2020 -lama,google_re,,,,,,,,,,,,,,,,,no prompt yet,lama-google re,qa/closed-book qa,,encyclopedia,,Petroni et al. 2019 2020 -lama,squad,,,,,,,,,,,,,,,,,no prompt yet,lama-squad,qa/closed-book qa,,encyclopedia,,Petroni et al. 2019 2020 -lama,trex,,,,,,,,,,,,,,,,,no prompt yet,lama-trex,qa/closed-book qa,,encyclopedia,,Petroni et al. 2019 2020 -limit,,physical cognition,,,,,,,,,,,,,,https://aclanthology.org/2020.findings-emnlp.88.pdf,,label errors in dataset itself? also no validation set otherwise well motivated by semantic theories,limit,other,,physical semantic repr.,,Manotas et al. 2020 -kilt_tasks,fever,,,revisit whether this should be considered as a variant of NLI,,,,,,,,,,,,,,temporary skip: prompts available in non-benchmark standalone dataset,kilt fever,cls/fact checking,,encyclopedia,,Thorne et al. 2018 -Skipped,,,,,,,,,,,,,,,,,,,,,,,, -fever,v2.0,closed-book qa/fact checking,,also in KILT,,,,,,,,,,,,,,skip: awkward prompts as closed-book qa,FEVER,,,,, -hotpot_qa,distractor,,,also in KILT,,,,,,,,,,,,,,skip for experiment D3: very long input sequence,Hotpot QA,,,,, -hotpot_qa,fullwiki,,,also in KILT,,,,,,,,,,,,,,skip for experiment D3: very long input sequence,Hotpot QA,,,,, -emo,,sentiment,cls,skip: offensive and ungrammatical text,,merged,,,30160,0,TRUE,TRUE,30160,precision;recall;F1,,https://aclanthology.org/S19-2005.pdf,,skip: offensive and ungrammatical text,emo,cls/emotion,,,,Chatterjee et al. 2019 -freebase_qa,,QA_closed_book,gen,"need to be held out because web_questions is ""supposed to be answerable by Freebase""",,,,,20358,0,TRUE,,20358,,,,intensive,,freebase qa,qa/closed-book qa,,,,Jiang et al. 2019 -aqua_rat,,,,,,,,,,,,,,,,https://arxiv.org/abs/1705.04146,,skip: nontrivial math,aqua rat,qa/multiple-choice qa,,nontrivial math,,Ling et al. 
2017 -math_qa,,,,,,,,,,,,,,,,,,skip: nontrivial math,math qa,qa/multiple-choice qa,,nontrivial math,,Amini et al. 2019 -numer_sense,,,,,,,,,,,,,,,,,,skip: closed-book trivia ,numer sense,qa/closed-book qa,,numerical knowledge,,Lin et al. 2020a -squad_adversarial,,,,,,,,,,,,,,,,,,validation set only,,,,,, -squadshifts,,,,,,,,,,,,,,,,,,test set only,,,,,, -sms_spam,,,,,,,,,,,,,,,,,,skip: unclean corpus and likely harmful content,sms spam,cls/other,,,,Almeida et al. 2011 -search_qa,,,,,,,,,,,,,,,,,,skip: seems like a very unclean corpus,search qa,qa/closed-book qa,,,,Dunn et al. 2017 -kilt_tasks,trex,,,,,,,,,,,,,,,,,skip: non-natural language,kilt trex,qa/closed-book qa,,encyclopedia,,Elsahar et al. 2018 -kilt_tasks,structured_zeroshot,,,,,,,,,,,,,,,,,skip: non-natural language,kilt zsre,qa/closed-book qa,,encyclopedia,,Levy et al. 2017 -spider,,,,,,,,,,,,,,,,,,skip: non-natural language,spider,cg/other,,,,Yu et al. 2018 -wikisql,,,,,,,,,,,,,,,,,,skip: non-natural language,wikisql,cg/other,,,,Zhong et al. 2017 -com_qa,,,,,CrossFit,,,,,,,,,,,https://arxiv.org/pdf/1809.09528.pdf,,skip: non-human language: URL,ComQA (Abujabal et al. 2019),factoid QA w/ paraphrases,,,snippets WikiAnswers, -climate_fever,,,,revisit whether this should be considered as a variant of NLI,,,,,,,,,,,,,,skip: no train set,climate fever,cls/fact checking,,,,Diggelmann et al. 2020 -art,,,,,,,,,,,,,,,,https://arxiv.org/pdf/1908.05739.pdf,,skip: NLI reserved for generalization studies (although this one is not a traditionally defined NLI),art (abductive nli),other,,,,Bhagavatula et al. 2020 -glue,mnli,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,glue-mnli,cls/nli,,,,Williams et al. 2018 -glue,qnli,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,glue-qnli,cls/nli,,,,Rajpurkar et al. 2016 -glue,rte,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,glue-rte,cls/nli,,,,Dagan et al. 2005; Bar-Haim et al. 2006 Giampiccolo et al. 2007; Bentivogli et al. 2009 -glue,wnli,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,glue-wnli,cls/nli,,,,Levesque et al. 2012 -,,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,scitail,cls/nli,,,,Khot et al. 2018 -,,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,sick,cls/nli,,,,Marelli et al. 2014 -,,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,SNLI (Bowman et al. 2015),NLI,,,misc., -aeslc,,,,summarization by email subject line,,,,,,,,,,,,https://arxiv.org/abs/1906.03497,,skip: niche task,aeslc,cg/summarization,,generation,,Zhang and Tetreault 2019 -onestop_english,,,,,,,,,,,,,,,,https://aclanthology.org/W18-0535.pdf,,skip: niche task: classify curriculum diffculty,onestop english,cls/other,,,,Vajjala and Luˇci´c 2018 -mocha,,,,,,,,,,,,,,,,,,skip: model generated text,mocha,other/regression,,,,Chen et al. 2020a -commonsense_qa,,,,duplicate with cos_e,Vania,,,,9741,,,,,,,https://arxiv.org/pdf/1811.00937.pdf,,,Commonsense QA,qa/multiple-choice qa,,,,Talmor et al. 2019 -,,,,,,,,,,,,,,,,,,skip: maybe harmful content from Twitter,emotion,cls/emotion,,,,Saravia et al. 2018 -,,,,the authors themselves seem to have renounced their own work,,,,,,,,,,,,https://github.com/nyu-mll/crows-pairs,,skip: harmful content,crows pairs,other,,,,Nangia et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-directed vs generalized,cls/hate speech detection,,,,Mollas et al. 
2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-disability,cls/hate speech detection,,,,Mollas et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-gender,cls/hate speech detection,,,,Mollas et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-national origin,cls/hate speech detection,,,,Mollas et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-race,cls/hate speech detection,,,,Mollas et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-religion,cls/hate speech detection,,,,Mollas et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-sexual orientation,cls/hate speech detection,,,,Mollas et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,hate speech offensive,cls/hate speech detection,,,,Davidson et al. 2017 -,,,,,,,,,,,,,,,,,,skip: harmful content,hate speech18,cls/hate speech detection,,,,de Gibert et al. 2018 -,,,,,,,,,,,,,,,,,,skip: harmful content,hatexplain,cls/hate speech detection,,,,Mathew et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,reddit tifu-title,cg/summarization,,,,Kim et al. 2019 -,,,,,,,,,,,,,,,,,,skip: harmful content,reddit tifu-tldr,cg/summarization,,,,Kim et al. 2019 -,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-emoji,cls/emotion,,,,Barbieri et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-emotion,cls/emotion,,,,Barbieri et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-hate,cls/emotion,,,,Barbieri et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-irony,cls/emotion,,,,Barbieri et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-offensive,cls/emotion,,,,Barbieri et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-sentiment,cls/emotion,,,,Barbieri et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-stance abortion,cls/emotion,,,,Barbieri et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-stance atheism,cls/emotion,,,,Barbieri et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-stance climate,cls/emotion,,,,Barbieri et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-stance feminist,cls/emotion,,,,Barbieri et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-stance hillary,cls/emotion,,,,Barbieri et al. 2020 -,,,,,,,,,,,,,,,,,,skip: harmful content,tweet qa,qa/machine reading comprehension,,,,Xiong et al. 2019 -yelp_polarity,,,,,,,,,,,,,,,,,,skip: duplicate with yelp_review_full,yelp polarity,cls/sentiment analysis,,,,Zhang et al. 2015; (link) -quora,,,,,,,,,,,,,,,,https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs,,skip: duplicate under GLUE,QQP,paraphrase identification,,,social QA,Iyer et al. 2017 -squad,,,,,,,,,,,,,,,,,,skip: duplicate under Squad 2.0,SQuAD 1.1,Extractive QA,,,, -yahoo_answers_topics,,,,,,,,,,,,,,,,,,skip for early experiments: unclean corpus,yahoo answers topics,cls/topic,,,,(link) -tab_fact,,,,,,,,,,,,,,,,,,skip for early experiments: tabular data,tab fact,cls/fact checking,,,,Chen et al. 2020b -,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-anaphor gender agreement,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020 -,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-anaphor number agreement,other/linguistic phenomenon,,syntax,,Warstadt et al. 
2020 -,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-determiner noun agreement with adj irregular 1,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020 -,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-ellipsis n bar 1,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020 -,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-ellipsis n bar 2,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020 -,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-existential there quantifiers 1,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020 -,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-irregular past participle adjectives,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020 -,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-sentential negation npi licensor present,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020 -,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-sentential negation npi scope,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020 -,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-wh questions object gap,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020 -poem_sentiment,,,,,,,,,,,,,,,,,,skip for early experiments: poetry domain,poem sentiment,cls/sentiment analysis,,creativity,,Sheng and Uthus 2020 -acronym_identification,,,,,,,,,,,,,,,,https://arxiv.org/pdf/2010.14678.pdf,,skip for early experiments: niche/hard task,acronym identification,other,,,,Pouran Ben Veyseh et al. 2020 -google_wellformed_query,,,,revisit whether to exclude fine-grain regression tasks,,,,,,,,,,,,,,skip for early experiments: niche/hard task,google wellformed query,cls/other,,,,Faruqui and Das 2018 -liar,,,,revisit whether to exclude fine-grain regression tasks,,,,,,,,,,,,,,skip for early experiments: niche/hard task,liar,cls/fact checking,,,,Wang 2017 -,,,,,,,,,,,,,,,,,,skip for early experiments: niche/hard task,break-QDMR-high-level,other,,semantic representation,,Wolfson et al. 2020 -,,,,,,,,,,,,,,,,,,skip for early experiments: niche/hard task,crawl domain,other,,,,Zhang et al. 2020 -discovery,discovery,,,,,,,,,,,,,,,,,skip for early experiments: niche task no cannonical answer,discovery,cls/other,,generative-ish,,Sileo et al. 2019 -wiki_split,,,,,,,,,,,,,,,,,,skip for early experiments: niche task,wiki split,cg/other,,,,Botha et al. 
2018 -,,,,,,,,,,,,,,,,,,skip for early experiments: multilingual,aslg pc12,other,,,,Othman and Jemni 2012 -,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,CCG (Hockenmaier and Steedman 2007),CCG supertagging,,syntax,Penn Treebank, -,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,Chunk (Tjong Kim Sang and Buchholz 2000),syntactic chunking,,syntax,Penn Treebank, -,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,Conj (Ficler and Goldberg 2016),conjunct identification,,syntax,Penn Treebank, -,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,GED (Yannakoudakis et al. 2011),grammatical error detection,,syntax,misc., -,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,GGParent (Liu et al. 2019a),syntactic tagging,,syntax,Penn Treebank, -,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,GParent (Liu et al. 2019a),syntactic tagging,,syntax,Penn Treebank, -,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,NER (Tjong Kim Sang and De Meulder 2003),named entity recognition,,,news, -,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,Parent (Liu et al. 2019a),syntactic tagging,,syntax; constituency,Penn Treebank, -,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,POS-EWT (Silveira et al. 2014),part-of-speech tagging,,syntax,Web Treebank, -,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,POS-PTB (Marcus et al. 1993),part-of-speech tagging,,syntax,Penn Treebank, -,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,ST (Bjerva et al. 2016),semantic tagging,,,Groningen Meaning Bank, -financial_phrasebank,,,,,,,,,,,,,,,,,,skip for early experiments: financial domain,financial phrasebank,cls/sentiment analysis,,,,Malo et al. 2014 -health_fact,,,,,,,,,,,,,,,,,,skip for early experiments: biomedical domain,health fact,cls/fact checking,,,,Kotonya and Toni 2020 -,,,,,,,,,,,,,,,,http://www.sciencedirect.com/science/article/pii/S1532046412000615,,skip for early experiments: biomedical domain,ade corpus v2-classification,cls/other,,,,Gurulingappa et al. 2012 -,,,,,,,,,,,,,,,,,,skip for early experiments: biomedical domain,ade corpus v2-dosage,other/slot filling,,,,Gurulingappa et al. 2012 -,,,,,,,,,,,,,,,,,,skip for early experiments: biomedical domain,ade corpus v2-effect,other/slot filling,,,,Gurulingappa et al. 2012 -,,,,,,,,,,,,,,,,,,skip for early experiments: biomedical domain,biomrc,qa/machine reading comprehension,,,,Pappas et al. 2020 -,,,,,,,,,,,,,,,,,,skip for early experiments: biomedical domain,medical questions pairs,cls/paraphrase,,,,McCreery et al. 2020 -scicite,,,,,,,,,,,,,,,,,,skip for early experiments: academic domain + niche/hard task,scicite,cls/other,,,,Cohan et al. 2019 -,,,,,,,,,,,,,,,,,,skip for early experiments: abstract semantic representations,break-QDMR,other,,logical form,,Wolfson et al. 
2020 -,,,,,,,,,,,,,,,,,,skip for early experiments: abstract semantic representations,e2e nlg cleaned,other,,,,Duˇsek et al. 2020 2019 -glue,sst2,,,,,,,,,,,,,,,,,revisit: very short and often ill-formed movie reviews,glue-sst2,cls/sentiment analysis,,,,Socher et al. 2013 -glue,stsb,fine-grain regression,,,,,,,,,,,,,,,,revisit whether to exclude fine-grain regression tasks,glue-stsb,semantic similarity,,,misc., -,,,,,,,,,,,,,,,,,,double check: subset missing from HF datasets,squad-no context,qa/closed-book qa,,,,Rajpurkar et al. 2016 -,,,,,,,,,,,,,,,,,,double check: subset missing from HF datasets,squad-with context,qa/machine reading comprehension,,,,Rajpurkar et al. 2016 -,,,,contrast sets,,,,,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,double check: missing from HF datasets,BoolQ-CS,Binary yes/no,,,, -,,,,,,,,,,,,,,,,https://aclanthology.org/C16-1236.pdf,,double check: missing from HF datasets,CQ (Bao et al. 2016),knowledge-based QA,,,snippets web queries/KB, -,,,,contrast sets,,,,,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,double check: missing from HF datasets,DROP-CS,Abstractive QA,,,, -,,,,,,,,,,,,,,,,https://aclanthology.org/D13-1020.pdf,,double check: missing from HF datasets,MCTest,Multiple choice,,,, -,,,,,,,,,,,,,,,,,,double check: missing from HF datasets,MRPC (Dolan and Brockett 2005),paraphrase identification,,,news, -,,,,"""naturally perturbed"" version of BoolQ",,,,,,,,,,,,https://arxiv.org/pdf/2004.04849.pdf,,double check: missing from HF datasets,NP-BoolQ,Binary yes/no,,,, -,,,,,,,,,,,,,,,,https://aclanthology.org/D19-1608.pdf,,double check: missing from HF datasets,quartz-no knowledge,qa/multiple-choice qa,,,,Tafjord et al. 2019b -,,,,contrast sets,,,,,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,double check: missing from HF datasets,Quoref-CS,Extractive QA,,,, -,,,,contrast sets,,,,,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,double check: missing from HF datasets,ROPES-CS,Extractive QA,,,, +(i.e., chooses multiple answers) that includes the correct answer.",,https://aclanthology.org/D18-1260.pdf,modest,openbookqa,qa/multiple-choice qa,,pragmatics,,Mihaylov et al. 2018,TRUE +qasc,,QA_multiple_choice,cls,,,BASE,,8134,8134,,,,given?,qasc,qa/multiple-choice qa,,,,Khot et al. 2020,TRUE +quail,,QA_multiple_choice,cls,,,BASE,,10246,10246,,,,,quail,qa/multiple-choice qa,,,,Rogers et al. 2020,TRUE +quarel,,QA_multiple_choice,cls,,CrossFit,BASE,,1941,1941,,,,,quarel,qa/multiple-choice qa,,logical form,,Tafjord et al. 2019a,TRUE +quartz,,QA_multiple_choice,cls,,,BASE,,2696,2696,,,https://aclanthology.org/D19-1608.pdf,given?,quartz-with knowledge,qa/multiple-choice qa,,,,Tafjord et al. 2019b,TRUE +race,high,QA_multiple_choice,cls,GPT-hard,GPT,GPT_EVAL,,62445,0,accuracy,,https://arxiv.org/pdf/1704.04683.pdff,neutral,race-high,qa/multiple-choice qa,,knowledge-neutral reading comprehension,,Lai et al. 2017,TRUE +race,middle,QA_multiple_choice,cls,"revisit: define as comprehension, paragraph level?",GPT,GPT_EVAL,,25421,0,accuracy,,https://arxiv.org/pdf/1704.04683.pdf,neutral,race-middle,qa/multiple-choice qa,,knowledge-neutral reading comprehension,,Lai et al. 2017,TRUE +sciq,,QA_multiple_choice,cls,,,BASE,,11679,11679,,,,,sciq,qa/multiple-choice qa,,,,Welbl et al. 2017,TRUE +social_i_qa,,QA_multiple_choice,cls,metric differ by prompt: 4-way classification cast as binary ,,BASE,,33410,33410,accuracy,,https://arxiv.org/pdf/1904.09728.pdf,,SIQA,qa/multiple-choice qa,,cultural knowledge,,Sap et al. 
2019,TRUE
+super_glue,boolq,QA_multiple_choice,cls,,,SGLUE,,9427,0,accuracy,,https://arxiv.org/pdf/1905.10044.pdf,neutral?,superglue-boolq,,,knowledge-? reading comprehension,,,TRUE
+super_glue,copa,QA_multiple_choice,cls,,,SGLUE,TRUE,400,0,accuracy,,http://commonsensereasoning.org/2011/papers/Roemmele.pdf,modest,superglue-copa,qa/multiple-choice qa,,causal cognition,,Gordon et al. 2012,TRUE
+super_glue,multirc,QA_multiple_choice,cls,F1 over all answer options. See paper p. 259 for definition,,SGLUE,,27243,0,f1_over_all_options;exact_match,,https://aclanthology.org/N18-1023.pdf,neutral?,superglue-multirc,qa/multiple-choice qa,,knowledge-? reading comprehension,,Khashabi et al. 2018,TRUE
+wiki_hop,original,QA_multiple_choice,cls,,,BASE,,43738,43738,,,https://transacl.org/ojs/index.php/tacl/article/viewFile/1325/299,,WikiHop (Welbl et al. 2018),multi-hop QA,,,Wikipedia KB,,TRUE
+wiqa,,QA_multiple_choice,cls,,,BASE,,29808,29808,,,,,wiqa,qa/multiple-choice qa,,cause_and_effect,,Tandon et al. 2019,TRUE
+circa,,QA_multiple_choice,cls,revisit: problematic prompts,,,TRUE,34268,0,mean_multiclass_f1;accuracy,,https://arxiv.org/pdf/2010.03450.pdf,,circa,cls/other,,pragmatics,,Louis et al. 2020,
+mc_taco,,QA_multiple_choice,cls,no train set; variable number of answer_choices; eval in paper is over set of possible candidates;,,,TRUE,0,0,exact_match; f1_score,,https://arxiv.org/pdf/1909.03065.pdf,,mc taco,qa/binary,,temporal cognition,,Zhou et al. 2019,
+piqa,,QA_multiple_choice,cls,revisit: not just other,GPT,GPT_EVAL,,16113,0,accuracy,,https://arxiv.org/pdf/1911.11641.pdf,,PIQA,Multiple choice,,physical_cognition,,Bisk et al. 2020,TRUE
+amazon_polarity,,sentiment,cls,,,BASE,,3600000,500000,,,https://cs.stanford.edu/people/jure/pubs/reviews-recsys13.pdf,,amazon polarity,cls/sentiment analysis,,,,McAuley and Leskovec 2013,TRUE
+app_reviews,,sentiment,cls,,,BASE,,288065,288065,,,,,app reviews,other/regression,,,,Missing,TRUE
+imdb,,sentiment,cls,,,BASE,,25000,25000,,,,,imdb,cls/sentiment analysis,,no dev set,,Maas et al. 2011,TRUE
+rotten_tomatoes,,sentiment,cls,,,BASE,,8530,8530,,,,,rotten tomatoes,cls/sentiment analysis,,,,Pang and Lee 2005,TRUE
+yelp_review_full,,sentiment,cls,no dev set,,BASE,,650000,500000,,,,,yelp review full,other/regression,,,,Zhang et al. 2015; (link),TRUE
+lambada,,story_completion,gen,revisit: story or cloze or coref? trivial cloze prompt; training set is just unlabeled corpora; GPT task,GPT,,TRUE,0,0,accuracy;perplexity;median_rank,,https://arxiv.org/pdf/1606.06031.pdf,,,,,,,,
+craffel/openai_lambada,,story_completion,gen,revisit: story or cloze or coref? trivial cloze prompt; training set is just unlabeled corpora; GPT task,GPT,,TRUE,0,0,accuracy;perplexity;median_rank,,https://arxiv.org/pdf/1606.06031.pdf,,,,,,,,
+story_cloze,2016,story_completion,cls,todo: custom loading; swag like?,GPT,,TRUE,,0,accuracy,,https://arxiv.org/pdf/1604.01696.pdf,,,,,,,,TRUE
+hellaswag,,story_completion,cls,,GPT,GPT_EVAL,TRUE,39905,0,accuracy,,https://arxiv.org/pdf/1905.07830.pdf,,hellaswag,qa/multiple-choice qa,,,,Zellers et al. 2019,TRUE
+common_gen,,structure_to_text,gen,,,BASE,,67389,67389,,,,,common gen,other,,,,Lin et al. 2020b,TRUE
+wiki_bio,,structure_to_text,gen,,,BASE,,582659,500000,,,,,wiki bio,cg/other,,,,Lebret et al. 2016,TRUE
+cnn_dailymail,3.0.0,summarization,gen,,,BASE,,287113,287113,,,,,,,,,,,TRUE
+gigaword,,summarization,gen,,,BASE,,3803957,500000,,,,,gigaword,cg/summarization,,,,Napoles et al.
2012,TRUE +multi_news,,summarization,gen,,CrossFit,BASE,,44972,44972,,,,,multi news,cg/summarization,,,,Fabbri et al. 2019,TRUE +samsum,,summarization,gen,,CrossFit,BASE,,14732,14732,,,,,samsum,cg/summarization,,,,Gliwa et al. 2019,TRUE +xsum,,summarization,gen,,,BASE,,204045,204045,rouge,,https://arxiv.org/pdf/1808.08745.pdf,,xsum,cg/summarization,,,,Narayan et al. 2018,TRUE +ag_news,,topic_classification,cls,,,BASE,,120000,120000,,,http://groups.di.unipi.it/~gulli/AG_corpus_of_news_articles.html,,ag news,cls/topic,,,,Gulli (link),TRUE +dbpedia_14,,topic_classification,cls,,,BASE,,560000,500000,,,https://svn.aksw.org/papers/2013/SWJ_DBpedia/public.pdf,,dbpedia 14,cls/topic,,,,Lehmann et al. 2015,TRUE +trec,,topic_classification,cls,,,BASE,,5452,5452,,,https://trec.nist.gov/data/qa.html,,trec,cls/other,,,,Li and Roth 2002; Hovy et al. 2001,TRUE +super_glue,wic,word_sense_disambiguation,cls,,,SGLUE,TRUE,5428,0,accuracy,,https://arxiv.org/pdf/1808.09121.pdf,,superglue-wic,cls/other,,lexical_knowledge,,Pilehvar and Camacho-Collados 2019,TRUE +Staging Area,,,,,,,,,,,,,,,,,,,, +Would Include but not in HF or some other practical limitations,,,,,,,,,,,,,,,,,,,, +definite_pronoun_resolution,,coreference,,todo: download error,,,,,,,,,,definite pronoun resolution,other,,,,Rahman and Ng 2012, +jeopardy,,closed-book qa,gen,sporadic download error,CrossFit,,,,,,,,,jeopardy,qa/closed-book qa,,,,(link), +blimp,,,cls,no prompts yet; collapse subsets,,,,,0,,,,,,,,,,, +Hendrycks et al. 2021,,,,https://arxiv.org/abs/2009.03300v3,,,,,,,,,,,,,,,, +Multi-Turn Dialogue Reasoning,,,,https://aclanthology.org/2020.acl-main.130.pdf,Vania,,,7088,,,,,,,,,,,, +Argument Reasoning Comprehension Task,,,,https://aclanthology.org/N18-1175.pdf,Vania,,,1211,,,,,,,,,,,, +MCScript,,,,https://aclanthology.org/L18-1564.pdf,Vania,,,14191,,,,,,,,,,,, +narrativeqa,,,,very long input sequence,,,,,,,,,,NarQA,Abstractive QA,,,,, +newsqa,,,,download error,TaskEmbed,,,,,,,,,NewsQA,Extractive QA,,,,Trischler et al. 2017, +eli5,,,,dataset split error,CrossFit,,,,,,,https://facebookresearch.github.io/ELI5/explore.html,,eli5-askh,qa/long-form qa,,possibly knowledge-neutral,,Fan et al. 2019, +Maybe Reconsider,,,,,,,,,,,,,,,,,,,, +zest,,,,its original task is quite complex (need to provide a decision function); should be held-out eval only,self,,,,,,,,,,,,,,, +swag,,story_completion,cls,revisit whether this should be considered as a variant of NLI,,,,73546,0,,,,,swag,qa/multiple-choice qa,,,,Zellers et al. 2018, +codah,codah,story_completion,cls,a variant of swag revisit whether this should be considered as a variant of NLI,,,,2776,0,,,,,codah,qa/multiple-choice qa,,,,Chen et al. 2019, +wiki_auto,,,,revisit: lots of duplicate simplified text; novel generative task could be very challenging,CrossFit,,,,,,,,,wiki auto,cls/other,,text simplification,,Jiang et al. 2020, +proto_qa,,,gen,"generate prototypical concepts, kinda niche format with multiple correct answers",CrossFit,,,,,,,,,proto qa,other,,,,Boratko et al. 2020, +empathetic_dialogues,,,,generation? classification?,CrossFit,,,,,,,https://arxiv.org/pdf/1811.00207.pdf,,empathetic dialogues,cg/dialogue,,,,Rashkin et al. 2019, +qed,,,,uses held-out Natural Questions,,,,,,,,,,,,,,,, +kilt_tasks,aidayago2,,,,,,,,,,,,,kilt ay2,other/entity linking,,encyclopedia,,Hoffart et al. 2011, +kilt_tasks,wow,,,,,,,,,,,,,kilt wow,cg/dialogue,,encyclopedia,,Dinan et al. 2019, +lama,conceptnet,,,,,,,,,,,,,lama-conceptnet,qa/closed-book qa,,encyclopedia,,Petroni et al. 
2019 2020, +lama,google_re,,,,,,,,,,,,,lama-google re,qa/closed-book qa,,encyclopedia,,Petroni et al. 2019 2020, +lama,squad,,,,,,,,,,,,,lama-squad,qa/closed-book qa,,encyclopedia,,Petroni et al. 2019 2020, +lama,trex,,,,,,,,,,,,,lama-trex,qa/closed-book qa,,encyclopedia,,Petroni et al. 2019 2020, +limit,,physical cognition,,,,,,,,,,https://aclanthology.org/2020.findings-emnlp.88.pdf,,limit,other,,physical semantic repr.,,Manotas et al. 2020, +kilt_tasks,fever,,,revisit whether this should be considered as a variant of NLI,,,,,,,,,,kilt fever,cls/fact checking,,encyclopedia,,Thorne et al. 2018, +Skipped,,,,,,,,,,,,,,,,,,,, +fever,v2.0,closed-book qa/fact checking,,also in KILT,,,,,,,,,,FEVER,,,,,, +hotpot_qa,distractor,,,also in KILT,,,,,,,,,,Hotpot QA,,,,,, +hotpot_qa,fullwiki,,,also in KILT,,,,,,,,,,Hotpot QA,,,,,, +emo,,sentiment,cls,skip: offensive and ungrammatical text,,,,30160,0,precision;recall;F1,,https://aclanthology.org/S19-2005.pdf,,emo,cls/emotion,,,,Chatterjee et al. 2019, +freebase_qa,,QA_closed_book,gen,"need to be held out because web_questions is ""supposed to be answerable by Freebase""",,,,20358,0,,,,intensive,freebase qa,qa/closed-book qa,,,,Jiang et al. 2019, +aqua_rat,,,,,,,,,,,,https://arxiv.org/abs/1705.04146,,aqua rat,qa/multiple-choice qa,,nontrivial math,,Ling et al. 2017, +math_qa,,,,,,,,,,,,,,math qa,qa/multiple-choice qa,,nontrivial math,,Amini et al. 2019, +numer_sense,,,,,,,,,,,,,,numer sense,qa/closed-book qa,,numerical knowledge,,Lin et al. 2020a, +squad_adversarial,,,,,,,,,,,,,,,,,,,, +squadshifts,,,,,,,,,,,,,,,,,,,, +sms_spam,,,,,,,,,,,,,,sms spam,cls/other,,,,Almeida et al. 2011, +search_qa,,,,,,,,,,,,,,search qa,qa/closed-book qa,,,,Dunn et al. 2017, +kilt_tasks,trex,,,,,,,,,,,,,kilt trex,qa/closed-book qa,,encyclopedia,,Elsahar et al. 2018, +kilt_tasks,structured_zeroshot,,,,,,,,,,,,,kilt zsre,qa/closed-book qa,,encyclopedia,,Levy et al. 2017, +spider,,,,,,,,,,,,,,spider,cg/other,,,,Yu et al. 2018, +wikisql,,,,,,,,,,,,,,wikisql,cg/other,,,,Zhong et al. 2017, +com_qa,,,,,CrossFit,,,,,,,https://arxiv.org/pdf/1809.09528.pdf,,ComQA (Abujabal et al. 2019),factoid QA w/ paraphrases,,,snippets WikiAnswers,, +climate_fever,,,,revisit whether this should be considered as a variant of NLI,,,,,,,,,,climate fever,cls/fact checking,,,,Diggelmann et al. 2020, +art,,,,,,,,,,,,https://arxiv.org/pdf/1908.05739.pdf,,art (abductive nli),other,,,,Bhagavatula et al. 2020, +glue,mnli,classification_NLI,,,,,,,,,,,,glue-mnli,cls/nli,,,,Williams et al. 2018, +glue,qnli,classification_NLI,,,,,,,,,,,,glue-qnli,cls/nli,,,,Rajpurkar et al. 2016, +glue,wnli,classification_NLI,,,,,,,,,,,,glue-wnli,cls/nli,,,,Levesque et al. 2012, +,,classification_NLI,,,,,,,,,,,,scitail,cls/nli,,,,Khot et al. 2018, +,,classification_NLI,,,,,,,,,,,,sick,cls/nli,,,,Marelli et al. 2014, +,,classification_NLI,,,,,,,,,,,,SNLI (Bowman et al. 2015),NLI,,,misc.,, +aeslc,,,,summarization by email subject line,,,,,,,,https://arxiv.org/abs/1906.03497,,aeslc,cg/summarization,,generation,,Zhang and Tetreault 2019, +onestop_english,,,,,,,,,,,,https://aclanthology.org/W18-0535.pdf,,onestop english,cls/other,,,,Vajjala and Luˇci´c 2018, +mocha,,,,,,,,,,,,,,mocha,other/regression,,,,Chen et al. 2020a, +commonsense_qa,,,,duplicate with cos_e,Vania,,,9741,,,,https://arxiv.org/pdf/1811.00937.pdf,,Commonsense QA,qa/multiple-choice qa,,,,Talmor et al. 2019, +,,,,,,,,,,,,,,emotion,cls/emotion,,,,Saravia et al. 
2018, +,,,,the authors themselves seem to have renounced their own work,,,,,,,,https://github.com/nyu-mll/crows-pairs,,crows pairs,other,,,,Nangia et al. 2020, +,,,,,,,,,,,,,,ethos-directed vs generalized,cls/hate speech detection,,,,Mollas et al. 2020, +,,,,,,,,,,,,,,ethos-disability,cls/hate speech detection,,,,Mollas et al. 2020, +,,,,,,,,,,,,,,ethos-gender,cls/hate speech detection,,,,Mollas et al. 2020, +,,,,,,,,,,,,,,ethos-national origin,cls/hate speech detection,,,,Mollas et al. 2020, +,,,,,,,,,,,,,,ethos-race,cls/hate speech detection,,,,Mollas et al. 2020, +,,,,,,,,,,,,,,ethos-religion,cls/hate speech detection,,,,Mollas et al. 2020, +,,,,,,,,,,,,,,ethos-sexual orientation,cls/hate speech detection,,,,Mollas et al. 2020, +,,,,,,,,,,,,,,hate speech offensive,cls/hate speech detection,,,,Davidson et al. 2017, +,,,,,,,,,,,,,,hate speech18,cls/hate speech detection,,,,de Gibert et al. 2018, +,,,,,,,,,,,,,,hatexplain,cls/hate speech detection,,,,Mathew et al. 2020, +,,,,,,,,,,,,,,reddit tifu-title,cg/summarization,,,,Kim et al. 2019, +,,,,,,,,,,,,,,reddit tifu-tldr,cg/summarization,,,,Kim et al. 2019, +,,,,,,,,,,,,,,tweet eval-emoji,cls/emotion,,,,Barbieri et al. 2020, +,,,,,,,,,,,,,,tweet eval-emotion,cls/emotion,,,,Barbieri et al. 2020, +,,,,,,,,,,,,,,tweet eval-hate,cls/emotion,,,,Barbieri et al. 2020, +,,,,,,,,,,,,,,tweet eval-irony,cls/emotion,,,,Barbieri et al. 2020, +,,,,,,,,,,,,,,tweet eval-offensive,cls/emotion,,,,Barbieri et al. 2020, +,,,,,,,,,,,,,,tweet eval-sentiment,cls/emotion,,,,Barbieri et al. 2020, +,,,,,,,,,,,,,,tweet eval-stance abortion,cls/emotion,,,,Barbieri et al. 2020, +,,,,,,,,,,,,,,tweet eval-stance atheism,cls/emotion,,,,Barbieri et al. 2020, +,,,,,,,,,,,,,,tweet eval-stance climate,cls/emotion,,,,Barbieri et al. 2020, +,,,,,,,,,,,,,,tweet eval-stance feminist,cls/emotion,,,,Barbieri et al. 2020, +,,,,,,,,,,,,,,tweet eval-stance hillary,cls/emotion,,,,Barbieri et al. 2020, +,,,,,,,,,,,,,,tweet qa,qa/machine reading comprehension,,,,Xiong et al. 2019, +yelp_polarity,,,,,,,,,,,,,,yelp polarity,cls/sentiment analysis,,,,Zhang et al. 2015; (link), +quora,,,,,,,,,,,,https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs,,QQP,paraphrase identification,,,social QA,Iyer et al. 2017, +squad,,,,,,,,,,,,,,SQuAD 1.1,Extractive QA,,,,, +yahoo_answers_topics,,,,,,,,,,,,,,yahoo answers topics,cls/topic,,,,(link), +tab_fact,,,,,,,,,,,,,,tab fact,cls/fact checking,,,,Chen et al. 2020b, +,,,,,,,,,,,,,,blimp-anaphor gender agreement,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020, +,,,,,,,,,,,,,,blimp-anaphor number agreement,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020, +,,,,,,,,,,,,,,blimp-determiner noun agreement with adj irregular 1,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020, +,,,,,,,,,,,,,,blimp-ellipsis n bar 1,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020, +,,,,,,,,,,,,,,blimp-ellipsis n bar 2,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020, +,,,,,,,,,,,,,,blimp-existential there quantifiers 1,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020, +,,,,,,,,,,,,,,blimp-irregular past participle adjectives,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020, +,,,,,,,,,,,,,,blimp-sentential negation npi licensor present,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020, +,,,,,,,,,,,,,,blimp-sentential negation npi scope,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020, +,,,,,,,,,,,,,,blimp-wh questions object gap,other/linguistic phenomenon,,syntax,,Warstadt et al. 
2020, +poem_sentiment,,,,,,,,,,,,,,poem sentiment,cls/sentiment analysis,,creativity,,Sheng and Uthus 2020, +acronym_identification,,,,,,,,,,,,https://arxiv.org/pdf/2010.14678.pdf,,acronym identification,other,,,,Pouran Ben Veyseh et al. 2020, +google_wellformed_query,,,,revisit whether to exclude fine-grain regression tasks,,,,,,,,,,google wellformed query,cls/other,,,,Faruqui and Das 2018, +liar,,,,revisit whether to exclude fine-grain regression tasks,,,,,,,,,,liar,cls/fact checking,,,,Wang 2017, +,,,,,,,,,,,,,,break-QDMR-high-level,other,,semantic representation,,Wolfson et al. 2020, +,,,,,,,,,,,,,,crawl domain,other,,,,Zhang et al. 2020, +discovery,discovery,,,,,,,,,,,,,discovery,cls/other,,generative-ish,,Sileo et al. 2019, +wiki_split,,,,,,,,,,,,,,wiki split,cg/other,,,,Botha et al. 2018, +,,,,,,,,,,,,,,aslg pc12,other,,,,Othman and Jemni 2012, +,,,,,,,,,,,,,,CCG (Hockenmaier and Steedman 2007),CCG supertagging,,syntax,Penn Treebank,, +,,,,,,,,,,,,,,Chunk (Tjong Kim Sang and Buchholz 2000),syntactic chunking,,syntax,Penn Treebank,, +,,,,,,,,,,,,,,Conj (Ficler and Goldberg 2016),conjunct identification,,syntax,Penn Treebank,, +,,,,,,,,,,,,,,GED (Yannakoudakis et al. 2011),grammatical error detection,,syntax,misc.,, +,,,,,,,,,,,,,,GGParent (Liu et al. 2019a),syntactic tagging,,syntax,Penn Treebank,, +,,,,,,,,,,,,,,GParent (Liu et al. 2019a),syntactic tagging,,syntax,Penn Treebank,, +,,,,,,,,,,,,,,NER (Tjong Kim Sang and De Meulder 2003),named entity recognition,,,news,, +,,,,,,,,,,,,,,Parent (Liu et al. 2019a),syntactic tagging,,syntax; constituency,Penn Treebank,, +,,,,,,,,,,,,,,POS-EWT (Silveira et al. 2014),part-of-speech tagging,,syntax,Web Treebank,, +,,,,,,,,,,,,,,POS-PTB (Marcus et al. 1993),part-of-speech tagging,,syntax,Penn Treebank,, +,,,,,,,,,,,,,,ST (Bjerva et al. 2016),semantic tagging,,,Groningen Meaning Bank,, +financial_phrasebank,,,,,,,,,,,,,,financial phrasebank,cls/sentiment analysis,,,,Malo et al. 2014, +health_fact,,,,,,,,,,,,,,health fact,cls/fact checking,,,,Kotonya and Toni 2020, +,,,,,,,,,,,,http://www.sciencedirect.com/science/article/pii/S1532046412000615,,ade corpus v2-classification,cls/other,,,,Gurulingappa et al. 2012, +,,,,,,,,,,,,,,ade corpus v2-dosage,other/slot filling,,,,Gurulingappa et al. 2012, +,,,,,,,,,,,,,,ade corpus v2-effect,other/slot filling,,,,Gurulingappa et al. 2012, +,,,,,,,,,,,,,,biomrc,qa/machine reading comprehension,,,,Pappas et al. 2020, +,,,,,,,,,,,,,,medical questions pairs,cls/paraphrase,,,,McCreery et al. 2020, +scicite,,,,,,,,,,,,,,scicite,cls/other,,,,Cohan et al. 2019, +,,,,,,,,,,,,,,break-QDMR,other,,logical form,,Wolfson et al. 2020, +,,,,,,,,,,,,,,e2e nlg cleaned,other,,,,Duˇsek et al. 2020 2019, +glue,sst2,,,,,,,,,,,,,glue-sst2,cls/sentiment analysis,,,,Socher et al. 2013, +glue,stsb,fine-grain regression,,,,,,,,,,,,glue-stsb,semantic similarity,,,misc.,, +,,,,,,,,,,,,,,squad-no context,qa/closed-book qa,,,,Rajpurkar et al. 2016, +,,,,,,,,,,,,,,squad-with context,qa/machine reading comprehension,,,,Rajpurkar et al. 2016, +,,,,contrast sets,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,BoolQ-CS,Binary yes/no,,,,, +,,,,,,,,,,,,https://aclanthology.org/C16-1236.pdf,,CQ (Bao et al. 
2016),knowledge-based QA,,,snippets web queries/KB,, +,,,,contrast sets,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,DROP-CS,Abstractive QA,,,,, +,,,,,,,,,,,,https://aclanthology.org/D13-1020.pdf,,MCTest,Multiple choice,,,,, +,,,,,,,,,,,,,,MRPC (Dolan and Brockett 2005),paraphrase identification,,,news,, +,,,,"""naturally perturbed"" version of BoolQ",,,,,,,,https://arxiv.org/pdf/2004.04849.pdf,,NP-BoolQ,Binary yes/no,,,,, +,,,,,,,,,,,,https://aclanthology.org/D19-1608.pdf,,quartz-no knowledge,qa/multiple-choice qa,,,,Tafjord et al. 2019b, +,,,,contrast sets,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,Quoref-CS,Extractive QA,,,,, +,,,,contrast sets,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,ROPES-CS,Extractive QA,,,,, diff --git a/t0/seqio_tasks/tasks.py b/t0/seqio_tasks/tasks.py index 2cec8f5..8a2d788 100644 --- a/t0/seqio_tasks/tasks.py +++ b/t0/seqio_tasks/tasks.py @@ -126,7 +126,6 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map is_correct_fn=lambda ex: tf.equal(ex["answer_choices"], tf.strings.strip(ex["targets"])), weight_fn=lambda ex: 1.0, ) - fixed_choices = template.get_fixed_answer_choices_list() num_classes = len(fixed_choices) if fixed_choices else None seqio.TaskRegistry.add( @@ -140,10 +139,15 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map datatset_subset_tuple = Tuple[str, Optional[str]] -d4_train: List[datatset_subset_tuple] = [] d4_eval: List[datatset_subset_tuple] = [] -d3_train_gpt: List[datatset_subset_tuple] = [] -d3_train_sglue: List[datatset_subset_tuple] = [] +d4_train: Dict[str, List[datatset_subset_tuple]] = { + "BASE": [], + # GPT3 evaluation set + "GPT_EVAL": [], + # SuperGLUE (except RTE and CB) + "SGLUE": [] +} + bias_fairness_eval: List[datatset_subset_tuple] = [] gsheet: Dict[datatset_subset_tuple, Dict] = {} experiment_path = pkg_resources.resource_filename(__name__, "experiment_D4.csv") @@ -155,14 +159,14 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map if row["subset"] == "": row["subset"] = None # to match promptsource.Template object dataset_subset = (row["HF_name"], row["subset"]) - if row["do_train"] == "TRUE": - d4_train.append(dataset_subset) + if row["do_train"] != "": + do_train_source = row["do_train"] + # sanity checks + if do_train_source == "SGLUE": + assert dataset_subset[0] == "super_glue" + d4_train[do_train_source].append(dataset_subset) if row["do_eval"] == "TRUE": d4_eval.append(dataset_subset) - if row["D3_do_train"] == "TRUE" and "GPT" in row["seed_paper"]: - d3_train_gpt.append(dataset_subset) - if row["D3_do_train"] == "TRUE" and row["HF_name"] == "super_glue": - d3_train_sglue.append(dataset_subset) if ( row["do_eval"] == "TRUE" and row["task_by_convention"] == "bias_and_fairness" @@ -170,15 +174,13 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map ): bias_fairness_eval.append(dataset_subset) gsheet[dataset_subset] = row -all_datasets = d4_train + d4_eval + d3_train_gpt + d3_train_sglue + bias_fairness_eval +all_datasets = sum(d4_train.values()) + d4_eval + bias_fairness_eval all_templates = promptsource.templates.TemplateCollection() all_templates.remove("anli") # Need to special-case ANLI due to weird split conventions # 3 stages of training/ablation: D4 -> GPT -> SuperGLUE -d4_train_mixture: List[str] = [] # strings are dataset_subset_template -gpt_train_mixture: List[str] = [] -sglue_train_mixture: List[str] = [] +d4_train_mixture: Dict[str,List[str]] = {key: [] for key in d4_train } 
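+# values are task-name strings of the form dataset_subset_template (see the
+# comment removed above), bucketed under the same keys as d4_train
+# ("BASE", "GPT_EVAL", "SGLUE")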
d4_eval_mixture: List[str] = [] bias_fairness_eval_mixture: List[str] = [] mixture_cap: Dict[str, int] = {} @@ -213,15 +215,13 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map if template.metadata.original_task: all_original_tasks.append(task_name) - if (dataset_name, subset_name) in d4_train: - d4_train_mixture.append(task_name) - mixture_cap[task_name] = cap - if (dataset_name, subset_name) in d3_train_gpt: - gpt_train_mixture.append(task_name) - mixture_cap[task_name] = cap - if (dataset_name, subset_name) in d3_train_sglue: - sglue_train_mixture.append(task_name) - mixture_cap[task_name] = cap + # Check that the dataset_subset_tuple is in d4_train + for key, dataset_subset_tuples in d4_train: + if (dataset_name, subset_name) in dataset_subset_tuples: + d4_train_mixture[key].append(task_name) + mixture_cap[task_name] = cap + + # Check that the dataset_subset_tuplek is in d4_eval if (dataset_name, subset_name) in d4_eval: if template.metadata.original_task: d4_eval_mixture.append(task_name) @@ -300,7 +300,7 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map seqio.MixtureRegistry.add( "d4_train", - [task for task in d4_train_mixture if task not in TASK_BLACKLIST], + [task for task in d4_train_mixture["BASE"] if task not in TASK_BLACKLIST], default_rate=lambda t: mixture_cap[t.name], ) @@ -317,14 +317,14 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map # ) seqio.MixtureRegistry.add( - "d4_gpt_train", - [task for task in d4_train_mixture + gpt_train_mixture if task not in TASK_BLACKLIST], + "d4_gpt_eval_train", + [task for task in d4_train_mixture["BASE"] + d4_train_mixture["GPT_EVAL"] if task not in TASK_BLACKLIST], default_rate=lambda t: mixture_cap[t.name], ) seqio.MixtureRegistry.add( "d4_gpt_sglue_train", - [task for task in d4_train_mixture + gpt_train_mixture + sglue_train_mixture if task not in TASK_BLACKLIST], + [task for task in d4_train_mixture["BASE"] + d4_train_mixture["GPT_EVAL"] + d4_train_mixture["SGLUE"] if task not in TASK_BLACKLIST], default_rate=lambda t: mixture_cap[t.name], ) @@ -394,13 +394,13 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map seqio.MixtureRegistry.add( "d4_train_one_og_prompt", - [task for task in single_original_task.values() if task in d4_train_mixture and task not in TASK_BLACKLIST], + [task for task in single_original_task.values() if task in d4_train_mixture["BASE"] and task not in TASK_BLACKLIST], default_rate=lambda t: mixture_cap[t.name], ) seqio.MixtureRegistry.add( "d4_train_all_og_prompts", - [task for task in all_original_tasks if task in d4_train_mixture and task not in TASK_BLACKLIST], + [task for task in all_original_tasks if task in d4_train_mixture["BASE"] and task not in TASK_BLACKLIST], default_rate=lambda t: mixture_cap[t.name], ) From ad1737118db85633c9b63a03164bae3fe4972d03 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 4 Jan 2022 17:38:31 +0100 Subject: [PATCH 03/16] Cleanup --- requirements.txt | 5 ----- setup.py | 19 +++++++++++++++++++ t0/seqio_tasks/tasks.py | 7 +++---- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/requirements.txt b/requirements.txt index eadc3ca..c593c47 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,5 @@ git+git://github.com/bigscience-workshop/promptsource@v0.1.0 accelerate transformers -datasets -jinja2 torch seqio -sentencepiece -protobuf -scikit-learn \ No newline at end of file 
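
The requirements trimmed above do not disappear: the next diff moves them into setup.py, with the seqio-specific dependencies declared under a "seqio_tasks" extra and the prompted-task CSV still shipped through package_data. A minimal sketch of the consuming side (a hypothetical standalone script, not part of the patch; it assumes the package is installed so that the package_data lookup resolves, and load_train_buckets is illustrative rather than an actual tasks.py function):

import csv
from typing import Dict, List, Optional, Tuple

import pkg_resources

DatasetSubset = Tuple[str, Optional[str]]

def load_train_buckets() -> Dict[str, List[DatasetSubset]]:
    """Bucket (HF_name, subset) pairs by the do_train column, mirroring tasks.py."""
    buckets: Dict[str, List[DatasetSubset]] = {"BASE": [], "GPT_EVAL": [], "SGLUE": []}
    # resource_filename resolves inside the installed package, which is why the
    # CSV must stay listed under package_data in setup.py.
    path = pkg_resources.resource_filename("t0.seqio_tasks", "experiment_D4.csv")
    with open(path) as exp_file:
        for row in csv.DictReader(exp_file):
            subset = row["subset"] or None  # empty string -> None, to match promptsource
            if row["do_train"]:
                if row["do_train"] == "SGLUE":
                    assert row["HF_name"] == "super_glue"  # same sanity check as tasks.py
                buckets[row["do_train"]].append((row["HF_name"], subset))
    return buckets

if __name__ == "__main__":
    for bucket, datasets in load_train_buckets().items():
        print(bucket, len(datasets))
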
diff --git a/setup.py b/setup.py index f577325..aaecb92 100644 --- a/setup.py +++ b/setup.py @@ -22,6 +22,25 @@ packages=find_packages(), license="Apache Software License 2.0", long_description=readme, + install_requires=[ + "promptsource", + "accelerate", + "transformers", + "torch", + "datasets", + "jinja2", + "datasets", + "sentencepiece", + "protobuf", + "scikit-learn" + ], + extra_require={ + "seqio_tasks": [ + "seqio", + "t5", + "tensorflow", + ] + }, package_data={ "": [ "seqio_tasks/experiment_D4.csv", diff --git a/t0/seqio_tasks/tasks.py b/t0/seqio_tasks/tasks.py index 8a2d788..7331f89 100644 --- a/t0/seqio_tasks/tasks.py +++ b/t0/seqio_tasks/tasks.py @@ -4,15 +4,14 @@ import datasets import pkg_resources +import promptsource.templates import seqio import t5 -import tensorflow as tf from t5.data.glue_utils import get_glue_metric, get_super_glue_metric from t5.evaluation import metrics as mt +import tensorflow as tf -import promptsource.templates -from promptsource.seqio_tasks import utils - +from t0.seqio_tasks import utils GET_METRICS = { "BLEU": mt.bleu, From 53be911d1ce3410b4e3c44e4ef278d67c3336a3d Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 4 Jan 2022 17:50:34 +0100 Subject: [PATCH 04/16] Remove uneeded tasks --- t0/seqio_tasks/experiment_D4.csv | 198 +++---------------------------- t0/seqio_tasks/tasks.py | 43 ++++--- 2 files changed, 37 insertions(+), 204 deletions(-) diff --git a/t0/seqio_tasks/experiment_D4.csv b/t0/seqio_tasks/experiment_D4.csv index 206ba10..61b6bc8 100644 --- a/t0/seqio_tasks/experiment_D4.csv +++ b/t0/seqio_tasks/experiment_D4.csv @@ -1,21 +1,16 @@ HF_name,subset,task_by_convention,format,comment,seed_paper,do_train,do_eval,train_size,adjusted_train_size,metric,multiple correct answer,Paper link,non_linguistic_knowledge,Imported Task Name,imported category,input_length,_human_skill,Domain,Reference,done -crows_pairs,,bias_and_fairness,,test set only; authors themselves acknowledge some problems,Eval WG,,TRUE,,,,,,,,,,,,,TRUE -jigsaw_toxicity_pred,,bias_and_fairness,,current https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data ; want https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification,Eval WG,,TRUE,,,,,,,,,,,,,TRUE -super_glue,axg,bias_and_fairness,cls,test set only,Eval WG,,TRUE,,,,,,,,,,,,,TRUE -winogender,,bias_and_fairness,cls,also as axg in super_glue,Eval WG,,TRUE,,,,,,,,,,,,, -wino_bias,type1_anti,bias_and_fairness,cls,,Eval WG,,TRUE,,,,,,,,,,,,,TRUE -wino_bias,type2_anti,bias_and_fairness,cls,,Eval WG,,TRUE,,,,,,,,,,,,,TRUE -wino_bias,type1_pro,bias_and_fairness,cls,,Eval WG,,TRUE,,,,,,,,,,,,,TRUE -wino_bias,type2_pro,bias_and_fairness,cls,,Eval WG,,TRUE,,,,,,,,,,,,,TRUE -super_glue,wsc.fixed,coreference,cls,,,SGLUE,TRUE,554,0,accuracy,,https://arxiv.org/pdf/1905.00537.pdf,,superglue-wsc,cls/other,single sentence,knowledge-? reading comprehension,,Levesque et al. 2012,TRUE -winograd_wsc,wsc273,coreference,ext,,GPT,,TRUE,0,0,accuracy,,https://www.aaai.org/ocs/index.php/KR/KR12/paper/download/4492/4924,,,,,,,Levesque et al. 2012, -winogrande,winogrande_xl,coreference,ext,,GPT,,TRUE,40398,0,accuracy,,https://arxiv.org/pdf/1907.10641.pdf,,WinoGrande,qa/multiple-choice qa,,knowledge-? reading comprehension,,Sakaguchi et al. 
2020,TRUE -glue,cola,grammatical_acceptability,cls,includes semantic acceptability too; to be replaced by blimp,,,TRUE,8551,0,accuracy;matthews_corrcoef,,https://arxiv.org/pdf/1805.12471.pdf,,glue-cola,cls/other,single sentence,,,Warstadt et al. 2019, -super_glue,cb,NLI,cls,"""for multi-class F1 we compute the unweighted average of the F1 per class.""",,,TRUE,250,0,mean_multiclass_f1;accuracy,,https://semanticsarchive.net/Archive/Tg3ZGI2M/Marneffe.pdf,,superglue-cb,cls/nli,sentence pair,knowledge-neutral inference,,de Marneffe et al. 2019,TRUE -super_glue,rte,NLI,cls,,,,TRUE,2490,0,accuracy,,https://arxiv.org/pdf/1905.00537.pdf,,superglue-rte,cls/nli,sentence pair,knowledge modest inference,,Dagan et al. 2005; Bar-Haim et al. 2006 Giampiccolo et al. 2007; Bentivogli et al. 2009,TRUE -anli,,NLI,cls,"In addition to accuracy, paper also evaluates on range of relaxed/strict and matched/unmatched settings and reports F scores for different answers",,,TRUE,162865,0,accuracy,,https://arxiv.org/abs/1910.14599,,anli,cls/nli,sentence pair,knowledge modest inference,,Nie et al. 2020,TRUE -hans,,NLI,cls,,,,TRUE,0,0,accuracy,,https://arxiv.org/pdf/1902.01007.pdf,,,,sentence pair,syntax?,,McCoy et al. 2019, -super_glue,axb,NLI,cls,test set only,,,TRUE,0,0,,,,,,,,,,, +crows_pairs,,bias_and_fairness,,test set only; authors themselves acknowledge some problems,Eval WG,,BIAS_FAIRNESS,,,,,,,,,,,,,TRUE +jigsaw_toxicity_pred,,bias_and_fairness,,current https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data ; want https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification,Eval WG,,BIAS_FAIRNESS,,,,,,,,,,,,,TRUE +super_glue,axg,bias_and_fairness,cls,test set only,Eval WG,,BIAS_FAIRNESS,,,,,,,,,,,,,TRUE +wino_bias,type1_anti,bias_and_fairness,cls,,Eval WG,,BIAS_FAIRNESS,,,,,,,,,,,,,TRUE +wino_bias,type2_anti,bias_and_fairness,cls,,Eval WG,,BIAS_FAIRNESS,,,,,,,,,,,,,TRUE +wino_bias,type1_pro,bias_and_fairness,cls,,Eval WG,,BIAS_FAIRNESS,,,,,,,,,,,,,TRUE +wino_bias,type2_pro,bias_and_fairness,cls,,Eval WG,,BIAS_FAIRNESS,,,,,,,,,,,,,TRUE +super_glue,wsc.fixed,coreference,cls,,,SGLUE,BASE,554,0,accuracy,,https://arxiv.org/pdf/1905.00537.pdf,,superglue-wsc,cls/other,single sentence,knowledge-? reading comprehension,,Levesque et al. 2012,TRUE +winogrande,winogrande_xl,coreference,ext,,GPT,,BASE,40398,0,accuracy,,https://arxiv.org/pdf/1907.10641.pdf,,WinoGrande,qa/multiple-choice qa,,knowledge-? reading comprehension,,Sakaguchi et al. 2020,TRUE +super_glue,cb,NLI,cls,"""for multi-class F1 we compute the unweighted average of the F1 per class.""",,,BASE,250,0,mean_multiclass_f1;accuracy,,https://semanticsarchive.net/Archive/Tg3ZGI2M/Marneffe.pdf,,superglue-cb,cls/nli,sentence pair,knowledge-neutral inference,,de Marneffe et al. 2019,TRUE +super_glue,rte,NLI,cls,,,,BASE,2490,0,accuracy,,https://arxiv.org/pdf/1905.00537.pdf,,superglue-rte,cls/nli,sentence pair,knowledge modest inference,,Dagan et al. 2005; Bar-Haim et al. 2006 Giampiccolo et al. 2007; Bentivogli et al. 2009,TRUE +anli,,NLI,cls,"In addition to accuracy, paper also evaluates on range of relaxed/strict and matched/unmatched settings and reports F scores for different answers",,,BASE,162865,0,accuracy,,https://arxiv.org/abs/1910.14599,,anli,cls/nli,sentence pair,knowledge modest inference,,Nie et al. 
2020,TRUE glue,mrpc,paraphrase,cls,,,BASE,,3668,3668,accuracy;f1_score,,https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/I05-50025B15D.pdf,,glue-mrpc,cls/paraphrase,,paraphrase,,Dolan and Brockett 2005,TRUE glue,qqp,paraphrase,cls,,,BASE,,363846,363846,accuracy;f1_score,,https://aclanthology.org/I05-5002.pdf,,glue-qqp,cls/paraphrase,,,,(link),TRUE paws,labeled_final,paraphrase,cls,,,BASE,,49401,49401,,,,,paws,cls/paraphrase,,,,Zhang et al. 2019,TRUE @@ -25,7 +20,6 @@ chooses the correct answer and 1/k if it reports a k-way tie ai2_arc,ARC-Easy,QA_closed_book,cls,,GPT,GPT_EVAL,,2251,0,"accuracy_with_tie: For each question, a system receives 1 point if it chooses the correct answer and 1/k if it reports a k-way tie (i.e., chooses multiple answers) that includes the correct answer.",,https://arxiv.org/pdf/1803.05457.pdf,mid-intensive,ARC (easy),Multiple choice,,,,,TRUE -nq_open,,QA_closed_book,gen,,GPT,,TRUE,87925,0,kilt-exact_match;average_accuracy_accross_answers,TRUE,https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00276/43518/Natural-Questions-A-Benchmark-for-Question,intensive,Natural Questions (open domain),,,trivia,,, kilt_tasks,hotpotqa,QA_closed_book,gen,recast as closed-book due to input length,self,BASE,,88869,88869,,,,,kilt hotpotqa,qa/closed-book qa,,encyclopedia; multi-hop QA,,Yang et al. 2018,TRUE trivia_qa,unfiltered,QA_closed_book,gen,,GPT,GPT_EVAL,,87622,0,exact_match;f1_over_words => wikipedia aliases are considered valid answers,TRUE,https://arxiv.org/pdf/1705.03551.pdf,intensive,Trivia QA,,,,,,TRUE web_questions,,QA_closed_book,gen,"""supposed to be answerable by Freebase"" Check corpora deduplication with freebaseqa.",GPT,GPT_EVAL,,3778,0,accuracy : they don't mention how they normalize across multiple correct answers,TRUE,https://aclanthology.org/D13-1160.pdf,intensive,web questions,qa/closed-book qa,,,,Berant et al. 2013,TRUE @@ -33,26 +27,12 @@ wiki_qa,,QA_closed_book,cls,,CrossFit,BASE,,20360,20360,,,https://aclanthology.o adversarial_qa,dbidaf,QA_extractive,ext,,,BASE,,10000,10000,,,https://aclanthology.org/2020.tacl-1.43/,,adversarialqa,qa/machine reading comprehension,,,,Bartolo et al. 2020,TRUE adversarial_qa,dbert,QA_extractive,ext,,,BASE,,10000,10000,,,,,,,,,,,TRUE adversarial_qa,droberta,QA_extractive,ext,,,BASE,,10000,10000,,,,,,,,,,,TRUE -coqa,,QA_extractive,ext,GPT-easy,GPT,,TRUE,7199,,"macro_average_f1: for computing a model’s performance, each individual prediction is compared -against n human answers resulting in n F1 scores, -the maximum of which is chosen as the prediction’s -F1.For each question, we average out F1 across -these n sets, both for humans and models. In our -final evaluation, we use n = 4 human answers for -every question (the original answer and 3 additionally collected answers). The articles a, an and the -and punctuations are excluded in evaluation.",from the paper it seems it could contain multiple answers but the datasets has only one answer per question,https://arxiv.org/pdf/1808.07042.pdf,,,,,,,, duorc,SelfRC,QA_extractive,ext,,TaskEmbed;CrossFit,BASE,,60721,60721,,,https://duorc.github.io/,,DuoRC,qa/machine reading comprehension,,,Wikipedia/IMDB crowd,Saha et al. 2018,TRUE duorc,ParaphraseRC,QA_extractive,ext,,TaskEmbed;CrossFit,BASE,,69524,69524,,,https://arxiv.org/pdf/1804.07927.pdf,,DuoRC,paraphrased QA,,,,Saha et al. 2018,TRUE ropes,,QA_extractive,ext,,,BASE,,10924,10924,,,,modest,ropes,Extractive QA,,cause_and_effect;nontrivial_comprehension,,Lin et al. 
2019,TRUE squad_v2,,QA_extractive,ext,,GPT,GPT_EVAL,,130319,0,exact_match;f1_score,TRUE,https://arxiv.org/pdf/1806.03822.pdf,,SQuAD 2.0,Extractive QA,,,,Rajpurkar et al. 2018,TRUE super_glue,record,QA_extractive,ext,,,SGLUE,,100730,0,max_token_level_f1;exact_match,TRUE,https://arxiv.org/pdf/1810.12885.pdf,,superglue-record,qa/machine reading comprehension,,knowledge-? reading comprehension,,Zhang et al. 2018,TRUE -qa_srl,,QA_extractive,ext,"need non-naive metric (""If the predicted word is contained inside the annotated answer span it is considered a correct prediction.""); v2 not in HF https://aclanthology.org/P18-1191.pdf",Eval WG,,TRUE,6414,0,accuracy,TRUE,https://dada.cs.washington.edu/qasrl/#page-top,neutral,qa srl,other,,semantic role,,He et al. 2015, -quac,,QA_extractive,ext,,GPT,,TRUE,11567,,"average_maximum_f1;HEQ-Q;HEQ-D: To make oracle human and system performance comparable, -given n references, we report the average of the -maximum F1 computed from each n − 1 subset -with respect to the heldout reference.",TRUE,https://arxiv.org/pdf/1808.07036.pdf,,,,,dialogue,,, quoref,,QA_extractive,ext,,,BASE,,19399,19399,,,https://aclanthology.org/D19-1606.pdf,,Quoref,Extractive QA,,,,Dasigi et al. 2019,TRUE -drop,,QA_generative,gen,"nontrivial math; try history_690, it's pretty hard even when I have domain knowledge",GPT,,TRUE,,,exact_match; macro_average_f1,TRUE,https://aclanthology.org/N19-1246.pdf,,DROP ,multi-hop quantitative reasoning; Abstractive QA,,numerical,Wikipedia crowd,Dua et al. 2019, cos_e,v1.11,QA_multiple_choice,cls,"same as commonsense_qa but with (poorly sourced) human explanations; questionable ""commonsense"" lots of world knowledge",Vania,BASE,,9741,9741,,,,,cos e,other/generate explanation,,,,Rajani et al. 2019,TRUE cosmos_qa,,QA_multiple_choice,cls,,,BASE,,25262,25262,,,,,cosmos qa,qa/multiple-choice qa,,,,Huang et al. 2019,TRUE dream,,QA_multiple_choice,cls,,,BASE,,6116,6116,,,,,dream,qa/multiple-choice qa,,,,Sun et al. 2019,TRUE @@ -68,22 +48,18 @@ race,middle,QA_multiple_choice,cls,"revisit: define as comprehension, paragraph sciq,,QA_multiple_choice,cls,,,BASE,,11679,11679,,,,,sciq,qa/multiple-choice qa,,,,Welbl et al. 2017,TRUE social_i_qa,,QA_multiple_choice,cls,metric differ by prompt: 4-way classification cast as binary ,,BASE,,33410,33410,accuracy,,https://arxiv.org/pdf/1904.09728.pdf,,SIQA,qa/multiple-choice qa,,cultural knowledge,,Sap et al. 2019,TRUE super_glue,boolq,QA_multiple_choice,cls,,,SGLUE,,9427,0,accuracy,,https://arxiv.org/pdf/1905.10044.pdf,neutral?,superglue-boolq,,,knowledge-? reading comprehension,,,TRUE -super_glue,copa,QA_multiple_choice,cls,,,SGLUE,TRUE,400,0,accuracy,,http://commonsensereasoning.org/2011/papers/Roemmele.pdf,modest,superglue-copa,qa/multiple-choice qa,,causal cognition,,Gordon et al. 2012,TRUE +super_glue,copa,QA_multiple_choice,cls,,,SGLUE,BASE,400,0,accuracy,,http://commonsensereasoning.org/2011/papers/Roemmele.pdf,modest,superglue-copa,qa/multiple-choice qa,,causal cognition,,Gordon et al. 2012,TRUE super_glue,multirc,QA_multiple_choice,cls,F1 over all answer options. See paper p. 259 for defintion,,SGLUE,,27243,0,f1_over_all_options;exact_match,,https://aclanthology.org/N18-1023.pdf,neutral?,superglue-multirc,qa/multiple-choice qa,,knowledge-? reading comprehension,,Khashabi et al. 2018,TRUE wiki_hop,original,QA_multiple_choice,cls,,,BASE,,43738,43738,,,https://transacl.org/ojs/index.php/tacl/article/viewFile/1325/299,,WikiHop (Welbl et al. 
2018),multi-hop QA,,,Wikipedia KB,,TRUE wiqa,,QA_multiple_choice,cls,,,BASE,,29808,29808,,,,,wiqa,qa/multiple-choice qa,,cause_and_effect,,Tandon et al. 2019,TRUE -circa,,QA_multiple_choice,cls,revisit: problematic prompts,,,TRUE,34268,0,mean_multiclass_f1;accuracy,,https://arxiv.org/pdf/2010.03450.pdf,,circa,cls/other,,pragmatics,,Louis et al. 2020, -mc_taco,,QA_multiple_choice,cls,no train set; variable number of answer_chocies; eval in paper is over set of possible candidates;,,,TRUE,0,0,exact_match; f1_score,,https://arxiv.org/pdf/1909.03065.pdf,,mc taco,qa/binary,,temporal cognition,,Zhou et al. 2019, piqa,,QA_multiple_choice,cls,revisit: not just other,GPT,GPT_EVAL,,16113,0,accuracy,,https://arxiv.org/pdf/1911.11641.pdf,,PIQA,Multiple choice,,physical_cognition,,Bisk et al. 2020,TRUE amazon_polarity,,sentiment,cls,,,BASE,,3600000,500000,,,https://cs.stanford.edu/people/jure/pubs/reviews-recsys13.pdf,,amazon polarity,cls/sentiment analysis,,,,McAuley and Leskovec 2013,TRUE app_reviews,,sentiment,cls,,,BASE,,288065,288065,,,,,app reviews,other/regression,,,,Missing,TRUE imdb,,sentiment,cls,,,BASE,,25000,25000,,,,,imdb,cls/sentiment analysis,,no dev set,,Maas et al. 2011,TRUE rotten_tomatoes,,sentiment,cls,,,BASE,,8530,8530,,,,,rotten tomatoes,cls/sentiment analysis,,,,Pang and Lee 2005,TRUE yelp_review_full,,sentiment,cls,no dev set,,BASE,,650000,500000,,,,,yelp review full,other/regression,,,,Zhang et al. 2015; (link),TRUE -lambada,,story_completion,gen,revisit: story or cloze or coref? trivial cloze prompt; training set is just unlabeled corpora; GPT task,GPT,,TRUE,0,0,accuracy;perplexity;median_rank,,https://arxiv.org/pdf/1606.06031.pdf,,,,,,,, -craffel/openai_lambada,,story_completion,gen,revisit: story or cloze or coref? trivial cloze prompt; training set is just unlabeled corpora; GPT task,GPT,,TRUE,0,0,accuracy;perplexity;median_rank,,https://arxiv.org/pdf/1606.06031.pdf,,,,,,,, -story_cloze,2016,story_completion,cls,todo: custom loading; swag like?,GPT,,TRUE,,0,accuracy,,https://arxiv.org/pdf/1604.01696.pdf,,,,,,,,TRUE -hellaswag,,story_completion,cls,,GPT,GPT_EVAL,TRUE,39905,0,accuracy,,https://arxiv.org/pdf/1905.07830.pdf,,hellaswag,qa/multiple-choice qa,,,,Zellers et al. 2019,TRUE +story_cloze,2016,story_completion,cls,todo: custom loading; swag like?,GPT,,BASE,,0,accuracy,,https://arxiv.org/pdf/1604.01696.pdf,,,,,,,,TRUE +hellaswag,,story_completion,cls,,GPT,GPT_EVAL,BASE,39905,0,accuracy,,https://arxiv.org/pdf/1905.07830.pdf,,hellaswag,qa/multiple-choice qa,,,,Zellers et al. 2019,TRUE common_gen,,structure_to_text,gen,,,BASE,,67389,67389,,,,,common gen,other,,,,Lin et al. 2020b,TRUE wiki_bio,,structure_to_text,gen,,,BASE,,582659,500000,,,,,wiki bio,cg/other,,,,Lebret et al. 2016,TRUE cnn_dailymail,3.0.0,summarization,gen,,,BASE,,287113,287113,,,,,,,,,,,TRUE @@ -94,146 +70,4 @@ xsum,,summarization,gen,,,BASE,,204045,204045,rouge,,https://arxiv.org/pdf/1808. ag_news,,topic_classification,cls,,,BASE,,120000,120000,,,http://groups.di.unipi.it/~gulli/AG_corpus_of_news_articles.html,,ag news,cls/topic,,,,Gulli (link),TRUE dbpedia_14,,topic_classification,cls,,,BASE,,560000,500000,,,https://svn.aksw.org/papers/2013/SWJ_DBpedia/public.pdf,,dbpedia 14,cls/topic,,,,Lehmann et al. 2015,TRUE trec,,topic_classification,cls,,,BASE,,5452,5452,,,https://trec.nist.gov/data/qa.html,,trec,cls/other,,,,Li and Roth 2002; Hovy et al. 
2001,TRUE -super_glue,wic,word_sense_disambiguation,cls,,,SGLUE,TRUE,5428,0,accuracy,,https://arxiv.org/pdf/1808.09121.pdf,,superglue-wic,cls/other,,lexical_knowledge,,Pilehvar and Camacho-Collados 2019,TRUE -Staging Area,,,,,,,,,,,,,,,,,,,, -Would Include but not in HF or some other practical limitations,,,,,,,,,,,,,,,,,,,, -definite_pronoun_resolution,,coreference,,todo: download error,,,,,,,,,,definite pronoun resolution,other,,,,Rahman and Ng 2012, -jeopardy,,closed-book qa,gen,sporadic download error,CrossFit,,,,,,,,,jeopardy,qa/closed-book qa,,,,(link), -blimp,,,cls,no prompts yet; collapse subsets,,,,,0,,,,,,,,,,, -Hendrycks et al. 2021,,,,https://arxiv.org/abs/2009.03300v3,,,,,,,,,,,,,,,, -Multi-Turn Dialogue Reasoning,,,,https://aclanthology.org/2020.acl-main.130.pdf,Vania,,,7088,,,,,,,,,,,, -Argument Reasoning Comprehension Task,,,,https://aclanthology.org/N18-1175.pdf,Vania,,,1211,,,,,,,,,,,, -MCScript,,,,https://aclanthology.org/L18-1564.pdf,Vania,,,14191,,,,,,,,,,,, -narrativeqa,,,,very long input sequence,,,,,,,,,,NarQA,Abstractive QA,,,,, -newsqa,,,,download error,TaskEmbed,,,,,,,,,NewsQA,Extractive QA,,,,Trischler et al. 2017, -eli5,,,,dataset split error,CrossFit,,,,,,,https://facebookresearch.github.io/ELI5/explore.html,,eli5-askh,qa/long-form qa,,possibly knowledge-neutral,,Fan et al. 2019, -Maybe Reconsider,,,,,,,,,,,,,,,,,,,, -zest,,,,its original task is quite complex (need to provide a decision function); should be held-out eval only,self,,,,,,,,,,,,,,, -swag,,story_completion,cls,revisit whether this should be considered as a variant of NLI,,,,73546,0,,,,,swag,qa/multiple-choice qa,,,,Zellers et al. 2018, -codah,codah,story_completion,cls,a variant of swag revisit whether this should be considered as a variant of NLI,,,,2776,0,,,,,codah,qa/multiple-choice qa,,,,Chen et al. 2019, -wiki_auto,,,,revisit: lots of duplicate simplified text; novel generative task could be very challenging,CrossFit,,,,,,,,,wiki auto,cls/other,,text simplification,,Jiang et al. 2020, -proto_qa,,,gen,"generate prototypical concepts, kinda niche format with multiple correct answers",CrossFit,,,,,,,,,proto qa,other,,,,Boratko et al. 2020, -empathetic_dialogues,,,,generation? classification?,CrossFit,,,,,,,https://arxiv.org/pdf/1811.00207.pdf,,empathetic dialogues,cg/dialogue,,,,Rashkin et al. 2019, -qed,,,,uses held-out Natural Questions,,,,,,,,,,,,,,,, -kilt_tasks,aidayago2,,,,,,,,,,,,,kilt ay2,other/entity linking,,encyclopedia,,Hoffart et al. 2011, -kilt_tasks,wow,,,,,,,,,,,,,kilt wow,cg/dialogue,,encyclopedia,,Dinan et al. 2019, -lama,conceptnet,,,,,,,,,,,,,lama-conceptnet,qa/closed-book qa,,encyclopedia,,Petroni et al. 2019 2020, -lama,google_re,,,,,,,,,,,,,lama-google re,qa/closed-book qa,,encyclopedia,,Petroni et al. 2019 2020, -lama,squad,,,,,,,,,,,,,lama-squad,qa/closed-book qa,,encyclopedia,,Petroni et al. 2019 2020, -lama,trex,,,,,,,,,,,,,lama-trex,qa/closed-book qa,,encyclopedia,,Petroni et al. 2019 2020, -limit,,physical cognition,,,,,,,,,,https://aclanthology.org/2020.findings-emnlp.88.pdf,,limit,other,,physical semantic repr.,,Manotas et al. 2020, -kilt_tasks,fever,,,revisit whether this should be considered as a variant of NLI,,,,,,,,,,kilt fever,cls/fact checking,,encyclopedia,,Thorne et al. 
2018, -Skipped,,,,,,,,,,,,,,,,,,,, -fever,v2.0,closed-book qa/fact checking,,also in KILT,,,,,,,,,,FEVER,,,,,, -hotpot_qa,distractor,,,also in KILT,,,,,,,,,,Hotpot QA,,,,,, -hotpot_qa,fullwiki,,,also in KILT,,,,,,,,,,Hotpot QA,,,,,, -emo,,sentiment,cls,skip: offensive and ungrammatical text,,,,30160,0,precision;recall;F1,,https://aclanthology.org/S19-2005.pdf,,emo,cls/emotion,,,,Chatterjee et al. 2019, -freebase_qa,,QA_closed_book,gen,"need to be held out because web_questions is ""supposed to be answerable by Freebase""",,,,20358,0,,,,intensive,freebase qa,qa/closed-book qa,,,,Jiang et al. 2019, -aqua_rat,,,,,,,,,,,,https://arxiv.org/abs/1705.04146,,aqua rat,qa/multiple-choice qa,,nontrivial math,,Ling et al. 2017, -math_qa,,,,,,,,,,,,,,math qa,qa/multiple-choice qa,,nontrivial math,,Amini et al. 2019, -numer_sense,,,,,,,,,,,,,,numer sense,qa/closed-book qa,,numerical knowledge,,Lin et al. 2020a, -squad_adversarial,,,,,,,,,,,,,,,,,,,, -squadshifts,,,,,,,,,,,,,,,,,,,, -sms_spam,,,,,,,,,,,,,,sms spam,cls/other,,,,Almeida et al. 2011, -search_qa,,,,,,,,,,,,,,search qa,qa/closed-book qa,,,,Dunn et al. 2017, -kilt_tasks,trex,,,,,,,,,,,,,kilt trex,qa/closed-book qa,,encyclopedia,,Elsahar et al. 2018, -kilt_tasks,structured_zeroshot,,,,,,,,,,,,,kilt zsre,qa/closed-book qa,,encyclopedia,,Levy et al. 2017, -spider,,,,,,,,,,,,,,spider,cg/other,,,,Yu et al. 2018, -wikisql,,,,,,,,,,,,,,wikisql,cg/other,,,,Zhong et al. 2017, -com_qa,,,,,CrossFit,,,,,,,https://arxiv.org/pdf/1809.09528.pdf,,ComQA (Abujabal et al. 2019),factoid QA w/ paraphrases,,,snippets WikiAnswers,, -climate_fever,,,,revisit whether this should be considered as a variant of NLI,,,,,,,,,,climate fever,cls/fact checking,,,,Diggelmann et al. 2020, -art,,,,,,,,,,,,https://arxiv.org/pdf/1908.05739.pdf,,art (abductive nli),other,,,,Bhagavatula et al. 2020, -glue,mnli,classification_NLI,,,,,,,,,,,,glue-mnli,cls/nli,,,,Williams et al. 2018, -glue,qnli,classification_NLI,,,,,,,,,,,,glue-qnli,cls/nli,,,,Rajpurkar et al. 2016, -glue,wnli,classification_NLI,,,,,,,,,,,,glue-wnli,cls/nli,,,,Levesque et al. 2012, -,,classification_NLI,,,,,,,,,,,,scitail,cls/nli,,,,Khot et al. 2018, -,,classification_NLI,,,,,,,,,,,,sick,cls/nli,,,,Marelli et al. 2014, -,,classification_NLI,,,,,,,,,,,,SNLI (Bowman et al. 2015),NLI,,,misc.,, -aeslc,,,,summarization by email subject line,,,,,,,,https://arxiv.org/abs/1906.03497,,aeslc,cg/summarization,,generation,,Zhang and Tetreault 2019, -onestop_english,,,,,,,,,,,,https://aclanthology.org/W18-0535.pdf,,onestop english,cls/other,,,,Vajjala and Luˇci´c 2018, -mocha,,,,,,,,,,,,,,mocha,other/regression,,,,Chen et al. 2020a, -commonsense_qa,,,,duplicate with cos_e,Vania,,,9741,,,,https://arxiv.org/pdf/1811.00937.pdf,,Commonsense QA,qa/multiple-choice qa,,,,Talmor et al. 2019, -,,,,,,,,,,,,,,emotion,cls/emotion,,,,Saravia et al. 2018, -,,,,the authors themselves seem to have renounced their own work,,,,,,,,https://github.com/nyu-mll/crows-pairs,,crows pairs,other,,,,Nangia et al. 2020, -,,,,,,,,,,,,,,ethos-directed vs generalized,cls/hate speech detection,,,,Mollas et al. 2020, -,,,,,,,,,,,,,,ethos-disability,cls/hate speech detection,,,,Mollas et al. 2020, -,,,,,,,,,,,,,,ethos-gender,cls/hate speech detection,,,,Mollas et al. 2020, -,,,,,,,,,,,,,,ethos-national origin,cls/hate speech detection,,,,Mollas et al. 2020, -,,,,,,,,,,,,,,ethos-race,cls/hate speech detection,,,,Mollas et al. 2020, -,,,,,,,,,,,,,,ethos-religion,cls/hate speech detection,,,,Mollas et al. 
2020, -,,,,,,,,,,,,,,ethos-sexual orientation,cls/hate speech detection,,,,Mollas et al. 2020, -,,,,,,,,,,,,,,hate speech offensive,cls/hate speech detection,,,,Davidson et al. 2017, -,,,,,,,,,,,,,,hate speech18,cls/hate speech detection,,,,de Gibert et al. 2018, -,,,,,,,,,,,,,,hatexplain,cls/hate speech detection,,,,Mathew et al. 2020, -,,,,,,,,,,,,,,reddit tifu-title,cg/summarization,,,,Kim et al. 2019, -,,,,,,,,,,,,,,reddit tifu-tldr,cg/summarization,,,,Kim et al. 2019, -,,,,,,,,,,,,,,tweet eval-emoji,cls/emotion,,,,Barbieri et al. 2020, -,,,,,,,,,,,,,,tweet eval-emotion,cls/emotion,,,,Barbieri et al. 2020, -,,,,,,,,,,,,,,tweet eval-hate,cls/emotion,,,,Barbieri et al. 2020, -,,,,,,,,,,,,,,tweet eval-irony,cls/emotion,,,,Barbieri et al. 2020, -,,,,,,,,,,,,,,tweet eval-offensive,cls/emotion,,,,Barbieri et al. 2020, -,,,,,,,,,,,,,,tweet eval-sentiment,cls/emotion,,,,Barbieri et al. 2020, -,,,,,,,,,,,,,,tweet eval-stance abortion,cls/emotion,,,,Barbieri et al. 2020, -,,,,,,,,,,,,,,tweet eval-stance atheism,cls/emotion,,,,Barbieri et al. 2020, -,,,,,,,,,,,,,,tweet eval-stance climate,cls/emotion,,,,Barbieri et al. 2020, -,,,,,,,,,,,,,,tweet eval-stance feminist,cls/emotion,,,,Barbieri et al. 2020, -,,,,,,,,,,,,,,tweet eval-stance hillary,cls/emotion,,,,Barbieri et al. 2020, -,,,,,,,,,,,,,,tweet qa,qa/machine reading comprehension,,,,Xiong et al. 2019, -yelp_polarity,,,,,,,,,,,,,,yelp polarity,cls/sentiment analysis,,,,Zhang et al. 2015; (link), -quora,,,,,,,,,,,,https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs,,QQP,paraphrase identification,,,social QA,Iyer et al. 2017, -squad,,,,,,,,,,,,,,SQuAD 1.1,Extractive QA,,,,, -yahoo_answers_topics,,,,,,,,,,,,,,yahoo answers topics,cls/topic,,,,(link), -tab_fact,,,,,,,,,,,,,,tab fact,cls/fact checking,,,,Chen et al. 2020b, -,,,,,,,,,,,,,,blimp-anaphor gender agreement,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020, -,,,,,,,,,,,,,,blimp-anaphor number agreement,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020, -,,,,,,,,,,,,,,blimp-determiner noun agreement with adj irregular 1,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020, -,,,,,,,,,,,,,,blimp-ellipsis n bar 1,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020, -,,,,,,,,,,,,,,blimp-ellipsis n bar 2,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020, -,,,,,,,,,,,,,,blimp-existential there quantifiers 1,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020, -,,,,,,,,,,,,,,blimp-irregular past participle adjectives,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020, -,,,,,,,,,,,,,,blimp-sentential negation npi licensor present,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020, -,,,,,,,,,,,,,,blimp-sentential negation npi scope,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020, -,,,,,,,,,,,,,,blimp-wh questions object gap,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020, -poem_sentiment,,,,,,,,,,,,,,poem sentiment,cls/sentiment analysis,,creativity,,Sheng and Uthus 2020, -acronym_identification,,,,,,,,,,,,https://arxiv.org/pdf/2010.14678.pdf,,acronym identification,other,,,,Pouran Ben Veyseh et al. 2020, -google_wellformed_query,,,,revisit whether to exclude fine-grain regression tasks,,,,,,,,,,google wellformed query,cls/other,,,,Faruqui and Das 2018, -liar,,,,revisit whether to exclude fine-grain regression tasks,,,,,,,,,,liar,cls/fact checking,,,,Wang 2017, -,,,,,,,,,,,,,,break-QDMR-high-level,other,,semantic representation,,Wolfson et al. 2020, -,,,,,,,,,,,,,,crawl domain,other,,,,Zhang et al. 
2020, -discovery,discovery,,,,,,,,,,,,,discovery,cls/other,,generative-ish,,Sileo et al. 2019, -wiki_split,,,,,,,,,,,,,,wiki split,cg/other,,,,Botha et al. 2018, -,,,,,,,,,,,,,,aslg pc12,other,,,,Othman and Jemni 2012, -,,,,,,,,,,,,,,CCG (Hockenmaier and Steedman 2007),CCG supertagging,,syntax,Penn Treebank,, -,,,,,,,,,,,,,,Chunk (Tjong Kim Sang and Buchholz 2000),syntactic chunking,,syntax,Penn Treebank,, -,,,,,,,,,,,,,,Conj (Ficler and Goldberg 2016),conjunct identification,,syntax,Penn Treebank,, -,,,,,,,,,,,,,,GED (Yannakoudakis et al. 2011),grammatical error detection,,syntax,misc.,, -,,,,,,,,,,,,,,GGParent (Liu et al. 2019a),syntactic tagging,,syntax,Penn Treebank,, -,,,,,,,,,,,,,,GParent (Liu et al. 2019a),syntactic tagging,,syntax,Penn Treebank,, -,,,,,,,,,,,,,,NER (Tjong Kim Sang and De Meulder 2003),named entity recognition,,,news,, -,,,,,,,,,,,,,,Parent (Liu et al. 2019a),syntactic tagging,,syntax; constituency,Penn Treebank,, -,,,,,,,,,,,,,,POS-EWT (Silveira et al. 2014),part-of-speech tagging,,syntax,Web Treebank,, -,,,,,,,,,,,,,,POS-PTB (Marcus et al. 1993),part-of-speech tagging,,syntax,Penn Treebank,, -,,,,,,,,,,,,,,ST (Bjerva et al. 2016),semantic tagging,,,Groningen Meaning Bank,, -financial_phrasebank,,,,,,,,,,,,,,financial phrasebank,cls/sentiment analysis,,,,Malo et al. 2014, -health_fact,,,,,,,,,,,,,,health fact,cls/fact checking,,,,Kotonya and Toni 2020, -,,,,,,,,,,,,http://www.sciencedirect.com/science/article/pii/S1532046412000615,,ade corpus v2-classification,cls/other,,,,Gurulingappa et al. 2012, -,,,,,,,,,,,,,,ade corpus v2-dosage,other/slot filling,,,,Gurulingappa et al. 2012, -,,,,,,,,,,,,,,ade corpus v2-effect,other/slot filling,,,,Gurulingappa et al. 2012, -,,,,,,,,,,,,,,biomrc,qa/machine reading comprehension,,,,Pappas et al. 2020, -,,,,,,,,,,,,,,medical questions pairs,cls/paraphrase,,,,McCreery et al. 2020, -scicite,,,,,,,,,,,,,,scicite,cls/other,,,,Cohan et al. 2019, -,,,,,,,,,,,,,,break-QDMR,other,,logical form,,Wolfson et al. 2020, -,,,,,,,,,,,,,,e2e nlg cleaned,other,,,,Duˇsek et al. 2020 2019, -glue,sst2,,,,,,,,,,,,,glue-sst2,cls/sentiment analysis,,,,Socher et al. 2013, -glue,stsb,fine-grain regression,,,,,,,,,,,,glue-stsb,semantic similarity,,,misc.,, -,,,,,,,,,,,,,,squad-no context,qa/closed-book qa,,,,Rajpurkar et al. 2016, -,,,,,,,,,,,,,,squad-with context,qa/machine reading comprehension,,,,Rajpurkar et al. 2016, -,,,,contrast sets,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,BoolQ-CS,Binary yes/no,,,,, -,,,,,,,,,,,,https://aclanthology.org/C16-1236.pdf,,CQ (Bao et al. 2016),knowledge-based QA,,,snippets web queries/KB,, -,,,,contrast sets,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,DROP-CS,Abstractive QA,,,,, -,,,,,,,,,,,,https://aclanthology.org/D13-1020.pdf,,MCTest,Multiple choice,,,,, -,,,,,,,,,,,,,,MRPC (Dolan and Brockett 2005),paraphrase identification,,,news,, -,,,,"""naturally perturbed"" version of BoolQ",,,,,,,,https://arxiv.org/pdf/2004.04849.pdf,,NP-BoolQ,Binary yes/no,,,,, -,,,,,,,,,,,,https://aclanthology.org/D19-1608.pdf,,quartz-no knowledge,qa/multiple-choice qa,,,,Tafjord et al. 
2019b, -,,,,contrast sets,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,Quoref-CS,Extractive QA,,,,, -,,,,contrast sets,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,ROPES-CS,Extractive QA,,,,, +super_glue,wic,word_sense_disambiguation,cls,,,SGLUE,BASE,5428,0,accuracy,,https://arxiv.org/pdf/1808.09121.pdf,,superglue-wic,cls/other,,lexical_knowledge,,Pilehvar and Camacho-Collados 2019,TRUE diff --git a/t0/seqio_tasks/tasks.py b/t0/seqio_tasks/tasks.py index 7331f89..3707831 100644 --- a/t0/seqio_tasks/tasks.py +++ b/t0/seqio_tasks/tasks.py @@ -138,7 +138,10 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map datatset_subset_tuple = Tuple[str, Optional[str]] -d4_eval: List[datatset_subset_tuple] = [] +d4_eval: Dict[str, List[datatset_subset_tuple]] = { + "BASE": [], + "BIAS_FAIRNESS": [] +} d4_train: Dict[str, List[datatset_subset_tuple]] = { "BASE": [], # GPT3 evaluation set @@ -147,7 +150,6 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map "SGLUE": [] } -bias_fairness_eval: List[datatset_subset_tuple] = [] gsheet: Dict[datatset_subset_tuple, Dict] = {} experiment_path = pkg_resources.resource_filename(__name__, "experiment_D4.csv") with open(experiment_path) as exp_file: @@ -164,24 +166,21 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map if do_train_source == "SGLUE": assert dataset_subset[0] == "super_glue" d4_train[do_train_source].append(dataset_subset) - if row["do_eval"] == "TRUE": - d4_eval.append(dataset_subset) - if ( - row["do_eval"] == "TRUE" - and row["task_by_convention"] == "bias_and_fairness" - and row["HF_name"] != "winogender" - ): - bias_fairness_eval.append(dataset_subset) + if row["do_eval"] != "": + do_eval_source = row["do_eval"] + # sanity checks + if do_eval_source == "BIAS_FAIRNESS": + assert row["task_by_convention"] == "bias_and_fairness" + d4_eval[do_eval_source].append(dataset_subset) gsheet[dataset_subset] = row -all_datasets = sum(d4_train.values()) + d4_eval + bias_fairness_eval +all_datasets = sum(d4_train.values()) + sum(d4_train.values()) all_templates = promptsource.templates.TemplateCollection() all_templates.remove("anli") # Need to special-case ANLI due to weird split conventions # 3 stages of training/ablation: D4 -> GPT -> SuperGLUE -d4_train_mixture: Dict[str,List[str]] = {key: [] for key in d4_train } -d4_eval_mixture: List[str] = [] -bias_fairness_eval_mixture: List[str] = [] +d4_train_mixture: Dict[str,List[str]] = {key: [] for key in d4_train} +d4_eval_mixture: Dict[str,List[str]] = {key: [] for key in d4_eval} mixture_cap: Dict[str, int] = {} single_original_task: Dict[Tuple[str, str], str] = {} all_original_tasks: List[str] = [] @@ -220,13 +219,13 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map d4_train_mixture[key].append(task_name) mixture_cap[task_name] = cap - # Check that the dataset_subset_tuplek is in d4_eval - if (dataset_name, subset_name) in d4_eval: + # Check that the dataset_subset_tuple is in d4_eval + if (dataset_name, subset_name) in d4_eval["BASE"]: if template.metadata.original_task: - d4_eval_mixture.append(task_name) + d4_eval_mixture["BASE"].append(task_name) # TODO use template.metadata.answer_choices here for rank eval - if (dataset_name, subset_name) in bias_fairness_eval: - bias_fairness_eval_mixture.append(task_name) + if (dataset_name, subset_name) in d4_eval["BIAS_FAIRNESS"]: + d4_eval_mixture["BIAS_FAIRNESS"].append(task_name) # Special case for ANLI, which has weirdly-named splits 
and rounds that should be subsets dataset_name, subset_name = ("anli", None) @@ -243,7 +242,7 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map template = dataset[template_name] if template.metadata.original_task: - d4_eval_mixture.append(task_name) # TODO or add to ANLI special mixture + d4_eval_mixture["BASE"].append(task_name) # TODO or add to ANLI special mixture # TODO use template.metadata.answer_choices here for rank eval @@ -340,7 +339,7 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map task for task in seqio.TaskRegistry.names() if task.endswith("_score_eval") - and task.split("_score_eval")[0] in d4_eval_mixture + and task.split("_score_eval")[0] in d4_eval_mixture["BASE"] and task.split("_score_eval")[0] not in TASK_BLACKLIST ], default_rate=functools.partial(seqio.mixing_rate_num_examples, maximum=500_000), @@ -414,7 +413,7 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map [ task for task in seqio.TaskRegistry.names() - if task.endswith("_score_eval") and task.split("_score_eval")[0] in bias_fairness_eval_mixture + if task.endswith("_score_eval") and task.split("_score_eval")[0] in d4_eval_mixture["BIAS_FAIRNESS"] ], default_rate=functools.partial(seqio.mixing_rate_num_examples, maximum=500_000), ) From 516b0919401f5902f3e96fd609164639b2d7ec5a Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 4 Jan 2022 17:56:34 +0100 Subject: [PATCH 05/16] Comment out unecessary changes --- t0/seqio_tasks/tasks.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/t0/seqio_tasks/tasks.py b/t0/seqio_tasks/tasks.py index 3707831..7e89315 100644 --- a/t0/seqio_tasks/tasks.py +++ b/t0/seqio_tasks/tasks.py @@ -275,26 +275,26 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map "gigaword_summarize_", ] -# Tasks that failed caching (won't try to fix them for now) - remove when we are done -D4_TRAIN_SCORE_EVAL_TASK_BLACKLIST = [ - "amazon_polarity_Is_this_product_review_positive_score_eval", - "amazon_polarity_Is_this_review_negative_score_eval", - "amazon_polarity_Is_this_review_score_eval", - "amazon_polarity_User_recommend_this_product_score_eval", - "amazon_polarity_convey_negative_or_positive_sentiment_score_eval", - "amazon_polarity_flattering_or_not_score_eval", - "amazon_polarity_negative_or_positive_tone_score_eval", - "amazon_polarity_user_satisfied_score_eval", - "amazon_polarity_would_you_buy_score_eval", - "dbpedia_14_given_a_choice_of_categories__score_eval", - "dbpedia_14_given_list_what_category_does_the_paragraph_belong_to_score_eval", - "dbpedia_14_pick_one_category_for_the_following_text_score_eval", - "wiki_hop_original_choose_best_object_affirmative_1_score_eval", - "wiki_hop_original_choose_best_object_affirmative_2_score_eval", - "wiki_hop_original_choose_best_object_affirmative_3_score_eval", - "wiki_hop_original_choose_best_object_interrogative_1_score_eval", - "wiki_hop_original_choose_best_object_interrogative_2_score_eval", -] +# # Tasks that failed caching (won't try to fix them for now) - remove when we are done +# D4_TRAIN_SCORE_EVAL_TASK_BLACKLIST = [ +# "amazon_polarity_Is_this_product_review_positive_score_eval", +# "amazon_polarity_Is_this_review_negative_score_eval", +# "amazon_polarity_Is_this_review_score_eval", +# "amazon_polarity_User_recommend_this_product_score_eval", +# 
"amazon_polarity_convey_negative_or_positive_sentiment_score_eval", +# "amazon_polarity_flattering_or_not_score_eval", +# "amazon_polarity_negative_or_positive_tone_score_eval", +# "amazon_polarity_user_satisfied_score_eval", +# "amazon_polarity_would_you_buy_score_eval", +# "dbpedia_14_given_a_choice_of_categories__score_eval", +# "dbpedia_14_given_list_what_category_does_the_paragraph_belong_to_score_eval", +# "dbpedia_14_pick_one_category_for_the_following_text_score_eval", +# "wiki_hop_original_choose_best_object_affirmative_1_score_eval", +# "wiki_hop_original_choose_best_object_affirmative_2_score_eval", +# "wiki_hop_original_choose_best_object_affirmative_3_score_eval", +# "wiki_hop_original_choose_best_object_interrogative_1_score_eval", +# "wiki_hop_original_choose_best_object_interrogative_2_score_eval", +# ] seqio.MixtureRegistry.add( "d4_train", From 79c397aa359d35e83bcba78f6a4cc573b43846d7 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 4 Jan 2022 21:11:27 +0100 Subject: [PATCH 06/16] Woops --- setup.py | 2 +- t0/seqio_tasks/tasks.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/setup.py b/setup.py index aaecb92..4bbf0cd 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ "protobuf", "scikit-learn" ], - extra_require={ + extras_require={ "seqio_tasks": [ "seqio", "t5", diff --git a/t0/seqio_tasks/tasks.py b/t0/seqio_tasks/tasks.py index 7e89315..a254872 100644 --- a/t0/seqio_tasks/tasks.py +++ b/t0/seqio_tasks/tasks.py @@ -155,8 +155,6 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map with open(experiment_path) as exp_file: reader = csv.DictReader(exp_file) for row in reader: - if row["skip"]: - continue if row["subset"] == "": row["subset"] = None # to match promptsource.Template object dataset_subset = (row["HF_name"], row["subset"]) From 384d910e8a552ca2d81aa3bcf76cdd6d144c0ff4 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 4 Jan 2022 21:17:54 +0100 Subject: [PATCH 07/16] Woops --- t0/seqio_tasks/tasks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/t0/seqio_tasks/tasks.py b/t0/seqio_tasks/tasks.py index a254872..ab46180 100644 --- a/t0/seqio_tasks/tasks.py +++ b/t0/seqio_tasks/tasks.py @@ -171,7 +171,8 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map assert row["task_by_convention"] == "bias_and_fairness" d4_eval[do_eval_source].append(dataset_subset) gsheet[dataset_subset] = row -all_datasets = sum(d4_train.values()) + sum(d4_train.values()) + +all_datasets = [*d4_train.values(), *d4_eval.values()] all_templates = promptsource.templates.TemplateCollection() all_templates.remove("anli") # Need to special-case ANLI due to weird split conventions From b284d10e9db0c8ca0d40d388535e1614e6a607d1 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Tue, 4 Jan 2022 21:41:08 +0100 Subject: [PATCH 08/16] Woops --- t0/seqio_tasks/tasks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/t0/seqio_tasks/tasks.py b/t0/seqio_tasks/tasks.py index ab46180..77d1b17 100644 --- a/t0/seqio_tasks/tasks.py +++ b/t0/seqio_tasks/tasks.py @@ -172,7 +172,7 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map d4_eval[do_eval_source].append(dataset_subset) gsheet[dataset_subset] = row -all_datasets = [*d4_train.values(), *d4_eval.values()] +all_datasets = sum(d4_train.values(), []) + 
sum(d4_eval.values(), [])
 
 all_templates = promptsource.templates.TemplateCollection()
 all_templates.remove("anli")  # Need to special-case ANLI due to weird split conventions
 
@@ -213,7 +213,7 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map
         all_original_tasks.append(task_name)
 
     # Check that the dataset_subset_tuple is in d4_train
-    for key, dataset_subset_tuples in d4_train:
+    for key, dataset_subset_tuples in d4_train.items():
         if (dataset_name, subset_name) in dataset_subset_tuples:
             d4_train_mixture[key].append(task_name)
             mixture_cap[task_name] = cap

From 91d04f9470858a3e72b55f46c58c7541ea0a7249 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Tue, 4 Jan 2022 23:26:40 +0100
Subject: [PATCH 09/16] Remove requirements.txt

---
 requirements.txt | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index c593c47..e69de29 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +0,0 @@
-git+git://github.com/bigscience-workshop/promptsource@v0.1.0
-accelerate
-transformers
-torch
-seqio

From 089211063a1ed39d2ca859adb6d4858d312c6e1e Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 6 Jan 2022 17:29:18 +0100
Subject: [PATCH 10/16] First set of PR comments

---
 setup.py                         |  4 +-
 t0/datasets.csv                  | 67 +++++++++++++++++++++++++++++
 t0/seqio_tasks/experiment_D4.csv | 73 --------------------------------
 t0/seqio_tasks/tasks.py          |  2 +-
 4 files changed, 70 insertions(+), 76 deletions(-)
 create mode 100644 t0/datasets.csv
 delete mode 100644 t0/seqio_tasks/experiment_D4.csv

diff --git a/setup.py b/setup.py
index 4bbf0cd..fafe8a7 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@
     license="Apache Software License 2.0",
     long_description=readme,
     install_requires=[
-        "promptsource",
+        "git+git://github.com/bigscience-workshop/promptsource@v0.1.0",
         "accelerate",
         "transformers",
         "torch",
@@ -43,7 +43,7 @@
     },
     package_data={
         "": [
-            "seqio_tasks/experiment_D4.csv",
+            "seqio_tasks/datasets.csv",
         ]
     }
 ) \ No newline at end of file
diff --git a/t0/datasets.csv b/t0/datasets.csv
new file mode 100644
index 0000000..10b8c53
--- /dev/null
+++ b/t0/datasets.csv
@@ -0,0 +1,67 @@
+HF_name,subset,task_by_convention,do_train,do_eval,train_size
+crows_pairs,,bias_and_fairness,,BIAS_FAIRNESS,
+jigsaw_toxicity_pred,,bias_and_fairness,,BIAS_FAIRNESS,
+super_glue,axg,bias_and_fairness,,BIAS_FAIRNESS,
+wino_bias,type1_anti,bias_and_fairness,,BIAS_FAIRNESS,
+wino_bias,type2_anti,bias_and_fairness,,BIAS_FAIRNESS,
+wino_bias,type1_pro,bias_and_fairness,,BIAS_FAIRNESS,
+wino_bias,type2_pro,bias_and_fairness,,BIAS_FAIRNESS,
+super_glue,wsc.fixed,coreference,SGLUE,BASE,554
+winogrande,winogrande_xl,coreference,,BASE,40398
+super_glue,cb,NLI,,BASE,250
+super_glue,rte,NLI,,BASE,2490
+anli,,NLI,,BASE,162865
+glue,mrpc,paraphrase,BASE,,3668
+glue,qqp,paraphrase,BASE,,363846
+paws,labeled_final,paraphrase,BASE,,49401
+ai2_arc,ARC-Challenge,QA_closed_book,GPT_EVAL,,1119
+ai2_arc,ARC-Easy,QA_closed_book,GPT_EVAL,,2251
+kilt_tasks,hotpotqa,QA_closed_book,BASE,,88869
+trivia_qa,unfiltered,QA_closed_book,GPT_EVAL,,87622
+web_questions,,QA_closed_book,GPT_EVAL,,3778
+wiki_qa,,QA_closed_book,BASE,,20360
+adversarial_qa,dbidaf,QA_extractive,BASE,,10000
+adversarial_qa,dbert,QA_extractive,BASE,,10000
+adversarial_qa,droberta,QA_extractive,BASE,,10000
+duorc,SelfRC,QA_extractive,BASE,,60721
+duorc,ParaphraseRC,QA_extractive,BASE,,69524
+ropes,,QA_extractive,BASE,,10924
+squad_v2,,QA_extractive,GPT_EVAL,,130319 +super_glue,record,QA_extractive,SGLUE,,100730 +quoref,,QA_extractive,BASE,,19399 +cos_e,v1.11,QA_multiple_choice,BASE,,9741 +cosmos_qa,,QA_multiple_choice,BASE,,25262 +dream,,QA_multiple_choice,BASE,,6116 +openbookqa,main,QA_multiple_choice,GPT_EVAL,,4957 +qasc,,QA_multiple_choice,BASE,,8134 +quail,,QA_multiple_choice,BASE,,10246 +quarel,,QA_multiple_choice,BASE,,1941 +quartz,,QA_multiple_choice,BASE,,2696 +race,high,QA_multiple_choice,GPT_EVAL,,62445 +race,middle,QA_multiple_choice,GPT_EVAL,,25421 +sciq,,QA_multiple_choice,BASE,,11679 +social_i_qa,,QA_multiple_choice,BASE,,33410 +super_glue,boolq,QA_multiple_choice,SGLUE,,9427 +super_glue,copa,QA_multiple_choice,SGLUE,BASE,400 +super_glue,multirc,QA_multiple_choice,SGLUE,,27243 +wiki_hop,original,QA_multiple_choice,BASE,,43738 +wiqa,,QA_multiple_choice,BASE,,29808 +piqa,,QA_multiple_choice,GPT_EVAL,,16113 +amazon_polarity,,sentiment,BASE,,3600000 +app_reviews,,sentiment,BASE,,288065 +imdb,,sentiment,BASE,,25000 +rotten_tomatoes,,sentiment,BASE,,8530 +yelp_review_full,,sentiment,BASE,,650000 +story_cloze,2016,story_completion,,BASE, +hellaswag,,story_completion,GPT_EVAL,BASE,39905 +common_gen,,structure_to_text,BASE,,67389 +wiki_bio,,structure_to_text,BASE,,582659 +cnn_dailymail,3.0.0,summarization,BASE,,287113 +gigaword,,summarization,BASE,,3803957 +multi_news,,summarization,BASE,,44972 +samsum,,summarization,BASE,,14732 +xsum,,summarization,BASE,,204045 +ag_news,,topic_classification,BASE,,120000 +dbpedia_14,,topic_classification,BASE,,560000 +trec,,topic_classification,BASE,,5452 +super_glue,wic,word_sense_disambiguation,SGLUE,BASE,5428 diff --git a/t0/seqio_tasks/experiment_D4.csv b/t0/seqio_tasks/experiment_D4.csv deleted file mode 100644 index 61b6bc8..0000000 --- a/t0/seqio_tasks/experiment_D4.csv +++ /dev/null @@ -1,73 +0,0 @@ -HF_name,subset,task_by_convention,format,comment,seed_paper,do_train,do_eval,train_size,adjusted_train_size,metric,multiple correct answer,Paper link,non_linguistic_knowledge,Imported Task Name,imported category,input_length,_human_skill,Domain,Reference,done -crows_pairs,,bias_and_fairness,,test set only; authors themselves acknowledge some problems,Eval WG,,BIAS_FAIRNESS,,,,,,,,,,,,,TRUE -jigsaw_toxicity_pred,,bias_and_fairness,,current https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data ; want https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification,Eval WG,,BIAS_FAIRNESS,,,,,,,,,,,,,TRUE -super_glue,axg,bias_and_fairness,cls,test set only,Eval WG,,BIAS_FAIRNESS,,,,,,,,,,,,,TRUE -wino_bias,type1_anti,bias_and_fairness,cls,,Eval WG,,BIAS_FAIRNESS,,,,,,,,,,,,,TRUE -wino_bias,type2_anti,bias_and_fairness,cls,,Eval WG,,BIAS_FAIRNESS,,,,,,,,,,,,,TRUE -wino_bias,type1_pro,bias_and_fairness,cls,,Eval WG,,BIAS_FAIRNESS,,,,,,,,,,,,,TRUE -wino_bias,type2_pro,bias_and_fairness,cls,,Eval WG,,BIAS_FAIRNESS,,,,,,,,,,,,,TRUE -super_glue,wsc.fixed,coreference,cls,,,SGLUE,BASE,554,0,accuracy,,https://arxiv.org/pdf/1905.00537.pdf,,superglue-wsc,cls/other,single sentence,knowledge-? reading comprehension,,Levesque et al. 2012,TRUE -winogrande,winogrande_xl,coreference,ext,,GPT,,BASE,40398,0,accuracy,,https://arxiv.org/pdf/1907.10641.pdf,,WinoGrande,qa/multiple-choice qa,,knowledge-? reading comprehension,,Sakaguchi et al. 
-super_glue,cb,NLI,cls,"""for multi-class F1 we compute the unweighted average of the F1 per class.""",,,BASE,250,0,mean_multiclass_f1;accuracy,,https://semanticsarchive.net/Archive/Tg3ZGI2M/Marneffe.pdf,,superglue-cb,cls/nli,sentence pair,knowledge-neutral inference,,de Marneffe et al. 2019,TRUE
-super_glue,rte,NLI,cls,,,,BASE,2490,0,accuracy,,https://arxiv.org/pdf/1905.00537.pdf,,superglue-rte,cls/nli,sentence pair,knowledge modest inference,,Dagan et al. 2005; Bar-Haim et al. 2006 Giampiccolo et al. 2007; Bentivogli et al. 2009,TRUE
-anli,,NLI,cls,"In addition to accuracy, paper also evaluates on range of relaxed/strict and matched/unmatched settings and reports F scores for different answers",,,BASE,162865,0,accuracy,,https://arxiv.org/abs/1910.14599,,anli,cls/nli,sentence pair,knowledge modest inference,,Nie et al. 2020,TRUE
-glue,mrpc,paraphrase,cls,,,BASE,,3668,3668,accuracy;f1_score,,https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/I05-50025B15D.pdf,,glue-mrpc,cls/paraphrase,,paraphrase,,Dolan and Brockett 2005,TRUE
-glue,qqp,paraphrase,cls,,,BASE,,363846,363846,accuracy;f1_score,,https://aclanthology.org/I05-5002.pdf,,glue-qqp,cls/paraphrase,,,,(link),TRUE
-paws,labeled_final,paraphrase,cls,,,BASE,,49401,49401,,,,,paws,cls/paraphrase,,,,Zhang et al. 2019,TRUE
-ai2_arc,ARC-Challenge,QA_closed_book,cls,,GPT,GPT_EVAL,,1119,0,"accuracy_with_tie : For each question, a system receives 1 point if it
-chooses the correct answer and 1/k if it reports a k-way tie
-(i.e., chooses multiple answers) that includes the correct answer.",,https://arxiv.org/pdf/1803.05457.pdf,mid-intensive,ARC (chal.),qa/multiple-choice qa,,nontrivial_comprehension,,Clark et al. 2018,TRUE
-ai2_arc,ARC-Easy,QA_closed_book,cls,,GPT,GPT_EVAL,,2251,0,"accuracy_with_tie: For each question, a system receives 1 point if it
-chooses the correct answer and 1/k if it reports a k-way tie
-(i.e., chooses multiple answers) that includes the correct answer.",,https://arxiv.org/pdf/1803.05457.pdf,mid-intensive,ARC (easy),Multiple choice,,,,,TRUE
-kilt_tasks,hotpotqa,QA_closed_book,gen,recast as closed-book due to input length,self,BASE,,88869,88869,,,,,kilt hotpotqa,qa/closed-book qa,,encyclopedia; multi-hop QA,,Yang et al. 2018,TRUE
-trivia_qa,unfiltered,QA_closed_book,gen,,GPT,GPT_EVAL,,87622,0,exact_match;f1_over_words => wikipedia aliases are considered valid answers,TRUE,https://arxiv.org/pdf/1705.03551.pdf,intensive,Trivia QA,,,,,,TRUE
-web_questions,,QA_closed_book,gen,"""supposed to be answerable by Freebase"" Check corpora deduplication with freebaseqa.",GPT,GPT_EVAL,,3778,0,accuracy : they don't mention how they normalize across multiple correct answers,TRUE,https://aclanthology.org/D13-1160.pdf,intensive,web questions,qa/closed-book qa,,,,Berant et al. 2013,TRUE
-wiki_qa,,QA_closed_book,cls,,CrossFit,BASE,,20360,20360,,,https://aclanthology.org/D15-1237.pdf,,wiki qa,cls/other,,,,Yang et al. 2015,TRUE
-adversarial_qa,dbidaf,QA_extractive,ext,,,BASE,,10000,10000,,,https://aclanthology.org/2020.tacl-1.43/,,adversarialqa,qa/machine reading comprehension,,,,Bartolo et al. 2020,TRUE
-adversarial_qa,dbert,QA_extractive,ext,,,BASE,,10000,10000,,,,,,,,,,,TRUE
-adversarial_qa,droberta,QA_extractive,ext,,,BASE,,10000,10000,,,,,,,,,,,TRUE
-duorc,SelfRC,QA_extractive,ext,,TaskEmbed;CrossFit,BASE,,60721,60721,,,https://duorc.github.io/,,DuoRC,qa/machine reading comprehension,,,Wikipedia/IMDB crowd,Saha et al. 2018,TRUE
-duorc,ParaphraseRC,QA_extractive,ext,,TaskEmbed;CrossFit,BASE,,69524,69524,,,https://arxiv.org/pdf/1804.07927.pdf,,DuoRC,paraphrased QA,,,,Saha et al. 2018,TRUE
-ropes,,QA_extractive,ext,,,BASE,,10924,10924,,,,modest,ropes,Extractive QA,,cause_and_effect;nontrivial_comprehension,,Lin et al. 2019,TRUE
-squad_v2,,QA_extractive,ext,,GPT,GPT_EVAL,,130319,0,exact_match;f1_score,TRUE,https://arxiv.org/pdf/1806.03822.pdf,,SQuAD 2.0,Extractive QA,,,,Rajpurkar et al. 2018,TRUE
-super_glue,record,QA_extractive,ext,,,SGLUE,,100730,0,max_token_level_f1;exact_match,TRUE,https://arxiv.org/pdf/1810.12885.pdf,,superglue-record,qa/machine reading comprehension,,knowledge-? reading comprehension,,Zhang et al. 2018,TRUE
-quoref,,QA_extractive,ext,,,BASE,,19399,19399,,,https://aclanthology.org/D19-1606.pdf,,Quoref,Extractive QA,,,,Dasigi et al. 2019,TRUE
-cos_e,v1.11,QA_multiple_choice,cls,"same as commonsense_qa but with (poorly sourced) human explanations; questionable ""commonsense"" lots of world knowledge",Vania,BASE,,9741,9741,,,,,cos e,other/generate explanation,,,,Rajani et al. 2019,TRUE
-cosmos_qa,,QA_multiple_choice,cls,,,BASE,,25262,25262,,,,,cosmos qa,qa/multiple-choice qa,,,,Huang et al. 2019,TRUE
-dream,,QA_multiple_choice,cls,,,BASE,,6116,6116,,,,,dream,qa/multiple-choice qa,,,,Sun et al. 2019,TRUE
-openbookqa,main,QA_multiple_choice,cls,interesting combo of pragmatics + scientific reasoning,GPT,GPT_EVAL,,4957,0,"accuracy_with_tie : For each question, a system receives 1 point if it
-chooses the correct answer and 1/k if it reports a k-way tie
-(i.e., chooses multiple answers) that includes the correct answer.",,https://aclanthology.org/D18-1260.pdf,modest,openbookqa,qa/multiple-choice qa,,pragmatics,,Mihaylov et al. 2018,TRUE
-qasc,,QA_multiple_choice,cls,,,BASE,,8134,8134,,,,given?,qasc,qa/multiple-choice qa,,,,Khot et al. 2020,TRUE
-quail,,QA_multiple_choice,cls,,,BASE,,10246,10246,,,,,quail,qa/multiple-choice qa,,,,Rogers et al. 2020,TRUE
-quarel,,QA_multiple_choice,cls,,CrossFit,BASE,,1941,1941,,,,,quarel,qa/multiple-choice qa,,logical form,,Tafjord et al. 2019a,TRUE
-quartz,,QA_multiple_choice,cls,,,BASE,,2696,2696,,,https://aclanthology.org/D19-1608.pdf,given?,quartz-with knowledge,qa/multiple-choice qa,,,,Tafjord et al. 2019b,TRUE
-race,high,QA_multiple_choice,cls,GPT-hard,GPT,GPT_EVAL,,62445,0,accuracy,,https://arxiv.org/pdf/1704.04683.pdff,neutral,race-high,qa/multiple-choice qa,,knowledge-neutral reading comprehension,,Lai et al. 2017,TRUE
-race,middle,QA_multiple_choice,cls,"revisit: define as comprehension, paragraph level?",GPT,GPT_EVAL,,25421,0,accuracy,,https://arxiv.org/pdf/1704.04683.pdf,neutral,race-middle,qa/multiple-choice qa,,knowledge-neutral reading comprehension,,Lai et al. 2017,TRUE
-sciq,,QA_multiple_choice,cls,,,BASE,,11679,11679,,,,,sciq,qa/multiple-choice qa,,,,Welbl et al. 2017,TRUE
-social_i_qa,,QA_multiple_choice,cls,metric differ by prompt: 4-way classification cast as binary ,,BASE,,33410,33410,accuracy,,https://arxiv.org/pdf/1904.09728.pdf,,SIQA,qa/multiple-choice qa,,cultural knowledge,,Sap et al. 2019,TRUE
-super_glue,boolq,QA_multiple_choice,cls,,,SGLUE,,9427,0,accuracy,,https://arxiv.org/pdf/1905.10044.pdf,neutral?,superglue-boolq,,,knowledge-? reading comprehension,,,TRUE
-super_glue,copa,QA_multiple_choice,cls,,,SGLUE,BASE,400,0,accuracy,,http://commonsensereasoning.org/2011/papers/Roemmele.pdf,modest,superglue-copa,qa/multiple-choice qa,,causal cognition,,Gordon et al. 2012,TRUE
-super_glue,multirc,QA_multiple_choice,cls,F1 over all answer options. See paper p. 259 for definition,,SGLUE,,27243,0,f1_over_all_options;exact_match,,https://aclanthology.org/N18-1023.pdf,neutral?,superglue-multirc,qa/multiple-choice qa,,knowledge-? reading comprehension,,Khashabi et al. 2018,TRUE
-wiki_hop,original,QA_multiple_choice,cls,,,BASE,,43738,43738,,,https://transacl.org/ojs/index.php/tacl/article/viewFile/1325/299,,WikiHop (Welbl et al. 2018),multi-hop QA,,,Wikipedia KB,,TRUE
-wiqa,,QA_multiple_choice,cls,,,BASE,,29808,29808,,,,,wiqa,qa/multiple-choice qa,,cause_and_effect,,Tandon et al. 2019,TRUE
-piqa,,QA_multiple_choice,cls,revisit: not just other,GPT,GPT_EVAL,,16113,0,accuracy,,https://arxiv.org/pdf/1911.11641.pdf,,PIQA,Multiple choice,,physical_cognition,,Bisk et al. 2020,TRUE
-amazon_polarity,,sentiment,cls,,,BASE,,3600000,500000,,,https://cs.stanford.edu/people/jure/pubs/reviews-recsys13.pdf,,amazon polarity,cls/sentiment analysis,,,,McAuley and Leskovec 2013,TRUE
-app_reviews,,sentiment,cls,,,BASE,,288065,288065,,,,,app reviews,other/regression,,,,Missing,TRUE
-imdb,,sentiment,cls,,,BASE,,25000,25000,,,,,imdb,cls/sentiment analysis,,no dev set,,Maas et al. 2011,TRUE
-rotten_tomatoes,,sentiment,cls,,,BASE,,8530,8530,,,,,rotten tomatoes,cls/sentiment analysis,,,,Pang and Lee 2005,TRUE
-yelp_review_full,,sentiment,cls,no dev set,,BASE,,650000,500000,,,,,yelp review full,other/regression,,,,Zhang et al. 2015; (link),TRUE
-story_cloze,2016,story_completion,cls,todo: custom loading; swag like?,GPT,,BASE,,0,accuracy,,https://arxiv.org/pdf/1604.01696.pdf,,,,,,,,TRUE
-hellaswag,,story_completion,cls,,GPT,GPT_EVAL,BASE,39905,0,accuracy,,https://arxiv.org/pdf/1905.07830.pdf,,hellaswag,qa/multiple-choice qa,,,,Zellers et al. 2019,TRUE
-common_gen,,structure_to_text,gen,,,BASE,,67389,67389,,,,,common gen,other,,,,Lin et al. 2020b,TRUE
-wiki_bio,,structure_to_text,gen,,,BASE,,582659,500000,,,,,wiki bio,cg/other,,,,Lebret et al. 2016,TRUE
-cnn_dailymail,3.0.0,summarization,gen,,,BASE,,287113,287113,,,,,,,,,,,TRUE
-gigaword,,summarization,gen,,,BASE,,3803957,500000,,,,,gigaword,cg/summarization,,,,Napoles et al. 2012,TRUE
-multi_news,,summarization,gen,,CrossFit,BASE,,44972,44972,,,,,multi news,cg/summarization,,,,Fabbri et al. 2019,TRUE
-samsum,,summarization,gen,,CrossFit,BASE,,14732,14732,,,,,samsum,cg/summarization,,,,Gliwa et al. 2019,TRUE
-xsum,,summarization,gen,,,BASE,,204045,204045,rouge,,https://arxiv.org/pdf/1808.08745.pdf,,xsum,cg/summarization,,,,Narayan et al. 2018,TRUE
-ag_news,,topic_classification,cls,,,BASE,,120000,120000,,,http://groups.di.unipi.it/~gulli/AG_corpus_of_news_articles.html,,ag news,cls/topic,,,,Gulli (link),TRUE
-dbpedia_14,,topic_classification,cls,,,BASE,,560000,500000,,,https://svn.aksw.org/papers/2013/SWJ_DBpedia/public.pdf,,dbpedia 14,cls/topic,,,,Lehmann et al. 2015,TRUE
-trec,,topic_classification,cls,,,BASE,,5452,5452,,,https://trec.nist.gov/data/qa.html,,trec,cls/other,,,,Li and Roth 2002; Hovy et al. 2001,TRUE
-super_glue,wic,word_sense_disambiguation,cls,,,SGLUE,BASE,5428,0,accuracy,,https://arxiv.org/pdf/1808.09121.pdf,,superglue-wic,cls/other,,lexical_knowledge,,Pilehvar and Camacho-Collados 2019,TRUE
diff --git a/t0/seqio_tasks/tasks.py b/t0/seqio_tasks/tasks.py
index 77d1b17..1fe6635 100644
--- a/t0/seqio_tasks/tasks.py
+++ b/t0/seqio_tasks/tasks.py
@@ -151,7 +151,7 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map
 }

 gsheet: Dict[datatset_subset_tuple, Dict] = {}
-experiment_path = pkg_resources.resource_filename(__name__, "experiment_D4.csv")
+experiment_path = pkg_resources.resource_filename(__name__, "../datasets.csv")
 with open(experiment_path) as exp_file:
     reader = csv.DictReader(exp_file)
     for row in reader:

From 2b6d52331bb607dc877a95d50b284c0af16191a6 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 6 Jan 2022 17:33:30 +0100
Subject: [PATCH 11/16] Add back d4_train_score_eval

---
 t0/seqio_tasks/tasks.py | 143 +++++++++++++++------------------------
 1 file changed, 53 insertions(+), 90 deletions(-)

diff --git a/t0/seqio_tasks/tasks.py b/t0/seqio_tasks/tasks.py
index 1fe6635..2daf44b 100644
--- a/t0/seqio_tasks/tasks.py
+++ b/t0/seqio_tasks/tasks.py
@@ -274,26 +274,26 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map
     "gigaword_summarize_",
 ]

-# # Tasks that failed caching (won't try to fix them for now) - remove when we are done
-# D4_TRAIN_SCORE_EVAL_TASK_BLACKLIST = [
-#     "amazon_polarity_Is_this_product_review_positive_score_eval",
-#     "amazon_polarity_Is_this_review_negative_score_eval",
-#     "amazon_polarity_Is_this_review_score_eval",
-#     "amazon_polarity_User_recommend_this_product_score_eval",
-#     "amazon_polarity_convey_negative_or_positive_sentiment_score_eval",
-#     "amazon_polarity_flattering_or_not_score_eval",
-#     "amazon_polarity_negative_or_positive_tone_score_eval",
-#     "amazon_polarity_user_satisfied_score_eval",
-#     "amazon_polarity_would_you_buy_score_eval",
-#     "dbpedia_14_given_a_choice_of_categories__score_eval",
-#     "dbpedia_14_given_list_what_category_does_the_paragraph_belong_to_score_eval",
-#     "dbpedia_14_pick_one_category_for_the_following_text_score_eval",
-#     "wiki_hop_original_choose_best_object_affirmative_1_score_eval",
-#     "wiki_hop_original_choose_best_object_affirmative_2_score_eval",
-#     "wiki_hop_original_choose_best_object_affirmative_3_score_eval",
-#     "wiki_hop_original_choose_best_object_interrogative_1_score_eval",
-#     "wiki_hop_original_choose_best_object_interrogative_2_score_eval",
-# ]
+# Tasks that failed caching (won't try to fix them for now) - remove when we are done
+D4_TRAIN_SCORE_EVAL_TASK_BLACKLIST = [
+    "amazon_polarity_Is_this_product_review_positive_score_eval",
+    "amazon_polarity_Is_this_review_negative_score_eval",
+    "amazon_polarity_Is_this_review_score_eval",
+    "amazon_polarity_User_recommend_this_product_score_eval",
+    "amazon_polarity_convey_negative_or_positive_sentiment_score_eval",
+    "amazon_polarity_flattering_or_not_score_eval",
+    "amazon_polarity_negative_or_positive_tone_score_eval",
+    "amazon_polarity_user_satisfied_score_eval",
+    "amazon_polarity_would_you_buy_score_eval",
+    "dbpedia_14_given_a_choice_of_categories__score_eval",
+    "dbpedia_14_given_list_what_category_does_the_paragraph_belong_to_score_eval",
+    "dbpedia_14_pick_one_category_for_the_following_text_score_eval",
+    "wiki_hop_original_choose_best_object_affirmative_1_score_eval",
+    "wiki_hop_original_choose_best_object_affirmative_2_score_eval",
+    "wiki_hop_original_choose_best_object_affirmative_3_score_eval",
+    "wiki_hop_original_choose_best_object_interrogative_1_score_eval",
+    "wiki_hop_original_choose_best_object_interrogative_2_score_eval",
+]

 seqio.MixtureRegistry.add(
     "d4_train",
     [task for task in d4_train_mixture["BASE"] if task not in TASK_BLACKLIST],
     default_rate=lambda t: mixture_cap[t.name],
 )

-# seqio.MixtureRegistry.add(
-#     "gpt_train",
-#     [task for task in gpt_train_mixture if task not in TASK_BLACKLIST],
-#     default_rate=lambda t: mixture_cap[t.name],
-# )
-#
-# seqio.MixtureRegistry.add(
-#     "sglue_train",
-#     [task for task in sglue_train_mixture if task not in TASK_BLACKLIST],
-#     default_rate=lambda t: mixture_cap[t.name],
-# )
-
 seqio.MixtureRegistry.add(
     "d4_gpt_eval_train",
     [task for task in d4_train_mixture["BASE"] + d4_train_mixture["GPT_EVAL"] if task not in TASK_BLACKLIST],
     default_rate=lambda t: mixture_cap[t.name],
 )

 seqio.MixtureRegistry.add(
     "d4_gpt_sglue_train",
     [task for task in d4_train_mixture["BASE"] + d4_train_mixture["GPT_EVAL"] + d4_train_mixture["SGLUE"] if task not in TASK_BLACKLIST],
     default_rate=lambda t: mixture_cap[t.name],
 )

-# seqio.MixtureRegistry.add(
-#     "d4_eval",
-#     [task for task in d4_eval_mixture if task not in TASK_BLACKLIST],
-#     default_rate=functools.partial(seqio.mixing_rate_num_examples, maximum=500_000),
-# )  # eval mixture does not need to be capped
-
-
 seqio.MixtureRegistry.add(
-    "d4_score_eval",
+    "d4_eval_score_eval",
     [
         task
         for task in seqio.TaskRegistry.names()
         if task.endswith("_score_eval")
         and task.split("_score_eval")[0] in d4_eval_mixture
         and task.split("_score_eval")[0] not in TASK_BLACKLIST
     ],
     default_rate=functools.partial(seqio.mixing_rate_num_examples, maximum=500_000),
 )

-# # Train tasks we don't care about evaluating on
-# D4_TRAIN_SKIP_EVAL = [
-#     "paws_labeled_final",
-#     "adversarial_qa_dbidaf",
-#     "adversarial_qa_dbert",
-#     "duorc_ParaphraseRC",
-#     "dream",
-#     "amazon_polarity",
-#     "app_reviews",
-#     "imdb",
-#     "wiki_bio",
-#     "gigaword",
-#     "multi_news",
-#     "samsum",
-#     "dbpedia_14",
-#     "trec",
-# ]
-
-# seqio.MixtureRegistry.add(
-#     "d4_train_eval",
-#     [
-#         task
-#         for task in d4_train_mixture
-#         if task not in TASK_BLACKLIST
-#         and not any([skip in task for skip in D4_TRAIN_SKIP_EVAL])
-#         and task in all_original_tasks
-#     ],
-#     default_rate=lambda t: mixture_cap[t.name],
-# )
-#
-# seqio.MixtureRegistry.add(
-#     "d4_train_score_eval",
-#     [
-#         task
-#         for task in seqio.TaskRegistry.names()
-#         if task.endswith("_score_eval")
-#         and task.split("_score_eval")[0] in d4_train_mixture
-#         and task.split("_score_eval")[0] not in TASK_BLACKLIST
-#         and task not in D4_TRAIN_SCORE_EVAL_TASK_BLACKLIST
-#         and not any([skip in task for skip in D4_TRAIN_SKIP_EVAL])
-#         and task.split("_score_eval")[0] in all_original_tasks
-#     ],
-#     default_rate=functools.partial(seqio.mixing_rate_num_examples, maximum=500_000),
-# )
+# Train tasks we don't care about evaluating on
+D4_TRAIN_SKIP_EVAL = [
+    "paws_labeled_final",
+    "adversarial_qa_dbidaf",
+    "adversarial_qa_dbert",
+    "duorc_ParaphraseRC",
+    "dream",
+    "amazon_polarity",
+    "app_reviews",
+    "imdb",
+    "wiki_bio",
+    "gigaword",
+    "multi_news",
+    "samsum",
+    "dbpedia_14",
+    "trec",
+]
+
+seqio.MixtureRegistry.add(
+    "d4_train_score_eval",
+    [
+        task
+        for task in seqio.TaskRegistry.names()
+        if task.endswith("_score_eval")
+        and task.split("_score_eval")[0] in d4_train_mixture["BASE"]
+        and task.split("_score_eval")[0] not in TASK_BLACKLIST
+        and task not in D4_TRAIN_SCORE_EVAL_TASK_BLACKLIST
+        and not any([skip in task for skip in D4_TRAIN_SKIP_EVAL])
+        and task.split("_score_eval")[0] in all_original_tasks
+    ],
+    default_rate=functools.partial(seqio.mixing_rate_num_examples, maximum=500_000),
+)

 seqio.MixtureRegistry.add(
     "d4_train_one_og_prompt",
@@ -401,12 +370,6 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map
     default_rate=lambda t: mixture_cap[t.name],
 )

-# seqio.MixtureRegistry.add(
-#     "bias_fairness_eval",
-#     bias_fairness_eval_mixture,
-#     default_rate=functools.partial(seqio.mixing_rate_num_examples, maximum=500_000),
-# )
-
 seqio.MixtureRegistry.add(
     "bias_fairness_eval_score_eval",
     [

From 1cfdb26f6a56d9583d4107e76ba171185ebca8aa Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 6 Jan 2022 19:27:53 +0100
Subject: [PATCH 12/16] Remove template.py

---
 evaluation/README.md        |  9 +++-
 evaluation/template_list.py | 86 ------------------------------------
 examples/README.md          |  9 +++-
 3 files changed, 16 insertions(+), 88 deletions(-)
 delete mode 100644 evaluation/template_list.py

diff --git a/evaluation/README.md b/evaluation/README.md
index f719faf..70ea219 100644
--- a/evaluation/README.md
+++ b/evaluation/README.md
@@ -19,7 +19,14 @@ python run_eval.py \
     --output_dir ./debug
 ```

-You are expected to modify the `dataset_name`, the `dataset_config_name` and the `template_name`. The list of templates per data(sub)set is available in [this file](template_list.py).
+You are expected to modify the `dataset_name`, the `dataset_config_name` and the `template_name`. The templates used for T0 evaluation can be listed as follows:
+```python
+import seqio
+import t0.seqio_tasks
+
+for task in seqio.MixtureRegistry.get("d4_eval_score_eval").tasks:
+    print(task.name)
+```

 If you evaluate on ANLI (R1, R2 or R3), the `dataset_config_name` should be `dev_r1`, `dev_r2` or `dev_r3`.
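The same template names can also be read directly from `promptsource`, without registering any seqio tasks. A minimal sketch, assuming the pinned `promptsource` v0.1.0 is installed (the `DatasetTemplates` API is the same one the READMEs in this series point users to):

```python
from promptsource.templates import DatasetTemplates

# Enumerate the prompt templates for one (dataset, subset) pair,
# e.g. the SuperGLUE RTE subset used in T0 evaluation.
rte_templates = DatasetTemplates("super_glue", "rte")
for template_name in rte_templates.all_template_names:
    print(template_name)
```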
diff --git a/evaluation/template_list.py b/evaluation/template_list.py
deleted file mode 100644
index 224665e..0000000
--- a/evaluation/template_list.py
+++ /dev/null
@@ -1,86 +0,0 @@
-template_list = {
-    ("super_glue", "rte"): [
-        "MNLI crowdsource",
-        "guaranteed true",
-        "can we infer",
-        "GPT-3 style",
-        "does this impl",
-        "should assume",
-        "does it follow that",
-        "based on the previous passage",
-        "justified in saying",
-        "must be true",
-    ],
-    ("super_glue", "cb"): [
-        "can we infer",
-        "based on the previous passage",
-        "claim true/false/inconclusive",
-        "does it follow that",
-        "justified in saying",
-        "always/sometimes/never",
-        "GPT-3 style",
-        "consider always/sometimes/never",
-        "guaranteed true",
-        "must be true",
-        "guaranteed/possible/impossible",
-        "does this imply",
-        "MNLI crowdsource",
-        "should assume",
-        "take the following as truth",
-    ],
-    ("anli", None): [
-        "MNLI crowdsource",
-        "should assume",
-        "does it follow that",
-        "GPT-3 style",
-        "based on the previous passage",
-        "justified in saying",
-        "take the following as truth",
-        "must be true",
-        "can we infer",
-        "guaranteed/possible/impossible",
-        "always/sometimes/never",
-        "does this imply",
-        "consider always/sometimes/never",
-        "claim true/false/inconclusive",
-        "guaranteed true",
-    ],
-    ("super_glue", "wsc.fixed"): [
-        "does the pronoun refer to",
-        "by p they mean",
-        "in other words",
-        "I think they mean",
-        "does p stand for",
-        "GPT-3 Style",
-        "replaced with",
-        "p is/are r",
-        "the pronoun refers to",
-        "Who or what is/are",
-    ],
-    ("winogrande", "winogrande_xl"): [
-        "does underscore refer to",
-        "stand for",
-        "underscore refer to",
-        "fill in the blank",
-        "Replace",
-    ],
-    ("story_cloze", "2016"): [
-        "Answer Given options",
-        "Choose Story Ending",
-        "Movie What Happens Next",
-        "Story Continuation and Options",
-        "Novel Correct Ending",
-    ],
-    ("super_glue", "wic"): [
-        "question-context-meaning-with-label",
-        "question-context-meaning",
-        "grammar_homework",
-        "affirmation_true_or_false",
-        "GPT-3-prompt",
-        "same_sense",
-        "question-context",
-        "GPT-3-prompt-with-label",
-        "polysemous",
-        "similar-sense",
-    ]
-}
diff --git a/examples/README.md b/examples/README.md
index 1826d3c..3ed3c5d 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -12,7 +12,14 @@ python single_task_fine_tune.py \

 The per epoch evaluation results will be saved as a CSV file in the `output_dir`. By default, it trains on the whole dataset. Optionally, you can pass `--num_shots` to train it on a random subset of examples.

-Like the zero-shot evaluation [script](../evaluation/run_eval.py), you are expected to provide `dataset_name`, `dataset_config_name`, and `template_name`. You can find the list of templates per dataset in [this file](../evaluation/template_list.py); these were the templates we used in the T0 paper (and installed if you did `pip install -r requirements`.) However, [`promptsource`](https://github.com/bigscience-workshop/promptsource) is being continuously updated, so if you don't intend to reproduce the exact results from our paper, you may want to install the latest `promptsource` and call, for example, `DatasetTemplates("super_glue", "rte").all_template_names` to access the new templates.
+Like the zero-shot evaluation [script](../evaluation/run_eval.py), you are expected to provide `dataset_name`, `dataset_config_name`, and `template_name`. You can find the list of templates per mixture using the following script; these were the templates we used in the T0 paper (and installed if you did `pip install -r requirements`.) However, [`promptsource`](https://github.com/bigscience-workshop/promptsource) is being continuously updated, so if you don't intend to reproduce the exact results from our paper, you may want to install the latest `promptsource` and call, for example, `DatasetTemplates("super_glue", "rte").all_template_names` to access the new templates.
+```python
+import seqio
+import promptsource.seqio_tasks
+
+for task in seqio.MixtureRegistry.get("d4_eval_score_eval").tasks:
+    print(task.name)
+```

 ## Distributed Training

From 09d3bd6f18681f40ffff078bd089febdc9ecef47 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 6 Jan 2022 19:35:33 +0100
Subject: [PATCH 13/16] Remove d4 occurrences

---
 evaluation/README.md    |  2 +-
 examples/README.md      |  2 +-
 t0/seqio_tasks/tasks.py | 62 ++++++++++++++++++++----------------------
 training/README.md      | 12 ++++----
 4 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/evaluation/README.md b/evaluation/README.md
index 70ea219..d8c8897 100644
--- a/evaluation/README.md
+++ b/evaluation/README.md
@@ -24,7 +24,7 @@ You are expected to modify the `dataset_name`, the `dataset_config_name` and the
 import seqio
 import t0.seqio_tasks

-for task in seqio.MixtureRegistry.get("d4_eval_score_eval").tasks:
+for task in seqio.MixtureRegistry.get("t0_eval_score_eval").tasks:
     print(task.name)
 ```

diff --git a/examples/README.md b/examples/README.md
index 3ed3c5d..4a4c219 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -17,7 +17,7 @@ Like the zero-shot evaluation [script](../evaluation/run_eval.py), you are expec
 import seqio
 import promptsource.seqio_tasks

-for task in seqio.MixtureRegistry.get("d4_eval_score_eval").tasks:
+for task in seqio.MixtureRegistry.get("t0_eval_score_eval").tasks:
     print(task.name)
 ```

diff --git a/t0/seqio_tasks/tasks.py b/t0/seqio_tasks/tasks.py
index 2daf44b..6c62bef 100644
--- a/t0/seqio_tasks/tasks.py
+++ b/t0/seqio_tasks/tasks.py
@@ -138,11 +138,11 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map

 datatset_subset_tuple = Tuple[str, Optional[str]]

-d4_eval: Dict[str, List[datatset_subset_tuple]] = {
+t0_eval: Dict[str, List[datatset_subset_tuple]] = {
     "BASE": [],
     "BIAS_FAIRNESS": []
 }
-d4_train: Dict[str, List[datatset_subset_tuple]] = {
+t0_train: Dict[str, List[datatset_subset_tuple]] = {
     "BASE": [],
     # GPT3 evaluation set
     "GPT_EVAL": [],
@@ -163,23 +163,23 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map
         # sanity checks
         if do_train_source == "SGLUE":
             assert dataset_subset[0] == "super_glue"
-        d4_train[do_train_source].append(dataset_subset)
+        t0_train[do_train_source].append(dataset_subset)
     if row["do_eval"] != "":
         do_eval_source = row["do_eval"]
         # sanity checks
         if do_eval_source == "BIAS_FAIRNESS":
             assert row["task_by_convention"] == "bias_and_fairness"
-        d4_eval[do_eval_source].append(dataset_subset)
+        t0_eval[do_eval_source].append(dataset_subset)
     gsheet[dataset_subset] = row

-all_datasets = sum(d4_train.values(), []) + sum(d4_eval.values(), [])
+all_datasets = sum(t0_train.values(), []) + sum(t0_eval.values(), [])

 all_templates = promptsource.templates.TemplateCollection()
 all_templates.remove("anli")  # Need to special-case ANLI due to weird split conventions

 # 3 stages of training/ablation: D4 -> GPT -> SuperGLUE
-d4_train_mixture: Dict[str,List[str]] = {key: [] for key in d4_train}
-d4_eval_mixture: Dict[str,List[str]] = {key: [] for key in d4_eval}
+t0_train_mixture: Dict[str,List[str]] = {key: [] for key in t0_train}
+t0_eval_mixture: Dict[str,List[str]] = {key: [] for key in t0_eval}
 mixture_cap: Dict[str, int] = {}
 single_original_task: Dict[Tuple[str, str], str] = {}
 all_original_tasks: List[str] = []
@@ -212,19 +212,19 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map
         if template.metadata.original_task:
             all_original_tasks.append(task_name)

-    # Check that the dataset_subset_tuple is in d4_train
-    for key, dataset_subset_tuples in d4_train.items():
+    # Check that the dataset_subset_tuple is in t0_train
+    for key, dataset_subset_tuples in t0_train.items():
         if (dataset_name, subset_name) in dataset_subset_tuples:
-            d4_train_mixture[key].append(task_name)
+            t0_train_mixture[key].append(task_name)
             mixture_cap[task_name] = cap

-    # Check that the dataset_subset_tuple is in d4_eval
-    if (dataset_name, subset_name) in d4_eval["BASE"]:
+    # Check that the dataset_subset_tuple is in t0_eval
+    if (dataset_name, subset_name) in t0_eval["BASE"]:
         if template.metadata.original_task:
-            d4_eval_mixture["BASE"].append(task_name)
+            t0_eval_mixture["BASE"].append(task_name)
         # TODO use template.metadata.answer_choices here for rank eval
-    if (dataset_name, subset_name) in d4_eval["BIAS_FAIRNESS"]:
-        d4_eval_mixture["BIAS_FAIRNESS"].append(task_name)
+    if (dataset_name, subset_name) in t0_eval["BIAS_FAIRNESS"]:
+        t0_eval_mixture["BIAS_FAIRNESS"].append(task_name)

 # Special case for ANLI, which has weirdly-named splits and rounds that should be subsets
 dataset_name, subset_name = ("anli", None)
@@ -241,7 +241,7 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map
         template = dataset[template_name]

         if template.metadata.original_task:
-            d4_eval_mixture["BASE"].append(task_name)  # TODO or add to ANLI special mixture
+            t0_eval_mixture["BASE"].append(task_name)  # TODO or add to ANLI special mixture
         # TODO use template.metadata.answer_choices here for rank eval

@@ -296,30 +296,30 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map
 ]

 seqio.MixtureRegistry.add(
-    "d4_train",
-    [task for task in d4_train_mixture["BASE"] if task not in TASK_BLACKLIST],
+    "t0_train",
+    [task for task in t0_train_mixture["BASE"] if task not in TASK_BLACKLIST],
     default_rate=lambda t: mixture_cap[t.name],
 )

 seqio.MixtureRegistry.add(
-    "d4_gpt_eval_train",
-    [task for task in d4_train_mixture["BASE"] + d4_train_mixture["GPT_EVAL"] if task not in TASK_BLACKLIST],
+    "t0+_train",
+    [task for task in t0_train_mixture["BASE"] + t0_train_mixture["GPT_EVAL"] if task not in TASK_BLACKLIST],
     default_rate=lambda t: mixture_cap[t.name],
 )

 seqio.MixtureRegistry.add(
-    "d4_gpt_sglue_train",
-    [task for task in d4_train_mixture["BASE"] + d4_train_mixture["GPT_EVAL"] + d4_train_mixture["SGLUE"] if task not in TASK_BLACKLIST],
+    "t0++_train",
+    [task for task in t0_train_mixture["BASE"] + t0_train_mixture["GPT_EVAL"] + t0_train_mixture["SGLUE"] if task not in TASK_BLACKLIST],
     default_rate=lambda t: mixture_cap[t.name],
 )

 seqio.MixtureRegistry.add(
-    "d4_eval_score_eval",
+    "t0_eval_score_eval",
     [
         task
         for task in seqio.TaskRegistry.names()
         if task.endswith("_score_eval")
-        and task.split("_score_eval")[0] in d4_eval_mixture["BASE"]
+        and task.split("_score_eval")[0] in t0_eval_mixture["BASE"]
         and task.split("_score_eval")[0] not in TASK_BLACKLIST
     ],
     default_rate=functools.partial(seqio.mixing_rate_num_examples, maximum=500_000),
@@ -344,12 +344,12 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map
 ]

 seqio.MixtureRegistry.add(
-    "d4_train_score_eval",
+    "t0_train_score_eval",
     [
         task
         for task in seqio.TaskRegistry.names()
         if task.endswith("_score_eval")
-        and task.split("_score_eval")[0] in d4_train_mixture["BASE"]
+        and task.split("_score_eval")[0] in t0_train_mixture["BASE"]
         and task.split("_score_eval")[0] not in TASK_BLACKLIST
         and task not in D4_TRAIN_SCORE_EVAL_TASK_BLACKLIST
         and not any([skip in task for skip in D4_TRAIN_SKIP_EVAL])
@@ -359,14 +359,14 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map
 )

 seqio.MixtureRegistry.add(
-    "d4_train_one_og_prompt",
-    [task for task in single_original_task.values() if task in d4_train_mixture["BASE"] and task not in TASK_BLACKLIST],
+    "t0_train_one_og_prompt",
+    [task for task in single_original_task.values() if task in t0_train_mixture["BASE"] and task not in TASK_BLACKLIST],
     default_rate=lambda t: mixture_cap[t.name],
 )

 seqio.MixtureRegistry.add(
-    "d4_train_all_og_prompts",
-    [task for task in all_original_tasks if task in d4_train_mixture["BASE"] and task not in TASK_BLACKLIST],
+    "t0_train_all_og_prompts",
+    [task for task in all_original_tasks if task in t0_train_mixture["BASE"] and task not in TASK_BLACKLIST],
     default_rate=lambda t: mixture_cap[t.name],
 )

@@ -375,7 +375,7 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map
     [
         task
         for task in seqio.TaskRegistry.names()
-        if task.endswith("_score_eval") and task.split("_score_eval")[0] in d4_eval_mixture["BIAS_FAIRNESS"]
+        if task.endswith("_score_eval") and task.split("_score_eval")[0] in t0_eval_mixture["BIAS_FAIRNESS"]
     ],
     default_rate=functools.partial(seqio.mixing_rate_num_examples, maximum=500_000),
 )
diff --git a/training/README.md b/training/README.md
index c43c501..46caaca 100644
--- a/training/README.md
+++ b/training/README.md
@@ -26,14 +26,14 @@ The full list of tasks in the mixture is obtained with the following code snippe
 import seqio
 import promptsource.seqio_tasks

-for task in seqio.MixtureRegistry.get("d4_gpt_sglue_train").tasks:
+for task in seqio.MixtureRegistry.get("t0++_train").tasks:
     print(task.name)
 ```

 You'll likely be interested in the following mixtures:
-- `d4_train`: training mixture for T0
-- `d4_gpt_train`: training mixture for T0+
-- `d4_gpt_sglue_train`: training mixture for T0++
+- `t0_train`: training mixture for T0
+- `t0+_train`: training mixture for T0+
+- `t0++_train`: training mixture for T0++

 For reproducibility, we have released an [already cached version of the data](https://huggingface.co/datasets/bigscience/P3), which means you don't need to cache the data yourself. The only exception is [Story Cloze](https://cs.rochester.edu/nlp/rocstories/), which requires filling a form to download the data. Please refer to the previous SeqIO command to cache the tasks related to Story Cloze once you have the dataset.
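Before launching a full training run, it can also help to materialize a handful of examples from one of these mixtures locally. A rough sketch (the sequence lengths below are illustrative assumptions, not values taken from this repository):

```python
import seqio
import t0.seqio_tasks  # noqa: F401 -- importing this module registers the tasks and mixtures

# Build a tf.data pipeline over the T0 training mixture and peek at one example.
dataset = seqio.get_mixture_or_task("t0_train").get_dataset(
    sequence_length={"inputs": 1024, "targets": 256},
    split="train",
    shuffle=True,
)
for example in dataset.take(1):
    print(example)
```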
@@ -56,7 +56,7 @@ export BUCKET=gs://your_bucket/
 export CACHE_DIR="${BUCKET}/your_cache_dir"
 export MODEL_DIR="${BUCKET}/your_model_dir"

-export MIXTURE_NAME="d4_gpt_sglue_train"
+export MIXTURE_NAME="t0++_train"
 export TRAIN_STEPS=1112200

 t5_mesh_transformer \
@@ -93,7 +93,7 @@ export BUCKET=gs://your_bucket/
 export CACHE_DIR="${BUCKET}/your_cache_dir"
 export MODEL_DIR="${BUCKET}/your_model_dir"

-export EVAL_MIXTURE_NAME="d4_score_eval"
+export EVAL_MIXTURE_NAME="t0_eval_score_eval"
 export TRAIN_STEPS=1112200

 t5_mesh_transformer \

From 9c63cbad07f41da8ad39d50ddd9a11abeec704fe Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Fri, 7 Jan 2022 01:18:57 +0100
Subject: [PATCH 14/16] Revert "Remove template.py"

This reverts commit 1cfdb26f6a56d9583d4107e76ba171185ebca8aa.

---
 evaluation/README.md        |   9 +--
 evaluation/template_list.py | 106 ++++++++++++++++++++++++++++++++++++
 examples/README.md          |   9 +--
 3 files changed, 108 insertions(+), 16 deletions(-)
 create mode 100644 evaluation/template_list.py

diff --git a/evaluation/README.md b/evaluation/README.md
index d8c8897..f719faf 100644
--- a/evaluation/README.md
+++ b/evaluation/README.md
@@ -19,14 +19,7 @@ python run_eval.py \
     --output_dir ./debug
 ```

-You are expected to modify the `dataset_name`, the `dataset_config_name` and the `template_name`. The templates used for T0 evaluation can be listed as follows:
-```python
-import seqio
-import t0.seqio_tasks
-
-for task in seqio.MixtureRegistry.get("t0_eval_score_eval").tasks:
-    print(task.name)
-```
+You are expected to modify the `dataset_name`, the `dataset_config_name` and the `template_name`. The list of templates per data(sub)set is available in [this file](template_list.py).

 If you evaluate on ANLI (R1, R2 or R3), the `dataset_config_name` should be `dev_r1`, `dev_r2` or `dev_r3`.
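Once the file is restored (see the diff below), the dictionary it defines can be iterated directly to drive per-template evaluation runs. A small sketch, with the `print` standing in for an actual `run_eval.py` invocation:

```python
from template_list import template_list

# Walk every (dataset, subset) pair and its associated evaluation templates.
for (dataset_name, dataset_config_name), template_names in template_list.items():
    for template_name in template_names:
        print(dataset_name, dataset_config_name, template_name)
```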
diff --git a/evaluation/template_list.py b/evaluation/template_list.py
new file mode 100644
index 0000000..cb0964b
--- /dev/null
+++ b/evaluation/template_list.py
@@ -0,0 +1,106 @@
+template_list = {
+    ("super_glue", "rte"): [
+        "MNLI crowdsource",
+        "guaranteed true",
+        "can we infer",
+        "GPT-3 style",
+        "does this impl",
+        "should assume",
+        "does it follow that",
+        "based on the previous passage",
+        "justified in saying",
+        "must be true",
+    ],
+    ("super_glue", "cb"): [
+        "can we infer",
+        "based on the previous passage",
+        "claim true/false/inconclusive",
+        "does it follow that",
+        "justified in saying",
+        "always/sometimes/never",
+        "GPT-3 style",
+        "consider always/sometimes/never",
+        "guaranteed true",
+        "must be true",
+        "guaranteed/possible/impossible",
+        "does this imply",
+        "MNLI crowdsource",
+        "should assume",
+        "take the following as truth",
+    ],
+    ("anli", None): [
+        "MNLI crowdsource",
+        "should assume",
+        "does it follow that",
+        "GPT-3 style",
+        "based on the previous passage",
+        "justified in saying",
+        "take the following as truth",
+        "must be true",
+        "can we infer",
+        "guaranteed/possible/impossible",
+        "always/sometimes/never",
+        "does this imply",
+        "consider always/sometimes/never",
+        "claim true/false/inconclusive",
+        "guaranteed true",
+    ],
+    ("super_glue", "wsc.fixed"): [
+        "does the pronoun refer to",
+        "by p they mean",
+        "in other words",
+        "I think they mean",
+        "does p stand for",
+        "GPT-3 Style",
+        "replaced with",
+        "p is/are r",
+        "the pronoun refers to",
+        "Who or what is/are",
+    ],
+    ("winogrande", "winogrande_xl"): [
+        "does underscore refer to",
+        "stand for",
+        "underscore refer to",
+        "fill in the blank",
+        "Replace",
+    ],
+    ("story_cloze", "2016"): [
+        "Answer Given options",
+        "Choose Story Ending",
+        "Movie What Happens Next",
+        "Story Continuation and Options",
+        "Novel Correct Ending",
+    ],
+    ("super_glue", "wic"): [
+        "question-context-meaning-with-label",
+        "question-context-meaning",
+        "grammar_homework",
+        "affirmation_true_or_false",
+        "GPT-3-prompt",
+        "same_sense",
+        "question-context",
+        "GPT-3-prompt-with-label",
+        "polysemous",
+        "similar-sense",
+    ],
+    ("hellaswag", None): [
+        "Predict ending with hint",
+        "Randomized prompts template",
+        "complete_first_then",
+        "if_begins_how_continues",
+    ],
+    ("super_glue", "copa"): [
+        "exercise",
+        "…What could happen next, C1 or C2?",
+        "i_am_hesitating",
+        "plausible_alternatives",
+        "C1 or C2? premise, so/because…",
+        "…As a result, C1 or C2?",
+        "best_option",
+        "…which may be caused by",
+        "more likely",
+        "cause_effect",
+        "…why? C1 or C2",
+        "choose",
+    ]
+}
diff --git a/examples/README.md b/examples/README.md
index 4a4c219..1826d3c 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -12,14 +12,7 @@ python single_task_fine_tune.py \

 The per epoch evaluation results will be saved as a CSV file in the `output_dir`. By default, it trains on the whole dataset. Optionally, you can pass `--num_shots` to train it on a random subset of examples.

-Like the zero-shot evaluation [script](../evaluation/run_eval.py), you are expected to provide `dataset_name`, `dataset_config_name`, and `template_name`. You can find the list of templates per mixture using the following script; these were the templates we used in the T0 paper (and installed if you did `pip install -r requirements`.) However, [`promptsource`](https://github.com/bigscience-workshop/promptsource) is being continuously updated, so if you don't intend to reproduce the exact results from our paper, you may want to install the latest `promptsource` and call, for example, `DatasetTemplates("super_glue", "rte").all_template_names` to access the new templates.
-```python
-import seqio
-import promptsource.seqio_tasks
-
-for task in seqio.MixtureRegistry.get("t0_eval_score_eval").tasks:
-    print(task.name)
-```
+Like the zero-shot evaluation [script](../evaluation/run_eval.py), you are expected to provide `dataset_name`, `dataset_config_name`, and `template_name`. You can find the list of templates per dataset in [this file](../evaluation/template_list.py); these were the templates we used in the T0 paper (and installed if you did `pip install -r requirements`.) However, [`promptsource`](https://github.com/bigscience-workshop/promptsource) is being continuously updated, so if you don't intend to reproduce the exact results from our paper, you may want to install the latest `promptsource` and call, for example, `DatasetTemplates("super_glue", "rte").all_template_names` to access the new templates.

 ## Distributed Training

From 5f5bc4fef77128806f4e49666da5ed2b15724b97 Mon Sep 17 00:00:00 2001
From: Thomas Wang <24695242+thomasw21@users.noreply.github.com>
Date: Fri, 7 Jan 2022 01:24:09 +0100
Subject: [PATCH 15/16] Apply suggestions from code review

Co-authored-by: Victor SANH
---
 setup.py                |  2 +-
 t0/seqio_tasks/tasks.py | 13 +++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index fafe8a7..ab98844 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@
     license="Apache Software License 2.0",
     long_description=readme,
     install_requires=[
-        "git+git://github.com/bigscience-workshop/promptsource@v0.1.0",
+        "promptsource @ git+git://github.com/bigscience-workshop/promptsource@v0.1.0",
         "accelerate",
         "transformers",
         "torch",
diff --git a/t0/seqio_tasks/tasks.py b/t0/seqio_tasks/tasks.py
index 6c62bef..ef8a81b 100644
--- a/t0/seqio_tasks/tasks.py
+++ b/t0/seqio_tasks/tasks.py
@@ -1,3 +1,16 @@
+"""
+This file defines 8 mixtures that we used in the T-Zero paper:
+- t0_train: T0 training mixture
+- t0+_train: T0+ training mixture
+- t0++_train: T0++ training mixture
+- t0_eval_score_eval: T0 main evaluation mixture (Figure 4 for instance)
+- t0_train_score_eval: Evaluation mixture for checkpoint selection on T0 (validation splits of the training sets)
+- t0_train_one_og_prompt: T0 (p=1) training mixture - one original-task prompt per dataset. Figure 6
+- t0_train_all_og_prompts: T0 (p=5.7) training mixture - all original-task prompts for all datasets. Figure 6
+- bias_fairness_eval_score_eval: Bias & fairness evaluation mixture. Appendix B3
+"""
+
+
 import csv
 import functools
 from typing import Dict, List, Optional, Tuple

From f88ed3683faf7a8b4077b9b763dcf141572c2dea Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Fri, 7 Jan 2022 01:24:38 +0100
Subject: [PATCH 16/16] Nit

---
 t0/seqio_tasks/tasks.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/t0/seqio_tasks/tasks.py b/t0/seqio_tasks/tasks.py
index ef8a81b..4962ed7 100644
--- a/t0/seqio_tasks/tasks.py
+++ b/t0/seqio_tasks/tasks.py
@@ -17,7 +17,7 @@
 import datasets
 import pkg_resources
-import promptsource.templates
+from promptsource import templates
 import seqio
 import t5
 from t5.data.glue_utils import get_glue_metric, get_super_glue_metric
@@ -187,7 +187,7 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map

 all_datasets = sum(t0_train.values(), []) + sum(t0_eval.values(), [])

-all_templates = promptsource.templates.TemplateCollection()
+all_templates = templates.TemplateCollection()
 all_templates.remove("anli")  # Need to special-case ANLI due to weird split conventions

 # 3 stages of training/ablation: D4 -> GPT -> SuperGLUE
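After the final patch, templates are accessed through the `templates` module alias rather than the fully qualified `promptsource.templates` path. A short usage sketch of the resulting API; the `get_dataset` lookup mirrors how `tasks.py` resolves templates for a (dataset, subset) pair:

```python
from promptsource import templates

collection = templates.TemplateCollection()
collection.remove("anli")  # ANLI is special-cased because of its split conventions

# Look up the templates for one dataset, as tasks.py does when registering tasks.
rte_templates = collection.get_dataset("super_glue", "rte")
print(rte_templates.all_template_names)
```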