From 36b8f39157d9ddb8dc2f3467e7ac10355d18476d Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 30 Mar 2022 16:16:02 +0200 Subject: [PATCH 01/10] update tasks list --- src/datasets/utils/resources/tasks.json | 259 +++++++++++++++--------- 1 file changed, 158 insertions(+), 101 deletions(-) diff --git a/src/datasets/utils/resources/tasks.json b/src/datasets/utils/resources/tasks.json index 0585f97b917..540e6ffbccd 100644 --- a/src/datasets/utils/resources/tasks.json +++ b/src/datasets/utils/resources/tasks.json @@ -1,54 +1,110 @@ { - "conditional-text-generation": { - "description": "data-to-text and text transduction tasks such as translation or summarization", - "options": [ - "machine-translation", - "sentence-splitting-fusion", - "summarization", - "table-to-text", - "text-simplification", - "explanation-generation", - "other-structured-to-text", - "other" + "audio-classification": { + "type": "audio", + "subtasks": [ + "keyword-spotting", + "speaker-identification", + "speaker-intent-classification", + "emotion-recognition", + "speaker-language-identification" + ] + }, + "audio-to-audio": { + "type": "audio" + }, + "automatic-speech-recognition": { + "type": "multimodal" + }, + "conversational": { + "type": "text", + "subtasks": [ + "dialogue-generation" + ] + }, + "feature-extraction": { + "type": "multimodal" + }, + "fill-mask": { + "type": "text", + "subtasks": [ + "slot-filling", + "masked-language-modeling" + ] + }, + "image-classification": { + "type": "image", + "subtasks": [ + "multi-label-image-classification", + "multi-class-image-classification" + ] + }, + "image-segmentation": { + "type": "image", + "subtasks": [ + "instance-segmentation", + "semantic-segmentation", + "panoptic-segmentation" + ] + }, + "image-to-text": { + "type": "multimodal", + "subtasks": [ + "image-captioning" + ] + }, + "multiple-choice": { + "type": "text", + "subtasks": [ + "multiple-choice-question-answering", + "multiple-choice-coreference-resolution" + ] + }, + 
"object-detection": { + "type": "image", + "subtasks": [ + "face-detection", + "vehicle-detection" ] }, "question-answering": { - "description": "question answering tasks", - "options": [ - "open-domain-qa", - "closed-domain-qa", - "multiple-choice-qa", + "type": "text", + "aliases": [ + "extractive-question-answering" + ], + "subtasks": [ "extractive-qa", - "abstractive-qa", - "other" + "open-domain-qa", + "closed-domain-qa" ] }, - "sequence-modeling": { - "description": "such as language modeling or dialogue", - "options": [ - "dialogue-modeling", - "language-modeling", - "other-multi-turn", - "slot-filling", - "other" + "sentence-similarity": { + "type": "text" + }, + "tabular-classification": { + "type": "text" + }, + "tabular-to-text": { + "type": "text", + "subtasks": [ + "rdf-to-text" ] }, - "structure-prediction": { - "description": "predicting structural properties of the text, such as syntax", - "options": [ - "coreference-resolution", - "named-entity-recognition", - "part-of-speech-tagging", - "parsing", - "semantic-role-labeling", - "lemmatization", - "word-sense-disambiguation", - "other" + "summarization": { + "type": "text", + "subtasks": [ + "news-articles-summarization", + "news-articles-headline-generation" ] }, + "table-to-text": { + "type": "text" + }, + "table-question-answering": { + "type": "text" + }, "text-classification": { - "description": "predicting a class index or boolean value", - "options": [ + "type": "text", + "subtasks": [ "acceptability-classification", "entity-linking-classification", "fact-checking", @@ -59,90 +115,91 @@ "semantic-similarity-classification", "sentiment-classification", "topic-classification", - "other" + "semantic-similarity-scoring", + "sentiment-scoring", + "sentiment-analysis", + "hate-speech-detection", + "text-scoring" + ] + }, + "text-generation": { + "type": "text", + "subtasks": [ + "dialogue-modeling", + "language-modeling" ] }, "text-retrieval": { - "description": "information or text retrieval 
tasks", - "options": [ + "type": "text", + "subtasks": [ "document-retrieval", "utterance-retrieval", "entity-linking-retrieval", - "fact-checking-retrieval", - "other" + "fact-checking-retrieval" ] }, - "text-scoring": { - "description": "text scoring tasks, predicting a real valued score for some text", - "options": [ - "semantic-similarity-scoring", - "sentiment-scoring", - "other" + "text-to-image": { + "type": "multimodal" + }, + "text-to-tabular": { + "type": "text", + "subtasks": [ + "relation-extraction", + "semantic-role-labeling" ] }, - "speech-processing": { - "description": "tasks related to the analysis and representations of speech signals", - "options": [ - "automatic-speech-recognition", - "phoneme-recognition", - "keyword-spotting", - "query-by-example-spoken-term-detection", - "speaker-identification", - "automatic-speaker-verification", - "speaker-diarization", - "intent-classification", - "slot-filling", - "emotion-recognition" + "text-to-speech": { + "type": "multimodal" + }, + "text2text-generation": { + "type": "text", + "subtasks": [ + "text-simplification", + "explanation-generation", + "abstractive-qa", + "open-domain-abstractive-qa", + "closed-domain-qa", + "open-book-qa", + "closed-book-qa" ] }, "time-series-forecasting": { - "description": "tasks related to predicting future values of a time series", - "options": [ + "type": "time series", + "subtasks": [ "univariate-time-series-forecasting", "multivariate-time-series-forecasting" ] }, - "object-detection": { - "description": "tasks related to detecting instances of objects from a particular class in an image", - "options": [ - "face-detection", - "other" + "token-classification": { + "type": "text", + "aliases": [ + "structure-prediction" + ], + "subtasks": [ + "named-entity-recognition", + "part-of-speech-tagging", + "parsing", + "lemmatization", + "word-sense-disambiguation", + "coreference-resolution" ] }, - "image-to-text": { - "description": "tasks related to generating text from 
images", - "options": [ - "image-captioning", - "other" - ] + "translation": { + "type": "text" }, - "text-to-image": { - "description": "tasks related to generating images from text", - "options": [] + "visual-question-answering": { + "type": "multimodal" }, - "image-segmentation": { - "description": "tasks related to detecting and delineating distinct objects in images", - "options": [ - "instance-segmentation", - "semantic-segmentation", - "panoptic-segmentation", - "other" - ] + "voice-activity-detection": { + "type": "audio" }, - "image-classification": { - "description": "tasks related to identifying what images represent", - "options": [ - "multi-label-image-classification", - "single-label-image-classification", - "other" - ] - + "zero-shot-classification": { + "type": "text" }, - - "other": { - "description": "other task family not mentioned here", - "options": [ - "other" - ] + "zero-shot-image-classification": { + "type": "multimodal" + }, + "reinforcement-learning": { + "type": "other" } } \ No newline at end of file From 1ccbe6b38dd8af67f0b04d9cbcf0b66aac8f20e5 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 30 Mar 2022 16:16:17 +0200 Subject: [PATCH 02/10] update tags in dataset cards --- datasets/acronym_identification/README.md | 4 +- datasets/ade_corpus_v2/README.md | 4 +- datasets/afrikaans_ner_corpus/README.md | 2 +- datasets/air_dialogue/README.md | 8 ++-- datasets/allegro_reviews/README.md | 3 +- datasets/alt/README.md | 5 +-- datasets/amazon_reviews_multi/README.md | 9 ++-- datasets/ami/README.md | 3 +- datasets/amttl/README.md | 2 +- datasets/app_reviews/README.md | 3 +- datasets/aquamuse/README.md | 1 + datasets/arabic_billion_words/README.md | 4 +- datasets/arabic_pos_dialect/README.md | 2 +- datasets/arabic_speech_corpus/README.md | 3 +- datasets/arxiv_dataset/README.md | 5 +-- datasets/asset/README.md | 8 ++-- datasets/assin/README.md | 2 +- datasets/assin2/README.md | 2 +- datasets/atomic/README.md | 4 +- 
datasets/autshumato/README.md | 5 +-- datasets/bbaw_egyptian/README.md | 5 +-- datasets/bc2gm_corpus/README.md | 2 +- datasets/best2009/README.md | 4 +- datasets/bianet/README.md | 5 +-- datasets/bible_para/README.md | 5 +-- datasets/big_patent/README.md | 4 +- datasets/billsum/README.md | 4 +- datasets/biosses/README.md | 3 +- datasets/blbooks/README.md | 4 +- datasets/blbooksgenre/README.md | 4 +- datasets/bnl_newspapers/README.md | 4 +- datasets/bookcorpus/README.md | 4 +- datasets/bookcorpusopen/README.md | 4 +- datasets/brwac/README.md | 4 +- datasets/bsd_ja_en/README.md | 4 +- datasets/bswac/README.md | 4 +- datasets/c4/README.md | 4 +- datasets/caner/README.md | 2 +- datasets/capes/README.md | 5 ++- datasets/casino/README.md | 6 +-- datasets/cawac/README.md | 4 +- datasets/cc100/README.md | 4 +- datasets/cc_news/README.md | 4 +- datasets/chr_en/README.md | 19 ++++---- datasets/climate_fever/README.md | 2 +- datasets/cmu_hinglish_dog/README.md | 5 +-- datasets/cnn_dailymail/README.md | 4 +- datasets/coached_conv_pref/README.md | 5 ++- datasets/code_search_net/README.md | 4 +- .../README.md | 3 +- .../README.md | 3 +- .../README.md | 3 +- .../README.md | 4 +- .../code_x_glue_cc_code_refinement/README.md | 6 +-- .../README.md | 4 +- .../code_x_glue_ct_code_to_text/README.md | 4 +- .../code_x_glue_tc_text_to_code/README.md | 4 +- .../code_x_glue_tt_text_to_text/README.md | 4 +- datasets/common_gen/README.md | 4 ++ datasets/common_language/README.md | 4 +- datasets/common_voice/README.md | 3 +- datasets/competition_math/README.md | 4 +- datasets/conll2002/README.md | 2 +- datasets/conll2003/README.md | 2 +- datasets/conll2012_ontonotesv5/README.md | 2 +- datasets/conllpp/README.md | 2 +- datasets/conv_ai/README.md | 7 +-- datasets/conv_ai_2/README.md | 5 ++- datasets/conv_ai_3/README.md | 5 ++- datasets/conv_questions/README.md | 3 +- datasets/counter/README.md | 2 +- datasets/covost2/README.md | 3 +- datasets/craigslist_bargains/README.md | 3 +- 
datasets/crd3/README.md | 6 +-- datasets/crows_pairs/README.md | 5 ++- datasets/cs_restaurants/README.md | 10 +++-- datasets/curiosity_dialogs/README.md | 8 ++-- datasets/dane/README.md | 2 +- datasets/dart/README.md | 6 +-- datasets/dbrd/README.md | 6 ++- datasets/deal_or_no_dialog/README.md | 5 +-- datasets/dialog_re/README.md | 3 +- datasets/disaster_response_messages/README.md | 2 +- datasets/drop/README.md | 1 + datasets/duorc/README.md | 1 + datasets/e2e_nlg/README.md | 6 +-- datasets/e2e_nlg_cleaned/README.md | 6 +-- datasets/ecb/README.md | 5 +-- datasets/ehealth_kd/README.md | 4 +- datasets/eitb_parcc/README.md | 5 +-- datasets/eli5/README.md | 6 +-- datasets/eli5_category/README.md | 6 +-- datasets/elkarhizketak/README.md | 2 +- datasets/emea/README.md | 5 +-- datasets/enriched_web_nlg/README.md | 6 +-- datasets/euronews/README.md | 2 +- datasets/europa_eac_tm/README.md | 7 ++- datasets/europa_ecdc_tm/README.md | 7 ++- datasets/europarl_bilingual/README.md | 5 +-- datasets/farsi_news/README.md | 4 +- datasets/finer/README.md | 2 +- datasets/flores/README.md | 5 +-- datasets/gem/README.md | 45 ++++++++++--------- datasets/generated_reviews_enth/README.md | 3 +- .../german_legal_entity_recognition/README.md | 2 +- datasets/germaner/README.md | 2 +- datasets/giga_fren/README.md | 5 +-- datasets/gigaword/README.md | 4 +- datasets/glucose/README.md | 8 ++-- datasets/glue/README.md | 3 +- datasets/google_wellformed_query/README.md | 4 +- datasets/great_code/README.md | 3 +- datasets/harem/README.md | 2 +- datasets/has_part/README.md | 5 ++- datasets/hate_speech_pl/README.md | 2 +- datasets/hausa_voa_ner/README.md | 2 +- datasets/hebrew_projectbenyehuda/README.md | 4 +- datasets/hebrew_this_world/README.md | 4 +- datasets/hind_encorp/README.md | 5 +-- datasets/hindi_discourse/README.md | 8 ++-- datasets/hippocorpus/README.md | 5 ++- datasets/hkcancor/README.md | 6 +-- datasets/hrenwac_para/README.md | 5 +-- datasets/hrwac/README.md | 4 +- 
datasets/humicroedit/README.md | 5 ++- datasets/id_liputan6/README.md | 3 +- datasets/id_nergrit_corpus/README.md | 2 +- datasets/id_newspapers_2018/README.md | 4 +- datasets/id_panl_bppt/README.md | 5 +-- datasets/id_puisi/README.md | 8 ++-- .../README.md | 5 +-- datasets/igbo_monolingual/README.md | 4 +- datasets/igbo_ner/README.md | 2 +- datasets/indonlu/README.md | 16 +++---- datasets/inquisitive_qg/README.md | 4 +- datasets/irc_disentangle/README.md | 4 +- datasets/isixhosa_ner_corpus/README.md | 2 +- datasets/isizulu_ner_corpus/README.md | 2 +- datasets/jfleg/README.md | 4 +- datasets/jigsaw_unintended_bias/README.md | 5 ++- datasets/jnlpba/README.md | 2 +- datasets/kd_conv/README.md | 4 +- datasets/kde4/README.md | 5 +-- datasets/kilt_tasks/README.md | 13 +++--- datasets/klue/README.md | 14 +++--- datasets/kor_ner/README.md | 2 +- datasets/lama/README.md | 5 ++- datasets/lambada/README.md | 4 +- datasets/lener_br/README.md | 2 +- datasets/librispeech_asr/README.md | 3 +- datasets/limit/README.md | 2 +- datasets/linnaeus/README.md | 2 +- datasets/lj_speech/README.md | 3 +- datasets/lst20/README.md | 8 ++-- datasets/m_lama/README.md | 5 ++- datasets/mac_morpho/README.md | 2 +- datasets/makhzan/README.md | 4 +- datasets/masakhaner/README.md | 2 +- datasets/mbpp/README.md | 4 +- datasets/mc4/README.md | 4 +- datasets/mdd/README.md | 3 +- datasets/menyo20k_mt/README.md | 5 +-- datasets/meta_woz/README.md | 3 +- datasets/miam/README.md | 8 +++- datasets/mkb/README.md | 4 +- datasets/mlsum/README.md | 3 +- datasets/ms_terms/README.md | 5 +-- datasets/msr_text_compression/README.md | 3 +- .../msr_zhen_translation_parity/README.md | 5 +-- datasets/msra_ner/README.md | 2 +- datasets/mt_eng_vietnamese/README.md | 5 +-- datasets/multi_news/README.md | 5 ++- datasets/multi_nli/README.md | 3 +- datasets/multi_nli_mismatch/README.md | 3 +- datasets/multi_para_crawl/README.md | 5 +-- datasets/multi_woz_v22/README.md | 5 ++- datasets/multi_x_science_sum/README.md | 6 +-- 
datasets/multilingual_librispeech/README.md | 3 +- datasets/mutual_friends/README.md | 3 +- datasets/narrativeqa/README.md | 2 +- datasets/narrativeqa_manual/README.md | 2 +- datasets/ncbi_disease/README.md | 2 +- datasets/nchlt/README.md | 2 +- datasets/ncslgr/README.md | 5 +-- datasets/nell/README.md | 5 ++- datasets/news_commentary/README.md | 5 +-- datasets/newsph/README.md | 4 +- datasets/newspop/README.md | 5 ++- datasets/newsroom/README.md | 4 +- datasets/nkjp-ner/README.md | 2 +- datasets/norec/README.md | 2 +- datasets/norne/README.md | 2 +- datasets/norwegian_ner/README.md | 2 +- datasets/numer_sense/README.md | 3 +- datasets/numeric_fused_head/README.md | 4 +- datasets/oclar/README.md | 2 +- datasets/ofis_publik/README.md | 5 +-- datasets/ollie/README.md | 5 +-- datasets/onestop_english/README.md | 2 +- datasets/open_subtitles/README.md | 5 +-- datasets/openai_humaneval/README.md | 4 +- datasets/openslr/README.md | 3 +- datasets/openwebtext/README.md | 4 +- datasets/opus100/README.md | 4 +- datasets/opus_books/README.md | 5 +-- datasets/opus_dgt/README.md | 5 +-- datasets/opus_dogc/README.md | 5 +-- datasets/opus_elhuyar/README.md | 5 +-- datasets/opus_euconst/README.md | 5 +-- datasets/opus_finlex/README.md | 5 +-- datasets/opus_fiskmo/README.md | 5 +-- datasets/opus_gnome/README.md | 5 +-- datasets/opus_infopankki/README.md | 5 +-- datasets/opus_memat/README.md | 5 +-- datasets/opus_montenegrinsubs/README.md | 5 +-- datasets/opus_openoffice/README.md | 5 +-- datasets/opus_paracrawl/README.md | 5 +-- datasets/opus_rf/README.md | 5 +-- datasets/opus_tedtalks/README.md | 5 +-- datasets/opus_ubuntu/README.md | 5 +-- datasets/opus_wikipedia/README.md | 5 +-- datasets/opus_xhosanavy/README.md | 5 +-- datasets/orange_sum/README.md | 5 ++- datasets/oscar/README.md | 4 +- datasets/para_crawl/README.md | 5 +-- datasets/para_pat/README.md | 4 +- datasets/paws-x/README.md | 4 +- datasets/paws/README.md | 4 +- datasets/pec/README.md | 3 +- 
datasets/peoples_daily_ner/README.md | 2 +- datasets/persian_ner/README.md | 2 +- datasets/php/README.md | 5 +-- datasets/pib/README.md | 7 +-- datasets/pn_summary/README.md | 5 ++- datasets/poleval2019_mt/README.md | 5 +-- datasets/polsum/README.md | 4 +- datasets/polyglot_ner/README.md | 2 +- datasets/psc/README.md | 4 +- datasets/ptb_text_only/README.md | 4 +- datasets/pubmed/README.md | 10 ++--- datasets/py_ast/README.md | 8 ++-- datasets/qed_amara/README.md | 5 +-- datasets/quac/README.md | 3 +- datasets/recipe_nlg/README.md | 6 ++- datasets/reddit/README.md | 4 +- datasets/reddit_tifu/README.md | 6 +-- datasets/refresd/README.md | 2 +- datasets/ro_sts/README.md | 3 +- datasets/ro_sts_parallel/README.md | 5 +-- datasets/ronec/README.md | 2 +- datasets/s2orc/README.md | 4 +- datasets/samsum/README.md | 4 +- datasets/sanskrit_classic/README.md | 4 +- datasets/saudinewsnet/README.md | 4 +- datasets/scb_mt_enth_2020/README.md | 5 +-- datasets/schema_guided_dstc8/README.md | 5 ++- datasets/scielo/README.md | 5 +-- datasets/scitldr/README.md | 4 +- datasets/sede/README.md | 2 +- datasets/sem_eval_2014_task_1/README.md | 2 +- datasets/sem_eval_2020_task_11/README.md | 4 +- datasets/senti_ws/README.md | 7 +-- datasets/sepedi_ner/README.md | 2 +- datasets/sesotho_ner_corpus/README.md | 2 +- datasets/setimes/README.md | 5 +-- datasets/setswana_ner_corpus/README.md | 2 +- datasets/silicone/README.md | 24 +++++++++- datasets/siswati_ner_corpus/README.md | 2 +- datasets/smartdata/README.md | 2 +- .../snow_simplified_japanese_corpus/README.md | 5 +-- datasets/so_stacksample/README.md | 4 +- datasets/social_bias_frames/README.md | 4 +- datasets/sofc_materials_articles/README.md | 5 ++- datasets/spanish_billion_words/README.md | 4 +- datasets/spc/README.md | 5 +-- datasets/species_800/README.md | 2 +- datasets/speech_commands/README.md | 4 +- datasets/spider/README.md | 4 +- datasets/srwac/README.md | 4 +- datasets/sst/README.md | 2 +- datasets/stsb_multi_mt/README.md | 3 +- 
datasets/superb/README.md | 10 ++--- datasets/swahili/README.md | 4 +- datasets/swedish_medical_ner/README.md | 2 +- datasets/swedish_ner_corpus/README.md | 2 +- datasets/tanzil/README.md | 5 +-- datasets/tapaco/README.md | 6 +-- datasets/tashkeela/README.md | 4 +- datasets/taskmaster1/README.md | 3 +- datasets/taskmaster2/README.md | 3 +- datasets/taskmaster3/README.md | 3 +- datasets/tatoeba/README.md | 5 +-- datasets/ted_iwlst2013/README.md | 5 +-- datasets/ted_talks_iwslt/README.md | 5 +-- datasets/telugu_books/README.md | 4 +- datasets/telugu_news/README.md | 4 +- datasets/tep_en_fa_para/README.md | 5 +-- datasets/text2log/README.md | 5 +-- datasets/thainer/README.md | 2 +- datasets/thaisum/README.md | 7 +-- datasets/the_pile/README.md | 4 +- datasets/the_pile_books3/README.md | 4 +- datasets/the_pile_openwebtext2/README.md | 4 +- datasets/the_pile_stack_exchange/README.md | 4 +- datasets/tilde_model/README.md | 5 +-- .../times_of_india_news_headlines/README.md | 4 +- datasets/timit_asr/README.md | 3 +- datasets/tlc/README.md | 4 +- datasets/tmu_gfm_dataset/README.md | 4 +- datasets/totto/README.md | 3 +- datasets/trivia_qa/README.md | 2 + datasets/turk/README.md | 2 +- datasets/turkic_xwmt/README.md | 5 +-- datasets/turkish_ner/README.md | 2 +- datasets/turkish_shrinked_ner/README.md | 2 +- datasets/turku_ner_corpus/README.md | 2 +- datasets/tweets_ar_en_parallel/README.md | 4 +- datasets/twi_text_c3/README.md | 4 +- datasets/twi_wordsim353/README.md | 3 +- datasets/udhr/README.md | 5 +-- datasets/um005/README.md | 5 +-- datasets/un_ga/README.md | 5 +-- datasets/un_multi/README.md | 5 +-- datasets/un_pc/README.md | 5 +-- datasets/universal_morphologies/README.md | 4 +- datasets/vctk/README.md | 3 +- datasets/vivos/README.md | 3 +- datasets/web_nlg/README.md | 34 +------------- datasets/weibo_ner/README.md | 2 +- datasets/wi_locness/README.md | 4 +- datasets/wiki_asp/README.md | 4 +- datasets/wiki_atomic_edits/README.md | 4 +- datasets/wiki_auto/README.md | 2 
+- datasets/wiki_bio/README.md | 4 +- datasets/wiki_dpr/README.md | 4 +- datasets/wiki_lingua/README.md | 3 +- datasets/wiki_source/README.md | 5 +-- datasets/wiki_summary/README.md | 5 ++- datasets/wikiann/README.md | 2 +- datasets/wikicorpus/README.md | 28 +++++++----- datasets/wikipedia/README.md | 4 +- datasets/wikitext/README.md | 4 +- datasets/wikitext_tl39/README.md | 4 +- datasets/wino_bias/README.md | 2 +- datasets/winograd_wsc/README.md | 2 +- datasets/wisesight1000/README.md | 4 +- datasets/wmt14/README.md | 5 +-- datasets/wmt15/README.md | 5 +-- datasets/wmt16/README.md | 5 +-- datasets/wmt17/README.md | 5 +-- datasets/wmt18/README.md | 5 +-- datasets/wmt19/README.md | 5 +-- datasets/wmt20_mlqe_task1/README.md | 5 +-- datasets/wmt20_mlqe_task2/README.md | 5 +-- datasets/wmt20_mlqe_task3/README.md | 5 +-- datasets/wmt_t2t/README.md | 5 +-- datasets/wnut_17/README.md | 2 +- datasets/woz_dialogue/README.md | 5 ++- datasets/xglue/README.md | 13 +++--- datasets/xsum/README.md | 4 +- datasets/xsum_factuality/README.md | 4 +- datasets/xtreme/README.md | 2 +- datasets/yoruba_gv_ner/README.md | 2 +- datasets/yoruba_text_c3/README.md | 4 +- datasets/yoruba_wordsim353/README.md | 3 +- .../youtube_caption_corrections/README.md | 3 +- datasets/zest/README.md | 4 +- 367 files changed, 858 insertions(+), 786 deletions(-) diff --git a/datasets/acronym_identification/README.md b/datasets/acronym_identification/README.md index a2e205679fd..dbe60263ce4 100644 --- a/datasets/acronym_identification/README.md +++ b/datasets/acronym_identification/README.md @@ -14,9 +14,9 @@ size_categories: source_datasets: - original task_categories: -- structure-prediction +- token-classification task_ids: -- structure-prediction-other-acronym-identification +- token-classification-other-acronym-identification paperswithcode_id: acronym-identification pretty_name: Acronym Identification Dataset --- diff --git a/datasets/ade_corpus_v2/README.md b/datasets/ade_corpus_v2/README.md index 
5d375ed9e2b..14246693348 100644 --- a/datasets/ade_corpus_v2/README.md +++ b/datasets/ade_corpus_v2/README.md @@ -22,9 +22,9 @@ task_categories: Ade_corpus_v2_classification: - text-classification Ade_corpus_v2_drug_ade_relation: - - structure-prediction + - token-classification Ade_corpus_v2_drug_dosage_relation: - - structure-prediction + - token-classification task_ids: Ade_corpus_v2_classification: - fact-checking diff --git a/datasets/afrikaans_ner_corpus/README.md b/datasets/afrikaans_ner_corpus/README.md index b32383cb67d..44385876c49 100644 --- a/datasets/afrikaans_ner_corpus/README.md +++ b/datasets/afrikaans_ner_corpus/README.md @@ -14,7 +14,7 @@ size_categories: source_datasets: - original task_categories: -- structure-prediction +- token-classification task_ids: - named-entity-recognition paperswithcode_id: null diff --git a/datasets/air_dialogue/README.md b/datasets/air_dialogue/README.md index e6952f18de6..112569c12eb 100644 --- a/datasets/air_dialogue/README.md +++ b/datasets/air_dialogue/README.md @@ -15,12 +15,14 @@ size_categories: source_datasets: - original task_categories: -- conditional-text-generation -- sequence-modeling +- conversational +- text-generation +- fill-mask task_ids: -- conditional-text-generation-other-dialogue-generation +- dialogue-generation - dialogue-modeling - language-modeling +- masked-language-modeling paperswithcode_id: null --- diff --git a/datasets/allegro_reviews/README.md b/datasets/allegro_reviews/README.md index 53b29b53d7b..c9f8434335f 100644 --- a/datasets/allegro_reviews/README.md +++ b/datasets/allegro_reviews/README.md @@ -14,9 +14,10 @@ size_categories: source_datasets: - original task_categories: -- text-scoring +- text-classification task_ids: - sentiment-scoring +- text-scoring paperswithcode_id: allegro-reviews pretty_name: Allegro Reviews --- diff --git a/datasets/alt/README.md b/datasets/alt/README.md index 3546370e30d..a035b4e5594 100644 --- a/datasets/alt/README.md +++ b/datasets/alt/README.md @@ 
-40,10 +40,9 @@ size_categories: source_datasets: - original task_categories: -- conditional-text-generation -- structure-prediction +- translation +- token-classification task_ids: -- machine-translation - parsing paperswithcode_id: alt pretty_name: Asian Language Treebank diff --git a/datasets/amazon_reviews_multi/README.md b/datasets/amazon_reviews_multi/README.md index 55391781530..a998cf8511d 100644 --- a/datasets/amazon_reviews_multi/README.md +++ b/datasets/amazon_reviews_multi/README.md @@ -58,15 +58,16 @@ size_categories: source_datasets: - original task_categories: -- conditional-text-generation -- sequence-modeling +- summarization +- text-generation +- fill-mask - text-classification -- text-scoring task_ids: +- text-scoring - language-modeling +- masked-language-modeling - sentiment-classification - sentiment-scoring -- summarization - topic-classification paperswithcode_id: null pretty_name: The Multilingual Amazon Reviews Corpus diff --git a/datasets/ami/README.md b/datasets/ami/README.md index c5e763a76a5..93b1c0cb17d 100644 --- a/datasets/ami/README.md +++ b/datasets/ami/README.md @@ -16,9 +16,8 @@ size_categories: source_datasets: - original task_categories: -- speech-processing -task_ids: - automatic-speech-recognition +task_ids: [] --- # Dataset Card for AMI Corpus diff --git a/datasets/amttl/README.md b/datasets/amttl/README.md index 8670c554b65..7b5ea22e195 100644 --- a/datasets/amttl/README.md +++ b/datasets/amttl/README.md @@ -14,7 +14,7 @@ size_categories: source_datasets: - original task_categories: -- structure-prediction +- token-classification task_ids: - parsing paperswithcode_id: null diff --git a/datasets/app_reviews/README.md b/datasets/app_reviews/README.md index 1a932dad5b1..3a6fe0e664c 100644 --- a/datasets/app_reviews/README.md +++ b/datasets/app_reviews/README.md @@ -14,8 +14,9 @@ size_categories: source_datasets: - original task_categories: -- text-scoring +- text-classification task_ids: +- text-scoring - sentiment-scoring 
paperswithcode_id: null pretty_name: AppReviews diff --git a/datasets/aquamuse/README.md b/datasets/aquamuse/README.md index 41e80eae341..745e1bb6704 100644 --- a/datasets/aquamuse/README.md +++ b/datasets/aquamuse/README.md @@ -20,6 +20,7 @@ source_datasets: task_categories: - other - question-answering +- text2text-generation task_ids: - abstractive-qa - extractive-qa diff --git a/datasets/arabic_billion_words/README.md b/datasets/arabic_billion_words/README.md index ce93ac175f7..62d7bca307a 100644 --- a/datasets/arabic_billion_words/README.md +++ b/datasets/arabic_billion_words/README.md @@ -33,9 +33,11 @@ size_categories: source_datasets: - original task_categories: -- sequence-modeling +- text-generation +- fill-mask task_ids: - language-modeling +- masked-language-modeling paperswithcode_id: null pretty_name: Arabic Billion Words --- diff --git a/datasets/arabic_pos_dialect/README.md b/datasets/arabic_pos_dialect/README.md index a519eca613d..e6632488cc3 100644 --- a/datasets/arabic_pos_dialect/README.md +++ b/datasets/arabic_pos_dialect/README.md @@ -14,7 +14,7 @@ size_categories: source_datasets: - extended task_categories: -- structure-prediction +- token-classification task_ids: - part-of-speech-tagging paperswithcode_id: null diff --git a/datasets/arabic_speech_corpus/README.md b/datasets/arabic_speech_corpus/README.md index c9f064dd02d..a12f9bf6d56 100644 --- a/datasets/arabic_speech_corpus/README.md +++ b/datasets/arabic_speech_corpus/README.md @@ -16,9 +16,8 @@ size_categories: source_datasets: - original task_categories: -- speech-processing -task_ids: - automatic-speech-recognition +task_ids: [] --- # Dataset Card for Arabic Speech Corpus diff --git a/datasets/arxiv_dataset/README.md b/datasets/arxiv_dataset/README.md index fdd9b79b264..6793e480e05 100644 --- a/datasets/arxiv_dataset/README.md +++ b/datasets/arxiv_dataset/README.md @@ -14,15 +14,14 @@ size_categories: source_datasets: - original task_categories: -- conditional-text-generation +- 
translation +- summarization - text-retrieval task_ids: - document-retrieval - entity-linking-retrieval - explanation-generation - fact-checking-retrieval -- machine-translation -- summarization - text-simplification paperswithcode_id: null pretty_name: arXiv Dataset diff --git a/datasets/asset/README.md b/datasets/asset/README.md index b70d66cc253..24f697d338d 100644 --- a/datasets/asset/README.md +++ b/datasets/asset/README.md @@ -16,12 +16,12 @@ source_datasets: - extended|other-turkcorpus task_categories: ratings: - - text-scoring + - text-classification simplification: - - conditional-text-generation + - text2text-generation task_ids: ratings: - - text-scoring-other-simplification-evaluation + - text-classification-other-simplification-evaluation simplification: - text-simplification paperswithcode_id: asset @@ -67,7 +67,7 @@ splitting in [HSplit](https://www.aclweb.org/anthology/D18-1081.pdf)), the simpl ### Supported Tasks and Leaderboards -The dataset supports the evaluation of `test-simplification` systems. Success in this tasks is typically measured using the [SARI](https://huggingface.co/metrics/sari) and [FKBLEU](https://huggingface.co/metrics/fkbleu) metrics described in the paper [Optimizing Statistical Machine Translation for Text Simplification](https://www.aclweb.org/anthology/Q16-1029.pdf). +The dataset supports the evaluation of `text-simplification` systems. Success in this tasks is typically measured using the [SARI](https://huggingface.co/metrics/sari) and [FKBLEU](https://huggingface.co/metrics/fkbleu) metrics described in the paper [Optimizing Statistical Machine Translation for Text Simplification](https://www.aclweb.org/anthology/Q16-1029.pdf). 
### Languages diff --git a/datasets/assin/README.md b/datasets/assin/README.md index c15f2de834b..3717b499871 100644 --- a/datasets/assin/README.md +++ b/datasets/assin/README.md @@ -16,8 +16,8 @@ source_datasets: - original task_categories: - text-classification -- text-scoring task_ids: +- text-scoring - natural-language-inference - semantic-similarity-scoring paperswithcode_id: assin diff --git a/datasets/assin2/README.md b/datasets/assin2/README.md index 6a4cbdc4c52..9bf5e2c309a 100644 --- a/datasets/assin2/README.md +++ b/datasets/assin2/README.md @@ -15,8 +15,8 @@ source_datasets: - original task_categories: - text-classification -- text-scoring task_ids: +- text-scoring - natural-language-inference - semantic-similarity-scoring paperswithcode_id: assin2 diff --git a/datasets/atomic/README.md b/datasets/atomic/README.md index 410502f1220..903354a606f 100755 --- a/datasets/atomic/README.md +++ b/datasets/atomic/README.md @@ -15,9 +15,9 @@ size_categories: source_datasets: - original task_categories: -- conditional-text-generation +- text2text-generation task_ids: -- other-structured-to-text +- text2text-generation-other-common-sense-if-then-reasoning paperswithcode_id: atomic --- diff --git a/datasets/autshumato/README.md b/datasets/autshumato/README.md index 46ecb299df9..3afffcd1342 100644 --- a/datasets/autshumato/README.md +++ b/datasets/autshumato/README.md @@ -40,9 +40,8 @@ size_categories: source_datasets: - original task_categories: -- conditional-text-generation -task_ids: -- machine-translation +- translation +task_ids: [] paperswithcode_id: null pretty_name: autshumato --- diff --git a/datasets/bbaw_egyptian/README.md b/datasets/bbaw_egyptian/README.md index a34c9e2610e..93f6f6b4b76 100644 --- a/datasets/bbaw_egyptian/README.md +++ b/datasets/bbaw_egyptian/README.md @@ -16,9 +16,8 @@ size_categories: source_datasets: - extended|wikipedia task_categories: -- conditional-text-generation -task_ids: -- machine-translation +- translation +task_ids: [] 
paperswithcode_id: null pretty_name: BbawEgyptian --- diff --git a/datasets/bc2gm_corpus/README.md b/datasets/bc2gm_corpus/README.md index d8b3d9670fa..64da0edcea9 100644 --- a/datasets/bc2gm_corpus/README.md +++ b/datasets/bc2gm_corpus/README.md @@ -14,7 +14,7 @@ size_categories: source_datasets: - original task_categories: -- structure-prediction +- token-classification task_ids: - named-entity-recognition paperswithcode_id: null diff --git a/datasets/best2009/README.md b/datasets/best2009/README.md index 026633232b6..5bf4362e31d 100644 --- a/datasets/best2009/README.md +++ b/datasets/best2009/README.md @@ -14,9 +14,9 @@ size_categories: source_datasets: - original task_categories: -- structure-prediction +- token-classification task_ids: -- structure-prediction-other-word-tokenization +- token-classification-other-word-tokenization paperswithcode_id: null pretty_name: best2009 --- diff --git a/datasets/bianet/README.md b/datasets/bianet/README.md index 3e8169f9284..85061259a50 100644 --- a/datasets/bianet/README.md +++ b/datasets/bianet/README.md @@ -27,9 +27,8 @@ size_categories: source_datasets: - original task_categories: -- conditional-text-generation -task_ids: -- machine-translation +- translation +task_ids: [] paperswithcode_id: bianet pretty_name: Bianet --- diff --git a/datasets/bible_para/README.md b/datasets/bible_para/README.md index 67755ef417a..8fa40fdf31f 100644 --- a/datasets/bible_para/README.md +++ b/datasets/bible_para/README.md @@ -115,9 +115,8 @@ size_categories: source_datasets: - original task_categories: -- conditional-text-generation -task_ids: -- machine-translation +- translation +task_ids: [] paperswithcode_id: null pretty_name: BiblePara --- diff --git a/datasets/big_patent/README.md b/datasets/big_patent/README.md index 9ca05553a89..cdee0f5f7a7 100644 --- a/datasets/big_patent/README.md +++ b/datasets/big_patent/README.md @@ -33,9 +33,9 @@ size_categories: source_datasets: - original task_categories: -- conditional-text-generation 
-task_ids: - summarization +task_ids: +- summarization-other-patent-summarization paperswithcode_id: bigpatent pretty_name: Big Patent --- diff --git a/datasets/billsum/README.md b/datasets/billsum/README.md index 694d095cac7..cdca4737043 100644 --- a/datasets/billsum/README.md +++ b/datasets/billsum/README.md @@ -14,9 +14,9 @@ size_categories: source_datasets: - original task_categories: -- conditional-text-generation -task_ids: - summarization +task_ids: +- summarization-other-bills-summarization paperswithcode_id: billsum pretty_name: BillSum --- diff --git a/datasets/biosses/README.md b/datasets/biosses/README.md index 5b66590225a..6e6b75ab5ab 100644 --- a/datasets/biosses/README.md +++ b/datasets/biosses/README.md @@ -14,8 +14,9 @@ size_categories: source_datasets: - original task_categories: -- text-scoring +- text-classification task_ids: +- text-scoring - semantic-similarity-scoring paperswithcode_id: biosses pretty_name: BIOSSES diff --git a/datasets/blbooks/README.md b/datasets/blbooks/README.md index 2542ce04f9a..1969d2604da 100644 --- a/datasets/blbooks/README.md +++ b/datasets/blbooks/README.md @@ -20,10 +20,12 @@ size_categories: source_datasets: - original task_categories: -- sequence-modeling +- text-generation +- fill-mask - other task_ids: - language-modeling +- masked-language-modeling - other-other-digital-humanities-research --- diff --git a/datasets/blbooksgenre/README.md b/datasets/blbooksgenre/README.md index 6a3a6aee0e0..53d20bf6be8 100644 --- a/datasets/blbooksgenre/README.md +++ b/datasets/blbooksgenre/README.md @@ -25,11 +25,13 @@ source_datasets: - original task_categories: - text-classification -- sequence-modeling +- text-generation +- fill-mask task_ids: - topic-classification - multi-label-classification - language-modeling +- masked-language-modeling --- # Dataset Card for blbooksgenre diff --git a/datasets/bnl_newspapers/README.md b/datasets/bnl_newspapers/README.md index 4b6a7f1ae9e..9cbfac740ff 100644 --- 
a/datasets/bnl_newspapers/README.md +++ b/datasets/bnl_newspapers/README.md @@ -22,9 +22,11 @@ size_categories: source_datasets: - original task_categories: -- sequence-modeling +- text-generation +- fill-mask task_ids: - language-modeling +- masked-language-modeling --- # Dataset Card for BnL Historical Newspapers diff --git a/datasets/bookcorpus/README.md b/datasets/bookcorpus/README.md index bc6badeb50b..8e5ca1dff9b 100644 --- a/datasets/bookcorpus/README.md +++ b/datasets/bookcorpus/README.md @@ -15,9 +15,11 @@ size_categories: source_datasets: - original task_categories: -- sequence-modeling +- text-generation +- fill-mask task_ids: - language-modeling +- masked-language-modeling paperswithcode_id: bookcorpus --- diff --git a/datasets/bookcorpusopen/README.md b/datasets/bookcorpusopen/README.md index 2f4ffc95dbf..b06cb5f3c21 100644 --- a/datasets/bookcorpusopen/README.md +++ b/datasets/bookcorpusopen/README.md @@ -15,9 +15,11 @@ size_categories: source_datasets: - original task_categories: -- sequence-modeling +- text-generation +- fill-mask task_ids: - language-modeling +- masked-language-modeling paperswithcode_id: bookcorpus --- diff --git a/datasets/brwac/README.md b/datasets/brwac/README.md index 3669e44f0f3..3cc4bd102b0 100644 --- a/datasets/brwac/README.md +++ b/datasets/brwac/README.md @@ -14,9 +14,11 @@ size_categories: source_datasets: - original task_categories: -- sequence-modeling +- text-generation +- fill-mask task_ids: - language-modeling +- masked-language-modeling paperswithcode_id: brwac pretty_name: BrWaC --- diff --git a/datasets/bsd_ja_en/README.md b/datasets/bsd_ja_en/README.md index 1bc90e8f54c..87c1aa85b3a 100644 --- a/datasets/bsd_ja_en/README.md +++ b/datasets/bsd_ja_en/README.md @@ -15,9 +15,9 @@ size_categories: source_datasets: - original task_categories: -- conditional-text-generation +- translation task_ids: -- machine-translation +- translation-other-business-conversations-translation paperswithcode_id: business-scene-dialogue 
pretty_name: Business Scene Dialogue --- diff --git a/datasets/bswac/README.md b/datasets/bswac/README.md index c8a75e7c112..50799d45658 100644 --- a/datasets/bswac/README.md +++ b/datasets/bswac/README.md @@ -14,9 +14,11 @@ size_categories: source_datasets: - original task_categories: -- sequence-modeling +- text-generation +- fill-mask task_ids: - language-modeling +- masked-language-modeling paperswithcode_id: null pretty_name: BsWac --- diff --git a/datasets/c4/README.md b/datasets/c4/README.md index 9b8455b15f0..d41d3a25613 100644 --- a/datasets/c4/README.md +++ b/datasets/c4/README.md @@ -15,9 +15,11 @@ size_categories: source_datasets: - original task_categories: -- sequence-modeling +- text-generation +- fill-mask task_ids: - language-modeling +- masked-language-modeling paperswithcode_id: c4 --- diff --git a/datasets/caner/README.md b/datasets/caner/README.md index 99de3de9c52..7c0781659da 100644 --- a/datasets/caner/README.md +++ b/datasets/caner/README.md @@ -13,7 +13,7 @@ size_categories: source_datasets: - original task_categories: -- structure-prediction +- token-classification task_ids: - named-entity-recognition paperswithcode_id: null diff --git a/datasets/capes/README.md b/datasets/capes/README.md index 0678b06717a..edf195bc9e7 100644 --- a/datasets/capes/README.md +++ b/datasets/capes/README.md @@ -15,9 +15,10 @@ size_categories: source_datasets: - original task_categories: -- conditional-text-generation +- translation task_ids: -- machine-translation +- translation-other-theses-translation +- translation-other-dissertation-abstracts-translation paperswithcode_id: capes pretty_name: CAPES --- diff --git a/datasets/casino/README.md b/datasets/casino/README.md index 9ae616c3d15..7dea40d28ee 100644 --- a/datasets/casino/README.md +++ b/datasets/casino/README.md @@ -14,10 +14,10 @@ size_categories: source_datasets: - original task_categories: -- conditional-text-generation -- sequence-modeling +- conversational +- text-generation +- fill-mask
task_ids: -- conditional-text-generation-other-dialogue-generation - dialogue-modeling pretty_name: Campsite Negotiation Dialogues paperswithcode_id: casino diff --git a/datasets/cawac/README.md b/datasets/cawac/README.md index 0db683b591c..211f4170e2f 100644 --- a/datasets/cawac/README.md +++ b/datasets/cawac/README.md @@ -14,9 +14,11 @@ size_categories: source_datasets: - original task_categories: -- sequence-modeling +- text-generation +- fill-mask task_ids: - language-modeling +- masked-language-modeling paperswithcode_id: cawac pretty_name: caWaC --- diff --git a/datasets/cc100/README.md b/datasets/cc100/README.md index 5b1bf401dc9..ca5b71c40bc 100644 --- a/datasets/cc100/README.md +++ b/datasets/cc100/README.md @@ -132,9 +132,11 @@ size_categories: source_datasets: - original task_categories: -- sequence-modeling +- text-generation +- fill-mask task_ids: - language-modeling +- masked-language-modeling paperswithcode_id: cc100 pretty_name: CC100 --- diff --git a/datasets/cc_news/README.md b/datasets/cc_news/README.md index 8aab90f8018..e0fe8568744 100644 --- a/datasets/cc_news/README.md +++ b/datasets/cc_news/README.md @@ -15,9 +15,11 @@ size_categories: source_datasets: - original task_categories: -- sequence-modeling +- text-generation +- fill-mask task_ids: - language-modeling +- masked-language-modeling paperswithcode_id: cc-news --- diff --git a/datasets/chr_en/README.md b/datasets/chr_en/README.md index c92552bf5e0..9f1c03fb20d 100644 --- a/datasets/chr_en/README.md +++ b/datasets/chr_en/README.md @@ -46,22 +46,21 @@ source_datasets: - original task_categories: monolingual: - - conditional-text-generation + - translation monolingual_raw: - - sequence-modeling + - text-generation + - fill-mask parallel: - - conditional-text-generation + - translation parallel_raw: - - conditional-text-generation + - translation task_ids: - monolingual: - - machine-translation + monolingual: [] monolingual_raw: - language-modeling - parallel: - - machine-translation - 
parallel_raw: - - machine-translation - - masked-language-modeling + parallel: [] + parallel_raw: [] paperswithcode_id: chren --- diff --git a/datasets/climate_fever/README.md b/datasets/climate_fever/README.md index 4050e0bd20b..eb22f4e2177 100644 --- a/datasets/climate_fever/README.md +++ b/datasets/climate_fever/README.md @@ -18,8 +18,8 @@ source_datasets: task_categories: - text-classification - text-retrieval -- text-scoring task_ids: +- text-scoring - fact-checking - fact-checking-retrieval - semantic-similarity-scoring diff --git a/datasets/cmu_hinglish_dog/README.md b/datasets/cmu_hinglish_dog/README.md index 027619a9b52..607477ccd8e 100644 --- a/datasets/cmu_hinglish_dog/README.md +++ b/datasets/cmu_hinglish_dog/README.md @@ -18,9 +18,8 @@ size_categories: source_datasets: - original task_categories: -- conditional-text-generation -task_ids: -- machine-translation +- translation +task_ids: [] --- # Dataset Card for CMU Document Grounded Conversations diff --git a/datasets/cnn_dailymail/README.md b/datasets/cnn_dailymail/README.md index fa16607cd43..41224565d5c 100644 --- a/datasets/cnn_dailymail/README.md +++ b/datasets/cnn_dailymail/README.md @@ -14,9 +14,9 @@ size_categories: source_datasets: - original task_categories: -- conditional-text-generation -task_ids: - summarization +task_ids: +- news-articles-summarization paperswithcode_id: cnn-daily-mail-1 pretty_name: CNN / Daily Mail --- diff --git a/datasets/coached_conv_pref/README.md b/datasets/coached_conv_pref/README.md index b7d7ac2904a..e0a380103a5 100644 --- a/datasets/coached_conv_pref/README.md +++ b/datasets/coached_conv_pref/README.md @@ -15,8 +15,9 @@ source_datasets: - original task_categories: - other -- sequence-modeling -- structure-prediction +- text-generation +- fill-mask +- token-classification task_ids: - other-other-Conversational Recommendation - dialogue-modeling diff --git a/datasets/code_search_net/README.md b/datasets/code_search_net/README.md index
dfefe894417..fb475993ca3 100644 --- a/datasets/code_search_net/README.md +++ b/datasets/code_search_net/README.md @@ -27,9 +27,11 @@ size_categories: source_datasets: - original task_categories: -- sequence-modeling +- text-generation +- fill-mask task_ids: - language-modeling +- masked-language-modeling paperswithcode_id: codesearchnet pretty_name: CodeSearchNet --- diff --git a/datasets/code_x_glue_cc_cloze_testing_all/README.md b/datasets/code_x_glue_cc_cloze_testing_all/README.md index 69cb15a9ff5..cccd954b7e6 100644 --- a/datasets/code_x_glue_cc_cloze_testing_all/README.md +++ b/datasets/code_x_glue_cc_cloze_testing_all/README.md @@ -25,7 +25,8 @@ size_categories: source_datasets: - original task_categories: -- sequence-modeling +- text-generation +- fill-mask task_ids: - slot-filling pretty_name: CodeXGlueCcClozeTestingAll diff --git a/datasets/code_x_glue_cc_cloze_testing_maxmin/README.md b/datasets/code_x_glue_cc_cloze_testing_maxmin/README.md index 6297269934a..9c3e8031df5 100644 --- a/datasets/code_x_glue_cc_cloze_testing_maxmin/README.md +++ b/datasets/code_x_glue_cc_cloze_testing_maxmin/README.md @@ -25,7 +25,8 @@ size_categories: source_datasets: - original task_categories: -- sequence-modeling +- text-generation +- fill-mask task_ids: - slot-filling pretty_name: CodeXGlueCcClozeTestingMaxmin diff --git a/datasets/code_x_glue_cc_code_completion_line/README.md b/datasets/code_x_glue_cc_code_completion_line/README.md index e455c7bd219..0e3232b665c 100644 --- a/datasets/code_x_glue_cc_code_completion_line/README.md +++ b/datasets/code_x_glue_cc_code_completion_line/README.md @@ -25,7 +25,8 @@ size_categories: source_datasets: - original task_categories: -- sequence-modeling +- text-generation +- fill-mask task_ids: - slot-filling pretty_name: CodeXGlueCcCodeCompletionLine diff --git a/datasets/code_x_glue_cc_code_completion_token/README.md b/datasets/code_x_glue_cc_code_completion_token/README.md index b35fd706ffb..80eaecb5c78 100644 --- 
a/datasets/code_x_glue_cc_code_completion_token/README.md +++ b/datasets/code_x_glue_cc_code_completion_token/README.md @@ -14,9 +14,11 @@ size_categories: source_datasets: - original task_categories: -- sequence-modeling +- text-generation +- fill-mask task_ids: - language-modeling +- masked-language-modeling pretty_name: CodeXGlueCcCodeCompletionToken --- # Dataset Card for "code_x_glue_cc_code_completion_token" diff --git a/datasets/code_x_glue_cc_code_refinement/README.md b/datasets/code_x_glue_cc_code_refinement/README.md index 2a021cba785..cd8c8999a2f 100644 --- a/datasets/code_x_glue_cc_code_refinement/README.md +++ b/datasets/code_x_glue_cc_code_refinement/README.md @@ -14,9 +14,9 @@ size_categories: source_datasets: - original task_categories: -- conditional-text-generation +- text2text-generation task_ids: -- conditional-text-generation-other-debugging +- text2text-generation-other-debugging pretty_name: CodeXGlueCcCodeRefinement --- @@ -58,7 +58,7 @@ We use the dataset released by this paper(https://arxiv.org/pdf/1812.08693.pdf). ### Supported Tasks and Leaderboards -- `conditional-text-generation-other-debugging`: The dataset can be used to train a model for automatically fixing buggy code. +- `text2text-generation-other-debugging`: The dataset can be used to train a model for automatically fixing buggy code. 
### Languages diff --git a/datasets/code_x_glue_cc_code_to_code_trans/README.md b/datasets/code_x_glue_cc_code_to_code_trans/README.md index c403b360a42..f4ae60a8c93 100644 --- a/datasets/code_x_glue_cc_code_to_code_trans/README.md +++ b/datasets/code_x_glue_cc_code_to_code_trans/README.md @@ -14,9 +14,9 @@ size_categories: source_datasets: - original task_categories: -- conditional-text-generation +- translation task_ids: -- machine-translation +- translation-other-code-to-code pretty_name: CodeXGlueCcCodeToCodeTrans --- # Dataset Card for "code_x_glue_cc_code_to_code_trans" diff --git a/datasets/code_x_glue_ct_code_to_text/README.md b/datasets/code_x_glue_ct_code_to_text/README.md index aa457452139..3865687cedf 100644 --- a/datasets/code_x_glue_ct_code_to_text/README.md +++ b/datasets/code_x_glue_ct_code_to_text/README.md @@ -26,9 +26,9 @@ size_categories: source_datasets: - original task_categories: -- conditional-text-generation +- translation task_ids: -- machine-translation +- translation-other-code-to-text pretty_name: CodeXGlueCtCodeToText --- # Dataset Card for "code_x_glue_ct_code_to_text" diff --git a/datasets/code_x_glue_tc_text_to_code/README.md b/datasets/code_x_glue_tc_text_to_code/README.md index e10824df6b8..51070edac0b 100644 --- a/datasets/code_x_glue_tc_text_to_code/README.md +++ b/datasets/code_x_glue_tc_text_to_code/README.md @@ -15,9 +15,9 @@ size_categories: source_datasets: - original task_categories: -- conditional-text-generation +- translation task_ids: -- machine-translation +- translation-other-text-to-code pretty_name: CodeXGlueTcTextToCode --- # Dataset Card for "code_x_glue_tc_text_to_code" diff --git a/datasets/code_x_glue_tt_text_to_text/README.md b/datasets/code_x_glue_tt_text_to_text/README.md index 48f90fda204..0761a65b925 100644 --- a/datasets/code_x_glue_tt_text_to_text/README.md +++ b/datasets/code_x_glue_tt_text_to_text/README.md @@ -18,9 +18,9 @@ size_categories: source_datasets: - original task_categories: -- 
conditional-text-generation +- translation task_ids: -- machine-translation +- translation-other-code-documentation-translation pretty_name: CodeXGlueTtTextToText --- # Dataset Card for "code_x_glue_tt_text_to_text" diff --git a/datasets/common_gen/README.md b/datasets/common_gen/README.md index 75b73501f1b..b819df86681 100644 --- a/datasets/common_gen/README.md +++ b/datasets/common_gen/README.md @@ -3,6 +3,10 @@ languages: - en paperswithcode_id: commongen pretty_name: CommonGen +task_ids: +- text2text-generation-other-concepts-to-text +task_categories: +- text2text-generation --- # Dataset Card for "common_gen" diff --git a/datasets/common_language/README.md b/datasets/common_language/README.md index a2804ca3851..522b881ffcc 100644 --- a/datasets/common_language/README.md +++ b/datasets/common_language/README.md @@ -59,9 +59,9 @@ size_categories: source_datasets: - extended|common_voice task_categories: -- speech-processing +- audio-classification task_ids: -- other-other-speech-classification +- speaker-language-identification --- # Dataset Card for common_language diff --git a/datasets/common_voice/README.md b/datasets/common_voice/README.md index bdd46378910..230b1fdd465 100644 --- a/datasets/common_voice/README.md +++ b/datasets/common_voice/README.md @@ -193,9 +193,8 @@ size_categories: source_datasets: - extended|common_voice task_categories: -- speech-processing -task_ids: - automatic-speech-recognition +task_ids: [] paperswithcode_id: common-voice --- diff --git a/datasets/competition_math/README.md b/datasets/competition_math/README.md index 1de4e51f69e..09a8a567fba 100644 --- a/datasets/competition_math/README.md +++ b/datasets/competition_math/README.md @@ -15,9 +15,9 @@ size_categories: source_datasets: - original task_categories: -- conditional-text-generation +- text2text-generation task_ids: -- explanation-generation +- text2text-generation-other-explanation-generation --- # Dataset Card for Mathematics Aptitude Test of Heuristics (MATH) dataset
diff --git a/datasets/conll2002/README.md b/datasets/conll2002/README.md index c84899b5040..3113faefa8d 100644 --- a/datasets/conll2002/README.md +++ b/datasets/conll2002/README.md @@ -17,7 +17,7 @@ size_categories: source_datasets: - original task_categories: -- structure-prediction +- token-classification task_ids: - named-entity-recognition - part-of-speech-tagging diff --git a/datasets/conll2003/README.md b/datasets/conll2003/README.md index 3b466be3f34..97b43530119 100644 --- a/datasets/conll2003/README.md +++ b/datasets/conll2003/README.md @@ -14,7 +14,7 @@ size_categories: source_datasets: - extended|other-reuters-corpus task_categories: -- structure-prediction +- token-classification task_ids: - named-entity-recognition - part-of-speech-tagging diff --git a/datasets/conll2012_ontonotesv5/README.md b/datasets/conll2012_ontonotesv5/README.md index bd776dc04c4..9851f7e347e 100644 --- a/datasets/conll2012_ontonotesv5/README.md +++ b/datasets/conll2012_ontonotesv5/README.md @@ -18,7 +18,7 @@ size_categories: source_datasets: - original task_categories: -- structure-prediction +- token-classification task_ids: - named-entity-recognition - part-of-speech-tagging diff --git a/datasets/conllpp/README.md b/datasets/conllpp/README.md index 5284f4bd37e..4d3307da20b 100644 --- a/datasets/conllpp/README.md +++ b/datasets/conllpp/README.md @@ -14,7 +14,7 @@ size_categories: source_datasets: - extended|conll2003 task_categories: -- structure-prediction +- token-classification task_ids: - named-entity-recognition paperswithcode_id: conll diff --git a/datasets/conv_ai/README.md b/datasets/conv_ai/README.md index 76a66aeb127..69825916cc7 100644 --- a/datasets/conv_ai/README.md +++ b/datasets/conv_ai/README.md @@ -14,10 +14,11 @@ size_categories: source_datasets: - original task_categories: -- conditional-text-generation -- text-scoring +- conversational +- text-classification task_ids: -- text-scoring-other-evaluating-dialogue-systems +- text-scoring +- 
text-classification-other-evaluating-dialogue-systems paperswithcode_id: null pretty_name: ConvAi --- diff --git a/datasets/conv_ai_2/README.md b/datasets/conv_ai_2/README.md index 19304823e8f..d6f779a2369 100644 --- a/datasets/conv_ai_2/README.md +++ b/datasets/conv_ai_2/README.md @@ -14,9 +14,10 @@ size_categories: source_datasets: - original task_categories: -- conditional-text-generation -- text-scoring +- conversational +- text-classification task_ids: +- text-scoring - text-scoring-other-evaluating-dialogue-systems paperswithcode_id: convai2 pretty_name: Conversational Intelligence Challenge 2 diff --git a/datasets/conv_ai_3/README.md b/datasets/conv_ai_3/README.md index c2659226ec6..b1fbf425c27 100644 --- a/datasets/conv_ai_3/README.md +++ b/datasets/conv_ai_3/README.md @@ -14,9 +14,10 @@ size_categories: source_datasets: - original task_categories: -- conditional-text-generation -- text-scoring +- conversational +- text-classification task_ids: +- text-scoring - text-scoring-other-evaluating-dialogue-systems paperswithcode_id: null pretty_name: More Information Needed diff --git a/datasets/conv_questions/README.md b/datasets/conv_questions/README.md index 81d99d2b71f..690f244486d 100644 --- a/datasets/conv_questions/README.md +++ b/datasets/conv_questions/README.md @@ -15,7 +15,8 @@ source_datasets: - original task_categories: - question-answering -- sequence-modeling +- text-generation +- fill-mask task_ids: - open-domain-qa - dialogue-modeling diff --git a/datasets/counter/README.md b/datasets/counter/README.md index f1405bd7453..a7e18fe1c2e 100644 --- a/datasets/counter/README.md +++ b/datasets/counter/README.md @@ -15,8 +15,8 @@ source_datasets: - original task_categories: - text-classification -- text-scoring task_ids: +- text-scoring - semantic-similarity-scoring - topic-classification paperswithcode_id: counter diff --git a/datasets/covost2/README.md b/datasets/covost2/README.md index febbabb48d9..6b077d2f73f 100644 --- a/datasets/covost2/README.md 
+++ b/datasets/covost2/README.md @@ -35,9 +35,8 @@ size_categories: source_datasets: - extended|other-common-voice task_categories: -- speech-processing -task_ids: - automatic-speech-recognition +task_ids: [] paperswithcode_id: null pretty_name: CoVoST 2 --- diff --git a/datasets/craigslist_bargains/README.md b/datasets/craigslist_bargains/README.md index 92c4f93cc31..c28852d6f07 100644 --- a/datasets/craigslist_bargains/README.md +++ b/datasets/craigslist_bargains/README.md @@ -14,7 +14,8 @@ size_categories: source_datasets: - original task_categories: -- sequence-modeling +- text-generation +- fill-mask task_ids: - dialogue-modeling paperswithcode_id: craigslistbargains diff --git a/datasets/crd3/README.md b/datasets/crd3/README.md index b5eced9e7a1..6394e94eb8b 100644 --- a/datasets/crd3/README.md +++ b/datasets/crd3/README.md @@ -13,10 +13,10 @@ multilinguality: source_datasets: - original task_categories: -- conditional-text-generation -- sequence-modeling -task_ids: - summarization +- text-generation +- fill-mask +task_ids: - dialogue-modeling size_categories: - 10K Date: Wed, 30 Mar 2022 17:53:31 +0200 Subject: [PATCH 03/10] more cards updates --- datasets/dart/README.md | 2 +- datasets/enriched_web_nlg/README.md | 2 +- datasets/gem/README.md | 6 +++--- datasets/stsb_mt_sv/README.md | 3 ++- datasets/web_nlg/README.md | 2 +- 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/datasets/dart/README.md b/datasets/dart/README.md index a62f65f6595..61a27de7522 100644 --- a/datasets/dart/README.md +++ b/datasets/dart/README.md @@ -19,7 +19,7 @@ source_datasets: - extended|web_nlg - extended|cleaned_e2e task_categories: -- structured-to-text +- tabular-to-text task_ids: - rdf-to-text paperswithcode_id: dart diff --git a/datasets/enriched_web_nlg/README.md b/datasets/enriched_web_nlg/README.md index 325426cded6..23ed1ea2bbb 100644 --- a/datasets/enriched_web_nlg/README.md +++ b/datasets/enriched_web_nlg/README.md @@ -17,7 +17,7 @@ size_categories: 
source_datasets: - extended|other-web-nlg task_categories: -- structured-to-text +- tabular-to-text task_ids: - rdf-to-text paperswithcode_id: null diff --git a/datasets/gem/README.md b/datasets/gem/README.md index c8a390f01b4..a4eea71128c 100644 --- a/datasets/gem/README.md +++ b/datasets/gem/README.md @@ -219,7 +219,7 @@ task_categories: cs_restaurants: - text2text-generation dart: - - structured-to-text + - tabular-to-text e2e_nlg: - text2text-generation mlsum_de: @@ -232,9 +232,9 @@ task_categories: totto: - text2text-generation web_nlg_en: - - structured-to-text + - tabular-to-text web_nlg_ru: - - structured-to-text + - tabular-to-text wiki_auto_asset_turk: - text2text-generation wiki_lingua_es_en: diff --git a/datasets/stsb_mt_sv/README.md b/datasets/stsb_mt_sv/README.md index c6af47e0488..5f169581c9b 100644 --- a/datasets/stsb_mt_sv/README.md +++ b/datasets/stsb_mt_sv/README.md @@ -15,8 +15,9 @@ size_categories: source_datasets: - extended|other-sts-b task_categories: -- text-scoring +- text-classification task_ids: +- text-scoring - semantic-similarity-scoring paperswithcode_id: null pretty_name: Swedish Machine Translated STS-B diff --git a/datasets/web_nlg/README.md b/datasets/web_nlg/README.md index ad5ad0828d9..23095876a52 100644 --- a/datasets/web_nlg/README.md +++ b/datasets/web_nlg/README.md @@ -32,7 +32,7 @@ source_datasets: - extended|other-db_pedia - original task_categories: -- structured-to-text +- tabular-to-text task_ids: - rdf-to-text paperswithcode_id: webnlg From 5ef9b7cfaeecb920d7a2de4b38248835c116ff22 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 30 Mar 2022 17:53:49 +0200 Subject: [PATCH 04/10] update dataset tags parser --- src/datasets/utils/metadata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py index 090b59c25a6..a0ed9469ea4 100644 --- a/src/datasets/utils/metadata.py +++ b/src/datasets/utils/metadata.py @@ -321,9 +321,9 @@ def 
validate_task_categories(task_categories: Union[List[str], Dict[str, List[st def validate_task_ids(task_ids: Union[List[str], Dict[str, List[str]]]) -> ValidatorOutput: # TODO: we're currently ignoring all values starting with 'other' as our task taxonomy is bound to change # in the near future and we don't want to waste energy in tagging against a moving taxonomy. - known_set = [tid for _cat, d in known_task_ids.items() for tid in d["options"]] + known_set = [tid for _cat, d in known_task_ids.items() for tid in d.get("subtasks", [])] validated, error = tagset_validator( - task_ids, known_set, "task_ids", known_task_ids_url, lambda e: "-other-" in e or e.startswith("other-") + task_ids, known_set, "task_ids", known_task_ids_url, lambda e: not e or "-other-" in e or e.startswith("other-") ) return validated, error From 526dba9dce6f983ebcacd65bef35a868a5735df6 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 30 Mar 2022 17:53:57 +0200 Subject: [PATCH 05/10] fix multi-choice-qa --- src/datasets/utils/resources/tasks.json | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/datasets/utils/resources/tasks.json b/src/datasets/utils/resources/tasks.json index 540e6ffbccd..4d5658d7645 100644 --- a/src/datasets/utils/resources/tasks.json +++ b/src/datasets/utils/resources/tasks.json @@ -55,7 +55,7 @@ "multiple-choice": { "type": "text", "subtasks": [ - "multiple-choice-question-answering", + "multiple-choice-qa", "multiple-choice-coreference-resolution" ] }, @@ -201,5 +201,8 @@ }, "reinforcement-learning": { "type": "other" + }, + "other": { + "type": "other" } } \ No newline at end of file From a6fb7e9485a681d37965ea17cb557257c9e0af2e Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Fri, 8 Apr 2022 17:45:52 +0200 Subject: [PATCH 06/10] style --- src/datasets/utils/metadata.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py index a0ed9469ea4..23f41b93e02 
100644 --- a/src/datasets/utils/metadata.py +++ b/src/datasets/utils/metadata.py @@ -323,7 +323,11 @@ def validate_task_ids(task_ids: Union[List[str], Dict[str, List[str]]]) -> Valid # in the near future and we don't want to waste energy in tagging against a moving taxonomy. known_set = [tid for _cat, d in known_task_ids.items() for tid in d.get("subtasks", [])] validated, error = tagset_validator( - task_ids, known_set, "task_ids", known_task_ids_url, lambda e: not e or "-other-" in e or e.startswith("other-") + task_ids, + known_set, + "task_ids", + known_task_ids_url, + lambda e: not e or "-other-" in e or e.startswith("other-"), ) return validated, error From 54e9964e5a1f72459cabc2eab21e07c4c273dc9f Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Fri, 8 Apr 2022 19:50:54 +0200 Subject: [PATCH 07/10] small improvements in some dataset cards --- datasets/id_liputan6/README.md | 4 +++- datasets/librispeech_asr/README.md | 6 ++++-- datasets/multilingual_librispeech/README.md | 6 ++++-- datasets/wmt20_mlqe_task2/README.md | 7 +++++-- 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/datasets/id_liputan6/README.md b/datasets/id_liputan6/README.md index 4e78415ee70..5cea1c06a44 100644 --- a/datasets/id_liputan6/README.md +++ b/datasets/id_liputan6/README.md @@ -15,7 +15,9 @@ source_datasets: - original task_categories: - summarization -task_ids: [] +task_ids: +- summarization-other-extractive-summarization +- news-articles-summarization paperswithcode_id: null pretty_name: Large-scale Indonesian Summarization --- diff --git a/datasets/librispeech_asr/README.md b/datasets/librispeech_asr/README.md index 329ecfb246b..8806cab4258 100644 --- a/datasets/librispeech_asr/README.md +++ b/datasets/librispeech_asr/README.md @@ -18,7 +18,9 @@ source_datasets: - original task_categories: - automatic-speech-recognition -task_ids: [] +- audio-classification +task_ids: +- audio-speaker-identification --- # Dataset Card for librispeech_asr @@ -61,7 +63,7 @@ 
LibriSpeech is a corpus of approximately 1000 hours of 16kHz read English speech ### Supported Tasks and Leaderboards -- `automatic-speech-recognition`, `speaker-identification`: The dataset can be used to train a model for Automatic Speech Recognition (ASR). The model is presented with an audio file and asked to transcribe the audio file to written text. The most common evaluation metric is the word error rate (WER). The task has an active leaderboard which can be found at https://paperswithcode.com/sota/speech-recognition-on-librispeech-test-clean and ranks models based on their WER. +- `automatic-speech-recognition`, `audio-speaker-identification`: The dataset can be used to train a model for Automatic Speech Recognition (ASR). The model is presented with an audio file and asked to transcribe the audio file to written text. The most common evaluation metric is the word error rate (WER). The task has an active leaderboard which can be found at https://paperswithcode.com/sota/speech-recognition-on-librispeech-test-clean and ranks models based on their WER. ### Languages diff --git a/datasets/multilingual_librispeech/README.md b/datasets/multilingual_librispeech/README.md index 5d390f3f25f..87c9e25898c 100644 --- a/datasets/multilingual_librispeech/README.md +++ b/datasets/multilingual_librispeech/README.md @@ -24,7 +24,9 @@ source_datasets: - original task_categories: - automatic-speech-recognition -task_ids: [] +- audio-classification +task_ids: +- audio-speaker-identification --- # Dataset Card for MultiLingual LibriSpeech @@ -66,7 +68,7 @@ Multilingual LibriSpeech (MLS) dataset is a large multilingual corpus suitable f ### Supported Tasks and Leaderboards -- `automatic-speech-recognition`, `speaker-identification`: The dataset can be used to train a model for Automatic Speech Recognition (ASR). The model is presented with an audio file and asked to transcribe the audio file to written text. The most common evaluation metric is the word error rate (WER). 
The task has an active leaderboard which can be found at https://paperswithcode.com/dataset/multilingual-librispeech and ranks models based on their WER. +- `automatic-speech-recognition`, `audio-speaker-identification`: The dataset can be used to train a model for Automatic Speech Recognition (ASR). The model is presented with an audio file and asked to transcribe the audio file to written text. The most common evaluation metric is the word error rate (WER). The task has an active leaderboard which can be found at https://paperswithcode.com/dataset/multilingual-librispeech and ranks models based on their WER. ### Languages diff --git a/datasets/wmt20_mlqe_task2/README.md b/datasets/wmt20_mlqe_task2/README.md index c1890965bb0..6ab246d4b64 100644 --- a/datasets/wmt20_mlqe_task2/README.md +++ b/datasets/wmt20_mlqe_task2/README.md @@ -22,13 +22,16 @@ source_datasets: - extended|wikipedia task_categories: - translation -task_ids: [] +- text-classification +task_ids: +- text-classification-other-translation-quality-estimation paperswithcode_id: null --- -# Dataset Card Creation Guide +# Dataset Card for WMT20 - MultiLingual Quality Estimation (MLQE) Task2 ## Table of Contents + - [Dataset Description](#dataset-description) - [Dataset Summary](#dataset-summary) - [Supported Tasks and Leaderboards](#supported-tasks-and-leaderboards) From fe3e79e550e4b1057d249da3fc59a5f34835d772 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Fri, 8 Apr 2022 19:51:05 +0200 Subject: [PATCH 08/10] allow certain tag fields to be empty --- src/datasets/utils/metadata.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py index 23f41b93e02..2b75cc7be7a 100644 --- a/src/datasets/utils/metadata.py +++ b/src/datasets/utils/metadata.py @@ -146,7 +146,14 @@ def validate_type(value: Any, expected_type: Type): error_string += "\nOR\n" + "(" + temp_error_string + ")" else: - # Assuming 
`List`/`Dict`/`Tuple` + # Assuming non empty `List`/`Dict`/`Tuple` + if expected_type == EmptyList: + if len(value) == 0: + return "" + else: + return f"Expected `{expected_type_origin}` of length 0. Found value of type: `{type(value)}`, with length: {len(value)}.\n" + + # Assuming non empty if not isinstance(value, expected_type_origin) or len(value) == 0: return f"Expected `{expected_type_origin}` with length > 0. Found value of type: `{type(value)}`, with length: {len(value)}.\n" @@ -185,18 +192,25 @@ def validate_metadata_type(metadata_dict: dict): raise TypeError(f"The following typing errors are found: {typing_errors}") +class _nothing: + pass + + +EmptyList = List[_nothing] + + @dataclass class DatasetMetadata: annotations_creators: Union[List[str], Dict[str, List[str]]] - language_creators: Union[List[str], Dict[str, List[str]]] - languages: Union[List[str], Dict[str, List[str]]] + language_creators: Union[EmptyList, List[str], Dict[str, List[str]]] + languages: Union[EmptyList, List[str], Dict[str, List[str]]] licenses: Union[List[str], Dict[str, List[str]]] multilinguality: Union[List[str], Dict[str, List[str]]] pretty_name: Union[str, Dict[str, str]] size_categories: Union[List[str], Dict[str, List[str]]] source_datasets: Union[List[str], Dict[str, List[str]]] task_categories: Union[List[str], Dict[str, List[str]]] - task_ids: Union[List[str], Dict[str, List[str]]] + task_ids: Union[EmptyList, List[str], Dict[str, List[str]]] paperswithcode_id: Optional[str] = None def validate(self): From b82199c466604da06182b3d1850f58b3ac6413ea Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Mon, 11 Apr 2022 17:19:27 +0200 Subject: [PATCH 09/10] update vision datasets tags --- datasets/beans/README.md | 2 +- datasets/cats_vs_dogs/README.md | 2 +- datasets/cifar10/README.md | 3 +-- datasets/cifar100/README.md | 5 ++--- datasets/fashion_mnist/README.md | 2 +- datasets/food101/README.md | 2 +- datasets/mnist/README.md | 2 +- datasets/red_caps/README.md | 2 -- 
datasets/svhn/README.md | 2 +- 9 files changed, 9 insertions(+), 13 deletions(-) diff --git a/datasets/beans/README.md b/datasets/beans/README.md index 1b37467e350..971629f70dc 100644 --- a/datasets/beans/README.md +++ b/datasets/beans/README.md @@ -17,7 +17,7 @@ source_datasets: task_categories: - image-classification task_ids: -- single-label-image-classification +- image-classification-other-leaves-classification --- # Dataset Card for Beans diff --git a/datasets/cats_vs_dogs/README.md b/datasets/cats_vs_dogs/README.md index 11b65fb82ec..8543e6ff741 100644 --- a/datasets/cats_vs_dogs/README.md +++ b/datasets/cats_vs_dogs/README.md @@ -17,7 +17,7 @@ source_datasets: task_categories: - image-classification task_ids: -- single-label-image-classification +- image-classification-other-animals-classification --- # Dataset Card for Cats Vs. Dogs diff --git a/datasets/cifar10/README.md b/datasets/cifar10/README.md index 7edba5e9240..ce97aaa0cba 100644 --- a/datasets/cifar10/README.md +++ b/datasets/cifar10/README.md @@ -16,8 +16,7 @@ source_datasets: - extended|other-80-Million-Tiny-Images task_categories: - image-classification -task_ids: -- single-label-image-classification +task_ids: [] paperswithcode_id: cifar-10 --- diff --git a/datasets/cifar100/README.md b/datasets/cifar100/README.md index bba8b31292e..3ab94875a23 100644 --- a/datasets/cifar100/README.md +++ b/datasets/cifar100/README.md @@ -16,12 +16,11 @@ source_datasets: - extended|other-80-Million-Tiny-Images task_categories: - image-classification -task_ids: -- single-label-image-classification +task_ids: [] paperswithcode_id: cifar-100 --- -# Dataset Card for CIFAR-10 +# Dataset Card for CIFAR-100 ## Table of Contents - [Dataset Description](#dataset-description) diff --git a/datasets/fashion_mnist/README.md b/datasets/fashion_mnist/README.md index 34c84628057..75799471ecd 100644 --- a/datasets/fashion_mnist/README.md +++ b/datasets/fashion_mnist/README.md @@ -16,7 +16,7 @@ source_datasets: task_categories: 
- image-classification task_ids: -- single-label-image-classification +- image-classification-other-clothing-classification paperswithcode_id: fashion-mnist pretty_name: FashionMNIST --- diff --git a/datasets/food101/README.md b/datasets/food101/README.md index 6b45228ce9d..2d6fb763b83 100644 --- a/datasets/food101/README.md +++ b/datasets/food101/README.md @@ -17,7 +17,7 @@ source_datasets: task_categories: - image-classification task_ids: -- single-label-image-classification +- image-classification-other-food-classification paperswithcode_id: food-101 --- diff --git a/datasets/mnist/README.md b/datasets/mnist/README.md index 997f57522c4..21051419736 100644 --- a/datasets/mnist/README.md +++ b/datasets/mnist/README.md @@ -16,7 +16,7 @@ source_datasets: task_categories: - image-classification task_ids: -- single-label-image-classification +- image-classification-other-digits-classification paperswithcode_id: mnist pretty_name: MNIST --- diff --git a/datasets/red_caps/README.md b/datasets/red_caps/README.md index a7b021eb567..9624d208e3d 100644 --- a/datasets/red_caps/README.md +++ b/datasets/red_caps/README.md @@ -15,10 +15,8 @@ source_datasets: - original task_categories: - image-to-text -- image-classification task_ids: - image-captioning -- single-label-image-classification paperswithcode_id: redcaps pretty_name: RedCaps --- diff --git a/datasets/svhn/README.md b/datasets/svhn/README.md index 2bfed37544a..4826a0f9450 100644 --- a/datasets/svhn/README.md +++ b/datasets/svhn/README.md @@ -18,7 +18,7 @@ task_categories: - image-classification - object-detection task_ids: -- single-label-image-classification +- object-detection-other-digit-detection paperswithcode_id: svhn pretty_name: Street View House Numbers --- From 496b33c631b004fff3fdbfe5f0749d9be738f326 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 13 Apr 2022 16:53:01 +0200 Subject: [PATCH 10/10] use multi-class-image-classification and remove other tags --- datasets/beans/README.md | 2 +- 
datasets/cats_vs_dogs/README.md | 3 +-- datasets/fashion_mnist/README.md | 2 +- datasets/food101/README.md | 2 +- datasets/mnist/README.md | 2 +- datasets/svhn/README.md | 3 +-- 6 files changed, 6 insertions(+), 8 deletions(-) diff --git a/datasets/beans/README.md b/datasets/beans/README.md index 971629f70dc..317827a551a 100644 --- a/datasets/beans/README.md +++ b/datasets/beans/README.md @@ -17,7 +17,7 @@ source_datasets: task_categories: - image-classification task_ids: -- image-classification-other-leaves-classification +- multi-class-image-classification --- # Dataset Card for Beans diff --git a/datasets/cats_vs_dogs/README.md b/datasets/cats_vs_dogs/README.md index 8543e6ff741..28b278ca734 100644 --- a/datasets/cats_vs_dogs/README.md +++ b/datasets/cats_vs_dogs/README.md @@ -16,8 +16,7 @@ source_datasets: - original task_categories: - image-classification -task_ids: -- image-classification-other-animals-classification +task_ids: [] --- # Dataset Card for Cats Vs. Dogs diff --git a/datasets/fashion_mnist/README.md b/datasets/fashion_mnist/README.md index 75799471ecd..266e08d1369 100644 --- a/datasets/fashion_mnist/README.md +++ b/datasets/fashion_mnist/README.md @@ -16,7 +16,7 @@ source_datasets: task_categories: - image-classification task_ids: -- image-classification-other-clothing-classification +- multi-class-image-classification paperswithcode_id: fashion-mnist pretty_name: FashionMNIST --- diff --git a/datasets/food101/README.md b/datasets/food101/README.md index 2d6fb763b83..fa0f9ff0741 100644 --- a/datasets/food101/README.md +++ b/datasets/food101/README.md @@ -17,7 +17,7 @@ source_datasets: task_categories: - image-classification task_ids: -- image-classification-other-food-classification +- multi-class-image-classification paperswithcode_id: food-101 --- diff --git a/datasets/mnist/README.md b/datasets/mnist/README.md index 21051419736..302e2b87fb9 100644 --- a/datasets/mnist/README.md +++ b/datasets/mnist/README.md @@ -16,7 +16,7 @@ source_datasets: 
task_categories: - image-classification task_ids: -- image-classification-other-digits-classification +- multi-class-image-classification paperswithcode_id: mnist pretty_name: MNIST --- diff --git a/datasets/svhn/README.md b/datasets/svhn/README.md index 4826a0f9450..e174b55fe2a 100644 --- a/datasets/svhn/README.md +++ b/datasets/svhn/README.md @@ -17,8 +17,7 @@ source_datasets: task_categories: - image-classification - object-detection -task_ids: -- object-detection-other-digit-detection +task_ids: [] paperswithcode_id: svhn pretty_name: Street View House Numbers ---