diff --git a/examples/tensorflow/text-classification/run_tf_glue.py b/examples/tensorflow/text-classification/run_tf_glue.py
new file mode 100755
index 00000000000000..5b6df337e91800
--- /dev/null
+++ b/examples/tensorflow/text-classification/run_tf_glue.py
@@ -0,0 +1,265 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Fine-tuning the library models for sequence classification."""
+
+
+import logging
+import os
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Dict, Optional
+
+import numpy as np
+import tensorflow as tf
+import tensorflow_datasets as tfds
+
+from transformers import (
+    AutoConfig,
+    AutoTokenizer,
+    EvalPrediction,
+    HfArgumentParser,
+    PreTrainedTokenizer,
+    TFAutoModelForSequenceClassification,
+    TFTrainer,
+    TFTrainingArguments,
+    glue_compute_metrics,
+    glue_convert_examples_to_features,
+    glue_output_modes,
+    glue_processors,
+    glue_tasks_num_labels,
+)
+from transformers.utils import logging as hf_logging
+
+
+hf_logging.set_verbosity_info()
+hf_logging.enable_default_handler()
+hf_logging.enable_explicit_format()
+
+
+class Split(Enum):
+    train = "train"
+    dev = "validation"
+    test = "test"
+
+
+def get_tfds(
+    task_name: str,
+    tokenizer: PreTrainedTokenizer,
+    max_seq_length: Optional[int] = None,
+    mode: Split = Split.train,
+    data_dir: Optional[str] = None,
+):
+    if task_name == "mnli-mm" and mode == Split.dev:
+        tfds_name = "mnli_mismatched"
+    elif task_name == "mnli-mm" and mode == Split.train:
+        tfds_name = "mnli"
+    elif task_name == "mnli" and mode == Split.dev:
+        tfds_name = "mnli_matched"
+    elif task_name == "sst-2":
+        tfds_name = "sst2"
+    elif task_name == "sts-b":
+        tfds_name = "stsb"
+    else:
+        tfds_name = task_name
+
+    ds, info = tfds.load("glue/" + tfds_name, split=mode.value, with_info=True, data_dir=data_dir)
+    ds = glue_convert_examples_to_features(ds, tokenizer, max_seq_length, task_name)
+    ds = ds.apply(tf.data.experimental.assert_cardinality(info.splits[mode.value].num_examples))
+
+    return ds
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class GlueDataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+
+    Using `HfArgumentParser` we can turn this class
+    into argparse arguments to be able to specify them on
+    the command line.
+    """
+
+    task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(glue_processors.keys())})
+    data_dir: Optional[str] = field(default=None, metadata={"help": "The input/output data dir for TFDS."})
+    max_seq_length: int = field(
+        default=128,
+        metadata={
+            "help": "The maximum total input sequence length after tokenization. Sequences longer "
+            "than this will be truncated, sequences shorter will be padded."
+        },
+    )
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+    )
+
+    def __post_init__(self):
+        self.task_name = self.task_name.lower()
+
+
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+    """
+
+    model_name_or_path: str = field(
+        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+    )
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    tokenizer_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+    )
+    use_fast: bool = field(default=False, metadata={"help": "Set this flag to use fast tokenization."})
+    # If you want to tweak more attributes on your tokenizer, you should do it in a distinct script,
+    # or just modify its tokenizer_config.json.
+    cache_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
+    )
+
+
+def main():
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+    parser = HfArgumentParser((ModelArguments, GlueDataTrainingArguments, TFTrainingArguments))
+    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    if (
+        os.path.exists(training_args.output_dir)
+        and os.listdir(training_args.output_dir)
+        and training_args.do_train
+        and not training_args.overwrite_output_dir
+    ):
+        raise ValueError(
+            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overwrite it."
+        )
+
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(
+        f"n_replicas: {training_args.n_replicas}, distributed training: {bool(training_args.n_replicas > 1)}, "
+        f"16-bits training: {training_args.fp16}",
+    )
+    logger.info(f"Training/evaluation parameters {training_args}")
+
+    try:
+        num_labels = glue_tasks_num_labels["mnli" if data_args.task_name == "mnli-mm" else data_args.task_name]
+        output_mode = glue_output_modes[data_args.task_name]
+    except KeyError:
+        raise ValueError(f"Task not found: {data_args.task_name}")
+
+    # Load pretrained model and tokenizer
+    #
+    # Distributed training:
+    # The .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
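+    #
+    # Note: the TF model itself is created below inside `training_args.strategy.scope()`
+    # so that its variables are created under the tf.distribute strategy built from the
+    # training arguments (single GPU, multi-GPU or TPU).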
+
+    config = AutoConfig.from_pretrained(
+        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
+        num_labels=num_labels,
+        finetuning_task=data_args.task_name,
+        cache_dir=model_args.cache_dir,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+    )
+
+    with training_args.strategy.scope():
+        model = TFAutoModelForSequenceClassification.from_pretrained(
+            model_args.model_name_or_path,
+            from_pt=bool(".bin" in model_args.model_name_or_path),
+            config=config,
+            cache_dir=model_args.cache_dir,
+        )
+
+    # Get datasets
+    train_dataset = (
+        get_tfds(
+            task_name=data_args.task_name,
+            tokenizer=tokenizer,
+            max_seq_length=data_args.max_seq_length,
+            data_dir=data_args.data_dir,
+        )
+        if training_args.do_train
+        else None
+    )
+    eval_dataset = (
+        get_tfds(
+            task_name=data_args.task_name,
+            tokenizer=tokenizer,
+            max_seq_length=data_args.max_seq_length,
+            mode=Split.dev,
+            data_dir=data_args.data_dir,
+        )
+        if training_args.do_eval
+        else None
+    )
+
+    def compute_metrics(p: EvalPrediction) -> Dict:
+        if output_mode == "classification":
+            preds = np.argmax(p.predictions, axis=1)
+        elif output_mode == "regression":
+            preds = np.squeeze(p.predictions)
+        return glue_compute_metrics(data_args.task_name, preds, p.label_ids)
+
+    # Initialize our Trainer
+    trainer = TFTrainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        compute_metrics=compute_metrics,
+    )
+
+    # Training
+    if training_args.do_train:
+        trainer.train()
+        trainer.save_model()
+        tokenizer.save_pretrained(training_args.output_dir)
+
+    # Evaluation
+    results = {}
+    if training_args.do_eval:
+        logger.info("*** Evaluate ***")
+
+        result = trainer.evaluate()
+        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
+
+        with open(output_eval_file, "w") as writer:
+            logger.info("***** Eval results *****")
+
+            for key, value in result.items():
+                logger.info(f" {key} = {value}")
+                writer.write(f"{key} = {value}\n")
+
+            results.update(result)
+
+    return results
+
+
+if __name__ == "__main__":
+    main()
diff --git a/model_cards/google/tapas-base/README.md b/model_cards/google/tapas-base/README.md
new file mode 100644
index 00000000000000..9685f28566d499
--- /dev/null
+++ b/model_cards/google/tapas-base/README.md
@@ -0,0 +1,123 @@
+---
+language: en
+tags:
+- tapas
+- masked-lm
+license: apache-2.0
+---
+
+# TAPAS base model
+
+This model corresponds to the `tapas_inter_masklm_base_reset` checkpoint of the [original Github repository](https://github.com/google-research/tapas).
+
+Disclaimer: The team releasing TAPAS did not write a model card for this model, so this model card has been written by
+the Hugging Face team and contributors.
+
+## Model description
+
+TAPAS is a BERT-like transformers model pretrained on a large corpus of English data from Wikipedia in a self-supervised fashion.
+This means it was pretrained on the raw tables and associated texts only, with no humans labelling them in any way (which is why it
+can use lots of publicly available data), with an automatic process to generate inputs and labels from those texts. More precisely, it
+was pretrained with two objectives:
+
+- Masked language modeling (MLM): taking a (flattened) table and associated context, the model randomly masks 15% of the words in
+  the input, then runs the entire (partially masked) sequence through the model.
+  The model then has to predict the masked words. This is different from traditional recurrent neural networks (RNNs) that
+  usually see the words one after the other, or from autoregressive models like GPT which internally mask the future tokens.
+  It allows the model to learn a bidirectional representation of a table and associated text.
+- Intermediate pre-training: to encourage numerical reasoning on tables, the authors additionally pre-trained the model by creating
+  a balanced dataset of millions of syntactically created training examples. Here, the model must predict (classify) whether a sentence
+  is supported or refuted by the contents of a table. The training examples are created based on synthetic as well as counterfactual statements.
+
+This way, the model learns an inner representation of the English language used in tables and associated texts, which can then be used
+to extract features useful for downstream tasks such as answering questions about a table, or determining whether a sentence is entailed
+or refuted by the contents of a table. Fine-tuning is done by adding classification heads on top of the pre-trained model, and then jointly
+training the randomly initialized classification heads with the base model on a labelled dataset.
+
+## Intended uses & limitations
+
+You can use the raw model for masked language modeling, but it's mostly intended to be fine-tuned on a downstream task.
+See the [model hub](https://huggingface.co/models?filter=tapas) to look for fine-tuned versions on a task that interests you.
+
+Here is how to use this model to get the features of a given table-text pair in PyTorch:
+
+```python
+from transformers import TapasTokenizer, TapasModel
+import pandas as pd
+
+tokenizer = TapasTokenizer.from_pretrained("google/tapas-base")
+model = TapasModel.from_pretrained("google/tapas-base")
+
+data = {
+    "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
+    "Age": ["56", "45", "59"],
+    "Number of movies": ["87", "53", "69"],
+}
+table = pd.DataFrame.from_dict(data)
+queries = ["How many movies has George Clooney played in?"]
+
+encoded_input = tokenizer(table=table, queries=queries, return_tensors="pt")
+output = model(**encoded_input)
+```
+
+## Training data
+
+For masked language modeling (MLM), a collection of 6.2 million tables was extracted from English Wikipedia: 3.3M of class [Infobox](https://en.wikipedia.org/wiki/Help:Infobox)
+and 2.9M of class WikiTable. The authors only considered tables with at most 500 cells. As a proxy for questions that appear in the
+downstream tasks, the authors extracted the table caption, article title, article description, segment title and text of the segment
+the table occurs in as relevant text snippets. In this way, 21.3M snippets were created. For more info, see the original [TAPAS paper](https://www.aclweb.org/anthology/2020.acl-main.398.pdf).
+
+For intermediate pre-training, two tasks are introduced: one based on synthetic statements and the other on counterfactual statements. The first one
+generates a sentence by sampling from a set of logical expressions that filter, combine and compare the information on the table, which is
+required in table entailment (e.g., knowing that Gerald Ford is taller than the average president requires summing the heights of
+all presidents and dividing by their number). The second one corrupts sentences about tables appearing on Wikipedia by swapping
+entities for plausible alternatives. Examples of the two tasks can be seen in Figure 1 of the
+[TAPAS follow-up paper](https://www.aclweb.org/anthology/2020.findings-emnlp.27.pdf), and the procedure is described in detail
+in section 3 of that paper.
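+
+To make the table-entailment objective concrete, here is a minimal sketch of how a fine-tuned entailment model can be used. It
+assumes the `google/tapas-base-finetuned-tabfact` checkpoint and a 0 = refuted / 1 = entailed label order; treat it as an
+illustration rather than a reference implementation:
+
+```python
+from transformers import TapasTokenizer, TapasForSequenceClassification
+import pandas as pd
+
+model_name = "google/tapas-base-finetuned-tabfact"  # assumed fine-tuned checkpoint
+tokenizer = TapasTokenizer.from_pretrained(model_name)
+model = TapasForSequenceClassification.from_pretrained(model_name)
+
+data = {
+    "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
+    "Number of movies": ["87", "53", "69"],
+}
+table = pd.DataFrame.from_dict(data)
+
+# A statement the table refutes: George Clooney played in 69 movies, not 30.
+sentence = "George Clooney has played in 30 movies."
+
+inputs = tokenizer(table=table, queries=[sentence], return_tensors="pt")
+logits = model(**inputs).logits
+predicted = int(logits.argmax(-1))
+print("entailed" if predicted == 1 else "refuted")  # assumed label order
+```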
+
+## Training procedure
+
+### Preprocessing
+
+The texts are lowercased and tokenized using WordPiece and a vocabulary size of 30,000. The inputs of the model are
+then of the form:
+
+```
+[CLS] Context [SEP] Flattened table [SEP]
+```
+
+The details of the masking procedure for each sequence are the following (a small sketch of this rule is given at the end of this card):
+- 15% of the tokens are masked.
+- In 80% of the cases, the masked tokens are replaced by `[MASK]`.
+- In 10% of the cases, the masked tokens are replaced by a random token (different from the one they replace).
+- In the remaining 10% of cases, the masked tokens are left as is.
+
+The details of the creation of the synthetic and counterfactual examples can be found in the [follow-up paper](https://arxiv.org/abs/2010.00571).
+
+### Pretraining
+
+The model was trained on 32 Cloud TPU v3 cores for one million steps with maximum sequence length 512 and batch size of 512.
+In this setup, pre-training takes around 3 days. The optimizer used is Adam with a learning rate of 5e-5, and a warmup ratio
+of 0.10.
+
+### BibTeX entry and citation info
+
+```bibtex
+@misc{herzig2020tapas,
+      title={TAPAS: Weakly Supervised Table Parsing via Pre-training},
+      author={Jonathan Herzig and Paweł Krzysztof Nowak and Thomas Müller and Francesco Piccinno and Julian Martin Eisenschlos},
+      year={2020},
+      eprint={2004.02349},
+      archivePrefix={arXiv},
+      primaryClass={cs.IR}
+}
+```
+
+```bibtex
+@misc{eisenschlos2020understanding,
+      title={Understanding tables with intermediate pre-training},
+      author={Julian Martin Eisenschlos and Syrine Krichene and Thomas Müller},
+      year={2020},
+      eprint={2010.00571},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+```
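+
+As referenced under Preprocessing, here is a small, hypothetical sketch of the 80/10/10 masking rule. This is not the authors'
+actual pretraining code; whole-word and table-aware masking details are omitted:
+
+```python
+import random
+
+def mask_tokens(token_ids, mask_token_id, vocab_size, mlm_probability=0.15):
+    """BERT-style 80/10/10 masking over a list of token ids.
+
+    Returns the corrupted input ids and the MLM labels, where -100 marks
+    positions ignored by the loss (a common convention).
+    """
+    input_ids = list(token_ids)
+    labels = []
+    for i, token_id in enumerate(token_ids):
+        if random.random() >= mlm_probability:
+            labels.append(-100)  # not selected: no prediction required
+            continue
+        labels.append(token_id)  # selected: the model must recover the original id
+        r = random.random()
+        if r < 0.8:
+            input_ids[i] = mask_token_id  # 80%: replace with [MASK]
+        elif r < 0.9:
+            # 10%: replace with a random token (this simplified sketch may
+            # occasionally draw the original token itself)
+            input_ids[i] = random.randrange(vocab_size)
+        # remaining 10%: keep the token unchanged
+    return input_ids, labels
+```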