diff --git a/examples/benchmarking/run_benchmark.py b/examples/benchmarking/run_benchmark.py index 163bcfb6fc2501..f995b8212ab4b0 100644 --- a/examples/benchmarking/run_benchmark.py +++ b/examples/benchmarking/run_benchmark.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. +# Copyright 2020 The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/examples/benchmarking/run_benchmark_tf.py b/examples/benchmarking/run_benchmark_tf.py new file mode 100644 index 00000000000000..c37c833c9678b5 --- /dev/null +++ b/examples/benchmarking/run_benchmark_tf.py @@ -0,0 +1,29 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Benchmarking the library on inference and training in Tensorflow""" + +from transformers import HfArgumentParser, TensorflowBenchmark, TensorflowBenchmarkArguments + + +def main(): + parser = HfArgumentParser(TensorflowBenchmarkArguments) + benchmark_args = parser.parse_args_into_dataclasses()[0] + benchmark = TensorflowBenchmark(args=benchmark_args) + benchmark.run() + + +if __name__ == "__main__": + main() diff --git a/examples/longform-qa/eli5_app.py b/examples/longform-qa/eli5_app.py index e79f1d6ed14fae..a7d75565ae1631 100644 --- a/examples/longform-qa/eli5_app.py +++ b/examples/longform-qa/eli5_app.py @@ -1,11 +1,11 @@ +import faiss +import nlp import numpy as np import torch +from elasticsearch import Elasticsearch -import faiss -import nlp import streamlit as st import transformers -from elasticsearch import Elasticsearch from eli5_utils import ( embed_questions_for_retrieval, make_qa_s2s_model, diff --git a/examples/longform-qa/eli5_utils.py b/examples/longform-qa/eli5_utils.py index 0298625cdc792b..4f7d7a9d46d037 100644 --- a/examples/longform-qa/eli5_utils.py +++ b/examples/longform-qa/eli5_utils.py @@ -4,17 +4,17 @@ from random import choice, randint from time import time +import faiss # noqa: F401 +import nlp # noqa: F401 import numpy as np import pandas as pd import torch import torch.utils.checkpoint as checkpoint +from elasticsearch import Elasticsearch # noqa: F401 +from elasticsearch.helpers import bulk, streaming_bulk # noqa: F401 from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler from tqdm import tqdm -import faiss # noqa: F401 -import nlp # noqa: F401 -from elasticsearch import Elasticsearch # noqa: F401 -from elasticsearch.helpers import bulk, streaming_bulk # noqa: F401 from transformers import AdamW, AutoModel, AutoModelForSeq2SeqLM, AutoTokenizer, get_linear_schedule_with_warmup diff --git a/examples/requirements.txt b/examples/requirements.txt index 05d716bdc0790a..daf2081fe94295 100644 --- a/examples/requirements.txt +++ b/examples/requirements.txt @@ -8,3 +8,8 @@ tensorflow_datasets pytorch-lightning==0.7.6 matplotlib git-python==1.0.3 +faiss +streamlit 
+elasticsearch +pandas +nlp diff --git a/setup.cfg b/setup.cfg index 5badc1ae760a13..0b4c1af0714ce0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -5,12 +5,15 @@ include_trailing_comma = True known_first_party = transformers known_third_party = absl + elasticsearch fairseq + faiss fastprogress git h5py matplotlib MeCab + nlp nltk numpy packaging diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index dfe12b8bd9c49c..54db57c8670701 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -78,6 +78,9 @@ add_end_docstrings, add_start_docstrings, cached_path, + is_apex_available, + is_psutil_available, + is_py3nvml_available, is_tf_available, is_torch_available, is_torch_tpu_available, @@ -398,7 +401,8 @@ from .data.datasets import GlueDataset, TextDataset, LineByLineTextDataset, GlueDataTrainingArguments # Benchmarks - from .benchmark import PyTorchBenchmark, PyTorchBenchmarkArguments + from .benchmark.benchmark import PyTorchBenchmark + from .benchmark.benchmark_args import PyTorchBenchmarkArguments # TensorFlow if is_tf_available(): @@ -608,6 +612,10 @@ # Trainer from .trainer_tf import TFTrainer + # Benchmarks + from .benchmark.benchmark_tf import TensorflowBenchmark + from .benchmark.benchmark_args_tf import TensorflowBenchmarkArguments + if not is_tf_available() and not is_torch_available(): logger.warning( diff --git a/src/transformers/benchmark/__init__.py b/src/transformers/benchmark/__init__.py index 5eae4b2cb36783..e69de29bb2d1d6 100644 --- a/src/transformers/benchmark/__init__.py +++ b/src/transformers/benchmark/__init__.py @@ -1,10 +0,0 @@ -# flake8: noqa -# There's no way to ignore "F401 '...' imported but unused" warnings in this -# module, but to preserve other warnings. So, don't check this module at all. 
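For orientation, a minimal usage sketch of the benchmark API exposed above, assuming a PyTorch install; the checkpoint name and argument values are illustrative and mirror the tests later in this diff:

from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments

benchmark_args = PyTorchBenchmarkArguments(
    models=["sshleifer/tiny-gpt2"],   # example checkpoint used in the test suite
    training=False,
    no_inference=False,
    sequence_lengths=[8],
    batch_sizes=[1],
    no_multi_process=True,            # keep measurements in-process, e.g. for debugging
)
results = PyTorchBenchmark(benchmark_args).run()

When TensorFlow is available, the TensorflowBenchmark and TensorflowBenchmarkArguments classes imported above can be used the same way.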
- -from ..file_utils import is_torch_available - - -if is_torch_available(): - from .benchmark_args import PyTorchBenchmarkArguments - from .benchmark import PyTorchBenchmark diff --git a/src/transformers/benchmark/benchmark.py b/src/transformers/benchmark/benchmark.py index 63db8272364815..a24c5028e999e9 100644 --- a/src/transformers/benchmark/benchmark.py +++ b/src/transformers/benchmark/benchmark.py @@ -20,16 +20,24 @@ import logging import timeit +from typing import Callable, Optional from transformers import ( MODEL_MAPPING, MODEL_WITH_LM_HEAD_MAPPING, PretrainedConfig, + is_py3nvml_available, is_torch_available, - is_torch_tpu_available, ) -from .benchmark_utils import Benchmark, Memory, measure_peak_memory_cpu, start_memory_tracing, stop_memory_tracing +from .benchmark_utils import ( + Benchmark, + Memory, + MemorySummary, + measure_peak_memory_cpu, + start_memory_tracing, + stop_memory_tracing, +) if is_torch_available(): @@ -37,6 +45,10 @@ from .benchmark_args import PyTorchBenchmarkArguments +if is_py3nvml_available(): + import py3nvml.py3nvml as nvml + + logger = logging.getLogger(__name__) @@ -50,220 +62,173 @@ class PyTorchBenchmark(Benchmark): def framework_version(self): return torch.__version__ - def train(self, model_name, batch_size, sequence_length, trace_memory=False): - try: - config = self.config_dict[model_name] + def _inference_speed(self, model_name: str, batch_size: int, sequence_length: int) -> float: + _inference = self._prepare_inference_func(model_name, batch_size, sequence_length) + return self._measure_speed(_inference) - if self.args.torchscript: - config.torchscript = True + def _inference_memory( + self, model_name: str, batch_size: int, sequence_length: int + ) -> [Memory, Optional[MemorySummary]]: + _inference = self._prepare_inference_func(model_name, batch_size, sequence_length) + return self._measure_memory(_inference) - model = MODEL_WITH_LM_HEAD_MAPPING[config.__class__](config) - model.to(self.args.device) - model.train() + def _train_speed(self, model_name: str, batch_size: int, sequence_length: int) -> float: + _train = self._prepare_train_func(model_name, batch_size, sequence_length) + return self._measure_speed(_train) - # encoder-decoder has vocab size saved differently - vocab_size = config.vocab_size if hasattr(config, "vocab_size") else config.encoder.vocab_size - input_ids = torch.randint( - vocab_size, (batch_size, sequence_length), dtype=torch.long, device=self.args.device - ) + def _train_memory( + self, model_name: str, batch_size: int, sequence_length: int + ) -> [Memory, Optional[MemorySummary]]: + _train = self._prepare_train_func(model_name, batch_size, sequence_length) + return self._measure_memory(_train) - if self.args.torchscript: - raise NotImplementedError("Training for torchscript is currently not implemented") - else: - train_model = model - - def compute_loss_and_backprob_encoder(): - loss = train_model(input_ids, labels=input_ids)[0] - loss.backward() - train_model.zero_grad() - - def compute_loss_and_backprob_encoder_decoder(): - loss = train_model(input_ids, decoder_input_ids=input_ids, labels=input_ids)[0] - loss.backward() - train_model.zero_grad() - - _train = ( - compute_loss_and_backprob_encoder_decoder - if config.is_encoder_decoder - else compute_loss_and_backprob_encoder - ) - - if trace_memory is True: - if self.args.trace_memory_line_by_line: - trace = start_memory_tracing("transformers") - - if self.args.n_gpu > 0: - # gpu - # clear gpu cache - torch.cuda.empty_cache() - if hasattr(torch.cuda, 
"max_memory_reserved"): - torch.cuda.reset_peak_memory_stats() - else: - logger.info( - "Please consider updating PyTorch to version 1.4 to get more accuracy on GPU memory usage" - ) - torch.cuda.reset_max_memory_cached() - - # calculate loss and do backpropagation - _train() - elif not self.args.no_tpu and is_torch_tpu_available(): - # tpu - raise NotImplementedError( - "Memory Benchmarking is currently not implemented for TPU. Please disable memory benchmarking with `args.no_memory=True`" - ) - else: - # cpu - memory_bytes = measure_peak_memory_cpu(_train) - memory = Memory(memory_bytes) if isinstance(memory_bytes, int) else memory_bytes + def _prepare_inference_func(self, model_name: str, batch_size: int, sequence_length: int) -> Callable[[], None]: + config = self.config_dict[model_name] - if self.args.trace_memory_line_by_line: - summary = stop_memory_tracing(trace) - else: - summary = None - - if self.args.n_gpu > 0: - # gpu - if hasattr(torch.cuda, "max_memory_reserved"): - memory = Memory(torch.cuda.max_memory_reserved()) - else: - logger.info( - "Please consider updating PyTorch to version 1.4 to get more accuracy on GPU memory usage" - ) - memory = Memory(torch.cuda.max_memory_reserved()) - - return memory, summary - else: - if (not self.args.no_tpu and is_torch_tpu_available()) or self.args.torchscript: - # run additional 10 times to stabilize compilation for tpu and torchscript - logger.info("Do inference on TPU or torchscript. Running model 5 times to stabilize compilation") - timeit.repeat( - _train, repeat=1, number=5, - ) + if self.args.torchscript: + config.torchscript = True + if self.args.with_lm_head: + model = MODEL_WITH_LM_HEAD_MAPPING[config.__class__](config) + else: + model = MODEL_MAPPING[config.__class__](config) + + model.eval() + model.to(self.args.device) + + # encoder-decoder has vocab size saved differently + vocab_size = config.vocab_size if hasattr(config, "vocab_size") else config.encoder.vocab_size + input_ids = torch.randint(vocab_size, (batch_size, sequence_length), dtype=torch.long, device=self.args.device) + + if self.args.fp16: + logger.info("Running training in Mixed Precision...") + assert self.args.is_gpu, "Mixed precision is possible only for GPU." 
+ # amp seems to have memory leaks so that memory usage + # is measured using .half() for now https://github.com/NVIDIA/apex/issues/439 + model.half() + + if self.args.torchscript: + with torch.no_grad(): + inference_model = torch.jit.trace(model, input_ids) + else: + inference_model = model + + def encoder_decoder_forward(): + with torch.no_grad(): + outputs = inference_model(input_ids, decoder_input_ids=input_ids) + return outputs + + def encoder_forward(): + with torch.no_grad(): + outputs = inference_model(input_ids) + return outputs + + _forward = encoder_decoder_forward if config.is_encoder_decoder else encoder_forward + return _forward + + def _prepare_train_func(self, model_name: str, batch_size: int, sequence_length: int) -> Callable[[], None]: + config = self.config_dict[model_name] + model = MODEL_WITH_LM_HEAD_MAPPING[config.__class__](config) + + if self.args.torchscript: + raise NotImplementedError("Training for torchscript is currently not implemented") + else: + train_model = model + + model.eval() + model.to(self.args.device) + + # encoder-decoder has vocab size saved differently + vocab_size = config.vocab_size if hasattr(config, "vocab_size") else config.encoder.vocab_size + input_ids = torch.randint(vocab_size, (batch_size, sequence_length), dtype=torch.long, device=self.args.device) + + if self.args.fp16: + logger.info("Running training in Mixed Precision...") + assert self.args.is_gpu, "Mixed precision is possible only for GPU." + + # amp seems to have memory leaks so that memory usage + # is measured using .half() for now https://github.com/NVIDIA/apex/issues/439 + model.half() + + def compute_loss_and_backprob_encoder(): + loss = train_model(input_ids, labels=input_ids)[0] + loss.backward() + train_model.zero_grad() + + def compute_loss_and_backprob_encoder_decoder(): + loss = train_model(input_ids, decoder_input_ids=input_ids, labels=input_ids)[0] + loss.backward() + train_model.zero_grad() + + _train = ( + compute_loss_and_backprob_encoder_decoder + if config.is_encoder_decoder + else compute_loss_and_backprob_encoder + ) + return _train + + def _measure_speed(self, func) -> float: + try: + if self.args.is_tpu or self.args.torchscript: + # run additional 10 times to stabilize compilation for tpu and torchscript + logger.info("Do inference on TPU or torchscript. Running model 5 times to stabilize compilation") + timeit.repeat( + func, repeat=1, number=5, + ) - # as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average - runtimes = timeit.repeat(_train, repeat=self.args.repeat, number=10,) + # as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average + runtimes = timeit.repeat(func, repeat=self.args.repeat, number=10,) - if not self.args.no_tpu and is_torch_tpu_available() and self.args.tpu_print_metrics: - import torch_xla.debug.metrics as met + if self.args.is_tpu and self.args.torch_xla_tpu_print_metrics: + import torch_xla.debug.metrics as met - self.print_fn(met.metrics_report()) + self.print_fn(met.metrics_report()) - return min(runtimes) / 10.0 + return min(runtimes) / 10.0 except RuntimeError as e: self.print_fn("Doesn't fit on GPU. 
{}".format(e)) - if trace_memory: - return "N/A", None - else: - return "N/A" + return "N/A" - def inference(self, model_name, batch_size, sequence_length, trace_memory=False): + def _measure_memory(self, func: Callable[[], None]) -> [Memory, MemorySummary]: try: - config = self.config_dict[model_name] - model = None - - if self.args.torchscript: - config.torchscript = True - - if self.args.with_lm_head: - model = MODEL_WITH_LM_HEAD_MAPPING[config.__class__](config) - else: - model = MODEL_MAPPING[config.__class__](config) - - model.eval() - model.to(self.args.device) - - # encoder-decoder has vocab size saved differently - vocab_size = config.vocab_size if hasattr(config, "vocab_size") else config.encoder.vocab_size - - input_ids = torch.randint( - vocab_size, (batch_size, sequence_length), dtype=torch.long, device=self.args.device - ) - - if self.args.torchscript: - with torch.no_grad(): - if config.is_encoder_decoder: - raise NotImplementedError("Torchscript is currently not supported for EncoderDecoder models") - else: - inference_model = torch.jit.trace(model, input_ids) - else: - inference_model = model - - def encoder_decoder_forward(): - with torch.no_grad(): - inference_model(input_ids, decoder_input_ids=input_ids) - - def encoder_forward(): - with torch.no_grad(): - inference_model(input_ids) - - _forward = encoder_decoder_forward if config.is_encoder_decoder else encoder_forward - - if trace_memory is True: - if self.args.trace_memory_line_by_line: - trace = start_memory_tracing("transformers") - - if self.args.n_gpu > 0: - # gpu - # clear gpu cache - torch.cuda.empty_cache() - if hasattr(torch.cuda, "max_memory_reserved"): - torch.cuda.reset_peak_memory_stats() - else: - logger.info( - "Please consider updating PyTorch to version 1.4 to get more accuracy on GPU memory usage" - ) - torch.cuda.reset_max_memory_cached() - - # run forward - _forward() - elif not self.args.no_tpu and is_torch_tpu_available(): - # tpu - raise NotImplementedError( - "Memory Benchmarking is currently not implemented for TPU. Please disable memory benchmarking with `args.no_memory=True`" + if self.args.trace_memory_line_by_line: + trace = start_memory_tracing("transformers") + + if self.args.is_tpu: + # tpu + raise NotImplementedError( + "Memory Benchmarking is currently not implemented for TPU. Please disable memory benchmarking with `--no_memory` or `args.no_memory=True`" + ) + elif self.args.is_gpu: + if not is_py3nvml_available(): + logger.warning( + "py3nvml not installed, we won't log GPU memory usage. " + "Install py3nvml (pip install py3nvml) to log information about GPU." ) + memory = "N/A" else: - # cpu - memory_bytes = measure_peak_memory_cpu(_forward) - memory = Memory(memory_bytes) if isinstance(memory_bytes, int) else memory_bytes - - if self.args.trace_memory_line_by_line: - summary = stop_memory_tracing(trace) - else: - summary = None - - if self.args.n_gpu > 0: - # gpu - if hasattr(torch.cuda, "max_memory_reserved"): - memory = Memory(torch.cuda.max_memory_reserved()) - else: - logger.info( - "Please consider updating PyTorch to version 1.4 to get more accuracy on GPU memory usage" - ) - memory = Memory(torch.cuda.max_memory_cached()) - - return memory, summary - else: - - if (not self.args.no_tpu and is_torch_tpu_available()) or self.args.torchscript: - # run additional 10 times to stabilize compilation for tpu and torchscript - logger.info("Do inference on TPU or torchscript. 
Running model 5 times to stabilize compilation") - timeit.repeat( - _forward, repeat=1, number=5, + logger.info( + "Measuring total GPU usage on GPU device. Make sure to not have additional processes running on the same GPU." ) + # init nvml + nvml.nvmlInit() + func() + handle = nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx) + meminfo = nvml.nvmlDeviceGetMemoryInfo(handle) + max_bytes_in_use = meminfo.used + memory = Memory(max_bytes_in_use) + # shutdown nvml + nvml.nvmlShutdown() + else: + # cpu + memory_bytes = measure_peak_memory_cpu(func) + memory = Memory(memory_bytes) if isinstance(memory_bytes, int) else memory_bytes - # as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average - runtimes = timeit.repeat(_forward, repeat=self.args.repeat, number=10,) - - if not self.args.no_tpu and is_torch_tpu_available() and self.args.tpu_print_metrics: - import torch_xla.debug.metrics as met - - self.print_fn(met.metrics_report()) - - return min(runtimes) / 10.0 + if self.args.trace_memory_line_by_line: + summary = stop_memory_tracing(trace) + else: + summary = None + return memory, summary except RuntimeError as e: self.print_fn("Doesn't fit on GPU. {}".format(e)) - if trace_memory: - return "N/A", None - else: - return "N/A" + return "N/A", None diff --git a/src/transformers/benchmark/benchmark_args.py b/src/transformers/benchmark/benchmark_args.py index 0cc043537b5ceb..0ecac83adf946d 100644 --- a/src/transformers/benchmark/benchmark_args.py +++ b/src/transformers/benchmark/benchmark_args.py @@ -34,11 +34,17 @@ @dataclass class PyTorchBenchmarkArguments(BenchmarkArguments): - no_cuda: bool = field(default=False, metadata={"help": "Whether to run on available cuda devices"}) torchscript: bool = field(default=False, metadata={"help": "Trace the models using torchscript"}) - no_tpu: bool = field(default=False, metadata={"help": "Whether to run on available tpu devices"}) - fp16: bool = field(default=False, metadata={"help": "Use FP16 to accelerate inference."}) - tpu_print_metrics: bool = field(default=False, metadata={"help": "Use FP16 to accelerate inference."}) + torch_xla_tpu_print_metrics: bool = field(default=False, metadata={"help": "Print Xla/PyTorch tpu metrics"}) + fp16_opt_level: str = field( + default="O1", + metadata={ + "help": ( + "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." + "See details at https://nvidia.github.io/apex/amp.html" + ) + }, + ) @cached_property @torch_required @@ -55,9 +61,14 @@ def _setup_devices(self) -> Tuple["torch.device", int]: n_gpu = torch.cuda.device_count() return device, n_gpu + @property + def is_tpu(self): + return is_torch_tpu_available() and not self.no_tpu + @property @torch_required def device_idx(self) -> int: + # TODO(PVP): currently only single GPU is supported return torch.cuda.current_device() @property @@ -69,3 +80,7 @@ def device(self) -> "torch.device": @torch_required def n_gpu(self): return self._setup_devices[1] + + @property + def is_gpu(self): + return self.n_gpu > 0 diff --git a/src/transformers/benchmark/benchmark_args_tf.py b/src/transformers/benchmark/benchmark_args_tf.py new file mode 100644 index 00000000000000..0f2b243c3838db --- /dev/null +++ b/src/transformers/benchmark/benchmark_args_tf.py @@ -0,0 +1,105 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from dataclasses import dataclass, field +from typing import Tuple + +from ..file_utils import cached_property, is_tf_available, tf_required +from .benchmark_args_utils import BenchmarkArguments + + +if is_tf_available(): + import tensorflow as tf + + +logger = logging.getLogger(__name__) + + +@dataclass +class TensorflowBenchmarkArguments(BenchmarkArguments): + tpu_name: str = field( + default=None, metadata={"help": "Name of TPU"}, + ) + device_idx: int = field( + default=0, metadata={"help": "CPU / GPU device index. Defaults to 0."}, + ) + eager_mode: bool = field(default=False, metadata={"help": "Benchmark models in eager model."}) + use_xla: bool = field( + default=False, + metadata={ + "help": "Benchmark models using XLA JIT compilation. Note that `eager_model` has to be set to `False`." + }, + ) + + @cached_property + @tf_required + def _setup_tpu(self) -> Tuple["tf.distribute.cluster_resolver.TPUClusterResolver"]: + if not self.no_tpu: + try: + if self.tpu_name: + tpu = tf.distribute.cluster_resolver.TPUClusterResolver(self.tpu_name) + else: + tpu = tf.distribute.cluster_resolver.TPUClusterResolver() + except ValueError: + tpu = None + return tpu + + @cached_property + @tf_required + def _setup_strategy(self) -> Tuple["tf.distribute.Strategy", "tf.distribute.cluster_resolver.TPUClusterResolver"]: + if self.is_tpu: + tf.config.experimental_connect_to_cluster(self._setup_tpu) + tf.tpu.experimental.initialize_tpu_system(self._setup_tpu) + + strategy = tf.distribute.experimental.TPUStrategy(self._setup_tpu) + else: + # currently no multi gpu is allowed + if self.is_gpu: + # TODO: Currently only single GPU is supported + tf.config.experimental.set_visible_devices(self.gpu_list[self.device_idx], "GPU") + strategy = tf.distribute.OneDeviceStrategy(device=f"/gpu:{self.device_idx}") + else: + tf.config.experimental.set_visible_devices([], "GPU") # disable GPU + strategy = tf.distribute.OneDeviceStrategy(device=f"/cpu:{self.device_idx}") + + return strategy + + @property + @tf_required + def is_tpu(self) -> bool: + return self._setup_tpu is not None + + @property + @tf_required + def strategy(self) -> "tf.distribute.Strategy": + return self._setup_strategy + + @property + @tf_required + def gpu_list(self): + return tf.config.list_physical_devices("GPU") + + @property + @tf_required + def n_gpu(self) -> int: + if not self.no_cuda: + return len(self.gpu_list) + return 0 + + @property + def is_gpu(self) -> bool: + return self.n_gpu > 0 diff --git a/src/transformers/benchmark/benchmark_args_utils.py b/src/transformers/benchmark/benchmark_args_utils.py index ac76c37eb1f8c8..5f7dbff672e620 100644 --- a/src/transformers/benchmark/benchmark_args_utils.py +++ b/src/transformers/benchmark/benchmark_args_utils.py @@ -16,11 +16,15 @@ import dataclasses import json +import logging from dataclasses import dataclass, field from time import time from typing import List +logger = logging.getLogger(__name__) + + def list_field(default=None, 
metadata=None): return field(default_factory=lambda: default, metadata=metadata) @@ -53,6 +57,9 @@ class BenchmarkArguments: ) no_inference: bool = field(default=False, metadata={"help": "Don't benchmark inference of model"}) + no_cuda: bool = field(default=False, metadata={"help": "Whether to run on available cuda devices"}) + no_tpu: bool = field(default=False, metadata={"help": "Whether to run on available tpu devices"}) + fp16: bool = field(default=False, metadata={"help": "Use FP16 to accelerate inference."}) training: bool = field(default=False, metadata={"help": "Benchmark training of model"}) verbose: bool = field(default=False, metadata={"help": "Verbose memory tracing"}) no_speed: bool = field(default=False, metadata={"help": "Don't perform speed measurments"}) @@ -61,6 +68,12 @@ class BenchmarkArguments: save_to_csv: bool = field(default=False, metadata={"help": "Save result to a CSV file"}) log_print: bool = field(default=False, metadata={"help": "Save all print statements in a log file"}) no_env_print: bool = field(default=False, metadata={"help": "Don't print environment information"}) + no_multi_process: bool = field( + default=False, + metadata={ + "help": "Don't use multiprocessing for memory and speed measurement. It is highly recommended to use multiprocessing for accurate CPU and GPU memory measurements. This option should only be used for debugging / testing and on TPU." + }, + ) with_lm_head: bool = field( default=False, metadata={ @@ -101,4 +114,17 @@ def to_json_string(self): @property def model_names(self): + assert ( + len(self.models) > 0 + ), "Please make sure you provide at least one model name / model identifier, *e.g.* `--models bert-base-cased` or `args.models = ['bert-base-cased']." return self.models + + @property + def do_multi_processing(self): + if self.no_multi_process: + return False + elif self.is_tpu: + logger.info("Multiprocessing is currently not possible on TPU.") + return False + else: + return True diff --git a/src/transformers/benchmark/benchmark_tf.py b/src/transformers/benchmark/benchmark_tf.py new file mode 100644 index 00000000000000..4a92e863a136ef --- /dev/null +++ b/src/transformers/benchmark/benchmark_tf.py @@ -0,0 +1,226 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + Benchmarking the library on inference and training in PyTorch. 
+""" + + +import logging +import random +import timeit +from functools import wraps +from typing import Callable, Optional + +from transformers import ( + TF_MODEL_MAPPING, + TF_MODEL_WITH_LM_HEAD_MAPPING, + PretrainedConfig, + is_py3nvml_available, + is_tf_available, +) + +from .benchmark_utils import ( + Benchmark, + Memory, + MemorySummary, + measure_peak_memory_cpu, + start_memory_tracing, + stop_memory_tracing, +) + + +if is_tf_available(): + import tensorflow as tf + from .benchmark_args_tf import TensorflowBenchmarkArguments + from tensorflow.python.framework.errors_impl import ResourceExhaustedError + +if is_py3nvml_available(): + import py3nvml.py3nvml as nvml + +logger = logging.getLogger(__name__) + + +def run_with_tf_optimizations(do_eager_mode: bool, use_xla: bool): + def run_func(func): + @wraps(func) + def run_in_eager_mode(*args, **kwargs): + return func(*args, **kwargs) + + @wraps(func) + @tf.function(experimental_compile=use_xla) + def run_in_graph_mode(*args, **kwargs): + return func(*args, **kwargs) + + if do_eager_mode is True: + assert ( + use_xla is False + ), "Cannot run model in XLA, if `args.eager_mode` is set to `True`. Please set `args.eager_mode=False`." + return run_in_eager_mode + else: + return run_in_graph_mode + + return run_func + + +def random_input_ids(batch_size: int, sequence_length: int, vocab_size: int) -> ["tf.Tensor"]: + rng = random.Random() + values = [rng.randint(0, vocab_size - 1) for i in range(batch_size * sequence_length)] + return tf.constant(values, shape=(batch_size, sequence_length), dtype=tf.int32) + + +class TensorflowBenchmark(Benchmark): + + args: TensorflowBenchmarkArguments + configs: PretrainedConfig + framework: str = "Tensorflow" + + @property + def framework_version(self): + return tf.__version__ + + def _inference_speed(self, model_name: str, batch_size: int, sequence_length: int) -> float: + # initialize GPU on separate process + strategy = self.args.strategy + assert strategy is not None, "A device strategy has to be initialized before using Tensorflow." + _inference = self._prepare_inference_func(model_name, batch_size, sequence_length) + return self._measure_speed(_inference) + + def _train_speed(self, model_name, batch_size, sequence_length): + raise NotImplementedError( + "Training is currently not really implemented." "Wait for TFTrainer to support CLM and MLM." + ) + + def _inference_memory( + self, model_name: str, batch_size: int, sequence_length: int + ) -> [Memory, Optional[MemorySummary]]: + # initialize GPU on separate process + if self.args.is_gpu: + tf.config.experimental.set_memory_growth(self.args.gpu_list[self.args.device_idx], True) + strategy = self.args.strategy + assert strategy is not None, "A device strategy has to be initialized before using Tensorflow." + _inference = self._prepare_inference_func(model_name, batch_size, sequence_length) + return self._measure_memory(_inference) + + def _train_memory(self, model_name, batch_size, sequence_length): + raise NotImplementedError( + "Training is currently not really implemented. Wait for TFTrainer to support CLM and MLM." 
+ ) + + def _prepare_inference_func(self, model_name: str, batch_size: int, sequence_length: int) -> Callable[[], None]: + config = self.config_dict[model_name] + + if self.args.fp16: + raise NotImplementedError("Mixed precision is currently not supported.") + + if self.args.with_lm_head: + model = TF_MODEL_WITH_LM_HEAD_MAPPING[config.__class__](config) + else: + model = TF_MODEL_MAPPING[config.__class__](config) + + # encoder-decoder has vocab size saved differently + vocab_size = config.vocab_size if hasattr(config, "vocab_size") else config.encoder.vocab_size + input_ids = random_input_ids(batch_size, sequence_length, vocab_size) + + @run_with_tf_optimizations(self.args.eager_mode, self.args.use_xla) + def encoder_decoder_forward(): + return model(input_ids, decoder_input_ids=input_ids, training=False) + + @run_with_tf_optimizations(self.args.eager_mode, self.args.use_xla) + def encoder_forward(): + return model(input_ids, training=False) + + _inference = encoder_decoder_forward if config.is_encoder_decoder else encoder_forward + + return _inference + + def _measure_speed(self, func) -> float: + with self.args.strategy.scope(): + try: + if self.args.is_tpu or self.args.use_xla: + # run additional 10 times to stabilize compilation for tpu + logger.info("Do inference on TPU. Running model 5 times to stabilize compilation") + timeit.repeat(func, repeat=1, number=5) + + # as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average + runtimes = timeit.repeat(func, repeat=self.args.repeat, number=10,) + + return min(runtimes) / 10.0 + except ResourceExhaustedError as e: + self.print_fn("Doesn't fit on GPU. {}".format(e)) + + def _measure_memory(self, func: Callable[[], None]) -> [Memory, MemorySummary]: + logger.info( + "Note that Tensorflow allocates more memory than" + "it might need to speed up computation." + "The memory reported here corresponds to the memory" + "reported by `nvidia-smi`, which can vary depending" + "on total available memory on the GPU that is used." + ) + with self.args.strategy.scope(): + try: + if self.args.trace_memory_line_by_line: + assert ( + self.args.eager_mode + ), "`args.eager_mode` is set to `False`. Make sure to run model in eager mode to measure memory consumption line by line." + trace = start_memory_tracing("transformers") + + if self.args.is_tpu: + # tpu + raise NotImplementedError( + "Memory Benchmarking is currently not implemented for TPU. Please disable memory benchmarking with `args.no_memory=True`" + ) + elif self.args.is_gpu: + # gpu + if not is_py3nvml_available(): + logger.warning( + "py3nvml not installed, we won't log GPU memory usage. " + "Install py3nvml (pip install py3nvml) to log information about GPU." + ) + memory = "N/A" + else: + logger.info( + "Measuring total GPU usage on GPU device. Make sure to not have additional processes running on the same GPU." + ) + # init nvml + nvml.nvmlInit() + func() + handle = nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx) + meminfo = nvml.nvmlDeviceGetMemoryInfo(handle) + max_bytes_in_use = meminfo.used + memory = Memory(max_bytes_in_use) + # shutdown nvml + nvml.nvmlShutdown() + else: + # cpu + if self.args.trace_memory_line_by_line: + logger.info( + "When enabling line by line tracing, the max peak memory for CPU is inaccurate in Tensorflow." 
+ ) + memory = None + else: + memory_bytes = measure_peak_memory_cpu(func) + memory = Memory(memory_bytes) if isinstance(memory_bytes, int) else memory_bytes + if self.args.trace_memory_line_by_line: + summary = stop_memory_tracing(trace) + if memory is None: + memory = summary.total + else: + summary = None + + return memory, summary + except ResourceExhaustedError as e: + self.print_fn("Doesn't fit on GPU. {}".format(e)) + return "N/A", None diff --git a/src/transformers/benchmark/benchmark_utils.py b/src/transformers/benchmark/benchmark_utils.py index 5b7cb438532a07..682887d0d5dc8c 100644 --- a/src/transformers/benchmark/benchmark_utils.py +++ b/src/transformers/benchmark/benchmark_utils.py @@ -14,14 +14,14 @@ from abc import ABC, abstractmethod from collections import defaultdict, namedtuple from datetime import datetime -from multiprocessing import Pipe, Process +from multiprocessing import Pipe, Process, Queue from multiprocessing.connection import Connection from typing import Callable, Iterable, List, NamedTuple, Optional, Union from transformers import AutoConfig, PretrainedConfig from transformers import __version__ as version -from ..file_utils import is_tf_available, is_torch_available, is_torch_tpu_available +from ..file_utils import is_psutil_available, is_py3nvml_available, is_tf_available, is_torch_available from .benchmark_args_utils import BenchmarkArguments @@ -31,6 +31,11 @@ if is_tf_available(): from tensorflow.python.eager import context as tf_context +if is_psutil_available(): + import psutil + +if is_py3nvml_available(): + import py3nvml.py3nvml as nvml if platform.system() == "Windows": from signal import CTRL_C_EVENT as SIGKILL @@ -56,6 +61,45 @@ ) +def separate_process_wrapper_fn(func: Callable[[], None], do_multi_processing: bool) -> Callable[[], None]: + """ + This function wraps another function into its own separated process. + In order to ensure accurate memory measurements it is important that the function + is executed in a separate process + + Args: + - `func`: (`callable`): function() -> ... + generic function which will be executed in its own separate process + - `do_multi_processing`: (`bool`) + Whether to run function on separate process or not + """ + + def multi_process_func(*args, **kwargs): + # run function in an individual + # process to get correct memory + def wrapper_func(queue: Queue, *args): + try: + result = func(*args) + except Exception as e: + logger.error(e) + print(e) + result = "N/A" + queue.put(result) + + queue = Queue() + p = Process(target=wrapper_func, args=[queue] + list(args)) + p.start() + result = queue.get() + p.join() + return result + + if do_multi_processing: + logging.info("fFunction {func} is executed in its own process...") + return multi_process_func + else: + return func + + def is_memory_tracing_enabled(): global _is_memory_tracing_enabled return _is_memory_tracing_enabled @@ -136,7 +180,7 @@ class MemorySummary(NamedTuple): MemoryTrace = List[UsedMemoryState] -def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5) -> int: +def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5, device_idx=None) -> int: """ measures peak cpu memory consumption of a given `function` running the function for at least interval seconds @@ -148,16 +192,38 @@ def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5) -> int: - `function`: (`callable`): function() -> ... 
function without any arguments to measure for which to measure the peak memory - - `interval`: (`float`) + - `interval`: (`float`, `optional`, defaults to `0.5`) interval in second for which to measure the memory usage + - `device_idx`: (`int`, `optional`, defaults to `None`) + device id for which to measure gpu usage + Returns: - `max_memory`: (`int`) cosumed memory peak in Bytes """ - try: - import psutil - except (ImportError): + + def get_cpu_memory(process_id: int) -> int: + """ + measures current cpu memory usage of a given `process_id` + + Args: + - `process_id`: (`int`) + process_id for which to measure memory + + Returns + - `memory`: (`int`) + cosumed memory in Bytes + """ + process = psutil.Process(process_id) + try: + meminfo_attr = "memory_info" if hasattr(process, "memory_info") else "get_memory_info" + memory = getattr(process, meminfo_attr)()[0] + except psutil.AccessDenied: + raise ValueError("Error with Psutil.") + return memory + + if not is_psutil_available(): logger.warning( "Psutil not installed, we won't log CPU memory usage. " "Install Psutil (pip install psutil) to use CPU memory tracing." @@ -165,26 +231,6 @@ def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5) -> int: max_memory = "N/A" else: - def _get_memory(process_id: int) -> int: - """ - measures current cpu memory usage of a given `process_id` - - Args: - - `process_id`: (`int`) - process_id for which to measure memory - - Returns - - `memory`: (`int`) - cosumed memory in Bytes - """ - process = psutil.Process(process_id) - try: - meminfo_attr = "memory_info" if hasattr(process, "memory_info") else "get_memory_info" - memory = getattr(process, meminfo_attr)()[0] - except psutil.AccessDenied: - raise ValueError("Error with Psutil.") - return memory - class MemoryMeasureProcess(Process): """ @@ -198,13 +244,13 @@ def __init__(self, process_id: int, child_connection: Connection, interval: floa self.interval = interval self.connection = child_connection self.num_measurements = 1 - self.mem_usage = _get_memory(process_id) + self.mem_usage = get_cpu_memory(self.process_id) def run(self): self.connection.send(0) stop = False while True: - self.mem_usage = max(self.mem_usage, _get_memory(self.process_id)) + self.mem_usage = max(self.mem_usage, get_cpu_memory(self.process_id)) self.num_measurements += 1 if stop: @@ -296,34 +342,31 @@ def start_memory_tracing( - 'line_text' (string): Text of the line in the python script """ - try: - import psutil - except (ImportError): + if is_psutil_available(): + process = psutil.Process(os.getpid()) + else: logger.warning( "Psutil not installed, we won't log CPU memory usage. " "Install psutil (pip install psutil) to use CPU memory tracing." ) process = None - else: - process = psutil.Process(os.getpid()) - - try: - from py3nvml import py3nvml - py3nvml.nvmlInit() - devices = list(range(py3nvml.nvmlDeviceGetCount())) if gpus_to_trace is None else gpus_to_trace - py3nvml.nvmlShutdown() - except ImportError: + if is_py3nvml_available(): + try: + nvml.nvmlInit() + devices = list(range(nvml.nvmlDeviceGetCount())) if gpus_to_trace is None else gpus_to_trace + nvml.nvmlShutdown() + except (OSError, nvml.NVMLError): + logger.warning("Error while initializing comunication with GPU. " "We won't perform GPU memory tracing.") + log_gpu = False + else: + log_gpu = is_torch_available() or is_tf_available() + else: logger.warning( "py3nvml not installed, we won't log GPU memory usage. " "Install py3nvml (pip install py3nvml) to use GPU memory tracing." 
) log_gpu = False - except (OSError, py3nvml.NVMLError): - logger.warning("Error while initializing comunication with GPU. " "We won't perform GPU memory tracing.") - log_gpu = False - else: - log_gpu = is_torch_available() or is_tf_available() memory_trace = [] @@ -385,14 +428,14 @@ def traceit(frame, event, args): tf_context.context()._clear_caches() # See https://github.com/tensorflow/tensorflow/issues/20218#issuecomment-416771802 # Sum used memory for all GPUs - py3nvml.nvmlInit() + nvml.nvmlInit() for i in devices: - handle = py3nvml.nvmlDeviceGetHandleByIndex(i) - meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle) + handle = nvml.nvmlDeviceGetHandleByIndex(i) + meminfo = nvml.nvmlDeviceGetMemoryInfo(handle) gpu_mem += meminfo.used - py3nvml.nvmlShutdown() + nvml.nvmlShutdown() mem_state = UsedMemoryState(traced_state, cpu_mem, gpu_mem) memory_trace.append(mem_state) @@ -522,7 +565,6 @@ class Benchmark(ABC): def __init__(self, args: BenchmarkArguments = None, configs: PretrainedConfig = None): self.args = args - if configs is None: self.config_dict = { model_name: AutoConfig.from_pretrained(model_name) for model_name in self.args.model_names @@ -530,6 +572,11 @@ def __init__(self, args: BenchmarkArguments = None, configs: PretrainedConfig = else: self.config_dict = {model_name: config for model_name, config in zip(self.args.model_names, configs)} + if not self.args.no_memory and os.getenv("TRANSFORMERS_USE_MULTIPROCESSING") == 0: + logger.warning( + "Memory consumption will not be measured accurately if `args.no_multi_process` is set to `True.` The flag 'TRANSFORMERS_USE_MULTIPROCESSING' should only be disabled for debugging / testing." + ) + self._print_fn = None self._framework_version = None self._environment_info = None @@ -541,7 +588,7 @@ def print_fn(self): def print_and_log(*args): with open(self.args.log_filename, "a") as log_file: - log_file.write(str(*args) + "\n") + log_file.write("".join(args) + "\n") print(*args) self._print_fn = print_and_log @@ -550,26 +597,42 @@ def print_and_log(*args): return self._print_fn @property - def is_gpu(self): - return self.args.n_gpu > 0 + @abstractmethod + def framework_version(self): + pass - @property - def is_tpu(self): - return is_torch_tpu_available() and not self.args.no_tpu + @abstractmethod + def _inference_speed(self, model_name: str, batch_size: int, sequence_length: int) -> float: + pass - @property @abstractmethod - def framework_version(self): + def _train_speed(self, model_name: str, batch_size: int, sequence_length: int) -> float: pass @abstractmethod - def train(self, model_name, batch_size, sequence_length): + def _inference_memory( + self, model_name: str, batch_size: int, sequence_length: int + ) -> [Memory, Optional[MemorySummary]]: pass @abstractmethod - def inference(self, model_name, batch_size, sequence_length): + def _train_memory( + self, model_name: str, batch_size: int, sequence_length: int + ) -> [Memory, Optional[MemorySummary]]: pass + def inference_speed(self, *args, **kwargs) -> float: + return separate_process_wrapper_fn(self._inference_speed, self.args.do_multi_processing)(*args, **kwargs) + + def train_speed(self, *args, **kwargs) -> float: + return separate_process_wrapper_fn(self._train_speed, self.args.do_multi_processing)(*args, **kwargs) + + def inference_memory(self, *args, **kwargs) -> [Memory, Optional[MemorySummary]]: + return separate_process_wrapper_fn(self._inference_memory, self.args.do_multi_processing)(*args, **kwargs) + + def train_memory(self, *args, **kwargs) -> [Memory, 
Optional[MemorySummary]]: + return separate_process_wrapper_fn(self._train_memory, self.args.do_multi_processing)(*args, **kwargs) + def run(self): result_dict = {model_name: {} for model_name in self.args.model_names} inference_result_time = copy.deepcopy(result_dict) @@ -596,64 +659,60 @@ def run(self): for sequence_length in self.args.sequence_lengths: if not self.args.no_inference: if not self.args.no_memory: - memory, inference_summary = self.inference( - model_name, batch_size, sequence_length, trace_memory=True - ) + memory, inference_summary = self.inference_memory(model_name, batch_size, sequence_length) inference_result_memory[model_name]["result"][batch_size][sequence_length] = memory if not self.args.no_speed: - time = self.inference(model_name, batch_size, sequence_length, trace_memory=False) + time = self.inference_speed(model_name, batch_size, sequence_length) inference_result_time[model_name]["result"][batch_size][sequence_length] = time if self.args.training: if not self.args.no_memory: - memory, train_summary = self.train( - model_name, batch_size, sequence_length, trace_memory=True - ) + memory, train_summary = self.train_memory(model_name, batch_size, sequence_length) train_result_memory[model_name]["result"][batch_size][sequence_length] = memory if not self.args.no_speed: - time = self.inference(model_name, batch_size, sequence_length, trace_memory=False) + time = self.train_speed(model_name, batch_size, sequence_length) train_result_time[model_name]["result"][batch_size][sequence_length] = time if not self.args.no_inference: if not self.args.no_speed: - self.print_fn("======= INFERENCE - SPEED - RESULT =======") - self.print_results(inference_result_time) + self.print_fn("\n" + 20 * "=" + ("INFERENCE - SPEED - RESULT").center(40) + 20 * "=") + self.print_results(inference_result_time, type_label="Time in s") self.save_to_csv(inference_result_time, self.args.inference_time_csv_file) - if self.is_tpu: + if self.args.is_tpu: self.print_fn( "TPU was used for inference. Note that the time after compilation stabilized (after ~10 inferences model.forward(..) calls) was measured." ) if not self.args.no_memory: - self.print_fn("======= INFERENCE - MEMORY - RESULT =======") - self.print_results(inference_result_memory) + self.print_fn("\n" + 20 * "=" + ("INFERENCE - MEMORY - RESULT").center(40) + 20 * "=") + self.print_results(inference_result_memory, type_label="Memory in MB") self.save_to_csv(inference_result_memory, self.args.inference_memory_csv_file) if self.args.trace_memory_line_by_line: - self.print_fn("======= INFERENCE - MEMORY LINE BY LINE TRACE - SUMMARY =======") + self.print_fn("\n" + 20 * "=" + ("INFERENCE - MEMOMRY - LINE BY LINE - SUMMARY").center(40) + 20 * "=") self.print_memory_trace_statistics(inference_summary) if self.args.training: if not self.args.no_speed: - self.print_fn("======= TRAIN - SPEED - RESULT =======") - self.print_results(train_result_time) + self.print_fn("\n" + 20 * "=" + ("TRAIN - SPEED - RESULTS").center(40) + 20 * "=") + self.print_results(train_result_time, "Time in s") self.save_to_csv(train_result_time, self.args.train_time_csv_file) - if self.is_tpu: + if self.args.is_tpu: self.print_fn( "TPU was used for training. Note that the time after compilation stabilized (after ~10 train loss=model.forward(...) + loss.backward() calls) was measured." 
) if not self.args.no_memory: - self.print_fn("======= TRAIN - MEMORY - RESULT =======") - self.print_results(train_result_memory) + self.print_fn("\n" + 20 * "=" + ("TRAIN - MEMORY - RESULTS").center(40) + 20 * "=") + self.print_results(train_result_memory, type_label="Memory in MB") self.save_to_csv(train_result_memory, self.args.train_memory_csv_file) if self.args.trace_memory_line_by_line: - self.print_fn("======= TRAIN - MEMORY LINE BY LINE TRACE - SUMMARY =======") + self.print_fn("\n" + 20 * "=" + ("TRAIN - MEMOMRY - LINE BY LINE - SUMMARY").center(40) + 20 * "=") self.print_memory_trace_statistics(train_summary) if not self.args.no_env_print: - self.print_fn("\n======== ENVIRONMENT - INFORMATION ========") + self.print_fn("\n" + 20 * "=" + ("ENVIRONMENT INFORMATION").center(40) + 20 * "=") self.print_fn( "\n".join(["- {}: {}".format(prop, val) for prop, val in self.environment_info.items()]) + "\n" ) @@ -681,6 +740,9 @@ def environment_info(self): info["framework"] = self.framework if self.framework == "PyTorch": info["use_torchscript"] = self.args.torchscript + if self.framework == "Tensorflow": + info["eager_mode"] = self.args.eager_mode + info["use_xla"] = self.args.use_xla info["framework_version"] = self.framework_version info["python_version"] = platform.python_version() info["system"] = platform.system() @@ -688,27 +750,30 @@ def environment_info(self): info["architecture"] = platform.architecture()[0] info["date"] = datetime.date(datetime.now()) info["time"] = datetime.time(datetime.now()) + info["fp16"] = self.args.fp16 + info["use_multiprocessing"] = self.args.do_multi_processing - try: - import psutil - except (ImportError): + if is_psutil_available(): + info["cpu_ram_mb"] = bytes_to_mega_bytes(psutil.virtual_memory().total) + else: logger.warning( "Psutil not installed, we won't log available CPU memory." "Install psutil (pip install psutil) to log available CPU memory." ) info["cpu_ram_mb"] = "N/A" - else: - info["cpu_ram_mb"] = bytes_to_mega_bytes(psutil.virtual_memory().total) - info["use_gpu"] = self.is_gpu - if self.is_gpu: - info["num_gpus"] = self.args.n_gpu - try: - from py3nvml import py3nvml - - py3nvml.nvmlInit() - handle = py3nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx) - except ImportError: + info["use_gpu"] = self.args.is_gpu + if self.args.is_gpu: + info["num_gpus"] = 1 # TODO(PVP) Currently only single GPU is supported + if is_py3nvml_available(): + nvml.nvmlInit() + handle = nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx) + info["gpu"] = nvml.nvmlDeviceGetName(handle) + info["gpu_ram_mb"] = bytes_to_mega_bytes(nvml.nvmlDeviceGetMemoryInfo(handle).total) + info["gpu_power_watts"] = nvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000 + info["gpu_performance_state"] = nvml.nvmlDeviceGetPerformanceState(handle) + nvml.nvmlShutdown() + else: logger.warning( "py3nvml not installed, we won't log GPU memory usage. " "Install py3nvml (pip install py3nvml) to log information about GPU." @@ -717,41 +782,35 @@ def environment_info(self): info["gpu_ram_mb"] = "N/A" info["gpu_power_watts"] = "N/A" info["gpu_performance_state"] = "N/A" - except (OSError, py3nvml.NVMLError): - logger.warning( - "Error while initializing comunication with GPU. " "We won't log information about GPU." 
- ) - info["gpu"] = "N/A" - info["gpu_ram_mb"] = "N/A" - info["gpu_power_watts"] = "N/A" - info["gpu_performance_state"] = "N/A" - py3nvml.nvmlShutdown() - else: - info["gpu"] = py3nvml.nvmlDeviceGetName(handle) - info["gpu_ram_mb"] = bytes_to_mega_bytes(py3nvml.nvmlDeviceGetMemoryInfo(handle).total) - info["gpu_power_watts"] = py3nvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000 - info["gpu_performance_state"] = py3nvml.nvmlDeviceGetPerformanceState(handle) - py3nvml.nvmlShutdown() - info["use_tpu"] = self.is_tpu + info["use_tpu"] = self.args.is_tpu # TODO(PVP): See if we can add more information about TPU # see: https://github.com/pytorch/xla/issues/2180 self._environment_info = info return self._environment_info - def print_results(self, result_dict): + def print_results(self, result_dict, type_label): + self.print_fn(80 * "-") + self.print_fn( + "Model Name".center(30) + "Batch Size".center(15) + "Seq Length".center(15) + type_label.center(15) + ) + self.print_fn(80 * "-") for model_name in self.args.model_names: - self.print_fn("\t" + f"======= MODEL CHECKPOINT: {model_name} =======") for batch_size in result_dict[model_name]["bs"]: for sequence_length in result_dict[model_name]["ss"]: result = result_dict[model_name]["result"][batch_size][sequence_length] if isinstance(result, float): - self.print_fn( - f"\t\t{model_name}/{batch_size}/{sequence_length}: " f"{(round(1000 * result) / 1000)}s" - ) + result = round(1000 * result) / 1000 + result = "< 0.001" if result == 0.0 else str(result) else: - self.print_fn(f"\t\t{model_name}/{batch_size}/{sequence_length}: " f"{result} MB") + result = str(result) + self.print_fn( + model_name.center(30) + str(batch_size).center(15), + str(sequence_length).center(15), + result.center(15), + ) + self.print_fn(80 * "-") def print_memory_trace_statistics(self, summary: MemorySummary): self.print_fn( diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index 433c77ae5addce..a2af66955632b4 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -81,6 +81,31 @@ _torch_tpu_available = False +try: + import psutil # noqa: F401 + + _psutil_available = True + +except ImportError: + _psutil_available = False + + +try: + import py3nvml # noqa: F401 + + _py3nvml_available = True + +except ImportError: + _py3nvml_available = False + + +try: + from apex import amp # noqa: F401 + + _has_apex = True +except ImportError: + _has_apex = False + default_cache_path = os.path.join(torch_cache_home, "transformers") @@ -115,6 +140,18 @@ def is_torch_tpu_available(): return _torch_tpu_available +def is_psutil_available(): + return _psutil_available + + +def is_py3nvml_available(): + return _py3nvml_available + + +def is_apex_available(): + return _has_apex + + def add_start_docstrings(*docstr): def docstring_decorator(fn): fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 7777f9fa7b233f..13381dbfe71957 100644 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -20,23 +20,16 @@ from tqdm.auto import tqdm, trange from .data.data_collator import DataCollator, default_data_collator +from .file_utils import is_apex_available, is_torch_tpu_available from .modeling_utils import PreTrainedModel from .optimization import AdamW, get_linear_schedule_with_warmup from .trainer_utils import PREFIX_CHECKPOINT_DIR, EvalPrediction, PredictionOutput, TrainOutput, is_wandb_available -from .training_args import 
TrainingArguments, is_torch_tpu_available +from .training_args import TrainingArguments -try: +if is_apex_available(): from apex import amp - _has_apex = True -except ImportError: - _has_apex = False - - -def is_apex_available(): - return _has_apex - if is_torch_tpu_available(): import torch_xla.core.xla_model as xm diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index b891582c500ab4..bb20af47493add 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -5,7 +5,7 @@ from transformers import AutoConfig, is_torch_available -from .utils import require_torch +from .utils import require_torch, torch_device if is_torch_available(): @@ -26,7 +26,12 @@ def check_results_dict_not_empty(self, results): def test_inference_no_configs(self): MODEL_ID = "sshleifer/tiny-gpt2" benchmark_args = PyTorchBenchmarkArguments( - models=[MODEL_ID], training=False, no_inference=False, sequence_lengths=[8], batch_sizes=[1] + models=[MODEL_ID], + training=False, + no_inference=False, + sequence_lengths=[8], + batch_sizes=[1], + no_multi_process=True, ) benchmark = PyTorchBenchmark(benchmark_args) results = benchmark.run() @@ -42,6 +47,24 @@ def test_inference_torchscript(self): torchscript=True, sequence_lengths=[8], batch_sizes=[1], + no_multi_process=True, + ) + benchmark = PyTorchBenchmark(benchmark_args) + results = benchmark.run() + self.check_results_dict_not_empty(results.time_inference_result) + self.check_results_dict_not_empty(results.memory_inference_result) + + @unittest.skipIf(torch_device == "cpu", "Cant do half precision") + def test_inference_fp16(self): + MODEL_ID = "sshleifer/tiny-gpt2" + benchmark_args = PyTorchBenchmarkArguments( + models=[MODEL_ID], + training=False, + no_inference=False, + fp16=True, + sequence_lengths=[8], + batch_sizes=[1], + no_multi_process=True, ) benchmark = PyTorchBenchmark(benchmark_args) results = benchmark.run() @@ -51,7 +74,29 @@ def test_inference_torchscript(self): def test_train_no_configs(self): MODEL_ID = "sshleifer/tiny-gpt2" benchmark_args = PyTorchBenchmarkArguments( - models=[MODEL_ID], training=True, no_inference=True, sequence_lengths=[8], batch_sizes=[1] + models=[MODEL_ID], + training=True, + no_inference=True, + sequence_lengths=[8], + batch_sizes=[1], + no_multi_process=True, + ) + benchmark = PyTorchBenchmark(benchmark_args) + results = benchmark.run() + self.check_results_dict_not_empty(results.time_train_result) + self.check_results_dict_not_empty(results.memory_train_result) + + @unittest.skipIf(torch_device == "cpu", "Cant do half precision") + def test_train_no_configs_fp16(self): + MODEL_ID = "sshleifer/tiny-gpt2" + benchmark_args = PyTorchBenchmarkArguments( + models=[MODEL_ID], + training=True, + no_inference=True, + sequence_lengths=[8], + batch_sizes=[1], + fp16=True, + no_multi_process=True, ) benchmark = PyTorchBenchmark(benchmark_args) results = benchmark.run() @@ -62,7 +107,12 @@ def test_inference_with_configs(self): MODEL_ID = "sshleifer/tiny-gpt2" config = AutoConfig.from_pretrained(MODEL_ID) benchmark_args = PyTorchBenchmarkArguments( - models=[MODEL_ID], training=False, no_inference=False, sequence_lengths=[8], batch_sizes=[1] + models=[MODEL_ID], + training=False, + no_inference=False, + sequence_lengths=[8], + batch_sizes=[1], + no_multi_process=True, ) benchmark = PyTorchBenchmark(benchmark_args, configs=[config]) results = benchmark.run() @@ -73,7 +123,12 @@ def test_inference_encoder_decoder_with_configs(self): MODEL_ID = "sshleifer/tinier_bart" config = AutoConfig.from_pretrained(MODEL_ID) 
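The no_multi_process=True flag used throughout these tests disables the separate-process measurement provided by separate_process_wrapper_fn. A minimal standalone sketch of that pattern, with a stand-in workload in place of a model call:

import multiprocessing as mp

def expensive_call():
    # stand-in workload; the benchmark would run a model forward/backward pass here
    return sum(i * i for i in range(100_000))

def _child(queue, fn):
    # run the measured function in a fresh process so its allocations
    # cannot pollute the parent's memory statistics
    try:
        queue.put(fn())
    except Exception as exc:
        print(exc)
        queue.put("N/A")  # same fallback value the wrapper in this diff returns

def run_isolated(fn):
    queue = mp.Queue()
    process = mp.Process(target=_child, args=(queue, fn))
    process.start()
    result = queue.get()
    process.join()
    return result

if __name__ == "__main__":
    print(run_isolated(expensive_call))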
benchmark_args = PyTorchBenchmarkArguments( - models=[MODEL_ID], training=False, no_inference=False, sequence_lengths=[8], batch_sizes=[1] + models=[MODEL_ID], + training=False, + no_inference=False, + sequence_lengths=[8], + batch_sizes=[1], + no_multi_process=True, ) benchmark = PyTorchBenchmark(benchmark_args, configs=[config]) results = benchmark.run() @@ -81,26 +136,15 @@ def test_inference_encoder_decoder_with_configs(self): self.check_results_dict_not_empty(results.memory_inference_result) def test_train_with_configs(self): - MODEL_ID = "sshleifer/tiny-gpt2" - config = AutoConfig.from_pretrained(MODEL_ID) - benchmark_args = PyTorchBenchmarkArguments( - models=[MODEL_ID], training=True, no_inference=True, sequence_lengths=[8], batch_sizes=[1] - ) - benchmark = PyTorchBenchmark(benchmark_args, configs=[config]) - results = benchmark.run() - self.check_results_dict_not_empty(results.time_train_result) - self.check_results_dict_not_empty(results.memory_train_result) - - def test_train_with_configs_torchscript(self): MODEL_ID = "sshleifer/tiny-gpt2" config = AutoConfig.from_pretrained(MODEL_ID) benchmark_args = PyTorchBenchmarkArguments( models=[MODEL_ID], training=True, no_inference=True, - torchscript=True, sequence_lengths=[8], batch_sizes=[1], + no_multi_process=True, ) benchmark = PyTorchBenchmark(benchmark_args, configs=[config]) results = benchmark.run() @@ -111,7 +155,12 @@ def test_train_encoder_decoder_with_configs(self): MODEL_ID = "sshleifer/tinier_bart" config = AutoConfig.from_pretrained(MODEL_ID) benchmark_args = PyTorchBenchmarkArguments( - models=[MODEL_ID], training=True, no_inference=True, sequence_lengths=[8], batch_sizes=[1] + models=[MODEL_ID], + training=True, + no_inference=True, + sequence_lengths=[8], + batch_sizes=[1], + no_multi_process=True, ) benchmark = PyTorchBenchmark(benchmark_args, configs=[config]) results = benchmark.run() @@ -133,6 +182,7 @@ def test_save_csv_files(self): inference_memory_csv_file=os.path.join(tmp_dir, "inf_mem.csv"), train_time_csv_file=os.path.join(tmp_dir, "train_time.csv"), env_info_csv_file=os.path.join(tmp_dir, "env.csv"), + no_multi_process=True, ) benchmark = PyTorchBenchmark(benchmark_args) benchmark.run() @@ -161,6 +211,7 @@ def _check_summary_is_not_empty(summary): log_filename=os.path.join(tmp_dir, "log.txt"), log_print=True, trace_memory_line_by_line=True, + no_multi_process=True, ) benchmark = PyTorchBenchmark(benchmark_args) result = benchmark.run() diff --git a/tests/test_benchmark_tf.py b/tests/test_benchmark_tf.py new file mode 100644 index 00000000000000..b23ff51e509849 --- /dev/null +++ b/tests/test_benchmark_tf.py @@ -0,0 +1,165 @@ +import os +import tempfile +import unittest +from pathlib import Path + +from transformers import AutoConfig, is_tf_available + +from .utils import require_tf + + +if is_tf_available(): + import tensorflow as tf + from transformers import TensorflowBenchmark, TensorflowBenchmarkArguments + + +@require_tf +class TFBenchmarkTest(unittest.TestCase): + def check_results_dict_not_empty(self, results): + for model_result in results.values(): + for batch_size, sequence_length in zip(model_result["bs"], model_result["ss"]): + result = model_result["result"][batch_size][sequence_length] + self.assertIsNotNone(result) + + def test_inference_no_configs_eager(self): + MODEL_ID = "sshleifer/tiny-gpt2" + benchmark_args = TensorflowBenchmarkArguments( + models=[MODEL_ID], + training=False, + no_inference=False, + sequence_lengths=[8], + batch_sizes=[1], + eager_mode=True, + no_multi_process=True, + 
) + benchmark = TensorflowBenchmark(benchmark_args) + results = benchmark.run() + self.check_results_dict_not_empty(results.time_inference_result) + self.check_results_dict_not_empty(results.memory_inference_result) + + def test_inference_no_configs_graph(self): + MODEL_ID = "sshleifer/tiny-gpt2" + benchmark_args = TensorflowBenchmarkArguments( + models=[MODEL_ID], + training=False, + no_inference=False, + sequence_lengths=[8], + batch_sizes=[1], + no_multi_process=True, + ) + benchmark = TensorflowBenchmark(benchmark_args) + results = benchmark.run() + self.check_results_dict_not_empty(results.time_inference_result) + self.check_results_dict_not_empty(results.memory_inference_result) + + def test_inference_with_configs_eager(self): + MODEL_ID = "sshleifer/tiny-gpt2" + config = AutoConfig.from_pretrained(MODEL_ID) + benchmark_args = TensorflowBenchmarkArguments( + models=[MODEL_ID], + training=False, + no_inference=False, + sequence_lengths=[8], + batch_sizes=[1], + eager_mode=True, + no_multi_process=True, + ) + benchmark = TensorflowBenchmark(benchmark_args, [config]) + results = benchmark.run() + self.check_results_dict_not_empty(results.time_inference_result) + self.check_results_dict_not_empty(results.memory_inference_result) + + def test_inference_with_configs_graph(self): + MODEL_ID = "sshleifer/tiny-gpt2" + config = AutoConfig.from_pretrained(MODEL_ID) + benchmark_args = TensorflowBenchmarkArguments( + models=[MODEL_ID], + training=False, + no_inference=False, + sequence_lengths=[8], + batch_sizes=[1], + no_multi_process=True, + ) + benchmark = TensorflowBenchmark(benchmark_args, [config]) + results = benchmark.run() + self.check_results_dict_not_empty(results.time_inference_result) + self.check_results_dict_not_empty(results.memory_inference_result) + + def test_inference_encoder_decoder_with_configs(self): + MODEL_ID = "patrickvonplaten/t5-tiny-random" + config = AutoConfig.from_pretrained(MODEL_ID) + benchmark_args = TensorflowBenchmarkArguments( + models=[MODEL_ID], + training=False, + no_inference=False, + sequence_lengths=[8], + batch_sizes=[1], + no_multi_process=True, + ) + benchmark = TensorflowBenchmark(benchmark_args, configs=[config]) + results = benchmark.run() + self.check_results_dict_not_empty(results.time_inference_result) + self.check_results_dict_not_empty(results.memory_inference_result) + + @unittest.skipIf(is_tf_available() and len(tf.config.list_physical_devices("GPU")) == 0, "Cannot do xla on CPU.") + def test_inference_no_configs_xla(self): + MODEL_ID = "sshleifer/tiny-gpt2" + benchmark_args = TensorflowBenchmarkArguments( + models=[MODEL_ID], + training=False, + no_inference=False, + sequence_lengths=[8], + batch_sizes=[1], + use_xla=True, + no_multi_process=True, + ) + benchmark = TensorflowBenchmark(benchmark_args) + results = benchmark.run() + self.check_results_dict_not_empty(results.time_inference_result) + self.check_results_dict_not_empty(results.memory_inference_result) + + def test_save_csv_files(self): + MODEL_ID = "sshleifer/tiny-gpt2" + with tempfile.TemporaryDirectory() as tmp_dir: + benchmark_args = TensorflowBenchmarkArguments( + models=[MODEL_ID], + no_inference=False, + save_to_csv=True, + sequence_lengths=[8], + batch_sizes=[1], + inference_time_csv_file=os.path.join(tmp_dir, "inf_time.csv"), + inference_memory_csv_file=os.path.join(tmp_dir, "inf_mem.csv"), + env_info_csv_file=os.path.join(tmp_dir, "env.csv"), + no_multi_process=True, + ) + benchmark = TensorflowBenchmark(benchmark_args) + benchmark.run() + 
self.assertTrue(Path(os.path.join(tmp_dir, "inf_time.csv")).exists()) + self.assertTrue(Path(os.path.join(tmp_dir, "inf_mem.csv")).exists()) + self.assertTrue(Path(os.path.join(tmp_dir, "env.csv")).exists()) + + def test_trace_memory(self): + MODEL_ID = "sshleifer/tiny-gpt2" + + def _check_summary_is_not_empty(summary): + self.assertTrue(hasattr(summary, "sequential")) + self.assertTrue(hasattr(summary, "cumulative")) + self.assertTrue(hasattr(summary, "current")) + self.assertTrue(hasattr(summary, "total")) + + with tempfile.TemporaryDirectory() as tmp_dir: + benchmark_args = TensorflowBenchmarkArguments( + models=[MODEL_ID], + no_inference=False, + sequence_lengths=[8], + batch_sizes=[1], + log_filename=os.path.join(tmp_dir, "log.txt"), + log_print=True, + trace_memory_line_by_line=True, + eager_mode=True, + no_multi_process=True, + ) + benchmark = TensorflowBenchmark(benchmark_args) + result = benchmark.run() + _check_summary_is_not_empty(result.inference_summary) + self.assertTrue(Path(os.path.join(tmp_dir, "log.txt")).exists())
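
The file_utils.py hunk above adds is_psutil_available(), is_py3nvml_available(), and is_apex_available(), and trainer.py now routes its apex import through the new helper instead of keeping its own try/except. Below is a minimal sketch, not part of the diff, of how downstream code could use these guards; the print statements are purely illustrative.

    from transformers import is_apex_available, is_psutil_available, is_py3nvml_available

    # Mirror the trainer.py pattern: only import apex when it is installed.
    if is_apex_available():
        from apex import amp  # noqa: F401

    # py3nvml backs the GPU memory/power queries shown in the benchmark hunk above;
    # psutil is the optional dependency used for process memory measurement.
    print("psutil available:", is_psutil_available())
    print("py3nvml available:", is_py3nvml_available())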
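
The new tests/test_benchmark_tf.py drives TensorflowBenchmark through eager, graph, and XLA modes, CSV export, and line-by-line memory tracing. A minimal sketch of running the same eager-mode configuration outside the test suite, using the tiny checkpoint the tests rely on:

    from transformers import TensorflowBenchmark, TensorflowBenchmarkArguments

    args = TensorflowBenchmarkArguments(
        models=["sshleifer/tiny-gpt2"],
        training=False,
        no_inference=False,
        sequence_lengths=[8],
        batch_sizes=[1],
        eager_mode=True,        # eager execution, as in test_inference_no_configs_eager
        no_multi_process=True,  # measure in the current process, as every test here does
    )
    results = TensorflowBenchmark(args).run()
    print(results.time_inference_result)
    print(results.memory_inference_result)

The PyTorch classes take the same core arguments; the new test_inference_fp16 and test_train_no_configs_fp16 additionally pass fp16=True to PyTorchBenchmarkArguments, and the reworked print_results renders each result as a row of model name, batch size, and sequence length.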