From 211cb6f082c5cc3c482e37d70234142a8fda2db3 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 20 Mar 2019 15:20:37 -0700 Subject: [PATCH] Disable WMT in preparation for rewrite (#254) PiperOrigin-RevId: 239486187 --- docs/datasets.md | 170 ---------------------- tensorflow_datasets/translate/wmt.py | 1 + tensorflow_datasets/translate/wmt_ende.py | 1 + tensorflow_datasets/translate/wmt_enfr.py | 1 + 4 files changed, 3 insertions(+), 170 deletions(-) diff --git a/docs/datasets.md b/docs/datasets.md index eab005aface..f43865c165e 100644 --- a/docs/datasets.md +++ b/docs/datasets.md @@ -65,8 +65,6 @@ np_datasets = tfds.as_numpy(datasets) * [`"flores_translate_neen"`](#flores_translate_neen) * [`"flores_translate_sien"`](#flores_translate_sien) * [`"ted_multi_translate"`](#ted_multi_translate) - * [`"wmt_translate_ende"`](#wmt_translate_ende) - * [`"wmt_translate_enfr"`](#wmt_translate_enfr) * [`video`](#video) * [`"bair_robot_pushing_small"`](#bair_robot_pushing_small) * [`"moving_mnist"`](#moving_mnist) @@ -2122,174 +2120,6 @@ VALIDATION | 6,049 --- -### `"wmt_translate_ende"` - -Translate dataset based on the data from statmt.org. - - -* URL: [http://www.statmt.org/wmt18/](http://www.statmt.org/wmt18/) -* `DatasetBuilder`: [`tfds.translate.wmt_ende.WmtTranslateEnde`](https://github.com/tensorflow/datasets/tree/master/tensorflow_datasets/translate/wmt_ende.py) - -`wmt_translate_ende` is configured with `tfds.translate.wmt_ende.WMTConfig` and has the following -configurations predefined (defaults to the first one): - -* `"ende_plain_text_t2t"` (`v0.0.2`) (`Size: 1.60 GiB`): Translation dataset from en to de, uses encoder plain_text. It uses the following data files (see the code for exact contents): {"dev": ["wmt17_newstest13"], "train": ["wmt18_news_commentary_ende", "wmt13_commoncrawl_ende", "wmt13_europarl_ende"]}. - -* `"ende_subwords8k_t2t"` (`v0.0.2`) (`Size: 1.60 GiB`): Translation dataset from en to de, uses encoder subwords8k. It uses the following data files (see the code for exact contents): {"dev": ["wmt17_newstest13"], "train": ["wmt18_news_commentary_ende", "wmt13_commoncrawl_ende", "wmt13_europarl_ende"]}. - - -#### `"wmt_translate_ende/ende_plain_text_t2t"` - -```python -Translation({ - 'de': Text(shape=(), dtype=tf.string, encoder=None), - 'en': Text(shape=(), dtype=tf.string, encoder=None), -}) -``` - - - -#### `"wmt_translate_ende/ende_subwords8k_t2t"` - -```python -Translation({ - 'de': Text(shape=(None,), dtype=tf.int64, encoder=), - 'en': Text(shape=(None,), dtype=tf.int64, encoder=), -}) -``` - - - - -#### Statistics -Split | Examples -:----- | ---: -ALL | 4,595,289 -TRAIN | 4,592,289 -VALIDATION | 3,000 - - -#### Urls - * [http://www.statmt.org/wmt18/](http://www.statmt.org/wmt18/) - -#### Supervised keys (for `as_supervised=True`) -`(u'en', u'de')` - -#### Citation -``` -@InProceedings{bojar-EtAl:2018:WMT1, - author = {Bojar, Ond {r}ej and Federmann, Christian and Fishel, Mark - and Graham, Yvette and Haddow, Barry and Huck, Matthias and - Koehn, Philipp and Monz, Christof}, - title = {Findings of the 2018 Conference on Machine Translation (WMT18)}, - booktitle = {Proceedings of the Third Conference on Machine Translation, - Volume 2: Shared Task Papers}, - month = {October}, - year = {2018}, - address = {Belgium, Brussels}, - publisher = {Association for Computational Linguistics}, - pages = {272--307}, - url = {http://www.aclweb.org/anthology/W18-6401} -} -``` - ---- - -### `"wmt_translate_enfr"` - -Translate dataset based on the data from statmt.org. - - -* URL: [http://www.statmt.org/wmt18/](http://www.statmt.org/wmt18/) -* `DatasetBuilder`: [`tfds.translate.wmt_enfr.WmtTranslateEnfr`](https://github.com/tensorflow/datasets/tree/master/tensorflow_datasets/translate/wmt_enfr.py) - -`wmt_translate_enfr` is configured with `tfds.translate.wmt_enfr.WMTConfig` and has the following -configurations predefined (defaults to the first one): - -* `"enfr_plain_text_t2t_small"` (`v0.0.2`) (`Size: ?? GiB`): Translation dataset from en to fr, uses encoder plain_text. It uses the following data files (see the code for exact contents): {"dev": ["opennmt_1M_enfr_valid"], "train": ["opennmt_1M_enfr_train"]}. - -* `"enfr_subwords8k_t2t_small"` (`v0.0.2`) (`Size: ?? GiB`): Translation dataset from en to fr, uses encoder subwords8k. It uses the following data files (see the code for exact contents): {"dev": ["opennmt_1M_enfr_valid"], "train": ["opennmt_1M_enfr_train"]}. - -* `"enfr_plain_text_t2t_large"` (`v0.0.2`) (`Size: ?? GiB`): Translation dataset from en to fr, uses encoder plain_text. It uses the following data files (see the code for exact contents): {"dev": ["wmt17_newstest13"], "train": ["wmt13_commoncrawl_enfr", "wmt13_europarl_enfr", "wmt14_news_commentary_enfr", "wmt13_undoc_enfr"]}. - -* `"enfr_subwords8k_t2t_large"` (`v0.0.2`) (`Size: ?? GiB`): Translation dataset from en to fr, uses encoder subwords8k. It uses the following data files (see the code for exact contents): {"dev": ["wmt17_newstest13"], "train": ["wmt13_commoncrawl_enfr", "wmt13_europarl_enfr", "wmt14_news_commentary_enfr", "wmt13_undoc_enfr"]}. - - -#### `"wmt_translate_enfr/enfr_plain_text_t2t_small"` - -```python -Translation({ - 'en': Text(shape=(), dtype=tf.string, encoder=None), - 'fr': Text(shape=(), dtype=tf.string, encoder=None), -}) -``` - - - -#### `"wmt_translate_enfr/enfr_subwords8k_t2t_small"` - -```python -Translation({ - 'en': Text(shape=(), dtype=tf.string, encoder=None), - 'fr': Text(shape=(), dtype=tf.string, encoder=None), -}) -``` - - - -#### `"wmt_translate_enfr/enfr_plain_text_t2t_large"` - -```python -Translation({ - 'en': Text(shape=(), dtype=tf.string, encoder=None), - 'fr': Text(shape=(), dtype=tf.string, encoder=None), -}) -``` - - - -#### `"wmt_translate_enfr/enfr_subwords8k_t2t_large"` - -```python -Translation({ - 'en': Text(shape=(), dtype=tf.string, encoder=None), - 'fr': Text(shape=(), dtype=tf.string, encoder=None), -}) -``` - - - - -#### Statistics -None computed - -#### Urls - * [http://www.statmt.org/wmt18/](http://www.statmt.org/wmt18/) - -#### Supervised keys (for `as_supervised=True`) -`(u'en', u'fr')` - -#### Citation -``` -@InProceedings{bojar-EtAl:2018:WMT1, - author = {Bojar, Ond {r}ej and Federmann, Christian and Fishel, Mark - and Graham, Yvette and Haddow, Barry and Huck, Matthias and - Koehn, Philipp and Monz, Christof}, - title = {Findings of the 2018 Conference on Machine Translation (WMT18)}, - booktitle = {Proceedings of the Third Conference on Machine Translation, - Volume 2: Shared Task Papers}, - month = {October}, - year = {2018}, - address = {Belgium, Brussels}, - publisher = {Association for Computational Linguistics}, - pages = {272--307}, - url = {http://www.aclweb.org/anthology/W18-6401} -} -``` - ---- - - ## [`video`](#video) ### `"bair_robot_pushing_small"` diff --git a/tensorflow_datasets/translate/wmt.py b/tensorflow_datasets/translate/wmt.py index faf9ad0d2d2..1c3bf41fff9 100644 --- a/tensorflow_datasets/translate/wmt.py +++ b/tensorflow_datasets/translate/wmt.py @@ -104,6 +104,7 @@ def __init__(self, class WmtTranslate(tfds.core.GeneratorBasedBuilder): """WMT translation dataset.""" _URL = "http://www.statmt.org/wmt18/" + IN_DEVELOPMENT = True @abc.abstractproperty def translate_datasets(self): diff --git a/tensorflow_datasets/translate/wmt_ende.py b/tensorflow_datasets/translate/wmt_ende.py index a08ac017b33..360c5a18eca 100644 --- a/tensorflow_datasets/translate/wmt_ende.py +++ b/tensorflow_datasets/translate/wmt_ende.py @@ -61,6 +61,7 @@ class WmtTranslateEnde(wmt.WmtTranslate): """WMT English-German translation dataset.""" + IN_DEVELOPMENT = True BUILDER_CONFIGS = [ wmt.WMTConfig( diff --git a/tensorflow_datasets/translate/wmt_enfr.py b/tensorflow_datasets/translate/wmt_enfr.py index dcc4fc83e87..8c0f9d0e8b3 100644 --- a/tensorflow_datasets/translate/wmt_enfr.py +++ b/tensorflow_datasets/translate/wmt_enfr.py @@ -90,6 +90,7 @@ class WmtTranslateEnfr(wmt.WmtTranslate): """English-French WMT translation dataset.""" + IN_DEVELOPMENT = True BUILDER_CONFIGS = [ # EN-FR translations (matching the data used by Tensor2Tensor library).