diff --git a/paddlenlp/datasets/__init__.py b/paddlenlp/datasets/__init__.py index 0bff4b4cc222f..75737cc09483a 100644 --- a/paddlenlp/datasets/__init__.py +++ b/paddlenlp/datasets/__init__.py @@ -31,3 +31,4 @@ from .wmt14ende import * from .couplet import * from .yahoo_answer_100k import * +from .imdb import * \ No newline at end of file diff --git a/paddlenlp/datasets/imdb.py b/paddlenlp/datasets/imdb.py index 89c4e04b83acb..0c05dca011590 100644 --- a/paddlenlp/datasets/imdb.py +++ b/paddlenlp/datasets/imdb.py @@ -30,16 +30,25 @@ class Imdb(DatasetBuilder): """ + Subsets of IMDb data are available for access to customers for personal and non-commercial use. + Each dataset is contained in a gzipped, tab-separated-values (TSV) formatted file in the UTF-8 character set. + The first line in each file contains headers that describe what is in each column. Implementation of `IMDB <https://www.imdb.com/interfaces/>`_ dataset. """ URL = 'https://dataset.bj.bcebos.com/imdb%2FaclImdb_v1.tar.gz' MD5 = '7c2ac02c03563afcf9b574c7e56c153a' + META_INFO = collections.namedtuple('META_INFO', ('data_dir', 'md5')) + SPLITS = { + 'train': META_INFO(os.path.join('aclImdb', 'train'), None), + 'test': META_INFO(os.path.join('aclImdb', 'test'), None), + } def _get_data(self, mode, **kwargs): """Downloads dataset.""" default_root = os.path.join(DATA_HOME, self.__class__.__name__) - data_dir = os.path.join(default_root, "aclImdb", mode) + filename, _ = self.SPLITS[mode] + data_dir = os.path.join(default_root, filename) if not os.path.exists(data_dir): path = get_path_from_url(self.URL, default_root, self.MD5) return data_dir diff --git a/tests/dataset/experimental/__init__.py b/tests/dataset/experimental/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/tests/dataset/experimental/test_imdb.py b/tests/dataset/experimental/test_imdb.py deleted file mode 100644 index 33f3ea0a03bb8..0000000000000 --- a/tests/dataset/experimental/test_imdb.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 
2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import numpy as np -import os -import unittest -from paddlenlp.datasets import load_dataset - -from common_test import CpuCommonTest -import util -import unittest - - -def get_examples(mode='train'): - examples = { - 'train': - ('I loved this movie since I was 7 and I saw it on the opening day ' - 'It was so touching and beautiful I strongly recommend seeing for ' - 'all Its a movie to watch with your family by farbr br My MPAA rating ' - 'PG13 for thematic elements prolonged scenes of disastor nuditysexuality ' - 'and some language', 1), - 'test': - ('Felix in Hollywood is a great film The version I viewed was very well ' - 'restored which is sometimes a problem with these silent era animated films ' - 'It has some of Hollywoods most famous stars making cameo animated ' - 'appearances A must for any silent film or animation enthusiast', 1) - } - return examples[mode] - - -class TestImdbTrainSet(CpuCommonTest): - def setUp(self): - self.config['path'] = 'imdb' - self.config['splits'] = 'train' - - def test_train_set(self): - expected_len = 25000 - expected_text, expected_label = get_examples(self.config['splits']) - train_ds = load_dataset(**self.config) - self.check_output_equal(len(train_ds), expected_len) - self.check_output_equal(expected_text, train_ds[36]['text']) - self.check_output_equal(expected_label, train_ds[36]['label']) - - -class 
TestImdbTestSet(CpuCommonTest): - def setUp(self): - self.config['path'] = 'imdb' - self.config['splits'] = 'test' - - def test_test_set(self): - expected_len = 25000 - expected_text, expected_label = get_examples(self.config['splits']) - test_ds = load_dataset(**self.config) - self.check_output_equal(len(test_ds), expected_len) - self.check_output_equal(expected_text, test_ds[23]['text']) - self.check_output_equal(expected_label, test_ds[23]['label']) - - -class TestImdbTrainTestSet(CpuCommonTest): - def setUp(self): - self.config['path'] = 'imdb' - self.config['splits'] = ['train', 'test'] - - def test_train_set(self): - expected_ds_num = 2 - expected_len = 25000 - expected_train_text, expected_train_label = get_examples('train') - expected_test_text, expected_test_label = get_examples('test') - ds = load_dataset(**self.config) - - self.check_output_equal(len(ds), expected_ds_num) - self.check_output_equal(len(ds[0]), expected_len) - self.check_output_equal(len(ds[1]), expected_len) - - self.check_output_equal(expected_train_text, ds[0][36]['text']) - self.check_output_equal(expected_train_label, ds[0][36]['label']) - self.check_output_equal(expected_test_text, ds[1][23]['text']) - self.check_output_equal(expected_test_label, ds[1][23]['label']) - - -class TestImdbNoSplitDataFiles(CpuCommonTest): - def setUp(self): - self.config['path'] = 'imdb' - - @util.assert_raises - def test_no_split_datafiles(self): - load_dataset(**self.config) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/dataset/test_imdb.py b/tests/dataset/test_imdb.py index 817de5f0e7c44..1842643ca42d2 100644 --- a/tests/dataset/test_imdb.py +++ b/tests/dataset/test_imdb.py @@ -11,65 +11,90 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import numpy as np import os import unittest -from paddlenlp.datasets import Imdb +from paddlenlp.datasets import load_dataset from common_test import CpuCommonTest import util import unittest +def get_examples(mode='train'): + examples = { + 'train': + ('I loved this movie since I was 7 and I saw it on the opening day ' + 'It was so touching and beautiful I strongly recommend seeing for ' + 'all Its a movie to watch with your family by farbr br My MPAA rating ' + 'PG13 for thematic elements prolonged scenes of disastor nuditysexuality ' + 'and some language', 1), + 'test': + ('Felix in Hollywood is a great film The version I viewed was very well ' + 'restored which is sometimes a problem with these silent era animated films ' + 'It has some of Hollywoods most famous stars making cameo animated ' + 'appearances A must for any silent film or animation enthusiast', 1) + } + return examples[mode] + + class TestImdbTrainSet(CpuCommonTest): def setUp(self): - self.config['mode'] = 'train' - np.random.seed(102) - - def test_training_set(self): - expected_text, expected_label = ( - 'Its a good movie maybe I like it because it was filmed here ' - 'in PR The actors did a good performance and not only did the ' - 'girls be girlish but they were good in fighting so it was awsome ' - 'The guy is cute too so its a good match if you want to the guy ' - 'or the girls', 1) - expected_len = 25000 + self.config['path_or_read_func'] = 'imdb' + self.config['splits'] = 'train' - train_ds = Imdb(**self.config) + def test_train_set(self): + expected_len = 25000 + expected_text, expected_label = get_examples(self.config['splits']) + train_ds = load_dataset(**self.config) self.check_output_equal(len(train_ds), expected_len) - self.check_output_equal(expected_text, train_ds[14][0]) - self.check_output_equal(expected_label, train_ds[14][1]) + self.check_output_equal(expected_text, train_ds[36]['text']) + self.check_output_equal(expected_label, train_ds[36]['label']) class 
TestImdbTestSet(CpuCommonTest): def setUp(self): - self.config['mode'] = 'test' - np.random.seed(102) + self.config['path_or_read_func'] = 'imdb' + self.config['splits'] = 'test' def test_test_set(self): - expected_text, expected_label = ( - 'This is one of the great ones It works so beautifully that ' - 'you hardly notice the miscasting of then 37 year old Dana ' - 'Andrews as the drugstore soda jerk who goes to war and comes ' - 'back four years later when he would have been at most 25 But ' - 'then who else should have played him', 1) expected_len = 25000 - - test_ds = Imdb(**self.config) + expected_text, expected_label = get_examples(self.config['splits']) + test_ds = load_dataset(**self.config) self.check_output_equal(len(test_ds), expected_len) - self.check_output_equal(expected_text, test_ds[2][0]) - self.check_output_equal(expected_label, test_ds[2][1]) + self.check_output_equal(expected_text, test_ds[23]['text']) + self.check_output_equal(expected_label, test_ds[23]['label']) + + +class TestImdbTrainTestSet(CpuCommonTest): + def setUp(self): + self.config['path_or_read_func'] = 'imdb' + self.config['splits'] = ['train', 'test'] + + def test_train_set(self): + expected_ds_num = 2 + expected_len = 25000 + expected_train_text, expected_train_label = get_examples('train') + expected_test_text, expected_test_label = get_examples('test') + ds = load_dataset(**self.config) + + self.check_output_equal(len(ds), expected_ds_num) + self.check_output_equal(len(ds[0]), expected_len) + self.check_output_equal(len(ds[1]), expected_len) + + self.check_output_equal(expected_train_text, ds[0][36]['text']) + self.check_output_equal(expected_train_label, ds[0][36]['label']) + self.check_output_equal(expected_test_text, ds[1][23]['text']) + self.check_output_equal(expected_test_label, ds[1][23]['label']) -class TestImdbWrongMode(CpuCommonTest): +class TestImdbNoSplitDataFiles(CpuCommonTest): def setUp(self): - # valid mode is 'train' and 'test', wrong mode would raise an error 
- self.config['mode'] = 'wrong' + self.config['path_or_read_func'] = 'imdb' @util.assert_raises - def test_wrong_set(self): - Imdb(**self.config) + def test_no_split_datafiles(self): + load_dataset(**self.config) if __name__ == "__main__":