From 4c2d41478ca78ce282de3d1e738e97fdc40d9379 Mon Sep 17 00:00:00 2001
From: Prakash Sellathurai
Date: Mon, 11 Apr 2022 18:28:54 +0530
Subject: [PATCH 01/21] adds split_dataset utility to dataset_utils.py

---
 keras/utils/dataset_utils.py | 122 +++++++++++++++++++++++++++++++++++
 1 file changed, 122 insertions(+)

diff --git a/keras/utils/dataset_utils.py b/keras/utils/dataset_utils.py
index c8074df9ab0..e167253f35b 100644
--- a/keras/utils/dataset_utils.py
+++ b/keras/utils/dataset_utils.py
@@ -14,14 +14,136 @@
 # ==============================================================================
 """Keras image dataset loading utilities."""
+
 import tensorflow.compat.v2 as tf
 # pylint: disable=g-classes-have-attributes
 import multiprocessing
 import os
+import time
 import numpy as np
+from tensorflow.python.util.tf_export import keras_export
+
+@keras_export('keras.utils.split_dataset')
+def split_dataset(dataset,
+                  left_size=None,
+                  right_size=None,
+                  shuffle=False,
+                  seed=None):
+  """Split a dataset into a left half and a right half (e.g. training / validation).
+
+  Args:
+    dataset: A `tf.data.Dataset` object or
+      a list/tuple of arrays with the same length.
+    left_size: If float, it should be in the range `[0, 1]` and signifies
+      the fraction of the data to pack in the left dataset. If integer, it
+      signifies the number of samples to pack in the left dataset.
+      If `None`, it defaults to the complement of `right_size`.
+    right_size: If float, it should be in the range `[0, 1]` and signifies
+      the fraction of the data to pack in the right dataset. If integer, it
+      signifies the number of samples to pack in the right dataset.
+      If `None`, it defaults to the complement of `left_size`.
+    shuffle: Boolean, whether to shuffle the data before splitting it.
+    seed: A random seed for shuffling.
+
+  Returns:
+    A tuple of two `tf.data.Dataset` objects: the left and right splits.
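+
+  Example: with the implementation above, a 10-element dataset and
+  `left_size=0.8` (no shuffling) puts the first 8 samples in the left
+  split and the remaining 2 in the right split.
+
+  >>> dataset = tf.data.Dataset.range(10)
+  >>> left_ds, right_ds = tf.keras.utils.split_dataset(dataset, left_size=0.8)
+  >>> len(list(left_ds)), len(list(right_ds))
+  (8, 2)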
+ """ + + if dataset and not isinstance(dataset,(tf.data.Dataset,list)): + raise TypeError('`dataset` must be either a tf.data.Dataset object' + 'or a list/tuple of arrays',f'Got {type(dataset)}') + + dataset_as_list = [] + data_size_warning_flag = False + start_time = time.time() + + for datum in list(dataset): + cur_time = time.time() + # warns user if the dataset is too large to iterate within 10s + if int(cur_time - start_time) > 10 and not data_size_warning_flag: + Warning('Takes too long to process the dataset,' + 'the utility is only meant for small datasets that fit in memory') + data_size_warning_flag = True + dataset_as_list.append(datum) + + if shuffle: + if seed: + np.random.seed(seed) + np.random.shuffle(dataset) + + total_size = len(dataset_as_list) + + if right_size is None and left_size is None: + raise ValueError('Both `left_size` and `right_size`cannot be `None`' + ' either one of them must specified') + + # if left_size is None, it defaults to the complement to right_size + # raises error if right_size not in range [0, 1] or [0, total_size] + elif left_size is None and right_size: + if type(right_size) == float: + if right_size < 0 or right_size > 1: + raise ValueError('`right_size` must be in range `[0, 1]`') + right_size = int(right_size*total_size) + left_size = total_size - right_size + elif type(right_size) == int: + if right_size < 0 or right_size > total_size: + raise ValueError('`right_size` must be in range `[0, 'f'{total_size}]`') + left_size = total_size - right_size + + # if right_size is None, it defaults to the complement to left_size + # raise error if left_size not in range [0, 1] or [0, total_size] + elif left_size and right_size is None: + if type(left_size) == float: + if left_size < 0 or left_size > 1: + raise ValueError('`left_size` must be in range `[0, 1]`') + left_size = int(left_size*total_size) + right_size = total_size - left_size + elif type(left_size) == int: + if left_size < 0 or left_size > total_size: + raise ValueError('`left_size` must be in range `[0, 'f'{total_size}]`') + right_size = total_size - left_size + + # if both left and right sizes are specified, + # raise error if they are not in range [0, 1] or [0, total_size] + elif left_size and right_size: + if type(left_size) == int and type(right_size) == int: + if left_size < 0 or left_size > total_size: + raise ValueError('`left_size` must be in range `[0, 'f'{total_size}]`') + elif right_size < 0 or right_size > total_size: + raise ValueError('`right_size` must be in range `[0, 'f'{total_size}]`') + elif left_size + right_size != total_size: + raise ValueError('The sum of `left_size` and `right_size`' + 'must be equal to the total size of the dataset.') + elif type(left_size) == float and type(right_size) == float: + if left_size < 0 or left_size > 1: + raise ValueError('`left_size` must be in range `[0, 1]`') + elif right_size < 0 or right_size > 1: + raise ValueError('`right_size` must be in range `[0, 1]`') + elif left_size + right_size != 1: + raise ValueError('The sum of `left_size` and `right_size`' + 'must be equal to 1.') + left_size = int(left_size*total_size) + right_size = int(right_size*total_size) + else: + raise ValueError('`left_size` and `right_size` must be either ' + 'both floats or both integers.') + + left_dataset = dataset_as_list[:int(left_size)] + right_dataset = dataset_as_list[int(-1*right_size):] + + left_dataset = tf.data.Dataset.from_tensor_slices(left_dataset) + right_dataset = tf.data.Dataset.from_tensor_slices(right_dataset) + + left_dataset = 
left_dataset.prefetch(tf.data.AUTOTUNE) + right_dataset = right_dataset.prefetch(tf.data.AUTOTUNE) + + return left_dataset, right_dataset + def index_directory(directory, labels, From 231d90f9a4903b4bfebc8dd356ace341121b8054 Mon Sep 17 00:00:00 2001 From: Prakash Sellathurai Date: Mon, 11 Apr 2022 19:05:57 +0530 Subject: [PATCH 02/21] fixes random shuffle bug --- keras/utils/dataset_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/keras/utils/dataset_utils.py b/keras/utils/dataset_utils.py index e167253f35b..0fbd4fb269f 100644 --- a/keras/utils/dataset_utils.py +++ b/keras/utils/dataset_utils.py @@ -73,10 +73,11 @@ def split_dataset(dataset, if shuffle: if seed: np.random.seed(seed) - np.random.shuffle(dataset) + np.random.shuffle(dataset_as_list) total_size = len(dataset_as_list) + # if both left_size and right_size are None, raise error if right_size is None and left_size is None: raise ValueError('Both `left_size` and `right_size`cannot be `None`' ' either one of them must specified') From a127de7007fe49413bd9167e179f5df12b6c100e Mon Sep 17 00:00:00 2001 From: Prakash Sellathurai Date: Tue, 12 Apr 2022 01:32:08 +0530 Subject: [PATCH 03/21] fixes dataset slicing errors --- keras/utils/dataset_utils.py | 179 +++++++++++++++++++++-------------- 1 file changed, 106 insertions(+), 73 deletions(-) diff --git a/keras/utils/dataset_utils.py b/keras/utils/dataset_utils.py index 0fbd4fb269f..413c1ce04f2 100644 --- a/keras/utils/dataset_utils.py +++ b/keras/utils/dataset_utils.py @@ -15,12 +15,16 @@ """Keras image dataset loading utilities.""" + + import tensorflow.compat.v2 as tf # pylint: disable=g-classes-have-attributes import multiprocessing import os import time +import warnings +import math import numpy as np from tensorflow.python.util.tf_export import keras_export @@ -52,24 +56,36 @@ def split_dataset(dataset, Returns: A tuple of two `tf.data.Dataset` objects: the left and right splits. """ + # TODO (prakashsellathurai) : integrate unit test. 
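+  # Up-front validation: unsupported `dataset` containers and an
+  # all-`None` (`left_size`, `right_size`) pair are rejected below,
+  # before the dataset is iterated.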
- if dataset and not isinstance(dataset,(tf.data.Dataset,list)): + if not isinstance(dataset,(tf.data.Dataset,list,tuple)): raise TypeError('`dataset` must be either a tf.data.Dataset object' - 'or a list/tuple of arrays',f'Got {type(dataset)}') + 'or a list/tuple of arrays',f'Got {type(dataset)}') + if right_size is None and left_size is None: + raise ValueError('Both `left_size` and `right_size`cannot be `None`' + 'atleast specify one with valid value') + dataset_as_list = [] - data_size_warning_flag = False - start_time = time.time() - - for datum in list(dataset): - cur_time = time.time() - # warns user if the dataset is too large to iterate within 10s - if int(cur_time - start_time) > 10 and not data_size_warning_flag: - Warning('Takes too long to process the dataset,' - 'the utility is only meant for small datasets that fit in memory') - data_size_warning_flag = True - dataset_as_list.append(datum) - + + if isinstance(dataset,tf.data.Dataset): + data_size_warning_flag = False + start_time = time.time() + i = 0 + for datum in list(dataset): + cur_time = time.time() + # warns user if the dataset is too large to iterate within 10s + if int(cur_time - start_time) > 10 and not data_size_warning_flag: + warnings.warn('Takes too long time to process the `dataset`,' + 'this function is only for small datasets' + 'that fits within the memory') + data_size_warning_flag = True + dataset_as_list.append(datum) + elif isinstance(dataset,list): + dataset_as_list = dataset.copy() + elif isinstance(dataset,tuple): + dataset_as_list = list(zip(*dataset)) + if shuffle: if seed: np.random.seed(seed) @@ -77,65 +93,13 @@ def split_dataset(dataset, total_size = len(dataset_as_list) - # if both left_size and right_size are None, raise error - if right_size is None and left_size is None: - raise ValueError('Both `left_size` and `right_size`cannot be `None`' - ' either one of them must specified') - - # if left_size is None, it defaults to the complement to right_size - # raises error if right_size not in range [0, 1] or [0, total_size] - elif left_size is None and right_size: - if type(right_size) == float: - if right_size < 0 or right_size > 1: - raise ValueError('`right_size` must be in range `[0, 1]`') - right_size = int(right_size*total_size) - left_size = total_size - right_size - elif type(right_size) == int: - if right_size < 0 or right_size > total_size: - raise ValueError('`right_size` must be in range `[0, 'f'{total_size}]`') - left_size = total_size - right_size - - # if right_size is None, it defaults to the complement to left_size - # raise error if left_size not in range [0, 1] or [0, total_size] - elif left_size and right_size is None: - if type(left_size) == float: - if left_size < 0 or left_size > 1: - raise ValueError('`left_size` must be in range `[0, 1]`') - left_size = int(left_size*total_size) - right_size = total_size - left_size - elif type(left_size) == int: - if left_size < 0 or left_size > total_size: - raise ValueError('`left_size` must be in range `[0, 'f'{total_size}]`') - right_size = total_size - left_size - - # if both left and right sizes are specified, - # raise error if they are not in range [0, 1] or [0, total_size] - elif left_size and right_size: - if type(left_size) == int and type(right_size) == int: - if left_size < 0 or left_size > total_size: - raise ValueError('`left_size` must be in range `[0, 'f'{total_size}]`') - elif right_size < 0 or right_size > total_size: - raise ValueError('`right_size` must be in range `[0, 'f'{total_size}]`') - elif left_size + 
right_size != total_size: - raise ValueError('The sum of `left_size` and `right_size`' - 'must be equal to the total size of the dataset.') - elif type(left_size) == float and type(right_size) == float: - if left_size < 0 or left_size > 1: - raise ValueError('`left_size` must be in range `[0, 1]`') - elif right_size < 0 or right_size > 1: - raise ValueError('`right_size` must be in range `[0, 1]`') - elif left_size + right_size != 1: - raise ValueError('The sum of `left_size` and `right_size`' - 'must be equal to 1.') - left_size = int(left_size*total_size) - right_size = int(right_size*total_size) - else: - raise ValueError('`left_size` and `right_size` must be either ' - 'both floats or both integers.') - - left_dataset = dataset_as_list[:int(left_size)] - right_dataset = dataset_as_list[int(-1*right_size):] - + left_size,right_size = convert_dataset_split_sizes( + left_size,right_size,total_size + ) + + left_dataset = dataset_as_list[:left_size] + right_dataset = dataset_as_list[left_size:] + left_dataset = tf.data.Dataset.from_tensor_slices(left_dataset) right_dataset = tf.data.Dataset.from_tensor_slices(right_dataset) @@ -144,6 +108,75 @@ def split_dataset(dataset, return left_dataset, right_dataset +def convert_dataset_split_sizes(left_size,right_size,total_size): + """Helper function to convert left_size/right_size relative to dataset's size + """ + + left_size_type = type(left_size) + right_size_type = type(right_size) + + + if left_size is not None and left_size_type not in [int,float]: + raise ValueError(f'Invalid `left_size` type Got {left_size_type}' + 'It should be one of float,int or None') + if right_size is not None and right_size_type not in [int,float]: + raise ValueError(f'Invalid `right_size` type Got {right_size_type}' + 'It should be one of float,int or None') + + + if (left_size_type == int + and (left_size <= 0 or left_size>= total_size) + or left_size_type == float + and (left_size <= 0 or left_size>= 1) ): + raise ValueError('`left_size` should be either a positive integer' + f'and smaller than {total_size} or a float ' + 'within the range `[0, 1]`') + + if (right_size_type == int + and (right_size <= 0 or right_size>= total_size) + or right_size_type == float + and (right_size <= 0 or right_size>= 1)): + raise ValueError('`right_size` should be either a positive integer ' + f'and smaller than {total_size} or' + 'a float within the range `[0, 1]`') + + if right_size_type == left_size_type == float and right_size + left_size > 1: + raise ValueError('sum of `left_size` and `right_size`' + ' should be within `[0,1]`' + f'Got {right_size + left_size} ,' + 'reduce the `left_size` or `right_size`') + + if left_size_type == float: + left_size = math.ceil(left_size*total_size) + else: + left_size = float(left_size) + + if right_size_type == float: + right_size = math.ceil(right_size*total_size) + else: + right_size = float(right_size) + + + if left_size is None: + left_size = total_size - right_size + elif right_size is None: + right_size = total_size - left_size + + if left_size + right_size > total_size: + raise ValueError('The sum of `left_size` and `right_size`' + f' should be smaller than the samples {total_size} ' + ' reduce `left_size` or `right_size` ' ) + + + if left_size == 0: + raise ValueError(f'with dataset of length={total_size}' + '`left_size`={left_size} and `right_size`={right_size} ' + 'resulting left dataset split will be empty, ' + 'adjust any of the aforementioned parameters') + + left_size,right_size = int(left_size) ,int(right_size) + return 
left_size,right_size + def index_directory(directory, From b5bc434575aa3a42dde4553a18f45806949d58e6 Mon Sep 17 00:00:00 2001 From: Prakash Sellathurai Date: Wed, 13 Apr 2022 17:46:58 +0530 Subject: [PATCH 04/21] adds test file and Bazel config for dataset_utils.py --- keras/utils/BUILD | 13 ++++++++ keras/utils/dataset_utils_test.py | 52 +++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) create mode 100644 keras/utils/dataset_utils_test.py diff --git a/keras/utils/BUILD b/keras/utils/BUILD index 6800ebb5548..643d18ce476 100644 --- a/keras/utils/BUILD +++ b/keras/utils/BUILD @@ -301,6 +301,19 @@ tf_py_test( ], ) +tf_py_test( + name = "dataset_utils_test", + size = "small", + srcs = ["dataset_utils_test.py"], + python_version = "PY3", + deps = [ + "//:expect_absl_installed", + "//:expect_numpy_installed", + "//:expect_tensorflow_installed", + "//keras", + ], +) + tf_py_test( name = "data_utils_test", size = "medium", diff --git a/keras/utils/dataset_utils_test.py b/keras/utils/dataset_utils_test.py new file mode 100644 index 00000000000..0f6a6ff92d4 --- /dev/null +++ b/keras/utils/dataset_utils_test.py @@ -0,0 +1,52 @@ +"""Tests for dataset_utils.""" + +import tensorflow.compat.v2 as tf + +from keras.utils import dataset_utils + + +class TestSplitDataset(tf.test.TestCase): + + def test_invalid_dataset_cases(self): + with self.assertRaises(TypeError): + dataset_utils.split_dataset(dataset=None, left_size=5) + + with self.assertRaises(TypeError): + dataset_utils.split_dataset(dataset=1, left_size=5) + + with self.assertRaises(TypeError): + dataset_utils.split_dataset(dataset=float(1.2), left_size=5) + + with self.assertRaises(TypeError): + dataset_utils.split_dataset(dataset=dict({})) + + with self.assertRaises(TypeError): + dataset_utils.split_dataset(dataset=float('INF')) + + def test_valid_left_size_cases(self): + + dataset = [1,2,3] + splitted_dataset = dataset_utils.split_dataset(dataset, left_size=1,right_size=2) + assert(len(splitted_dataset) == 2) + left_dataset,right_dataset = splitted_dataset + self.assertEqual(len(left_dataset), 1) + self.assertEqual(len(right_dataset), 2) + self.assertEqual(list(left_dataset), [1]) + self.assertEqual(list(right_dataset), [2,3]) + + + def test_invalid_left_and_right_case(self): + with self.assertRaises(ValueError): + dataset_utils.split_dataset(dataset=[1,2,3], left_size=None) + + with self.assertRaises(ValueError): + dataset_utils.split_dataset([1,2,3], left_size=None,right_size=None) + + with self.assertRaises(ValueError): + dataset_utils.split_dataset([1,2,3], left_size=3,right_size=None) + + + + +if __name__ == "__main__": + tf.test.main() From f0e215987c51a4f0e98c96a726276465be2f3071 Mon Sep 17 00:00:00 2001 From: Prakash Sellathurai Date: Thu, 14 Apr 2022 00:03:29 +0530 Subject: [PATCH 05/21] add support for basic dataset inputs --- keras/utils/dataset_utils.py | 149 ++++++++++++++++++------------ keras/utils/dataset_utils_test.py | 125 +++++++++++++++++++++---- 2 files changed, 200 insertions(+), 74 deletions(-) diff --git a/keras/utils/dataset_utils.py b/keras/utils/dataset_utils.py index 413c1ce04f2..c2c1771f9a7 100644 --- a/keras/utils/dataset_utils.py +++ b/keras/utils/dataset_utils.py @@ -17,6 +17,7 @@ + import tensorflow.compat.v2 as tf # pylint: disable=g-classes-have-attributes @@ -24,7 +25,7 @@ import os import time import warnings -import math +from random import Random import numpy as np from tensorflow.python.util.tf_export import keras_export @@ -56,49 +57,31 @@ def split_dataset(dataset, Returns: A tuple of two 
`tf.data.Dataset` objects: the left and right splits. """ - # TODO (prakashsellathurai) : integrate unit test. if not isinstance(dataset,(tf.data.Dataset,list,tuple)): raise TypeError('`dataset` must be either a tf.data.Dataset object' - 'or a list/tuple of arrays',f'Got {type(dataset)}') + f' or a list/tuple of arrays. Received : {type(dataset)}') if right_size is None and left_size is None: - raise ValueError('Both `left_size` and `right_size`cannot be `None`' - 'atleast specify one with valid value') + raise ValueError('you must specify either `left_size` or `right_size`' + ' Received: `left_size`= None, and `right_size`=None') - dataset_as_list = [] - - if isinstance(dataset,tf.data.Dataset): - data_size_warning_flag = False - start_time = time.time() - i = 0 - for datum in list(dataset): - cur_time = time.time() - # warns user if the dataset is too large to iterate within 10s - if int(cur_time - start_time) > 10 and not data_size_warning_flag: - warnings.warn('Takes too long time to process the `dataset`,' - 'this function is only for small datasets' - 'that fits within the memory') - data_size_warning_flag = True - dataset_as_list.append(datum) - elif isinstance(dataset,list): - dataset_as_list = dataset.copy() - elif isinstance(dataset,tuple): - dataset_as_list = list(zip(*dataset)) + dataset_as_list = _convert_dataset_to_list(dataset) + + if seed is None: + seed = np.random.randint(1e6) if shuffle: - if seed: - np.random.seed(seed) - np.random.shuffle(dataset_as_list) + Random(seed).shuffle(dataset_as_list) - total_size = len(dataset_as_list) + total_length = len(dataset_as_list) - left_size,right_size = convert_dataset_split_sizes( - left_size,right_size,total_size + left_size,right_size = _rescale_dataset_split_sizes( + left_size,right_size,total_length ) left_dataset = dataset_as_list[:left_size] - right_dataset = dataset_as_list[left_size:] + right_dataset = dataset_as_list[-right_size:] left_dataset = tf.data.Dataset.from_tensor_slices(left_dataset) right_dataset = tf.data.Dataset.from_tensor_slices(right_dataset) @@ -108,71 +91,123 @@ def split_dataset(dataset, return left_dataset, right_dataset -def convert_dataset_split_sizes(left_size,right_size,total_size): - """Helper function to convert left_size/right_size relative to dataset's size +def _convert_dataset_to_list(dataset,data_size_warning_flag = True): + """Helper function to convert a tf.data.Dataset object or a list/tuple of numpy.ndarrays to a list + """ + + if isinstance(dataset,tuple): + dataset_iterator = list(zip(*dataset)) + elif isinstance(dataset,list): + dataset_iterator = dataset.copy() + elif isinstance(dataset,tf.data.Dataset): + dataset_iterator = list(dataset) + else: + raise TypeError('`dataset` must be either a tf.data.Dataset object' + f' or a list/tuple of arrays. Received : {type(dataset)}' + ) + + + dataset_as_list = [] + start_time = time.time() + i = 0 + for i,datum in enumerate(dataset_iterator): + if data_size_warning_flag: + if i % 10 == 0: + cur_time = time.time() + # warns user if the dataset is too large to iterate within 10s + if int(cur_time - start_time) > 10 and data_size_warning_flag: + warnings.warn('Takes too long time to process the `dataset`,' + 'this function is only for small datasets ' + '(e.g. < 10,000 samples).' 
+ ) + data_size_warning_flag = False + + dataset_as_list.append(datum) + + return dataset_as_list + +def _rescale_dataset_split_sizes(left_size,right_size,total_length): + """Helper function to rescale left_size/right_size args relative + to dataset's size """ left_size_type = type(left_size) right_size_type = type(right_size) + if ((left_size is not None and left_size_type not in [int,float]) and + (right_size is not None and right_size_type not in [int,float])): + raise TypeError('Invalid `left_size` and `right_size` Types. ' + 'Expected: integer or float or None. ' + f' Received: {left_size_type} and {right_size_type}') if left_size is not None and left_size_type not in [int,float]: - raise ValueError(f'Invalid `left_size` type Got {left_size_type}' - 'It should be one of float,int or None') + raise TypeError(f'Invalid `left_size` Type. Received: {left_size_type}. ' + ' Expected: int or float or None') + if right_size is not None and right_size_type not in [int,float]: - raise ValueError(f'Invalid `right_size` type Got {right_size_type}' - 'It should be one of float,int or None') + raise TypeError(f'Invalid `right_size` Type. Received: {right_size_type}.' + ' Expected: int or float or None') + if left_size == 0 and right_size == 0: + raise ValueError('Invalid `left_size` and `right_size` values. ' + 'You must specify either `left_size` or `right_size` with ' + f'value greater than 0 and less than {total_length} ' + 'or a float within range [0,1] to split the dataset' + f'Received: `left_size`={left_size}, ' + f'`right_size`={right_size}') if (left_size_type == int - and (left_size <= 0 or left_size>= total_size) + and (left_size <= 0 or left_size>= total_length) or left_size_type == float and (left_size <= 0 or left_size>= 1) ): - raise ValueError('`left_size` should be either a positive integer' - f'and smaller than {total_size} or a float ' - 'within the range `[0, 1]`') + raise ValueError('`left_size` should be either a positive integer ' + f'and smaller than {total_length} or a float ' + 'within the range `[0, 1]`. Received: left_size=' + f'{left_size}') if (right_size_type == int - and (right_size <= 0 or right_size>= total_size) + and (right_size <= 0 or right_size>= total_length) or right_size_type == float and (right_size <= 0 or right_size>= 1)): raise ValueError('`right_size` should be either a positive integer ' - f'and smaller than {total_size} or' - 'a float within the range `[0, 1]`') + f'and smaller than {total_length} or ' + 'a float within the range `[0, 1]`. Received: right_size=' + f'{right_size}') if right_size_type == left_size_type == float and right_size + left_size > 1: raise ValueError('sum of `left_size` and `right_size`' - ' should be within `[0,1]`' - f'Got {right_size + left_size} ,' + ' should be within `[0,1]`.' 
+ f'Received: {right_size + left_size} ,' 'reduce the `left_size` or `right_size`') if left_size_type == float: - left_size = math.ceil(left_size*total_size) + left_size = round(left_size*total_length) else: left_size = float(left_size) if right_size_type == float: - right_size = math.ceil(right_size*total_size) + right_size = round(right_size*total_length) else: right_size = float(right_size) if left_size is None: - left_size = total_size - right_size + left_size = total_length - right_size elif right_size is None: - right_size = total_size - left_size + right_size = total_length - left_size - if left_size + right_size > total_size: + if left_size + right_size > total_length: raise ValueError('The sum of `left_size` and `right_size`' - f' should be smaller than the samples {total_size} ' + f' should be smaller than the samples {total_length} ' ' reduce `left_size` or `right_size` ' ) - if left_size == 0: - raise ValueError(f'with dataset of length={total_size}' - '`left_size`={left_size} and `right_size`={right_size} ' - 'resulting left dataset split will be empty, ' - 'adjust any of the aforementioned parameters') + for split,side in [(left_size,'left'),(right_size,'right')]: + if split == 0: + raise ValueError(f'with dataset of length={total_length} ' + '`left_size`={left_size} and `right_size`={right_size}, ' + f'resulting {side} dataset split will be empty. ' + 'Adjust any of the aforementioned parameters') left_size,right_size = int(left_size) ,int(right_size) return left_size,right_size diff --git a/keras/utils/dataset_utils_test.py b/keras/utils/dataset_utils_test.py index 0f6a6ff92d4..45f98a95f33 100644 --- a/keras/utils/dataset_utils_test.py +++ b/keras/utils/dataset_utils_test.py @@ -2,48 +2,139 @@ import tensorflow.compat.v2 as tf +import numpy as np + from keras.utils import dataset_utils -class TestSplitDataset(tf.test.TestCase): +class SplitDatasetTest(tf.test.TestCase): - def test_invalid_dataset_cases(self): + def test_with_list_dataset(self): + dataset = [np.ones(shape=(10,10,10)) for _ in range(10)] + left_dataset,right_dataset = dataset_utils.split_dataset(dataset, + left_size=5, + right_size=5) + self.assertEqual(len(left_dataset), len(right_dataset)) + self.assertIsInstance(left_dataset, tf.data.Dataset) + self.assertIsInstance(left_dataset, tf.data.Dataset) + + dataset = [np.ones(shape=(10,10,10)) for _ in range(10)] + left_dataset,right_dataset = dataset_utils.split_dataset(dataset, + left_size=0.6, + right_size=0.4) + self.assertEqual(len(left_dataset), 6) + self.assertEqual(len(right_dataset), 4) + + + def test_with_tuple_dataset(self): + dataset = (np.ones(shape=(10,10,10)),np.zeros(shape=(10,10,10))) + left_dataset,right_dataset = dataset_utils.split_dataset(dataset, + left_size=0.75, + right_size=0.25) + self.assertLen(left_dataset, 8) + self.assertLen(right_dataset, 2) + + left_dataset,right_dataset = dataset_utils.split_dataset(dataset, + left_size=0.35, + right_size=0.65) + self.assertLen(left_dataset, 4) + self.assertLen(right_dataset, 6) + self.assertIsInstance(left_dataset, tf.data.Dataset) + self.assertIsInstance(right_dataset, tf.data.Dataset) + + + def test_with_invalid_dataset(self): with self.assertRaises(TypeError): - dataset_utils.split_dataset(dataset=None, left_size=5) - + dataset_utils.split_dataset(dataset=None, left_size=5) with self.assertRaises(TypeError): - dataset_utils.split_dataset(dataset=1, left_size=5) - + dataset_utils.split_dataset(dataset=1, left_size=5) with self.assertRaises(TypeError): - 
dataset_utils.split_dataset(dataset=float(1.2), left_size=5) - + dataset_utils.split_dataset(dataset=float(1.2), left_size=5) with self.assertRaises(TypeError): - dataset_utils.split_dataset(dataset=dict({})) - + dataset_utils.split_dataset(dataset=dict({}), left_size=5) with self.assertRaises(TypeError): - dataset_utils.split_dataset(dataset=float('INF')) + dataset_utils.split_dataset(dataset=float('INF'), left_size=5) - def test_valid_left_size_cases(self): + def test_with_valid_left_and_right_sizes(self): dataset = [1,2,3] - splitted_dataset = dataset_utils.split_dataset(dataset, left_size=1,right_size=2) + splitted_dataset = dataset_utils.split_dataset(dataset, + left_size=1, + right_size=2) assert(len(splitted_dataset) == 2) left_dataset,right_dataset = splitted_dataset self.assertEqual(len(left_dataset), 1) self.assertEqual(len(right_dataset), 2) self.assertEqual(list(left_dataset), [1]) self.assertEqual(list(right_dataset), [2,3]) - - def test_invalid_left_and_right_case(self): + + dataset = [1,2,3,4,5,6,7,8,9,10] + splitted_dataset = dataset_utils.split_dataset(dataset, + left_size=0.1, + right_size=0.9) + assert(len(splitted_dataset) == 2) + left_dataset,right_dataset = splitted_dataset + self.assertEqual(len(left_dataset), 1 ) + self.assertEqual(len(right_dataset), 9 ) + self.assertEqual(list(left_dataset), [1]) + self.assertEqual(list(right_dataset), [2,3,4,5,6,7,8,9,10]) + + dataset = [1,2,3,4,5,6,7,8,9,10] + splitted_dataset = dataset_utils.split_dataset(dataset, + left_size=2, + right_size=5) + assert(len(splitted_dataset) == 2) + left_dataset,right_dataset = splitted_dataset + self.assertEqual(len(left_dataset), 2 ) + self.assertEqual(len(right_dataset), 5 ) + self.assertEqual(list(left_dataset), [1,2]) + self.assertEqual(list(right_dataset), [6,7,8,9,10]) + + def test_with_float_left_and_right_sizes(self): + dataset = tf.data.Dataset.from_tensor_slices(np.array([[0.1,0.2,0.3], + [0.4,0.5,0.6], + [0.7,0.8,0.9]])) + left_dataset,right_dataset = dataset_utils.split_dataset(dataset, + left_size=0.8, + right_size=0.2) + self.assertEqual(len(left_dataset), 2) + self.assertEqual(len(right_dataset), 1) + + def test_with_invalid_float_left_and_right_sizes(self): with self.assertRaises(ValueError): - dataset_utils.split_dataset(dataset=[1,2,3], left_size=None) + dataset = [np.ones(shape=(200, 32,32)), np.zeros(shape=(200, 32,32))] + dataset_utils.split_dataset(dataset, left_size=0.8,right_size=0.2) + with self.assertRaises(ValueError): + dataset = [1] + dataset_utils.split_dataset(dataset, left_size=0.8,right_size=0.2) + + + def test_with_None_and_zero_left_and_right_size(self): + with self.assertRaises(ValueError): + dataset_utils.split_dataset(dataset=[1,2,3], left_size=None) with self.assertRaises(ValueError): dataset_utils.split_dataset([1,2,3], left_size=None,right_size=None) - with self.assertRaises(ValueError): dataset_utils.split_dataset([1,2,3], left_size=3,right_size=None) + with self.assertRaises(ValueError): + dataset_utils.split_dataset([1,2], left_size=3,right_size=None) + with self.assertRaises(ValueError): + dataset_utils.split_dataset([1,2], left_size=0,right_size=0) + + def test_with_invalid_left_and_right_size_types(self): + with self.assertRaises(TypeError): + dataset_utils.split_dataset([1,2], left_size='1',right_size='1') + with self.assertRaises(TypeError): + dataset_utils.split_dataset([1,2], left_size=0,right_size='1') + with self.assertRaises(TypeError): + dataset_utils.split_dataset([1,2], left_size='100',right_size=None) + with 
self.assertRaises(TypeError): + dataset_utils.split_dataset([1,2], right_size='1') + with self.assertRaises(TypeError): + dataset_utils.split_dataset([1,2], left_size=0.5,right_size='1') + From aa102b3f16a3f297e3d6a0ea25731c87aa9419a6 Mon Sep 17 00:00:00 2001 From: Prakash Sellathurai Date: Thu, 14 Apr 2022 02:45:03 +0530 Subject: [PATCH 06/21] adds support for list/tuples of arrays with tests --- keras/utils/dataset_utils.py | 38 +++++++++++--- keras/utils/dataset_utils_test.py | 85 +++++++++++++++++++++++++------ 2 files changed, 100 insertions(+), 23 deletions(-) diff --git a/keras/utils/dataset_utils.py b/keras/utils/dataset_utils.py index c2c1771f9a7..7eac4c87138 100644 --- a/keras/utils/dataset_utils.py +++ b/keras/utils/dataset_utils.py @@ -58,7 +58,7 @@ def split_dataset(dataset, A tuple of two `tf.data.Dataset` objects: the left and right splits. """ - if not isinstance(dataset,(tf.data.Dataset,list,tuple)): + if not isinstance(dataset,(tf.data.Dataset,list,tuple,np.ndarray)): raise TypeError('`dataset` must be either a tf.data.Dataset object' f' or a list/tuple of arrays. Received : {type(dataset)}') @@ -94,22 +94,44 @@ def split_dataset(dataset, def _convert_dataset_to_list(dataset,data_size_warning_flag = True): """Helper function to convert a tf.data.Dataset object or a list/tuple of numpy.ndarrays to a list """ - + # TODO prakashsellathurai: add support for list of tuples,tuples of nd array + # TODO prakashsellathurai: add support for Batched and unbatched tf datasets if isinstance(dataset,tuple): + if len(dataset) == 0: + raise ValueError('`dataset` must be a non-empty list/tuple of' + ' numpy.ndarrays or tf.data.Dataset objects.') dataset_iterator = list(zip(*dataset)) elif isinstance(dataset,list): - dataset_iterator = dataset.copy() + if len(dataset) == 0: + raise ValueError('`dataset` must be a non-empty list/tuple of' + ' numpy.ndarrays or tf.data.Dataset objects.') + if isinstance(dataset[0],np.ndarray): + dataset_iterator = list(zip(*dataset)) + else: + dataset_iterator = list(dataset) + + elif isinstance(dataset,np.ndarray): + dataset_iterator = list(dataset) elif isinstance(dataset,tf.data.Dataset): dataset_iterator = list(dataset) else: raise TypeError('`dataset` must be either a tf.data.Dataset object' f' or a list/tuple of arrays. Received : {type(dataset)}' - ) - + ) dataset_as_list = [] + try: + dataset_iterator = iter(dataset_iterator) + first_datum = next(dataset_iterator) + dataset_as_list.append(first_datum) + except ValueError: + raise ValueError('Received an empty Dataset i.e dataset with no elements. 
' + '`dataset` must be a non-empty list/tuple of' + ' numpy.ndarrays or tf.data.Dataset objects.') + + + start_time = time.time() - i = 0 for i,datum in enumerate(dataset_iterator): if data_size_warning_flag: if i % 10 == 0: @@ -182,12 +204,12 @@ def _rescale_dataset_split_sizes(left_size,right_size,total_length): if left_size_type == float: left_size = round(left_size*total_length) - else: + elif left_size_type == int: left_size = float(left_size) if right_size_type == float: right_size = round(right_size*total_length) - else: + elif right_size_type == int: right_size = float(right_size) diff --git a/keras/utils/dataset_utils_test.py b/keras/utils/dataset_utils_test.py index 45f98a95f33..3fc4c3f2818 100644 --- a/keras/utils/dataset_utils_test.py +++ b/keras/utils/dataset_utils_test.py @@ -9,6 +9,33 @@ class SplitDatasetTest(tf.test.TestCase): + def test_with_basic_dataset_values(self): + # numpy array + dataset=np.ones(shape=(200, 32)) + res = dataset_utils.split_dataset(dataset, left_size=0.8,right_size=0.2) + self.assertLen(res, 2) + left_dataset,right_dataset = res + self.assertIsInstance(left_dataset, tf.data.Dataset) + self.assertIsInstance(right_dataset, tf.data.Dataset) + + self.assertLen(left_dataset, 160) + self.assertLen(right_dataset, 40) + self.assertAllEqual(dataset[:160] ,np.array(list(left_dataset))) + self.assertAllEqual(dataset[-40:] ,np.array(list(right_dataset))) + + # list of numpy array + dataset=[np.ones(shape=(200, 32)), np.zeros(shape=(200, 32))] + res = dataset_utils.split_dataset(dataset, left_size=4) + self.assertLen(res, 2) + left_dataset,right_dataset = res + self.assertIsInstance(left_dataset, tf.data.Dataset) + self.assertIsInstance(right_dataset, tf.data.Dataset) + self.assertLen(left_dataset, 4) + self.assertLen(right_dataset, 196) + self.assertAllEqual(list(zip(*dataset))[:4] ,np.array(list(left_dataset))) + self.assertAllEqual(list(zip(*dataset))[4:] ,np.array(list(right_dataset))) + self.assertAllEqual(np.array(list(left_dataset)+list(right_dataset)), list(zip(*dataset))) + def test_with_list_dataset(self): dataset = [np.ones(shape=(10,10,10)) for _ in range(10)] left_dataset,right_dataset = dataset_utils.split_dataset(dataset, @@ -42,7 +69,6 @@ def test_with_tuple_dataset(self): self.assertIsInstance(left_dataset, tf.data.Dataset) self.assertIsInstance(right_dataset, tf.data.Dataset) - def test_with_invalid_dataset(self): with self.assertRaises(TypeError): dataset_utils.split_dataset(dataset=None, left_size=5) @@ -57,7 +83,7 @@ def test_with_invalid_dataset(self): def test_with_valid_left_and_right_sizes(self): - dataset = [1,2,3] + dataset = np.array([1,2,3]) splitted_dataset = dataset_utils.split_dataset(dataset, left_size=1, right_size=2) @@ -69,7 +95,36 @@ def test_with_valid_left_and_right_sizes(self): self.assertEqual(list(right_dataset), [2,3]) - dataset = [1,2,3,4,5,6,7,8,9,10] + dataset=np.ones(shape=(200, 32)) + res = dataset_utils.split_dataset(dataset, left_size=150,right_size=50) + self.assertLen(res, 2) + self.assertIsInstance(res[0], tf.data.Dataset) + self.assertIsInstance(res[1], tf.data.Dataset) + + self.assertLen(res[0], 150) + self.assertLen(res[1], 50) + + dataset=np.ones(shape=(200, 32)) + res = dataset_utils.split_dataset(dataset, left_size=120) + self.assertLen(res, 2) + self.assertIsInstance(res[0], tf.data.Dataset) + self.assertIsInstance(res[1], tf.data.Dataset) + + self.assertLen(res[0], 120) + self.assertLen(res[1], 80) + + + dataset=np.ones(shape=(10000, 16)) + res = dataset_utils.split_dataset(dataset, right_size=20) + 
self.assertLen(res, 2) + self.assertIsInstance(res[0], tf.data.Dataset) + self.assertIsInstance(res[1], tf.data.Dataset) + + self.assertLen(res[0], 9980) + self.assertLen(res[1], 20) + + + dataset = np.array([1,2,3,4,5,6,7,8,9,10]) splitted_dataset = dataset_utils.split_dataset(dataset, left_size=0.1, right_size=0.9) @@ -80,7 +135,7 @@ def test_with_valid_left_and_right_sizes(self): self.assertEqual(list(left_dataset), [1]) self.assertEqual(list(right_dataset), [2,3,4,5,6,7,8,9,10]) - dataset = [1,2,3,4,5,6,7,8,9,10] + dataset = np.array([1,2,3,4,5,6,7,8,9,10]) splitted_dataset = dataset_utils.split_dataset(dataset, left_size=2, right_size=5) @@ -104,7 +159,7 @@ def test_with_float_left_and_right_sizes(self): def test_with_invalid_float_left_and_right_sizes(self): with self.assertRaises(ValueError): dataset = [np.ones(shape=(200, 32,32)), np.zeros(shape=(200, 32,32))] - dataset_utils.split_dataset(dataset, left_size=0.8,right_size=0.2) + dataset_utils.split_dataset(dataset, left_size=1.5,right_size=0.2) with self.assertRaises(ValueError): dataset = [1] dataset_utils.split_dataset(dataset, left_size=0.8,right_size=0.2) @@ -113,27 +168,27 @@ def test_with_invalid_float_left_and_right_sizes(self): def test_with_None_and_zero_left_and_right_size(self): with self.assertRaises(ValueError): - dataset_utils.split_dataset(dataset=[1,2,3], left_size=None) + dataset_utils.split_dataset(dataset=np.array([1,2,3]), left_size=None) with self.assertRaises(ValueError): - dataset_utils.split_dataset([1,2,3], left_size=None,right_size=None) + dataset_utils.split_dataset(np.array([1,2,3]), left_size=None,right_size=None) with self.assertRaises(ValueError): - dataset_utils.split_dataset([1,2,3], left_size=3,right_size=None) + dataset_utils.split_dataset(np.array([1,2,3]), left_size=3,right_size=None) with self.assertRaises(ValueError): - dataset_utils.split_dataset([1,2], left_size=3,right_size=None) + dataset_utils.split_dataset(np.array([1,2,3]), left_size=3,right_size=None) with self.assertRaises(ValueError): - dataset_utils.split_dataset([1,2], left_size=0,right_size=0) + dataset_utils.split_dataset(np.array([1,2,3]), left_size=0,right_size=0) def test_with_invalid_left_and_right_size_types(self): with self.assertRaises(TypeError): - dataset_utils.split_dataset([1,2], left_size='1',right_size='1') + dataset_utils.split_dataset(np.array([1,2,3]), left_size='1',right_size='1') with self.assertRaises(TypeError): - dataset_utils.split_dataset([1,2], left_size=0,right_size='1') + dataset_utils.split_dataset(np.array([1,2,3]), left_size=0,right_size='1') with self.assertRaises(TypeError): - dataset_utils.split_dataset([1,2], left_size='100',right_size=None) + dataset_utils.split_dataset(np.array([1,2,3]), left_size='100',right_size=None) with self.assertRaises(TypeError): - dataset_utils.split_dataset([1,2], right_size='1') + dataset_utils.split_dataset(np.array([1,2,3]), right_size='1') with self.assertRaises(TypeError): - dataset_utils.split_dataset([1,2], left_size=0.5,right_size='1') + dataset_utils.split_dataset(np.array([1,2,3]), left_size=0.5,right_size='1') From f6a1bda81886a721413eb21a12fcbd69b3f14dfa Mon Sep 17 00:00:00 2001 From: Prakash Sellathurai Date: Thu, 14 Apr 2022 02:45:30 +0530 Subject: [PATCH 07/21] update dataset_utils.py --- keras/utils/dataset_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keras/utils/dataset_utils.py b/keras/utils/dataset_utils.py index 7eac4c87138..8bb1ceb07b1 100644 --- a/keras/utils/dataset_utils.py +++ b/keras/utils/dataset_utils.py @@ 
-94,7 +94,7 @@ def split_dataset(dataset, def _convert_dataset_to_list(dataset,data_size_warning_flag = True): """Helper function to convert a tf.data.Dataset object or a list/tuple of numpy.ndarrays to a list """ - # TODO prakashsellathurai: add support for list of tuples,tuples of nd array + # TODO prakashsellathurai: add failing test cases for list of tuples,tuples of nd array # TODO prakashsellathurai: add support for Batched and unbatched tf datasets if isinstance(dataset,tuple): if len(dataset) == 0: From 6937c383a5e06a653ef0d421cd65679eccd5764d Mon Sep 17 00:00:00 2001 From: Prakash Sellathurai Date: Thu, 14 Apr 2022 23:55:08 +0530 Subject: [PATCH 08/21] adds support for batched list/tuples from batched tf dataset --- keras/utils/dataset_utils.py | 92 +++++++++------- keras/utils/dataset_utils_test.py | 174 ++++++++++++++++++++---------- 2 files changed, 170 insertions(+), 96 deletions(-) diff --git a/keras/utils/dataset_utils.py b/keras/utils/dataset_utils.py index 8bb1ceb07b1..c791567f8f5 100644 --- a/keras/utils/dataset_utils.py +++ b/keras/utils/dataset_utils.py @@ -76,50 +76,51 @@ def split_dataset(dataset, total_length = len(dataset_as_list) - left_size,right_size = _rescale_dataset_split_sizes( - left_size,right_size,total_length - ) + left_size,right_size = _rescale_dataset_split_sizes(left_size, + right_size, + total_length) - left_dataset = dataset_as_list[:left_size] - right_dataset = dataset_as_list[-right_size:] + left_split = dataset_as_list[:left_size] + right_split = dataset_as_list[-right_size:] - left_dataset = tf.data.Dataset.from_tensor_slices(left_dataset) - right_dataset = tf.data.Dataset.from_tensor_slices(right_dataset) + left_split = tf.data.Dataset.from_tensor_slices(left_split) + right_split = tf.data.Dataset.from_tensor_slices(right_split) - left_dataset = left_dataset.prefetch(tf.data.AUTOTUNE) - right_dataset = right_dataset.prefetch(tf.data.AUTOTUNE) + left_split = left_split.prefetch(tf.data.AUTOTUNE) + right_split = right_split.prefetch(tf.data.AUTOTUNE) - return left_dataset, right_dataset + return left_split, right_split def _convert_dataset_to_list(dataset,data_size_warning_flag = True): """Helper function to convert a tf.data.Dataset object or a list/tuple of numpy.ndarrays to a list """ - # TODO prakashsellathurai: add failing test cases for list of tuples,tuples of nd array - # TODO prakashsellathurai: add support for Batched and unbatched tf datasets - if isinstance(dataset,tuple): - if len(dataset) == 0: - raise ValueError('`dataset` must be a non-empty list/tuple of' - ' numpy.ndarrays or tf.data.Dataset objects.') - dataset_iterator = list(zip(*dataset)) - elif isinstance(dataset,list): + # TODO (prakashsellathurai): add support for Batched and unbatched dict tf datasets + if isinstance(dataset,(tuple,list)): if len(dataset) == 0: raise ValueError('`dataset` must be a non-empty list/tuple of' ' numpy.ndarrays or tf.data.Dataset objects.') + + if isinstance(dataset[0],np.ndarray): - dataset_iterator = list(zip(*dataset)) + if not all(element.shape == dataset[0].shape for element in dataset): + raise ValueError('all elements of `dataset` must have the same shape.') + + dataset_iterator = iter(zip(*dataset)) else: - dataset_iterator = list(dataset) + dataset_iterator = iter(dataset) elif isinstance(dataset,np.ndarray): - dataset_iterator = list(dataset) + dataset_iterator = iter(dataset) elif isinstance(dataset,tf.data.Dataset): - dataset_iterator = list(dataset) + if is_batched(dataset): + dataset = dataset.unbatch() + dataset_iterator = 
iter(dataset) else: raise TypeError('`dataset` must be either a tf.data.Dataset object' - f' or a list/tuple of arrays. Received : {type(dataset)}' - ) + f' or a list/tuple of arrays. Received : {type(dataset)}') dataset_as_list = [] + try: dataset_iterator = iter(dataset_iterator) first_datum = next(dataset_iterator) @@ -128,23 +129,25 @@ def _convert_dataset_to_list(dataset,data_size_warning_flag = True): raise ValueError('Received an empty Dataset i.e dataset with no elements. ' '`dataset` must be a non-empty list/tuple of' ' numpy.ndarrays or tf.data.Dataset objects.') - - - start_time = time.time() - for i,datum in enumerate(dataset_iterator): - if data_size_warning_flag: - if i % 10 == 0: - cur_time = time.time() - # warns user if the dataset is too large to iterate within 10s - if int(cur_time - start_time) > 10 and data_size_warning_flag: - warnings.warn('Takes too long time to process the `dataset`,' - 'this function is only for small datasets ' - '(e.g. < 10,000 samples).' - ) - data_size_warning_flag = False - - dataset_as_list.append(datum) + if isinstance(first_datum,dict): + raise TypeError('`dataset` must be either a tf.data.Dataset object' + ' or a list/tuple of arrays. ' + 'Received : tf.data.Dataset with dict elements') + else: + start_time = time.time() + for i,datum in enumerate(dataset_iterator): + if data_size_warning_flag: + if i % 10 == 0: + cur_time = time.time() + # warns user if the dataset is too large to iterate within 10s + if int(cur_time - start_time) > 10 and data_size_warning_flag: + warnings.warn('Takes too long time to process the `dataset`,' + 'this function is only for small datasets ' + '(e.g. < 10,000 samples).') + data_size_warning_flag = False + + dataset_as_list.append(datum) return dataset_as_list @@ -235,6 +238,15 @@ def _rescale_dataset_split_sizes(left_size,right_size,total_length): return left_size,right_size +def is_batched(tf_dataset): + """returns true if given tf dataset is batched or false if not + + refer: https://stackoverflow.com/a/66101853/8336491 + """ + try: + return tf_dataset.__class__.__name__ == 'BatchDataset' + except : + return False def index_directory(directory, labels, diff --git a/keras/utils/dataset_utils_test.py b/keras/utils/dataset_utils_test.py index 3fc4c3f2818..64ca5e3c2de 100644 --- a/keras/utils/dataset_utils_test.py +++ b/keras/utils/dataset_utils_test.py @@ -14,60 +14,107 @@ def test_with_basic_dataset_values(self): dataset=np.ones(shape=(200, 32)) res = dataset_utils.split_dataset(dataset, left_size=0.8,right_size=0.2) self.assertLen(res, 2) - left_dataset,right_dataset = res - self.assertIsInstance(left_dataset, tf.data.Dataset) - self.assertIsInstance(right_dataset, tf.data.Dataset) + left_split,right_split = res + self.assertIsInstance(left_split, tf.data.Dataset) + self.assertIsInstance(right_split, tf.data.Dataset) + + self.assertLen(left_split, 160) + self.assertLen(right_split, 40) + self.assertAllEqual(dataset[:160] ,list(left_split)) + self.assertAllEqual(dataset[-40:] ,list(right_split)) - self.assertLen(left_dataset, 160) - self.assertLen(right_dataset, 40) - self.assertAllEqual(dataset[:160] ,np.array(list(left_dataset))) - self.assertAllEqual(dataset[-40:] ,np.array(list(right_dataset))) # list of numpy array dataset=[np.ones(shape=(200, 32)), np.zeros(shape=(200, 32))] res = dataset_utils.split_dataset(dataset, left_size=4) self.assertLen(res, 2) - left_dataset,right_dataset = res - self.assertIsInstance(left_dataset, tf.data.Dataset) - self.assertIsInstance(right_dataset, tf.data.Dataset) - 
self.assertLen(left_dataset, 4) - self.assertLen(right_dataset, 196) - self.assertAllEqual(list(zip(*dataset))[:4] ,np.array(list(left_dataset))) - self.assertAllEqual(list(zip(*dataset))[4:] ,np.array(list(right_dataset))) - self.assertAllEqual(np.array(list(left_dataset)+list(right_dataset)), list(zip(*dataset))) + left_split,right_split = res + self.assertIsInstance(left_split, tf.data.Dataset) + self.assertIsInstance(right_split, tf.data.Dataset) + self.assertLen(left_split, 4) + self.assertLen(right_split, 196) + self.assertAllEqual(list(zip(*dataset))[:4] ,list(left_split)) + self.assertAllEqual(list(zip(*dataset))[4:] ,list(right_split)) + self.assertAllEqual(list(left_split)+list(right_split), + list(zip(*dataset))) + + + # fail with invalid shape + with self.assertRaises(ValueError): + dataset=[np.ones(shape=(200, 32)), np.zeros(shape=(100, 32))] + dataset_utils.split_dataset(dataset, left_size=4) + + with self.assertRaises(ValueError): + dataset=(np.ones(shape=(200, 32)), np.zeros(shape=(201, 32))) + dataset_utils.split_dataset(dataset, left_size=4) + + # list with single np array + dataset=[np.ones(shape=(200, 32))] + left_split,right_split = dataset_utils.split_dataset(dataset, + left_size=4) + self.assertAllEqual(list(zip(*dataset)), + list(left_split)+list(right_split)) + + # Tule of np arrays + dataset=(np.ones(shape=(200, 32)), np.zeros(shape=(200, 32))) + left_split,right_split = dataset_utils.split_dataset(dataset, + left_size=80) + self.assertAllEqual(list(zip(*dataset))[:80] ,list(left_split)) + self.assertAllEqual(list(zip(*dataset))[80:] ,list(right_split)) + + + # Batched tf.data.Dataset that yields batches of vectors + dataset = tf.data.Dataset.from_tensor_slices(np.ones(shape=(16, 16,3))) + dataset = dataset.batch(10) + left_split,right_split=dataset_utils.split_dataset(dataset,left_size=2) + self.assertAllEqual(list(dataset.unbatch()), + list(left_split)+list(right_split)) + + # Batched tf.data.Dataset that yields batches of tuples of vectors + dataset = tf.data.Dataset.from_tensor_slices((np.random.rand(32,32), np.random.rand(32,32))) + dataset = dataset.batch(2) + left_split,right_split=dataset_utils.split_dataset(dataset,left_size=5) + self.assertAllEqual(list(dataset.unbatch()), + list(left_split)+list(right_split)) + + + + + + def test_with_list_dataset(self): dataset = [np.ones(shape=(10,10,10)) for _ in range(10)] - left_dataset,right_dataset = dataset_utils.split_dataset(dataset, + left_split,right_split = dataset_utils.split_dataset(dataset, left_size=5, right_size=5) - self.assertEqual(len(left_dataset), len(right_dataset)) - self.assertIsInstance(left_dataset, tf.data.Dataset) - self.assertIsInstance(left_dataset, tf.data.Dataset) + self.assertEqual(len(left_split), len(right_split)) + self.assertIsInstance(left_split, tf.data.Dataset) + self.assertIsInstance(left_split, tf.data.Dataset) dataset = [np.ones(shape=(10,10,10)) for _ in range(10)] - left_dataset,right_dataset = dataset_utils.split_dataset(dataset, + left_split,right_split = dataset_utils.split_dataset(dataset, left_size=0.6, right_size=0.4) - self.assertEqual(len(left_dataset), 6) - self.assertEqual(len(right_dataset), 4) + self.assertEqual(len(left_split), 6) + self.assertEqual(len(right_split), 4) def test_with_tuple_dataset(self): dataset = (np.ones(shape=(10,10,10)),np.zeros(shape=(10,10,10))) - left_dataset,right_dataset = dataset_utils.split_dataset(dataset, + left_split,right_split = dataset_utils.split_dataset(dataset, left_size=0.75, right_size=0.25) - self.assertLen(left_dataset, 
8) - self.assertLen(right_dataset, 2) + self.assertLen(left_split, 8) + self.assertLen(right_split, 2) - left_dataset,right_dataset = dataset_utils.split_dataset(dataset, + left_split,right_split = dataset_utils.split_dataset(dataset, left_size=0.35, right_size=0.65) - self.assertLen(left_dataset, 4) - self.assertLen(right_dataset, 6) - self.assertIsInstance(left_dataset, tf.data.Dataset) - self.assertIsInstance(right_dataset, tf.data.Dataset) + self.assertLen(left_split, 4) + self.assertLen(right_split, 6) + self.assertIsInstance(left_split, tf.data.Dataset) + self.assertIsInstance(right_split, tf.data.Dataset) def test_with_invalid_dataset(self): with self.assertRaises(TypeError): @@ -88,11 +135,11 @@ def test_with_valid_left_and_right_sizes(self): left_size=1, right_size=2) assert(len(splitted_dataset) == 2) - left_dataset,right_dataset = splitted_dataset - self.assertEqual(len(left_dataset), 1) - self.assertEqual(len(right_dataset), 2) - self.assertEqual(list(left_dataset), [1]) - self.assertEqual(list(right_dataset), [2,3]) + left_split,right_split = splitted_dataset + self.assertEqual(len(left_split), 1) + self.assertEqual(len(right_split), 2) + self.assertEqual(list(left_split), [1]) + self.assertEqual(list(right_split), [2,3]) dataset=np.ones(shape=(200, 32)) @@ -129,32 +176,32 @@ def test_with_valid_left_and_right_sizes(self): left_size=0.1, right_size=0.9) assert(len(splitted_dataset) == 2) - left_dataset,right_dataset = splitted_dataset - self.assertEqual(len(left_dataset), 1 ) - self.assertEqual(len(right_dataset), 9 ) - self.assertEqual(list(left_dataset), [1]) - self.assertEqual(list(right_dataset), [2,3,4,5,6,7,8,9,10]) + left_split,right_split = splitted_dataset + self.assertEqual(len(left_split), 1 ) + self.assertEqual(len(right_split), 9 ) + self.assertEqual(list(left_split), [1]) + self.assertEqual(list(right_split), [2,3,4,5,6,7,8,9,10]) dataset = np.array([1,2,3,4,5,6,7,8,9,10]) splitted_dataset = dataset_utils.split_dataset(dataset, left_size=2, right_size=5) assert(len(splitted_dataset) == 2) - left_dataset,right_dataset = splitted_dataset - self.assertEqual(len(left_dataset), 2 ) - self.assertEqual(len(right_dataset), 5 ) - self.assertEqual(list(left_dataset), [1,2]) - self.assertEqual(list(right_dataset), [6,7,8,9,10]) + left_split,right_split = splitted_dataset + self.assertEqual(len(left_split), 2 ) + self.assertEqual(len(right_split), 5 ) + self.assertEqual(list(left_split), [1,2]) + self.assertEqual(list(right_split), [6,7,8,9,10]) def test_with_float_left_and_right_sizes(self): dataset = tf.data.Dataset.from_tensor_slices(np.array([[0.1,0.2,0.3], [0.4,0.5,0.6], [0.7,0.8,0.9]])) - left_dataset,right_dataset = dataset_utils.split_dataset(dataset, + left_split,right_split = dataset_utils.split_dataset(dataset, left_size=0.8, right_size=0.2) - self.assertEqual(len(left_dataset), 2) - self.assertEqual(len(right_dataset), 1) + self.assertEqual(len(left_split), 2) + self.assertEqual(len(right_split), 1) def test_with_invalid_float_left_and_right_sizes(self): with self.assertRaises(ValueError): @@ -170,25 +217,40 @@ def test_with_None_and_zero_left_and_right_size(self): with self.assertRaises(ValueError): dataset_utils.split_dataset(dataset=np.array([1,2,3]), left_size=None) with self.assertRaises(ValueError): - dataset_utils.split_dataset(np.array([1,2,3]), left_size=None,right_size=None) + dataset_utils.split_dataset(np.array([1,2,3]), + left_size=None, + right_size=None) with self.assertRaises(ValueError): - dataset_utils.split_dataset(np.array([1,2,3]), 
left_size=3,right_size=None) + dataset_utils.split_dataset(np.array([1,2,3]), + left_size=3, + right_size=None) with self.assertRaises(ValueError): - dataset_utils.split_dataset(np.array([1,2,3]), left_size=3,right_size=None) + dataset_utils.split_dataset(np.array([1,2,3]), + left_size=3, + right_size=None) with self.assertRaises(ValueError): dataset_utils.split_dataset(np.array([1,2,3]), left_size=0,right_size=0) def test_with_invalid_left_and_right_size_types(self): with self.assertRaises(TypeError): - dataset_utils.split_dataset(np.array([1,2,3]), left_size='1',right_size='1') + dataset_utils.split_dataset(np.array([1,2,3]), + left_size='1', + right_size='1') with self.assertRaises(TypeError): - dataset_utils.split_dataset(np.array([1,2,3]), left_size=0,right_size='1') + dataset_utils.split_dataset(np.array([1,2,3]), + left_size=0, + right_size='1') with self.assertRaises(TypeError): - dataset_utils.split_dataset(np.array([1,2,3]), left_size='100',right_size=None) + dataset_utils.split_dataset(np.array([1,2,3]), + left_size='100', + right_size=None) with self.assertRaises(TypeError): - dataset_utils.split_dataset(np.array([1,2,3]), right_size='1') + dataset_utils.split_dataset(np.array([1,2,3]), + right_size='1') with self.assertRaises(TypeError): - dataset_utils.split_dataset(np.array([1,2,3]), left_size=0.5,right_size='1') + dataset_utils.split_dataset(np.array([1,2,3]), + left_size=0.5, + right_size='1') From f5dcdb669b1a767b05d9694f699395ae3b7d1c09 Mon Sep 17 00:00:00 2001 From: Prakash Sellathurai Date: Fri, 15 Apr 2022 21:55:22 +0530 Subject: [PATCH 09/21] renames tests with common convention --- keras/utils/dataset_utils.py | 5 +- keras/utils/dataset_utils_test.py | 103 ++++++++++++++++++++++-------- 2 files changed, 78 insertions(+), 30 deletions(-) diff --git a/keras/utils/dataset_utils.py b/keras/utils/dataset_utils.py index c791567f8f5..1a42426d01e 100644 --- a/keras/utils/dataset_utils.py +++ b/keras/utils/dataset_utils.py @@ -108,13 +108,12 @@ def _convert_dataset_to_list(dataset,data_size_warning_flag = True): dataset_iterator = iter(zip(*dataset)) else: dataset_iterator = iter(dataset) - - elif isinstance(dataset,np.ndarray): - dataset_iterator = iter(dataset) elif isinstance(dataset,tf.data.Dataset): if is_batched(dataset): dataset = dataset.unbatch() dataset_iterator = iter(dataset) + elif isinstance(dataset,np.ndarray): + dataset_iterator = iter(dataset) else: raise TypeError('`dataset` must be either a tf.data.Dataset object' f' or a list/tuple of arrays. 
Received : {type(dataset)}') diff --git a/keras/utils/dataset_utils_test.py b/keras/utils/dataset_utils_test.py index 64ca5e3c2de..308adfac802 100644 --- a/keras/utils/dataset_utils_test.py +++ b/keras/utils/dataset_utils_test.py @@ -9,8 +9,7 @@ class SplitDatasetTest(tf.test.TestCase): - def test_with_basic_dataset_values(self): - # numpy array + def test_numpy_array(self): dataset=np.ones(shape=(200, 32)) res = dataset_utils.split_dataset(dataset, left_size=0.8,right_size=0.2) self.assertLen(res, 2) @@ -24,7 +23,7 @@ def test_with_basic_dataset_values(self): self.assertAllEqual(dataset[-40:] ,list(right_split)) - # list of numpy array + def test_list_of_numpy_arrays(self): dataset=[np.ones(shape=(200, 32)), np.zeros(shape=(200, 32))] res = dataset_utils.split_dataset(dataset, left_size=4) self.assertLen(res, 2) @@ -38,8 +37,13 @@ def test_with_basic_dataset_values(self): self.assertAllEqual(list(left_split)+list(right_split), list(zip(*dataset))) + dataset=[np.ones(shape=(200, 32))] + left_split,right_split = dataset_utils.split_dataset(dataset, + left_size=4) + self.assertAllEqual(list(zip(*dataset)), + list(left_split)+list(right_split)) - # fail with invalid shape + def test_illegal_shaped_numpy_array(self): with self.assertRaises(ValueError): dataset=[np.ones(shape=(200, 32)), np.zeros(shape=(100, 32))] dataset_utils.split_dataset(dataset, left_size=4) @@ -48,42 +52,87 @@ def test_with_basic_dataset_values(self): dataset=(np.ones(shape=(200, 32)), np.zeros(shape=(201, 32))) dataset_utils.split_dataset(dataset, left_size=4) - # list with single np array - dataset=[np.ones(shape=(200, 32))] - left_split,right_split = dataset_utils.split_dataset(dataset, - left_size=4) - self.assertAllEqual(list(zip(*dataset)), - list(left_split)+list(right_split)) - - # Tule of np arrays + def test_tuple_of_numpy_arrays(self): dataset=(np.ones(shape=(200, 32)), np.zeros(shape=(200, 32))) left_split,right_split = dataset_utils.split_dataset(dataset, left_size=80) self.assertAllEqual(list(zip(*dataset))[:80] ,list(left_split)) self.assertAllEqual(list(zip(*dataset))[80:] ,list(right_split)) - - # Batched tf.data.Dataset that yields batches of vectors - dataset = tf.data.Dataset.from_tensor_slices(np.ones(shape=(16, 16,3))) + def test_batched_tf_dataset_of_vectors(self): + dataset = tf.data.Dataset.from_tensor_slices(np.ones(shape=(100,32, 32,1))) dataset = dataset.batch(10) left_split,right_split=dataset_utils.split_dataset(dataset,left_size=2) + + self.assertAllEqual(np.array(list(left_split)).shape,(2,32,32,1)) + self.assertAllEqual(np.array(list(right_split)).shape,(98,32,32,1)) self.assertAllEqual(list(dataset.unbatch()), list(left_split)+list(right_split)) - # Batched tf.data.Dataset that yields batches of tuples of vectors - dataset = tf.data.Dataset.from_tensor_slices((np.random.rand(32,32), np.random.rand(32,32))) + def test_batched_tf_dataset_of_tuple_of_vectors(self): + dataset = tf.data.Dataset.from_tensor_slices((np.random.rand(10,32,32), + np.random.rand(10,32,32))) dataset = dataset.batch(2) - left_split,right_split=dataset_utils.split_dataset(dataset,left_size=5) + left_split,right_split=dataset_utils.split_dataset(dataset,left_size=4) + + self.assertAllEqual(np.array(list(left_split)).shape,(4,2,32,32)) + self.assertAllEqual(np.array(list(right_split)).shape,(6,2,32,32)) + self.assertAllEqual(list(dataset.unbatch()), - list(left_split)+list(right_split)) + list(left_split)+list(right_split)) + + def test_unbatched_tf_dataset_of_vectors(self): + dataset = 
tf.data.Dataset.from_tensor_slices(np.ones(shape=(100,16, 16,3))) + + left_split,right_split=dataset_utils.split_dataset(dataset,left_size=0.25) + + self.assertAllEqual(np.array(list(left_split)).shape,(25,16, 16,3)) + self.assertAllEqual(np.array(list(right_split)).shape,(75,16, 16,3)) + + self.assertAllEqual(list(dataset), + list(left_split)+list(right_split)) + + + + def test_unbatched_tf_dataset_of_tuple_of_vectors(self): + dataset = tf.data.Dataset.from_tensor_slices((np.random.rand(10,32,32,1), + np.random.rand(10,32,32,1))) + + left_split,right_split=dataset_utils.split_dataset(dataset,left_size=5) + + self.assertAllEqual(np.array(list(left_split)).shape,(5,2,32,32,1)) + self.assertAllEqual(np.array(list(right_split)).shape,(5,2,32,32,1)) + + + self.assertAllEqual(list(dataset),list(left_split)+list(right_split)) + + def test_with_mnist_dataset(self): + pass + # (x_train,y_train),(_,_) = tf.keras.datasets.mnist.load_data() + + # self.assertEqual(x_train.shape,(60000,28,28,1)) + # self.assertEqual(y_train.shape,(60000,)) + + + + + + # def test_unbatched_tf_dataset_of_dicts_of_vectors(self): + # dataset = tf.data.Dataset.from_tensor_slices( + # {'images': np.random.rand(32,32,3), + # 'labels': np.random.rand(32,1) + # }) + # dataset = dataset.batch(2) + # left_split,right_split=dataset_utils.split_dataset(dataset,left_size=5) + # self.assertAllEqual(len(list(dataset.unbatch())),len(list(left_split))) - def test_with_list_dataset(self): + def test_list_dataset(self): dataset = [np.ones(shape=(10,10,10)) for _ in range(10)] left_split,right_split = dataset_utils.split_dataset(dataset, left_size=5, @@ -100,7 +149,7 @@ def test_with_list_dataset(self): self.assertEqual(len(right_split), 4) - def test_with_tuple_dataset(self): + def test_tuple_dataset(self): dataset = (np.ones(shape=(10,10,10)),np.zeros(shape=(10,10,10))) left_split,right_split = dataset_utils.split_dataset(dataset, left_size=0.75, @@ -116,7 +165,7 @@ def test_with_tuple_dataset(self): self.assertIsInstance(left_split, tf.data.Dataset) self.assertIsInstance(right_split, tf.data.Dataset) - def test_with_invalid_dataset(self): + def test_invalid_dataset(self): with self.assertRaises(TypeError): dataset_utils.split_dataset(dataset=None, left_size=5) with self.assertRaises(TypeError): @@ -128,7 +177,7 @@ def test_with_invalid_dataset(self): with self.assertRaises(TypeError): dataset_utils.split_dataset(dataset=float('INF'), left_size=5) - def test_with_valid_left_and_right_sizes(self): + def test_valid_left_and_right_sizes(self): dataset = np.array([1,2,3]) splitted_dataset = dataset_utils.split_dataset(dataset, @@ -193,7 +242,7 @@ def test_with_valid_left_and_right_sizes(self): self.assertEqual(list(left_split), [1,2]) self.assertEqual(list(right_split), [6,7,8,9,10]) - def test_with_float_left_and_right_sizes(self): + def test_float_left_and_right_sizes(self): dataset = tf.data.Dataset.from_tensor_slices(np.array([[0.1,0.2,0.3], [0.4,0.5,0.6], [0.7,0.8,0.9]])) @@ -203,7 +252,7 @@ def test_with_float_left_and_right_sizes(self): self.assertEqual(len(left_split), 2) self.assertEqual(len(right_split), 1) - def test_with_invalid_float_left_and_right_sizes(self): + def test_invalid_float_left_and_right_sizes(self): with self.assertRaises(ValueError): dataset = [np.ones(shape=(200, 32,32)), np.zeros(shape=(200, 32,32))] dataset_utils.split_dataset(dataset, left_size=1.5,right_size=0.2) @@ -213,7 +262,7 @@ def test_with_invalid_float_left_and_right_sizes(self): - def test_with_None_and_zero_left_and_right_size(self): + def 
test_None_and_zero_left_and_right_size(self): with self.assertRaises(ValueError): dataset_utils.split_dataset(dataset=np.array([1,2,3]), left_size=None) with self.assertRaises(ValueError): @@ -231,7 +280,7 @@ def test_with_None_and_zero_left_and_right_size(self): with self.assertRaises(ValueError): dataset_utils.split_dataset(np.array([1,2,3]), left_size=0,right_size=0) - def test_with_invalid_left_and_right_size_types(self): + def test_invalid_left_and_right_size_types(self): with self.assertRaises(TypeError): dataset_utils.split_dataset(np.array([1,2,3]), left_size='1', From 60d220fe0e5a25ca6015dd217e3fe27d07a6612e Mon Sep 17 00:00:00 2001 From: Prakash Sellathurai Date: Sat, 16 Apr 2022 01:39:22 +0530 Subject: [PATCH 10/21] adds test cases for irregularity and improves exception message --- keras/utils/BUILD | 3 +- keras/utils/dataset_utils.py | 103 ++++++++++++++++---------- keras/utils/dataset_utils_test.py | 14 +++- 3 files changed, 85 insertions(+), 35 deletions(-) diff --git a/keras/utils/BUILD b/keras/utils/BUILD index 643d18ce476..e04175d7ea0 100644 --- a/keras/utils/BUILD +++ b/keras/utils/BUILD @@ -303,7 +303,8 @@ tf_py_test( tf_py_test( name = "dataset_utils_test", - size = "small", + size = "medium", + timeout = "moderate", srcs = ["dataset_utils_test.py"], python_version = "PY3", deps = [ diff --git a/keras/utils/dataset_utils.py b/keras/utils/dataset_utils.py index 1a42426d01e..f911e71703e 100644 --- a/keras/utils/dataset_utils.py +++ b/keras/utils/dataset_utils.py @@ -80,18 +80,40 @@ def split_dataset(dataset, right_size, total_length) + + + left_split = dataset_as_list[:left_size] right_split = dataset_as_list[-right_size:] - left_split = tf.data.Dataset.from_tensor_slices(left_split) - right_split = tf.data.Dataset.from_tensor_slices(right_split) + + try: + left_split = tf.data.Dataset.from_tensor_slices(left_split) + except Exception as e: + raise ValueError(f' with `left_size`={left_size} and ' + f' `right_size`={right_size}' + f' unable to create the dataset from ' + f' left_split of shape {np.array(left_split).shape}. ' + f' Received error: {e}') + + try: + right_split = tf.data.Dataset.from_tensor_slices(right_split) + except Exception as e: + raise ValueError(f' with `left_size`={left_size} and ' + f' `right_size`={right_size}' + f' unable to create the dataset from ' + f' right_split of shape {np.array(right_split).shape}. 
' + f' Received error: {e}') + left_split = left_split.prefetch(tf.data.AUTOTUNE) right_split = right_split.prefetch(tf.data.AUTOTUNE) return left_split, right_split -def _convert_dataset_to_list(dataset,data_size_warning_flag = True): +def _convert_dataset_to_list(dataset, + data_size_warning_flag= True, + ensure_shape_similarity = True): """Helper function to convert a tf.data.Dataset object or a list/tuple of numpy.ndarrays to a list """ # TODO (prakashsellathurai): add support for Batched and unbatched dict tf datasets @@ -99,54 +121,71 @@ def _convert_dataset_to_list(dataset,data_size_warning_flag = True): if len(dataset) == 0: raise ValueError('`dataset` must be a non-empty list/tuple of' ' numpy.ndarrays or tf.data.Dataset objects.') - - + if isinstance(dataset[0],np.ndarray): - if not all(element.shape == dataset[0].shape for element in dataset): - raise ValueError('all elements of `dataset` must have the same shape.') - - dataset_iterator = iter(zip(*dataset)) + expected_shape = dataset[0].shape + for i,element in enumerate(dataset): + if not np.array(element).shape[0] == expected_shape[0]: + raise ValueError(f' Expected all numpy arrays of {type(dataset)} ' + f'in `dataset` to have the same length. ' + f'\n Received: dataset[{i}] with length = ' + f'{np.array(element).shape},' + f' while dataset[0] has length {dataset[0].shape}') else: - dataset_iterator = iter(dataset) + raise ValueError('Expected a list/tuple of numpy.ndarrays,' + 'Received: {}'.format(type(dataset[0]))) + + dataset_iterator = iter(zip(*dataset)) + elif isinstance(dataset,tf.data.Dataset): if is_batched(dataset): dataset = dataset.unbatch() dataset_iterator = iter(dataset) elif isinstance(dataset,np.ndarray): dataset_iterator = iter(dataset) - else: - raise TypeError('`dataset` must be either a tf.data.Dataset object' - f' or a list/tuple of arrays. Received : {type(dataset)}') - + dataset_as_list = [] try: dataset_iterator = iter(dataset_iterator) first_datum = next(dataset_iterator) + if isinstance(first_datum,(tf.Tensor,np.ndarray,tuple)): + first_datum_shape = np.array(first_datum).shape + else: + ensure_shape_similarity = False dataset_as_list.append(first_datum) - except ValueError: - raise ValueError('Received an empty Dataset i.e dataset with no elements. ' - '`dataset` must be a non-empty list/tuple of' + except StopIteration: + raise ValueError(' Received an empty Dataset i.e dataset with no elements.' + ' `dataset` must be a non-empty list/tuple of' ' numpy.ndarrays or tf.data.Dataset objects.') if isinstance(first_datum,dict): raise TypeError('`dataset` must be either a tf.data.Dataset object' ' or a list/tuple of arrays. ' - 'Received : tf.data.Dataset with dict elements') - else: - start_time = time.time() - for i,datum in enumerate(dataset_iterator): - if data_size_warning_flag: - if i % 10 == 0: - cur_time = time.time() - # warns user if the dataset is too large to iterate within 10s - if int(cur_time - start_time) > 10 and data_size_warning_flag: - warnings.warn('Takes too long time to process the `dataset`,' - 'this function is only for small datasets ' - '(e.g. 
< 10,000 samples).') - data_size_warning_flag = False - - dataset_as_list.append(datum) + ' Received : tf.data.Dataset with dict elements') + + start_time = time.time() + for i,datum in enumerate(dataset_iterator): + + if ensure_shape_similarity: + if first_datum_shape != np.array(datum).shape: + raise ValueError(' All elements of `dataset` must have the same shape,' + f' Expected shape: {np.array(first_datum).shape}' + f' Received shape: {np.array(datum).shape} at' + f' index {i}') + + if data_size_warning_flag: + if i % 10 == 0: + cur_time = time.time() + # warns user if the dataset is too large to iterate within 10s + if int(cur_time - start_time) > 10 and data_size_warning_flag: + warnings.warn(' Takes too long time to process the `dataset`,' + ' this function is only for small datasets ' + ' (e.g. < 10,000 samples).',category=ResourceWarning, + source='split_dataset',stacklevel=2) + data_size_warning_flag = False + + dataset_as_list.append(datum) return dataset_as_list diff --git a/keras/utils/dataset_utils_test.py b/keras/utils/dataset_utils_test.py index 308adfac802..ae462577efe 100644 --- a/keras/utils/dataset_utils_test.py +++ b/keras/utils/dataset_utils_test.py @@ -43,7 +43,7 @@ def test_list_of_numpy_arrays(self): self.assertAllEqual(list(zip(*dataset)), list(left_split)+list(right_split)) - def test_illegal_shaped_numpy_array(self): + def test_dataset_with_irregular_shape(self): with self.assertRaises(ValueError): dataset=[np.ones(shape=(200, 32)), np.zeros(shape=(100, 32))] dataset_utils.split_dataset(dataset, left_size=4) @@ -51,7 +51,17 @@ def test_illegal_shaped_numpy_array(self): with self.assertRaises(ValueError): dataset=(np.ones(shape=(200, 32)), np.zeros(shape=(201, 32))) dataset_utils.split_dataset(dataset, left_size=4) - + + with self.assertRaises(ValueError): + dataset=tf.data.Dataset.from_tensor_slices( + np.ones(shape=(200, 32,32,1)), np.zeros(shape=(32))) + dataset_utils.split_dataset(dataset, left_size=4) + + with self.assertRaises(ValueError): + dataset=tf.data.Dataset.from_tensor_slices( + (np.ones(shape=(200, 32,32,1)), np.zeros(shape=(32)))) + dataset_utils.split_dataset(dataset, left_size=4) + def test_tuple_of_numpy_arrays(self): dataset=(np.ones(shape=(200, 32)), np.zeros(shape=(200, 32))) left_split,right_split = dataset_utils.split_dataset(dataset, From 72897461e93de425b5143ec3a9dd5bd22c8aaec5 Mon Sep 17 00:00:00 2001 From: Prakash Sellathurai Date: Sun, 17 Apr 2022 01:03:32 +0530 Subject: [PATCH 11/21] fixes bug in iterating tuple of np arrays with mismatching shapes and having same shape --- keras/utils/dataset_utils.py | 109 ++++++++++++++++++++++-------- keras/utils/dataset_utils_test.py | 48 +++++++++++-- 2 files changed, 123 insertions(+), 34 deletions(-) diff --git a/keras/utils/dataset_utils.py b/keras/utils/dataset_utils.py index f911e71703e..58a5b02a3b2 100644 --- a/keras/utils/dataset_utils.py +++ b/keras/utils/dataset_utils.py @@ -79,14 +79,13 @@ def split_dataset(dataset, left_size,right_size = _rescale_dataset_split_sizes(left_size, right_size, total_length) - - - - left_split = dataset_as_list[:left_size] - right_split = dataset_as_list[-right_size:] + left_split = list(dataset_as_list[:left_size]) + right_split = list(dataset_as_list[-right_size:]) - + if _get_type_spec(dataset) is tuple: + left_split = tuple(np.array(sample) for sample in zip(*left_split)) + right_split = tuple(np.array(sample) for sample in zip(*right_split)) try: left_split = tf.data.Dataset.from_tensor_slices(left_split) @@ -116,62 +115,115 @@ def 
_convert_dataset_to_list(dataset, ensure_shape_similarity = True): """Helper function to convert a tf.data.Dataset object or a list/tuple of numpy.ndarrays to a list """ + # TODO (prakashsellathurai): fix issue with tuple of numpy arrays dataset=(np.ones(shape=(3, 2)), np.zeros(shape=(3,))) # TODO (prakashsellathurai): add support for Batched and unbatched dict tf datasets - if isinstance(dataset,(tuple,list)): + dataset_type_spec = _get_type_spec(dataset) + dataset_iterator = _get_data_iterator_from_dataset(dataset) + dataset_as_list = [] + + start_time = time.time() + for sample in _get_next_sample(dataset_iterator, + ensure_shape_similarity, + data_size_warning_flag, + start_time): + + if dataset_type_spec is None: + dataset_as_list.append(sample) + elif dataset_type_spec is tuple: + dataset_as_list.append(map(tuple,map(np.array,sample))) + + + return dataset_as_list + +def _get_data_iterator_from_dataset(dataset) : + """Helper function to get the data iterator from a tf.data.Dataset object + """ + if isinstance(dataset,(list)): if len(dataset) == 0: - raise ValueError('`dataset` must be a non-empty list/tuple of' + raise ValueError('`dataset` must be a non-empty list of' ' numpy.ndarrays or tf.data.Dataset objects.') if isinstance(dataset[0],np.ndarray): expected_shape = dataset[0].shape for i,element in enumerate(dataset): if not np.array(element).shape[0] == expected_shape[0]: - raise ValueError(f' Expected all numpy arrays of {type(dataset)} ' + raise ValueError(f' Expected numpy array of {type(dataset)} ' f'in `dataset` to have the same length. ' f'\n Received: dataset[{i}] with length = ' f'{np.array(element).shape},' f' while dataset[0] has length {dataset[0].shape}') else: - raise ValueError('Expected a list/tuple of numpy.ndarrays,' + raise ValueError('Expected a list of numpy.ndarrays,' 'Received: {}'.format(type(dataset[0]))) - dataset_iterator = iter(zip(*dataset)) + return iter(zip(*dataset)) + elif isinstance(dataset,tuple): + if len(dataset) == 0: + raise ValueError('`dataset` must be a non-empty list of' + ' numpy.ndarrays or tf.data.Dataset objects.') + + if isinstance(dataset[0],np.ndarray): + expected_shape = dataset[0].shape + for i,element in enumerate(dataset): + if not np.array(element).shape[0] == expected_shape[0]: + raise ValueError(f' Expected numpy array of {type(dataset)} ' + f'in `dataset` to have the same length. ' + f'\n Received: dataset[{i}] with length = ' + f'{np.array(element).shape},' + f' while dataset[0] has length {dataset[0].shape}') + else: + raise ValueError('Expected a tuple of numpy.ndarrays,' + 'Received: {}'.format(type(dataset[0]))) + return iter(zip(*dataset)) elif isinstance(dataset,tf.data.Dataset): if is_batched(dataset): dataset = dataset.unbatch() - dataset_iterator = iter(dataset) + return iter(dataset) elif isinstance(dataset,np.ndarray): - dataset_iterator = iter(dataset) - - dataset_as_list = [] + return iter(dataset) + +def _get_type_spec(dataset): + if isinstance(dataset,(tuple)): + return tuple + return None + +def _get_next_sample(dataset_iterator, + ensure_shape_similarity, + data_size_warning_flag, + start_time): + """Helper function to yield samples from a `dataset_iterator` + if `ensure_shape_similarity` is set to True raises error if the + shapes of the samples are not the same . + if `data_size_warning_flag` is set to True, raises warning if the + dataset iteration takes more than 10 seconds. 
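+
+  Yields:
+    Data samples from `dataset_iterator`, one at a time.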
+ """ try: dataset_iterator = iter(dataset_iterator) - first_datum = next(dataset_iterator) - if isinstance(first_datum,(tf.Tensor,np.ndarray,tuple)): - first_datum_shape = np.array(first_datum).shape + first_sample = next(dataset_iterator) + if isinstance(first_sample,(tf.Tensor,np.ndarray)): + first_sample_shape = np.array(first_sample).shape else: + first_sample_shape = None ensure_shape_similarity = False - dataset_as_list.append(first_datum) + yield first_sample except StopIteration: raise ValueError(' Received an empty Dataset i.e dataset with no elements.' ' `dataset` must be a non-empty list/tuple of' ' numpy.ndarrays or tf.data.Dataset objects.') - if isinstance(first_datum,dict): + if isinstance(first_sample,dict): raise TypeError('`dataset` must be either a tf.data.Dataset object' ' or a list/tuple of arrays. ' ' Received : tf.data.Dataset with dict elements') - start_time = time.time() - for i,datum in enumerate(dataset_iterator): - + for i,sample in enumerate(dataset_iterator): if ensure_shape_similarity: - if first_datum_shape != np.array(datum).shape: + if first_sample_shape != np.array(sample).shape: raise ValueError(' All elements of `dataset` must have the same shape,' - f' Expected shape: {np.array(first_datum).shape}' - f' Received shape: {np.array(datum).shape} at' + f' Expected shape: {np.array(first_sample).shape}' + f' Received shape: {np.array(sample).shape} at' f' index {i}') if data_size_warning_flag: @@ -184,10 +236,9 @@ def _convert_dataset_to_list(dataset, ' (e.g. < 10,000 samples).',category=ResourceWarning, source='split_dataset',stacklevel=2) data_size_warning_flag = False + + yield sample - dataset_as_list.append(datum) - - return dataset_as_list def _rescale_dataset_split_sizes(left_size,right_size,total_length): """Helper function to rescale left_size/right_size args relative diff --git a/keras/utils/dataset_utils_test.py b/keras/utils/dataset_utils_test.py index ae462577efe..0362b0f1ed8 100644 --- a/keras/utils/dataset_utils_test.py +++ b/keras/utils/dataset_utils_test.py @@ -5,6 +5,7 @@ import numpy as np from keras.utils import dataset_utils +from keras.datasets import mnist class SplitDatasetTest(tf.test.TestCase): @@ -62,12 +63,43 @@ def test_dataset_with_irregular_shape(self): (np.ones(shape=(200, 32,32,1)), np.zeros(shape=(32)))) dataset_utils.split_dataset(dataset, left_size=4) + + def test_tuple_of_numpy_arrays(self): dataset=(np.ones(shape=(200, 32)), np.zeros(shape=(200, 32))) + left_split,right_split = dataset_utils.split_dataset(dataset, left_size=80) + + self.assertEqual(len(left_split), 80) self.assertAllEqual(list(zip(*dataset))[:80] ,list(left_split)) self.assertAllEqual(list(zip(*dataset))[80:] ,list(right_split)) + + + + dataset=(np.random.rand(4, 3), np.random.rand(4, 1)) + + left_split,right_split = dataset_utils.split_dataset(dataset, + left_size=2) + self.assertEqual(len(left_split), 2) + self.assertEqual(len(right_split), 2) + + self.assertEqual(np.array(list(left_split)[0])[0].shape,(3,) ) + self.assertEqual(np.array(list(left_split)[0])[1].shape,(1,) ) + self.assertEqual(np.array(list(right_split)[0])[0].shape,(3,) ) + self.assertEqual(np.array(list(right_split)[0])[1].shape,(1,) ) + + dataset=(np.random.rand(200,32,32), np.random.rand(200,10)) + left_split,right_split = dataset_utils.split_dataset(dataset, + right_size=180) + self.assertEqual(len(right_split), 180) + self.assertEqual(len(left_split), 20) + + self.assertEqual(np.array(list(left_split)[0])[0].shape,(32,32) ) + 
self.assertEqual(np.array(list(left_split)[0])[1].shape,(10,) ) + self.assertEqual(np.array(list(right_split)[0])[0].shape,(32,32) ) + self.assertEqual(np.array(list(right_split)[0])[1].shape,(10,) ) + def test_batched_tf_dataset_of_vectors(self): dataset = tf.data.Dataset.from_tensor_slices(np.ones(shape=(100,32, 32,1))) @@ -118,12 +150,18 @@ def test_unbatched_tf_dataset_of_tuple_of_vectors(self): self.assertAllEqual(list(dataset),list(left_split)+list(right_split)) - def test_with_mnist_dataset(self): - pass - # (x_train,y_train),(_,_) = tf.keras.datasets.mnist.load_data() + # def test_mnist_dataset(self): + # (x_train,y_train),(_,_) = mnist.load_data() - # self.assertEqual(x_train.shape,(60000,28,28,1)) - # self.assertEqual(y_train.shape,(60000,)) + # self.assertEqual(x_train.shape,(60000,28,28)) + # self.assertEqual(y_train.shape,(60000,)) + + # dataset = tf.data.Dataset.from_tensor_slices((x_train,y_train)) + + # with self.assertWarns(ResourceWarning): + # left_split,right_split = dataset_utils.split_dataset(dataset, + # left_size=0.80) + From 56b75d030cd9c3e3fde0ddf1f908436e6a5be3d6 Mon Sep 17 00:00:00 2001 From: Prakash Sellathurai Date: Mon, 18 Apr 2022 19:57:09 +0530 Subject: [PATCH 12/21] adds test case for fractional size with tuple of numpy arrays in different shape --- keras/utils/dataset_utils.py | 145 +++++++------- keras/utils/dataset_utils_test.py | 314 ++++++++++++------------------ 2 files changed, 199 insertions(+), 260 deletions(-) diff --git a/keras/utils/dataset_utils.py b/keras/utils/dataset_utils.py index 58a5b02a3b2..44aa68cbbdf 100644 --- a/keras/utils/dataset_utils.py +++ b/keras/utils/dataset_utils.py @@ -31,71 +31,69 @@ from tensorflow.python.util.tf_export import keras_export @keras_export('keras.utils.split_dataset') -def split_dataset(dataset, - left_size=None, - right_size=None, - shuffle=False, +def split_dataset(dataset, + left_size=None, + right_size=None, + shuffle=False, seed=None): """Split a dataset into a left half and a right half (e.g. training / validation). - + Args: - dataset: A `tf.data.Dataset` object or - a list/tuple of arrays with the same length. - left_size: If float, it should be in range `[0, 1]` range - and signifies the fraction of the data to pack in - the left dataset. If integer, it signifies the number - of samples to pack in the left dataset. - If `None`, it defaults to the complement to `right_size`. - right_size: If float, it should be in range `[0, 1]` range - and signifies the fraction of the data to pack - in the right dataset. If integer, it signifies + dataset: A `tf.data.Dataset` object or + a list/tuple of arrays with the same length. + left_size: If float, it should be in range `[0, 1]` range + and signifies the fraction of the data to pack in + the left dataset. If integer, it signifies the number + of samples to pack in the left dataset. + If `None`, it defaults to the complement to `right_size`. + right_size: If float, it should be in range `[0, 1]` range + and signifies the fraction of the data to pack + in the right dataset. If integer, it signifies the number of samples to pack in the right dataset. - If `None`, it defaults to the complement to `left_size`. + If `None`, it defaults to the complement to `left_size`. shuffle: Boolean, whether to shuffle the data before splitting it. seed: A random seed for shuffling. Returns: A tuple of two `tf.data.Dataset` objects: the left and right splits. 
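
  Example (a minimal usage sketch, assuming in-memory NumPy data;
  `tf.keras.utils.split_dataset` is the exported path of this utility):

  >>> data = np.random.random(size=(1000, 4))
  >>> left_ds, right_ds = tf.keras.utils.split_dataset(data, left_size=0.8)
  >>> int(left_ds.cardinality())  # 0.8 * 1000 samples go to the left split
  800
  >>> int(right_ds.cardinality())  # the complement forms the right split
  200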
""" - + if not isinstance(dataset,(tf.data.Dataset,list,tuple,np.ndarray)): raise TypeError('`dataset` must be either a tf.data.Dataset object' f' or a list/tuple of arrays. Received : {type(dataset)}') - + if right_size is None and left_size is None: raise ValueError('you must specify either `left_size` or `right_size`' ' Received: `left_size`= None, and `right_size`=None') - + dataset_as_list = _convert_dataset_to_list(dataset) - + if seed is None: - seed = np.random.randint(1e6) - - if shuffle: + seed = np.random.randint(1e6) + + if shuffle: Random(seed).shuffle(dataset_as_list) - + total_length = len(dataset_as_list) - + left_size,right_size = _rescale_dataset_split_sizes(left_size, right_size, total_length) - + left_split = list(dataset_as_list[:left_size]) right_split = list(dataset_as_list[-right_size:]) if _get_type_spec(dataset) is tuple: left_split = tuple(np.array(sample) for sample in zip(*left_split)) right_split = tuple(np.array(sample) for sample in zip(*right_split)) - + try: left_split = tf.data.Dataset.from_tensor_slices(left_split) - except Exception as e: - raise ValueError(f' with `left_size`={left_size} and ' - f' `right_size`={right_size}' - f' unable to create the dataset from ' - f' left_split of shape {np.array(left_split).shape}. ' - f' Received error: {e}') - + except ValueError as e: + raise ValueError(f'With left_size={left_size} and right_size={right_size}.' + f' unable to create the dataset from left_split of shape ' + f'{np.array(left_split).shape}. \nReceived error: {e}') + try: right_split = tf.data.Dataset.from_tensor_slices(right_split) except Exception as e: @@ -104,10 +102,10 @@ def split_dataset(dataset, f' unable to create the dataset from ' f' right_split of shape {np.array(right_split).shape}. ' f' Received error: {e}') - + left_split = left_split.prefetch(tf.data.AUTOTUNE) right_split = right_split.prefetch(tf.data.AUTOTUNE) - + return left_split, right_split def _convert_dataset_to_list(dataset, @@ -115,24 +113,23 @@ def _convert_dataset_to_list(dataset, ensure_shape_similarity = True): """Helper function to convert a tf.data.Dataset object or a list/tuple of numpy.ndarrays to a list """ - # TODO (prakashsellathurai): fix issue with tuple of numpy arrays dataset=(np.ones(shape=(3, 2)), np.zeros(shape=(3,))) # TODO (prakashsellathurai): add support for Batched and unbatched dict tf datasets dataset_type_spec = _get_type_spec(dataset) dataset_iterator = _get_data_iterator_from_dataset(dataset) dataset_as_list = [] - + start_time = time.time() for sample in _get_next_sample(dataset_iterator, ensure_shape_similarity, data_size_warning_flag, start_time): - - if dataset_type_spec is None: + + if dataset_type_spec is tuple: + dataset_as_list.append(np.array(sample) ) + else: dataset_as_list.append(sample) - elif dataset_type_spec is tuple: - dataset_as_list.append(map(tuple,map(np.array,sample))) - + return dataset_as_list def _get_data_iterator_from_dataset(dataset) : @@ -151,11 +148,11 @@ def _get_data_iterator_from_dataset(dataset) : f'in `dataset` to have the same length. ' f'\n Received: dataset[{i}] with length = ' f'{np.array(element).shape},' - f' while dataset[0] has length {dataset[0].shape}') + f' while dataset[0] has length {dataset[0].shape}') else: raise ValueError('Expected a list of numpy.ndarrays,' 'Received: {}'.format(type(dataset[0]))) - + return iter(zip(*dataset)) elif isinstance(dataset,tuple): if len(dataset) == 0: @@ -170,11 +167,11 @@ def _get_data_iterator_from_dataset(dataset) : f'in `dataset` to have the same length. 
' f'\n Received: dataset[{i}] with length = ' f'{np.array(element).shape},' - f' while dataset[0] has length {dataset[0].shape}') + f' while dataset[0] has length {dataset[0].shape}') else: raise ValueError('Expected a tuple of numpy.ndarrays,' 'Received: {}'.format(type(dataset[0]))) - + return iter(zip(*dataset)) elif isinstance(dataset,tf.data.Dataset): if is_batched(dataset): @@ -193,12 +190,12 @@ def _get_next_sample(dataset_iterator, data_size_warning_flag, start_time): """Helper function to yield samples from a `dataset_iterator` - if `ensure_shape_similarity` is set to True raises error if the + if `ensure_shape_similarity` is set to True raises error if the shapes of the samples are not the same . if `data_size_warning_flag` is set to True, raises warning if the dataset iteration takes more than 10 seconds. """ - + try: dataset_iterator = iter(dataset_iterator) first_sample = next(dataset_iterator) @@ -212,12 +209,12 @@ def _get_next_sample(dataset_iterator, raise ValueError(' Received an empty Dataset i.e dataset with no elements.' ' `dataset` must be a non-empty list/tuple of' ' numpy.ndarrays or tf.data.Dataset objects.') - + if isinstance(first_sample,dict): raise TypeError('`dataset` must be either a tf.data.Dataset object' ' or a list/tuple of arrays. ' ' Received : tf.data.Dataset with dict elements') - + for i,sample in enumerate(dataset_iterator): if ensure_shape_similarity: if first_sample_shape != np.array(sample).shape: @@ -236,19 +233,19 @@ def _get_next_sample(dataset_iterator, ' (e.g. < 10,000 samples).',category=ResourceWarning, source='split_dataset',stacklevel=2) data_size_warning_flag = False - + yield sample - + def _rescale_dataset_split_sizes(left_size,right_size,total_length): - """Helper function to rescale left_size/right_size args relative + """Helper function to rescale left_size/right_size args relative to dataset's size """ - left_size_type = type(left_size) + left_size_type = type(left_size) right_size_type = type(right_size) - if ((left_size is not None and left_size_type not in [int,float]) and + if ((left_size is not None and left_size_type not in [int,float]) and (right_size is not None and right_size_type not in [int,float])): raise TypeError('Invalid `left_size` and `right_size` Types. ' 'Expected: integer or float or None. ' @@ -257,11 +254,11 @@ def _rescale_dataset_split_sizes(left_size,right_size,total_length): if left_size is not None and left_size_type not in [int,float]: raise TypeError(f'Invalid `left_size` Type. Received: {left_size_type}. ' ' Expected: int or float or None') - - if right_size is not None and right_size_type not in [int,float]: + + if right_size is not None and right_size_type not in [int,float]: raise TypeError(f'Invalid `right_size` Type. Received: {right_size_type}.' ' Expected: int or float or None') - + if left_size == 0 and right_size == 0: raise ValueError('Invalid `left_size` and `right_size` values. 
' 'You must specify either `left_size` or `right_size` with ' @@ -269,25 +266,25 @@ def _rescale_dataset_split_sizes(left_size,right_size,total_length): 'or a float within range [0,1] to split the dataset' f'Received: `left_size`={left_size}, ' f'`right_size`={right_size}') - - if (left_size_type == int + + if (left_size_type == int and (left_size <= 0 or left_size>= total_length) - or left_size_type == float + or left_size_type == float and (left_size <= 0 or left_size>= 1) ): raise ValueError('`left_size` should be either a positive integer ' f'and smaller than {total_length} or a float ' 'within the range `[0, 1]`. Received: left_size=' - f'{left_size}') - - if (right_size_type == int - and (right_size <= 0 or right_size>= total_length) - or right_size_type == float + f'{left_size}') + + if (right_size_type == int + and (right_size <= 0 or right_size>= total_length) + or right_size_type == float and (right_size <= 0 or right_size>= 1)): raise ValueError('`right_size` should be either a positive integer ' f'and smaller than {total_length} or ' 'a float within the range `[0, 1]`. Received: right_size=' - f'{right_size}') - + f'{right_size}') + if right_size_type == left_size_type == float and right_size + left_size > 1: raise ValueError('sum of `left_size` and `right_size`' ' should be within `[0,1]`.' @@ -315,26 +312,26 @@ def _rescale_dataset_split_sizes(left_size,right_size,total_length): f' should be smaller than the samples {total_length} ' ' reduce `left_size` or `right_size` ' ) - + for split,side in [(left_size,'left'),(right_size,'right')]: if split == 0: raise ValueError(f'with dataset of length={total_length} ' '`left_size`={left_size} and `right_size`={right_size}, ' f'resulting {side} dataset split will be empty. ' 'Adjust any of the aforementioned parameters') - + left_size,right_size = int(left_size) ,int(right_size) return left_size,right_size - + def is_batched(tf_dataset): """returns true if given tf dataset is batched or false if not - + refer: https://stackoverflow.com/a/66101853/8336491 """ try: return tf_dataset.__class__.__name__ == 'BatchDataset' - except : + except : return False def index_directory(directory, diff --git a/keras/utils/dataset_utils_test.py b/keras/utils/dataset_utils_test.py index 0362b0f1ed8..492262df0b1 100644 --- a/keras/utils/dataset_utils_test.py +++ b/keras/utils/dataset_utils_test.py @@ -1,106 +1,118 @@ -"""Tests for dataset_utils.""" +"""Tests for Dataset Utils""" import tensorflow.compat.v2 as tf +# pylint: disable=g-classes-have-attributes import numpy as np - from keras.utils import dataset_utils -from keras.datasets import mnist - class SplitDatasetTest(tf.test.TestCase): - def test_numpy_array(self): dataset=np.ones(shape=(200, 32)) res = dataset_utils.split_dataset(dataset, left_size=0.8,right_size=0.2) + self.assertLen(res, 2) left_split,right_split = res + self.assertIsInstance(left_split, tf.data.Dataset) self.assertIsInstance(right_split, tf.data.Dataset) - + self.assertLen(left_split, 160) self.assertLen(right_split, 40) + self.assertAllEqual(dataset[:160] ,list(left_split)) self.assertAllEqual(dataset[-40:] ,list(right_split)) - - + def test_list_of_numpy_arrays(self): dataset=[np.ones(shape=(200, 32)), np.zeros(shape=(200, 32))] res = dataset_utils.split_dataset(dataset, left_size=4) + self.assertLen(res, 2) left_split,right_split = res + self.assertIsInstance(left_split, tf.data.Dataset) self.assertIsInstance(right_split, tf.data.Dataset) + self.assertLen(left_split, 4) self.assertLen(right_split, 196) + 
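    # `zip(*dataset)` pairs one row from each input array, so each dataset
    # element is a tuple of rows and the split preserves the original order.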
self.assertAllEqual(list(zip(*dataset))[:4] ,list(left_split)) - self.assertAllEqual(list(zip(*dataset))[4:] ,list(right_split)) - self.assertAllEqual(list(left_split)+list(right_split), - list(zip(*dataset))) - + self.assertAllEqual(list(zip(*dataset))[4:] ,list(right_split)) + self.assertAllEqual(list(left_split)+list(right_split),list(zip(*dataset))) + dataset=[np.ones(shape=(200, 32))] - left_split,right_split = dataset_utils.split_dataset(dataset, + left_split,right_split = dataset_utils.split_dataset(dataset, left_size=4) self.assertAllEqual(list(zip(*dataset)), list(left_split)+list(right_split)) - - def test_dataset_with_irregular_shape(self): + + def test_dataset_with_invalid_shape(self): with self.assertRaises(ValueError): dataset=[np.ones(shape=(200, 32)), np.zeros(shape=(100, 32))] dataset_utils.split_dataset(dataset, left_size=4) - + with self.assertRaises(ValueError): dataset=(np.ones(shape=(200, 32)), np.zeros(shape=(201, 32))) dataset_utils.split_dataset(dataset, left_size=4) - + with self.assertRaises(ValueError): dataset=tf.data.Dataset.from_tensor_slices( np.ones(shape=(200, 32,32,1)), np.zeros(shape=(32))) dataset_utils.split_dataset(dataset, left_size=4) - + with self.assertRaises(ValueError): dataset=tf.data.Dataset.from_tensor_slices( (np.ones(shape=(200, 32,32,1)), np.zeros(shape=(32)))) dataset_utils.split_dataset(dataset, left_size=4) - - - + def test_tuple_of_numpy_arrays(self): - dataset=(np.ones(shape=(200, 32)), np.zeros(shape=(200, 32))) - - left_split,right_split = dataset_utils.split_dataset(dataset, - left_size=80) - - self.assertEqual(len(left_split), 80) - self.assertAllEqual(list(zip(*dataset))[:80] ,list(left_split)) - self.assertAllEqual(list(zip(*dataset))[80:] ,list(right_split)) - - - - dataset=(np.random.rand(4, 3), np.random.rand(4, 1)) - - left_split,right_split = dataset_utils.split_dataset(dataset, - left_size=2) + dataset=(np.random.rand(4, 3), np.random.rand(4, 3)) + left_split,right_split = dataset_utils.split_dataset(dataset, left_size=2) + + self.assertIsInstance(left_split, tf.data.Dataset) + self.assertIsInstance(right_split, tf.data.Dataset) + self.assertEqual(len(left_split), 2) self.assertEqual(len(right_split), 2) - - self.assertEqual(np.array(list(left_split)[0])[0].shape,(3,) ) - self.assertEqual(np.array(list(left_split)[0])[1].shape,(1,) ) - self.assertEqual(np.array(list(right_split)[0])[0].shape,(3,) ) - self.assertEqual(np.array(list(right_split)[0])[1].shape,(1,) ) - - dataset=(np.random.rand(200,32,32), np.random.rand(200,10)) - left_split,right_split = dataset_utils.split_dataset(dataset, - right_size=180) - self.assertEqual(len(right_split), 180) - self.assertEqual(len(left_split), 20) - - self.assertEqual(np.array(list(left_split)[0])[0].shape,(32,32) ) - self.assertEqual(np.array(list(left_split)[0])[1].shape,(10,) ) - self.assertEqual(np.array(list(right_split)[0])[0].shape,(32,32) ) - self.assertEqual(np.array(list(right_split)[0])[1].shape,(10,) ) - - + + self.assertEqual(np.array(list(left_split)[0]).shape, (2, 3)) + self.assertEqual(np.array(list(left_split)[1]).shape, (2, 3)) + + # test with fractional size + dataset = (np.random.rand(5, 32,32), np.random.rand(5, 32,32)) + left_split,right_split = dataset_utils.split_dataset(dataset, + right_size=0.4) + self.assertIsInstance(left_split, tf.data.Dataset) + self.assertIsInstance(right_split, tf.data.Dataset) + + self.assertEqual(np.array(list(left_split)).shape,(3,2,32,32)) + self.assertEqual(np.array(list(right_split)).shape,(2,2,32,32)) + + 
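+    # With 5 samples and right_size=0.4, int(0.4 * 5) = 2 samples go to the
+    # right split and the remaining 3 stay in the left split.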
self.assertEqual(np.array(list(left_split))[0].shape,(2,32,32)) + self.assertEqual(np.array(list(left_split))[1].shape,(2,32,32)) + + self.assertEqual(np.array(list(right_split))[0].shape,(2,32,32)) + self.assertEqual(np.array(list(right_split))[1].shape,(2,32,32)) + + # test with tuple of np arrays with different shapes + dataset = (np.random.rand(5, 32,32), np.random.rand(5, )) + left_split,right_split = dataset_utils.split_dataset(dataset, + left_size=2, + right_size=3) + self.assertIsInstance(left_split, tf.data.Dataset) + self.assertIsInstance(right_split, tf.data.Dataset) + + self.assertEqual(np.array(list(left_split)).shape,(2,2)) + self.assertEqual(np.array(list(right_split)).shape,(3,2)) + + self.assertEqual(np.array(list(left_split)[0]).shape,(2,)) + self.assertEqual(np.array(list(left_split)[0][0]).shape,(32,32)) + self.assertEqual(np.array(list(left_split)[0][1]).shape,()) + + self.assertEqual(np.array(list(right_split)[0]).shape,(2,)) + self.assertEqual(np.array(list(right_split)[0][0]).shape,(32,32)) + self.assertEqual(np.array(list(right_split)[0][1]).shape,()) + def test_batched_tf_dataset_of_vectors(self): dataset = tf.data.Dataset.from_tensor_slices(np.ones(shape=(100,32, 32,1))) dataset = dataset.batch(10) @@ -108,127 +120,74 @@ def test_batched_tf_dataset_of_vectors(self): self.assertAllEqual(np.array(list(left_split)).shape,(2,32,32,1)) self.assertAllEqual(np.array(list(right_split)).shape,(98,32,32,1)) - self.assertAllEqual(list(dataset.unbatch()), - list(left_split)+list(right_split)) - + + dataset = dataset.unbatch() + self.assertAllEqual(list(dataset),list(left_split)+list(right_split)) + def test_batched_tf_dataset_of_tuple_of_vectors(self): - dataset = tf.data.Dataset.from_tensor_slices((np.random.rand(10,32,32), + dataset = tf.data.Dataset.from_tensor_slices((np.random.rand(10,32,32), np.random.rand(10,32,32))) dataset = dataset.batch(2) left_split,right_split=dataset_utils.split_dataset(dataset,left_size=4) - + self.assertAllEqual(np.array(list(left_split)).shape,(4,2,32,32)) self.assertAllEqual(np.array(list(right_split)).shape,(6,2,32,32)) - self.assertAllEqual(list(dataset.unbatch()), - list(left_split)+list(right_split)) - - + dataset = dataset.unbatch() + self.assertAllEqual(list(dataset),list(left_split)+list(right_split)) def test_unbatched_tf_dataset_of_vectors(self): dataset = tf.data.Dataset.from_tensor_slices(np.ones(shape=(100,16, 16,3))) - + left_split,right_split=dataset_utils.split_dataset(dataset,left_size=0.25) - + self.assertAllEqual(np.array(list(left_split)).shape,(25,16, 16,3)) self.assertAllEqual(np.array(list(right_split)).shape,(75,16, 16,3)) - - self.assertAllEqual(list(dataset), - list(left_split)+list(right_split)) - + self.assertAllEqual(list(dataset),list(left_split)+list(right_split)) def test_unbatched_tf_dataset_of_tuple_of_vectors(self): - dataset = tf.data.Dataset.from_tensor_slices((np.random.rand(10,32,32,1), - np.random.rand(10,32,32,1))) + X,Y = (np.random.rand(10,32,32,1),np.random.rand(10,32,32,1)) + dataset = tf.data.Dataset.from_tensor_slices((X,Y)) left_split,right_split=dataset_utils.split_dataset(dataset,left_size=5) - + self.assertAllEqual(np.array(list(left_split)).shape,(5,2,32,32,1)) self.assertAllEqual(np.array(list(right_split)).shape,(5,2,32,32,1)) - - - self.assertAllEqual(list(dataset),list(left_split)+list(right_split)) - - # def test_mnist_dataset(self): - # (x_train,y_train),(_,_) = mnist.load_data() - - # self.assertEqual(x_train.shape,(60000,28,28)) - # self.assertEqual(y_train.shape,(60000,)) - - # 
dataset = tf.data.Dataset.from_tensor_slices((x_train,y_train)) - - # with self.assertWarns(ResourceWarning): - # left_split,right_split = dataset_utils.split_dataset(dataset, - # left_size=0.80) - - - - - - - # def test_unbatched_tf_dataset_of_dicts_of_vectors(self): - # dataset = tf.data.Dataset.from_tensor_slices( - # {'images': np.random.rand(32,32,3), - # 'labels': np.random.rand(32,1) - # }) - # dataset = dataset.batch(2) - # left_split,right_split=dataset_utils.split_dataset(dataset,left_size=5) - # self.assertAllEqual(len(list(dataset.unbatch())),len(list(left_split))) - - - - - + + self.assertAllEqual(list(dataset),list(left_split)+list(right_split)) + def test_list_dataset(self): dataset = [np.ones(shape=(10,10,10)) for _ in range(10)] left_split,right_split = dataset_utils.split_dataset(dataset, - left_size=5, - right_size=5) + left_size=5, + right_size=5) self.assertEqual(len(left_split), len(right_split)) self.assertIsInstance(left_split, tf.data.Dataset) self.assertIsInstance(left_split, tf.data.Dataset) - + dataset = [np.ones(shape=(10,10,10)) for _ in range(10)] left_split,right_split = dataset_utils.split_dataset(dataset, - left_size=0.6, - right_size=0.4) + left_size=0.6, + right_size=0.4) self.assertEqual(len(left_split), 6) self.assertEqual(len(right_split), 4) - - def test_tuple_dataset(self): - dataset = (np.ones(shape=(10,10,10)),np.zeros(shape=(10,10,10))) - left_split,right_split = dataset_utils.split_dataset(dataset, - left_size=0.75, - right_size=0.25) - self.assertLen(left_split, 8) - self.assertLen(right_split, 2) - - left_split,right_split = dataset_utils.split_dataset(dataset, - left_size=0.35, - right_size=0.65) - self.assertLen(left_split, 4) - self.assertLen(right_split, 6) - self.assertIsInstance(left_split, tf.data.Dataset) - self.assertIsInstance(right_split, tf.data.Dataset) - def test_invalid_dataset(self): with self.assertRaises(TypeError): - dataset_utils.split_dataset(dataset=None, left_size=5) + dataset_utils.split_dataset(dataset=None, left_size=5) with self.assertRaises(TypeError): - dataset_utils.split_dataset(dataset=1, left_size=5) + dataset_utils.split_dataset(dataset=1, left_size=5) with self.assertRaises(TypeError): - dataset_utils.split_dataset(dataset=float(1.2), left_size=5) + dataset_utils.split_dataset(dataset=float(1.2), left_size=5) with self.assertRaises(TypeError): - dataset_utils.split_dataset(dataset=dict({}), left_size=5) + dataset_utils.split_dataset(dataset=dict({}), left_size=5) with self.assertRaises(TypeError): dataset_utils.split_dataset(dataset=float('INF'), left_size=5) def test_valid_left_and_right_sizes(self): - dataset = np.array([1,2,3]) - splitted_dataset = dataset_utils.split_dataset(dataset, + splitted_dataset = dataset_utils.split_dataset(dataset, left_size=1, right_size=2) assert(len(splitted_dataset) == 2) @@ -237,37 +196,34 @@ def test_valid_left_and_right_sizes(self): self.assertEqual(len(right_split), 2) self.assertEqual(list(left_split), [1]) self.assertEqual(list(right_split), [2,3]) - - + dataset=np.ones(shape=(200, 32)) res = dataset_utils.split_dataset(dataset, left_size=150,right_size=50) self.assertLen(res, 2) self.assertIsInstance(res[0], tf.data.Dataset) self.assertIsInstance(res[1], tf.data.Dataset) - + self.assertLen(res[0], 150) self.assertLen(res[1], 50) - + dataset=np.ones(shape=(200, 32)) res = dataset_utils.split_dataset(dataset, left_size=120) self.assertLen(res, 2) self.assertIsInstance(res[0], tf.data.Dataset) self.assertIsInstance(res[1], tf.data.Dataset) - + self.assertLen(res[0], 120) 
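    # Only `left_size` was specified, so the right split defaults to the
    # complement: 200 - 120 = 80 samples.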
self.assertLen(res[1], 80) - - + dataset=np.ones(shape=(10000, 16)) res = dataset_utils.split_dataset(dataset, right_size=20) self.assertLen(res, 2) self.assertIsInstance(res[0], tf.data.Dataset) self.assertIsInstance(res[1], tf.data.Dataset) - + self.assertLen(res[0], 9980) self.assertLen(res[1], 20) - - + dataset = np.array([1,2,3,4,5,6,7,8,9,10]) splitted_dataset = dataset_utils.split_dataset(dataset, left_size=0.1, @@ -278,7 +234,7 @@ def test_valid_left_and_right_sizes(self): self.assertEqual(len(right_split), 9 ) self.assertEqual(list(left_split), [1]) self.assertEqual(list(right_split), [2,3,4,5,6,7,8,9,10]) - + dataset = np.array([1,2,3,4,5,6,7,8,9,10]) splitted_dataset = dataset_utils.split_dataset(dataset, left_size=2, @@ -289,17 +245,16 @@ def test_valid_left_and_right_sizes(self): self.assertEqual(len(right_split), 5 ) self.assertEqual(list(left_split), [1,2]) self.assertEqual(list(right_split), [6,7,8,9,10]) - + def test_float_left_and_right_sizes(self): - dataset = tf.data.Dataset.from_tensor_slices(np.array([[0.1,0.2,0.3], - [0.4,0.5,0.6], - [0.7,0.8,0.9]])) - left_split,right_split = dataset_utils.split_dataset(dataset, - left_size=0.8, - right_size=0.2) + X = np.array([[0.1,0.2,0.3],[0.4,0.5,0.6],[0.7,0.8,0.9]]) + dataset = tf.data.Dataset.from_tensor_slices(X) + left_split,right_split = dataset_utils.split_dataset(dataset, + left_size=0.8, + right_size=0.2) self.assertEqual(len(left_split), 2) self.assertEqual(len(right_split), 1) - + def test_invalid_float_left_and_right_sizes(self): with self.assertRaises(ValueError): dataset = [np.ones(shape=(200, 32,32)), np.zeros(shape=(200, 32,32))] @@ -308,50 +263,37 @@ def test_invalid_float_left_and_right_sizes(self): dataset = [1] dataset_utils.split_dataset(dataset, left_size=0.8,right_size=0.2) - - - def test_None_and_zero_left_and_right_size(self): + def test_None_and_zero_left_and_right_size(self): with self.assertRaises(ValueError): - dataset_utils.split_dataset(dataset=np.array([1,2,3]), left_size=None) + dataset_utils.split_dataset(dataset=np.array([1,2,3]), left_size=None) + with self.assertRaises(ValueError): - dataset_utils.split_dataset(np.array([1,2,3]), - left_size=None, - right_size=None) + dataset_utils.split_dataset(np.array([1,2,3]),left_size=None,right_size=None) + with self.assertRaises(ValueError): - dataset_utils.split_dataset(np.array([1,2,3]), - left_size=3, - right_size=None) + dataset_utils.split_dataset(np.array([1,2,3]),left_size=3,right_size=None) + with self.assertRaises(ValueError): - dataset_utils.split_dataset(np.array([1,2,3]), - left_size=3, - right_size=None) + dataset_utils.split_dataset(np.array([1,2,3]),left_size=3,right_size=None) + with self.assertRaises(ValueError): dataset_utils.split_dataset(np.array([1,2,3]), left_size=0,right_size=0) - - def test_invalid_left_and_right_size_types(self): - with self.assertRaises(TypeError): - dataset_utils.split_dataset(np.array([1,2,3]), - left_size='1', - right_size='1') + + def test_invalid_left_and_right_size_types(self): with self.assertRaises(TypeError): - dataset_utils.split_dataset(np.array([1,2,3]), - left_size=0, - right_size='1') + dataset_utils.split_dataset(np.array([1,2,3]),left_size='1',right_size='1') + with self.assertRaises(TypeError): - dataset_utils.split_dataset(np.array([1,2,3]), - left_size='100', - right_size=None) + dataset_utils.split_dataset(np.array([1,2,3]),left_size=0,right_size='1') + with self.assertRaises(TypeError): - dataset_utils.split_dataset(np.array([1,2,3]), - right_size='1') + 
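      # Split sizes must be int, float, or None; string values raise TypeError.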
dataset_utils.split_dataset(np.array([1,2,3]),left_size='100',right_size=None) + with self.assertRaises(TypeError): - dataset_utils.split_dataset(np.array([1,2,3]), - left_size=0.5, - right_size='1') + dataset_utils.split_dataset(np.array([1,2,3]),right_size='1') - - - + with self.assertRaises(TypeError): + dataset_utils.split_dataset(np.array([1,2,3]),left_size=0.5,right_size='1') if __name__ == "__main__": tf.test.main() From a95a2af089473d4fea9fdb26b0d29d6dde8f7466 Mon Sep 17 00:00:00 2001 From: Prakash Sellathurai Date: Tue, 19 Apr 2022 00:19:54 +0530 Subject: [PATCH 13/21] update dataset_utils.py and dataset_utils_test.py --- keras/utils/dataset_utils.py | 89 ++++++++++++++++--------------- keras/utils/dataset_utils_test.py | 50 ++++++++++------- 2 files changed, 77 insertions(+), 62 deletions(-) diff --git a/keras/utils/dataset_utils.py b/keras/utils/dataset_utils.py index 44aa68cbbdf..76234a28a2e 100644 --- a/keras/utils/dataset_utils.py +++ b/keras/utils/dataset_utils.py @@ -66,7 +66,8 @@ def split_dataset(dataset, raise ValueError('you must specify either `left_size` or `right_size`' ' Received: `left_size`= None, and `right_size`=None') - dataset_as_list = _convert_dataset_to_list(dataset) + dataset_type_spec = _get_type_spec(dataset) + dataset_as_list = _convert_dataset_to_list(dataset,dataset_type_spec) if seed is None: seed = np.random.randint(1e6) @@ -76,32 +77,20 @@ def split_dataset(dataset, total_length = len(dataset_as_list) - left_size,right_size = _rescale_dataset_split_sizes(left_size, - right_size, + left_size,right_size = _rescale_dataset_split_sizes(left_size,right_size, total_length) - left_split = list(dataset_as_list[:left_size]) right_split = list(dataset_as_list[-right_size:]) - if _get_type_spec(dataset) is tuple: + if dataset_type_spec is tuple: left_split = tuple(np.array(sample) for sample in zip(*left_split)) right_split = tuple(np.array(sample) for sample in zip(*right_split)) - - try: - left_split = tf.data.Dataset.from_tensor_slices(left_split) - except ValueError as e: - raise ValueError(f'With left_size={left_size} and right_size={right_size}.' - f' unable to create the dataset from left_split of shape ' - f'{np.array(left_split).shape}. \nReceived error: {e}') - - try: - right_split = tf.data.Dataset.from_tensor_slices(right_split) - except Exception as e: - raise ValueError(f' with `left_size`={left_size} and ' - f' `right_size`={right_size}' - f' unable to create the dataset from ' - f' right_split of shape {np.array(right_split).shape}. 
' - f' Received error: {e}') + elif dataset_type_spec is list: + left_split = tuple(np.array(sample) for sample in zip(*left_split)) + right_split = tuple(np.array(sample) for sample in zip(*right_split)) + + left_split = tf.data.Dataset.from_tensor_slices(left_split) + right_split = tf.data.Dataset.from_tensor_slices(right_split) left_split = left_split.prefetch(tf.data.AUTOTUNE) right_split = right_split.prefetch(tf.data.AUTOTUNE) @@ -109,13 +98,14 @@ def split_dataset(dataset, return left_split, right_split def _convert_dataset_to_list(dataset, + dataset_type_spec, data_size_warning_flag= True, ensure_shape_similarity = True): """Helper function to convert a tf.data.Dataset object or a list/tuple of numpy.ndarrays to a list """ # TODO (prakashsellathurai): add support for Batched and unbatched dict tf datasets - dataset_type_spec = _get_type_spec(dataset) - dataset_iterator = _get_data_iterator_from_dataset(dataset) + + dataset_iterator = _get_data_iterator_from_dataset(dataset,dataset_type_spec) dataset_as_list = [] start_time = time.time() @@ -126,35 +116,37 @@ def _convert_dataset_to_list(dataset, if dataset_type_spec is tuple: dataset_as_list.append(np.array(sample) ) + elif dataset_type_spec is list: + dataset_as_list.append(np.array(sample) ) else: dataset_as_list.append(sample) - return dataset_as_list -def _get_data_iterator_from_dataset(dataset) : +def _get_data_iterator_from_dataset(dataset,dataset_type_spec) : """Helper function to get the data iterator from a tf.data.Dataset object """ - if isinstance(dataset,(list)): + if dataset_type_spec == list: if len(dataset) == 0: - raise ValueError('`dataset` must be a non-empty list of' - ' numpy.ndarrays or tf.data.Dataset objects.') + raise ValueError('Received an empty list dataset ' + 'Please provide a non-empty list dataset') if isinstance(dataset[0],np.ndarray): expected_shape = dataset[0].shape for i,element in enumerate(dataset): if not np.array(element).shape[0] == expected_shape[0]: - raise ValueError(f' Expected numpy array of {type(dataset)} ' - f'in `dataset` to have the same length. ' - f'\n Received: dataset[{i}] with length = ' - f'{np.array(element).shape},' - f' while dataset[0] has length {dataset[0].shape}') + raise ValueError('Received a list of numpy arrays with different ' + f'lengths. Mismatch found at index {i}, ' + f'Expected shape={expected_shape} ' + f'Received shape={np.array(element).shape}.' + f'Please provide a list of numpy arrays with ' + f'equal length.') else: raise ValueError('Expected a list of numpy.ndarrays,' 'Received: {}'.format(type(dataset[0]))) return iter(zip(*dataset)) - elif isinstance(dataset,tuple): + elif dataset_type_spec == tuple: if len(dataset) == 0: raise ValueError('`dataset` must be a non-empty list of' ' numpy.ndarrays or tf.data.Dataset objects.') @@ -163,27 +155,37 @@ def _get_data_iterator_from_dataset(dataset) : expected_shape = dataset[0].shape for i,element in enumerate(dataset): if not np.array(element).shape[0] == expected_shape[0]: - raise ValueError(f' Expected numpy array of {type(dataset)} ' - f'in `dataset` to have the same length. ' - f'\n Received: dataset[{i}] with length = ' - f'{np.array(element).shape},' - f' while dataset[0] has length {dataset[0].shape}') + raise ValueError('Received a tuple of numpy arrays with different ' + f'lengths. Mismatch found at index {i}, ' + f'Expected shape={expected_shape} ' + f'Received shape={np.array(element).shape}.' 
+ f'Please provide a tuple of numpy arrays with ' + f'equal length.') else: raise ValueError('Expected a tuple of numpy.ndarrays,' 'Received: {}'.format(type(dataset[0]))) return iter(zip(*dataset)) - elif isinstance(dataset,tf.data.Dataset): + elif dataset_type_spec == tf.data.Dataset: if is_batched(dataset): dataset = dataset.unbatch() return iter(dataset) - elif isinstance(dataset,np.ndarray): + elif dataset_type_spec == np.ndarray: return iter(dataset) def _get_type_spec(dataset): if isinstance(dataset,(tuple)): return tuple - return None + elif isinstance(dataset,(list)): + return list + elif isinstance(dataset,(np.ndarray)): + return np.ndarray + elif isinstance(dataset,dict): + return dict + elif isinstance(dataset,(tf.data.Dataset)): + return tf.data.Dataset + else: + return None def _get_next_sample(dataset_iterator, ensure_shape_similarity, @@ -326,8 +328,7 @@ def _rescale_dataset_split_sizes(left_size,right_size,total_length): def is_batched(tf_dataset): """returns true if given tf dataset is batched or false if not - - refer: https://stackoverflow.com/a/66101853/8336491 + reference: https://stackoverflow.com/a/66101853/8336491 """ try: return tf_dataset.__class__.__name__ == 'BatchDataset' diff --git a/keras/utils/dataset_utils_test.py b/keras/utils/dataset_utils_test.py index 492262df0b1..22914a1ef6e 100644 --- a/keras/utils/dataset_utils_test.py +++ b/keras/utils/dataset_utils_test.py @@ -24,6 +24,7 @@ def test_numpy_array(self): self.assertAllEqual(dataset[-40:] ,list(right_split)) def test_list_of_numpy_arrays(self): + # test with list of np arrays with same shapes dataset=[np.ones(shape=(200, 32)), np.zeros(shape=(200, 32))] res = dataset_utils.split_dataset(dataset, left_size=4) @@ -33,19 +34,24 @@ def test_list_of_numpy_arrays(self): self.assertIsInstance(left_split, tf.data.Dataset) self.assertIsInstance(right_split, tf.data.Dataset) - self.assertLen(left_split, 4) - self.assertLen(right_split, 196) + self.assertEqual(np.array(list(left_split)).shape,(4,2,32)) + self.assertEqual(np.array(list(right_split)).shape,(196,2,32)) - self.assertAllEqual(list(zip(*dataset))[:4] ,list(left_split)) - self.assertAllEqual(list(zip(*dataset))[4:] ,list(right_split)) - self.assertAllEqual(list(left_split)+list(right_split),list(zip(*dataset))) + # test with different shapes + dataset = [np.ones(shape=(5, 3)), np.ones(shape=(5, ))] + left_split,right_split = dataset_utils.split_dataset(dataset,left_size=0.3) + + self.assertEqual(np.array(list(left_split)).shape,(2,2)) + self.assertEqual(np.array(list(right_split)).shape,(3,2)) - dataset=[np.ones(shape=(200, 32))] - left_split,right_split = dataset_utils.split_dataset(dataset, - left_size=4) - self.assertAllEqual(list(zip(*dataset)), - list(left_split)+list(right_split)) + self.assertEqual(np.array(list(left_split)[0]).shape,(2,)) + self.assertEqual(np.array(list(left_split)[0][0]).shape,(3,)) + self.assertEqual(np.array(list(left_split)[0][1]).shape,()) + self.assertEqual(np.array(list(right_split)[0]).shape,(2,)) + self.assertEqual(np.array(list(right_split)[0][0]).shape,(3,)) + self.assertEqual(np.array(list(right_split)[0][1]).shape,()) + def test_dataset_with_invalid_shape(self): with self.assertRaises(ValueError): dataset=[np.ones(shape=(200, 32)), np.zeros(shape=(100, 32))] @@ -268,32 +274,40 @@ def test_None_and_zero_left_and_right_size(self): dataset_utils.split_dataset(dataset=np.array([1,2,3]), left_size=None) with self.assertRaises(ValueError): - dataset_utils.split_dataset(np.array([1,2,3]),left_size=None,right_size=None) + 
dataset_utils.split_dataset(np.array([1,2,3]),left_size=None, + right_size=None) with self.assertRaises(ValueError): - dataset_utils.split_dataset(np.array([1,2,3]),left_size=3,right_size=None) + dataset_utils.split_dataset(np.array([1,2,3]),left_size=3, + right_size=None) with self.assertRaises(ValueError): - dataset_utils.split_dataset(np.array([1,2,3]),left_size=3,right_size=None) + dataset_utils.split_dataset(np.array([1,2,3]),left_size=3, + right_size=None) with self.assertRaises(ValueError): - dataset_utils.split_dataset(np.array([1,2,3]), left_size=0,right_size=0) + dataset_utils.split_dataset(np.array([1,2,3]), left_size=0, + right_size=0) def test_invalid_left_and_right_size_types(self): with self.assertRaises(TypeError): - dataset_utils.split_dataset(np.array([1,2,3]),left_size='1',right_size='1') + dataset_utils.split_dataset(np.array([1,2,3]),left_size='1', + right_size='1') with self.assertRaises(TypeError): - dataset_utils.split_dataset(np.array([1,2,3]),left_size=0,right_size='1') + dataset_utils.split_dataset(np.array([1,2,3]),left_size=0, + right_size='1') with self.assertRaises(TypeError): - dataset_utils.split_dataset(np.array([1,2,3]),left_size='100',right_size=None) + dataset_utils.split_dataset(np.array([1,2,3]),left_size='100', + right_size=None) with self.assertRaises(TypeError): dataset_utils.split_dataset(np.array([1,2,3]),right_size='1') with self.assertRaises(TypeError): - dataset_utils.split_dataset(np.array([1,2,3]),left_size=0.5,right_size='1') + dataset_utils.split_dataset(np.array([1,2,3]),left_size=0.5, + right_size='1') if __name__ == "__main__": tf.test.main() From 607e3f0447a8d129c226b7b750a6db6273d4fbba Mon Sep 17 00:00:00 2001 From: Prakash Sellathurai Date: Tue, 19 Apr 2022 16:53:52 +0530 Subject: [PATCH 14/21] adds doc string to functions --- keras/utils/dataset_utils.py | 297 ++++++++++++++++++++--------------- 1 file changed, 173 insertions(+), 124 deletions(-) diff --git a/keras/utils/dataset_utils.py b/keras/utils/dataset_utils.py index 76234a28a2e..d9aee733a95 100644 --- a/keras/utils/dataset_utils.py +++ b/keras/utils/dataset_utils.py @@ -14,10 +14,6 @@ # ============================================================================== """Keras image dataset loading utilities.""" - - - - import tensorflow.compat.v2 as tf # pylint: disable=g-classes-have-attributes @@ -57,16 +53,17 @@ def split_dataset(dataset, Returns: A tuple of two `tf.data.Dataset` objects: the left and right splits. """ - - if not isinstance(dataset,(tf.data.Dataset,list,tuple,np.ndarray)): - raise TypeError('`dataset` must be either a tf.data.Dataset object' - f' or a list/tuple of arrays. Received : {type(dataset)}') + dataset_type_spec = _get_type_spec(dataset) + + if dataset_type_spec not in [tf.data.Dataset,list,tuple,np.ndarray]: + raise TypeError('`dataset` must be either a tf.data.Dataset object ' + f'or a list/tuple of arrays. 
Received : {type(dataset)}') if right_size is None and left_size is None: - raise ValueError('you must specify either `left_size` or `right_size`' - ' Received: `left_size`= None, and `right_size`=None') + raise ValueError('At least one of the `left_size` or `right_size` ' + 'must be specified .Received: left_size=None and' + 'right_size=None') - dataset_type_spec = _get_type_spec(dataset) dataset_as_list = _convert_dataset_to_list(dataset,dataset_type_spec) if seed is None: @@ -82,13 +79,9 @@ def split_dataset(dataset, left_split = list(dataset_as_list[:left_size]) right_split = list(dataset_as_list[-right_size:]) - if dataset_type_spec is tuple: - left_split = tuple(np.array(sample) for sample in zip(*left_split)) - right_split = tuple(np.array(sample) for sample in zip(*right_split)) - elif dataset_type_spec is list: - left_split = tuple(np.array(sample) for sample in zip(*left_split)) - right_split = tuple(np.array(sample) for sample in zip(*right_split)) - + left_split = _restore_dataset_from_list(left_split,dataset_type_spec) + right_split = _restore_dataset_from_list(right_split,dataset_type_spec) + left_split = tf.data.Dataset.from_tensor_slices(left_split) right_split = tf.data.Dataset.from_tensor_slices(right_split) @@ -101,7 +94,23 @@ def _convert_dataset_to_list(dataset, dataset_type_spec, data_size_warning_flag= True, ensure_shape_similarity = True): - """Helper function to convert a tf.data.Dataset object or a list/tuple of numpy.ndarrays to a list + """Convert a tf.data.Dataset object or a list/tuple of numpy.ndarrays to a list + + Args: + dataset : A `tf.data.Dataset` object or a list/tuple of arrays. + dataset_type_spec : the type of the dataset + data_size_warning_flag (bool, optional): If set to True ,a warning will + be issued if the dataset takes + longer than 10 seconds to + iterate. Defaults to True. + ensure_shape_similarity (bool, optional): If set to True , the shape of + the first sample will be used + to validate the shape of rest + of the samples. + Defaults to True. + + Returns: + List: A list of tuples/numpy arrays. """ # TODO (prakashsellathurai): add support for Batched and unbatched dict tf datasets @@ -113,34 +122,46 @@ def _convert_dataset_to_list(dataset, ensure_shape_similarity, data_size_warning_flag, start_time): - - if dataset_type_spec is tuple: - dataset_as_list.append(np.array(sample) ) - elif dataset_type_spec is list: - dataset_as_list.append(np.array(sample) ) + if dataset_type_spec in [tuple ,list]: + dataset_as_list.append(np.array(sample)) else: dataset_as_list.append(sample) return dataset_as_list def _get_data_iterator_from_dataset(dataset,dataset_type_spec) : - """Helper function to get the data iterator from a tf.data.Dataset object + """Get the iterator from the dataset + + Args: + dataset : A `tf.data.Dataset` object or a list/tuple of arrays. + dataset_type_spec : the type of the dataset + + Raises: + ValueError: + - If the dataset is empty. + - If the dataset is not a `tf.data.Dataset` object + or a list/tuple of arrays. + - If the dataset is a list/tuple of arrays and the + length of the list/tuple is not equal to the number + + Returns: + iterator: An `iterator` object. """ if dataset_type_spec == list: if len(dataset) == 0: - raise ValueError('Received an empty list dataset ' - 'Please provide a non-empty list dataset') + raise ValueError('Received an empty list dataset. 
' + 'Please provide a non-empty list of arrays.') - if isinstance(dataset[0],np.ndarray): + if _get_type_spec(dataset[0]) is np.ndarray: expected_shape = dataset[0].shape for i,element in enumerate(dataset): if not np.array(element).shape[0] == expected_shape[0]: raise ValueError('Received a list of numpy arrays with different ' - f'lengths. Mismatch found at index {i}, ' - f'Expected shape={expected_shape} ' - f'Received shape={np.array(element).shape}.' - f'Please provide a list of numpy arrays with ' - f'equal length.') + f'lengths. Mismatch found at index {i}, ' + f'Expected shape={expected_shape} ' + f'Received shape={np.array(element).shape}.' + f'Please provide a list of numpy arrays with ' + f'same length.') else: raise ValueError('Expected a list of numpy.ndarrays,' 'Received: {}'.format(type(dataset[0]))) @@ -148,19 +169,19 @@ def _get_data_iterator_from_dataset(dataset,dataset_type_spec) : return iter(zip(*dataset)) elif dataset_type_spec == tuple: if len(dataset) == 0: - raise ValueError('`dataset` must be a non-empty list of' - ' numpy.ndarrays or tf.data.Dataset objects.') + raise ValueError('Received an empty list dataset.' + 'Please provide a non-empty tuple of arrays.') - if isinstance(dataset[0],np.ndarray): + if _get_type_spec(dataset[0]) is np.ndarray: expected_shape = dataset[0].shape for i,element in enumerate(dataset): if not np.array(element).shape[0] == expected_shape[0]: raise ValueError('Received a tuple of numpy arrays with different ' - f'lengths. Mismatch found at index {i}, ' - f'Expected shape={expected_shape} ' - f'Received shape={np.array(element).shape}.' - f'Please provide a tuple of numpy arrays with ' - f'equal length.') + f'lengths. Mismatch found at index {i}, ' + f'Expected shape={expected_shape} ' + f'Received shape={np.array(element).shape}.' + f'Please provide a tuple of numpy arrays with ' + 'same length.') else: raise ValueError('Expected a tuple of numpy.ndarrays,' 'Received: {}'.format(type(dataset[0]))) @@ -173,31 +194,35 @@ def _get_data_iterator_from_dataset(dataset,dataset_type_spec) : elif dataset_type_spec == np.ndarray: return iter(dataset) -def _get_type_spec(dataset): - if isinstance(dataset,(tuple)): - return tuple - elif isinstance(dataset,(list)): - return list - elif isinstance(dataset,(np.ndarray)): - return np.ndarray - elif isinstance(dataset,dict): - return dict - elif isinstance(dataset,(tf.data.Dataset)): - return tf.data.Dataset - else: - return None - def _get_next_sample(dataset_iterator, ensure_shape_similarity, data_size_warning_flag, start_time): - """Helper function to yield samples from a `dataset_iterator` - if `ensure_shape_similarity` is set to True raises error if the - shapes of the samples are not the same . - if `data_size_warning_flag` is set to True, raises warning if the - dataset iteration takes more than 10 seconds. + """"Yield data samples from the `dataset_iterator` + + Args: + dataset_iterator : An `iterator` object. + data_size_warning_flag (bool, optional): If set to True ,a warning will + be issued if the dataset takes + longer than 10 seconds to + iterate. Defaults to True. + ensure_shape_similarity (bool, optional): If set to True , the shape of + the first sample will be used + to validate the shape of rest + of the samples. + Defaults to True. + start_time (float): the start time of the dataset iteration. this is + used only if `data_size_warning_flag` is set to true. + + Raises: + ValueError: - If the dataset is empty. 
+ - If `ensure_shape_similarity` is set to True and the + shape of the first sample is not equal to the shape of + atleast one of the rest of the samples. + + yields: + data_sample: A tuple/list of numpy arrays. """ - try: dataset_iterator = iter(dataset_iterator) first_sample = next(dataset_iterator) @@ -208,90 +233,101 @@ def _get_next_sample(dataset_iterator, ensure_shape_similarity = False yield first_sample except StopIteration: - raise ValueError(' Received an empty Dataset i.e dataset with no elements.' - ' `dataset` must be a non-empty list/tuple of' - ' numpy.ndarrays or tf.data.Dataset objects.') - - if isinstance(first_sample,dict): - raise TypeError('`dataset` must be either a tf.data.Dataset object' - ' or a list/tuple of arrays. ' - ' Received : tf.data.Dataset with dict elements') + raise ValueError('Received an empty Dataset. `dataset` must ' + 'be a non-empty list/tuple of numpy.ndarrays ' + 'or tf.data.Dataset objects.') for i,sample in enumerate(dataset_iterator): if ensure_shape_similarity: if first_sample_shape != np.array(sample).shape: - raise ValueError(' All elements of `dataset` must have the same shape,' - f' Expected shape: {np.array(first_sample).shape}' - f' Received shape: {np.array(sample).shape} at' - f' index {i}') - + raise ValueError('All `dataset` samples must have same shape, ' + f'Expected shape: {np.array(first_sample).shape} ' + f'Received shape: {np.array(sample).shape} at index ' + f'{i}.') if data_size_warning_flag: if i % 10 == 0: cur_time = time.time() # warns user if the dataset is too large to iterate within 10s if int(cur_time - start_time) > 10 and data_size_warning_flag: - warnings.warn(' Takes too long time to process the `dataset`,' - ' this function is only for small datasets ' - ' (e.g. < 10,000 samples).',category=ResourceWarning, - source='split_dataset',stacklevel=2) + warnings.warn('The dataset is taking longer than 10 seconds to ' + 'iterate. This may be due to the size of the dataset. ' + 'Please consider using a smaller dataset' + '(e.g. < 10,000 samples). \nTo hide this ' + 'warning message, set `data_size_warning_flag=False`.', + category=ResourceWarning, + source='split_dataset') data_size_warning_flag = False - yield sample +def _restore_dataset_from_list(dataset,dataset_type_spec): + """Restore the dataset from the list of arrays. + """ + if dataset_type_spec in [tuple,list]: + dataset = tuple(np.array(sample) for sample in zip(*dataset)) + return dataset def _rescale_dataset_split_sizes(left_size,right_size,total_length): - """Helper function to rescale left_size/right_size args relative - to dataset's size - """ + """Rescale the dataset split sizes to ensure that the sum of + the split sizes is equal to the total length of the dataset. + + Args: + left_size : The size of the left dataset split. + right_size : The size of the right dataset split. + total_length : The total length of the dataset. + + Raises: + TypeError: - If `left_size` or `right_size` is not an integer or float. + ValueError: - If `left_size` or `right_size` is negative or greater + than 1 or greater than `total_length`. + Returns: + tuple: A tuple of rescaled left_size and right_size + """ left_size_type = type(left_size) right_size_type = type(right_size) + # check both left_size and right_size are integers or floats if ((left_size is not None and left_size_type not in [int,float]) and (right_size is not None and right_size_type not in [int,float])): - raise TypeError('Invalid `left_size` and `right_size` Types. ' - 'Expected: integer or float or None. 
' - f' Received: {left_size_type} and {right_size_type}') - + raise TypeError('Invalid `left_size` and `right_size` Types. Expected: ' + 'integer or float or None, Received: type(left_size)=' + f'{left_size_type} and type(right_size)={right_size_type}') + + # check left_size is a integer or float if left_size is not None and left_size_type not in [int,float]: - raise TypeError(f'Invalid `left_size` Type. Received: {left_size_type}. ' - ' Expected: int or float or None') + raise TypeError('Invalid `left_size` Type.Expected: int or float or None, ' + f'Received: type(left_size)={left_size_type}. ') + # check right_size is a integer or float if right_size is not None and right_size_type not in [int,float]: - raise TypeError(f'Invalid `right_size` Type. Received: {right_size_type}.' - ' Expected: int or float or None') + raise TypeError(f'Invalid `right_size` Type.Expected: int or float or None,' + f'Received: type(right_size)={right_size_type}. ') + # check left_size and right_size are non-zero if left_size == 0 and right_size == 0: - raise ValueError('Invalid `left_size` and `right_size` values. ' - 'You must specify either `left_size` or `right_size` with ' - f'value greater than 0 and less than {total_length} ' - 'or a float within range [0,1] to split the dataset' - f'Received: `left_size`={left_size}, ' - f'`right_size`={right_size}') - - if (left_size_type == int - and (left_size <= 0 or left_size>= total_length) - or left_size_type == float - and (left_size <= 0 or left_size>= 1) ): + raise ValueError('Both `left_size` and `right_size` are zero. ' + 'Atleast one of the split sizes must be non-zero.') + + # check left_size is non-negative and less than 1 and less than total_length + if (left_size_type == int and (left_size <= 0 or left_size>= total_length) + or left_size_type == float and (left_size <= 0 or left_size>= 1) ): raise ValueError('`left_size` should be either a positive integer ' - f'and smaller than {total_length} or a float ' + f'and smaller than {total_length} or a float ' 'within the range `[0, 1]`. Received: left_size=' - f'{left_size}') + f'{left_size}') - if (right_size_type == int - and (right_size <= 0 or right_size>= total_length) - or right_size_type == float - and (right_size <= 0 or right_size>= 1)): + # check right_size is non-negative and less than 1 and less than total_length + if (right_size_type == int and (right_size <= 0 or right_size>= total_length) + or right_size_type == float and (right_size <= 0 or right_size>= 1)): raise ValueError('`right_size` should be either a positive integer ' - f'and smaller than {total_length} or ' - 'a float within the range `[0, 1]`. Received: right_size=' - f'{right_size}') + f'and smaller than {total_length} or a float ' + 'within the range `[0, 1]`. Received: right_size=' + f'{right_size}') + # check sum of left_size and right_size is less than or equal to total_length if right_size_type == left_size_type == float and right_size + left_size > 1: - raise ValueError('sum of `left_size` and `right_size`' - ' should be within `[0,1]`.' - f'Received: {right_size + left_size} ,' - 'reduce the `left_size` or `right_size`') + raise ValueError('The sum of `left_size` and `right_size` is greater ' + 'than 1. 
It must be less than or equal to 1.')
 
   if left_size_type == float:
     left_size = round(left_size*total_length)
@@ -303,31 +339,44 @@ def _rescale_dataset_split_sizes(left_size,right_size,total_length):
   elif right_size_type == int:
     right_size = float(right_size)
 
-
   if left_size is None:
     left_size = total_length - right_size
   elif right_size is None:
     right_size = total_length - left_size
 
   if left_size + right_size > total_length:
-    raise ValueError('The sum of `left_size` and `right_size`'
-                     f' should be smaller than the samples {total_length} '
-                     ' reduce `left_size` or `right_size` ' )
-
+    raise ValueError('The sum of `left_size` and `right_size` should '
+                     f'be smaller than the total length {total_length}. '
+                     f'Received: left_size + right_size = {left_size+right_size} '
+                     f'and total_length = {total_length}')
 
   for split,side in [(left_size,'left'),(right_size,'right')]:
     if split == 0:
-      raise ValueError(f'with dataset of length={total_length} '
-                       '`left_size`={left_size} and `right_size`={right_size}, '
-                       f'resulting {side} dataset split will be empty. '
-                       'Adjust any of the aforementioned parameters')
+      raise ValueError(f'With `dataset` of length={total_length}, `left_size`='
+                       f'{left_size} and `right_size`={right_size}. '
+                       f'Resulting {side} side dataset split will be empty. '
+                       'Adjust any of the aforementioned parameters')
 
   left_size,right_size = int(left_size) ,int(right_size)
 
   return left_size,right_size
 
+def _get_type_spec(dataset):
+  """Get the type spec of the dataset."""
+  if isinstance(dataset,(tuple)):
+    return tuple
+  elif isinstance(dataset,(list)):
+    return list
+  elif isinstance(dataset,(np.ndarray)):
+    return np.ndarray
+  elif isinstance(dataset,dict):
+    return dict
+  elif isinstance(dataset,(tf.data.Dataset)):
+    return tf.data.Dataset
+  else:
+    return None
 
 def is_batched(tf_dataset):
-  """returns true if given tf dataset is batched or false if not
+  """Check if the tf.data.Dataset is batched. 
reference: https://stackoverflow.com/a/66101853/8336491 """ try: From 46130091764f20cccd46c94cd746e37ac4b9ce75 Mon Sep 17 00:00:00 2001 From: Prakash Sellathurai Date: Tue, 19 Apr 2022 19:16:23 +0530 Subject: [PATCH 15/21] adds assertRaisesregex tests --- keras/utils/dataset_utils.py | 2 +- keras/utils/dataset_utils_test.py | 104 ++++++++++++++++++------------ 2 files changed, 63 insertions(+), 43 deletions(-) diff --git a/keras/utils/dataset_utils.py b/keras/utils/dataset_utils.py index d9aee733a95..67124e9ef18 100644 --- a/keras/utils/dataset_utils.py +++ b/keras/utils/dataset_utils.py @@ -61,7 +61,7 @@ def split_dataset(dataset, if right_size is None and left_size is None: raise ValueError('At least one of the `left_size` or `right_size` ' - 'must be specified .Received: left_size=None and' + 'must be specified .Received: left_size=None and ' 'right_size=None') dataset_as_list = _convert_dataset_to_list(dataset,dataset_type_spec) diff --git a/keras/utils/dataset_utils_test.py b/keras/utils/dataset_utils_test.py index 22914a1ef6e..2c33d339c1d 100644 --- a/keras/utils/dataset_utils_test.py +++ b/keras/utils/dataset_utils_test.py @@ -53,24 +53,18 @@ def test_list_of_numpy_arrays(self): self.assertEqual(np.array(list(right_split)[0][1]).shape,()) def test_dataset_with_invalid_shape(self): - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, + 'Received a list of numpy arrays ' + 'with different length'): dataset=[np.ones(shape=(200, 32)), np.zeros(shape=(100, 32))] dataset_utils.split_dataset(dataset, left_size=4) - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError, + 'Received a tuple of numpy arrays ' + 'with different length'): dataset=(np.ones(shape=(200, 32)), np.zeros(shape=(201, 32))) dataset_utils.split_dataset(dataset, left_size=4) - with self.assertRaises(ValueError): - dataset=tf.data.Dataset.from_tensor_slices( - np.ones(shape=(200, 32,32,1)), np.zeros(shape=(32))) - dataset_utils.split_dataset(dataset, left_size=4) - - with self.assertRaises(ValueError): - dataset=tf.data.Dataset.from_tensor_slices( - (np.ones(shape=(200, 32,32,1)), np.zeros(shape=(32)))) - dataset_utils.split_dataset(dataset, left_size=4) - def test_tuple_of_numpy_arrays(self): dataset=(np.random.rand(4, 3), np.random.rand(4, 3)) left_split,right_split = dataset_utils.split_dataset(dataset, left_size=2) @@ -180,22 +174,35 @@ def test_list_dataset(self): self.assertEqual(len(right_split), 4) def test_invalid_dataset(self): - with self.assertRaises(TypeError): + with self.assertRaisesRegex(TypeError, + '`dataset` must be either a tf.data.Dataset ' + f'object or a list/tuple of arrays. Received ' + ': '): dataset_utils.split_dataset(dataset=None, left_size=5) - with self.assertRaises(TypeError): + with self.assertRaisesRegex(TypeError, + '`dataset` must be either a tf.data.Dataset ' + f'object or a list/tuple of arrays. Received ' + ': '): dataset_utils.split_dataset(dataset=1, left_size=5) - with self.assertRaises(TypeError): + with self.assertRaisesRegex(TypeError, + '`dataset` must be either a tf.data.Dataset ' + f'object or a list/tuple of arrays. Received ' + ': '): dataset_utils.split_dataset(dataset=float(1.2), left_size=5) - with self.assertRaises(TypeError): + with self.assertRaisesRegex(TypeError, + '`dataset` must be either a tf.data.Dataset ' + f'object or a list/tuple of arrays. 
Received ' + ': '): dataset_utils.split_dataset(dataset=dict({}), left_size=5) - with self.assertRaises(TypeError): + with self.assertRaisesRegex(TypeError, + '`dataset` must be either a tf.data.Dataset ' + f'object or a list/tuple of arrays. Received ' + ': '): dataset_utils.split_dataset(dataset=float('INF'), left_size=5) def test_valid_left_and_right_sizes(self): dataset = np.array([1,2,3]) - splitted_dataset = dataset_utils.split_dataset(dataset, - left_size=1, - right_size=2) + splitted_dataset = dataset_utils.split_dataset(dataset,1,2) assert(len(splitted_dataset) == 2) left_split,right_split = splitted_dataset self.assertEqual(len(left_split), 1) @@ -262,50 +269,63 @@ def test_float_left_and_right_sizes(self): self.assertEqual(len(right_split), 1) def test_invalid_float_left_and_right_sizes(self): - with self.assertRaises(ValueError): + expected_regex = (r'^(.*?(\bleft_size\b).*?(\bshould be\b)' + r'.*?(\bwithin the range\b).*?(\b0\b).*?(\b1\b))') + with self.assertRaisesRegexp(ValueError,expected_regex): dataset = [np.ones(shape=(200, 32,32)), np.zeros(shape=(200, 32,32))] dataset_utils.split_dataset(dataset, left_size=1.5,right_size=0.2) - with self.assertRaises(ValueError): - dataset = [1] - dataset_utils.split_dataset(dataset, left_size=0.8,right_size=0.2) + + expected_regex = (r'^(.*?(\bright_size\b).*?(\bshould be\b)' + r'.*?(\bwithin the range\b).*?(\b0\b).*?(\b1\b))') + with self.assertRaisesRegex(ValueError,expected_regex): + dataset = [np.ones(shape=(200, 32)), np.zeros(shape=(200, 32))] + dataset_utils.split_dataset(dataset, left_size=0.8,right_size=-0.8) def test_None_and_zero_left_and_right_size(self): - with self.assertRaises(ValueError): - dataset_utils.split_dataset(dataset=np.array([1,2,3]), left_size=None) + expected_regex = (r'^.*?(\bleft_size\b).*?(\bright_size\b).*?(\bmust ' + r'be specified\b).*?(\bReceived: left_size=None and' + r' right_size=None\b)') - with self.assertRaises(ValueError): + with self.assertRaisesRegex(ValueError,expected_regex): + dataset_utils.split_dataset(dataset=np.array([1,2,3]), left_size=None) + with self.assertRaisesRegex(ValueError, expected_regex): dataset_utils.split_dataset(np.array([1,2,3]),left_size=None, right_size=None) - with self.assertRaises(ValueError): - dataset_utils.split_dataset(np.array([1,2,3]),left_size=3, - right_size=None) - - with self.assertRaises(ValueError): - dataset_utils.split_dataset(np.array([1,2,3]),left_size=3, - right_size=None) + expected_regex = (r'^.*?(\bleft_size\b).*?(\bshould be\b)' + r'.*?(\bpositive\b).*?(\bsmaller than 3\b)') + with self.assertRaisesRegex(ValueError,expected_regex): + dataset_utils.split_dataset(np.array([1,2,3]),left_size=3) - with self.assertRaises(ValueError): + expected_regex = ('Both `left_size` and `right_size` are zero. 
' + 'Atleast one of the split sizes must be non-zero.') + with self.assertRaisesRegex(ValueError,expected_regex): dataset_utils.split_dataset(np.array([1,2,3]), left_size=0, right_size=0) def test_invalid_left_and_right_size_types(self): - with self.assertRaises(TypeError): - dataset_utils.split_dataset(np.array([1,2,3]),left_size='1', + expected_regex = (r'^.*?(\bInvalid `left_size` and `right_size` Types' + r'\b).*?(\bExpected: integer or float or None\b)') + with self.assertRaisesRegex(TypeError,expected_regex): + dataset_utils.split_dataset(np.array([1,2,3]), left_size='1', right_size='1') - with self.assertRaises(TypeError): + expected_regex = (r'^.*?(\bInvalid `right_size` Type\b)') + with self.assertRaisesRegex(TypeError,expected_regex): dataset_utils.split_dataset(np.array([1,2,3]),left_size=0, right_size='1') - - with self.assertRaises(TypeError): + + expected_regex = (r'^.*?(\bInvalid `left_size` Type\b)') + with self.assertRaisesRegex(TypeError,expected_regex): dataset_utils.split_dataset(np.array([1,2,3]),left_size='100', right_size=None) - with self.assertRaises(TypeError): + expected_regex = (r'^.*?(\bInvalid `right_size` Type\b)') + with self.assertRaisesRegex(TypeError,expected_regex): dataset_utils.split_dataset(np.array([1,2,3]),right_size='1') - with self.assertRaises(TypeError): + expected_regex = (r'^.*?(\bInvalid `right_size` Type\b)') + with self.assertRaisesRegex(TypeError,expected_regex): dataset_utils.split_dataset(np.array([1,2,3]),left_size=0.5, right_size='1') From 9e0c0ee4316976d854dac359a981a70298e01a4d Mon Sep 17 00:00:00 2001 From: Prakash Sellathurai Date: Tue, 19 Apr 2022 23:24:35 +0530 Subject: [PATCH 16/21] adds support to batched/unbatched dicts of tf datasets --- keras/utils/dataset_utils.py | 43 +++++++++++++---- keras/utils/dataset_utils_test.py | 78 ++++++++++++++++++++++++++++++- 2 files changed, 109 insertions(+), 12 deletions(-) diff --git a/keras/utils/dataset_utils.py b/keras/utils/dataset_utils.py index 67124e9ef18..078066c086d 100644 --- a/keras/utils/dataset_utils.py +++ b/keras/utils/dataset_utils.py @@ -78,13 +78,21 @@ def split_dataset(dataset, total_length) left_split = list(dataset_as_list[:left_size]) right_split = list(dataset_as_list[-right_size:]) - - left_split = _restore_dataset_from_list(left_split,dataset_type_spec) - right_split = _restore_dataset_from_list(right_split,dataset_type_spec) + + left_split = _restore_dataset_from_list(left_split,dataset_type_spec,dataset) + right_split = _restore_dataset_from_list(right_split,dataset_type_spec, + dataset) left_split = tf.data.Dataset.from_tensor_slices(left_split) right_split = tf.data.Dataset.from_tensor_slices(right_split) + # Batch the splits if the `dataset` is batched + if dataset_type_spec is tf.data.Dataset and is_batched(dataset): + batch_size = get_batch_size(dataset) + if batch_size is not None: + left_split = left_split.batch(batch_size) + right_split = right_split.batch(batch_size) + left_split = left_split.prefetch(tf.data.AUTOTUNE) right_split = right_split.prefetch(tf.data.AUTOTUNE) @@ -112,8 +120,6 @@ def _convert_dataset_to_list(dataset, Returns: List: A list of tuples/numpy arrays. 
""" - # TODO (prakashsellathurai): add support for Batched and unbatched dict tf datasets - dataset_iterator = _get_data_iterator_from_dataset(dataset,dataset_type_spec) dataset_as_list = [] @@ -259,12 +265,22 @@ def _get_next_sample(dataset_iterator, data_size_warning_flag = False yield sample -def _restore_dataset_from_list(dataset,dataset_type_spec): - """Restore the dataset from the list of arrays. - """ +def _restore_dataset_from_list(dataset_as_list,dataset_type_spec, + original_dataset): + """Restore the dataset from the list of arrays.""" if dataset_type_spec in [tuple,list]: - dataset = tuple(np.array(sample) for sample in zip(*dataset)) - return dataset + return tuple(np.array(sample) for sample in zip(*dataset_as_list)) + elif dataset_type_spec == tf.data.Dataset: + if type(original_dataset.element_spec) is dict: + restored_dataset = dict() + for d in dataset_as_list: + for k, v in d.items(): + if k not in restored_dataset: + restored_dataset[k] = [np.array(v)] + else: + restored_dataset[k].append(np.array(v)) + return restored_dataset + return dataset_as_list def _rescale_dataset_split_sizes(left_size,right_size,total_length): """Rescale the dataset split sizes to ensure that the sum of @@ -383,6 +399,13 @@ def is_batched(tf_dataset): return tf_dataset.__class__.__name__ == 'BatchDataset' except : return False + +def get_batch_size(tf_dataset): + """Get the batch size of the dataset.""" + if is_batched(tf_dataset): + return tf_dataset._batch_size + else: + return None def index_directory(directory, labels, diff --git a/keras/utils/dataset_utils_test.py b/keras/utils/dataset_utils_test.py index 2c33d339c1d..477a2cf699a 100644 --- a/keras/utils/dataset_utils_test.py +++ b/keras/utils/dataset_utils_test.py @@ -118,9 +118,12 @@ def test_batched_tf_dataset_of_vectors(self): dataset = dataset.batch(10) left_split,right_split=dataset_utils.split_dataset(dataset,left_size=2) + # Ensure that the splits are batched + self.assertAllEqual(np.array(list(right_split)).shape,(10,)) + + left_split,right_split = left_split.unbatch(),right_split.unbatch() self.assertAllEqual(np.array(list(left_split)).shape,(2,32,32,1)) self.assertAllEqual(np.array(list(right_split)).shape,(98,32,32,1)) - dataset = dataset.unbatch() self.assertAllEqual(list(dataset),list(left_split)+list(right_split)) @@ -130,6 +133,11 @@ def test_batched_tf_dataset_of_tuple_of_vectors(self): dataset = dataset.batch(2) left_split,right_split=dataset_utils.split_dataset(dataset,left_size=4) + # Ensure that the splits are batched + self.assertEqual(np.array(list(right_split)).shape,(3, 2, 2, 32, 32)) + self.assertEqual(np.array(list(left_split)).shape,(2, 2, 2, 32, 32)) + + left_split,right_split = left_split.unbatch(),right_split.unbatch() self.assertAllEqual(np.array(list(left_split)).shape,(4,2,32,32)) self.assertAllEqual(np.array(list(right_split)).shape,(6,2,32,32)) @@ -156,7 +164,73 @@ def test_unbatched_tf_dataset_of_tuple_of_vectors(self): self.assertAllEqual(np.array(list(right_split)).shape,(5,2,32,32,1)) self.assertAllEqual(list(dataset),list(left_split)+list(right_split)) - + + def test_unbatched_tf_dataset_of_dict_of_vectors(self): + # test with dict of np arrays of same shape + dict_samples = {'X':np.random.rand(10,2), + 'Y':np.random.rand(10,2)} + dataset = tf.data.Dataset.from_tensor_slices(dict_samples) + left_split,right_split=dataset_utils.split_dataset(dataset,left_size=2) + self.assertEqual(len(list(left_split)),2) + self.assertEqual(len(list(right_split)),8) + for i in range(10): + if i < 2: + 
self.assertEqual(list(left_split)[i],list(dataset)[i]) + else: + self.assertEqual(list(right_split)[i-2],list(dataset)[i]) + + # test with dict of np arrays with different shapes + dict_samples = {'images':np.random.rand(10,16,16,3), + 'labels':np.random.rand(10,)} + dataset = tf.data.Dataset.from_tensor_slices(dict_samples) + left_split,right_split=dataset_utils.split_dataset(dataset,left_size=0.3) + self.assertEqual(len(list(left_split)),3) + self.assertEqual(len(list(right_split)),7) + for i in range(10): + if i < 3: + self.assertEqual(list(left_split)[i],list(dataset)[i]) + else: + self.assertEqual(list(right_split)[i-3],list(dataset)[i]) + + def test_batched_tf_dataset_of_dict_of_vectors(self): + dict_samples = {'X':np.random.rand(10,3), + 'Y':np.random.rand(10,3)} + dataset = tf.data.Dataset.from_tensor_slices(dict_samples) + dataset = dataset.batch(2) + left_split,right_split=dataset_utils.split_dataset(dataset,left_size=2) + + self.assertAllEqual(np.array(list(left_split)).shape,(1,)) + self.assertAllEqual(np.array(list(right_split)).shape,(4,)) + + left_split,right_split = left_split.unbatch(),right_split.unbatch() + self.assertEqual(len(list(left_split)),2) + self.assertEqual(len(list(right_split)),8) + for i in range(10): + if i < 2: + self.assertEqual(list(left_split)[i],list(dataset.unbatch())[i]) + else: + self.assertEqual(list(right_split)[i-2],list(dataset.unbatch())[i]) + + # test with dict of np arrays with different shapes + dict_samples = {'images':np.random.rand(10,16,16,3), + 'labels':np.random.rand(10,)} + dataset = tf.data.Dataset.from_tensor_slices(dict_samples) + dataset = dataset.batch(1) + left_split,right_split=dataset_utils.split_dataset(dataset,right_size=0.3) + + self.assertAllEqual(np.array(list(left_split)).shape,(7,)) + self.assertAllEqual(np.array(list(right_split)).shape,(3,)) + + dataset = dataset.unbatch() + left_split,right_split = left_split.unbatch(),right_split.unbatch() + self.assertEqual(len(list(left_split)),7) + self.assertEqual(len(list(right_split)),3) + for i in range(10): + if i < 7: + self.assertEqual(list(left_split)[i],list(dataset)[i]) + else: + self.assertEqual(list(right_split)[i-7],list(dataset)[i]) + def test_list_dataset(self): dataset = [np.ones(shape=(10,10,10)) for _ in range(10)] left_split,right_split = dataset_utils.split_dataset(dataset, From 3337f8716967b9b5c9c575e73c66cef0a17e891f Mon Sep 17 00:00:00 2001 From: Prakash Sellathurai Date: Wed, 20 Apr 2022 00:07:51 +0530 Subject: [PATCH 17/21] adds mnist dataset test case --- keras/utils/dataset_utils.py | 52 ++++++++++++------------ keras/utils/dataset_utils_test.py | 67 +++++++++++++++++++------------ 2 files changed, 68 insertions(+), 51 deletions(-) diff --git a/keras/utils/dataset_utils.py b/keras/utils/dataset_utils.py index 078066c086d..47ac6e951f1 100644 --- a/keras/utils/dataset_utils.py +++ b/keras/utils/dataset_utils.py @@ -54,7 +54,7 @@ def split_dataset(dataset, A tuple of two `tf.data.Dataset` objects: the left and right splits. """ dataset_type_spec = _get_type_spec(dataset) - + if dataset_type_spec not in [tf.data.Dataset,list,tuple,np.ndarray]: raise TypeError('`dataset` must be either a tf.data.Dataset object ' f'or a list/tuple of arrays. 
Received : {type(dataset)}') @@ -78,7 +78,7 @@ def split_dataset(dataset, total_length) left_split = list(dataset_as_list[:left_size]) right_split = list(dataset_as_list[-right_size:]) - + left_split = _restore_dataset_from_list(left_split,dataset_type_spec,dataset) right_split = _restore_dataset_from_list(right_split,dataset_type_spec, dataset) @@ -86,13 +86,13 @@ def split_dataset(dataset, left_split = tf.data.Dataset.from_tensor_slices(left_split) right_split = tf.data.Dataset.from_tensor_slices(right_split) - # Batch the splits if the `dataset` is batched - if dataset_type_spec is tf.data.Dataset and is_batched(dataset): + # apply batching to the splits if the dataset is batched + if dataset_type_spec is tf.data.Dataset and is_batched(dataset): batch_size = get_batch_size(dataset) if batch_size is not None: left_split = left_split.batch(batch_size) right_split = right_split.batch(batch_size) - + left_split = left_split.prefetch(tf.data.AUTOTUNE) right_split = right_split.prefetch(tf.data.AUTOTUNE) @@ -107,13 +107,13 @@ def _convert_dataset_to_list(dataset, Args: dataset : A `tf.data.Dataset` object or a list/tuple of arrays. dataset_type_spec : the type of the dataset - data_size_warning_flag (bool, optional): If set to True ,a warning will - be issued if the dataset takes - longer than 10 seconds to + data_size_warning_flag (bool, optional): If set to True ,a warning will + be issued if the dataset takes + longer than 10 seconds to iterate. Defaults to True. - ensure_shape_similarity (bool, optional): If set to True , the shape of - the first sample will be used - to validate the shape of rest + ensure_shape_similarity (bool, optional): If set to True , the shape of + the first sample will be used + to validate the shape of rest of the samples. Defaults to True. @@ -143,9 +143,9 @@ def _get_data_iterator_from_dataset(dataset,dataset_type_spec) : dataset_type_spec : the type of the dataset Raises: - ValueError: + ValueError: - If the dataset is empty. - - If the dataset is not a `tf.data.Dataset` object + - If the dataset is not a `tf.data.Dataset` object or a list/tuple of arrays. - If the dataset is a list/tuple of arrays and the length of the list/tuple is not equal to the number @@ -205,16 +205,16 @@ def _get_next_sample(dataset_iterator, data_size_warning_flag, start_time): """"Yield data samples from the `dataset_iterator` - + Args: dataset_iterator : An `iterator` object. - data_size_warning_flag (bool, optional): If set to True ,a warning will - be issued if the dataset takes - longer than 10 seconds to + data_size_warning_flag (bool, optional): If set to True ,a warning will + be issued if the dataset takes + longer than 10 seconds to iterate. Defaults to True. - ensure_shape_similarity (bool, optional): If set to True , the shape of - the first sample will be used - to validate the shape of rest + ensure_shape_similarity (bool, optional): If set to True , the shape of + the first sample will be used + to validate the shape of rest of the samples. Defaults to True. start_time (float): the start time of the dataset iteration. 
this is @@ -267,7 +267,7 @@ def _get_next_sample(dataset_iterator, def _restore_dataset_from_list(dataset_as_list,dataset_type_spec, original_dataset): - """Restore the dataset from the list of arrays.""" + """Restore the dataset from the list of arrays.""" if dataset_type_spec in [tuple,list]: return tuple(np.array(sample) for sample in zip(*dataset_as_list)) elif dataset_type_spec == tf.data.Dataset: @@ -283,9 +283,9 @@ def _restore_dataset_from_list(dataset_as_list,dataset_type_spec, return dataset_as_list def _rescale_dataset_split_sizes(left_size,right_size,total_length): - """Rescale the dataset split sizes to ensure that the sum of + """Rescale the dataset split sizes to ensure that the sum of the split sizes is equal to the total length of the dataset. - + Args: left_size : The size of the left dataset split. right_size : The size of the right dataset split. @@ -297,7 +297,7 @@ def _rescale_dataset_split_sizes(left_size,right_size,total_length): than 1 or greater than `total_length`. Returns: - tuple: A tuple of rescaled left_size and right_size + tuple: A tuple of rescaled left_size and right_size """ left_size_type = type(left_size) right_size_type = type(right_size) @@ -308,7 +308,7 @@ def _rescale_dataset_split_sizes(left_size,right_size,total_length): raise TypeError('Invalid `left_size` and `right_size` Types. Expected: ' 'integer or float or None, Received: type(left_size)=' f'{left_size_type} and type(right_size)={right_size_type}') - + # check left_size is a integer or float if left_size is not None and left_size_type not in [int,float]: raise TypeError('Invalid `left_size` Type.Expected: int or float or None, ' @@ -399,7 +399,7 @@ def is_batched(tf_dataset): return tf_dataset.__class__.__name__ == 'BatchDataset' except : return False - + def get_batch_size(tf_dataset): """Get the batch size of the dataset.""" if is_batched(tf_dataset): diff --git a/keras/utils/dataset_utils_test.py b/keras/utils/dataset_utils_test.py index 477a2cf699a..c0d9de81c22 100644 --- a/keras/utils/dataset_utils_test.py +++ b/keras/utils/dataset_utils_test.py @@ -5,6 +5,7 @@ import numpy as np from keras.utils import dataset_utils +from keras.datasets import mnist class SplitDatasetTest(tf.test.TestCase): def test_numpy_array(self): @@ -40,7 +41,7 @@ def test_list_of_numpy_arrays(self): # test with different shapes dataset = [np.ones(shape=(5, 3)), np.ones(shape=(5, ))] left_split,right_split = dataset_utils.split_dataset(dataset,left_size=0.3) - + self.assertEqual(np.array(list(left_split)).shape,(2,2)) self.assertEqual(np.array(list(right_split)).shape,(3,2)) @@ -51,7 +52,7 @@ def test_list_of_numpy_arrays(self): self.assertEqual(np.array(list(right_split)[0]).shape,(2,)) self.assertEqual(np.array(list(right_split)[0][0]).shape,(3,)) self.assertEqual(np.array(list(right_split)[0][1]).shape,()) - + def test_dataset_with_invalid_shape(self): with self.assertRaisesRegex(ValueError, 'Received a list of numpy arrays ' @@ -120,7 +121,7 @@ def test_batched_tf_dataset_of_vectors(self): # Ensure that the splits are batched self.assertAllEqual(np.array(list(right_split)).shape,(10,)) - + left_split,right_split = left_split.unbatch(),right_split.unbatch() self.assertAllEqual(np.array(list(left_split)).shape,(2,32,32,1)) self.assertAllEqual(np.array(list(right_split)).shape,(98,32,32,1)) @@ -136,7 +137,7 @@ def test_batched_tf_dataset_of_tuple_of_vectors(self): # Ensure that the splits are batched self.assertEqual(np.array(list(right_split)).shape,(3, 2, 2, 32, 32)) 
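The assertions here pin down the re-batching contract introduced in patch 16: when the incoming `tf.data.Dataset` is batched, `split_dataset` splits at the sample level and then re-batches both halves with the original batch size. A minimal standalone sketch of that contract (the shapes and sizes below are illustrative assumptions, not taken from the patch):

import numpy as np
import tensorflow.compat.v2 as tf
from keras.utils import dataset_utils

# 100 samples of shape (8,), batched by 10 -> 10 batches of shape (10, 8).
dataset = tf.data.Dataset.from_tensor_slices(np.ones((100, 8))).batch(10)
left, right = dataset_utils.split_dataset(dataset, left_size=20)

# `left_size` counts samples, and the splits come back re-batched with the
# original batch size: 20 samples -> 2 batches, 80 samples -> 8 batches.
assert len(list(left)) == 2
assert len(list(right)) == 8
for batch in left:
  assert batch.shape == (10, 8)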
self.assertEqual(np.array(list(left_split)).shape,(2, 2, 2, 32, 32)) - + left_split,right_split = left_split.unbatch(),right_split.unbatch() self.assertAllEqual(np.array(list(left_split)).shape,(4,2,32,32)) self.assertAllEqual(np.array(list(right_split)).shape,(6,2,32,32)) @@ -164,7 +165,7 @@ def test_unbatched_tf_dataset_of_tuple_of_vectors(self): self.assertAllEqual(np.array(list(right_split)).shape,(5,2,32,32,1)) self.assertAllEqual(list(dataset),list(left_split)+list(right_split)) - + def test_unbatched_tf_dataset_of_dict_of_vectors(self): # test with dict of np arrays of same shape dict_samples = {'X':np.random.rand(10,2), @@ -178,12 +179,12 @@ def test_unbatched_tf_dataset_of_dict_of_vectors(self): self.assertEqual(list(left_split)[i],list(dataset)[i]) else: self.assertEqual(list(right_split)[i-2],list(dataset)[i]) - + # test with dict of np arrays with different shapes dict_samples = {'images':np.random.rand(10,16,16,3), 'labels':np.random.rand(10,)} dataset = tf.data.Dataset.from_tensor_slices(dict_samples) - left_split,right_split=dataset_utils.split_dataset(dataset,left_size=0.3) + left_split,right_split=dataset_utils.split_dataset(dataset,left_size=0.3) self.assertEqual(len(list(left_split)),3) self.assertEqual(len(list(right_split)),7) for i in range(10): @@ -191,17 +192,17 @@ def test_unbatched_tf_dataset_of_dict_of_vectors(self): self.assertEqual(list(left_split)[i],list(dataset)[i]) else: self.assertEqual(list(right_split)[i-3],list(dataset)[i]) - + def test_batched_tf_dataset_of_dict_of_vectors(self): dict_samples = {'X':np.random.rand(10,3), 'Y':np.random.rand(10,3)} dataset = tf.data.Dataset.from_tensor_slices(dict_samples) dataset = dataset.batch(2) left_split,right_split=dataset_utils.split_dataset(dataset,left_size=2) - + self.assertAllEqual(np.array(list(left_split)).shape,(1,)) - self.assertAllEqual(np.array(list(right_split)).shape,(4,)) - + self.assertAllEqual(np.array(list(right_split)).shape,(4,)) + left_split,right_split = left_split.unbatch(),right_split.unbatch() self.assertEqual(len(list(left_split)),2) self.assertEqual(len(list(right_split)),8) @@ -210,17 +211,17 @@ def test_batched_tf_dataset_of_dict_of_vectors(self): self.assertEqual(list(left_split)[i],list(dataset.unbatch())[i]) else: self.assertEqual(list(right_split)[i-2],list(dataset.unbatch())[i]) - + # test with dict of np arrays with different shapes dict_samples = {'images':np.random.rand(10,16,16,3), 'labels':np.random.rand(10,)} dataset = tf.data.Dataset.from_tensor_slices(dict_samples) dataset = dataset.batch(1) - left_split,right_split=dataset_utils.split_dataset(dataset,right_size=0.3) - + left_split,right_split=dataset_utils.split_dataset(dataset,right_size=0.3) + self.assertAllEqual(np.array(list(left_split)).shape,(7,)) self.assertAllEqual(np.array(list(right_split)).shape,(3,)) - + dataset = dataset.unbatch() left_split,right_split = left_split.unbatch(),right_split.unbatch() self.assertEqual(len(list(left_split)),7) @@ -230,7 +231,7 @@ def test_batched_tf_dataset_of_dict_of_vectors(self): self.assertEqual(list(left_split)[i],list(dataset)[i]) else: self.assertEqual(list(right_split)[i-7],list(dataset)[i]) - + def test_list_dataset(self): dataset = [np.ones(shape=(10,10,10)) for _ in range(10)] left_split,right_split = dataset_utils.split_dataset(dataset, @@ -384,24 +385,40 @@ def test_invalid_left_and_right_size_types(self): dataset_utils.split_dataset(np.array([1,2,3]), left_size='1', right_size='1') - expected_regex = (r'^.*?(\bInvalid `right_size` Type\b)') + expected_regex = 
(r'^.*?(\bInvalid `right_size` Type\b)') with self.assertRaisesRegex(TypeError,expected_regex): dataset_utils.split_dataset(np.array([1,2,3]),left_size=0, right_size='1') - - expected_regex = (r'^.*?(\bInvalid `left_size` Type\b)') - with self.assertRaisesRegex(TypeError,expected_regex): + + expected_regex = (r'^.*?(\bInvalid `left_size` Type\b)') + with self.assertRaisesRegex(TypeError,expected_regex): dataset_utils.split_dataset(np.array([1,2,3]),left_size='100', right_size=None) - expected_regex = (r'^.*?(\bInvalid `right_size` Type\b)') - with self.assertRaisesRegex(TypeError,expected_regex): + expected_regex = (r'^.*?(\bInvalid `right_size` Type\b)') + with self.assertRaisesRegex(TypeError,expected_regex): dataset_utils.split_dataset(np.array([1,2,3]),right_size='1') - - expected_regex = (r'^.*?(\bInvalid `right_size` Type\b)') - with self.assertRaisesRegex(TypeError,expected_regex): + + expected_regex = (r'^.*?(\bInvalid `right_size` Type\b)') + with self.assertRaisesRegex(TypeError,expected_regex): dataset_utils.split_dataset(np.array([1,2,3]),left_size=0.5, right_size='1') + def test_mnist_dataset(self): + (x_train, y_train), (x_test, y_test) = mnist.load_data() + assert x_train.shape == (60000, 28, 28) + assert x_test.shape == (10000, 28, 28) + assert y_train.shape == (60000,) + assert y_test.shape == (10000,) + + dataset = (x_train[:100], y_train[:100]) + left_split,right_split = dataset_utils.split_dataset(dataset,left_size=0.8) + + self.assertIsInstance(left_split, tf.data.Dataset) + self.assertIsInstance(right_split, tf.data.Dataset) + + self.assertEqual(len(left_split), 80) + self.assertEqual(len(right_split), 20) + if __name__ == "__main__": tf.test.main() From dc2beed2dc2c580ee823336cb1348b2f71d537be Mon Sep 17 00:00:00 2001 From: Prakash Sellathurai Date: Wed, 20 Apr 2022 01:07:24 +0530 Subject: [PATCH 18/21] adds testcase for tf dataset of vectors of different shapes --- keras/utils/dataset_utils.py | 2 + keras/utils/dataset_utils_test.py | 102 +++++++++++++++++------------- 2 files changed, 61 insertions(+), 43 deletions(-) diff --git a/keras/utils/dataset_utils.py b/keras/utils/dataset_utils.py index 47ac6e951f1..3faf3bfa085 100644 --- a/keras/utils/dataset_utils.py +++ b/keras/utils/dataset_utils.py @@ -280,6 +280,8 @@ def _restore_dataset_from_list(dataset_as_list,dataset_type_spec, else: restored_dataset[k].append(np.array(v)) return restored_dataset + else: + return tuple(np.array(sample) for sample in zip(*dataset_as_list)) return dataset_as_list def _rescale_dataset_split_sizes(left_size,right_size,total_length): diff --git a/keras/utils/dataset_utils_test.py b/keras/utils/dataset_utils_test.py index c0d9de81c22..dc0bb0e841b 100644 --- a/keras/utils/dataset_utils_test.py +++ b/keras/utils/dataset_utils_test.py @@ -120,7 +120,7 @@ def test_batched_tf_dataset_of_vectors(self): left_split,right_split=dataset_utils.split_dataset(dataset,left_size=2) # Ensure that the splits are batched - self.assertAllEqual(np.array(list(right_split)).shape,(10,)) + self.assertEqual(len(list(right_split)),10) left_split,right_split = left_split.unbatch(),right_split.unbatch() self.assertAllEqual(np.array(list(left_split)).shape,(2,32,32,1)) @@ -145,6 +145,45 @@ def test_batched_tf_dataset_of_tuple_of_vectors(self): dataset = dataset.unbatch() self.assertAllEqual(list(dataset),list(left_split)+list(right_split)) + def test_batched_tf_dataset_of_dict_of_vectors(self): + dict_samples = {'X':np.random.rand(10,3), + 'Y':np.random.rand(10,3)} + dataset = 
tf.data.Dataset.from_tensor_slices(dict_samples) + dataset = dataset.batch(2) + left_split,right_split=dataset_utils.split_dataset(dataset,left_size=2) + + self.assertAllEqual(np.array(list(left_split)).shape,(1,)) + self.assertAllEqual(np.array(list(right_split)).shape,(4,)) + + left_split,right_split = left_split.unbatch(),right_split.unbatch() + self.assertEqual(len(list(left_split)),2) + self.assertEqual(len(list(right_split)),8) + for i in range(10): + if i < 2: + self.assertEqual(list(left_split)[i],list(dataset.unbatch())[i]) + else: + self.assertEqual(list(right_split)[i-2],list(dataset.unbatch())[i]) + + # test with dict of np arrays with different shapes + dict_samples = {'images':np.random.rand(10,16,16,3), + 'labels':np.random.rand(10,)} + dataset = tf.data.Dataset.from_tensor_slices(dict_samples) + dataset = dataset.batch(1) + left_split,right_split=dataset_utils.split_dataset(dataset,right_size=0.3) + + self.assertAllEqual(np.array(list(left_split)).shape,(7,)) + self.assertAllEqual(np.array(list(right_split)).shape,(3,)) + + dataset = dataset.unbatch() + left_split,right_split = left_split.unbatch(),right_split.unbatch() + self.assertEqual(len(list(left_split)),7) + self.assertEqual(len(list(right_split)),3) + for i in range(10): + if i < 7: + self.assertEqual(list(left_split)[i],list(dataset)[i]) + else: + self.assertEqual(list(right_split)[i-7],list(dataset)[i]) + def test_unbatched_tf_dataset_of_vectors(self): dataset = tf.data.Dataset.from_tensor_slices(np.ones(shape=(100,16, 16,3))) @@ -155,17 +194,33 @@ def test_unbatched_tf_dataset_of_vectors(self): self.assertAllEqual(list(dataset),list(left_split)+list(right_split)) + dataset = [np.random.rand(10,3,3) for _ in range(5)] + dataset = tf.data.Dataset.from_tensor_slices(dataset) + + left_split,right_split=dataset_utils.split_dataset(dataset,left_size=2) + self.assertAllEqual(list(dataset),list(left_split)+list(right_split)) + def test_unbatched_tf_dataset_of_tuple_of_vectors(self): + # test with tuple of np arrays with same shape X,Y = (np.random.rand(10,32,32,1),np.random.rand(10,32,32,1)) dataset = tf.data.Dataset.from_tensor_slices((X,Y)) left_split,right_split=dataset_utils.split_dataset(dataset,left_size=5) - self.assertAllEqual(np.array(list(left_split)).shape,(5,2,32,32,1)) - self.assertAllEqual(np.array(list(right_split)).shape,(5,2,32,32,1)) - + self.assertEqual(len(list(left_split)),5) + self.assertEqual(len(list(right_split)),5) self.assertAllEqual(list(dataset),list(left_split)+list(right_split)) + # test with tuple of np arrays with different shapes + X,Y = (np.random.rand(5,3,3),np.random.rand(5,)) + dataset = tf.data.Dataset.from_tensor_slices((X,Y)) + left_split,right_split=dataset_utils.split_dataset(dataset,left_size=0.5) + + self.assertEqual(len(list(left_split)),2) + self.assertEqual(len(list(right_split)),3) + self.assertEqual(np.array(list(left_split)[0][0]).shape,(3,3)) + self.assertEqual(np.array(list(left_split)[0][1]).shape,()) + def test_unbatched_tf_dataset_of_dict_of_vectors(self): # test with dict of np arrays of same shape dict_samples = {'X':np.random.rand(10,2), @@ -193,45 +248,6 @@ def test_unbatched_tf_dataset_of_dict_of_vectors(self): else: self.assertEqual(list(right_split)[i-3],list(dataset)[i]) - def test_batched_tf_dataset_of_dict_of_vectors(self): - dict_samples = {'X':np.random.rand(10,3), - 'Y':np.random.rand(10,3)} - dataset = tf.data.Dataset.from_tensor_slices(dict_samples) - dataset = dataset.batch(2) - left_split,right_split=dataset_utils.split_dataset(dataset,left_size=2) 
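Both the relocated test above and the deleted copy below rely on the dict round trip in `_restore_dataset_from_list`: per-sample dicts are folded back into a single dict of per-key arrays, which `tf.data.Dataset.from_tensor_slices` then slices back into dict elements. A minimal sketch of that round trip (key names and shapes are illustrative assumptions):

import numpy as np
import tensorflow.compat.v2 as tf
from keras.utils import dataset_utils

samples = {'images': np.random.rand(10, 16, 16, 3),
           'labels': np.arange(10)}
dataset = tf.data.Dataset.from_tensor_slices(samples)
left, right = dataset_utils.split_dataset(dataset, left_size=0.3)

# Elements of both splits are still dicts with the original keys.
assert set(next(iter(left)).keys()) == {'images', 'labels'}
assert len(list(left)) == 3
assert len(list(right)) == 7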
-
-    self.assertAllEqual(np.array(list(left_split)).shape,(1,))
-    self.assertAllEqual(np.array(list(right_split)).shape,(4,))
-
-    left_split,right_split = left_split.unbatch(),right_split.unbatch()
-    self.assertEqual(len(list(left_split)),2)
-    self.assertEqual(len(list(right_split)),8)
-    for i in range(10):
-      if i < 2:
-        self.assertEqual(list(left_split)[i],list(dataset.unbatch())[i])
-      else:
-        self.assertEqual(list(right_split)[i-2],list(dataset.unbatch())[i])
-
-    # test with dict of np arrays with different shapes
-    dict_samples = {'images':np.random.rand(10,16,16,3),
-                    'labels':np.random.rand(10,)}
-    dataset = tf.data.Dataset.from_tensor_slices(dict_samples)
-    dataset = dataset.batch(1)
-    left_split,right_split=dataset_utils.split_dataset(dataset,right_size=0.3)
-
-    self.assertAllEqual(np.array(list(left_split)).shape,(7,))
-    self.assertAllEqual(np.array(list(right_split)).shape,(3,))
-
-    dataset = dataset.unbatch()
-    left_split,right_split = left_split.unbatch(),right_split.unbatch()
-    self.assertEqual(len(list(left_split)),7)
-    self.assertEqual(len(list(right_split)),3)
-    for i in range(10):
-      if i < 7:
-        self.assertEqual(list(left_split)[i],list(dataset)[i])
-      else:
-        self.assertEqual(list(right_split)[i-7],list(dataset)[i])
-
   def test_list_dataset(self):
     dataset = [np.ones(shape=(10,10,10)) for _ in range(10)]
     left_split,right_split = dataset_utils.split_dataset(dataset,

From c3a27a6642c03c6380aca22c6e3d73d0b29bb271 Mon Sep 17 00:00:00 2001
From: Prakash Sellathurai
Date: Sat, 23 Apr 2022 00:44:55 +0530
Subject: [PATCH 19/21] fixed import random, removed keras import, fixed
 grammar issues

---
 keras/utils/__init__.py           |  1 +
 keras/utils/dataset_utils.py      | 28 ++++++++++++------------
 keras/utils/dataset_utils_test.py | 34 ++++++++++++++++++++++---------
 3 files changed, 38 insertions(+), 25 deletions(-)

diff --git a/keras/utils/__init__.py b/keras/utils/__init__.py
index 56ec3af0da2..b157619b655 100644
--- a/keras/utils/__init__.py
+++ b/keras/utils/__init__.py
@@ -16,6 +16,7 @@
 # pylint: disable=g-bad-import-order
 from keras.utils.data_utils import get_file
+from keras.utils.dataset_utils import split_dataset
 from keras.utils.generic_utils import Progbar
 from keras.utils.image_dataset import image_dataset_from_directory
 from keras.utils.text_dataset import text_dataset_from_directory
diff --git a/keras/utils/dataset_utils.py b/keras/utils/dataset_utils.py
index 3df972b2ba1..34a6d4ce525 100644
--- a/keras/utils/dataset_utils.py
+++ b/keras/utils/dataset_utils.py
@@ -21,12 +21,10 @@
 import os
 import time
 import warnings
-from random import Random
+import random
 
 import numpy as np
-from tensorflow.python.util.tf_export import keras_export
 
-@keras_export('keras.utils.split_dataset')
 def split_dataset(dataset,
                   left_size=None,
                   right_size=None,
@@ -61,16 +59,16 @@ def split_dataset(dataset,
 
   if right_size is None and left_size is None:
     raise ValueError('At least one of the `left_size` or `right_size` '
-                     'must be specified .Received: left_size=None and '
+                     'must be specified. 
Received: left_size=None and ' 'right_size=None') dataset_as_list = _convert_dataset_to_list(dataset,dataset_type_spec) - if seed is None: - seed = np.random.randint(1e6) - if shuffle: - Random(seed).shuffle(dataset_as_list) + if seed is None: + seed = random.randint(0,1e6) + random.seed(seed) + random.shuffle(dataset_as_list) total_length = len(dataset_as_list) @@ -189,7 +187,7 @@ def _get_data_iterator_from_dataset(dataset,dataset_type_spec) : f'Please provide a tuple of numpy arrays with ' 'same length.') else: - raise ValueError('Expected a tuple of numpy.ndarrays,' + raise ValueError('Expected a tuple of numpy.ndarrays, ' 'Received: {}'.format(type(dataset[0]))) return iter(zip(*dataset)) @@ -271,14 +269,14 @@ def _restore_dataset_from_list(dataset_as_list,dataset_type_spec, if dataset_type_spec in [tuple,list]: return tuple(np.array(sample) for sample in zip(*dataset_as_list)) elif dataset_type_spec == tf.data.Dataset: - if type(original_dataset.element_spec) is dict: - restored_dataset = dict() + if isinstance(original_dataset.element_spec,dict): + restored_dataset = {} for d in dataset_as_list: for k, v in d.items(): if k not in restored_dataset: - restored_dataset[k] = [np.array(v)] + restored_dataset[k] = [v] else: - restored_dataset[k].append(np.array(v)) + restored_dataset[k].append(v) return restored_dataset else: return tuple(np.array(sample) for sample in zip(*dataset_as_list)) @@ -616,8 +614,8 @@ def check_validation_split_arg(validation_split, subset, shuffle, seed): Args: validation_split: float between 0 and 1, fraction of data to reserve for validation. - subset: One of "training", "validation" or "both". Only used if `validation_split` - is set. + subset: One of "training", "validation" or "both". Only used if + `validation_split` is set. shuffle: Whether to shuffle the data. Either True or False. seed: random seed for shuffling and transformations. 
""" diff --git a/keras/utils/dataset_utils_test.py b/keras/utils/dataset_utils_test.py index dc0bb0e841b..44d5c9c3134 100644 --- a/keras/utils/dataset_utils_test.py +++ b/keras/utils/dataset_utils_test.py @@ -247,6 +247,20 @@ def test_unbatched_tf_dataset_of_dict_of_vectors(self): self.assertEqual(list(left_split)[i],list(dataset)[i]) else: self.assertEqual(list(right_split)[i-3],list(dataset)[i]) + + # test with dict of text arrays + dict_samples = {'txt_feature':['abb','bb','cc','d','e','f','g','h','i','j'], + 'label':[1,2,3,4,5,6,7,8,9,10]} + dataset = tf.data.Dataset.from_tensor_slices(dict_samples) + left_split,right_split=dataset_utils.split_dataset(dataset,left_size=0.45, + right_size=0.55) + self.assertEqual(len(list(left_split)),4) + self.assertEqual(len(list(right_split)),6) + for i in range(10): + if i < 4: + self.assertEqual(list(left_split)[i],list(dataset)[i]) + else: + self.assertEqual(list(right_split)[i-4],list(dataset)[i]) def test_list_dataset(self): dataset = [np.ones(shape=(10,10,10)) for _ in range(10)] @@ -362,13 +376,13 @@ def test_float_left_and_right_sizes(self): def test_invalid_float_left_and_right_sizes(self): expected_regex = (r'^(.*?(\bleft_size\b).*?(\bshould be\b)' r'.*?(\bwithin the range\b).*?(\b0\b).*?(\b1\b))') - with self.assertRaisesRegexp(ValueError,expected_regex): + with self.assertRaisesRegexp(ValueError, expected_regex): dataset = [np.ones(shape=(200, 32,32)), np.zeros(shape=(200, 32,32))] dataset_utils.split_dataset(dataset, left_size=1.5,right_size=0.2) expected_regex = (r'^(.*?(\bright_size\b).*?(\bshould be\b)' r'.*?(\bwithin the range\b).*?(\b0\b).*?(\b1\b))') - with self.assertRaisesRegex(ValueError,expected_regex): + with self.assertRaisesRegex(ValueError, expected_regex): dataset = [np.ones(shape=(200, 32)), np.zeros(shape=(200, 32))] dataset_utils.split_dataset(dataset, left_size=0.8,right_size=-0.8) @@ -377,7 +391,7 @@ def test_None_and_zero_left_and_right_size(self): r'be specified\b).*?(\bReceived: left_size=None and' r' right_size=None\b)') - with self.assertRaisesRegex(ValueError,expected_regex): + with self.assertRaisesRegex(ValueError, expected_regex): dataset_utils.split_dataset(dataset=np.array([1,2,3]), left_size=None) with self.assertRaisesRegex(ValueError, expected_regex): dataset_utils.split_dataset(np.array([1,2,3]),left_size=None, @@ -385,38 +399,38 @@ def test_None_and_zero_left_and_right_size(self): expected_regex = (r'^.*?(\bleft_size\b).*?(\bshould be\b)' r'.*?(\bpositive\b).*?(\bsmaller than 3\b)') - with self.assertRaisesRegex(ValueError,expected_regex): + with self.assertRaisesRegex(ValueError, expected_regex): dataset_utils.split_dataset(np.array([1,2,3]),left_size=3) expected_regex = ('Both `left_size` and `right_size` are zero. 
From 4960f1eea7f4f9fdf05bca10bb03cc3e487ef2bc Mon Sep 17 00:00:00 2001
From: Prakash Sellathurai
Date: Sat, 23 Apr 2022 02:37:29 +0530
Subject: [PATCH 20/21] fixed code formatting issues

---
 keras/utils/dataset_utils.py      |  69 +++---
 keras/utils/dataset_utils_test.py | 356 +++++++++++++++---------------
 2 files changed, 219 insertions(+), 206 deletions(-)

diff --git a/keras/utils/dataset_utils.py b/keras/utils/dataset_utils.py
index 34a6d4ce525..b39f2a18b4f 100644
--- a/keras/utils/dataset_utils.py
+++ b/keras/utils/dataset_utils.py
@@ -53,7 +53,7 @@ def split_dataset(dataset,
   """
   dataset_type_spec = _get_type_spec(dataset)
 
-  if dataset_type_spec not in [tf.data.Dataset,list,tuple,np.ndarray]:
+  if dataset_type_spec not in [tf.data.Dataset, list, tuple, np.ndarray]:
     raise TypeError('`dataset` must be either a tf.data.Dataset object '
                     f'or a list/tuple of arrays. Received : {type(dataset)}')
 
@@ -62,23 +62,25 @@ def split_dataset(dataset,
                      'must be specified. Received: left_size=None and '
                      'right_size=None')
 
-  dataset_as_list = _convert_dataset_to_list(dataset,dataset_type_spec)
+  dataset_as_list = _convert_dataset_to_list(dataset, dataset_type_spec)
 
   if shuffle:
     if seed is None:
-      seed = random.randint(0,1e6)
+      seed = random.randint(0, int(1e6))
     random.seed(seed)
     random.shuffle(dataset_as_list)
 
   total_length = len(dataset_as_list)
 
-  left_size,right_size = _rescale_dataset_split_sizes(left_size,right_size,
-                                                      total_length)
+  left_size, right_size = _rescale_dataset_split_sizes(left_size, right_size,
                                                        total_length)
   left_split = list(dataset_as_list[:left_size])
   right_split = list(dataset_as_list[-right_size:])
 
-  left_split = _restore_dataset_from_list(left_split,dataset_type_spec,dataset)
-  right_split = _restore_dataset_from_list(right_split,dataset_type_spec,
+  left_split = _restore_dataset_from_list(left_split,
+                                          dataset_type_spec,
+                                          dataset)
+  right_split = _restore_dataset_from_list(right_split, dataset_type_spec,
                                            dataset)
 
   left_split = tf.data.Dataset.from_tensor_slices(left_split)
@@ -118,7 +120,7 @@ def _convert_dataset_to_list(dataset,
   Returns:
     List: A list of tuples/numpy arrays.
   """
-  dataset_iterator = _get_data_iterator_from_dataset(dataset,dataset_type_spec)
+  dataset_iterator = _get_data_iterator_from_dataset(dataset, dataset_type_spec)
   dataset_as_list = []
 
   start_time = time.time()
@@ -126,14 +128,14 @@ def _convert_dataset_to_list(dataset,
                                  ensure_shape_similarity,
                                  data_size_warning_flag,
                                  start_time):
-    if dataset_type_spec in [tuple ,list]:
+    if dataset_type_spec in [tuple, list]:
       dataset_as_list.append(np.array(sample))
     else:
      dataset_as_list.append(sample)
 
   return dataset_as_list
 
-def _get_data_iterator_from_dataset(dataset,dataset_type_spec) :
+def _get_data_iterator_from_dataset(dataset, dataset_type_spec) :
   """Get the iterator from the dataset
 
   Args:
@@ -158,7 +160,7 @@ def _get_data_iterator_from_dataset(dataset,dataset_type_spec) :
     if _get_type_spec(dataset[0]) is np.ndarray:
       expected_shape = dataset[0].shape
 
-      for i,element in enumerate(dataset):
+      for i, element in enumerate(dataset):
        if not np.array(element).shape[0] == expected_shape[0]:
           raise ValueError('Received a list of numpy arrays with different '
                            f'lengths. Mismatch found at index {i}, '
@@ -178,7 +180,7 @@ def _get_data_iterator_from_dataset(dataset,dataset_type_spec) :
     if _get_type_spec(dataset[0]) is np.ndarray:
       expected_shape = dataset[0].shape
 
-      for i,element in enumerate(dataset):
+      for i, element in enumerate(dataset):
        if not np.array(element).shape[0] == expected_shape[0]:
           raise ValueError('Received a tuple of numpy arrays with different '
                            f'lengths. Mismatch found at index {i}, '
 
@@ -230,7 +232,7 @@ def _get_next_sample(dataset_iterator,
   try:
     dataset_iterator = iter(dataset_iterator)
     first_sample = next(dataset_iterator)
-    if isinstance(first_sample,(tf.Tensor,np.ndarray)):
+    if isinstance(first_sample, (tf.Tensor, np.ndarray)):
       first_sample_shape = np.array(first_sample).shape
     else:
       first_sample_shape = None
@@ -241,7 +243,7 @@ def _get_next_sample(dataset_iterator,
                      'be a non-empty list/tuple of numpy.ndarrays '
                      'or tf.data.Dataset objects.')
 
-  for i,sample in enumerate(dataset_iterator):
+  for i, sample in enumerate(dataset_iterator):
     if ensure_shape_similarity:
       if first_sample_shape != np.array(sample).shape:
         raise ValueError('All `dataset` samples must have same shape, '
@@ -263,13 +265,14 @@ def _get_next_sample(dataset_iterator,
       data_size_warning_flag = False
     yield sample
 
-def _restore_dataset_from_list(dataset_as_list,dataset_type_spec,
+def _restore_dataset_from_list(dataset_as_list,
+                               dataset_type_spec,
                                original_dataset):
   """Restore the dataset from the list of arrays."""
-  if dataset_type_spec in [tuple,list]:
+  if dataset_type_spec in [tuple, list]:
     return tuple(np.array(sample) for sample in zip(*dataset_as_list))
   elif dataset_type_spec == tf.data.Dataset:
-    if isinstance(original_dataset.element_spec,dict):
+    if isinstance(original_dataset.element_spec, dict):
       restored_dataset = {}
       for d in dataset_as_list:
         for k, v in d.items():
           if k not in restored_dataset:
@@ -282,7 +285,7 @@ def _restore_dataset_from_list(dataset_as_list,dataset_type_spec,
       return tuple(np.array(sample) for sample in zip(*dataset_as_list))
   return dataset_as_list
 
-def _rescale_dataset_split_sizes(left_size,right_size,total_length):
+def _rescale_dataset_split_sizes(left_size, right_size, total_length):
   """Rescale the dataset split sizes to ensure that the sum of the split
   sizes is equal to the total length of the dataset.
 
@@ -303,19 +306,19 @@ def _rescale_dataset_split_sizes(left_size,right_size,total_length):
   right_size_type = type(right_size)
 
   # check both left_size and right_size are integers or floats
-  if ((left_size is not None and left_size_type not in [int,float]) and
-      (right_size is not None and right_size_type not in [int,float])):
+  if ((left_size is not None and left_size_type not in [int, float]) and
+      (right_size is not None and right_size_type not in [int, float])):
     raise TypeError('Invalid `left_size` and `right_size` Types. Expected: '
                     'integer or float or None, Received: type(left_size)='
                     f'{left_size_type} and type(right_size)={right_size_type}')
 
   # check left_size is an integer or float
-  if left_size is not None and left_size_type not in [int,float]:
+  if left_size is not None and left_size_type not in [int, float]:
    raise TypeError('Invalid `left_size` Type. Expected: int or float or None, '
                     f'Received: type(left_size)={left_size_type}. ')
 
   # check right_size is an integer or float
-  if right_size is not None and right_size_type not in [int,float]:
+  if right_size is not None and right_size_type not in [int, float]:
    raise TypeError(f'Invalid `right_size` Type. Expected: int or float or None, '
                     f'Received: type(right_size)={right_size_type}. ')
 
@@ -325,16 +328,16 @@ def _rescale_dataset_split_sizes(left_size,right_size,total_length):
                      'At least one of the split sizes must be non-zero.')
 
   # check left_size is non-negative and less than 1 and less than total_length
-  if (left_size_type == int and (left_size <= 0 or left_size>= total_length)
-      or left_size_type == float and (left_size <= 0 or left_size>= 1) ):
+  if (left_size_type == int and (left_size <= 0 or left_size >= total_length)
+      or left_size_type == float and (left_size <= 0 or left_size >= 1) ):
     raise ValueError('`left_size` should be either a positive integer '
                      f'and smaller than {total_length} or a float '
                      'within the range `[0, 1]`. Received: left_size='
                      f'{left_size}')
 
   # check right_size is non-negative and less than 1 and less than total_length
-  if (right_size_type == int and (right_size <= 0 or right_size>= total_length)
-      or right_size_type == float and (right_size <= 0 or right_size>= 1)):
+  if (right_size_type == int and (right_size <= 0 or right_size >= total_length)
+      or right_size_type == float and (right_size <= 0 or right_size >= 1)):
     raise ValueError('`right_size` should be either a positive integer '
                      f'and smaller than {total_length} or a float '
                      'within the range `[0, 1]`. Received: right_size='
                      f'{right_size}')
@@ -366,27 +369,27 @@ def _rescale_dataset_split_sizes(left_size,right_size,total_length):
                      f'Received: left_size + right_size = {left_size+right_size} '
                      f'and total_length = {total_length}')
 
-  for split,side in [(left_size,'left'),(right_size,'right')]:
+  for split, side in [(left_size, 'left'), (right_size, 'right')]:
     if split == 0:
       raise ValueError(f'With `dataset` of length={total_length}, `left_size`='
                        f'{left_size} and `right_size`={right_size}. '
                        f'Resulting {side} side dataset split will be empty. '
                        'Adjust any of the aforementioned parameters')
 
-  left_size,right_size = int(left_size) ,int(right_size)
-  return left_size,right_size
+  left_size, right_size = int(left_size), int(right_size)
+  return left_size, right_size
 
 def _get_type_spec(dataset):
   """Get the type spec of the dataset."""
-  if isinstance(dataset,(tuple)):
+  if isinstance(dataset, tuple):
     return tuple
-  elif isinstance(dataset,(list)):
+  elif isinstance(dataset, list):
     return list
-  elif isinstance(dataset,(np.ndarray)):
+  elif isinstance(dataset, np.ndarray):
    return np.ndarray
-  elif isinstance(dataset,dict):
+  elif isinstance(dataset, dict):
     return dict
-  elif isinstance(dataset,(tf.data.Dataset)):
+  elif isinstance(dataset, tf.data.Dataset):
     return tf.data.Dataset
   else:
     return None
diff --git a/keras/utils/dataset_utils_test.py b/keras/utils/dataset_utils_test.py
index 44d5c9c3134..7b62a2b9812 100644
--- a/keras/utils/dataset_utils_test.py
+++ b/keras/utils/dataset_utils_test.py
@@ -9,11 +9,11 @@ class SplitDatasetTest(tf.test.TestCase):
 
   def test_numpy_array(self):
-    dataset=np.ones(shape=(200, 32))
-    res = dataset_utils.split_dataset(dataset, left_size=0.8,right_size=0.2)
+    dataset = np.ones(shape=(200, 32))
+    res = dataset_utils.split_dataset(dataset, left_size=0.8, right_size=0.2)
     self.assertLen(res, 2)
 
-    left_split,right_split = res
+    left_split, right_split = res
     self.assertIsInstance(left_split, tf.data.Dataset)
     self.assertIsInstance(right_split, tf.data.Dataset)
 
@@ -21,54 +21,55 @@ def test_numpy_array(self):
     self.assertLen(left_split, 160)
     self.assertLen(right_split, 40)
 
-    self.assertAllEqual(dataset[:160] ,list(left_split))
-    self.assertAllEqual(dataset[-40:] ,list(right_split))
+    self.assertAllEqual(dataset[:160], list(left_split))
+    self.assertAllEqual(dataset[-40:], list(right_split))
 
   def test_list_of_numpy_arrays(self):
     # test with list of np arrays with same shapes
-    dataset=[np.ones(shape=(200, 32)), np.zeros(shape=(200, 32))]
+    dataset = [np.ones(shape=(200, 32)), np.zeros(shape=(200, 32))]
     res = dataset_utils.split_dataset(dataset, left_size=4)
     self.assertLen(res, 2)
 
-    left_split,right_split = res
+    left_split, right_split = res
     self.assertIsInstance(left_split, tf.data.Dataset)
     self.assertIsInstance(right_split, tf.data.Dataset)
 
-    self.assertEqual(np.array(list(left_split)).shape,(4,2,32))
-    self.assertEqual(np.array(list(right_split)).shape,(196,2,32))
+    self.assertEqual(np.array(list(left_split)).shape, (4, 2, 32))
+    self.assertEqual(np.array(list(right_split)).shape, (196, 2, 32))
 
     # test with different shapes
     dataset = [np.ones(shape=(5, 3)), np.ones(shape=(5, ))]
-    left_split,right_split = dataset_utils.split_dataset(dataset,left_size=0.3)
+    left_split, right_split = dataset_utils.split_dataset(dataset,
+                                                          left_size=0.3)
 
-    self.assertEqual(np.array(list(left_split)).shape,(2,2))
-    self.assertEqual(np.array(list(right_split)).shape,(3,2))
+    self.assertEqual(np.array(list(left_split)).shape, (2, 2))
+    self.assertEqual(np.array(list(right_split)).shape, (3, 2))
 
-    self.assertEqual(np.array(list(left_split)[0]).shape,(2,))
-    self.assertEqual(np.array(list(left_split)[0][0]).shape,(3,))
-    self.assertEqual(np.array(list(left_split)[0][1]).shape,())
+    self.assertEqual(np.array(list(left_split)[0]).shape, (2,))
+    self.assertEqual(np.array(list(left_split)[0][0]).shape, (3,))
+    self.assertEqual(np.array(list(left_split)[0][1]).shape, ())
 
-    self.assertEqual(np.array(list(right_split)[0]).shape,(2,))
-    self.assertEqual(np.array(list(right_split)[0][0]).shape,(3,))
-    self.assertEqual(np.array(list(right_split)[0][1]).shape,())
+    self.assertEqual(np.array(list(right_split)[0]).shape, (2,))
+    self.assertEqual(np.array(list(right_split)[0][0]).shape, (3,))
+    self.assertEqual(np.array(list(right_split)[0][1]).shape, ())
 
   def test_dataset_with_invalid_shape(self):
     with self.assertRaisesRegex(ValueError,
                                 'Received a list of numpy arrays '
                                 'with different length'):
-      dataset=[np.ones(shape=(200, 32)), np.zeros(shape=(100, 32))]
+      dataset = [np.ones(shape=(200, 32)), np.zeros(shape=(100, 32))]
       dataset_utils.split_dataset(dataset, left_size=4)
 
     with self.assertRaisesRegex(ValueError,
                                 'Received a tuple of numpy arrays '
                                 'with different length'):
-      dataset=(np.ones(shape=(200, 32)), np.zeros(shape=(201, 32)))
+      dataset = (np.ones(shape=(200, 32)), np.zeros(shape=(201, 32)))
       dataset_utils.split_dataset(dataset, left_size=4)
 
   def test_tuple_of_numpy_arrays(self):
-    dataset=(np.random.rand(4, 3), np.random.rand(4, 3))
-    left_split,right_split = dataset_utils.split_dataset(dataset, left_size=2)
+    dataset = (np.random.rand(4, 3), np.random.rand(4, 3))
+    left_split, right_split = dataset_utils.split_dataset(dataset, left_size=2)
 
     self.assertIsInstance(left_split, tf.data.Dataset)
     self.assertIsInstance(right_split, tf.data.Dataset)
@@ -80,243 +81,251 @@ def test_tuple_of_numpy_arrays(self):
     self.assertEqual(np.array(list(left_split)[1]).shape, (2, 3))
 
     # test with fractional size
-    dataset = (np.random.rand(5, 32,32), np.random.rand(5, 32,32))
-    left_split,right_split = dataset_utils.split_dataset(dataset,
-                                                          right_size=0.4)
+    dataset = (np.random.rand(5, 32, 32), np.random.rand(5, 32, 32))
+    left_split, right_split = dataset_utils.split_dataset(dataset,
+                                                          right_size=0.4)
     self.assertIsInstance(left_split, tf.data.Dataset)
     self.assertIsInstance(right_split, tf.data.Dataset)
 
-    self.assertEqual(np.array(list(left_split)).shape,(3,2,32,32))
-    self.assertEqual(np.array(list(right_split)).shape,(2,2,32,32))
+    self.assertEqual(np.array(list(left_split)).shape, (3, 2, 32, 32))
+    self.assertEqual(np.array(list(right_split)).shape, (2, 2, 32, 32))
 
-    self.assertEqual(np.array(list(left_split))[0].shape,(2,32,32))
-    self.assertEqual(np.array(list(left_split))[1].shape,(2,32,32))
+    self.assertEqual(np.array(list(left_split))[0].shape, (2, 32, 32))
+    self.assertEqual(np.array(list(left_split))[1].shape, (2, 32, 32))
 
-    self.assertEqual(np.array(list(right_split))[0].shape,(2,32,32))
-    self.assertEqual(np.array(list(right_split))[1].shape,(2,32,32))
+    self.assertEqual(np.array(list(right_split))[0].shape, (2, 32, 32))
+    self.assertEqual(np.array(list(right_split))[1].shape, (2, 32, 32))
 
     # test with tuple of np arrays with different shapes
-    dataset = (np.random.rand(5, 32,32), np.random.rand(5, ))
-    left_split,right_split = dataset_utils.split_dataset(dataset,
-                                                          left_size=2,
-                                                          right_size=3)
+    dataset = (np.random.rand(5, 32, 32), np.random.rand(5,))
+    left_split, right_split = dataset_utils.split_dataset(dataset,
+                                                          left_size=2,
+                                                          right_size=3)
 
     self.assertIsInstance(left_split, tf.data.Dataset)
     self.assertIsInstance(right_split, tf.data.Dataset)
 
-    self.assertEqual(np.array(list(left_split)).shape,(2,2))
-    self.assertEqual(np.array(list(right_split)).shape,(3,2))
+    self.assertEqual(np.array(list(left_split)).shape, (2, 2))
+    self.assertEqual(np.array(list(right_split)).shape, (3, 2))
 
-    self.assertEqual(np.array(list(left_split)[0]).shape,(2,))
-    self.assertEqual(np.array(list(left_split)[0][0]).shape,(32,32))
-    self.assertEqual(np.array(list(left_split)[0][1]).shape,())
+    self.assertEqual(np.array(list(left_split)[0]).shape, (2,))
+    self.assertEqual(np.array(list(left_split)[0][0]).shape, (32, 32))
+    self.assertEqual(np.array(list(left_split)[0][1]).shape, ())
 
-    self.assertEqual(np.array(list(right_split)[0]).shape,(2,))
-    self.assertEqual(np.array(list(right_split)[0][0]).shape,(32,32))
-    self.assertEqual(np.array(list(right_split)[0][1]).shape,())
+    self.assertEqual(np.array(list(right_split)[0]).shape, (2,))
+    self.assertEqual(np.array(list(right_split)[0][0]).shape, (32, 32))
+    self.assertEqual(np.array(list(right_split)[0][1]).shape, ())
 
   def test_batched_tf_dataset_of_vectors(self):
-    dataset = tf.data.Dataset.from_tensor_slices(np.ones(shape=(100,32, 32,1)))
+    vectors = np.ones(shape=(100, 32, 32, 1))
+    dataset = tf.data.Dataset.from_tensor_slices(vectors)
     dataset = dataset.batch(10)
-    left_split,right_split=dataset_utils.split_dataset(dataset,left_size=2)
+    left_split, right_split = dataset_utils.split_dataset(dataset, left_size=2)
 
     # Ensure that the splits are batched
-    self.assertEqual(len(list(right_split)),10)
+    self.assertEqual(len(list(right_split)), 10)
 
-    left_split,right_split = left_split.unbatch(),right_split.unbatch()
-    self.assertAllEqual(np.array(list(left_split)).shape,(2,32,32,1))
-    self.assertAllEqual(np.array(list(right_split)).shape,(98,32,32,1))
+    left_split, right_split = left_split.unbatch(), right_split.unbatch()
+    self.assertAllEqual(np.array(list(left_split)).shape, (2, 32, 32, 1))
+    self.assertAllEqual(np.array(list(right_split)).shape, (98, 32, 32, 1))
 
     dataset = dataset.unbatch()
-    self.assertAllEqual(list(dataset),list(left_split)+list(right_split))
+    self.assertAllEqual(list(dataset), list(left_split)+list(right_split))
 
   def test_batched_tf_dataset_of_tuple_of_vectors(self):
-    dataset = tf.data.Dataset.from_tensor_slices((np.random.rand(10,32,32),
-                                                  np.random.rand(10,32,32)))
+    tuple_of_vectors = (np.random.rand(10, 32, 32), np.random.rand(10, 32, 32))
+    dataset = tf.data.Dataset.from_tensor_slices(tuple_of_vectors)
     dataset = dataset.batch(2)
-    left_split,right_split=dataset_utils.split_dataset(dataset,left_size=4)
+    left_split, right_split = dataset_utils.split_dataset(dataset, left_size=4)
 
     # Ensure that the splits are batched
-    self.assertEqual(np.array(list(right_split)).shape,(3, 2, 2, 32, 32))
-    self.assertEqual(np.array(list(left_split)).shape,(2, 2, 2, 32, 32))
+    self.assertEqual(np.array(list(right_split)).shape, (3, 2, 2, 32, 32))
+    self.assertEqual(np.array(list(left_split)).shape, (2, 2, 2, 32, 32))
 
-    left_split,right_split = left_split.unbatch(),right_split.unbatch()
-    self.assertAllEqual(np.array(list(left_split)).shape,(4,2,32,32))
-    self.assertAllEqual(np.array(list(right_split)).shape,(6,2,32,32))
+    left_split, right_split = left_split.unbatch(), right_split.unbatch()
+    self.assertAllEqual(np.array(list(left_split)).shape, (4, 2, 32, 32))
+    self.assertAllEqual(np.array(list(right_split)).shape, (6, 2, 32, 32))
 
     dataset = dataset.unbatch()
-    self.assertAllEqual(list(dataset),list(left_split)+list(right_split))
+    self.assertAllEqual(list(dataset), list(left_split)+list(right_split))
 
   def test_batched_tf_dataset_of_dict_of_vectors(self):
-    dict_samples = {'X':np.random.rand(10,3),
-                    'Y':np.random.rand(10,3)}
+    dict_samples = {'X': np.random.rand(10, 3),
+                    'Y': np.random.rand(10, 3)}
     dataset = tf.data.Dataset.from_tensor_slices(dict_samples)
     dataset = dataset.batch(2)
-    left_split,right_split=dataset_utils.split_dataset(dataset,left_size=2)
+    left_split, right_split = dataset_utils.split_dataset(dataset, left_size=2)
 
-    self.assertAllEqual(np.array(list(left_split)).shape,(1,))
-    self.assertAllEqual(np.array(list(right_split)).shape,(4,))
+    self.assertAllEqual(np.array(list(left_split)).shape, (1,))
+    self.assertAllEqual(np.array(list(right_split)).shape, (4,))
 
-    left_split,right_split = left_split.unbatch(),right_split.unbatch()
-    self.assertEqual(len(list(left_split)),2)
-    self.assertEqual(len(list(right_split)),8)
+    left_split, right_split = left_split.unbatch(), right_split.unbatch()
+    self.assertEqual(len(list(left_split)), 2)
+    self.assertEqual(len(list(right_split)), 8)
 
     for i in range(10):
       if i < 2:
-        self.assertEqual(list(left_split)[i],list(dataset.unbatch())[i])
+        self.assertEqual(list(left_split)[i], list(dataset.unbatch())[i])
       else:
-        self.assertEqual(list(right_split)[i-2],list(dataset.unbatch())[i])
+        self.assertEqual(list(right_split)[i-2], list(dataset.unbatch())[i])
 
     # test with dict of np arrays with different shapes
-    dict_samples = {'images':np.random.rand(10,16,16,3),
-                    'labels':np.random.rand(10,)}
+    dict_samples = {'images': np.random.rand(10, 16, 16, 3),
+                    'labels': np.random.rand(10,)}
     dataset = tf.data.Dataset.from_tensor_slices(dict_samples)
     dataset = dataset.batch(1)
-    left_split,right_split=dataset_utils.split_dataset(dataset,right_size=0.3)
+    left_split, right_split = dataset_utils.split_dataset(dataset,
+                                                          right_size=0.3)
 
-    self.assertAllEqual(np.array(list(left_split)).shape,(7,))
-    self.assertAllEqual(np.array(list(right_split)).shape,(3,))
+    self.assertAllEqual(np.array(list(left_split)).shape, (7,))
+    self.assertAllEqual(np.array(list(right_split)).shape, (3,))
 
     dataset = dataset.unbatch()
-    left_split,right_split = left_split.unbatch(),right_split.unbatch()
-    self.assertEqual(len(list(left_split)),7)
-    self.assertEqual(len(list(right_split)),3)
+    left_split, right_split = left_split.unbatch(), right_split.unbatch()
+    self.assertEqual(len(list(left_split)), 7)
+    self.assertEqual(len(list(right_split)), 3)
 
     for i in range(10):
       if i < 7:
-        self.assertEqual(list(left_split)[i],list(dataset)[i])
+        self.assertEqual(list(left_split)[i], list(dataset)[i])
      else:
-        self.assertEqual(list(right_split)[i-7],list(dataset)[i])
+        self.assertEqual(list(right_split)[i-7], list(dataset)[i])
 
   def test_unbatched_tf_dataset_of_vectors(self):
-    dataset = tf.data.Dataset.from_tensor_slices(np.ones(shape=(100,16, 16,3)))
+    vectors = np.ones(shape=(100, 16, 16, 3))
+    dataset = tf.data.Dataset.from_tensor_slices(vectors)
 
-    left_split,right_split=dataset_utils.split_dataset(dataset,left_size=0.25)
+    left_split, right_split = dataset_utils.split_dataset(dataset,
+                                                          left_size=0.25)
 
-    self.assertAllEqual(np.array(list(left_split)).shape,(25,16, 16,3))
-    self.assertAllEqual(np.array(list(right_split)).shape,(75,16, 16,3))
+    self.assertAllEqual(np.array(list(left_split)).shape, (25, 16, 16, 3))
+    self.assertAllEqual(np.array(list(right_split)).shape, (75, 16, 16, 3))
 
-    self.assertAllEqual(list(dataset),list(left_split)+list(right_split))
+    self.assertAllEqual(list(dataset), list(left_split)+list(right_split))
 
-    dataset = [np.random.rand(10,3,3) for _ in range(5)]
+    dataset = [np.random.rand(10, 3, 3) for _ in range(5)]
     dataset = tf.data.Dataset.from_tensor_slices(dataset)
-    left_split,right_split=dataset_utils.split_dataset(dataset,left_size=2)
-    self.assertAllEqual(list(dataset),list(left_split)+list(right_split))
+    left_split, right_split = dataset_utils.split_dataset(dataset, left_size=2)
+    self.assertAllEqual(list(dataset), list(left_split)+list(right_split))
 
   def test_unbatched_tf_dataset_of_tuple_of_vectors(self):
     # test with tuple of np arrays with same shape
-    X,Y = (np.random.rand(10,32,32,1),np.random.rand(10,32,32,1))
-    dataset = tf.data.Dataset.from_tensor_slices((X,Y))
+    X, Y = (np.random.rand(10, 32, 32, 1), np.random.rand(10, 32, 32, 1))
+    dataset = tf.data.Dataset.from_tensor_slices((X, Y))
 
-    left_split,right_split=dataset_utils.split_dataset(dataset,left_size=5)
+    left_split, right_split = dataset_utils.split_dataset(dataset, left_size=5)
 
-    self.assertEqual(len(list(left_split)),5)
-    self.assertEqual(len(list(right_split)),5)
-    self.assertAllEqual(list(dataset),list(left_split)+list(right_split))
+    self.assertEqual(len(list(left_split)), 5)
+    self.assertEqual(len(list(right_split)), 5)
+    self.assertAllEqual(list(dataset), list(left_split)+list(right_split))
 
     # test with tuple of np arrays with different shapes
-    X,Y = (np.random.rand(5,3,3),np.random.rand(5,))
-    dataset = tf.data.Dataset.from_tensor_slices((X,Y))
-    left_split,right_split=dataset_utils.split_dataset(dataset,left_size=0.5)
+    X, Y = (np.random.rand(5, 3, 3), np.random.rand(5,))
+    dataset = tf.data.Dataset.from_tensor_slices((X, Y))
+    left_split, right_split = dataset_utils.split_dataset(dataset,
+                                                          left_size=0.5)
 
-    self.assertEqual(len(list(left_split)),2)
-    self.assertEqual(len(list(right_split)),3)
-    self.assertEqual(np.array(list(left_split)[0][0]).shape,(3,3))
-    self.assertEqual(np.array(list(left_split)[0][1]).shape,())
+    self.assertEqual(len(list(left_split)), 2)
+    self.assertEqual(len(list(right_split)), 3)
+    self.assertEqual(np.array(list(left_split)[0][0]).shape, (3, 3))
+    self.assertEqual(np.array(list(left_split)[0][1]).shape, ())
 
   def test_unbatched_tf_dataset_of_dict_of_vectors(self):
     # test with dict of np arrays of same shape
-    dict_samples = {'X':np.random.rand(10,2),
-                    'Y':np.random.rand(10,2)}
+    dict_samples = {'X': np.random.rand(10, 2),
+                    'Y': np.random.rand(10, 2)}
     dataset = tf.data.Dataset.from_tensor_slices(dict_samples)
-    left_split,right_split=dataset_utils.split_dataset(dataset,left_size=2)
-    self.assertEqual(len(list(left_split)),2)
-    self.assertEqual(len(list(right_split)),8)
+    left_split, right_split = dataset_utils.split_dataset(dataset, left_size=2)
+    self.assertEqual(len(list(left_split)), 2)
+    self.assertEqual(len(list(right_split)), 8)
 
     for i in range(10):
       if i < 2:
-        self.assertEqual(list(left_split)[i],list(dataset)[i])
+        self.assertEqual(list(left_split)[i], list(dataset)[i])
       else:
-        self.assertEqual(list(right_split)[i-2],list(dataset)[i])
+        self.assertEqual(list(right_split)[i-2], list(dataset)[i])
 
     # test with dict of np arrays with different shapes
-    dict_samples = {'images':np.random.rand(10,16,16,3),
-                    'labels':np.random.rand(10,)}
+    dict_samples = {'images': np.random.rand(10, 16, 16, 3),
+                    'labels': np.random.rand(10,)}
     dataset = tf.data.Dataset.from_tensor_slices(dict_samples)
-    left_split,right_split=dataset_utils.split_dataset(dataset,left_size=0.3)
-    self.assertEqual(len(list(left_split)),3)
-    self.assertEqual(len(list(right_split)),7)
+    left_split, right_split = dataset_utils.split_dataset(dataset,
+                                                          left_size=0.3)
+    self.assertEqual(len(list(left_split)), 3)
+    self.assertEqual(len(list(right_split)), 7)
 
     for i in range(10):
       if i < 3:
-        self.assertEqual(list(left_split)[i],list(dataset)[i])
+        self.assertEqual(list(left_split)[i], list(dataset)[i])
       else:
-        self.assertEqual(list(right_split)[i-3],list(dataset)[i])
-
+        self.assertEqual(list(right_split)[i-3], list(dataset)[i])
 
     # test with dict of text arrays
-    dict_samples = {'txt_feature':['abb','bb','cc','d','e','f','g','h','i','j'],
-                    'label':[1,2,3,4,5,6,7,8,9,10]}
+    txt_feature = ['abb', 'bb', 'cc', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
+    dict_samples = {'txt_feature': txt_feature,
+                    'label': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}
     dataset = tf.data.Dataset.from_tensor_slices(dict_samples)
-    left_split,right_split=dataset_utils.split_dataset(dataset,left_size=0.45,
-                                                       right_size=0.55)
+    left_split, right_split = dataset_utils.split_dataset(dataset,
+                                                          left_size=0.45,
+                                                          right_size=0.55)
-    self.assertEqual(len(list(left_split)),4)
-    self.assertEqual(len(list(right_split)),6)
+    self.assertEqual(len(list(left_split)), 4)
+    self.assertEqual(len(list(right_split)), 6)
     for i in range(10):
       if i < 4:
-        self.assertEqual(list(left_split)[i],list(dataset)[i])
+        self.assertEqual(list(left_split)[i], list(dataset)[i])
       else:
-        self.assertEqual(list(right_split)[i-4],list(dataset)[i])
+        self.assertEqual(list(right_split)[i-4], list(dataset)[i])
 
   def test_list_dataset(self):
-    dataset = [np.ones(shape=(10,10,10)) for _ in range(10)]
-    left_split,right_split = dataset_utils.split_dataset(dataset,
-                                                         left_size=5,
-                                                         right_size=5)
+    dataset = [np.ones(shape=(10, 10, 10)) for _ in range(10)]
+    left_split, right_split = dataset_utils.split_dataset(dataset,
+                                                          left_size=5,
+                                                          right_size=5)
     self.assertEqual(len(left_split), len(right_split))
     self.assertIsInstance(left_split, tf.data.Dataset)
     self.assertIsInstance(right_split, tf.data.Dataset)
 
-    dataset = [np.ones(shape=(10,10,10)) for _ in range(10)]
-    left_split,right_split = dataset_utils.split_dataset(dataset,
-                                                         left_size=0.6,
-                                                         right_size=0.4)
+    dataset = [np.ones(shape=(10, 10, 10)) for _ in range(10)]
+    left_split, right_split = dataset_utils.split_dataset(dataset,
+                                                          left_size=0.6,
+                                                          right_size=0.4)
     self.assertEqual(len(left_split), 6)
     self.assertEqual(len(right_split), 4)
 
   def test_invalid_dataset(self):
     with self.assertRaisesRegex(TypeError,
                                 '`dataset` must be either a tf.data.Dataset '
-                                f'object or a list/tuple of arrays. Received '
+                                'object or a list/tuple of arrays. Received '
                                 ': '):
       dataset_utils.split_dataset(dataset=None, left_size=5)
     with self.assertRaisesRegex(TypeError,
                                 '`dataset` must be either a tf.data.Dataset '
-                                f'object or a list/tuple of arrays. Received '
+                                'object or a list/tuple of arrays. Received '
                                 ': '):
       dataset_utils.split_dataset(dataset=1, left_size=5)
     with self.assertRaisesRegex(TypeError,
                                 '`dataset` must be either a tf.data.Dataset '
-                                f'object or a list/tuple of arrays. Received '
+                                'object or a list/tuple of arrays. Received '
                                 ': '):
       dataset_utils.split_dataset(dataset=float(1.2), left_size=5)
     with self.assertRaisesRegex(TypeError,
                                 '`dataset` must be either a tf.data.Dataset '
-                                f'object or a list/tuple of arrays. Received '
+                                'object or a list/tuple of arrays. Received '
                                 ': '):
       dataset_utils.split_dataset(dataset=dict({}), left_size=5)
     with self.assertRaisesRegex(TypeError,
                                 '`dataset` must be either a tf.data.Dataset '
-                                f'object or a list/tuple of arrays. Received '
+                                'object or a list/tuple of arrays. Received '
                                 ': '):
       dataset_utils.split_dataset(dataset=float('INF'), left_size=5)
 
   def test_valid_left_and_right_sizes(self):
-    dataset = np.array([1,2,3])
-    splitted_dataset = dataset_utils.split_dataset(dataset,1,2)
+    dataset = np.array([1, 2, 3])
+    splitted_dataset = dataset_utils.split_dataset(dataset, 1, 2)
     assert(len(splitted_dataset) == 2)
-    left_split,right_split = splitted_dataset
+    left_split, right_split = splitted_dataset
     self.assertEqual(len(left_split), 1)
     self.assertEqual(len(right_split), 2)
     self.assertEqual(list(left_split), [1])
-    self.assertEqual(list(right_split), [2,3])
+    self.assertEqual(list(right_split), [2, 3])
 
-    dataset=np.ones(shape=(200, 32))
-    res = dataset_utils.split_dataset(dataset, left_size=150,right_size=50)
+    dataset = np.ones(shape=(200, 32))
+    res = dataset_utils.split_dataset(dataset, left_size=150, right_size=50)
     self.assertLen(res, 2)
     self.assertIsInstance(res[0], tf.data.Dataset)
     self.assertIsInstance(res[1], tf.data.Dataset)
@@ -324,7 +333,7 @@ def test_valid_left_and_right_sizes(self):
     self.assertLen(res[0], 150)
     self.assertLen(res[1], 50)
 
-    dataset=np.ones(shape=(200, 32))
+    dataset = np.ones(shape=(200, 32))
     res = dataset_utils.split_dataset(dataset, left_size=120)
     self.assertLen(res, 2)
     self.assertIsInstance(res[0], tf.data.Dataset)
@@ -333,7 +342,7 @@ def test_valid_left_and_right_sizes(self):
     self.assertLen(res[0], 120)
     self.assertLen(res[1], 80)
 
-    dataset=np.ones(shape=(10000, 16))
+    dataset = np.ones(shape=(10000, 16))
     res = dataset_utils.split_dataset(dataset, right_size=20)
     self.assertLen(res, 2)
     self.assertIsInstance(res[0], tf.data.Dataset)
@@ -342,34 +351,34 @@ def test_valid_left_and_right_sizes(self):
     self.assertLen(res[0], 9980)
     self.assertLen(res[1], 20)
 
-    dataset = np.array([1,2,3,4,5,6,7,8,9,10])
+    dataset = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
     splitted_dataset = dataset_utils.split_dataset(dataset,
                                                    left_size=0.1,
                                                    right_size=0.9)
     assert(len(splitted_dataset) == 2)
-    left_split,right_split = splitted_dataset
-    self.assertEqual(len(left_split), 1 )
-    self.assertEqual(len(right_split), 9 )
+    left_split, right_split = splitted_dataset
+    self.assertEqual(len(left_split), 1)
+    self.assertEqual(len(right_split), 9)
     self.assertEqual(list(left_split), [1])
-    self.assertEqual(list(right_split), [2,3,4,5,6,7,8,9,10])
+    self.assertEqual(list(right_split), [2, 3, 4, 5, 6, 7, 8, 9, 10])
 
-    dataset = np.array([1,2,3,4,5,6,7,8,9,10])
+    dataset = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
     splitted_dataset = dataset_utils.split_dataset(dataset,
                                                    left_size=2,
                                                    right_size=5)
     assert(len(splitted_dataset) == 2)
-    left_split,right_split = splitted_dataset
-    self.assertEqual(len(left_split), 2 )
-    self.assertEqual(len(right_split), 5 )
+    left_split, right_split = splitted_dataset
+    self.assertEqual(len(left_split), 2)
+    self.assertEqual(len(right_split), 5)
-    self.assertEqual(list(left_split), [1,2])
-    self.assertEqual(list(right_split), [6,7,8,9,10])
+    self.assertEqual(list(left_split), [1, 2])
+    self.assertEqual(list(right_split), [6, 7, 8, 9, 10])
 
   def test_float_left_and_right_sizes(self):
-    X = np.array([[0.1,0.2,0.3],[0.4,0.5,0.6],[0.7,0.8,0.9]])
+    X = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]])
     dataset = tf.data.Dataset.from_tensor_slices(X)
-    left_split,right_split = dataset_utils.split_dataset(dataset,
-                                                         left_size=0.8,
-                                                         right_size=0.2)
+    left_split, right_split = dataset_utils.split_dataset(dataset,
+                                                          left_size=0.8,
+                                                          right_size=0.2)
     self.assertEqual(len(left_split), 2)
     self.assertEqual(len(right_split), 1)
 
@@ -377,14 +386,14 @@ def test_invalid_float_left_and_right_sizes(self):
     expected_regex = (r'^(.*?(\bleft_size\b).*?(\bshould be\b)'
                       r'.*?(\bwithin the range\b).*?(\b0\b).*?(\b1\b))')
     with self.assertRaisesRegexp(ValueError, expected_regex):
-      dataset = [np.ones(shape=(200, 32,32)), np.zeros(shape=(200, 32,32))]
-      dataset_utils.split_dataset(dataset, left_size=1.5,right_size=0.2)
+      dataset = [np.ones(shape=(200, 32, 32)), np.zeros(shape=(200, 32, 32))]
+      dataset_utils.split_dataset(dataset, left_size=1.5, right_size=0.2)
 
     expected_regex = (r'^(.*?(\bright_size\b).*?(\bshould be\b)'
                       r'.*?(\bwithin the range\b).*?(\b0\b).*?(\b1\b))')
     with self.assertRaisesRegex(ValueError, expected_regex):
       dataset = [np.ones(shape=(200, 32)), np.zeros(shape=(200, 32))]
-      dataset_utils.split_dataset(dataset, left_size=0.8,right_size=-0.8)
+      dataset_utils.split_dataset(dataset, left_size=0.8, right_size=-0.8)
 
   def test_None_and_zero_left_and_right_size(self):
     expected_regex = (r'^.*?(\bleft_size\b).*?(\bright_size\b).*?(\bmust '
                       r'be specified\b).*?(\bReceived: left_size=None and'
                       r' right_size=None\b)')
 
     with self.assertRaisesRegex(ValueError, expected_regex):
-      dataset_utils.split_dataset(dataset=np.array([1,2,3]), left_size=None)
+      dataset_utils.split_dataset(dataset=np.array([1, 2, 3]), left_size=None)
     with self.assertRaisesRegex(ValueError, expected_regex):
-      dataset_utils.split_dataset(np.array([1,2,3]),left_size=None,
+      dataset_utils.split_dataset(np.array([1, 2, 3]), left_size=None,
                                   right_size=None)
 
     expected_regex = (r'^.*?(\bleft_size\b).*?(\bshould be\b)'
                       r'.*?(\bpositive\b).*?(\bsmaller than 3\b)')
     with self.assertRaisesRegex(ValueError, expected_regex):
-      dataset_utils.split_dataset(np.array([1,2,3]),left_size=3)
+      dataset_utils.split_dataset(np.array([1, 2, 3]), left_size=3)
 
     expected_regex = ('Both `left_size` and `right_size` are zero. '
-                     'At least one of the split sizes must be non-zero.')
+                      'At least one of the split sizes must be non-zero.')
     with self.assertRaisesRegex(ValueError, expected_regex):
-      dataset_utils.split_dataset(np.array([1,2,3]), left_size=0,
+      dataset_utils.split_dataset(np.array([1, 2, 3]), left_size=0,
                                   right_size=0)
 
   def test_invalid_left_and_right_size_types(self):
     expected_regex = (r'^.*?(\bInvalid `left_size` and `right_size` Types'
                       r'\b).*?(\bExpected: integer or float or None\b)')
     with self.assertRaisesRegex(TypeError, expected_regex):
-      dataset_utils.split_dataset(np.array([1,2,3]), left_size='1',
+      dataset_utils.split_dataset(np.array([1, 2, 3]), left_size='1',
                                   right_size='1')
 
     expected_regex = (r'^.*?(\bInvalid `right_size` Type\b)')
     with self.assertRaisesRegex(TypeError, expected_regex):
-      dataset_utils.split_dataset(np.array([1,2,3]),left_size=0,
+      dataset_utils.split_dataset(np.array([1, 2, 3]), left_size=0,
                                   right_size='1')
 
     expected_regex = (r'^.*?(\bInvalid `left_size` Type\b)')
     with self.assertRaisesRegex(TypeError, expected_regex):
-      dataset_utils.split_dataset(np.array([1,2,3]),left_size='100',
+      dataset_utils.split_dataset(np.array([1, 2, 3]), left_size='100',
                                   right_size=None)
 
     expected_regex = (r'^.*?(\bInvalid `right_size` Type\b)')
     with self.assertRaisesRegex(TypeError, expected_regex):
-      dataset_utils.split_dataset(np.array([1,2,3]),right_size='1')
+      dataset_utils.split_dataset(np.array([1, 2, 3]), right_size='1')
 
     expected_regex = (r'^.*?(\bInvalid `right_size` Type\b)')
     with self.assertRaisesRegex(TypeError, expected_regex):
-      dataset_utils.split_dataset(np.array([1,2,3]),left_size=0.5,
+      dataset_utils.split_dataset(np.array([1, 2, 3]), left_size=0.5,
                                   right_size='1')
 
   def test_mnist_dataset(self):
@@ -442,7 +451,7 @@ def test_mnist_dataset(self):
     assert y_test.shape == (10000,)
 
     dataset = (x_train[:100], y_train[:100])
-    left_split,right_split = dataset_utils.split_dataset(dataset,left_size=0.8)
+    left_split, right_split = dataset_utils.split_dataset(dataset,
+                                                          left_size=0.8)
 
     self.assertIsInstance(left_split, tf.data.Dataset)
     self.assertIsInstance(right_split, tf.data.Dataset)
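For reference, the dict-restoring path that several of the reformatted tests above exercise can be sketched as follows (hypothetical data; the split halves must come back as datasets of dicts with the original keys):

    import numpy as np
    import tensorflow as tf
    from keras.utils import dataset_utils

    # Elements of this dataset are dicts, so _restore_dataset_from_list
    # has to rebuild one {key: values} mapping per split.
    samples = {'images': np.random.rand(10, 16, 16, 3),
               'labels': np.arange(10)}
    dataset = tf.data.Dataset.from_tensor_slices(samples)

    left, right = dataset_utils.split_dataset(dataset, left_size=0.3)

    print(len(list(left)), len(list(right)))  # 3 7
    print(sorted(next(iter(left)).keys()))    # ['images', 'labels']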
From 28466bbce89eb7efb0b0b83026a28e3d0aab4427 Mon Sep 17 00:00:00 2001
From: Prakash Sellathurai
Date: Mon, 2 May 2022 21:43:16 +0530
Subject: [PATCH 21/21] removed `data_size_warning_flag` reference from warning
 message

---
 keras/utils/dataset_utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/keras/utils/dataset_utils.py b/keras/utils/dataset_utils.py
index b39f2a18b4f..c50551408a5 100644
--- a/keras/utils/dataset_utils.py
+++ b/keras/utils/dataset_utils.py
@@ -258,8 +258,7 @@ def _get_next_sample(dataset_iterator,
       warnings.warn('The dataset is taking longer than 10 seconds to '
                     'iterate. This may be due to the size of the dataset. '
                     'Please consider using a smaller dataset'
-                    '(e.g. < 10,000 samples). \nTo hide this '
-                    'warning message, set `data_size_warning_flag=False`.',
+                    ' (e.g. < 10,000 samples).',
                     category=ResourceWarning,
                     source='split_dataset')
       data_size_warning_flag = False
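With the full series applied, the intended end-to-end use looks roughly like this (a sketch assuming the keras_export registration above surfaces the utility as tf.keras.utils.split_dataset in a TensorFlow release that includes it):

    import numpy as np
    import tensorflow as tf

    features = np.random.rand(1000, 8)
    labels = np.random.randint(0, 2, size=(1000,))

    # 90/10 train/validation split; both halves are tf.data.Datasets.
    train_ds, val_ds = tf.keras.utils.split_dataset(
        (features, labels), left_size=0.9, shuffle=True, seed=1337)

    # The usual pipeline methods apply before handing the splits to fit().
    train_ds = train_ds.batch(32).prefetch(tf.data.AUTOTUNE)
    val_ds = val_ds.batch(32)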