From 8bc8e932ee1797901ef665e224d64ae798f16f7c Mon Sep 17 00:00:00 2001 From: Roman Kazantsev Date: Mon, 25 Mar 2024 16:29:00 +0400 Subject: [PATCH] [TF FE] Add testing StringLower and TextVectorization operations on non-ASCII sentences (#23641) **Details:** Add tests with non-ASCII sentences for the StringLower and TextVectorization operations. Needs to be merged after https://github.com/openvinotoolkit/openvino_tokenizers/pull/80. **Ticket:** 135752 --------- Signed-off-by: Kazantsev, Roman --- .../layer_tests/common/utils/common_utils.py | 7 +++++- .../test_tf2_keras_text_vectorization.py | 24 +++++++++++++------ .../tensorflow_tests/test_tf_StringLower.py | 16 ++++++++----- 3 files changed, 33 insertions(+), 14 deletions(-) diff --git a/tests/layer_tests/common/utils/common_utils.py b/tests/layer_tests/common/utils/common_utils.py index ef282443eae996..7bf2d33a0d657d 100644 --- a/tests/layer_tests/common/utils/common_utils.py +++ b/tests/layer_tests/common/utils/common_utils.py @@ -137,7 +137,12 @@ def allclose(cur_array, ref_array, atol, rtol): # so we have to align formats of both string tensors, for example, to unicode if cur_array.dtype.type != ref_array.dtype.type: cur_array = cur_array.astype('U') - ref_array = ref_array.astype('U') + try: + ref_array = ref_array.astype('U') + except: + # ref_array of object type and each element must be utf-8 decoded + utf8_decoded_elems = [elem.decode('UTF-8') for elem in ref_array.flatten()] + ref_array = np.array(utf8_decoded_elems, dtype=str).reshape(ref_array.shape) return np.array_equal(cur_array, ref_array) elif cur_array.dtype == bool: abs_diff = np.absolute(cur_array ^ ref_array) diff --git a/tests/layer_tests/tensorflow2_keras_tests/test_tf2_keras_text_vectorization.py b/tests/layer_tests/tensorflow2_keras_tests/test_tf2_keras_text_vectorization.py index 2b709045cbd8e9..3c6f15c176d334 100644 --- a/tests/layer_tests/tensorflow2_keras_tests/test_tf2_keras_text_vectorization.py +++ 
b/tests/layer_tests/tensorflow2_keras_tests/test_tf2_keras_text_vectorization.py @@ -17,13 +17,13 @@ def _prepare_input(self, inputs_info): assert 'text_input' in inputs_info input_shape = inputs_info['text_input'] inputs_data = {} - strings_dictionary = ['hi OpenVINO here ', ' hello OpenVINO there', 'hello PyTorch here ', - ' hi TensorFlow here', ' hi JAX here \t'] - inputs_data['text_input'] = rng.choice(strings_dictionary, input_shape) + inputs_data['text_input'] = rng.choice(self.strings_dictionary, input_shape) return inputs_data - def create_text_vectorization_net(self, input_shapes, vocabulary, output_mode, output_sequence_length): + def create_text_vectorization_net(self, input_shapes, vocabulary, output_mode, output_sequence_length, + strings_dictionary): assert len(input_shapes) > 0 + self.strings_dictionary = strings_dictionary tf.keras.backend.clear_session() text_input = tf.keras.Input(shape=input_shapes[0][1:], name='text_input', dtype=tf.string) @@ -36,13 +36,22 @@ def create_text_vectorization_net(self, input_shapes, vocabulary, output_mode, o return tf2_net, None @pytest.mark.parametrize('input_shapes', [[[1, 1]], [[3, 1]]]) - @pytest.mark.parametrize('vocabulary', [['hello', 'there', 'OpenVINO', 'check']]) + @pytest.mark.parametrize('strings_dictionary', + [['hi OpenVINO here ', ' hello OpenVINO there', 'hello PyTorch here ', + ' hi TensorFlow here', ' hi JAX here \t'], + ['привет ОПЕНВИНО здесь ', ' привет ОпенВИНО там', 'привет Пайторч здесь ', + ' привет ТензорФлоу здесь', ' привет ДЖАКС там \t'], + ['這裡你好 OpenVINO ', '你好 OpenVINO 那裡', '你好這裡 PyTorch ', + ' 這裡是 TensorFlow', ' 這裡是 JAX \t'] + ]) + @pytest.mark.parametrize('vocabulary', [['hello', 'there', 'OpenVINO', 'check', 'привет', + 'ОПЕНВИНО', 'здесь', 'там', '你好', '那裡', '檢查']]) @pytest.mark.parametrize('output_mode', ['int']) @pytest.mark.parametrize('output_sequence_length', [32, 64]) @pytest.mark.precommit_tf_fe @pytest.mark.nightly - def test_text_vectorization(self, input_shapes, 
vocabulary, output_mode, output_sequence_length, ie_device, - precision, ir_version, temp_dir, use_legacy_frontend): + def test_text_vectorization(self, input_shapes, vocabulary, output_mode, output_sequence_length, strings_dictionary, + ie_device, precision, ir_version, temp_dir, use_legacy_frontend): if platform.system() in ('Darwin') or platform.machine() in ['arm', 'armv7l', 'aarch64', 'arm64', @@ -53,5 +62,6 @@ def test_text_vectorization(self, input_shapes, vocabulary, output_mode, output_ params['vocabulary'] = vocabulary params['output_mode'] = output_mode params['output_sequence_length'] = output_sequence_length + params['strings_dictionary'] = strings_dictionary self._test(*self.create_text_vectorization_net(**params), ie_device, precision, temp_dir=temp_dir, ir_version=ir_version, use_legacy_frontend=use_legacy_frontend, **params) diff --git a/tests/layer_tests/tensorflow_tests/test_tf_StringLower.py b/tests/layer_tests/tensorflow_tests/test_tf_StringLower.py index 25581612784820..9bcee9c86a9524 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_StringLower.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_StringLower.py @@ -16,14 +16,13 @@ def _prepare_input(self, inputs_info): assert 'input:0' in inputs_info input_shape = inputs_info['input:0'] inputs_data = {} - # TODO: add non ASCII symbols, fix comparator for output string tensors - strings_dictionary = ['UPPER CASE SENTENCE', 'lower case sentence', ' UppEr LoweR CAse SENtence', ' '] - sample_data = rng.choice(strings_dictionary, input_shape) + sample_data = rng.choice(self.strings_dictionary, input_shape) inputs_data['input:0'] = sample_data return inputs_data - def create_string_lower_net(self, input_shape, encoding): + def create_string_lower_net(self, input_shape, encoding, strings_dictionary): self.encoding = encoding + self.strings_dictionary = strings_dictionary tf.compat.v1.reset_default_graph() with tf.compat.v1.Session() as sess: @@ -39,14 +38,19 @@ def create_string_lower_net(self, 
input_shape, encoding): @pytest.mark.parametrize("encoding", [None, '', 'utf-8']) @pytest.mark.parametrize("input_shape", [[], [2], [3, 4], [1, 3, 2]]) + @pytest.mark.parametrize("strings_dictionary", + [['UPPER CASE SENTENCE', 'lower case sentence', ' UppEr LoweR CAse SENtence', ' '], + ['Первое Предложение', 'второе предложение', ' ', ' ТРЕТЬЕ ПРЕДЛОЖЕНИЕ '], + ['第一句話在這裡', '第二句話在這裡', '第三句話在這裡']]) @pytest.mark.precommit_tf_fe @pytest.mark.nightly @pytest.mark.xfail(condition=platform.system() in ('Darwin', 'Linux') and platform.machine() in ['arm', 'armv7l', 'aarch64', 'arm64', 'ARM64'], reason='Ticket - 126314, 132699') - def test_string_lower(self, input_shape, encoding, ie_device, precision, ir_version, temp_dir, + def test_string_lower(self, input_shape, encoding, strings_dictionary, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): - self._test(*self.create_string_lower_net(input_shape=input_shape, encoding=encoding), + self._test(*self.create_string_lower_net(input_shape=input_shape, encoding=encoding, + strings_dictionary=strings_dictionary), ie_device, precision, ir_version, temp_dir=temp_dir, use_legacy_frontend=use_legacy_frontend)