From 8bc8e932ee1797901ef665e224d64ae798f16f7c Mon Sep 17 00:00:00 2001
From: Roman Kazantsev <roman.kazantsev@intel.com>
Date: Mon, 25 Mar 2024 16:29:00 +0400
Subject: [PATCH] [TF FE] Add testing StringLower and TextVectorization
 operations on non-ASCII sentences (#23641)

**Details:** Add tests with non-ASCII sentences for the StringLower and
TextVectorization operations. Needs to be merged after
https://github.com/openvinotoolkit/openvino_tokenizers/pull/80.

**Ticket:** 135752

---------

Signed-off-by: Kazantsev, Roman <roman.kazantsev@intel.com>
---
 .../layer_tests/common/utils/common_utils.py  |  7 +++++-
 .../test_tf2_keras_text_vectorization.py      | 24 +++++++++++++------
 .../tensorflow_tests/test_tf_StringLower.py   | 16 ++++++++-----
 3 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/tests/layer_tests/common/utils/common_utils.py b/tests/layer_tests/common/utils/common_utils.py
index ef282443eae996..7bf2d33a0d657d 100644
--- a/tests/layer_tests/common/utils/common_utils.py
+++ b/tests/layer_tests/common/utils/common_utils.py
@@ -137,7 +137,12 @@ def allclose(cur_array, ref_array, atol, rtol):
         # so we have to align formats of both string tensors, for example, to unicode
         if cur_array.dtype.type != ref_array.dtype.type:
             cur_array = cur_array.astype('U')
-            ref_array = ref_array.astype('U')
+            try:
+                ref_array = ref_array.astype('U')
+            except:
+                # ref_array of object type and each element must be utf-8 decoded
+                utf8_decoded_elems = [elem.decode('UTF-8') for elem in ref_array.flatten()]
+                ref_array = np.array(utf8_decoded_elems, dtype=str).reshape(ref_array.shape)
         return np.array_equal(cur_array, ref_array)
     elif cur_array.dtype == bool:
         abs_diff = np.absolute(cur_array ^ ref_array)
diff --git a/tests/layer_tests/tensorflow2_keras_tests/test_tf2_keras_text_vectorization.py b/tests/layer_tests/tensorflow2_keras_tests/test_tf2_keras_text_vectorization.py
index 2b709045cbd8e9..3c6f15c176d334 100644
--- a/tests/layer_tests/tensorflow2_keras_tests/test_tf2_keras_text_vectorization.py
+++ b/tests/layer_tests/tensorflow2_keras_tests/test_tf2_keras_text_vectorization.py
@@ -17,13 +17,13 @@ def _prepare_input(self, inputs_info):
         assert 'text_input' in inputs_info
         input_shape = inputs_info['text_input']
         inputs_data = {}
-        strings_dictionary = ['hi OpenVINO here  ', '  hello OpenVINO there', 'hello PyTorch here  ',
-                              '  hi TensorFlow here', '  hi JAX here \t']
-        inputs_data['text_input'] = rng.choice(strings_dictionary, input_shape)
+        inputs_data['text_input'] = rng.choice(self.strings_dictionary, input_shape)
         return inputs_data
 
-    def create_text_vectorization_net(self, input_shapes, vocabulary, output_mode, output_sequence_length):
+    def create_text_vectorization_net(self, input_shapes, vocabulary, output_mode, output_sequence_length,
+                                      strings_dictionary):
         assert len(input_shapes) > 0
+        self.strings_dictionary = strings_dictionary
         tf.keras.backend.clear_session()
         text_input = tf.keras.Input(shape=input_shapes[0][1:], name='text_input',
                                     dtype=tf.string)
@@ -36,13 +36,22 @@ def create_text_vectorization_net(self, input_shapes, vocabulary, output_mode, o
         return tf2_net, None
 
     @pytest.mark.parametrize('input_shapes', [[[1, 1]], [[3, 1]]])
-    @pytest.mark.parametrize('vocabulary', [['hello', 'there', 'OpenVINO', 'check']])
+    @pytest.mark.parametrize('strings_dictionary',
+                             [['hi OpenVINO here  ', '  hello OpenVINO there', 'hello PyTorch here  ',
+                               '  hi TensorFlow here', '  hi JAX here \t'],
+                              ['привет ОПЕНВИНО здесь  ', '  привет ОпенВИНО там', 'привет Пайторч здесь  ',
+                               '  привет ТензорФлоу здесь', '  привет ДЖАКС там \t'],
+                              ['這裡你好 OpenVINO ', '你好 OpenVINO 那裡', '你好這裡 PyTorch ',
+                               ' 這裡是 TensorFlow', ' 這裡是 JAX \t']
+                              ])
+    @pytest.mark.parametrize('vocabulary', [['hello', 'there', 'OpenVINO', 'check', 'привет',
+                                             'ОПЕНВИНО', 'здесь', 'там', '你好', '那裡', '檢查']])
     @pytest.mark.parametrize('output_mode', ['int'])
     @pytest.mark.parametrize('output_sequence_length', [32, 64])
     @pytest.mark.precommit_tf_fe
     @pytest.mark.nightly
-    def test_text_vectorization(self, input_shapes, vocabulary, output_mode, output_sequence_length, ie_device,
-                                precision, ir_version, temp_dir, use_legacy_frontend):
+    def test_text_vectorization(self, input_shapes, vocabulary, output_mode, output_sequence_length, strings_dictionary,
+                                ie_device, precision, ir_version, temp_dir, use_legacy_frontend):
         if platform.system() in ('Darwin') or platform.machine() in ['arm', 'armv7l',
                                                                      'aarch64',
                                                                      'arm64',
@@ -53,5 +62,6 @@ def test_text_vectorization(self, input_shapes, vocabulary, output_mode, output_
         params['vocabulary'] = vocabulary
         params['output_mode'] = output_mode
         params['output_sequence_length'] = output_sequence_length
+        params['strings_dictionary'] = strings_dictionary
         self._test(*self.create_text_vectorization_net(**params), ie_device, precision,
                    temp_dir=temp_dir, ir_version=ir_version, use_legacy_frontend=use_legacy_frontend, **params)
diff --git a/tests/layer_tests/tensorflow_tests/test_tf_StringLower.py b/tests/layer_tests/tensorflow_tests/test_tf_StringLower.py
index 25581612784820..9bcee9c86a9524 100644
--- a/tests/layer_tests/tensorflow_tests/test_tf_StringLower.py
+++ b/tests/layer_tests/tensorflow_tests/test_tf_StringLower.py
@@ -16,14 +16,13 @@ def _prepare_input(self, inputs_info):
         assert 'input:0' in inputs_info
         input_shape = inputs_info['input:0']
         inputs_data = {}
-        # TODO: add non ASCII symbols, fix comparator for output string tensors 
-        strings_dictionary = ['UPPER CASE SENTENCE', 'lower case sentence', ' UppEr LoweR CAse SENtence', ' ']
-        sample_data = rng.choice(strings_dictionary, input_shape)
+        sample_data = rng.choice(self.strings_dictionary, input_shape)
         inputs_data['input:0'] = sample_data
         return inputs_data
 
-    def create_string_lower_net(self, input_shape, encoding):
+    def create_string_lower_net(self, input_shape, encoding, strings_dictionary):
         self.encoding = encoding
+        self.strings_dictionary = strings_dictionary
 
         tf.compat.v1.reset_default_graph()
         with tf.compat.v1.Session() as sess:
@@ -39,14 +38,19 @@ def create_string_lower_net(self, input_shape, encoding):
 
     @pytest.mark.parametrize("encoding", [None, '', 'utf-8'])
     @pytest.mark.parametrize("input_shape", [[], [2], [3, 4], [1, 3, 2]])
+    @pytest.mark.parametrize("strings_dictionary",
+                             [['UPPER CASE SENTENCE', 'lower case sentence', ' UppEr LoweR CAse SENtence', ' '],
+                              ['Первое Предложение', 'второе    предложение', ' ', ' ТРЕТЬЕ ПРЕДЛОЖЕНИЕ '],
+                              ['第一句話在這裡', '第二句話在這裡', '第三句話在這裡']])
     @pytest.mark.precommit_tf_fe
     @pytest.mark.nightly
     @pytest.mark.xfail(condition=platform.system() in ('Darwin', 'Linux') and platform.machine() in ['arm', 'armv7l',
                                                                                                      'aarch64',
                                                                                                      'arm64', 'ARM64'],
                        reason='Ticket - 126314, 132699')
-    def test_string_lower(self, input_shape, encoding, ie_device, precision, ir_version, temp_dir,
+    def test_string_lower(self, input_shape, encoding, strings_dictionary, ie_device, precision, ir_version, temp_dir,
                           use_legacy_frontend):
-        self._test(*self.create_string_lower_net(input_shape=input_shape, encoding=encoding),
+        self._test(*self.create_string_lower_net(input_shape=input_shape, encoding=encoding,
+                                                 strings_dictionary=strings_dictionary),
                    ie_device, precision, ir_version, temp_dir=temp_dir,
                    use_legacy_frontend=use_legacy_frontend)