From 1262ea2396a3582908d407247247429ffe1c4d92 Mon Sep 17 00:00:00 2001
From: lithium03
Date: Sun, 6 Aug 2023 20:50:46 +0900
Subject: [PATCH] update

---
 dataset/__init__.py         |   1 -
 dataset/data_detector.py    |   3 +-
 dataset/data_transformer.py | 232 +++++++++++++++++++++++-------------
 3 files changed, 150 insertions(+), 86 deletions(-)

diff --git a/dataset/__init__.py b/dataset/__init__.py
index ba1d462..e69de29 100644
--- a/dataset/__init__.py
+++ b/dataset/__init__.py
@@ -1 +0,0 @@
-from dataset.data import FontData, BaseData, scale
diff --git a/dataset/data_detector.py b/dataset/data_detector.py
index a96824f..fb578d5 100644
--- a/dataset/data_detector.py
+++ b/dataset/data_detector.py
@@ -764,7 +764,7 @@ def background_image(self):
         fg_c = tf.where(
             bk_c > 0.5,
             tf.random.uniform([3], tf.clip_by_value(bk_c - bk_std * 2 - min_delta, -float('inf'), -1), bk_c - bk_std * 2 - min_delta),
-            tf.random.uniform([3], tf.clip_by_value(bk_c + bk_std * 2 + min_delta, 1, -float('inf')), bk_c + bk_std * 2 + min_delta))
+            tf.random.uniform([3], bk_c + bk_std * 2 + min_delta, tf.clip_by_value(bk_c + bk_std * 2 + min_delta, 1, float('inf'))))
         bk_alpha = tf.maximum(tf.reduce_max(tf.abs(fg_c)), 1)
         bkimg /= bk_alpha
         fg_c /= bk_alpha
@@ -832,7 +832,6 @@ def create_dataset(self, batch_size, filelist, shuffle=False):
         fs = tf.data.Dataset.from_tensor_slices(filelist)
         if shuffle:
             fs = fs.shuffle(len(filelist), reshuffle_each_iteration=True)
-        fs = fs.repeat()
         ds = tf.data.TFRecordDataset(filenames=fs, num_parallel_reads=tf.data.AUTOTUNE)
         if shuffle:
             ds.shuffle(1000, reshuffle_each_iteration=True)
diff --git a/dataset/data_transformer.py b/dataset/data_transformer.py
index c80cc1a..e5f66c8 100644
--- a/dataset/data_transformer.py
+++ b/dataset/data_transformer.py
@@ -1,117 +1,183 @@
 import tensorflow as tf
+import numpy as np
 from net.const import feature_dim
 from const import encoder_add_dim, max_decoderlen, max_encoderlen, decoder_SOT, decoder_EOT
-from const import samples_per_file

 encoder_dim = feature_dim + encoder_add_dim
 tfdata_path = 'train_data2'

-def deserialize_composite(serialized, type_spec):
-    serialized = tf.io.parse_tensor(serialized, tf.string)
-    component_specs = tf.nest.flatten(type_spec, expand_composites=True)
-    components = [
-        tf.io.parse_tensor(serialized[i], spec.dtype)
-        for i, spec in enumerate(component_specs)
-    ]
-    return tf.nest.pack_sequence_as(type_spec, components, expand_composites=True)
+npz_file = np.load('charparam.npz')
+features = []
+feature_idx = []
+idx = 0
+features.append(np.zeros([1,feature_dim], np.float32))
+feature_idx.append([0,0,0,1])
+idx = 1
+for varname in npz_file.files:
+    features.append(npz_file[varname])
+    feature_idx.append([int(varname[:-1]),0 if varname[-1] == 'n' else 1,idx,idx+npz_file[varname].shape[0]])
+    idx += npz_file[varname].shape[0]
+rng = np.random.default_rng()
+del npz_file
+
+with tf.device('cpu'):
+    features = tf.concat(features, axis=0)
+    feature_idx = tf.constant(feature_idx, tf.int64)

 def parse(serialized):
     return tf.io.parse_example(serialized, features={
-        "strcode": tf.io.FixedLenFeature([], dtype=tf.string),
-        "features": tf.io.FixedLenFeature([], dtype=tf.string),
-        "length": tf.io.FixedLenFeature([], dtype=tf.int64),
+        "str": tf.io.FixedLenFeature([], dtype=tf.string),
+        "code": tf.io.FixedLenFeature([], dtype=tf.string),
+        "codelen": tf.io.FixedLenFeature([], dtype=tf.int64),
+        "strlen": tf.io.FixedLenFeature([], dtype=tf.int64),
     })

-def deserialize_data(data):
-    rt_spec1 = tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int32, ragged_rank=1, row_splits_dtype=tf.int32)
-    rt_spec2 = tf.RaggedTensorSpec(shape=[None, None, encoder_dim], dtype=tf.float32, ragged_rank=1)
-    deserialized1 = deserialize_composite(data['strcode'], rt_spec1)
-    deserialized2 = deserialize_composite(data['features'], rt_spec2)
-    return {
-        'strcode': deserialized1,
-        'features': deserialized2,
-        'length': tf.cast(data['length'], tf.int32),
-    }
-
-def trim_data(data):
-    want_declen = tf.random.uniform([], 1, max_decoderlen - 1, dtype=tf.int32)
-    strcode = data['strcode']
-    features = data['features']
-    f = tf.zeros([0,encoder_dim])
-    s = tf.zeros([0,], dtype=tf.int32)
-    f1 = f
-    s1 = s
-    if data['length'] > 2:
-        st = tf.random.uniform([], 0, data['length'] - 1, dtype=tf.int32)
+def generate_feature(code_vec):
+    @tf.function
+    def subfun_n(v):
+        sample = tf.zeros([feature_dim])
+        v0 = tf.cast(v[0], tf.int64)
+        if v0 > 0:
+            idx = tf.where(tf.logical_and(feature_idx[:,0] == v0,feature_idx[:,1] == 0))
+            if tf.size(idx) > 0:
+                idx = tf.squeeze(idx)
+                st = feature_idx[idx,2]
+                ed = feature_idx[idx,3]
+            else:
+                st = tf.constant(0, tf.int64)
+                ed = tf.constant(1, tf.int64)
+            index = tf.random.uniform([], minval=st, maxval=ed, dtype=tf.int64)
+            sample = features[index,:]
+        return tf.concat([sample, tf.cast(v[1:], tf.float32)], axis=0)
+
+    @tf.function
+    def subfun_t(v):
+        sample = tf.zeros([feature_dim])
+        v0 = tf.cast(v[0], tf.int64)
+        if v0 > 0:
+            idx = tf.where(tf.logical_and(feature_idx[:,0] == v0,feature_idx[:,1] == 1))
+            if tf.size(idx) > 0:
+                idx = tf.squeeze(idx)
+                st = feature_idx[idx,2]
+                ed = feature_idx[idx,3]
+            else:
+                idx2 = tf.where(tf.logical_and(feature_idx[:,0] == v0,feature_idx[:,1] == 0))
+                if tf.size(idx2) > 0:
+                    idx2 = tf.squeeze(idx2)
+                    st = feature_idx[idx2,2]
+                    ed = feature_idx[idx2,3]
+                else:
+                    st = tf.constant(0, tf.int64)
+                    ed = tf.constant(1, tf.int64)
+            index = tf.random.uniform([], minval=st, maxval=ed, dtype=tf.int64)
+            sample = features[index,:]
+        return tf.concat([sample, tf.cast(v[1:], tf.float32)], axis=0)
+
+    if tf.random.uniform([]) < 0.25:
+        return tf.map_fn(subfun_t, code_vec, fn_output_signature=tf.float32)
     else:
-        st = 0
-    for i in tf.range(st, data['length']):
-        tf.autograph.experimental.set_loop_options(
-            shape_invariants=[
-                (f, tf.TensorShape([None,encoder_dim])),
-                (f1, tf.TensorShape([None,encoder_dim])),
-                (s, tf.TensorShape([None,])),
-                (s1, tf.TensorShape([None,])),
-            ]
-        )
-        s1 = tf.concat([s, strcode[i]], axis=0)
-        f1 = tf.concat([f, features[i]], axis=0)
-        if tf.shape(s1)[0] < max_decoderlen - 1 and tf.shape(f1)[0] < max_encoderlen:
-            s = s1
-            f = f1
-            if tf.shape(s)[0] >= want_declen:
-                break
+        return tf.map_fn(subfun_n, code_vec, fn_output_signature=tf.float32)
+
+def process_data(data):
+    batch = 8
+    str_data = data['str']
+    code = data['code']
+    strlen_data = data['strlen']
+    codelen_data = data['codelen']
+    max_len = tf.random.uniform([], 1, max_encoderlen, dtype=tf.int64)
+    pad_ln = tf.random.uniform([batch])
+
+    result_strlen = tf.constant(0, dtype=tf.int64)
+    result_codelen = tf.constant(0, dtype=tf.int64)
+    j = tf.constant(0, tf.int64)
+    while j < batch:
+        if result_strlen + strlen_data[j] < max_decoderlen - 2 and result_codelen < max_len and result_codelen + codelen_data[j] < max_encoderlen:
+            result_strlen += strlen_data[j]
+            result_codelen += codelen_data[j]
+            if pad_ln[j] < 0.1:
+                result_strlen += 1
+                result_codelen += 1
         else:
             break
-    return {
-        'strcode': s,
-        'features': f,
-    }
-
-def encode(data):
-    strcode = data['strcode']
-    features = data['features']
-
-    decoder_len = tf.shape(strcode)[0]
-    encoder_len = tf.shape(features)[0]
-
-    true_str = tf.pad(strcode, [[0, max_decoderlen - decoder_len]])
-
-    strcode = tf.concat([
+        j += 1
+    if j == 0:
+        j = tf.constant(1, tf.int64)
+
+    def loop1(result,i):
+        result = tf.concat([result, tf.io.parse_tensor(code[i], tf.int32)], axis=0)
+        if pad_ln[i] < 0.1:
+            result = tf.concat([result, tf.constant([[0,0,0,0,1]], tf.int32)], axis=0)
+        return result, i+1
+
+    result_code,_ = tf.while_loop(lambda r,i: i < j, loop1,
+                                  loop_vars=[tf.zeros([0,5], tf.int32), tf.constant(0,tf.int64)],
+                                  shape_invariants=[tf.TensorShape([None,5]),tf.TensorShape([])])
+
+    def loop2(result,i):
+        result = tf.strings.join([result, str_data[i]])
+        if pad_ln[i] < 0.1:
+            result = tf.strings.join([result, tf.constant("\n")])
+        return result, i+1
+
+    result_str,_ = tf.while_loop(lambda r,i: i < j, loop2, loop_vars=[tf.constant(""), tf.constant(0,tf.int64)])
+
+    if tf.random.uniform([]) < 0.5:
+        result_str = tf.strings.substr(result_str, 0, tf.strings.length(result_str)-1)
+        result_code = result_code[:-1,:]
+
+    if tf.shape(result_code, out_type=tf.int64)[0] > max_encoderlen:
+        return {
+            'text': tf.constant(""),
+            'decoder_true': tf.zeros([max_decoderlen], dtype=tf.int32),
+            'decoder_task': tf.zeros([max_decoderlen], dtype=tf.int32),
+            'encoder_inputs': tf.zeros([max_encoderlen, encoder_dim]),
+        }
+
+    decoder_code = tf.strings.unicode_decode(result_str, input_encoding='UTF-8')
+    if tf.shape(decoder_code, out_type=tf.int64)[0] > max_decoderlen - 2:
+        return {
+            'text': tf.constant(""),
+            'decoder_true': tf.zeros([max_decoderlen], dtype=tf.int32),
+            'decoder_task': tf.zeros([max_decoderlen], dtype=tf.int32),
+            'encoder_inputs': tf.zeros([max_encoderlen, encoder_dim]),
+        }
+
+    decoder_code = tf.concat([
         tf.cast([decoder_SOT], dtype=tf.int32),
-        strcode,
+        decoder_code,
         tf.cast([decoder_EOT], dtype=tf.int32),
     ], axis=0)
+    encoder_input = generate_feature(result_code)
+    decoder_len = tf.shape(decoder_code)[0]
+    encoder_len = tf.shape(encoder_input)[0]
+    decoder_true = decoder_code[1:]
+    decoder_task = decoder_code[:-1]
+    decoder_len = tf.shape(decoder_true)[0]

-    decoder_true = strcode[1:]
-    decoder_task = strcode[:-1]
     decoder_true = tf.pad(decoder_true, [[0, max_decoderlen - decoder_len]])
     decoder_task = tf.pad(decoder_task, [[0, max_decoderlen - decoder_len]])
-
-    encoder_inputs = tf.pad(features,[[0, max_encoderlen - encoder_len], [0, 0]])
+    encoder_inputs = tf.pad(encoder_input, [[0, max_encoderlen - encoder_len], [0, 0]])

     return {
-        'text': true_str,
+        'text': result_str,
         'decoder_true': decoder_true,
         'decoder_task': decoder_task,
         'encoder_inputs': encoder_inputs,
     }

+
 def create_dataset(batch_size, filelist):
-    files = tf.data.Dataset.from_tensor_slices(filelist)
-    files = files.shuffle(len(filelist))
-    ds = files.interleave(lambda x: tf.data.TFRecordDataset(x, 'ZLIB'),
-                          num_parallel_calls=tf.data.AUTOTUNE,
-                          deterministic=False)
-    ds = ds.apply(tf.data.experimental.assert_cardinality(len(filelist)*samples_per_file))
-    ds = ds.shuffle(1000)
+    fs = tf.data.Dataset.from_tensor_slices(filelist)
+    fs = fs.shuffle(len(filelist), reshuffle_each_iteration=True)
+    ds = tf.data.TFRecordDataset(filenames=fs)
     ds = ds.map(parse, num_parallel_calls=tf.data.AUTOTUNE, deterministic=False)
-    ds = ds.map(deserialize_data, num_parallel_calls=tf.data.AUTOTUNE, deterministic=False)
-    ds = ds.map(trim_data, num_parallel_calls=tf.data.AUTOTUNE, deterministic=False)
-    ds = ds.map(encode, num_parallel_calls=tf.data.AUTOTUNE, deterministic=False)
+    ds = ds.batch(8, drop_remainder=True)
+    ds = ds.map(process_data, num_parallel_calls=tf.data.AUTOTUNE, deterministic=False)
+    ds = ds.shuffle(10000)
+    ds = ds.repeat()
     ds = ds.batch(batch_size, drop_remainder=True, num_parallel_calls=tf.data.AUTOTUNE, deterministic=False)
     ds = ds.prefetch(tf.data.AUTOTUNE)
     return ds
@@ -130,5 +196,5 @@ def generate_data(data_path=''):
     return create_dataset(1, test_files)

 if __name__=='__main__':
-    for d in test_data(4):
-        print(d)
+    for d in generate_data().take(10):
+        print([b.decode() for b in d['text'].numpy()])
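
Notes on the changes above. The sketches that follow are illustrations, not
part of the diff; any name, value, or file layout marked hypothetical is an
assumption, not taken from the patch.

data_detector.py: the removed line clipped with tf.clip_by_value(x, 1,
-float('inf')), whose lower bound exceeds its upper bound (the clip collapses
to -inf), and then used that result as the minimum of tf.random.uniform. The
replacement samples between the raw bound and a value clipped to at least 1,
so the range is always valid. A minimal standalone illustration, with
hypothetical values for bk_c, bk_std and min_delta:

    import tensorflow as tf

    bk_c = tf.constant([0.2, 0.3, 0.1])       # hypothetical dark background mean
    bk_std = tf.constant([0.05, 0.05, 0.05])  # hypothetical per-channel std
    min_delta = 0.3

    lo = bk_c + bk_std * 2 + min_delta          # foreground at least 2 sigma above background
    hi = tf.clip_by_value(lo, 1, float('inf'))  # upper bound is at least 1, so lo <= hi
    fg_c = tf.random.uniform([3], lo, hi)       # valid range; the removed line produced an invalid one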
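
data_transformer.py now builds, at import time, a flat feature matrix plus an
index table from charparam.npz: each feature_idx row holds [character code,
variant flag, start, end), where rows start..end-1 of features are the stored
samples for that character (a key suffix of 'n' maps to flag 0, any other
suffix to flag 1), and row 0 is a zero-vector fallback. generate_feature then
draws one random row from the matching range for every input code. A toy
eager-mode sketch of that range lookup, with a made-up two-character table
and a feature_dim of 1:

    import tensorflow as tf

    features = tf.constant([[0.0], [1.0], [1.1], [2.0]])  # row 0 = fallback zero vector
    feature_idx = tf.constant([[ 0, 0, 0, 1],   # code 0       -> row 0
                               [65, 0, 1, 3],   # code 65, 'n' -> rows 1..2
                               [66, 0, 3, 4]],  # code 66, 'n' -> row 3
                              tf.int64)

    def sample_feature(code):
        hit = tf.where(tf.logical_and(feature_idx[:, 0] == code, feature_idx[:, 1] == 0))
        if tf.size(hit) > 0:
            row = tf.squeeze(hit)
            st, ed = feature_idx[row, 2], feature_idx[row, 3]
        else:
            st, ed = tf.constant(0, tf.int64), tf.constant(1, tf.int64)  # fallback row
        return features[tf.random.uniform([], st, ed, dtype=tf.int64)]

    print(sample_feature(tf.constant(65, tf.int64)))  # randomly [1.0] or [1.1]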
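
The reworked input pipeline pre-batches 8 raw records, packs them into a
single training sample in process_data (packs that would exceed max_encoderlen
or max_decoderlen come back as empty sentinels), and only then shuffles,
repeats, and batches to batch_size. A minimal consumption sketch; the
*.tfrecords glob pattern is an assumption, and note that importing the module
loads charparam.npz from the current directory:

    import glob
    from dataset.data_transformer import create_dataset, tfdata_path

    files = glob.glob(f'{tfdata_path}/*.tfrecords')  # hypothetical file layout
    ds = create_dataset(4, files)
    for batch in ds.take(1):
        print([t.decode() for t in batch['text'].numpy()])  # packed text; '' marks a dropped pack
        print(batch['decoder_task'].shape, batch['encoder_inputs'].shape)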