def _rename_outputs_and_save(mlmodel, package_path, new_names, expected_names=None):
    """Rename the outputs of a converted Core ML model and save it.

    Args:
        mlmodel: model returned by ``ct.convert``.
        package_path: destination ``.mlpackage`` path.
        new_names: names to give to the outputs.
        expected_names: current output names to rename, in the same order as
            ``new_names``.  When omitted, the first ``len(new_names)`` outputs
            declared in the model spec are renamed positionally.

    Raises:
        ValueError: if an expected output is missing or the model has fewer
            outputs than ``new_names``.
    """
    spec = mlmodel.get_spec()
    available = [out.name for out in spec.description.output]

    if expected_names is None:
        if len(available) < len(new_names):
            raise ValueError('converted model has outputs %s, cannot assign names %s'
                             % (available, list(new_names)))
        expected_names = available[:len(new_names)]
    else:
        # NOTE(review): rename_feature does not appear to report an unknown
        # feature name, so a changed auto-generated name would otherwise
        # produce a package whose outputs were never renamed.
        missing = [name for name in expected_names if name not in available]
        if missing:
            raise ValueError('outputs %s not found in converted model (available: %s)'
                             % (missing, available))

    for current_name, new_name in zip(expected_names, new_names):
        ct.utils.rename_feature(spec, current_name, new_name)

    # A model has to be rebuilt from the edited spec; the weights are reused.
    renamed = ct.models.MLModel(spec, weights_dir=mlmodel.weights_dir)
    renamed.save(package_path)


def convert_encoder(model):
    """Convert ``model.transformer.encoder`` and save ``TransformerEncoder.mlpackage``.

    The converted model takes ``encoder_input`` of shape
    ``(1, max_encoderlen, encoder_dim)`` and its first output is renamed to
    ``encoder_output``.
    """
    print('encoder')

    embedded = tf.keras.Input(shape=(max_encoderlen, encoder_dim), name='encoder_input')
    encoder_output = model.transformer.encoder(embedded)
    transformer_encoder = tf.keras.Model(embedded, encoder_output, name='TransformerEncoder')

    mlmodel = ct.convert(
        transformer_encoder,
        convert_to="mlprogram",
        inputs=[
            ct.TensorType(name='encoder_input',
                          shape=ct.Shape(shape=(1, max_encoderlen, encoder_dim))),
        ],
        compute_units=ct.ComputeUnit.CPU_AND_NE,
        minimum_deployment_target=ct.target.iOS16)
    # The conversion timestamp doubles as the model version.
    mlmodel.version = datetime.now().strftime("%Y%m%d%H%M%S")

    _rename_outputs_and_save(mlmodel, "TransformerEncoder.mlpackage", ['encoder_output'])


def convert_decoder(model):
    """Convert ``model.transformer.decoder`` and save ``TransformerDecoder.mlpackage``.

    The converted model takes ``decoder_input``, ``encoder_output`` and
    ``encoder_input`` (batch size 1) and exposes the three decoder heads as
    ``mod1091``, ``mod1093`` and ``mod1097`` with the batch axis removed.

    Raises:
        ValueError: if the converted model does not expose the outputs
            ``Identity``, ``Identity_1`` and ``Identity_2``.
    """
    decoder_input = tf.keras.Input(shape=(max_decoderlen,), name='decoder_input')
    encoder_output = tf.keras.Input(shape=(max_encoderlen, hidden_dim), name='encoder_output')
    encoder_input = tf.keras.Input(shape=(max_encoderlen, encoder_dim), name='encoder_input')

    class Decoder(tf.keras.models.Model):
        """Wrap the decoder so that each head is returned without its batch axis."""

        def __init__(
            self,
            decoder,
            *args,
            **kwargs,
        ):
            super().__init__(*args, **kwargs)
            self.decoder = decoder

        def call(self, inputs):
            decoder_input, encoder_output, encoder_input = inputs
            decoder_output = self.decoder([decoder_input, encoder_output, encoder_input])

            out1091, out1093, out1097 = decoder_output
            # Keep only the first (and only) batch entry of every head.
            p1091 = out1091[0, :, :]
            p1093 = out1093[0, :, :]
            p1097 = out1097[0, :, :]
            return p1091, p1093, p1097

    decoder = Decoder(model.transformer.decoder)
    inputs = [decoder_input, encoder_output, encoder_input]

    transformer_decoder = tf.keras.Model(inputs, decoder(inputs), name='TransformerDecoder')

    mlmodel = ct.convert(
        transformer_decoder,
        convert_to="mlprogram",
        inputs=[
            ct.TensorType(name='decoder_input',
                          shape=ct.Shape(shape=(1, max_decoderlen))),
            ct.TensorType(name='encoder_output',
                          shape=ct.Shape(shape=(1, max_encoderlen, hidden_dim))),
            ct.TensorType(name='encoder_input',
                          shape=ct.Shape(shape=(1, max_encoderlen, encoder_dim))),
        ],
        compute_units=ct.ComputeUnit.CPU_AND_NE,
        minimum_deployment_target=ct.target.iOS16)
    mlmodel.version = datetime.now().strftime("%Y%m%d%H%M%S")

    _rename_outputs_and_save(
        mlmodel,
        "TransformerDecoder.mlpackage",
        ['mod1091', 'mod1093', 'mod1097'],
        expected_names=['Identity', 'Identity_1', 'Identity_2'])
class TransformerDecoderModel(tf.keras.models.Model):
    """Holder for a ``TextTransformer`` that is called once on symbolic inputs.

    NOTE(review): the call on ``tf.keras.Input`` placeholders presumably exists
    so that the transformer's variables are created before a checkpoint is
    restored into this object -- confirm.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        self.transformer = TextTransformer()
        embedded = tf.keras.Input(shape=(max_encoderlen, encoder_dim))
        decoderinput = tf.keras.Input(shape=(max_decoderlen,))
        self.transformer((embedded, decoderinput))

        self.transformer.summary()


def convert2():
    """Restore the latest checkpoint from ``ckpt2`` and export both Core ML models.

    Raises:
        FileNotFoundError: if ``ckpt2`` contains no checkpoint.
    """
    model = TransformerDecoderModel()
    last = tf.train.latest_checkpoint('ckpt2')
    print(last)
    if last is None:
        # latest_checkpoint returns None when nothing is found; fail with a
        # clear message instead of handing None to load_weights.
        raise FileNotFoundError("no checkpoint found in 'ckpt2'")
    model.load_weights(last).expect_partial()

    convert_encoder(model)
    convert_decoder(model)


def _load_char_features(path='charparam.npz'):
    """Return ``{character: feature array}`` for every ``'<codepoint>n'`` entry.

    Entries whose key does not end in ``'n'`` are skipped: the original code
    looked every code up under its ``'n'`` key and raised ``KeyError`` when a
    code was only present under another suffix.
    """
    features = {}
    with np.load(path) as npz_file:
        codes = sorted(int(name[:-1]) for name in npz_file.files if name.endswith('n'))
        for code in codes:
            features[chr(code)] = npz_file['%dn' % code]
    return features


def testmodel():
    """Run the saved encoder/decoder packages on a sample input and print the result.

    Reads ``charparam.npz``, ``TransformerEncoder.mlpackage`` and
    ``TransformerDecoder.mlpackage`` from the working directory, decodes
    greedily, and prints the predicted token ids followed by the decoded text.
    """
    print('load char param')
    features = _load_char_features('charparam.npz')
    rng = np.random.default_rng()

    print('load')
    mlmodel_encoder = ct.models.MLModel('TransformerEncoder.mlpackage')
    mlmodel_decoder = ct.models.MLModel('TransformerDecoder.mlpackage')

    print('make input')
    # One (character, 4 extra values) pair per input position; a random stored
    # sample of the character supplies the feature part of the row.
    sample = [
        ('t', [0, 0, 0, 0]),
        ('e', [0, 0, 0, 0]),
        ('s', [0, 0, 0, 0]),
        ('t', [0, 0, 0, 0]),
        ('o', [1, 0, 0, 0]),
        ('u', [0, 0, 0, 0]),
        ('t', [0, 0, 0, 0]),
        ('p', [0, 0, 0, 0]),
        ('u', [0, 0, 0, 0]),
        ('t', [0, 0, 0, 1]),
    ]
    encoder_input = [
        np.concatenate([rng.choice(features[ch]), np.asarray(extra)])
        for ch, extra in sample
    ]
    # Zero-pad the sequence up to the fixed encoder length and add a batch axis.
    encoder_input = np.pad(encoder_input, [[0, max_encoderlen - len(encoder_input)], [0, 0]])
    encoder_input = np.expand_dims(encoder_input, 0)

    print('encoder')
    out1 = mlmodel_encoder.predict({'encoder_input': encoder_input})

    print('decoder')
    decoder_input = np.zeros([1, max_decoderlen], dtype=np.float32)
    decoder_input[0, 0] = decoder_SOT
    count = 0
    # Greedy decoding: feed everything predicted so far, append the next token.
    while count < max_decoderlen - 1 and decoder_input[0, count] != decoder_EOT:
        out2 = mlmodel_decoder.predict({
            'decoder_input': decoder_input,
            **out1,
            'encoder_input': encoder_input,
        })
        i1091 = np.argmax(out2['mod1091'][count, :])
        i1093 = np.argmax(out2['mod1093'][count, :])
        i1097 = np.argmax(out2['mod1097'][count, :])
        code = calc_predid(i1091, i1093, i1097)
        count += 1
        decoder_input[0, count] = code

    code = decoder_input[0].astype(np.int32)
    print(code)
    # Position `count` holds the EOT marker when decoding finished normally,
    # but a real prediction when the loop stopped because the buffer was full.
    end = count if code[count] == decoder_EOT else count + 1
    str_code = code[1:end]
    str_text = ''.join(chr(c) if 0 <= c < 0x110000 else '\uFFFD' for c in str_code)
    print(str_text)
def _load_samples(filename):
    """Load one ``.npy`` file and return it with a leading sample axis."""
    feature = np.load(filename)
    if len(feature.shape) == 1:
        # A file holding a single sample is stored as a 1-D vector.
        feature = np.expand_dims(feature, 0)
    return feature


def _merge_directory(features, char_dir, key_suffix=''):
    """Append every ``*.npy`` file of ``char_dir`` to ``features``.

    The dictionary key is the file name without extension plus ``key_suffix``;
    samples for a key that already exists are concatenated after the old ones.
    """
    print(char_dir)
    for filename in glob.glob(os.path.join(char_dir, '*.npy')):
        code = os.path.splitext(os.path.basename(filename))[0] + key_suffix
        feature = _load_samples(filename)
        previous = features.get(code, np.zeros([0, feature_dim], np.float32))
        features[code] = np.concatenate([previous, feature])
        print(code, feature.shape[0])


features = {}
_merge_directory(features, 'chardata_font')
# File names in chardata_hand carry no suffix, so 'n' is appended to the key.
_merge_directory(features, 'chardata_hand', key_suffix='n')

values = {}
for code in sorted(features):
    values[code] = features[code]
    print(code, values[code].shape[0])

for key, value in values.items():
    if not np.all(np.isfinite(value)):
        # Refuse to write a parameter file that contains NaN or infinity, and
        # report the failure through the exit status.
        print(key)
        print(value)
        sys.exit(1)

np.savez('charparam', **values)
# Detect glyph boxes in an image (or reuse cached results), pass them to the
# external ./linedetect program and plot the boxes it groups into lines.
npzfile = 'params.npz'

if len(sys.argv) < 2 and os.path.exists(npzfile):
    # No image given: reuse the detection results cached by a previous run.
    print('loading params')
    with np.load(npzfile, mmap_mode='r') as params:
        locations = params['locations']
        glyphfeatures = params['glyphfeatures']
        lines = params['lines']
        seps = params['seps']
        im0 = params['im0']
else:
    if len(sys.argv) < 2:
        sys.exit('usage: findline.py IMAGE (IMAGE may be omitted once %s exists)' % npzfile)

    im0 = np.asarray(Image.open(sys.argv[1]).convert('RGB'))

    model = TextDetectorModel()

    # Tiles advance by 3/4 of the network input size.
    stepx = net.width * 3 // 4
    stepy = net.height * 3 // 4

    # Pad with white on the right/bottom so the tiles cover the image exactly.
    padx = max(0, stepx - (im0.shape[1] - net.width) % stepx, net.width - im0.shape[1])
    pady = max(0, stepy - (im0.shape[0] - net.height) % stepy, net.height - im0.shape[0])
    im = np.pad(im0, [[0, pady], [0, padx], [0, 0]], 'constant',
                constant_values=((255, 255), (255, 255), (255, 255)))

    im1 = tf.image.convert_image_dtype(im, dtype=tf.float32)
    im1 = im1 * 255.

    yi = tf.data.Dataset.range(0, im.shape[0] - net.height + 1, stepy)
    xi = tf.data.Dataset.range(0, im.shape[1] - net.width + 1, stepx)
    ds0 = yi.flat_map(lambda y: xi.map(lambda x: (x, y)))
    ds0 = ds0.map(lambda x, y: {
        'input': im1[y:y+net.height, x:x+net.width, :],
        'offsetx': x,
        'offsety': y,
    })
    ds0 = ds0.batch(8)
    ds0 = ds0.prefetch(tf.data.AUTOTUNE)

    locations, glyphfeatures, lines, seps = model.eval(ds0, im1, cut_off=0.5)

    # NOTE(review): indentation was lost in the source; this filter is assumed
    # to belong to the fresh-detection branch because it reads `im`.
    valid_locations = []
    for i, row in enumerate(locations):
        # Only the box geometry is needed; indexing keeps this independent of
        # how many trailing columns the detector emits.
        x, y, w, h = row[1:5]
        x1 = np.clip(int(x - w/2), 0, im.shape[1])
        y1 = np.clip(int(y - h/2), 0, im.shape[0])
        x2 = np.clip(int(x + w/2) + 1, 0, im.shape[1])
        y2 = np.clip(int(y + h/2) + 1, 0, im.shape[0])
        if calcHist(im[y1:y2, x1:x2, :]) < 50:
            continue
        valid_locations.append(i)
    locations = locations[valid_locations, :]
    glyphfeatures = glyphfeatures[valid_locations, :]

    np.savez_compressed(npzfile, locations=locations, glyphfeatures=glyphfeatures,
                        lines=lines, seps=seps, im0=im0)

print('construct data')
map_h, map_w = lines.shape
# Little-endian request: a zero word, map width and height, the two maps, the
# box count, every box row without its score column, and half the image size.
input_binary = b''.join([
    int(0).to_bytes(4, 'little'),
    int(map_w).to_bytes(4, 'little'),
    int(map_h).to_bytes(4, 'little'),
    lines.tobytes(),
    seps.tobytes(),
    int(locations.shape[0]).to_bytes(4, 'little'),
    locations[:, 1:].tobytes(),
    int(im0.shape[1] // 2).to_bytes(4, 'little'),
    int(im0.shape[0] // 2).to_bytes(4, 'little'),
])

print('run')
# check=True: a failing ./linedetect raises instead of yielding bytes that
# would be parsed as an empty or garbage result.
result = subprocess.run('./linedetect', input=input_binary,
                        stdout=subprocess.PIPE, check=True).stdout
count = int.from_bytes(result[:4], byteorder='little')
if len(result) < 4 + 20 * count:
    raise RuntimeError('truncated output from ./linedetect: %d bytes for %d records'
                       % (len(result), count))
# Each record is five signed 32-bit integers: id, block, idx, subidx, subtype.
records = np.frombuffer(result, dtype='<i4', count=5 * count, offset=4).reshape(-1, 5)
detected_boxes = [tuple(int(v) for v in record) for record in records]
max_block = max([0] + [box[1] for box in detected_boxes])

print(detected_boxes)


def _box_outline(cx, cy, w, h, inset=0):
    """Return the closed 5-point outline of a box, shrunk by ``inset`` per side."""
    left = cx - w / 2 + inset
    right = cx + w / 2 - inset
    top = cy - h / 2 + inset
    bottom = cy + h / 2 - inset
    return np.array([
        [left, top],
        [right, top],
        [right, bottom],
        [left, bottom],
        [left, top],
    ])


plt.imshow(im0)
cmap = plt.get_cmap('rainbow', max_block+1)
for box_id, block, idx, subidx, subtype in detected_boxes:
    if box_id < 0:
        continue
    cx, cy, w, h = locations[box_id, 1:5]

    outline = _box_outline(cx, cy, w, h)
    plt.plot(outline[:, 0], outline[:, 1], color=cmap(block))
    if idx < 0:
        t = '*'
    else:
        kind = subtype & (2 + 4)
        if kind == 2 + 4:
            inner = _box_outline(cx, cy, w, h, inset=1)
            plt.plot(inner[:, 0], inner[:, 1], color='yellow')
            t = '%d-r%d-%d' % (block, idx, subidx)
        elif kind == 2:
            inner = _box_outline(cx, cy, w, h, inset=1)
            plt.plot(inner[:, 0], inner[:, 1], color='blue')
            t = '%d-b%d-%d' % (block, idx, subidx)
        else:
            t = '%d-%d-%d' % (block, idx, subidx)
        # NOTE(review): assumed to apply only to boxes with idx >= 0; the
        # original indentation is not recoverable from the source.
        if subtype & 8 == 8:
            t += '+'
    plt.text(cx, cy, t, color='black')

plt.show()
def _suppress_overlaps(locations, cut_off, iou_limit=0.5, cover_limit=0.5):
    """Greedy overlap suppression; return the kept row indices, best score first.

    Rows are ``[score, cx, cy, w, h, ...]``.  Scanning stops at the first score
    below ``cut_off``.  A box is dropped when its IoU with an already kept box
    exceeds ``iou_limit`` or when a kept box covers more than ``cover_limit``
    of its own area.
    """
    done_area = np.zeros([0, 4], dtype=np.float32)
    selected_idx = []
    for i in np.argsort(-locations[:, 0]):
        if locations[i, 0] < cut_off:
            break
        cx, cy, w, h = locations[i, 1:5]
        area0_vol = w * h
        if done_area.size > 0:
            area1_vol = done_area[:, 2] * done_area[:, 3]
            inter_xmin = np.maximum(cx - w / 2, done_area[:, 0] - done_area[:, 2] / 2)
            inter_ymin = np.maximum(cy - h / 2, done_area[:, 1] - done_area[:, 3] / 2)
            inter_xmax = np.minimum(cx + w / 2, done_area[:, 0] + done_area[:, 2] / 2)
            inter_ymax = np.minimum(cy + h / 2, done_area[:, 1] + done_area[:, 3] / 2)
            inter_w = np.maximum(inter_xmax - inter_xmin, 0.)
            inter_h = np.maximum(inter_ymax - inter_ymin, 0.)
            inter_vol = inter_w * inter_h
            union_vol = area0_vol + area1_vol - inter_vol
            iou = np.where(union_vol > 0., inter_vol / union_vol, 0.)
            if iou.max() > iou_limit:
                continue
            if inter_vol.max() > area0_vol * cover_limit:
                continue
        done_area = np.vstack([done_area, np.array([cx, cy, w, h])])
        selected_idx.append(i)
    return selected_idx


def eval(ds, org_img, cut_off = 0.5):
    """Run ``TextDetector.mlpackage`` over image tiles and merge the detections.

    Args:
        ds: iterable of dicts with ``'input'`` (an RGB tile array) and
            ``'offsetx'`` / ``'offsety'`` (tile origin in ``org_img``).
        org_img: the full padded image; only its shape is used.
        cut_off: minimum peak score for a detection.

    Returns:
        ``(locations, glyphfeatures, lines_all, seps_all)`` where ``locations``
        has rows ``[score, cx, cy, w, h, code0..code3]``, ``glyphfeatures`` has
        one feature vector per location, and the two maps have the image size
        divided by ``scale``.
    """
    import coremltools as ct

    mlmodel_detector = ct.models.MLModel('TextDetector.mlpackage')

    print(org_img.shape)
    print("test")

    map_shape = [org_img.shape[0] // scale, org_img.shape[1] // scale]
    lines_all = np.zeros(map_shape, dtype=np.float32)
    seps_all = np.zeros(map_shape, dtype=np.float32)
    code_all = [np.zeros(map_shape, dtype=np.float32) for _ in range(4)]

    locations = []
    glyphfeatures = []

    # Tile size in heat-map cells.
    x_s = width // scale
    y_s = height // scale

    for n, inputs in enumerate(ds):
        print(n)
        x_i = inputs['offsetx']
        y_i = inputs['offsety']
        x_is = x_i // scale
        y_is = y_i // scale

        input_image = Image.fromarray(inputs['input'], mode="RGB")
        output = mlmodel_detector.predict({'Image': input_image})
        maps = output['Output_heatmap']
        feature = output['Output_feature']

        # Trust only the central 2/3 of a tile along every side that has a
        # neighbouring tile; sides on the image border are used in full.
        mask = np.zeros([y_s, x_s], dtype=bool)
        x_min = int(x_s * 1 / 6) if x_i > 0 else 0
        x_max = int(x_s * 5 / 6) if x_i + width < org_img.shape[1] else x_s
        y_min = int(y_s * 1 / 6) if y_i > 0 else 0
        y_max = int(y_s * 5 / 6) if y_i + height < org_img.shape[0] else y_s
        mask[y_min:y_max, x_min:x_max] = True

        line_p = maps[0, :, :, 6]
        seps_p = maps[0, :, :, 7]
        code_p = [maps[0, :, :, 8+k] for k in range(4)]

        # Merge the tile into the full-size maps, keeping the larger value.
        tile = (slice(y_is, y_is+y_s), slice(x_is, x_is+x_s))
        lines_all[tile] = np.maximum(line_p * mask, lines_all[tile])
        seps_all[tile] = np.maximum(seps_p * mask, seps_all[tile])
        for k in range(4):
            code_all[k][tile] = np.maximum(code_p[k] * mask, code_all[k][tile])

        # Visit the cells by decreasing score and stop at the cut-off.
        peak = maps[0, :, :, 1]
        idxy, idxx = np.unravel_index(np.argsort(-peak.ravel()), peak.shape)

        for y, x in zip(idxy, idxx):
            if peak[y, x] < cut_off:
                break
            w, h, dx, dy = maps[0, y, x, 2:6]
            if w * h <= 0:
                continue
            ix = x * scale + dx + x_i
            iy = y * scale + dy + y_i

            codes = [c[y, x] for c in code_p]

            locations.append(np.array([peak[y, x], ix, iy, w, h, *codes]))
            glyphfeatures.append(feature[0, y, x, :])

    # reshape keeps both arrays 2-D when nothing was detected, so no dummy
    # first row is needed (a dummy could be selected when cut_off <= 0).
    locations = np.array(locations, dtype=np.float32).reshape(-1, 5+4)
    glyphfeatures = np.array(glyphfeatures, dtype=np.float32).reshape(-1, feature_dim)

    selected_idx = np.array(_suppress_overlaps(locations, cut_off), dtype=np.int64)
    locations = locations[selected_idx, :]
    glyphfeatures = glyphfeatures[selected_idx, :]

    # Raise each code value to the merged map value at the box centre.
    for i in range(locations.shape[0]):
        x = int(locations[i, 1] / scale)
        y = int(locations[i, 2] / scale)
        if x >= 0 and x < org_img.shape[1] // scale and y >= 0 and y < org_img.shape[0] // scale:
            for k in range(4):
                locations[i, 5+k] = max(code_all[k][y, x], locations[i, 5+k])

    return locations, glyphfeatures, lines_all, seps_all
# Detect glyph boxes with the Core ML detector (or reuse cached results), pass
# them to the external ./linedetect program and plot the grouped boxes.
if len(sys.argv) < 2 and os.path.exists(npzfile):
    # No image given: reuse the detection results cached by a previous run.
    print('loading params')
    with np.load(npzfile, mmap_mode='r') as params:
        locations = params['locations']
        glyphfeatures = params['glyphfeatures']
        lines = params['lines']
        seps = params['seps']
        im0 = params['im0']
else:
    if len(sys.argv) < 2:
        sys.exit('usage: findline_coreml.py IMAGE (IMAGE may be omitted once %s exists)' % npzfile)

    im0 = np.asarray(Image.open(sys.argv[1]).convert('RGB'))

    # Tiles advance by half of the network input size.
    stepx = width * 1 // 2
    stepy = height * 1 // 2

    # Pad with white on the right/bottom so the tiles cover the image exactly.
    padx = max(0, stepx - (im0.shape[1] - width) % stepx, width - im0.shape[1])
    pady = max(0, stepy - (im0.shape[0] - height) % stepy, height - im0.shape[0])
    im0 = np.pad(im0, [[0, pady], [0, padx], [0, 0]], 'constant',
                 constant_values=((255, 255), (255, 255), (255, 255)))

    ds0 = [
        {
            'input': im0[y:y+height, x:x+width, :],
            'offsetx': x,
            'offsety': y,
        }
        for y in range(0, im0.shape[0] - height + 1, stepy)
        for x in range(0, im0.shape[1] - width + 1, stepx)
    ]

    locations, glyphfeatures, lines, seps = eval(ds0, im0, cut_off=0.35)

    # NOTE(review): indentation was lost in the source; this filter is assumed
    # to run only on fresh detections, before they are cached.
    valid_locations = []
    for i, row in enumerate(locations):
        x, y, w, h = row[1:5]
        x1 = np.clip(int(x - w/2), 0, im0.shape[1])
        y1 = np.clip(int(y - h/2), 0, im0.shape[0])
        x2 = np.clip(int(x + w/2) + 1, 0, im0.shape[1])
        y2 = np.clip(int(y + h/2) + 1, 0, im0.shape[0])
        if calcHist(im0[y1:y2, x1:x2, :]) < 35:
            continue
        valid_locations.append(i)
    locations = locations[valid_locations, :]
    glyphfeatures = glyphfeatures[valid_locations, :]

    np.savez_compressed(npzfile, locations=locations, glyphfeatures=glyphfeatures,
                        lines=lines, seps=seps, im0=im0)

print('construct data')
map_h, map_w = lines.shape
# Little-endian request: a zero word, map width and height, the two maps, the
# box count, every box row without its score column, and half the image size.
input_binary = b''.join([
    int(0).to_bytes(4, 'little'),
    int(map_w).to_bytes(4, 'little'),
    int(map_h).to_bytes(4, 'little'),
    lines.tobytes(),
    seps.tobytes(),
    int(locations.shape[0]).to_bytes(4, 'little'),
    locations[:, 1:].tobytes(),
    int(im0.shape[1] // 2).to_bytes(4, 'little'),
    int(im0.shape[0] // 2).to_bytes(4, 'little'),
])

print('run')
# check=True: a failing ./linedetect raises instead of yielding bytes that
# would be parsed as an empty or garbage result.
result = subprocess.run('./linedetect', input=input_binary,
                        stdout=subprocess.PIPE, check=True).stdout
count = int.from_bytes(result[:4], byteorder='little')
if len(result) < 4 + 20 * count:
    raise RuntimeError('truncated output from ./linedetect: %d bytes for %d records'
                       % (len(result), count))
# Each record is five signed 32-bit integers: id, block, idx, subidx, subtype.
records = np.frombuffer(result, dtype='<i4', count=5 * count, offset=4).reshape(-1, 5)
detected_boxes = [tuple(int(v) for v in record) for record in records]
max_block = max([0] + [box[1] for box in detected_boxes])

print(detected_boxes)


def _box_outline(cx, cy, w, h, inset=0):
    """Return the closed 5-point outline of a box, shrunk by ``inset`` per side."""
    left = cx - w / 2 + inset
    right = cx + w / 2 - inset
    top = cy - h / 2 + inset
    bottom = cy + h / 2 - inset
    return np.array([
        [left, top],
        [right, top],
        [right, bottom],
        [left, bottom],
        [left, top],
    ])


plt.imshow(im0)
cmap = plt.get_cmap('rainbow', max_block+1)
for box_id, block, idx, subidx, subtype in detected_boxes:
    if box_id < 0:
        continue
    cx, cy, w, h = locations[box_id, 1:5]

    outline = _box_outline(cx, cy, w, h)
    plt.plot(outline[:, 0], outline[:, 1], color=cmap(block))
    if idx < 0:
        t = '*'
    else:
        kind = subtype & (2 + 4)
        if kind == 2 + 4:
            inner = _box_outline(cx, cy, w, h, inset=1)
            plt.plot(inner[:, 0], inner[:, 1], color='yellow')
            t = '%d-r%d-%d' % (block, idx, subidx)
        elif kind == 2:
            inner = _box_outline(cx, cy, w, h, inset=1)
            plt.plot(inner[:, 0], inner[:, 1], color='blue')
            t = '%d-b%d-%d' % (block, idx, subidx)
        else:
            t = '%d-%d-%d' % (block, idx, subidx)
        # NOTE(review): assumed to apply only to boxes with idx >= 0; the
        # original indentation is not recoverable from the source.
        if subtype & 8 == 8:
            t += '+'
    plt.text(cx, cy, t, color='black')

plt.show()
class TextDetectorModel(tf.keras.models.Model):
    """Detector network plus a tiled evaluation routine that extracts glyph features."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        self.detector = CenterNetDetectionBlock(pre_weight=False)
        self.decoder = SimpleDecoderBlock()

    @staticmethod
    def _suppress_overlaps(locations, cut_off, iou_limit=0.75, cover_limit=0.8):
        """Greedy overlap suppression; return the kept row indices, best score first.

        Rows are ``[score, cx, cy, w, h, ...]``.  Scanning stops at the first
        score below ``cut_off``.  A box is dropped when its IoU with a kept box
        exceeds ``iou_limit`` or when a kept box covers more than
        ``cover_limit`` of its own area.
        """
        done_area = np.zeros([0, 4])
        selected_idx = []
        for i in np.argsort(-locations[:, 0]):
            if locations[i, 0] < cut_off:
                break
            cx, cy, w, h = locations[i, 1:5]
            area0_vol = w * h
            if done_area.size > 0:
                area1_vol = done_area[:, 2] * done_area[:, 3]
                inter_xmin = np.maximum(cx - w / 2, done_area[:, 0] - done_area[:, 2] / 2)
                inter_ymin = np.maximum(cy - h / 2, done_area[:, 1] - done_area[:, 3] / 2)
                inter_xmax = np.minimum(cx + w / 2, done_area[:, 0] + done_area[:, 2] / 2)
                inter_ymax = np.minimum(cy + h / 2, done_area[:, 1] + done_area[:, 3] / 2)
                inter_w = np.maximum(inter_xmax - inter_xmin, 0.)
                inter_h = np.maximum(inter_ymax - inter_ymin, 0.)
                inter_vol = inter_w * inter_h
                union_vol = area0_vol + area1_vol - inter_vol
                iou = np.where(union_vol > 0., inter_vol / union_vol, 0.)
                if iou.max() > iou_limit:
                    continue
                if inter_vol.max() > area0_vol * cover_limit:
                    continue
            done_area = np.vstack([done_area, np.array([cx, cy, w, h])])
            selected_idx.append(i)
        return selected_idx

    def eval(self, ds, org_img, cut_off = 0.5):
        """Detect glyphs tile by tile and return their boxes and feature vectors.

        Args:
            ds: batched ``tf.data`` dataset of dicts with ``'input'`` tiles and
                their ``'offsetx'`` / ``'offsety'`` origins in ``org_img``.
            org_img: the full padded image tensor; only its shape is used.
            cut_off: minimum peak score for a detection.

        Returns:
            ``(locations, glyphfeatures)``: rows
            ``[score, cx, cy, w, h, code0..code3]`` and the matching feature
            vectors, both ``float64`` and empty (0 rows) when nothing is found.
        """
        org_img = org_img.numpy()
        print(org_img.shape)
        print("test")

        locations = []
        glyphfeatures = []

        # Tile size in heat-map cells.
        x_s = width // scale
        y_s = height // scale

        for n, inputs in ds.enumerate():
            print(n.numpy())
            offsetx = inputs['offsetx'].numpy()
            offsety = inputs['offsety'].numpy()

            maps, feature = self.detector(inputs['input'])

            # A cell is a peak when it equals the maximum of its 5x5 window.
            keymap = maps[..., 0]
            local_peak = tf.nn.max_pool2d(keymap[..., tf.newaxis], 5, 1, 'SAME')
            keep = local_peak[..., 0] == keymap
            detectedkey = (tf.math.sigmoid(keymap) * tf.cast(keep, tf.float32)).numpy()

            # Convert once per batch instead of running a TF op per candidate.
            widths = (tf.math.exp(maps[..., 1] - 3) * 1024).numpy()
            heights = (tf.math.exp(maps[..., 2] - 3) * 1024).numpy()
            xoffset = (maps[..., 3] * scale).numpy()
            yoffset = (maps[..., 4] * scale).numpy()
            code_map = tf.math.sigmoid(maps[..., 7:11]).numpy()
            feature = feature.numpy()

            for img_idx in range(offsetx.shape[0]):
                x_i = offsetx[img_idx]
                y_i = offsety[img_idx]

                # Trust only the central 2/3 of a tile along every side that
                # has a neighbouring tile; border sides are used in full.
                mask = np.zeros([y_s, x_s], dtype=bool)
                x_min = int(x_s * 1 / 6) if x_i > 0 else 0
                x_max = int(x_s * 5 / 6) if x_i + width < org_img.shape[1] else x_s
                y_min = int(y_s * 1 / 6) if y_i > 0 else 0
                y_max = int(y_s * 5 / 6) if y_i + height < org_img.shape[0] else y_s
                mask[y_min:y_max, x_min:x_max] = True

                # Visit the cells by decreasing score and stop at the cut-off.
                peak = detectedkey[img_idx, ...] * mask
                idxy, idxx = np.unravel_index(np.argsort(-peak.ravel()), peak.shape)

                for y, x in zip(idxy, idxx):
                    if peak[y, x] < cut_off:
                        break
                    w = widths[img_idx, y, x]
                    h = heights[img_idx, y, x]
                    if w * h <= 0:
                        continue

                    ix = x * scale + xoffset[img_idx, y, x] + x_i
                    iy = y * scale + yoffset[img_idx, y, x] + y_i

                    codes = code_map[img_idx, y, x, :]

                    locations.append(np.array([peak[y, x], ix, iy, w, h, *codes]))
                    glyphfeatures.append(feature[img_idx, y, x, :])

        # reshape keeps both arrays 2-D when nothing was detected, so no dummy
        # first row is needed (a dummy could be selected when cut_off <= 0).
        locations = np.array(locations, dtype=np.float64).reshape(-1, 5+4)
        glyphfeatures = np.array(glyphfeatures, dtype=np.float64).reshape(-1, feature_dim)

        selected_idx = np.array(self._suppress_overlaps(locations, cut_off), dtype=np.int64)
        return locations[selected_idx, :], glyphfeatures[selected_idx, :]
def load_background_images(im_width, im_height):
    """Return a random augmented background crop as float32 RGB in ``[0, 1]``.

    A file is picked from ``random_background``, scaled so that it covers
    ``im_width`` x ``im_height``, cropped at a random position, randomly
    flipped, and randomly changed in brightness and contrast.
    """
    import random
    ind = random.choice(range(len(random_background)))
    # Close the file as soon as it is decoded instead of leaving the handle
    # to the garbage collector.
    with Image.open(random_background[ind]) as src:
        img0 = src.convert('RGB')
    scale_min = max(float(im_width) / float(img0.width), float(im_height) / float(img0.height))
    scale_max = max(scale_min + 0.5, 1.5)
    s = rng.uniform(scale_min, scale_max)
    img = img0.resize((int(float(img0.width) * s)+1, int(float(img0.height) * s)+1), Image.BILINEAR)
    x1 = max(0, int(rng.uniform(0, img.width - im_width)))
    y1 = max(0, int(rng.uniform(0, img.height - im_height)))
    img = np.asarray(img)[y1:y1+im_height, x1:x1+im_width, :]

    if rng.uniform() < 0.5:
        img = img[::-1, :, :]
    if rng.uniform() < 0.5:
        img = img[:, ::-1, :]
    enhancer = ImageEnhance.Brightness(Image.fromarray(img))
    img = enhancer.enhance(rng.uniform())
    enhancer = ImageEnhance.Contrast(img)
    img = enhancer.enhance(rng.uniform(0.2, 1.8))

    img = np.asarray(img).astype(np.float32) / 255.
    img = np.clip(img, 0., 1.)
    return img


def background_image(im_width, im_height):
    """Return ``(fgimg, bkimg)``: a foreground colour and a background image.

    ``bkimg`` is a random background crop and ``fgimg`` a ``(1, 1, 3)`` colour
    drawn at least ``min_delta`` plus two standard deviations away from the
    per-channel background minimum.  Both are divided by the same factor when
    the drawn colour leaves ``[-1, 1]``, and the colour is then clipped to
    ``[0, 1]``.
    """
    bkimg = load_background_images(im_width, im_height)
    bk_c = np.min(bkimg, axis=(0, 1))
    bk_std = np.std(bkimg, axis=(0, 1))

    below = bk_c - bk_std * 2 - min_delta
    above = bk_c + bk_std * 2 + min_delta
    # Both candidates are always drawn (in this order); np.where then picks
    # the darker one for bright channels and the brighter one for dark ones.
    darker = rng.uniform(np.clip(below, None, -1), below, [3])
    brighter = rng.uniform(above, np.clip(above, 1, None), [3])
    fg_c = np.where(bk_c > 0.5, darker, brighter)

    bk_alpha = np.maximum(np.max(np.abs(fg_c)), 1)
    bkimg /= bk_alpha
    fg_c /= bk_alpha
    fg_c = np.clip(fg_c, 0., 1.)
    fgimg = fg_c[None, None, :]
    return fgimg, bkimg


def preprocess_image(image, pos):
    """Randomly stretch and slightly rotate a glyph image and its boxes.

    Args:
        image: 2-D image array.
        pos: array whose rows are ``[cx, cy, w, h]``; it is not modified.

    Returns:
        ``(image, pos)``: the transformed image and a new float array of boxes.
    """
    aspect = rng.uniform(0.75, 1.3)
    w = int(image.shape[1]*aspect)
    h = int(image.shape[0]/aspect)
    im = Image.fromarray(image).resize((w, h), Image.Resampling.BILINEAR)
    image = np.asarray(im)
    # Work on a float copy: the original scaled the caller's array in place,
    # which also fails for integer arrays.
    pos = np.array(pos, dtype=np.float64) * np.array([aspect, 1/aspect, aspect, 1/aspect])

    angle = rng.normal() * 2.0
    rad = angle/180*np.pi
    # Pad the sides the rotation moves content towards.
    py1 = max(0, int(image.shape[1]*np.sin(rad)))
    py2 = max(0, int(image.shape[1]*np.sin(-rad)))
    px1 = max(0, int(image.shape[0]*np.sin(-rad)))
    px2 = max(0, int(image.shape[0]*np.sin(rad)))
    image = np.pad(image, ((py1, py2), (px1, px2)))
    im = Image.fromarray(image).rotate(angle, Image.Resampling.BILINEAR, center=(px1, py1))

    M = np.array([[np.cos(rad), -np.sin(rad)],
                  [np.sin(rad), np.cos(rad)]])
    pos[:, :2] = (pos[:, :2] @ M)
    # Grow each box by the extent its other side gains through the rotation.
    pos[:, 2:4] += np.array([pos[:, 3] * np.abs(np.sin(rad)),
                             pos[:, 2] * np.abs(np.sin(rad))]).T
    pos += np.array([px1 - 1, py1 - 1, 0, 0])
    return np.asarray(im), pos


def random_filter(image):
    """Apply a Gaussian blur and an unsharp mask, each with a random radius."""
    img = Image.fromarray(image)
    r = rng.uniform()
    if r > 0:
        img = img.filter(ImageFilter.GaussianBlur(radius=r))

    r = rng.uniform()
    if r > 0:
        img = img.filter(ImageFilter.UnsharpMask(radius=r, percent=150, threshold=3))

    return np.array(img)
pos = d['position'] + if pos.size == 0: + return + codes = d['code_list'] + image = d['image'] + image, pos = preprocess_image(image, pos) + image = random_filter(image) + fgimg, bkimg = background_image(image.shape[1], image.shape[0]) + + img = image[...,None] + img = img / 255. + image = fgimg * img + bkimg * (1 - img) + image = np.clip(image, 0., 1.) + image = image * 255 + + stepx = width * 1 // 2 + stepy = height * 1 // 2 + + im0 = np.asarray(image) + + padx = max(0, stepx - (im0.shape[1] - width) % stepx, width - im0.shape[1]) + pady = max(0, stepy - (im0.shape[0] - height) % stepy, height - im0.shape[0]) + im0 = np.pad(im0, [[0,pady],[0,padx],[0,0]], 'constant', constant_values=((255,255),(255,255),(255,255))) + + im = tf.image.convert_image_dtype(im0, dtype=tf.float32) + + yi = tf.data.Dataset.range(0, im0.shape[0] - height + 1, stepy) + xi = tf.data.Dataset.range(0, im0.shape[1] - width + 1, stepx) + ds0 = yi.flat_map(lambda y: xi.map(lambda x : (x, y))) + ds0 = ds0.map(lambda x,y: { + 'input': im[y:y+height,x:x+width,:], + 'offsetx': x, + 'offsety': y, + }) + ds0 = ds0.batch(8) + ds0 = ds0.prefetch(tf.data.AUTOTUNE) + + locations, glyphfeatures = model.eval(ds0, im) + for i in range(locations.shape[0]): + cx = locations[i,1] + cy = locations[i,2] + w = locations[i,3] + h = locations[i,4] + area0_vol = w * h + + area1_vol = pos[:,2] * pos[:,3] + inter_xmin = np.maximum(cx - w / 2, pos[:,0] - pos[:,2] / 2) + inter_ymin = np.maximum(cy - h / 2, pos[:,1] - pos[:,3] / 2) + inter_xmax = np.minimum(cx + w / 2, pos[:,0] + pos[:,2] / 2) + inter_ymax = np.minimum(cy + h / 2, pos[:,1] + pos[:,3] / 2) + inter_w = np.maximum(inter_xmax - inter_xmin, 0.) + inter_h = np.maximum(inter_ymax - inter_ymin, 0.) + inter_vol = inter_w * inter_h + union_vol = area0_vol + area1_vol - inter_vol + iou = np.where(union_vol > 0., inter_vol / union_vol, 0.) 
def save_codefeature(code, feature, turn=False, out_dir=None):
    """Append one glyph feature vector to the per-character ``.npy`` file.

    The file is named ``<code>t.npy`` when ``turn`` is true and
    ``<code>n.npy`` otherwise.  If it already exists, the new feature is
    stacked below the rows already stored.

    Args:
        code: integer character code, formatted with ``%d`` into the file name.
        feature: feature vector (or block of row vectors) to append.
        turn: selects the ``t`` (true) or ``n`` (false) file-name suffix.
        out_dir: directory to write into.  Defaults to the module-level
            ``output_dir`` so existing callers are unaffected.

    Returns:
        None.  The number of rows now stored is printed together with
        ``code`` and ``turn``.
    """
    target_dir = output_dir if out_dir is None else out_dir
    os.makedirs(target_dir, exist_ok=True)
    suffix = 't' if turn else 'n'
    filename = os.path.join(target_dir, '%d%s.npy' % (code, suffix))

    # Keep the stored array 2-D from the very first save.  Previously a
    # single feature was written as a 1-D array and only became 2-D after the
    # second save; np.vstack still accepts such legacy 1-D files.
    stacked = np.atleast_2d(feature)
    if os.path.exists(filename):
        stacked = np.vstack([np.load(filename), stacked])

    # Report the real row count (the first save used to print 0).
    count = stacked.shape[0]
    print(code, turn, count)
    np.save(filename, stacked)
+random_background = glob.glob(os.path.join(data_path,'data','background','*')) +print(len(random_background),'background files loaded.') + +mlmodel_detector = ct.models.MLModel('TextDetector.mlpackage') + +def eval(ds, org_img, cut_off = 0.5): + print(org_img.shape) + print("test") + + locations = [np.zeros(5+4)] + glyphfeatures = [np.zeros(feature_dim, dtype=np.float32)] + keymap_all = np.zeros([org_img.shape[0] // scale, org_img.shape[1] // scale]) + lines_all = np.zeros([org_img.shape[0] // scale, org_img.shape[1] // scale]) + seps_all = np.zeros([org_img.shape[0] // scale, org_img.shape[1] // scale]) + code_all = [] + for _ in range(4): + code_all.append(np.zeros([org_img.shape[0] // scale, org_img.shape[1] // scale])) + + for n, inputs in enumerate(ds): + print(n) + x_i = inputs['offsetx'] + y_i = inputs['offsety'] + x_is = x_i // scale + y_is = y_i // scale + x_s = width // scale + y_s = height // scale + + input_image = Image.fromarray(inputs['input'], mode="RGB") + output = mlmodel_detector.predict({'Image': input_image}) + maps = output['Output_heatmap'] + feature = output['Output_feature'] + + mask = np.zeros([y_s, x_s], dtype=bool) + x_min = int(x_s * 1 / 6) if x_i > 0 else 0 + x_max = int(x_s * 5 / 6) if x_i + width < org_img.shape[1] else x_s + y_min = int(y_s * 1 / 6) if y_i > 0 else 0 + y_max = int(y_s * 5 / 6) if y_i + height < org_img.shape[0] else y_s + mask[y_min:y_max, x_min:x_max] = True + + keymap_p = maps[0,:,:,0] + line_p = maps[0,:,:,6] + seps_p = maps[0,:,:,7] + code_p = [] + for k in range(4): + code_p.append(maps[0,:,:,8+k]) + + keymap_all[y_is:y_is+y_s,x_is:x_is+x_s] = np.maximum(keymap_p * mask, keymap_all[y_is:y_is+y_s,x_is:x_is+x_s]) + lines_all[y_is:y_is+y_s,x_is:x_is+x_s] = np.maximum(line_p * mask, lines_all[y_is:y_is+y_s,x_is:x_is+x_s]) + seps_all[y_is:y_is+y_s,x_is:x_is+x_s] = np.maximum(seps_p * mask, seps_all[y_is:y_is+y_s,x_is:x_is+x_s]) + for k in range(4): + code_all[k][y_is:y_is+y_s,x_is:x_is+x_s] = np.maximum(code_p[k] * 
mask, code_all[k][y_is:y_is+y_s,x_is:x_is+x_s]) + + peak = maps[0,:,:,1] + idxy, idxx = np.unravel_index(np.argsort(-peak.ravel()), peak.shape) + + for y, x in zip(idxy, idxx): + if peak[y,x] < cut_off: + break + w = maps[0,y,x,2] + h = maps[0,y,x,3] + dx = maps[0,y,x,4] + dy = maps[0,y,x,5] + if w * h <= 0: + continue + ix = x * scale + dx + x_i + iy = y * scale + dy + y_i + + codes = [] + for k in range(4): + codes.append(code_p[k][y,x]) + + locations.append(np.array([peak[y,x], ix, iy, w, h, *codes])) + glyphfeatures.append(feature[0, y, x, :]) + + locations = np.array(locations) + glyphfeatures = np.array(glyphfeatures) + + idx = np.argsort(-locations[:,0]) + done_area = np.zeros([0,4]) + selected_idx = [] + for i in idx: + p = locations[i,0] + if p < cut_off: + break + cx = locations[i,1] + cy = locations[i,2] + w = locations[i,3] + h = locations[i,4] + area0_vol = w * h + if done_area.size > 0: + area1_vol = done_area[:,2] * done_area[:,3] + inter_xmin = np.maximum(cx - w / 2, done_area[:,0] - done_area[:,2] / 2) + inter_ymin = np.maximum(cy - h / 2, done_area[:,1] - done_area[:,3] / 2) + inter_xmax = np.minimum(cx + w / 2, done_area[:,0] + done_area[:,2] / 2) + inter_ymax = np.minimum(cy + h / 2, done_area[:,1] + done_area[:,3] / 2) + inter_w = np.maximum(inter_xmax - inter_xmin, 0.) + inter_h = np.maximum(inter_ymax - inter_ymin, 0.) + inter_vol = inter_w * inter_h + union_vol = area0_vol + area1_vol - inter_vol + iou = np.where(union_vol > 0., inter_vol / union_vol, 0.) 
+ if iou.max() > 0.75: + continue + if inter_vol.max() > area0_vol * 0.75: + continue + done_area = np.vstack([done_area, np.array([cx, cy, w, h])]) + selected_idx.append(i) + + if len(selected_idx) > 0: + selected_idx = np.array(selected_idx) + + locations = locations[selected_idx,:] + glyphfeatures = glyphfeatures[selected_idx,:] + else: + locations = np.zeros([0,5+4]) + glyphfeatures = np.zeros([0,feature_dim], dtype=np.float32) + + for i in range(locations.shape[0]): + cx = locations[i,1] + cy = locations[i,2] + x = int(cx / scale) + y = int(cy / scale) + if x >= 0 and x < org_img.shape[1] // scale and y >= 0 and y < org_img.shape[0] // scale: + for k in range(4): + locations[i,5+k] = max(code_all[k][y,x], locations[i,5+k]) + + return locations, glyphfeatures + +def load_background_images(im_width, im_height): + import random + ind = random.choice(range(len(random_background))) + img0 = Image.open(random_background[ind]).convert('RGB') + scale_min = max(float(im_width) / float(img0.width), float(im_height) / float(img0.height)) + scale_max = max(scale_min + 0.5, 1.5) + s = np.random.uniform(scale_min, scale_max) + img = img0.resize((int(float(img0.width) * s)+1, int(float(img0.height) * s)+1),Image.BILINEAR) + x1 = max(0, int(np.random.uniform(0, img.width - im_width))) + y1 = max(0, int(np.random.uniform(0, img.height - im_height))) + img = np.asarray(img)[y1:y1+im_height, x1:x1+im_width,:] + + if np.random.uniform() < 0.5: + img = img[::-1,:,:] + if np.random.uniform() < 0.5: + img = img[:,::-1,:] + enhancer = ImageEnhance.Brightness(Image.fromarray(img)) + img = enhancer.enhance(np.random.uniform()) + enhancer = ImageEnhance.Contrast(img) + img = enhancer.enhance(np.random.uniform(0.2,1.8)) + + img = np.asarray(img).astype(np.float32) / 255. + img = np.clip(img, 0., 1.) 
def random_filter(image, rng=None):
    """Apply a random Gaussian blur followed by a random unsharp mask.

    Args:
        image: array accepted by ``PIL.Image.fromarray``.
        rng: optional generator providing ``uniform()``.  When omitted, the
            module-level ``rng`` is used if it exists; otherwise a fresh
            ``np.random.default_rng()`` is created.

    Returns:
        The filtered image as a new NumPy array.
    """
    if rng is None:
        # This script only creates its global ``rng`` under the __main__
        # guard, so a bare reference raises NameError when the module is
        # imported.  Prefer that generator when present, else make one.
        rng = globals().get('rng')
        if rng is None:
            rng = np.random.default_rng()

    img = Image.fromarray(image)

    # Blur radius drawn from uniform(); the guard only skips an exact 0.
    r = rng.uniform()
    if r > 0:
        img = img.filter(ImageFilter.GaussianBlur(radius=r))

    # Independent draw for the sharpening radius.
    r = rng.uniform()
    if r > 0:
        img = img.filter(ImageFilter.UnsharpMask(radius=r, percent=150, threshold=3))

    return np.array(img)
turn=turn) + pos = d['position'] + if pos.size == 0: + return + codes = d['code_list'] + image = d['image'] + image, pos = preprocess_image(image, pos) + image = random_filter(image) + fgimg, bkimg = background_image(image.shape[1], image.shape[0]) + + img = image[...,None] + img = img / 255. + image = fgimg * img + bkimg * (1 - img) + image = np.clip(image, 0., 1.) + image = image * 255 + + stepx = width * 1 // 2 + stepy = height * 1 // 2 + + im0 = np.asarray(image).astype(np.uint8) + + padx = max(0, stepx - (im0.shape[1] - width) % stepx, width - im0.shape[1]) + pady = max(0, stepy - (im0.shape[0] - height) % stepy, height - im0.shape[0]) + im0 = np.pad(im0, [[0,pady],[0,padx],[0,0]], 'constant', constant_values=((255,255),(255,255),(255,255))) + + ds0 = [] + for y in range(0, im0.shape[0] - height + 1, stepy): + for x in range(0, im0.shape[1] - width + 1, stepx): + ds0.append({ + 'input': im0[y:y+height,x:x+width,:], + 'offsetx': x, + 'offsety': y, + }) + locations, glyphfeatures = eval(ds0, im0) + + # import matplotlib.pyplot as plt + # plt.imshow(im0) + # for i in range(locations.shape[0]): + # cx = locations[i,1] + # cy = locations[i,2] + # w = locations[i,3] + # h = locations[i,4] + # points = [ + # [cx-w/2,cy-h/2], + # [cx-w/2,cy+h/2], + # [cx+w/2,cy+h/2], + # [cx+w/2,cy-h/2], + # [cx-w/2,cy-h/2], + # ] + # points = np.array(points) + # plt.plot(points[:,0],points[:,1],color='blue') + + # for i in range(pos.shape[0]): + # cx = pos[i,0] + # cy = pos[i,1] + # w = pos[i,2] + # h = pos[i,3] + # points = [ + # [cx-w/2,cy-h/2], + # [cx-w/2,cy+h/2], + # [cx+w/2,cy+h/2], + # [cx+w/2,cy-h/2], + # [cx-w/2,cy-h/2], + # ] + # points = np.array(points) + # plt.plot(points[:,0],points[:,1],color='red') + + + for i in range(locations.shape[0]): + cx = locations[i,1] + cy = locations[i,2] + w = locations[i,3] + h = locations[i,4] + area0_vol = w * h + + area1_vol = pos[:,2] * pos[:,3] + inter_xmin = np.maximum(cx - w / 2, pos[:,0] - pos[:,2] / 2) + inter_ymin = 
def save_codefeature(code, feature, turn=False, out_dir=None):
    """Append one glyph feature vector to the per-character ``.npy`` file.

    The file is named ``<code>t.npy`` when ``turn`` is true and
    ``<code>n.npy`` otherwise.  If it already exists, the new feature is
    stacked below the rows already stored.

    Args:
        code: integer character code, formatted with ``%d`` into the file name.
        feature: feature vector (or block of row vectors) to append.
        turn: selects the ``t`` (true) or ``n`` (false) file-name suffix.
        out_dir: directory to write into.  Defaults to the module-level
            ``output_dir`` so existing callers are unaffected.

    Returns:
        None.  The number of rows now stored is printed together with
        ``code`` and ``turn``.
    """
    target_dir = output_dir if out_dir is None else out_dir
    os.makedirs(target_dir, exist_ok=True)
    suffix = 't' if turn else 'n'
    filename = os.path.join(target_dir, '%d%s.npy' % (code, suffix))

    # Keep the stored array 2-D from the very first save.  Previously a
    # single feature was written as a 1-D array and only became 2-D after the
    # second save; np.vstack still accepts such legacy 1-D files.
    stacked = np.atleast_2d(feature)
    if os.path.exists(filename):
        stacked = np.vstack([np.load(filename), stacked])

    # Report the real row count (the first save used to print 0).
    count = stacked.shape[0]
    print(code, turn, count)
    np.save(filename, stacked)
+output_dir = 'chardata_hand' + +mlmodel_detector = ct.models.MLModel('TextDetector.mlpackage') +mlmodel_decoder = ct.models.MLModel('CodeDecoder.mlpackage') + +def eval(ds, org_img, cut_off = 0.5): + print(org_img.shape) + print("test") + + locations = [np.zeros(5+4)] + glyphfeatures = [np.zeros(feature_dim, dtype=np.float32)] + keymap_all = np.zeros([org_img.shape[0] // scale, org_img.shape[1] // scale]) + lines_all = np.zeros([org_img.shape[0] // scale, org_img.shape[1] // scale]) + seps_all = np.zeros([org_img.shape[0] // scale, org_img.shape[1] // scale]) + code_all = [] + for _ in range(4): + code_all.append(np.zeros([org_img.shape[0] // scale, org_img.shape[1] // scale])) + + for n, inputs in enumerate(ds): + print(n) + x_i = inputs['offsetx'] + y_i = inputs['offsety'] + x_is = x_i // scale + y_is = y_i // scale + x_s = width // scale + y_s = height // scale + + input_image = Image.fromarray(inputs['input'], mode="RGB") + output = mlmodel_detector.predict({'Image': input_image}) + maps = output['Output_heatmap'] + feature = output['Output_feature'] + + mask = np.zeros([y_s, x_s], dtype=bool) + x_min = int(x_s * 1 / 6) if x_i > 0 else 0 + x_max = int(x_s * 5 / 6) if x_i + width < org_img.shape[1] else x_s + y_min = int(y_s * 1 / 6) if y_i > 0 else 0 + y_max = int(y_s * 5 / 6) if y_i + height < org_img.shape[0] else y_s + mask[y_min:y_max, x_min:x_max] = True + + keymap_p = maps[0,:,:,0] + line_p = maps[0,:,:,6] + seps_p = maps[0,:,:,7] + code_p = [] + for k in range(4): + code_p.append(maps[0,:,:,8+k]) + + keymap_all[y_is:y_is+y_s,x_is:x_is+x_s] = np.maximum(keymap_p * mask, keymap_all[y_is:y_is+y_s,x_is:x_is+x_s]) + lines_all[y_is:y_is+y_s,x_is:x_is+x_s] = np.maximum(line_p * mask, lines_all[y_is:y_is+y_s,x_is:x_is+x_s]) + seps_all[y_is:y_is+y_s,x_is:x_is+x_s] = np.maximum(seps_p * mask, seps_all[y_is:y_is+y_s,x_is:x_is+x_s]) + for k in range(4): + code_all[k][y_is:y_is+y_s,x_is:x_is+x_s] = np.maximum(code_p[k] * mask, 
code_all[k][y_is:y_is+y_s,x_is:x_is+x_s]) + + peak = maps[0,:,:,1] + idxy, idxx = np.unravel_index(np.argsort(-peak.ravel()), peak.shape) + + for y, x in zip(idxy, idxx): + if peak[y,x] < cut_off: + break + w = maps[0,y,x,2] + h = maps[0,y,x,3] + dx = maps[0,y,x,4] + dy = maps[0,y,x,5] + if w * h <= 0: + continue + ix = x * scale + dx + x_i + iy = y * scale + dy + y_i + + codes = [] + for k in range(4): + codes.append(code_p[k][y,x]) + + locations.append(np.array([peak[y,x], ix, iy, w, h, *codes])) + glyphfeatures.append(feature[0, y, x, :]) + + locations = np.array(locations) + glyphfeatures = np.array(glyphfeatures) + + idx = np.argsort(-locations[:,0]) + done_area = np.zeros([0,4]) + selected_idx = [] + for i in idx: + p = locations[i,0] + if p < cut_off: + break + cx = locations[i,1] + cy = locations[i,2] + w = locations[i,3] + h = locations[i,4] + area0_vol = w * h + if done_area.size > 0: + area1_vol = done_area[:,2] * done_area[:,3] + inter_xmin = np.maximum(cx - w / 2, done_area[:,0] - done_area[:,2] / 2) + inter_ymin = np.maximum(cy - h / 2, done_area[:,1] - done_area[:,3] / 2) + inter_xmax = np.minimum(cx + w / 2, done_area[:,0] + done_area[:,2] / 2) + inter_ymax = np.minimum(cy + h / 2, done_area[:,1] + done_area[:,3] / 2) + inter_w = np.maximum(inter_xmax - inter_xmin, 0.) + inter_h = np.maximum(inter_ymax - inter_ymin, 0.) + inter_vol = inter_w * inter_h + union_vol = area0_vol + area1_vol - inter_vol + iou = np.where(union_vol > 0., inter_vol / union_vol, 0.) 
def decode(glyphfeatures):
    """Run the code decoder on every glyph feature vector.

    Each feature is fed to ``mlmodel_decoder`` with a leading batch axis; the
    ``Output_id`` values are combined into one id by ``calc_predid`` and the
    ``Output_p`` value is kept alongside it.

    Args:
        glyphfeatures: sequence (or 2-D array) of feature vectors, one per
            detected box.

    Returns:
        ``(glyphids, glyphprobs)`` as NumPy arrays with one entry per input
        feature.  For empty input both arrays are empty; the probability
        array is then 1-D float32, since the per-item shape of ``Output_p``
        cannot be known without running the model.
    """
    print("decode")

    # np.stack raises ValueError on an empty list, and the caller only checks
    # for "no box found" after decoding, so return empty results here.
    if len(glyphfeatures) == 0:
        return np.zeros([0], dtype=int), np.zeros([0], dtype=np.float32)

    glyphids = []
    glyphprobs = []
    for data in glyphfeatures:
        decode_output = mlmodel_decoder.predict({'Input': np.expand_dims(data, 0)})
        p = decode_output['Output_p'][0]
        ids = list(decode_output['Output_id'][0].astype(int))
        glyphids.append(calc_predid(*ids))
        glyphprobs.append(p)

    return np.stack(glyphids), np.stack(glyphprobs)
max(0, stepx - (im0.shape[1] - width) % stepx, width - im0.shape[1]) + pady = max(0, stepy - (im0.shape[0] - height) % stepy, height - im0.shape[0]) + im0 = np.pad(im0, [[0,pady],[0,padx],[0,0]], 'constant', constant_values=((255,255),(255,255),(255,255))) + + ds0 = [] + for y in range(0, im0.shape[0] - height + 1, stepy): + for x in range(0, im0.shape[1] - width + 1, stepx): + ds0.append({ + 'input': im0[y:y+height,x:x+width,:], + 'offsetx': x, + 'offsety': y, + }) + locations, glyphfeatures = eval(ds0, im0, cut_off=0.35) + locations, glyphfeatures = filter_boxes(im0, locations, glyphfeatures) + glyphids, glyphprobs = decode(glyphfeatures) + + if locations.shape[0] < 1: + print('no box found.') + return + + box_points = [] + pred_chars = [] + for i, loc in enumerate(locations): + cx = loc[1] + cy = loc[2] + w = loc[3] + h = loc[4] + cid = glyphids[i] + + points = [ + [cx - w / 2, cy - h / 2], + [cx + w / 2, cy - h / 2], + [cx + w / 2, cy + h / 2], + [cx - w / 2, cy + h / 2], + [cx - w / 2, cy - h / 2], + ] + points = np.array(points) + + if cid < 0x10FFFF: + pred_char = chr(cid) + else: + pred_char = None + + pred_chars.append(pred_char) + box_points.append(points) + + box_points = np.array(box_points) + + fig = plt.figure() + fig.gca().imshow(im0) + + global targetIdx, waiting + targetIdx = -1 + waiting = False + + def onclick(event): + global targetIdx, waiting + if waiting: + fig.canvas.draw_idle() + return + ix, iy = event.xdata, event.ydata + if ix is None or iy is None: + fig.canvas.draw_idle() + return + b1 = np.logical_and(box_points[:,0,0] < ix, box_points[:,0,1] < iy) + b2 = np.logical_and(box_points[:,1,0] > ix, box_points[:,1,1] < iy) + b3 = np.logical_and(box_points[:,2,0] > ix, box_points[:,2,1] > iy) + b4 = np.logical_and(box_points[:,3,0] < ix, box_points[:,3,1] > iy) + idx = np.where(np.logical_and(np.logical_and(b1,b2),np.logical_and(b3,b4)))[0] + if idx.size == 0: + fig.canvas.draw_idle() + return + else: + idx = idx[0] + targetIdx = idx + if 
def save_codefeature(code, feature, out_dir=None):
    """Append one glyph feature vector to ``<code>.npy``.

    If the file already exists, the new feature is stacked below the rows
    already stored.

    Args:
        code: integer character code, formatted with ``%d`` into the file name.
        feature: feature vector (or block of row vectors) to append.
        out_dir: directory to write into.  Defaults to the module-level
            ``output_dir`` so existing callers are unaffected.

    Returns:
        None.  The number of rows now stored is printed together with ``code``.
    """
    target_dir = output_dir if out_dir is None else out_dir
    os.makedirs(target_dir, exist_ok=True)
    filename = os.path.join(target_dir, '%d.npy' % code)

    # Keep the stored array 2-D from the very first save.  Previously a
    # single feature was written as a 1-D array and only became 2-D after the
    # second save; np.vstack still accepts such legacy 1-D files.
    stacked = np.atleast_2d(feature)
    if os.path.exists(filename):
        stacked = np.vstack([np.load(filename), stacked])

    # Report the real row count (the first save used to print 0).
    count = stacked.shape[0]
    print(code, count)
    np.save(filename, stacked)
def maxpool2d(input_matrix, kernel_size):
    """Return the stride-1 sliding-window maximum of a 2-D array.

    The input is padded by ``kernel_size // 2`` on every side with the lowest
    representable value, so for odd ``kernel_size`` the result has the same
    shape as the input (an even ``kernel_size`` yields one extra row and
    column, as before).

    Args:
        input_matrix: 2-D array (float or integer dtype).
        kernel_size: side length of the square pooling window.

    Returns:
        Array in which each element is the maximum of the window centred on
        the corresponding input element.
    """
    input_matrix = np.asarray(input_matrix)

    # -inf cannot be stored in integer arrays; use the dtype minimum there.
    if np.issubdtype(input_matrix.dtype, np.integer):
        fill_value = np.iinfo(input_matrix.dtype).min
    else:
        fill_value = -np.inf

    pad_size = kernel_size // 2
    padded = np.pad(input_matrix,
                    [(pad_size, pad_size)] * input_matrix.ndim,
                    constant_values=fill_value)

    output_shape = (padded.shape[0] - kernel_size + 1,
                    padded.shape[1] - kernel_size + 1)

    # Read-only 4-D view (out_y, out_x, win_y, win_x) onto the padded array.
    windows = as_strided(padded,
                         shape=output_shape + (kernel_size, kernel_size),
                         strides=padded.strides + padded.strides,
                         writeable=False)

    # Reduce over the two window axes directly; reshaping the view first
    # would materialise a copy kernel_size**2 times larger than the input.
    return windows.max(axis=(2, 3))
scale]) + code_all = [] + for _ in range(4): + code_all.append(np.zeros([org_img.shape[0] // scale, org_img.shape[1] // scale])) + + for n, inputs in enumerate(ds): + print(n) + x_i = inputs['offsetx'] + y_i = inputs['offsety'] + x_is = x_i // scale + y_is = y_i // scale + x_s = width // scale + y_s = height // scale + + images = inputs['input'] + maps, feature = onnx_detector.run(['maps','feature'], {'image_input': images}) + + mask = np.zeros([y_s, x_s], dtype=bool) + x_min = int(x_s * 1 / 6) if x_i > 0 else 0 + x_max = int(x_s * 5 / 6) if x_i + width < org_img.shape[1] else x_s + y_min = int(y_s * 1 / 6) if y_i > 0 else 0 + y_max = int(y_s * 5 / 6) if y_i + height < org_img.shape[0] else y_s + mask[y_min:y_max, x_min:x_max] = True + + keymap_p = 1/(1 + np.exp(-maps[0,:,:,0])) + line_p = 1/(1 + np.exp(-maps[0,:,:,5])) + seps_p = 1/(1 + np.exp(-maps[0,:,:,6])) + code_p = [] + for k in range(4): + code_p.append(1/(1 + np.exp(-maps[0,:,:,7+k]))) + + keymap_all[y_is:y_is+y_s,x_is:x_is+x_s] = np.maximum(keymap_p * mask, keymap_all[y_is:y_is+y_s,x_is:x_is+x_s]) + lines_all[y_is:y_is+y_s,x_is:x_is+x_s] = np.maximum(line_p * mask, lines_all[y_is:y_is+y_s,x_is:x_is+x_s]) + seps_all[y_is:y_is+y_s,x_is:x_is+x_s] = np.maximum(seps_p * mask, seps_all[y_is:y_is+y_s,x_is:x_is+x_s]) + for k in range(4): + code_all[k][y_is:y_is+y_s,x_is:x_is+x_s] = np.maximum(code_p[k] * mask, code_all[k][y_is:y_is+y_s,x_is:x_is+x_s]) + + keypeak = maps[0,:,:,0] + if quantized_filter: + keypeak = gaussian_filter(keypeak, sigma=1) + peak = np.where(maxpool2d(keypeak, 5) == keypeak, keymap_p * mask, 0.) 
+ idxy, idxx = np.unravel_index(np.argsort(-peak.ravel()), peak.shape) + + for y, x in zip(idxy, idxx): + if peak[y,x] < cut_off: + break + w = np.exp(maps[0,y,x,1] - 3) * 1024 + h = np.exp(maps[0,y,x,2] - 3) * 1024 + dx = maps[0,y,x,3] * scale + dy = maps[0,y,x,4] * scale + if w * h <= 0: + continue + ix = x * scale + dx + x_i + iy = y * scale + dy + y_i + + codes = [] + for k in range(4): + codes.append(code_p[k][y,x]) + + locations.append(np.array([peak[y,x], ix, iy, w, h, *codes])) + glyphfeatures.append(feature[0, y, x, :]) + + locations = np.array(locations) + glyphfeatures = np.array(glyphfeatures) + + idx = np.argsort(-locations[:,0]) + done_area = np.zeros([0,4]) + selected_idx = [] + for i in idx: + p = locations[i,0] + if p < cut_off: + break + cx = locations[i,1] + cy = locations[i,2] + w = locations[i,3] + h = locations[i,4] + area0_vol = w * h + if done_area.size > 0: + area1_vol = done_area[:,2] * done_area[:,3] + inter_xmin = np.maximum(cx - w / 2, done_area[:,0] - done_area[:,2] / 2) + inter_ymin = np.maximum(cy - h / 2, done_area[:,1] - done_area[:,3] / 2) + inter_xmax = np.minimum(cx + w / 2, done_area[:,0] + done_area[:,2] / 2) + inter_ymax = np.minimum(cy + h / 2, done_area[:,1] + done_area[:,3] / 2) + inter_w = np.maximum(inter_xmax - inter_xmin, 0.) + inter_h = np.maximum(inter_ymax - inter_ymin, 0.) + inter_vol = inter_w * inter_h + union_vol = area0_vol + area1_vol - inter_vol + iou = np.where(union_vol > 0., inter_vol / union_vol, 0.) 
+ if iou.max() > 0.75: + continue + if inter_vol.max() > area0_vol * 0.75: + continue + done_area = np.vstack([done_area, np.array([cx, cy, w, h])]) + selected_idx.append(i) + + if len(selected_idx) > 0: + selected_idx = np.array(selected_idx) + + locations = locations[selected_idx,:] + glyphfeatures = glyphfeatures[selected_idx,:] + else: + locations = np.zeros([0,5+4]) + glyphfeatures = np.zeros([0,feature_dim], dtype=np.float32) + + return locations, glyphfeatures + +def load_background_images(im_width, im_height): + import random + ind = random.choice(range(len(random_background))) + img0 = Image.open(random_background[ind]).convert('RGB') + scale_min = max(float(im_width) / float(img0.width), float(im_height) / float(img0.height)) + scale_max = max(scale_min + 0.5, 1.5) + s = np.random.uniform(scale_min, scale_max) + img = img0.resize((int(float(img0.width) * s)+1, int(float(img0.height) * s)+1),Image.BILINEAR) + x1 = max(0, int(np.random.uniform(0, img.width - im_width))) + y1 = max(0, int(np.random.uniform(0, img.height - im_height))) + img = np.asarray(img)[y1:y1+im_height, x1:x1+im_width,:] + + if np.random.uniform() < 0.5: + img = img[::-1,:,:] + if np.random.uniform() < 0.5: + img = img[:,::-1,:] + enhancer = ImageEnhance.Brightness(Image.fromarray(img)) + img = enhancer.enhance(np.random.uniform()) + enhancer = ImageEnhance.Contrast(img) + img = enhancer.enhance(np.random.uniform(0.2,1.8)) + + img = np.asarray(img).astype(np.float32) / 255. + img = np.clip(img, 0., 1.) 
+ return img + +def background_image(im_width, im_height): + bkimg = load_background_images(im_width, im_height) + bk_c = np.min(bkimg, axis=(0,1)) + bk_std = np.std(bkimg, axis=(0,1)) + fg_c = np.where( + bk_c > 0.5, + np.random.uniform(np.clip(bk_c - bk_std * 2 - min_delta, None, -1), bk_c - bk_std * 2 - min_delta,[3]), + np.random.uniform(bk_c + bk_std * 2 + min_delta, np.clip(bk_c + bk_std * 2 + min_delta, 1, None), [3])) + bk_alpha = np.maximum(np.max(np.abs(fg_c)), 1) + bkimg /= bk_alpha + fg_c /= bk_alpha + fg_c = np.clip(fg_c, 0., 1.) + fgimg = fg_c[None,None,:] + return fgimg, bkimg +def preprocess_image(image, pos): + aspect = rng.uniform(0.75,1.3) + w = int(image.shape[1]*aspect) + h = int(image.shape[0]/aspect) + im = Image.fromarray(image).resize((w,h), Image.Resampling.BILINEAR) + image = np.asarray(im) + pos *= np.array([aspect,1/aspect,aspect,1/aspect]) + + angle = rng.normal() * 2.0 + py1 = max(0,int(image.shape[1]*np.sin(angle/180*np.pi))) + py2 = max(0,int(image.shape[1]*np.sin(-angle/180*np.pi))) + px1 = max(0,int(image.shape[0]*np.sin(-angle/180*np.pi))) + px2 = max(0,int(image.shape[0]*np.sin(angle/180*np.pi))) + image = np.pad(image, ((py1,py2),(px1,px2))) + im = Image.fromarray(image).rotate(angle, Image.Resampling.BILINEAR, center=(px1,py1)) + + M = np.array([[np.cos(angle/180*np.pi),-np.sin(angle/180*np.pi)], + [np.sin(angle/180*np.pi), np.cos(angle/180*np.pi)],]) + pos[:,:2] = (pos[:,:2] @ M) + pos[:,2:4] += np.array([pos[:,3] * np.abs(np.sin(angle/180*np.pi)), pos[:,2] * np.abs(np.sin(angle/180*np.pi))]).T + pos += np.array([px1 - 1,py1 - 1,0,0]) + return np.asarray(im), pos + +def random_filter(image): + img = Image.fromarray(image) + r = rng.uniform() + if r > 0: + img = img.filter(ImageFilter.GaussianBlur(radius=r)) + + r = rng.uniform() + if r > 0: + img = img.filter(ImageFilter.UnsharpMask(radius=r, percent=150, threshold=3)) + + return np.array(img) + +def process(rng): + turn = rng.uniform() < 0.01 + d = get_random_char(rng, 
turn=turn) + pos = d['position'] + if pos.size == 0: + return + codes = d['code_list'] + image = d['image'] + image, pos = preprocess_image(image, pos) + image = random_filter(image) + fgimg, bkimg = background_image(image.shape[1], image.shape[0]) + + img = image[...,None] + img = img / 255. + image = fgimg * img + bkimg * (1 - img) + image = np.clip(image, 0., 1.) + image = image * 255 + + stepx = width * 1 // 2 + stepy = height * 1 // 2 + + im0 = np.asarray(image).astype(np.float32) + + padx = max(0, stepx - (im0.shape[1] - width) % stepx, width - im0.shape[1]) + pady = max(0, stepy - (im0.shape[0] - height) % stepy, height - im0.shape[0]) + im0 = np.pad(im0, [[0,pady],[0,padx],[0,0]], 'constant', constant_values=((255,255),(255,255),(255,255))) + + ds0 = [] + for y in range(0, im0.shape[0] - height + 1, stepy): + for x in range(0, im0.shape[1] - width + 1, stepx): + ds0.append({ + 'input': np.expand_dims(im0[y:y+height,x:x+width,:], 0), + 'offsetx': x, + 'offsety': y, + }) + locations, glyphfeatures = eval(ds0, im0) + + for i in range(locations.shape[0]): + cx = locations[i,1] + cy = locations[i,2] + w = locations[i,3] + h = locations[i,4] + area0_vol = w * h + + area1_vol = pos[:,2] * pos[:,3] + inter_xmin = np.maximum(cx - w / 2, pos[:,0] - pos[:,2] / 2) + inter_ymin = np.maximum(cy - h / 2, pos[:,1] - pos[:,3] / 2) + inter_xmax = np.minimum(cx + w / 2, pos[:,0] + pos[:,2] / 2) + inter_ymax = np.minimum(cy + h / 2, pos[:,1] + pos[:,3] / 2) + inter_w = np.maximum(inter_xmax - inter_xmin, 0.) + inter_h = np.maximum(inter_ymax - inter_ymin, 0.) + inter_vol = inter_w * inter_h + union_vol = area0_vol + area1_vol - inter_vol + iou = np.where(union_vol > 0., inter_vol / union_vol, 0.) 
def save_codefeature(code, feature, turn=False):
    """Append one glyph feature vector to the per-code ``.npy`` file.

    The file lives in the module-level ``output_dir`` and is named
    ``<code>t.npy`` when ``turn`` is true, ``<code>n.npy`` otherwise.

    The stored array is always 2-D with one row per saved sample.
    Previously the first sample was written as a 1-D vector and reported as
    ``count = 0``, so single-sample files had a different rank from all
    others. Files written in the old 1-D form are still read correctly,
    because ``np.vstack`` promotes them to one row.

    Args:
        code: integer code point used in the file name.
        feature: 1-D feature vector (or an ``(n, dim)`` batch) to append.
        turn: whether the sample was rendered turned.
    """
    os.makedirs(output_dir, exist_ok=True)
    suffix = 't' if turn else 'n'
    filename = os.path.join(output_dir, '%d%s.npy' % (code, suffix))

    feature = np.atleast_2d(feature)
    if os.path.exists(filename):
        feature = np.vstack([np.load(filename), feature])

    # Number of samples the file holds after this call.
    count = feature.shape[0]
    print(code, turn, count)
    np.save(filename, feature)
# Ordered (upper bound, mark) pairs for the sentence-ending draw: the first
# bound that exceeds the draw selects the mark; draws >= 0.7 add nothing.
_SENTENCE_ENDINGS = (
    (0.1, '、'),
    (0.2, '。'),
    (0.3, '?'),
    (0.4, '!'),
    (0.5, '‼'),
    (0.6, '⁉'),
    (0.7, '⁈'),
)


def _record_prefix(train):
    """Return the TFRecord file-name prefix for the train or test split."""
    return 'train' if train else 'test'


def get_random_special():
    """Build a long random string rich in prolonged-sound and emphasis marks.

    Runs 1000 rounds. Each round appends 0-29 random entries of the
    module-level ``wordlist``, optionally a run of ``ー`` / ``〰`` or a single
    ``〜``, optionally one sentence-ending mark, and then either a newline or
    a space. The space is omitted after ``、`` and ``。``.

    Pieces are collected in a list and joined once, instead of growing a
    string with ``+=``. The sequence of draws from the module-level ``rng``
    is unchanged, so a given generator state yields the same text as before.
    """
    parts = []
    for _ in range(1000):
        p = rng.uniform()
        parts.append(''.join(rng.choice(wordlist, rng.integers(low=0, high=30))))
        if p < 0.2:
            parts.append('ー' * int(rng.integers(1, 10)))
        elif p < 0.4:
            parts.append('〰' * int(rng.integers(1, 10)))
        elif p < 0.6:
            parts.append('〜')

        p = rng.uniform()
        for upper, mark in _SENTENCE_ENDINGS:
            if p < upper:
                parts.append(mark)
                break

        if rng.uniform() < 0.25:
            parts.append('\n')
        elif not p < 0.2:
            # No separating space directly after '、' or '。' (p < 0.2).
            parts.append(' ')
    return ''.join(parts)


def count_prevfile(train=True):
    """Return the index to use for the next TFRecord file of a split.

    Scans the module-level ``tfdata_path`` for ``train*.tfrecords`` or
    ``test*.tfrecords`` and returns the highest numeric index found plus
    one, or 0 when there is none. The index is parsed from everything after
    the prefix rather than from the last eight characters, so indices wider
    than eight digits are handled; files whose suffix is not numeric are
    ignored.
    """
    prefix = _record_prefix(train)
    pattern = os.path.join(tfdata_path, prefix + '*.tfrecords')
    indices = []
    for path in glob.glob(pattern):
        stem = os.path.splitext(os.path.basename(path))[0]
        number = stem[len(prefix):]
        if number.isdigit():
            indices.append(int(number))
    return max(indices) + 1 if indices else 0


def get_filepath(k=0, train=True):
    """Return the path of TFRecord file ``k``, creating ``tfdata_path`` if needed."""
    os.makedirs(tfdata_path, exist_ok=True)
    return os.path.join(tfdata_path, '%s%08d.tfrecords' % (_record_prefix(train), k))


def process_trainfunc(k):
    """Worker entry point: write training file ``k``.

    Kept as a module-level function so a multiprocessing pool can pickle it.
    """
    return process_func(k, train=True)


def process_testfunc(k):
    """Worker entry point: write test file ``k``.

    Kept as a module-level function so a multiprocessing pool can pickle it.
    """
    return process_func(k, train=False)
linecount < lines_per_file: + print(k,linecount) + + p = rng.random() + try: + if p < 0.01: + content = get_random_special() + en = False + elif p < 0.1: + content = get_random_string() + en = True + elif p < 0.4: + # aozora + url = rng.choice(aozora_urls) + content = get_contents(url) + en = False + elif p < 0.7: + pageid = get_random_wordid(en=False) + content = get_word_content(pageid, en=False) + en = False + elif p < 0.9: + pageid = get_random_wordid(en=True) + content = get_word_content(pageid, en=True) + en = True + else: + max_text = 64*1024 + content = ''.join(rng.choice(glyphs_list, size=max_text)) + en = False + except OSError: + time.sleep(1) + continue + + str_lines = content.splitlines() + str_lines = [s for s in str_lines if s.strip()] + str_lines = [s.rstrip() for s in str_lines] + str_lines = [re.sub(' +',' ',s) for s in str_lines] + str_lines = [re.sub('\u3000+','\u3000',s) for s in str_lines] + + lines = [] + for content in str_lines: + lines.append(''.join([c for c in content if ord(c) in all_codes or c in UNICODE_WHITESPACE_CHARACTERS or c in ['\uFFF9','\uFFFA','\uFFFB']])) + str_lines = lines + + if len(str_lines) == 0: + continue + + lines = [] + for content in str_lines: + if en: + while len(content) > 0: + max_count = rng.integers(2,80) + if len(content) < max_count: + lines.append(content) + content = [] + else: + i = content.find(' ', max_count) + if i < 0: + lines.append(content) + content = [] + else: + lines.append(content[:i]) + content = content[i+1:] + else: + while len(content) > 0: + max_count = rng.integers(2,80) + if len(content) < max_count: + lines.append(content) + content = [] + else: + i = max_count + st = [i for i, c in enumerate(content) if c == '\uFFF9'] + ed = [i for i, c in enumerate(content) if c == '\uFFFB'] + for s,e in zip(st,ed): + if i < s: + break + if s <= i <= e: + i = e+1 + break + lines.append(content[:i]) + content = content[i:] + + for content in lines: + codes = [] + sp = 0 + ruby = 0 + rubybase = 0 + for 
def create_data(train=True, count=1):
    """Generate the missing TFRecord files of one split in parallel.

    Files are numbered from the value returned by ``count_prevfile`` up to
    ``count - 1``. Each index is handed to a worker process, and finished
    indices are printed as they complete, in arbitrary order.

    The check for remaining work now happens before the pool is created, so
    a call that has nothing to do no longer spawns and tears down a full set
    of worker processes.

    Args:
        train: build the training split when true, the test split otherwise.
        count: total number of files the split should contain.
    """
    k = count_prevfile(train=train)
    if k >= count:
        return

    worker = process_trainfunc if train else process_testfunc
    with Pool() as p:
        for i in p.imap_unordered(worker, range(k, count)):
            print(i,'done')
QuantType, QuantFormat, DynamicQuantConfig from PIL import Image import numpy as np @@ -9,7 +9,7 @@ class ImageDataReader(CalibrationDataReader): def __init__(self): - self.imfile = sorted(glob.glob(os.path.join('img','img*.png'))) + self.imfile = sorted(glob.glob(os.path.join('images','img*.png'))) self.datasize = len(self.imfile) self.enum_imfile = iter(self.imfile) @@ -32,14 +32,32 @@ def optimize1(): quant_format=QuantFormat.QOperator, activation_type=QuantType.QUInt8, weight_type=QuantType.QUInt8, + nodes_to_exclude=[ + 'TextDetector/CenterNetBlock/keyheatmap/keyheatmap_out_conv/BiasAdd', + 'TextDetector/CenterNetBlock/sizes/sizes_out_conv/BiasAdd', + 'TextDetector/CenterNetBlock/offsets/offsets_out_conv/BiasAdd', + 'TextDetector/CenterNetBlock/textline/textline_out_conv/BiasAdd', + 'TextDetector/CenterNetBlock/sepatator/sepatator_out_conv/BiasAdd', + 'TextDetector/CenterNetBlock/codes/codes_out_conv/BiasAdd', + 'TextDetector/CenterNetBlock/concatenate/concat', + 'TextDetector/CenterNetBlock/feature/feature_out_conv/BiasAdd', + ], extra_options={ 'CalibMovingAverage': True, - 'CalibMovingAverageConstant': 0.1, }) quantize('TextDetector.infer.onnx', 'TextDetector.quant.onnx', config) +def optimize2(): + config = DynamicQuantConfig( + weight_type=QuantType.QUInt8, + ) + + quantize('TextDetector.infer.onnx', + 'TextDetector.quant.onnx', + config) + if __name__ == "__main__": from onnxruntime.quantization.shape_inference import quant_pre_process @@ -47,6 +65,9 @@ def optimize1(): 'TextDetector.onnx', 'TextDetector.infer.onnx' ) - optimize1() + optimize1() # >30GB memory needed + + #optimize2() + diff --git a/test_image2.py b/test_image2.py new file mode 100755 index 0000000..4a224b5 --- /dev/null +++ b/test_image2.py @@ -0,0 +1,394 @@ +#!/usr/bin/env python3 + +import tensorflow as tf +physical_devices = tf.config.list_physical_devices('GPU') +if len(physical_devices) > 0 and tf.config.experimental.get_device_details(physical_devices[0]).get('device_name') != 
'METAL': + tf.keras.mixed_precision.set_global_policy('mixed_float16') + + physical_devices = tf.config.list_physical_devices('GPU') + try: + for gpu in physical_devices: + tf.config.experimental.set_memory_growth(gpu, True) + except: + # Invalid device or cannot modify virtual devices once initialized. + pass + +import numpy as np +from PIL import Image +from PIL.Image import Resampling +import sys +import os +import subprocess + +import net + +from dataset.data_transformer import max_encoderlen, max_decoderlen, decoder_SOT, decoder_EOT +from util_funcs import calcHist, calc_predid, decode_ruby + +if len(sys.argv) < 2: + print(sys.argv[0],'target.png','(twopass)') + exit(1) + +target_file = sys.argv[1] +twopass = False +if len(sys.argv) > 2: + if 'twopass' in sys.argv[2:]: + twopass = True + +im0 = Image.open(target_file).convert('RGB') +#im0 = im0.filter(ImageFilter.SHARPEN) +im0 = np.asarray(im0) + +class TextDetectorModel(tf.keras.models.Model): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + self.detector = net.CenterNetDetectionBlock(pre_weight=False) + self.decoder = net.SimpleDecoderBlock() + + def eval(self, ds, org_img, cut_off = 0.5, locations0 = None, glyphfeatures0 = None): + org_img = org_img.numpy() + print(org_img.shape) + print("test") + + locations = [np.zeros(5+4, np.float32)] + glyphfeatures = [np.zeros(net.feature_dim, np.float32)] + #allfeatures = np.zeros([0,net.feature_dim]) + keymap_all = np.zeros([org_img.shape[0] // net.scale, org_img.shape[1] // net.scale], np.float32) + lines_all = np.zeros([org_img.shape[0] // net.scale, org_img.shape[1] // net.scale], np.float32) + seps_all = np.zeros([org_img.shape[0] // net.scale, org_img.shape[1] // net.scale], np.float32) + code_all = [] + for _ in range(4): + code_all.append(np.zeros([org_img.shape[0] // net.scale, org_img.shape[1] // net.scale], np.float32)) + + for n, inputs in ds.enumerate(): + print(n.numpy()) + offsetx = inputs['offsetx'].numpy() + offsety = 
inputs['offsety'].numpy() + + images = inputs['input'].numpy() + maps, feature = self.detector(inputs['input']) + + keymap = maps[...,0] + local_peak = tf.nn.max_pool2d(keymap[...,tf.newaxis],5,1,'SAME') + keep = local_peak[...,0] == keymap + keymap = tf.math.sigmoid(keymap) + detectedkey = keymap * tf.cast(keep, tf.float32) + + textlines = tf.math.sigmoid(maps[...,5]) + separator = tf.math.sigmoid(maps[...,6]) + xsize = maps[...,1] + ysize = maps[...,2] + xoffset = maps[...,3] * net.scale + yoffset = maps[...,4] * net.scale + code_map = [] + for k in range(4): + code_map.append(tf.math.sigmoid(maps[...,7+k])) + + #allfeatures = np.concatenate([allfeatures, np.reshape(feature, [-1, net.feature_dim])]) + + for img_idx in range(images.shape[0]): + x_i = offsetx[img_idx] + y_i = offsety[img_idx] + x_is = x_i // net.scale + y_is = y_i // net.scale + x_s = net.width // net.scale + y_s = net.height // net.scale + + mask = np.zeros([y_s, x_s], dtype=bool) + x_min = int(x_s * 1 / 6) if x_i > 0 else 0 + x_max = int(x_s * 5 / 6) if x_i + net.width < org_img.shape[1] else x_s + y_min = int(y_s * 1 / 6) if y_i > 0 else 0 + y_max = int(y_s * 5 / 6) if y_i + net.height < org_img.shape[0] else y_s + mask[y_min:y_max, x_min:x_max] = True + + keymap_p = keymap[img_idx,...] + line_p = textlines[img_idx,...] + seps_p = separator[img_idx,...] + code_p = [m[img_idx,...] for m in code_map] + + keymap_all[y_is:y_is+y_s,x_is:x_is+x_s] = np.maximum(keymap_p * mask, keymap_all[y_is:y_is+y_s,x_is:x_is+x_s]) + lines_all[y_is:y_is+y_s,x_is:x_is+x_s] = np.maximum(line_p * mask, lines_all[y_is:y_is+y_s,x_is:x_is+x_s]) + seps_all[y_is:y_is+y_s,x_is:x_is+x_s] = np.maximum(seps_p * mask, seps_all[y_is:y_is+y_s,x_is:x_is+x_s]) + for k in range(4): + code_all[k][y_is:y_is+y_s,x_is:x_is+x_s] = np.maximum(code_p[k] * mask, code_all[k][y_is:y_is+y_s,x_is:x_is+x_s]) + + peak = (detectedkey[img_idx, ...] 
* mask).numpy() + idxy, idxx = np.unravel_index(np.argsort(-peak.ravel()), peak.shape) + + for y, x in zip(idxy, idxx): + if peak[y,x] < cut_off: + break + w = tf.math.exp(xsize[img_idx,y,x] - 3) * 1024 + h = tf.math.exp(ysize[img_idx,y,x] - 3) * 1024 + if w * h <= 0: + continue + + dx = xoffset[img_idx,y,x] + dy = yoffset[img_idx,y,x] + + ix = x * net.scale + dx + x_i + iy = y * net.scale + dy + y_i + + codes = [] + for k in range(4): + codes.append(code_p[k][y,x]) + + locations.append(np.array([peak[y,x], ix, iy, w, h, *codes])) + glyphfeatures.append(feature[img_idx, y, x, :].numpy()) + + locations = np.array(locations) + if locations0 is not None: + locations = np.concatenate([locations, locations0]) + glyphfeatures = np.array(glyphfeatures) + if glyphfeatures0 is not None: + glyphfeatures = np.concatenate([glyphfeatures, glyphfeatures0]) + + idx = np.argsort(-locations[:,0]) + done_area = np.zeros([0,4], np.float32) + selected_idx = [] + for i in idx: + p = locations[i,0] + if p < cut_off: + break + cx = locations[i,1] + cy = locations[i,2] + w = locations[i,3] + h = locations[i,4] + area0_vol = w * h + if done_area.size > 0: + area1_vol = done_area[:,2] * done_area[:,3] + inter_xmin = np.maximum(cx - w / 2, done_area[:,0] - done_area[:,2] / 2) + inter_ymin = np.maximum(cy - h / 2, done_area[:,1] - done_area[:,3] / 2) + inter_xmax = np.minimum(cx + w / 2, done_area[:,0] + done_area[:,2] / 2) + inter_ymax = np.minimum(cy + h / 2, done_area[:,1] + done_area[:,3] / 2) + inter_w = np.maximum(inter_xmax - inter_xmin, 0.) + inter_h = np.maximum(inter_ymax - inter_ymin, 0.) + inter_vol = inter_w * inter_h + union_vol = area0_vol + area1_vol - inter_vol + iou = np.where(union_vol > 0., inter_vol / union_vol, 0.) 
class TransformerDecoderModel(tf.keras.models.Model):
    """Keras model wrapping a ``net.TextTransformer`` under ``self.transformer``."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        self.transformer = net.TextTransformer()

        # Call the transformer once on symbolic inputs of the fixed encoder
        # and decoder lengths, then print its layer summary. The call
        # presumably exists so the variables are created before summary().
        encoder_features = tf.keras.Input(shape=(max_encoderlen, net.encoder_dim))
        decoder_tokens = tf.keras.Input(shape=(max_decoderlen,))
        self.transformer((encoder_features, decoder_tokens))

        self.transformer.summary()
Image.fromarray(im0).resize((int(im0.shape[1] / s), int(im0.shape[0] / s)), resample=Resampling.BILINEAR) + im1 = np.asarray(im1) + padx = max(0, net.width - im1.shape[1]) + pady = max(0, net.height - im1.shape[0]) + im1 = np.pad(im1, [[0,pady],[0,padx],[0,0]], 'constant', constant_values=((255,255),(255,255),(255,255))) + + im = tf.image.convert_image_dtype(im1, dtype=tf.float32) + im = im * 255. + + ds1 = tf.data.Dataset.range(1) + ds1 = ds1.map(lambda x: { + 'input': im, + 'offsetx': 0, + 'offsety': 0, + }) + ds1 = ds1.batch(1) + ds1 = ds1.prefetch(tf.data.AUTOTUNE) + + locations0, glyphfeatures0, lines0, seps0 = model1.eval(ds1, im, cut_off=0.5) + locations0[:,1:] = locations0[:,1:] * s +else: + locations0, glyphfeatures0 = None, None + +im = tf.image.convert_image_dtype(im0, dtype=tf.float32) +im = im * 255. + +yi = tf.data.Dataset.range(0, im0.shape[0] - net.height + 1, stepy) +xi = tf.data.Dataset.range(0, im0.shape[1] - net.width + 1, stepx) +ds0 = yi.flat_map(lambda y: xi.map(lambda x : (x, y))) +ds0 = ds0.map(lambda x,y: { + 'input': im[y:y+net.height,x:x+net.width,:], + 'offsetx': x, + 'offsety': y, + }) +ds0 = ds0.batch(8) +ds0 = ds0.prefetch(tf.data.AUTOTUNE) + +locations, glyphfeatures, lines, seps = model1.eval(ds0, im, cut_off=0.5, + locations0=locations0, glyphfeatures0=glyphfeatures0) + +valid_locations = [] +for i, (p, x, y, w, h, c1, c2, c4, c8) in enumerate(locations): + x1 = np.clip(int(x - w/2), 0, im0.shape[1]) + y1 = np.clip(int(y - h/2), 0, im0.shape[0]) + x2 = np.clip(int(x + w/2) + 1, 0, im0.shape[1]) + y2 = np.clip(int(y + h/2) + 1, 0, im0.shape[0]) + if calcHist(im0[y1:y2,x1:x2,:]) < 50: + continue + valid_locations.append(i) +locations = locations[valid_locations,:] +glyphfeatures = glyphfeatures[valid_locations,:] +print(locations.shape[0],'boxes') + +print('construct data') +h, w = lines.shape +input_binary = int(0).to_bytes(4, 'little') +input_binary += int(w).to_bytes(4, 'little') +input_binary += int(h).to_bytes(4, 'little') 
@tf.function
def call_loop(decoder_input, i, encoder_output, encoder_input):
    """Run one greedy decoding step; usable as the body of ``tf.while_loop``.

    Feeds the current token buffer through ``model2.transformer.decoder``,
    takes the arg-max of each of the three output heads at position ``i``,
    combines the three indices with ``calc_predid``, and writes the
    resulting code at position ``i + 1`` of the buffer.

    Returns the updated buffer, ``i + 1``, and the two encoder tensors
    unchanged, matching the loop-variable layout.
    """
    logits1091, logits1093, logits1097 = model2.transformer.decoder(
        [decoder_input, encoder_output, encoder_input])

    def best_index(logits):
        # Most probable class of the first batch element at step i.
        return tf.argmax(tf.math.softmax(logits[0, i]), axis=-1)

    code = calc_predid(
        best_index(logits1091),
        best_index(logits1093),
        best_index(logits1097))

    next_pos = i + 1
    updated = tf.where(tf.range(max_decoderlen) == next_pos, code, decoder_input)
    return updated, next_pos, encoder_output, encoder_input
def _suppress_overlaps(locations, cut_off):
    """Return indices of the boxes kept by greedy overlap suppression.

    Rows of ``locations`` are visited in descending score order (column 0)
    until the score drops below ``cut_off``. A box (columns 1-4: centre x,
    centre y, width, height) is dropped when its IoU with an already kept
    box exceeds 0.75, or when more than 75 % of its own area is covered by
    one kept box.
    """
    order = np.argsort(-locations[:,0])
    # Preallocated float64 store for kept boxes; replaces growing an array
    # with np.vstack on every accepted box.
    kept_boxes = np.empty([len(order), 4])
    kept = 0
    selected = []
    for i in order:
        if locations[i,0] < cut_off:
            break
        cx, cy, w, h = locations[i,1:5]
        area0_vol = w * h
        if kept > 0:
            done = kept_boxes[:kept]
            area1_vol = done[:,2] * done[:,3]
            inter_xmin = np.maximum(cx - w / 2, done[:,0] - done[:,2] / 2)
            inter_ymin = np.maximum(cy - h / 2, done[:,1] - done[:,3] / 2)
            inter_xmax = np.minimum(cx + w / 2, done[:,0] + done[:,2] / 2)
            inter_ymax = np.minimum(cy + h / 2, done[:,1] + done[:,3] / 2)
            inter_w = np.maximum(inter_xmax - inter_xmin, 0.)
            inter_h = np.maximum(inter_ymax - inter_ymin, 0.)
            inter_vol = inter_w * inter_h
            union_vol = area0_vol + area1_vol - inter_vol
            # Same values as np.where(union > 0, inter / union, 0) but
            # without dividing by zero for degenerate boxes.
            iou = np.divide(inter_vol, union_vol,
                            out=np.zeros_like(inter_vol), where=union_vol > 0.)
            if iou.max() > 0.75:
                continue
            if inter_vol.max() > area0_vol * 0.75:
                continue
        kept_boxes[kept] = (cx, cy, w, h)
        kept += 1
        selected.append(i)
    return selected


def eval(ds, org_img, cut_off = 0.5, locations0 = None, glyphfeatures0 = None):
    """Run the Core ML detector over image tiles and merge the detections.

    Args:
        ds: iterable of dicts with keys ``'input'`` (one RGB tile),
            ``'offsetx'`` and ``'offsety'`` (tile origin in ``org_img``).
        org_img: the full padded image; only its shape is used.
        cut_off: minimum peak score for a detection to be considered.
        locations0: optional extra candidate rows (same 9-column layout as
            the result) that take part in overlap suppression.
        glyphfeatures0: feature rows belonging to ``locations0``.

    Returns:
        ``(locations, glyphfeatures, lines_all, seps_all)``. ``locations``
        is float32 ``(n, 9)``: score, centre x, centre y, width, height and
        four code-bit scores. ``glyphfeatures`` is ``(n, feature_dim)``.
        ``lines_all`` / ``seps_all`` are the stitched text-line and
        separator maps at ``1/scale`` resolution.

    The empty result is now float32 like the non-empty one (it used to be
    float64), and the never-used stitched key map is no longer built.

    NOTE: the name shadows the built-in ``eval``; it is kept because callers
    use it.
    """
    print(org_img.shape)
    print("test")

    map_shape = [org_img.shape[0] // scale, org_img.shape[1] // scale]

    # A dummy all-zero row keeps np.array() 2-D when nothing is detected;
    # its score of 0 is normally rejected by the cut-off during suppression.
    locations = [np.zeros(5+4, dtype=np.float32)]
    glyphfeatures = [np.zeros(feature_dim, dtype=np.float32)]
    lines_all = np.zeros(map_shape, dtype=np.float32)
    seps_all = np.zeros(map_shape, dtype=np.float32)
    code_all = [np.zeros(map_shape, dtype=np.float32) for _ in range(4)]

    x_s = width // scale
    y_s = height // scale

    for n, inputs in enumerate(ds):
        print(n)
        x_i = inputs['offsetx']
        y_i = inputs['offsety']
        x_is = x_i // scale
        y_is = y_i // scale

        input_image = Image.fromarray(inputs['input'], mode="RGB")
        output = mlmodel_detector.predict({'Image': input_image})
        maps = output['Output_heatmap']
        feature = output['Output_feature']

        # Trust only the central 2/3 of a tile along every edge that has a
        # neighbouring tile; edges on the image border keep their full extent.
        mask = np.zeros([y_s, x_s], dtype=bool)
        x_min = int(x_s * 1 / 6) if x_i > 0 else 0
        x_max = int(x_s * 5 / 6) if x_i + width < org_img.shape[1] else x_s
        y_min = int(y_s * 1 / 6) if y_i > 0 else 0
        y_max = int(y_s * 5 / 6) if y_i + height < org_img.shape[0] else y_s
        mask[y_min:y_max, x_min:x_max] = True

        line_p = maps[0,:,:,6]
        seps_p = maps[0,:,:,7]
        code_p = [maps[0,:,:,8+k] for k in range(4)]

        # Stitch the tile maps into the full-size maps with a pixel-wise max.
        window = (slice(y_is, y_is + y_s), slice(x_is, x_is + x_s))
        stitched = [(lines_all, line_p), (seps_all, seps_p), *zip(code_all, code_p)]
        for canvas, tile in stitched:
            canvas[window] = np.maximum(tile * mask, canvas[window])

        # NOTE(review): the peak channel is not multiplied by `mask`, so a
        # detection in the overlap margin can be reported by two neighbouring
        # tiles; the suppression step below removes such duplicates. Confirm
        # this is intended.
        peak = maps[0,:,:,1]
        idxy, idxx = np.unravel_index(np.argsort(-peak.ravel()), peak.shape)

        for y, x in zip(idxy, idxx):
            if peak[y,x] < cut_off:
                break
            w = maps[0,y,x,2]
            h = maps[0,y,x,3]
            dx = maps[0,y,x,4]
            dy = maps[0,y,x,5]
            if w * h <= 0:
                continue
            ix = x * scale + dx + x_i
            iy = y * scale + dy + y_i

            codes = [code_p[k][y,x] for k in range(4)]

            locations.append(np.array([peak[y,x], ix, iy, w, h, *codes]))
            glyphfeatures.append(feature[0, y, x, :])

    locations = np.array(locations, np.float32)
    if locations0 is not None:
        locations = np.concatenate([locations, locations0])
    glyphfeatures = np.array(glyphfeatures, np.float32)
    if glyphfeatures0 is not None:
        glyphfeatures = np.concatenate([glyphfeatures, glyphfeatures0])

    selected_idx = _suppress_overlaps(locations, cut_off)

    if len(selected_idx) > 0:
        selected_idx = np.array(selected_idx)

        locations = locations[selected_idx,:]
        glyphfeatures = glyphfeatures[selected_idx,:]
    else:
        locations = np.zeros([0,5+4], dtype=np.float32)
        glyphfeatures = np.zeros([0,feature_dim], dtype=np.float32)

    # Raise each box's code scores to the stitched value at its centre.
    for i in range(locations.shape[0]):
        cx = locations[i,1]
        cy = locations[i,2]
        x = int(cx / scale)
        y = int(cy / scale)
        if x >= 0 and x < org_img.shape[1] // scale and y >= 0 and y < org_img.shape[0] // scale:
            for k in range(4):
                locations[i,5+k] = max(code_all[k][y,x], locations[i,5+k])

    return locations, glyphfeatures, lines_all, seps_all
locations0[:,1:] * s +else: + locations0, glyphfeatures0 = None, None + +ds0 = [] +for y in range(0, im0.shape[0] - height + 1, stepy): + for x in range(0, im0.shape[1] - width + 1, stepx): + ds0.append({ + 'input': im0[y:y+height,x:x+width,:], + 'offsetx': x, + 'offsety': y, + }) +locations, glyphfeatures, lines, seps = eval(ds0, im0, cut_off=0.5, + locations0=locations0, glyphfeatures0=glyphfeatures0) + +valid_locations = [] +for i, (p, x, y, w, h, c1, c2, c4, c8) in enumerate(locations): + x1 = np.clip(int(x - w/2), 0, im0.shape[1]) + y1 = np.clip(int(y - h/2), 0, im0.shape[0]) + x2 = np.clip(int(x + w/2) + 1, 0, im0.shape[1]) + y2 = np.clip(int(y + h/2) + 1, 0, im0.shape[0]) + if calcHist(im0[y1:y2,x1:x2,:]) < 50: + continue + valid_locations.append(i) +locations = locations[valid_locations,:] +glyphfeatures = glyphfeatures[valid_locations,:] +print(locations.shape[0],'boxes') + +print('construct data') +h, w = lines.shape +input_binary = int(0).to_bytes(4, 'little') +input_binary += int(w).to_bytes(4, 'little') +input_binary += int(h).to_bytes(4, 'little') +input_binary += lines.tobytes() +input_binary += seps.tobytes() +input_binary += int(locations.shape[0]).to_bytes(4, 'little') +input_binary += locations[:,1:].tobytes() +input_binary += int(im0.shape[1] // 2).to_bytes(4, 'little') +input_binary += int(im0.shape[0] // 2).to_bytes(4, 'little') + +print('run') +result = subprocess.run('./linedetect', input=input_binary, stdout=subprocess.PIPE).stdout +detected_boxes = [] +p = 0 +max_block = 0 +count = int.from_bytes(result[p:p+4], byteorder='little') +print(count) +p += 4 +for i in range(count): + id = int.from_bytes(result[p:p+4], byteorder='little', signed=True) + p += 4 + block = int.from_bytes(result[p:p+4], byteorder='little', signed=True) + max_block = max(max_block, block) + p += 4 + idx = int.from_bytes(result[p:p+4], byteorder='little', signed=True) + p += 4 + subidx = int.from_bytes(result[p:p+4], byteorder='little', signed=True) + p += 4 + subtype 
= int.from_bytes(result[p:p+4], byteorder='little', signed=True) + p += 4 + detected_boxes.append((id,block,idx,subidx,subtype)) + +features = [] +prev_block = 0 +prev_idx = 0 +for id, block, idx, subidx, subtype in detected_boxes: + if id < 0: + continue + + ruby = 0 + rubybase = 0 + space = 0 + + g = np.concatenate([np.zeros([feature_dim], np.float32), np.array([space,ruby,rubybase,1], np.float32)]) + if prev_block != block: + prev_block = block + features.append(g) + features.append(g) + if prev_idx != idx: + prev_idx = idx + features.append(g) + + if subtype & 2+4 == 2+4: + ruby = 1 + elif subtype & 2+4 == 2: + rubybase = 1 + + if subtype & 8 == 8: + space = 1 + + g = np.concatenate([glyphfeatures[id,:], np.array([space,ruby,rubybase,0], np.float32)]) + features.append(g) +features = np.array(features, np.float32) + + +i = 0 +result_txt = '' +while i < features.shape[0]: + j = i + (max_encoderlen - 10) + if j < features.shape[0]-1: + while features[j,-1] == 0: + j -= 1 + if j <= i: + j = min(features.shape[0]-1, i + (max_encoderlen - 10)) + break + else: + j = features.shape[0]-1 + print(i,j) + encoder_input = features[i:j+1,:] + print(list(encoder_input)) + encoder_input = np.pad(encoder_input, [[0, max_encoderlen - encoder_input.shape[0]],[0,0]]) + encoder_input = np.expand_dims(encoder_input, 0) + + print('encoder') + out1 = mlmodel_encoder.predict({ 'encoder_input': encoder_input }) + + print('decoder') + decoder_input = np.zeros([1,max_decoderlen], dtype=np.float32) + decoder_input[0,0] = decoder_SOT + count = 0 + while count < max_decoderlen - 1 and decoder_input[0,count] != decoder_EOT: + out2 = mlmodel_decoder.predict({ 'decoder_input': decoder_input, **out1, 'encoder_input': encoder_input }) + mod1091 = out2['mod1091'] + mod1093 = out2['mod1093'] + mod1097 = out2['mod1097'] + i1091 = np.argmax(mod1091[count,:]) + i1093 = np.argmax(mod1093[count,:]) + i1097 = np.argmax(mod1097[count,:]) + code = calc_predid(i1091,i1093,i1097) + count += 1 + 
decoder_input[0,count] = code + + code = decoder_input[0].astype(np.int32) + print(code) + str_code = code[1:count] + str_text = ''.join([chr(c) if c < 0x110000 else '\uFFFD' for c in str_code]) + result_txt += str_text + i = j+1 + +print(decode_ruby(result_txt)) diff --git a/train2.py b/train2.py new file mode 100755 index 0000000..76cc2eb --- /dev/null +++ b/train2.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 +import os + +import tensorflow as tf +physical_devices = tf.config.list_physical_devices('GPU') +if len(physical_devices) > 0 and tf.config.experimental.get_device_details(physical_devices[0]).get('device_name') != 'METAL': + tf.keras.mixed_precision.set_global_policy('mixed_float16') + + physical_devices = tf.config.list_physical_devices('GPU') + try: + for gpu in physical_devices: + tf.config.experimental.set_memory_growth(gpu, True) + except: + # Invalid device or cannot modify virtual devices once initialized. + pass + +from net.transformer_trainer import TransformerDecoderModel +from dataset.data_transformer import generate_data, train_data, test_data + +save_target = 'result2' +batchsize = 256 + +class GenerateCallback(tf.keras.callbacks.Callback): + def __init__(self, log_dir) -> None: + super().__init__() + + self.summary_writer_test = tf.summary.create_file_writer( + os.path.join(log_dir, "predict")) + self.ds = generate_data() + + def on_epoch_end(self, epoch, logs=None): + result_text = self.model.generate(self.ds) + + with self.summary_writer_test.as_default(): + tf.summary.text("predict", result_text, step=epoch) + +def train2(): + model = TransformerDecoderModel() + #opt1 = tf.keras.optimizers.Adam(learning_rate=1e-4) + boundaries = [100, 1000*50, 1000*200] + values = [1e-5, 2e-4, 1e-4, 1e-5] + lr = tf.keras.optimizers.schedules.PiecewiseConstantDecay(boundaries, values) + opt1 = tf.keras.optimizers.AdamW(learning_rate=lr, weight_decay=1e-2) + opt1.exclude_from_weight_decay(var_names=['layer_normalization','/bias']) + 
model.compile(optimizer=opt1) + + callbacks = [ + tf.keras.callbacks.TerminateOnNaN(), + GenerateCallback(log_dir=os.path.join(save_target,'log')), + tf.keras.callbacks.ModelCheckpoint( + os.path.join(save_target,'ckpt2','ckpt'), + save_weights_only=True), + tf.keras.callbacks.BackupAndRestore(os.path.join(save_target,'backup')), + tf.keras.callbacks.TensorBoard( + log_dir=os.path.join(save_target,'log'), + write_graph=False), + tf.keras.callbacks.CSVLogger(os.path.join(save_target,'training.csv'), append=True), + ] + + model.fit( + train_data(batchsize), + epochs=2000, + steps_per_epoch=1000, + validation_data=test_data(batchsize), + validation_steps=200, + callbacks=callbacks, + ) + + +if __name__ == '__main__': + train2() diff --git a/util_funcs.py b/util_funcs.py new file mode 100644 index 0000000..25fd9f1 --- /dev/null +++ b/util_funcs.py @@ -0,0 +1,105 @@ +import numpy as np +import re + +modulo_list = [1091,1093,1097] +width = 512 +height = 512 +scale = 2 +feature_dim = 64 + +def gaussian(x,a,x0,sigma): + return a*np.exp(-(x-x0)**2/(2*sigma**2)) + +def calcHist(im): + agg = 1 + rHist, bins = np.histogram(im[...,0], 256 // agg, (0.,255.)) + gHist, bins = np.histogram(im[...,1], 256 // agg, (0.,255.)) + bHist, bins = np.histogram(im[...,2], 256 // agg, (0.,255.)) + + maxPeakDiff = -1 + for hist in [rHist, gHist, bHist]: + y = np.array(hist) + x = np.linspace(0.,255.,len(y)) + + if np.sum(y) == 0: + continue + + idx = np.argsort(-y) + mu_y = x[idx[0]] + mean_y = np.sum(x * y) / np.sum(y) + + if mu_y > mean_y: + peak1 = y[idx[0]:] + x1 = x[idx[0]:] + peak1 = np.concatenate([peak1[::-1],peak1[1:]], axis=0) + x1 = np.concatenate([(2 * x1[0] - x1[::-1]),x1[1:]], axis=0) + else: + peak1 = y[:idx[0]+1] + x1 = x[:idx[0]+1] + peak1 = np.concatenate([peak1[:-1],peak1[::-1]], axis=0) + x1 = np.concatenate([x1[:-1],(x1 + x1[-1])], axis=0) + + mu = np.sum(x1 * peak1) / np.sum(peak1) + sigma = np.sqrt(np.sum((x1 - mu)**2 * peak1) / np.sum(peak1)) + fixmax = 
np.max(y[np.bitwise_and(mu + 10 > x, x > mu - 10)]) + + neg_peak = gaussian(x, fixmax, mu, sigma + 10) + fixy = y - neg_peak + fixy[fixy < 0] = 0 + + if np.sum(fixy) == 0: + continue + + fix_diff = np.sum(np.abs(x - mu) * fixy) / np.sum(fixy) + idx = np.argsort(-fixy) + fix_maxx = np.abs(x[idx[0]] - mu) + + maxPeakDiff = max(maxPeakDiff, fix_diff, fix_maxx) + + if False: + import matplotlib.pyplot as plt + plt.subplot(2,1,1) + plt.plot(x,y) + plt.plot(x,gaussian(x, fixmax, mu, sigma + 10)) + plt.subplot(2,1,2) + plt.plot(x,fixy) + plt.vlines(mu, *plt.ylim(), 'r') + plt.vlines(np.sum(x * fixy) / np.sum(fixy), *plt.ylim(), 'g') + plt.show() + + return maxPeakDiff + +def calc_predid(*args): + m = modulo_list + b = args + assert(len(m) == len(b)) + t = [] + + for k in range(len(m)): + u = 0 + for j in range(k): + w = t[j] + for i in range(j): + w *= m[i] + u += w + tk = b[k] - u + for j in range(k): + tk *= pow(m[j], m[k]-2, m[k]) + #tk *= pow(m[j], -1, m[k]) + tk = tk % m[k] + t.append(tk) + x = 0 + for k in range(len(t)): + w = t[k] + for i in range(k): + w *= m[i] + x += w + mk = 1 + for k in range(len(m)): + mk *= m[k] + x = x % mk + return x + +def decode_ruby(text): + text = re.sub('\uFFF9(.*?)\uFFFA(.*?)\uFFFB',r'\1(\2)', text) + return text