-
Notifications
You must be signed in to change notification settings - Fork 10
/
furigana.cpp
1682 lines (1387 loc) · 64.4 KB
/
furigana.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
** Copyright 2007-2013, 2017-2018 Sólyom Zoltán
** This file is part of zkanji, a free software released under the terms of the
** GNU General Public License version 3. See the file LICENSE for details.
**/
#include <QFile>
#include <QTextStream>
#include <set>
#include "furigana.h"
#include "zkanjimain.h"
#include "romajizer.h"
#include "kanji.h"
#include "words.h"
#include "checked_cast.h"
// Lookup table for hiragana, indexed by the character's code point minus 0x3041 (84 entries.)
// fuReading() uses it to decide whether two kana belong together: they do when their values
// are equal and greater than zero.
// NOTE(review): the values look like the number of the consonant row of the kana (k = 1,
// s = 2, t = 3, ...), with 100 added for the voiced and 200 for the p-sound variants, and 0
// for kana without a consonant row (vowels and small kana) - confirm before relying on it.
// The numbers in the trailing comments are the index of the last value on that line.
const uchar consonantcolumn[84] =
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 101, 1, 101, 1, //14
101, 1, 101, 1, 101, 2, 102, 2, 102, 2, 102, 2, 102, 2, 102, 3, //30
103, 3, 103, 0, 3, 103, 3, 103, 3, 103, 4, 4, 4, 4, 4, 5, //46
105, 205, 5, 105, 205, 5, 105, 205, 5, 105, 205, 5, 105, 205, 6, 6, //62
6, 6, 6, 0, 7, 0, 7, 0, 7, 8, 8, 8, 8, 8, 9, 9, //78
0, 0, 9, 4, 0
};
// Furigana search:
// Given a kanji and a kana form (which should match,) a list of FuriganaData items are
// produced. Each item marks a substring in kanji and kana (by their position and length).
// The kana substring can appear above the kanji as furigana. This data is also used for
// finding kanji readings in words (if the kanji substring contains only a single kanji.)
//
// Furigana is found by recursion. There's a current position saved in both kanji and kana.
// First furiganaStep() is called to skip any unwanted characters (kana, middle dot, etc.)
// that appear the same way both in kanji and kana at the current position. It in turn calls
// furiganaStep2() to fill the FuriganaData structures.
// There are several ways to do that. 1) the kanji readings are matched with the kana string
// after the current position. If a possible result is found, it calls furiganaStep() after
// advancing the positions in kanji and kana. If that recursive call returns with no result,
// the next reading is tested.
// 2) If no reading was matched, but the kanji stands alone without other (unprocessed) kanji
// surrounding it, the kana at the current position is assumed to be its furigana. First only
// one kana is taken, by advancing the current position in kana by one. Then furiganaStep() is
// called to find results again. If it doesn't produce a result, we advance in kana again.
// 3) Checking all dictionaries for words matching the current kanji/kana position. Multiple
// substrings are checked like previously, by slowly advancing the current position in both
// kanji and kana. The kanji string must only contain kanji. If a match is found, the whole
// substring in kana is marked as the furigana for the kanji, and the following substring is
// matched with furiganaStep().
// 4) If the current characters in kanji are not kanji nor kana, but valid unicode characters
// that can appear in the dictionary, they might have a reading too. The longest substring in
// kanji that only contains such characters, is counted as one unit. Skipping kana position
// one by one, the rest of the strings are checked with furiganaStep().
// 5) When everything failed and the current character is a kanji with other kanji following,
// it's skipped by advancing the current kanji and kana position, calling findFurigana().
// While no result is found, kana is advanced until it's not possible. At the end, kanji is
// advanced by one again (now two kanji skipped) and the process starts over.
//
// Methods apart from the first one are basically hacks, because they don't match kanji with
// real kanji readings. To avoid finding too many false positives, these steps increase a
// value called 'hacks' every time they are used. When one recursion returns, and the hacks
// value is not zero, the current method goes to its next step, and then the other methods are
// used as well. The current possible results are saved, and only overwritten if the next call
// to findFurigana() returns with less hacks.
// Forward declaration. furiganaStep() calls furiganaStep2() before its definition further
// down in the file.
bool furiganaStep2(const QCharString &kanji, const QCharString &kana, int &kanjistart, int &kanjiend, int &kanastart, int &kanaend, std::vector<FuriganaData> &dat, int &datpos, int &hacks);
// Determines the furigana positions inside a word defined by kanji and kana. Kanji/kana start/end are
// extents in the word searched in this step. Dat is updated with furigana positions, datpos is the
// number of usable values in dat.
bool furiganaStep(const QCharString &kanji, const QCharString &kana, int &kanjistart, int &kanjiend, int &kanastart, int &kanaend, std::vector<FuriganaData> &dat, int &datpos, int &hacks)
{
//std::vector<int> l;
//int ix;
//int iy;
//bool res = true;
//wchar_t och, kch, tmp;
if (kanjistart != 0 && kanjistart <= kanjiend && kanastart <= kanaend && KANJI(kanji[kanjistart].unicode()))
{
if (DASH(kana[kanastart].unicode()))
return false;
QChar ch = hiraganaCh(kana.data(), kanastart);
if (ch == 0x3083 /* small ya */ || ch == 0x3085 /* small yu */ || ch == 0x3087 /* small yo */ ||
ch == 0x3041 /* small a */ || ch == 0x3043 /* small i */ || ch == 0x3045 /* small u */ || ch == 0x3047 /* small e */ || ch == 0x3049 /* small o */)
return false;
}
bool res = true;
int ix = 0;
bool changed = true;
while (changed)
{
changed = false;
// Exclude kana matching at the start of the checked parts.
while (kanjistart <= kanjiend && kanastart <= kanaend && !KANJI(kanji[kanjistart].unicode()) &&
!VALIDCODE(kanji[kanjistart].unicode()) && kanaMatch(kanji.data(), kanjistart, kana.data(), kanastart))
kanjistart++, kanastart++, ix++, changed = true;
// Word has a full width 0 character in front but no zero kana found. Skip the number.
while (kanjistart <= kanjiend && kanastart <= kanaend && kanji[kanjistart] == 0xFF10 /* fullwidth zero */ &&
(kanjistart == 0 || kanji[kanjistart - 1].unicode() < 0xFF10 || kanji[kanjistart - 1].unicode() > 0xFF19) &&
(kanastart == kanaend || hiraganaCh(kana.data(), kanastart) != 0x305c /*hiragana ze*/ ||
(kanastart < kanaend && hiraganaCh(kana.data(), kanastart + 1) != 0x308D /* hiragana ro */)))
kanjistart++, ix++, changed = true;
// Remove any dot punctuation in the middle of the word
while (kanjistart <= kanjiend && kanastart <= kanaend &&
(kanji[kanjistart] == 0x30fb /* kana mid-dot*/ ||
kanji[kanjistart] == 0xff1d /* fullwidth equal sign*/ ||
kanji[kanjistart] == 0xff0e /* fullwidth stop */ ||
kanji[kanjistart] == 0xff0f /* fullwidth / */))
kanjistart++, ix++, changed = true;
}
// The whole checked part is made up of a single kanji. The valid kana
// string part is its furigana.
if (kanjistart == 0 && kanjiend == 0 && kanastart <= kanaend && datpos == 0)
{
dat[datpos].kanji.pos = kanjistart;
dat[datpos].kanji.len = 1;
dat[datpos].kana.pos = kanastart;
dat[datpos].kana.len = kanaend - kanastart + 1;
kanjistart++;
kanastart = kanaend + 1;
datpos++;
return true;
}
if (kanjistart <= kanjiend && kanastart <= kanaend)
{
res = furiganaStep2(kanji, kana, kanjistart, kanjiend, kanastart, kanaend, dat, datpos, hacks);
if (!res && !datpos && ix == kanjistart + (tosigned(kana.size()) - 1 - kanaend) && kanjiend - kanjistart + 1 <= 3)
{
res = true;
for (int i = kanjistart; i <= kanjiend && res; i++)
if (!KANJI(kanji[i].unicode()) && !VALIDCODE(kanji[i].unicode()))
res = false;
if (res)
{
dat[datpos].kanji.pos = kanjistart;
dat[datpos].kanji.len = kanjiend - kanjistart + 1;
dat[datpos].kana.pos = kanastart;
dat[datpos].kana.len = kanaend - kanastart + 1;
hacks += std::max(2, std::abs(dat[datpos].kana.len - dat[datpos].kanji.len)) + 1;
kanastart = kanaend + 1;
kanjistart = kanjiend + 1;
datpos++;
}
}
}
else if (kanjistart <= kanjiend || kanastart <= kanaend)
res = false;
return res;
}
// Returns a list of readings for the passed character, to be used for
// furigana lookup.
//QCharStringList& validReadings(QChar vchar)
//{
// switch (vchar.unicode())
// {
// case 0xFF10:
// return zero0;
// case 0xFF11:
// return ichi1;
// case 0xFF12:
// return ni2;
// case 0xFF13:
// return san3;
// case 0xFF14:
// return yon4;
// case 0xFF15:
// return go5;
// case 0xFF16:
// return roku6;
// case 0xFF17:
// return nana7;
// case 0xFF18:
// return hachi8;
// case 0xFF19:
// return kyuu9;
// case 0x3006:
// return shimekun;
// case 0xFF21:
// case 0xFF41:
// return letterA;
// case 0xFF22:
// case 0xFF42:
// return letterB;
// case 0xFF23:
// case 0xFF43:
// return letterC;
// case 0xFF24:
// case 0xFF44:
// return letterD;
// case 0xFF25:
// case 0xFF45:
// return letterE;
// case 0xFF26:
// case 0xFF46:
// return letterF;
// case 0xFF27:
// case 0xFF47:
// return letterG;
// case 0xFF28:
// case 0xFF48:
// return letterH;
// case 0xFF29:
// case 0xFF49:
// return letterI;
// case 0xFF2A:
// case 0xFF4A:
// return letterJ;
// case 0xFF2B:
// case 0xFF4B:
// return letterK;
// case 0xFF2C:
// case 0xFF4C:
// return letterL;
// case 0xFF2D:
// case 0xFF4D:
// return letterM;
// case 0xFF2E:
// case 0xFF4E:
// return letterN;
// case 0xFF2F:
// case 0xFF4F:
// return letterO;
// case 0xFF30:
// case 0xFF50:
// return letterP;
// case 0xFF31:
// case 0xFF51:
// return letterQ;
// case 0xFF32:
// case 0xFF52:
// return letterR;
// case 0xFF33:
// case 0xFF53:
// return letterS;
// case 0xFF34:
// case 0xFF54:
// return letterT;
// case 0xFF35:
// case 0xFF55:
// return letterU;
// case 0xFF36:
// case 0xFF56:
// return letterV;
// case 0xFF37:
// case 0xFF57:
// return letterW;
// case 0xFF38:
// case 0xFF58:
// return letterX;
// case 0xFF39:
// case 0xFF59:
// return letterY;
// case 0xFF3A:
// case 0xFF5A:
// return letterZ;
// case 0x3003:
// return onajikun;
// case 0x3007:
// return marukun;
// case 0xff06:
// return andokun;
// case 0xff20:
// return attokun;
// case 0xFF0B:
// return purasukun;
// default:
// return noreadings;
// }
//}
// Returns the number of characters in the passed kun reading that come before the first
// '.' character, or the length of the whole null terminated string if it has no dot.
int kunLen(const QChar *kun)
{
    int len = 0;
    while (kun[len] != 0 && !(kun[len] == '.'))
        ++len;
    return len;
}
// Returns the number of characters in the passed on reading, not counting the dash if the
// reading starts with one.
int onLen(const QChar *on)
{
    const auto total = tosigned(qcharlen(on));
    if (on[0] == '-')
        return total - 1;
    return total;
}
// Checks whether the passed reading might be found at kanastart in kana, ending at or before
// kanaend. Only the first readinglen characters of reading are used.
// Returns 0 if the kana does not match the passed reading, and a positive number if the
// reading might match. The larger the returned value, the bigger the difference between the
// reading and the actual kana string. (Differences are possible between them because the
// sounds can change in context.)
// Pass an integer in mismatch to receive a measure of how much the reading differs from the
// kana between kanastart and kanaend. When mismatch is passed, a non-zero value is only
// returned if readinglen is the same as the number of kana between kanastart and kanaend.
// The recurse argument is set to true in the retry calls the function makes to itself. It
// turns off the checks for a changed first sound and prevents further retries.
int fuReading(const QChar *reading, int readinglen, const QChar *kana, int kanastart, int kanaend, int *mismatch = nullptr, bool recurse = false)
{
//int readinglen = kunLen(reading);
// Sum of the penalties given for kana that only match when a sound change is allowed.
int val = 0;
//if (readinglen > kanaend - kanastart + 1)
// return 0;
// Set when the reading is longer than the available kana and only its front is compared.
bool changedlen = false;
// Number of characters to compare.
int siz = readinglen;
if (readinglen > kanaend - kanastart + 1)
{
// The reading can't fit. Comparing is only continued if the caller wants the mismatch.
if (mismatch == nullptr)
return 0;
changedlen = true;
siz = kanaend - kanastart + 1;
}
// After the loop ix is the number of characters accepted at the front of the reading.
int ix;
QChar kch2;
for (ix = 0; ix != siz && kanastart + ix <= kanaend; ++ix)
{
if (kanaMatch(kana, kanastart + ix, reading, ix))
continue;
// The characters differ. Check the sound changes that are accepted, each adding to val.
QChar och = hiraganaCh(reading, ix);
QChar kch = hiraganaCh(kana, kanastart + ix);
// Changes accepted only in the last character of a reading that's longer than 1.
if (!changedlen && ix == readinglen - 1 && readinglen != 1)
{
if (kch == 0x3063 /* minitsu */)
{
if (och == 0x3061 /* chi */)
{
val += 2;
continue;
}
if (och == 0x3064 /* tsu */)
{
val += 1;
continue;
}
}
if (och == 0x3064 /* tsu */ && kch == 0x3061 /* chi */)
{
val += 2;
continue;
}
if (och == 0x3061 /* chi */ && kch == 0x3058 /* ji */)
{
val += 2;
continue;
}
if (och == 0x305F /* ta */ && kch == 0x3060 /* da */)
{
val += 1;
continue;
}
}
// Changes accepted only in the first character, and not in the retry calls.
if (ix == 0 && !recurse)
{
if ((och == 0x3061 /* chi */ && kch == 0x3058 /* ji */) || (och == 0x3064 /* tsu */ && kch == 0x305A /* zu */))
{
val += 2;
continue;
}
// The kana is at the code point directly after the one in the reading, for the given
// code point ranges. NOTE(review): presumably the voiced form of the k, s and t rows.
if (kch.unicode() == och.unicode() + 1 && ((och.unicode() >= 0x304b && och.unicode() < 0x3062 && (och.unicode() % 2)) || (och.unicode() >= 0x3064 && och.unicode() < 0x3069 && !(och.unicode() % 2))))
{
++val;
continue;
}
// The kana is one or two code points after the one in the reading. NOTE(review):
// presumably the b and p sound forms of the h row.
if (och.unicode() >= 0x306f && och.unicode() < 0x307d && (kch.unicode() == och.unicode() + 1 || kch.unicode() == och.unicode() + 2) && !(och.unicode() % 3))
{
++val;
continue;
}
}
// Changes accepted at any position.
if ((och == 0x30F6 /* miniKE */ && kch == 0x304B /* ka */) || (och == 0x30F5 /* miniKA */ && kch == 0x304B /* ka */) ||
(och == 0x3088 /* yo */ && kch == 0x3087 /* miniyo */) || (och == 0x3086 /* yu */ && kch == 0x3085 /* miniyu */) || (och == 0x3084 /* ya */ && kch == 0x3083 /* miniya */) || //h row matches
(och == 0x3042 /* a */ && kch == 0x308F /* wa */))
{
++val;
continue;
}
// The last character of the reading is a hiragana, which is replaced by a small tsu in
// kana, with more kana following.
// NOTE(review): the continue below is executed even if the consonant columns differ and
// val is unchanged, so the small tsu is accepted in place of any final hiragana. Confirm
// this is intended.
if (kanastart + ix + 1 <= kanaend && (!changedlen && ix == readinglen - 1 && readinglen != 1) && kch == 0x3063 /* minitsu */ && och.unicode() - 0x3041 >= 0 && och.unicode() - 0x3041 < 84)
{
kch2 = hiraganaCh(kana, kanastart + ix + 1);
if (kch2.unicode() - 0x3041 >= 0 && kch2.unicode() - 0x3041 < 84 && consonantcolumn[och.unicode() - 0x3041] == consonantcolumn[kch2.unicode() - 0x3041] && consonantcolumn[och.unicode() - 0x3041] > 0)
val += 2;
continue;
}
// No accepted change applies. The comparison stops at the first real difference.
break;
}
if (mismatch != nullptr)
{
if (ix == 0)
{
// Not even the first character was accepted.
*mismatch = 999999;
if (!recurse)
{
// In case there was no match at all, we try again by skipping the first characters in
// both the reading and the current kana position.
int retrymatch;
if (kanastart != kanaend && fuReading(reading, readinglen, kana, kanastart + 1, kanaend, &retrymatch, true))
*mismatch = std::min(*mismatch, 2 + retrymatch + 1);
if (readinglen > 1 && fuReading(reading + 1, readinglen - 1, kana, kanastart, kanaend, &retrymatch, true))
*mismatch = std::min(*mismatch, 2 + retrymatch + 1);
if (readinglen > 1 && kanastart != kanaend && fuReading(reading + 1, readinglen - 1, kana, kanastart + 1, kanaend, &retrymatch, true))
*mismatch = std::min(*mismatch, 4 + retrymatch + 2);
}
// The result is never above the length of the longer string plus one.
*mismatch = std::min(*mismatch, std::max((kanaend - kanastart + 1), readinglen) + 1);
}
else
// Characters not accepted in the reading and in the kana, half the penalties, and the
// number of kana the kana part is longer than the reading.
*mismatch = readinglen - ix + (kanaend - kanastart + 1) - ix + val / 2 + std::max(0, (kanaend - kanastart + 1) - readinglen);
// When the mismatch is requested, only a reading covering the whole kana part can match.
if (readinglen != kanaend - kanastart + 1)
return 0;
}
// A match is returned as at least 1, to differ from the 0 returned for no match.
return (!changedlen && ix == readinglen) ? std::max(val, 1) : 0;
}
// Returns true if the character at pos in the passed string is a kanji, or if it is the
// KURIKAESHI character and the character in front of it is a kanji.
bool kanjiOrKurikaeshi(const QCharString &kanji, int pos)
{
    if (KANJI(kanji[pos].unicode()))
        return true;

    // A repeat mark only counts when there's something in front of it to repeat.
    if (pos == 0 || kanji[pos].unicode() != KURIKAESHI)
        return false;

    return KANJI(kanji[pos - 1].unicode());
}
// Compares the kana between kanastart and kanaend with every on and kun reading of the
// passed kanji. Returns 0 as soon as fuReading() reports a match for one of the readings.
// Otherwise returns the smallest mismatch value fuReading() gave for the readings, or 999999
// if the kanji has no readings at all.
int furiReadingDiff(KanjiEntry *k, const QCharString &kana, int kanastart, int kanaend)
{
    int best = 999999;

    // Returns true if the reading matches the kana. When it doesn't, the mismatch received
    // for it is kept in best if it's smaller than the ones before.
    auto matches = [&](const QChar *reading, int len) -> bool {
        int diff;
        if (fuReading(reading, len, kana.data(), kanastart, kanaend, &diff))
            return true;
        best = std::min(best, diff);
        return false;
    };

    // ON-readings, without the dash they might start with.
    const int oncnt = tosigned(k->on.size());
    for (int pos = 0; pos != oncnt; ++pos)
    {
        const QChar *on = k->on.items(pos).data();
        if (on[0] == '-')
            ++on;
        const int len = tosigned(qcharlen(on));
        if (matches(on, len))
            return 0;
    }

    // KUN-readings, only up to the dot marking the okurigana.
    const int kuncnt = tosigned(k->kun.size());
    for (int pos = 0; pos != kuncnt; ++pos)
    {
        const int len = kunLen(k->kun[pos].data());
        if (matches(k->kun[pos].data(), len))
            return 0;
    }

    return best;
}
// Called during furigana lookup. Fills dat and datpos with furigana data corresponding to
// kanji/kana start/end. Checks kanji readings and tries to match them with the kana. In case
// it doesn't work, try other methods, like looking up shorter words that have the exact same
// form.
bool furiganaStep2(const QCharString &kanji, const QCharString &kana, int &kanjistart, int &kanjiend, int &kanastart, int &kanaend, std::vector<FuriganaData> &dat, int &datpos, int &hacks)
{
// We rely on the fact that set holds its values in increasing order. Don't change it
// unless sorting is added later.
std::set<int> l;
KanjiEntry *k = nullptr;
int kindex = -1;
bool res = false;
if ((kanjistart == 0 && kanji[0].unicode() == KURIKAESHI) || ((!VALIDCODE(kanji[kanjistart].unicode()) || kanji[kanjistart].unicode() == KURIKAESHI) && (kindex = ZKanji::kanjiIndex(kanji[(kanji[kanjistart].unicode() == KURIKAESHI ? kanjistart - 1 : kanjistart)])) < 0))
return false;
if (kindex >= 0)
k = ZKanji::kanjis[kindex];
// Values saved after furiganaStep() returns with less hacks than before.
int lastkanjistart = -1;
int lastkanastart = -1;
int lastdatpos = -1;
// First set to the hacks value returned by furiganaStep(). Only updated (with the rest
// of the 'last...' values) if the next furiganaStep() call returns with less hacks.
int lasthacks = -1;
// Hacks that will be added in this step. Only updated with the other 'last...' values.
// It'll increase the value of 'hacks', but it is not used when comparing the hacks set by
// subsequent calls to furiganaStep().
int extrahacks = 0;
// If validReadings are used again, remove the if below with curly braces, but leave
// contents. Only the readings value and its use should be removed.
//QCharStringList *readings = (k != nullptr ? k->on : validReadings(kanji[kanjistart]));
if (k != nullptr)
{
QCharStringList *readings = nullptr;
readings = &k->on;
// Try to match kanji ON-readings **** REMOVED: or special character pronunciations.
for (int ix = 0, siz = tosigned(readings->size()); ix != siz; ++ix)
{
const QChar *on = (readings->items(ix).data() + (readings->items(ix)[0] == '-' ? 1 : 0));
int onlen = tosigned(qcharlen(on));
if (l.count(onlen) == 0 && fuReading(on, onlen, kana.data(), kanastart, kanaend))
l.insert(onlen);
}
// Try to match kanji KUN-readings or irregular readings.
for (int ix = 0, siz = tosigned(k->kun.size()); ix != siz; ++ix)
{
int kunlen = kunLen(k->kun[ix].data());
if (l.count(kunlen) == 0 && fuReading(k->kun[ix].data(), kunlen, kana.data(), kanastart, kanaend))
l.insert(kunlen);
}
// TO-NOT-DO: remove irreg unless using irregulars.txt again.
//for (int ix = 0; ix < k->irreg.size(); ++ix)
// if (std::find(l.begin(), l.end(), k->irreg[ix].size()) == l.end() &&
// fuReading(k->irreg[ix].data(), kana.data(), kanastart, kanaend))
// l.push_back(k->irreg[ix].size());
if (l.size() == 1)
{
// Only one reading is found which matches the current kanji/kana position.
int len = abs(*l.begin());
int furidatpos = datpos + 1;
int furikanjistart = kanjistart + 1;
int furikanastart = kanastart + len;
int furihacks = 0;
if ((furikanjistart == kanjiend + 1 && furikanastart == kanaend + 1) ||
furiganaStep(kanji, kana, furikanjistart, kanjiend, furikanastart, kanaend, dat, furidatpos, furihacks))
{
res = true;
dat[datpos].kanji.pos = kanjistart;
dat[datpos].kanji.len = 1;
dat[datpos].kana.pos = kanastart;
dat[datpos].kana.len = len;
lastkanjistart = furikanjistart;
lastkanastart = furikanastart;
lastdatpos = furidatpos;
lasthacks = furihacks;
extrahacks = 0;
}
}
else if (!l.empty())
{
// Multiple readings can match the current kanji/kana position. Try them from longest
// to shortest, until one is found which allows the whole word part to be matched with
// furigana.
//std::sort(l.begin(), l.end(), [](int a, int b) { return a > b; });
//l->Sort(KunLenSort);
std::vector<FuriganaData> furi(kanjiend - kanjistart + 1);
//for (int ix = 0; ix < l.size() && lasthacks != 0; ++ix)
for (auto it = l.rbegin(); it != l.rend(); ++it)
{
int len = std::abs(*it);
int furikanjistart = kanjistart + 1;
int furikanastart = kanastart + len;
int furidatpos = 0;
int furihacks = 0;
if ((furikanjistart == kanjiend + 1 && furikanastart == kanaend + 1) ||
furiganaStep(kanji, kana, furikanjistart, kanjiend, furikanastart, kanaend, furi, furidatpos, furihacks))
{
int hackinc = 0;
if (!res || furihacks + hackinc < lasthacks + extrahacks)
{
res = true;
dat[datpos].kanji.pos = kanjistart;
dat[datpos].kanji.len = 1;
dat[datpos].kana.pos = kanastart;
lastkanjistart = furikanjistart;
lastkanastart = furikanastart;
lastdatpos = furidatpos + datpos + 1;
lasthacks = furihacks;
extrahacks = hackinc;
dat[datpos].kana.len = len;
for (int iy = 0; iy != furidatpos; ++iy)
dat[datpos + 1 + iy] = furi[iy];
}
}
}
}
// Create readings formed by the -masu base of verbs which only have this one kanji
// at the front. Doesn't actually take verbs, but tries to construct new ones from the
// kana readings that have okurigana information.
std::set<int> l2;
for (int ix = 0, siz = tosigned(k->kun.size()); ix != siz; ++ix)
{
const QChar *c = qcharchr(k->kun[ix].data(), QChar('.'));
if (c == nullptr || c[1].unicode() == 0)
continue;
int dotpos = c - k->kun[ix].data();
if (c[2].unicode() == 0)
{
// The kana reading has one extra furigana after the . character. It can be
// converted into a fake masu base form of a verb.
QChar ch = hiraganaCh(c + 1, 0);
QChar newch = 0;
QCharString masureading;
if (ch == 0x3046 /* kana u */ || ch == 0x304f /* kana ku */ || ch == 0x3059 /* kana su */ || ch == 0x305a /* kana zu */)
newch = QChar(ch.unicode() - 2);
if (ch == 0x3064 /* kana tsu */ || ch == 0x3065 /* kana dzu */ || ch == 0x3075 /* kana fu */ ||
ch == 0x3076 /* kana bu */ || ch == 0x3077 /* kana pu */)
newch = QChar(ch.unicode() - 3);
if (ch == 0x306c /* kana nu */ || ch == 0x3080 /* kana mu */ || ch == 0x308b /* kana ru */)
newch = QChar(ch.unicode() - 1);
if (newch.unicode() != 0)
{
masureading.copy(k->kun[ix].data(), dotpos + 1);
masureading[dotpos] = newch;
}
if (!masureading.empty() && l.count(dotpos + 1) == 0 && l2.count(dotpos + 1) == 0 && fuReading(masureading.data(), dotpos + 1, kana.data(), kanastart, kanaend))
l2.insert(dotpos + 1);
}
else
{
// The reading has more characters after the dot. Try to convert the last kana
// character into the masu base form.
// Number of characters after the . character.
int clen = tosigned(qcharlen(c + 1));
QChar ch = hiraganaCh(c + clen, 0);
QChar newch = 0;
QCharString masureading;
if (ch == 0x3046 /* kana u */ || ch == 0x304f /* kana ku */ || ch == 0x3059 /* kana su */ || ch == 0x305a /* kana zu */)
newch = QChar(ch.unicode() - 2);
if (ch == 0x3064 /* kana tsu */ || ch == 0x3065 /* kana dzu */ || ch == 0x3075 /* kana fu */ ||
ch == 0x3076 /* kana bu */ || ch == 0x3077 /* kana pu */)
newch = QChar(ch.unicode() - 3);
if (ch == 0x306c /* kana nu */ || ch == 0x3080 /* kana mu */ || ch == 0x308b /* kana ru */)
newch = QChar(ch.unicode() - 1);
if (newch.unicode() != 0)
{
masureading.copy(k->kun[ix].data(), dotpos);
masureading.resize(dotpos + clen);
for (int iy = dotpos; iy != dotpos + clen - 1; ++iy)
masureading[iy] = k->kun[ix][iy + 1];
masureading[dotpos + clen - 1] = newch;
if (!masureading.empty() && l.count(dotpos + clen) == 0 && l2.count(dotpos + clen) == 0 && fuReading(masureading.data(), dotpos + clen, kana.data(), kanastart, kanaend))
l2.insert(dotpos + clen);
// Verbs ending in -ru can be converted to masu base by simply cutting off
// that trailing character.
if (ch == 0x308b /* kana ru */)
{
if (!masureading.empty() && l.count(dotpos + clen - 1) == 0 && l2.count(dotpos + clen - 1) == 0 && fuReading(masureading.data(), dotpos + clen - 1, kana.data(), kanastart, kanaend))
l2.insert(dotpos + clen - 1);
}
}
}
}
if (l2.size() == 1)
{
// Only one reading is found which matches the current kanji/kana position.
int len = abs(*l2.begin());
int furidatpos = 0;
int furihacks = 0;
int furikanjistart = kanjistart + 1;
int furikanastart = kanastart + len;
std::vector<FuriganaData> furi(kanjiend - kanjistart + 1);
if ((furikanjistart == kanjiend + 1 && furikanastart == kanaend + 1) ||
furiganaStep(kanji, kana, furikanjistart, kanjiend, furikanastart, kanaend, furi, furidatpos, furihacks))
{
int hackinc = 1;
if (!res || furihacks + hackinc < lasthacks + extrahacks)
{
res = true;
dat[datpos].kanji.pos = kanjistart;
dat[datpos].kanji.len = 1;
dat[datpos].kana.pos = kanastart;
dat[datpos].kana.len = len;
lastkanjistart = furikanjistart;
lastkanastart = furikanastart;
lastdatpos = furidatpos + datpos + 1;
lasthacks = furihacks;
extrahacks = hackinc;
for (int iy = 0; iy != furidatpos; ++iy)
dat[datpos + 1 + iy] = furi[iy];
}
}
}
else if (!l2.empty())
{
// Multiple readings can match the current kanji/kana position. Try them from longest
// to shortest, until one is found which allows the whole word part to be matched with
// furigana.
std::vector<FuriganaData> furi(kanjiend - kanjistart + 1);
for (auto it = l2.rbegin(); it != l2.rend(); ++it)
{
int len = std::abs(*it);
int furikanjistart = kanjistart + 1;
int furikanastart = kanastart + len;
int furidatpos = 0;
int furihacks = 0;
if (((furikanjistart == kanjiend + 1 && furikanastart == kanaend + 1) ||
furiganaStep(kanji, kana, furikanjistart, kanjiend, furikanastart, kanaend, furi, furidatpos, furihacks)) && (!res || furihacks + 1 < lasthacks + extrahacks))
{
res = true;
dat[datpos].kanji.pos = kanjistart;
dat[datpos].kanji.len = 1;
dat[datpos].kana.pos = kanastart;
lastkanjistart = furikanjistart;
lastkanastart = furikanastart;
lastdatpos = furidatpos + datpos + 1;
lasthacks = furihacks;
extrahacks = 1;
dat[datpos].kana.len = len;
for (int iy = 0; iy != furidatpos; ++iy)
dat[datpos + 1 + iy] = furi[iy];
}
}
}
}
if ((!res || lasthacks != 0) && kanjistart != kanjiend && kanaend - kanastart > 0 && KANJI(kanji[kanjistart].unicode()) && (kanji[kanjistart + 1] == kanji[kanjistart] || kanji[kanjistart + 1].unicode() == KURIKAESHI))
{
// Kanji is repeated. It's possible the reading is repeated too. Check at most 3
// characters. If match is found, nothing else is checked.
for (int kanalen = 1; kanalen != 4 && kanastart + kanalen * 2 - 1 <= kanaend; ++kanalen)
{
if (std::find(l.begin(), l.end(), kanalen) != l.end())
continue;
int kanacateg = 0;
QChar ch = hiraganaCh(kana.data(), kanastart + kanalen);
// The leading kana of the current part might have changed sound due to its
// position. Depending on the kana, a neighboring unicode position contains
// the original kana.
if ((ch > 0x304b && ch <= 0x3062 && (ch.unicode() % 2) == 0) || (ch > 0x3064 && ch <= 0x3069 && (ch.unicode() % 2) != 0))
kanacateg = 1;
else if (ch > 0x306f && ch <= 0x307d && (ch.unicode() % 3) != 0)
kanacateg = 2;
QChar kch = hiraganaCh(kana.data(), kanastart);
// Check the first characters for match separately. The latter ones don't
// change sound.
if (ch.unicode() != kch.unicode() && (kanacateg != 1 || ch.unicode() - 1 != kch.unicode()) &&
(kanacateg != 2 && ch.unicode() - (ch.unicode() % 3) != kch.unicode()))
continue;
bool match = true;
for (int ix = 1; ix != kanalen && match; ++ix)
match = hiraganaCh(kana.data(), kanastart + ix) == hiraganaCh(kana.data(), kanastart + kanalen + ix);
if (match)
{
std::vector<FuriganaData> furi(kanjiend - kanjistart + 1);
int furikanjistart = kanjistart + 2;
int furikanastart = kanastart + kanalen * 2;
int furidatpos = 0;
int furihacks = 0;
if (((furikanjistart == kanjiend + 1 && furikanastart == kanaend + 1) ||
furiganaStep(kanji, kana, furikanjistart, kanjiend, furikanastart, kanaend, furi, furidatpos, furihacks)) && (!res || furihacks < lasthacks))
{
dat[datpos].kanji.pos = kanjistart;
dat[datpos].kanji.len = 2;
dat[datpos].kana.pos = kanastart;
dat[datpos].kana.len = kanalen * 2;
lastkanjistart = furikanjistart;
lastkanastart = furikanastart;
lasthacks = furihacks;
lastdatpos = furidatpos + datpos + 1;
extrahacks = 0;
for (int iy = 0; iy != furidatpos; ++iy)
dat[datpos + 1 + iy] = furi[iy];
res = true;
}
break;
}
}
}
// As a last step when all else failed, kanji are skipped one by one (see below.) When
// that happens, some other hacks shouldn't be done. Check this value in those cases.
bool kanjiskipping = datpos == 1 && (short)dat[0].kanji.len < 0 && dat[0].kanji.pos - (short)dat[0].kanji.len == kanjistart;
if ((!res || lasthacks != 0) && k && !kanjiskipping)
{
// Kanji readings might not match. If the kanji is standalone, and a non-kanji
// follows, try to match that to the following kana string.
if (kanjistart == kanjiend && kanastart <= kanaend)
{
// Standalone kanji at the back of the word. Let's predict that the remaining kana
// matches it. Because this can produce invalid results, increase hacks
// accordingly.
int hackinc = furiReadingDiff(k, kana, kanastart, kanaend);
if (!res || extrahacks > hackinc /*|| lasthacks > std::abs(kanaend - kanastart) + 3*/)
{
dat[datpos].kanji.pos = kanjistart;
dat[datpos].kanji.len = 1;
dat[datpos].kana.pos = kanastart;
dat[datpos].kana.len = kanaend - kanastart + 1;
lasthacks = std::max(0, lasthacks);
extrahacks = hackinc; //std::max(1, std::abs((kanaend - kanastart) - (kanjiend - kanjistart)));
lastdatpos = datpos + 1;
lastkanjistart = kanjiend + 1;
lastkanastart = kanaend + 1;
}
res = true;
}
else if (kanjistart != kanjiend && !kanjiOrKurikaeshi(kanji, kanjistart + 1))
{
int furikanjistart = kanjistart + 1;
int furikanastart = kanastart;
std::vector<FuriganaData> furi(kanjiend - kanjistart + 1);
QChar ch = hiraganaCh(kanji.data(), furikanjistart);
while (furikanastart != kanaend && lasthacks != 0)
{
++furikanastart;
if (hiraganaCh(kana.data(), furikanastart).unicode() != ch.unicode())
continue;
// Kana start position is saved because furiganaStep() can update it.
int kanapos = furikanastart;
int furidatpos = 0;
int furihacks = 0;
if (furiganaStep(kanji, kana, furikanjistart, kanjiend, furikanastart, kanaend, furi, furidatpos, furihacks))
{
int hackinc = furiReadingDiff(k, kana, kanastart, kanapos - 1);
if (!res || furihacks + hackinc < lasthacks + extrahacks)
{
dat[datpos].kanji.pos = kanjistart;
dat[datpos].kanji.len = 1;
dat[datpos].kana.pos = kanastart;
lastkanjistart = furikanjistart;
lastkanastart = furikanastart;
lastdatpos = furidatpos + datpos + 1;
lasthacks = furihacks;
extrahacks = hackinc; //std::max(1, (kanapos - kanastart) - 1);
dat[datpos].kana.len = kanapos - dat[datpos].kana.pos;
for (int iy = 0; iy != furidatpos; ++iy)
dat[datpos + 1 + iy] = furi[iy];
res = true;
}
}
furikanastart = kanapos;
furikanjistart = kanjistart + 1;
}
}
}
if ((!res /*|| lasthacks + extrahacks != 0*/) && kanji[kanjistart].unicode() != KURIKAESHI && VALIDCODE(kanji[kanjistart].unicode()))
{
// Nothing is found because the current character is not a kanji or kana. The
// consecutive validcode characters are skipped until something is found.
int lastvalidpos = kanjistart;
while (lastvalidpos != kanjiend && kanji[lastvalidpos + 1].unicode() != 0x30fb /* kana mid-dot*/ &&
kanji[lastvalidpos + 1].unicode() != 0xff1d /* fullwidth equal sign*/ && kanji[lastvalidpos + 1].unicode() != 0xff0e /* fullwidth stop */ &&
kanji[lastvalidpos + 1].unicode() != 0xff0f /* fullwidth / */ && VALIDCODE(kanji[lastvalidpos + 1].unicode()))
++lastvalidpos;
// Number of skipped valid characters.
int skipvalid = lastvalidpos - kanjistart + 1;
if (lastvalidpos == kanjiend)
{
int hackinc = std::max(1, std::abs(kanaend - kanastart - lastvalidpos + kanjistart) / 2);
if (!res || extrahacks > hackinc)
{
// Nothing comes after the valid code characters. Pretend that they match the
// rest of the kana, but increase the value of hacks accordingly.
dat[datpos].kanji.pos = kanjistart;
dat[datpos].kana.pos = kanastart;
dat[datpos].kanji.len = lastvalidpos - kanjistart + 1;
dat[datpos].kana.len = kanaend - kanastart + 1;
lasthacks = std::max(0, lasthacks);
extrahacks = hackinc;
lastdatpos = datpos + 1;
lastkanjistart = kanjiend + 1;
lastkanastart = kanaend + 1;
res = true;
}
}
else
{
// Skip as many kana characters as it takes to find something.
std::vector<FuriganaData> furi(kanjiend - kanjistart + 1);
for (int furikanastart = kanastart + std::max(1, skipvalid / 3); furikanastart < kanaend && lasthacks != 0; ++furikanastart)
{
int furidatpos = 0;
int furihacks = 0;
int furikanjistart = lastvalidpos + 1;
int kanapos = furikanastart;
if (furiganaStep(kanji, kana, furikanjistart, kanjiend, furikanastart, kanaend, furi, furidatpos, furihacks))