-
Notifications
You must be signed in to change notification settings - Fork 1
/
patgen.web
1978 lines (1759 loc) · 74.8 KB
/
patgen.web
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
% This is PATGEN.WEB in text format, as of August 23, 2004.
% Version 1.0 was finished in 1983.
% Version 2.0 major revision for `8-bit TeX' (November 8, 1991).
% Version 2.1 allows left/right_hypen_min from terminal (April, 1992).
% Version 2.2 added `close_in(dictionary)' (August, 1996).
% Version 2.3 avoided division by zero - Karl Berry (October, 1996).
% Here is TeX material that gets inserted after \input webmac
\def\hang{\hangindent 3em\indent\ignorespaces}
\def\PASCAL{Pascal}
\def\title{PATGEN}
\def\contentspagenumber{45} % should be odd
\def\topofcontents{
\line{\tenit Appendix\hfil \mainfont\contentspagenumber}
\vfill
\null\vskip 40pt
\centerline{\titlefont {\ttitlefont PAT}tern {\ttitlefont GEN}eration
program}
\vskip 8pt
\centerline{\titlefont for the \TeX 82 hyphenator}
\vskip 15pt
\centerline{(Version 2.3, August 2004)}
\vfill}
\pageno=\contentspagenumber \advance\pageno by 1
@* Introduction.
This program takes a list of hyphenated words and generates a set of
patterns that can be used by the \TeX 82 hyphenation algorithm.
The patterns consist of strings of letters and digits, where a digit
indicates a `hyphenation value' for some intercharacter position. For
example, the pattern \.{3t2ion} specifies that if the string \.{tion}
occurs in a word, we should assign a hyphenation value of 3 to the
position immediately before the \.{t}, and a value of 2 to the position
between the \.{t} and the \.{i}.
To hyphenate a word, we find all patterns that match within the word and
determine the hyphenation values for each intercharacter position. If
more than one pattern applies to a given position, we take the maximum of
the values specified (i.e., the higher value takes priority). If the
resulting hyphenation value is odd, this position is a feasible
breakpoint; if the value is even or if no value has been specified, we are
not allowed to break at this position.
In order to find quickly the patterns that match in a given word and to
compute the associated hyphenation values, the patterns generated by this
program are compiled by \.{INITEX} into a compact version of a finite
state machine. For further details, see the \TeX 82 source.
The |banner| string defined here should be changed whenever \.{PATGEN}
gets modified.
@d banner=='This is PATGEN, Version 2.3' {printed when the program starts}
@ The original version~1 of \.{PATGEN} was written by Frank~M. Liang
@^Liang, Franklin Mark@>
in 1982; a major revision (version~2) by Peter Breitenlohner in 1991
@^Breitenlohner, Peter@>
is mostly related to the new features of `8-bit \TeX' (version~3 of
\TeX 82). The differences between versions~1 and~2 fall into several
categories (all of Liang's algorithms have been left essentially
unchanged): (1)~enhancements related to 8-bit \TeX, e.g., the
introduction of 8-bit |ASCII_code| values and of \.{\\lefthyphenmin} and
\.{\\righthyphenmin}; (2)~a modification of the input and output
procedures which should make language specific modifications of this
program unnecessary (information about the external representation of
all `letters' used by a particular language is obtained from the
|translate| file); (3)~removal of ANSI standard \PASCAL\ and range check
violations; (4)~removal of uninitialized variables; (5)~minor
modifications in order to simplify system-dependent modifications.
@^range check violations@>
@ This program is written in standard \PASCAL, except where it is
necessary to use extensions. All places where nonstandard constructions
are used have been listed in the index under ``system dependencies.''
@!@^system dependencies@>
The program uses \PASCAL's standard |input| and |output| files to read
from and write to the user's terminal.
@d print(#)==write(output,#)
@d print_ln(#)==write_ln(output,#)
@d get_input(#)==read(input,#)
@d get_input_ln(#)==
begin if eoln(input) then read_ln(input);
read(input,#);
end
@#
@d end_of_PATGEN=9999
@p @<Compiler directives@>@/
program PATGEN(@!dictionary,@!patterns,@!translate,@!patout);
label end_of_PATGEN;
const @<Constants in the outer block@>@/
type @<Types in the outer block@>@/
var @<Globals in the outer block@>@/
procedure initialize; {this procedure gets things started properly}
var @<Local variables for initialization@>@/
begin print_ln(banner);@/
@<Set initial values@>@/
end;
@ The patterns are generated in a series of sequential passes through the
dictionary. In each pass, we collect count statistics for a particular
type of pattern, taking into account the effect of patterns chosen in
previous passes. At the end of a pass, the counts are examined and new
patterns are selected.
Patterns are chosen one level at a time, in order of increasing
hyphenation value. In the sample run shown below, the parameters
|hyph_start| and |hyph_finish| specify the first and last levels,
respectively, to be generated.
Patterns at each level are chosen in order of increasing pattern length
(usually starting with length~2). This is controlled by the parameters
|pat_start| and |pat_finish| specified at the beginning of each level.
Furthermore patterns of the same length applying to different
intercharacter positions are chosen in separate passes through the
dictionary. Since patterns of length $n$ may apply to $n+1$ different
positions, choosing a set of patterns of lengths $2$ through $n$ for a
given level requires $(n+1)(n+2)/2-3$ passes through the word list.
At each level, the selection of patterns is controlled by the three
parameters |good_wt|, |bad_wt|, and |thresh|. A hyphenating pattern will
be selected if |good*good_wt-bad*bad_wt>=thresh|, where |good| and
|bad| are the number of times the pattern could and could not be
hyphenated, respectively, at a particular point. For inhibiting patterns,
|good| is the number of errors inhibited, and |bad| is the number of
previously found hyphens inhibited.
@<Globals...@>=
@!pat_start, @!pat_finish: dot_type;
@!hyph_start, @!hyph_finish: val_type;
@!good_wt, @!bad_wt, @!thresh: integer;
@ The proper choice of the parameters to achieve a desired degree of
hyphenation is discussed in Chapter~4. Below we show part of a sample run
of \.{PATGEN}, with the user's inputs underlined.
$$\vbox{\halign{\.{#\hfil}\cr
$\underline{\smash{\.{ex patgen}}}$\cr
DICTIONARY : $\underline{\smash{\.{murray.hyf}}}$\cr
PATTERNS : $\underline{\smash{\.{nul:}}}$\cr
TRANSLATE : $\underline{\smash{\.{nul:}}}$\cr
PATOUT : $\underline{\smash{\.{murray.pat}}}$\cr
This is PATGEN, Version 2.0\cr
left\_hyphen\_min = 2, right\_hyphen\_min = 3, 26 letters\cr
0 patterns read in\cr
pattern trie has 256 nodes, trie\_max = 256, 0 outputs\cr
hyph\_start, hyph\_finish: $\underline{\.{1 1}}$\cr
pat\_start, pat\_finish: $\underline{\.{2 3}}$\cr
good weight, bad weight, threshold: $\underline{\.{1 3 3}}$\cr
processing dictionary with pat\_len = 2, pat\_dot = 1\cr
\cr
0 good, 0 bad, 3265 missed\cr
0.00 \%, 0.00 \%, 100.00 \%\cr
338 patterns, 466 nodes in count trie, triec\_max = 983\cr
46 good and 152 bad patterns added (more to come)\cr
finding 715 good and 62 bad hyphens, efficiency = 10.72\cr
pattern trie has 326 nodes, trie\_max = 509, 2 outputs\cr
processing dictionary with pat\_len = 2, pat\_dot = 0\cr
\cr
\hskip 1.5em ...\cr
\cr
1592 nodes and 39 outputs deleted\cr
total of 220 patterns at hyph\_level 1\cr
hyphenate word list? $\underline{\smash{\.{y}}}$\cr
writing pattmp.1\cr
\cr
2529 good, 243 bad, 736 missed\cr
77.46 \%, 7.44 \%, 22.54 \%\cr}}$$
@ Note that before beginning a pattern selection run, a file of existing
patterns may be read in. In order for pattern selection to work properly,
this file should only contain patterns with hyphenation values less than
|hyph_start|. Each word in the dictionary is hyphenated according to the
existing set of patterns (including those chosen on previous passes of the
current run) before pattern statistics are collected.
Also, a hyphenated word list may be written out at the end of a run. This
list can be read back in as the `dictionary' to continue pattern selection
from this point. In addition to ordinary hyphens (|'-'|) the new list
will contain two additional kinds of ``hyphens'' between letters, namely
hyphens that have been found by previously generated patterns, as well
as erroneous hyphens that have been inserted by those patterns. These
are represented by the symbols |'*'| and |'.'|, respectively. The three
characters |'-'|, |'*'|, and |'.'| are, in fact, just the default values
used to represent the three kinds of hyphens, the |translate| file may
specify different characters to be used instead of them.
In addition, a word list can include hyphen weights, both for entire words
and for individual hyphen positions. (The syntax for this is explained in
the dictionary processing routines.) Thus common words can be weighted
more heavily, or, more generally, words can be weighted according to their
frequency of occurrence, if such information is available. The use of
hyphen weights combined with an appropriate setting of the pattern
selection threshold can be used to guarantee hyphenation of certain words
or certain hyphen positions within a word.
@ Below we show the first few lines of a typical word list,
before and after generating a level of patterns.
$$\vbox{\halign{\tabskip 1in\.{#\hfil}&\.{#\hfil}\cr
abil-i-ty& abil*i*ty\cr
ab-sence& ab*sence\cr
ab-stract& ab*stract\cr
ac-a-dem-ic& ac-a-d.em-ic\cr
ac-cept& ac*cept\cr
ac-cept-able& ac*cept-able\cr
ac-cept-ed& ac*cept*ed\cr
\hskip 1.5em ...&\hskip 1.5em ...\cr
}}$$
@ We augment \PASCAL 's control structures a bit using |goto|\unskip's
and the following symbolic labels.
@d exit=10 {go here to leave a procedure}
@d continue=22 {go here to resume a loop}
@d done=30 {go here to exit a loop}
@d found=40 {go here when you've found it}
@d not_found=41 {go here when you've found something else}
@ Here are some macros for common programming idioms.
@d incr(#)==#:=#+1 {increase a variable by unity}
@d decr(#)==#:=#-1 {decrease a variable by unity}
@#
@d Incr_Decr_end(#)==#
@d Incr(#)==#:=#+Incr_Decr_end {we use |Incr(a)(b)| to increase \dots}
@d Decr(#)==#:=#-Incr_Decr_end {\dots\ and |Decr(a)(b)| to decrease
variable |a| by |b|; this can be optimized for some compilers}
@#
@d loop == @+ while true do@+ {repeat over and over until a |goto| happens}
@d do_nothing == {empty statement}
@d return==goto exit {terminate a procedure call}
@f return==nil
@f loop == xclause
@ In case of serious problems \.{PATGEN} will give up, after issuing an
error message about what caused the error. Such errors might be
discovered inside of subroutines inside of subroutines, so a \.{WEB}
macro called |jump_out| has been introduced. This macro, which transfers
control to the label |end_of_PATGEN| at the end of the program, contains
the only non-local |@!goto| statement in \.{PATGEN}. Some \PASCAL\
compilers do not implement non-local |goto| statements. In such cases
the |goto end_of_PATGEN| in the definition of |jump_out| should simply
be replaced by a call on some system procedure that quietly terminates
the program.
@^system dependencies@>
An overflow stop occurs if \.{PATGEN}'s tables aren't large enough.
@d jump_out==goto end_of_PATGEN {terminates \.{PATGEN}}
@#
@d error(#)==begin print_ln(#); jump_out; end
@d overflow(#)==error('PATGEN capacity exceeded, sorry [',#,'].')
@.PATGEN capacity exceeded ...@>
@ @<Compiler directives@>=
@{@&$C-,A+,D-@} {no range check, catch arithmetic overflow, no debug overhead}
@^system dependencies@>
@* The character set.
Since different \PASCAL\ systems may use different character sets, we use
the name |text_char| to stand for the data type of characters appearing in
external text files. We also assume that |text_char| consists of the
elements |chr(first_text_char)| through |chr(last_text_char)|, inclusive.
The definitions below should be adjusted if necessary.
@^system dependencies@>
@^character set dependencies@>
Internally, characters will be represented using the type |ASCII_code|.
Note, however, that only some of the standard ASCII characters are
assigned a fixed |ASCII_code|; all other characters are assigned an
|ASCII_code| dynamically when they are first read from the |translate|
file specifying the external representation of the `letters' used by a
particular language. For the sake of generality the standard version of
this program allows for 256 different |ASCII_code| values, but 128 of
them would probably suffice for all practical purposes.
@d first_text_char=0 {ordinal number of the smallest element of |text_char|}
@d last_text_char=255 {ordinal number of the largest element of |text_char|}
@#
@d last_ASCII_code=255 {the highest allowed |ASCII_code| value}
@<Types...@>=
@!text_char=char; {the data type of characters in text files}
@!ASCII_code=0..last_ASCII_code; {internal representation of input characters}
@!text_file=text;
@ Some \PASCAL s can store only signed eight-bit quantities (|-128..127|)
but not unsigned ones (|0..255|) in one byte. If storage is tight we
must, for such \PASCAL s, either restrict |ASCII_code| to the range
|0..127| (with some loss of generality) or convert between |ASCII_code|
and |packed_ASCII_code| and vice versa by subtracting or adding an
offset. (Or we might define |packed_ASCII_code| as |char| and use
suitable typecasts for the conversion.) Only the type |packed_ASCII_code|
will be used for large arrays and the \.{WEB} macros |si| and |so| will
always be used to convert an |ASCII_code| into a |packed_ASCII_code| and
vice versa.
@^system dependencies@>
@d min_packed=0 {change this to `$\\{min\_packed}=-128$' when necessary;
and don't forget to change the definitions of |si| and |so| below
accordingly}
@#
@d si(#)==# {converts |ASCII_code| to |packed_ASCII_code|}
@d so(#)==# {converts |packed_ASCII_code| to |ASCII_code|}
@<Types...@>=
@!packed_ASCII_code=min_packed..last_ASCII_code+min_packed;
@ We want to make sure that the ``constants'' defined in this program
satisfy all the required relations. Some of them are needed to avoid
time-consuming checks while processing the dictionary and\slash or to
prevent range check and array bound violations.
@^range check violations@>
Here we check that the definitions of |ASCII_code| and
|packed_ASCII_code| are consistent with those of |si| and |so|.
@<Set init...@>=
bad:=0;@/
if last_ASCII_code<127 then bad:=1;
if (si(0)<>min_packed)or(so(min_packed)<>0) then bad:=2;@/
@<Check the ``constant'' values for consistency@>@;
if bad>0 then error('Bad constants---case ',bad:1);
@.Bad constants@>
@ @<Local variables for init...@>=
@!bad:integer;
@!i:text_char;
@!j:ASCII_code;
@ We convert between |ASCII_code| and the user's external character set by
means of arrays |xord| and |xchr| that are analogous to \PASCAL's |ord|
and |chr| functions.
@<Globals...@>=
@!xord: array [text_char] of ASCII_code;
{specifies conversion of input characters}
@!xchr: array [ASCII_code] of text_char;
{specifies conversion of output characters}
@ The following code initializes the |xchr| array with some of the
standard ASCII characters.
@<Set init...@>=
for j:=0 to last_ASCII_code do xchr[j]:=' ';
xchr["."]:='.';@/
xchr["0"]:='0'; xchr["1"]:='1'; xchr["2"]:='2'; xchr["3"]:='3';
xchr["4"]:='4'; xchr["5"]:='5'; xchr["6"]:='6'; xchr["7"]:='7';
xchr["8"]:='8'; xchr["9"]:='9';@/
xchr["A"]:='A'; xchr["B"]:='B'; xchr["C"]:='C'; xchr["D"]:='D';
xchr["E"]:='E'; xchr["F"]:='F'; xchr["G"]:='G'; xchr["H"]:='H';
xchr["I"]:='I'; xchr["J"]:='J'; xchr["K"]:='K'; xchr["L"]:='L';
xchr["M"]:='M'; xchr["N"]:='N'; xchr["O"]:='O'; xchr["P"]:='P';
xchr["Q"]:='Q'; xchr["R"]:='R'; xchr["S"]:='S'; xchr["T"]:='T';
xchr["U"]:='U'; xchr["V"]:='V'; xchr["W"]:='W'; xchr["X"]:='X';
xchr["Y"]:='Y'; xchr["Z"]:='Z';@/
xchr["a"]:='a'; xchr["b"]:='b'; xchr["c"]:='c'; xchr["d"]:='d';
xchr["e"]:='e'; xchr["f"]:='f'; xchr["g"]:='g'; xchr["h"]:='h';
xchr["i"]:='i'; xchr["j"]:='j'; xchr["k"]:='k'; xchr["l"]:='l';
xchr["m"]:='m'; xchr["n"]:='n'; xchr["o"]:='o'; xchr["p"]:='p';
xchr["q"]:='q'; xchr["r"]:='r'; xchr["s"]:='s'; xchr["t"]:='t';
xchr["u"]:='u'; xchr["v"]:='v'; xchr["w"]:='w'; xchr["x"]:='x';
xchr["y"]:='y'; xchr["z"]:='z';
@ The following system-independent code makes the |xord| array contain a
suitable inverse to the information in |xchr|.
@d invalid_code=0 {|ASCII_code| that should not appear}
@d tab_char=@'11 {|ord| of tab character; tab characters seem to be
unavoidable with files from UNIX systems}
@^system dependencies@>
@^character set dependencies@>
@<Set init...@>=
for i:=chr(first_text_char) to chr(last_text_char) do
xord[i]:=invalid_code;
for j:=0 to last_ASCII_code do xord[xchr[j]]:=j;
xord[' ']:=" "; xord[chr(tab_char)]:=" ";
@ So far each invalid |ASCII_code| has been assigned the character |' '|
and all invalid characters have been assigned |ASCII_code=invalid_code|.
The |get_ASCII| function, used only while reading the |translate| file,
returns the |ASCII_code| corresponding to a character, assigning a new
|ASCII_code| first if necessary.
@d num_ASCII_codes=last_ASCII_code+1 {number of different |ASCII_code| values}
@p function get_ASCII(@!c:text_char):ASCII_code;
label found;
var i: ASCII_code;
begin i:=xord[c];
if i=invalid_code then
begin while i<last_ASCII_code do
begin incr(i);
if (xchr[i]=' ')and(i<>" ") then goto found;
end;
overflow(num_ASCII_codes:1,' characters');
found: xord[c]:=i; xchr[i]:=c;
end;
get_ASCII:=i;
end;
@ The \TeX 82 hyphenation algorithm operates on `hyphenable words'
converted temporarily to lower case, i.e., they may consist of up to
255 different `letters' corresponding to \.{\\lccode}s |1..255|. These
\.{\\lccode}s could, in principle, be language dependent but this might
lead to undesirable results when hyphenating multilingual paragraphs.
No more than 245 different letters can occur in hyphenation patterns
since the characters |'0'..'9'| and |'.'| play a special r\^^Dole when
reading patterns. For the purpose of this program each letter is
represented internally by a unique |internal_code>=2| (|internal_code=1|
is the |edge_of_word| indicator); |internal_code| values |2..127| will
probably suffice for all practical purposes, but we allow the range
|2..last_ASCII_code| for the sake of generality. Syntactically
|internal_code| and |ASCII_code| are the same, we will use one or the
other name according to the semantic context.
@d edge_of_word=1 {|internal_code| for start and end of a word}
@<Types...@>=
@!internal_code=ASCII_code;
@!packed_internal_code=packed_ASCII_code;
@ Note that an |internal_code| used by this program is in general quite
different from the |ASCII_code| (or rather \.{\\lccode}) used by \TeX
82. This program allows the input of characters (from the |dictionary|
and |patterns| file) corresponding to an |internal_code| in either lower
or upper case form; the output (to the |patout| and |pattmp| file) will
always be in lower case form.
Unfortunately there does not (yet?) exist a standardized and widely
accepted 8-bit character set (or a unique one-to-one translation between
such sets). On the other hand macro expansion takes place in \TeX 82
when reading hyphenable words and when reading patterns. Thus the lower
and upper case versions of all `letters' used by a particular language
can (and for the sake of portability should) be represented entirely in
terms of the standard ASCII character set; either directly as characters
or via macros (or active characters) with or without arguments. The
macro definitions for such a representation will in general be language
dependent.
For the purpose of this program the external representation of the lower
and upper case version of a letter (i.e., |internal_code|) consists of a
unique sequence of characters (or \\{ASCII\_codes}), the only restriction
being that no such sequence must be a subsequence of an other one.
Moreover such sequences must not start with |' '|, |'.'|, |'0'..'9'| or
with one of the three characters (|'-'|, |'*'|, and |'.'|) representing
hyphens in the |dictionary| file; a sequence may, however, end with a
mandatory |' '| as, e.g., the sequence |'\ss '|.
The language dependent values of \.{\\lefthyphenmin} and
\.{\\righthyphenmin} as well as the external representation of the lower
and upper case letters and their collating sequence are specified in the
|translate| file, thus making any language dependent modifications of
this program unnecessary. If the |translate| file is empty (or does not
exist) the values \.{\\lefthyphenmin=2} and \.{\\righthyphenmin=3} and
|internal_code| values |2..27| with the one character external
representations |'a'..'z'| and |'A'..'Z'| will be used as defaults.
Incidentally this program can be used to convert a |dictionary| and
|patterns| file from one (``upper case'') to another (``lower case'')
external representation of letters.
@ When reading the |dictionary| (and |patterns|) file sequences of
characters must be recognized and converted to their corresponding
|internal_code|. This conversion is part of \.{PATGEN}s inner loop and
@^inner loop@>
must therefore be done as efficient as possible. Thus we will
mostly bypass the conversion from character to |ASCII_code| and convert
directly to the corresponding |internal_code| using the |xclass|
and |xint| arrays. Six types of characters are distinguished by their
|xclass|:
\yskip\hang |space_class| character |' '| terminates a pattern or word.
\yskip\hang |digit_class| characters |'0'..'9'| are hyphen values for a
pattern or hyphen weights for a word; their |xint| is the corresponding
numeric value |0..9|.
\yskip\hang |hyf_class| characters (|'.'|, |'-'|, and |'*'|) are `dots'
and indicate hyphens in a word; their |xint| is the corresponding
numeric value |err_hyf..found_hyf|.
\yskip\hang |letter_class| characters represent a letter; their |xint|
is the corresponding |internal_code|.
\yskip\hang |escape_class| characters indicate the start of a
multi-character sequence representing a letter.
\yskip\hang |invalid_class| characters should not occur except as part
of multi-character sequences.
@d space_class=0 {the character |' '|}
@d digit_class=1 {the characters |'0'..'9'|}
@d hyf_class=2 {the `hyphen' characters (|'.'|, |'-'|, and |'*'|)}
@d letter_class=3 {characters representing a letter}
@d escape_class=4 {characters that start a multi-character sequence
representing a letter}
@d invalid_class=5 {characters that normally should not occur}
@#
@d no_hyf=0 {no hyphen}
@d err_hyf=1 {erroneous hyphen}
@d is_hyf=2 {hyphen}
@d found_hyf=3 {found hyphen}
@<Types...@>=
@!class_type=space_class..invalid_class; {class of a character}
@!digit=0..9; {a hyphen weight (or word weight)}
@!hyf_type=no_hyf..found_hyf; {type of a hyphen}
@ In addition we will use the |xext|, |xdig|, and |xdot| arrays to
convert from the internal representation to the corresponding
characters.
@<Globals...@>=
@!xclass: array [text_char] of class_type;
{specifies the class of a character}
@!xint: array [text_char] of internal_code;
{specifies the |internal_code| for a character}
@!xdig: array [0..9] of text_char;
{specifies conversion of output characters}
@!xext: array [internal_code] of text_char;
{specifies conversion of output characters}
@!xhyf: array [err_hyf..found_hyf] of text_char;
{specifies conversion of output characters}
@ @<Set init...@>=
for i:=chr(first_text_char) to chr(last_text_char) do
begin xclass[i]:=invalid_class; xint[i]:=0;
end;
xclass[' ']:=space_class;
for j:=0 to last_ASCII_code do xext[j]:=' ';
xext[edge_of_word]:='.';
for j:=0 to 9 do
begin xdig[j]:=xchr[j+"0"];
xclass[xdig[j]]:=digit_class; xint[xdig[j]]:=j;
end;
xhyf[err_hyf]:='.'; xhyf[is_hyf]:='-'; xhyf[found_hyf]:='*';
{default representation for hyphens}
@ We assume that words use only the letters |cmin+1| through |cmax|.
This allows us to save some time on trie operations that involve
searching for packed transitions belonging to a particular state.
@d cmin=edge_of_word
@<Globals...@>=
@!cmax: internal_code; {largest |internal_code| or |ASCII_code|}
@* Data structures.
The main data structure used in this program is a dynamic packed trie.
In fact we use two of them, one for the set of patterns selected so far,
and one for the patterns being considered in the current pass.
For a pattern $p_1\ldots p_k$, the information associated with that
pattern is accessed by setting |@t$t_1$@>:=trie_root+@t$p_1$@>| and
then, for |1<i<=k|, setting |@t$t_i$@>:=trie_link(@t$t_{i-1}$@>)+
@t$p_i$@>|; the pattern information is then stored in a location addressed
by |@t$t_k$@>|. Since all trie nodes are packed into a single array, in
order to distinguish nodes belonging to different trie families, a special
field is provided such that |trie_char@t$(t_i)=si(p_i)$@>| for all |i|.
In addition the trie must support dynamic insertions and deletions. This
is done by maintaining a doubly linked list of unoccupied cells and
repacking trie families as necessary when insertions are made.
Each trie node consists of three fields: the character |trie_char|, and
the two link fields |trie_link| and |trie_back|. In addition there is a
separate boolean array |trie_base_used|. When a node is unoccupied,
|trie_char=min_packed| and the link fields point to the next and previous
unoccupied nodes, respectively, in the doubly linked list. When a node is
occupied, |trie_link| points to the next trie family, and |trie_back|
(renamed |trie_outp|) contains the output associated with this transition.
The |trie_base_used| bit indicates that some family has been packed at
this base location, and is used to prevent two families from being packed
at the same location.
@ The sizes of the pattern tries may have to be adjusted depending
on the particular application (i.e., the parameter settings and the
size of the dictionary). The sizes below were sufficient to generate
the original set of english \TeX 82 hyphenation patterns (file
\.{hyphen.tex}).
@<Constants...@>=
@!trie_size=55000; {space for pattern trie}
@!triec_size=26000; {space for pattern count trie, must be less than
|trie_size| and greater than the number of occurrences of any pattern in
the dictionary}
@!max_ops=4080; {size of output hash table, should be a multiple of 510}
@!max_val=10; {maximum number of levels$+1$, also used to denote bad patterns}
@!max_dot=15; {maximum pattern length, also maximum length of external
representation of a `letter'}
@!max_len=50; {maximum word length}
@!max_buf_len=80; {maximum length of input lines, must be at least |max_len|}
@ @<Check the ``constant'' values for consistency@>=
if (triec_size<4096)or(trie_size<triec_size) then bad:=3;
if max_ops>trie_size then bad:=4;
if max_val>10 then bad:=5;
if max_buf_len<max_len then bad:=6;
@ @<Types...@>=
@!q_index=1..last_ASCII_code; {number of transitions in a state}
@!val_type=0..max_val; {hyphenation values}
@!dot_type=0..max_dot; {dot positions}
@!op_type=0..max_ops; {index into output hash table}
@!word_index=0..max_len; {index into |word|}
@!trie_pointer=0..trie_size;
@!triec_pointer=0..triec_size;@/
@!op_word=packed record dot: dot_type; val: val_type; op: op_type end;
@ Trie is actually stored with its components in separate packed arrays,
in order to save space and time (although this depends on the computer's
word size and the size of the trie pointers).
@<Globals...@>=
@!trie_c: packed array[trie_pointer] of packed_internal_code;
@!trie_l, @!trie_r: packed array[trie_pointer] of trie_pointer;
@!trie_taken: packed array[trie_pointer] of boolean;
@!triec_c: packed array[triec_pointer] of packed_internal_code;
@!triec_l, @!triec_r: packed array[triec_pointer] of triec_pointer;
@!triec_taken: packed array[triec_pointer] of boolean;
@!ops: array[op_type] of op_word; {output hash table}
@ When some trie state is being worked on, an unpacked version of the
state is kept in positions |1..qmax| of the global arrays |trieq_c|,
|trieq_l|, and |trieq_r|. The character fields need not be in any
particular order.
@<Globals...@>=
@!trieq_c: array[q_index] of internal_code; {character fields of a
single trie state}
@!trieq_l, @!trieq_r: array[q_index] of trie_pointer; {link fields}
@!qmax: q_index; {number of transitions in an unpacked state}
@!qmax_thresh: q_index; {controls density of first-fit packing}
@ Trie fields are accessed using the following macros.
@d trie_char(#)==trie_c[#]
@d trie_link(#)==trie_l[#]
@d trie_back(#)==trie_r[#]
@d trie_outp(#)==trie_r[#]
@d trie_base_used(#)==trie_taken[#]
@#
@d triec_char(#)==triec_c[#]
@d triec_link(#)==triec_l[#]
@d triec_back(#)==triec_r[#]
@d triec_good(#)==triec_l[#]
@d triec_bad(#)==triec_r[#]
@d triec_base_used(#)==triec_taken[#]
@#
@d q_char(#)==trieq_c[#]
@d q_link(#)==trieq_l[#]
@d q_back(#)==trieq_r[#]
@d q_outp(#)==trieq_r[#]
@#
@d hyf_val(#)==ops[#].val
@d hyf_dot(#)==ops[#].dot
@d hyf_nxt(#)==ops[#].op
@* Routines for pattern trie.
The pattern trie holds the set of patterns chosen prior to the current
pass, including bad or ``hopeless'' patterns at the current level that
occur too few times in the dictionary to be of use. Each transition of
the trie includes an output field pointing to the hyphenation information
associated with this transition.
@<Globals...@>=
@!trie_max: trie_pointer; {maximum occupied trie node}
@!trie_bmax: trie_pointer; {maximum base of trie family}
@!trie_count: trie_pointer; {number of occupied trie nodes, for space usage
statistics}
@!op_count: op_type; {number of outputs in hash table}
@ Initially, the dynamic packed trie has just one state, namely the root,
with all transitions present (but with null links). This is convenient
because the root will never need to be repacked and also we won't have to
check that the base is nonnegative when packing other states.
Moreover in many cases we need not check for a vanishing link field:
if |trie_link(t)=0| then a subsequent test for
|trie_char(trie_link(t)+c)=si(c)| will always fail due to |trie_root=1|.
@d trie_root=1
@p procedure init_pattern_trie;
var c: internal_code; @!h: op_type;
begin for c:=0 to last_ASCII_code do
begin trie_char(trie_root+c):=si(c); {indicates node occupied;
fake for |c=0|}
trie_link(trie_root+c):=0;
trie_outp(trie_root+c):=0;
trie_base_used(trie_root+c):=false;
end;
trie_base_used(trie_root):=true;
trie_bmax:=trie_root;
trie_max:=trie_root+last_ASCII_code;
trie_count:=num_ASCII_codes;@/
qmax_thresh:=5;@/
trie_link(0):=trie_max+1;
trie_back(trie_max+1):=0;@/
{|trie_link(0)| is used as the head of the doubly linked list of
unoccupied cells}
for h:=1 to max_ops do hyf_val(h):=0; {clear output hash table}
op_count:=0;
end;
@ The |first_fit| procedure finds a hole in the packed trie into which the
state in |trieq_c|, |trieq_l|, and |trieq_r| will fit. This is normally
done by going through the linked list of unoccupied cells and testing if
the state will fit at each position. However if a state has too many
transitions (and is therefore unlikely to fit among existing
transitions) we don't bother and instead just pack it immediately to the
right of the occupied region (starting at |trie_max+1|).
@p function first_fit: trie_pointer;
label found, not_found;
var s, @!t: trie_pointer; @!q: q_index;
begin @<Set |s| to the trie base location at which this state should be
packed@>;
for q:=1 to qmax do {pack it}
begin t:=s+q_char(q);@/
trie_link(trie_back(t)):=trie_link(t);
trie_back(trie_link(t)):=trie_back(t); {link around
filled cell}
trie_char(t):=si(q_char(q));
trie_link(t):=q_link(q);
trie_outp(t):=q_outp(q);
if t>trie_max then trie_max:=t;
end;
trie_base_used(s):=true;
first_fit:=s
end;
@ The threshold for large states is initially 5 transitions. If more than
one level of patterns is being generated, the threshold is set to 7 on
subsequent levels because the pattern trie will be sparser after bad
patterns are deleted (see |delete_bad_patterns|).
@<Set |s| to the trie base location at which this state should be packed@>=
if qmax>qmax_thresh then t:=trie_back(trie_max+1) @+else t:=0;
loop begin t:=trie_link(t); s:=t-q_char(1); {get next unoccupied cell}
@<Ensure |trie| linked up to |s+num_ASCII_codes|@>;
if trie_base_used(s) then goto not_found;
for q:=qmax downto 2 do {check if state fits here}
if trie_char(s+q_char(q))<>min_packed then goto not_found;
goto found;
not_found: end;
found:
@ The trie is only initialized (as a doubly linked list of empty cells) as
far as necessary. Here we extend the initialization if necessary, and
check for overflow.
@<Ensure |trie| linked up to |s+num_ASCII_codes|@>=
if s>trie_size-num_ASCII_codes then
overflow(trie_size:1,' pattern trie nodes');
while trie_bmax<s do
begin incr(trie_bmax);
trie_base_used(trie_bmax):=false;
trie_char(trie_bmax+last_ASCII_code):=min_packed;
trie_link(trie_bmax+last_ASCII_code):=trie_bmax+num_ASCII_codes;
trie_back(trie_bmax+num_ASCII_codes):=trie_bmax+last_ASCII_code;
end
@ The |unpack| procedure finds all transitions associated with the state
with base |s|, puts them into the arrays |trieq_c|, |trieq_l|, and
|trieq_r|, and sets |qmax| to one more than the number of transitions
found. Freed cells are put at the beginning of the free list.
@p procedure unpack(@!s: trie_pointer);
var c: internal_code; @!t: trie_pointer;
begin qmax:=1;
for c:=cmin to cmax do {search for transitions belonging to this state}
begin t:=s+c;
if so(trie_char(t))=c then {found one}
begin q_char(qmax):=c;
q_link(qmax):=trie_link(t);
q_outp(qmax):=trie_outp(t);
incr(qmax);@/
{now free trie node}
trie_back(trie_link(0)):=t;
trie_link(t):=trie_link(0);
trie_link(0):=t;
trie_back(t):=0;
trie_char(t):=min_packed;
end;
end;
trie_base_used(s):=false;
end;
@ The function |new_trie_op| returns the `opcode' for the output
consisting of hyphenation value~|v|, hyphen position |d|, and next output
|n|. The hash function used by |new_trie_op| is based on the idea that
313/510 is an approximation to the golden ratio [cf.\ {\sl The Art of
Computer Programming \bf3} (1973), 510--512]; but the choice is
comparatively unimportant in this particular application.
@p function new_trie_op(@!v: val_type; @!d: dot_type; @!n: op_type): op_type;
label exit;
var h: op_type;
begin h:=((n+313*d+361*v) mod max_ops)+1; {trial hash location}
loop begin if hyf_val(h)=0 then {empty position found}
begin incr(op_count);
if op_count=max_ops then overflow(max_ops:1,' outputs');
hyf_val(h):=v; hyf_dot(h):=d; hyf_nxt(h):=n; new_trie_op:=h; return;
end;
if (hyf_val(h)=v) and (hyf_dot(h)=d) and
(hyf_nxt(h)=n) then {already in hash table}
begin new_trie_op:=h; return;
end;
if h>1 then decr(h) @+else h:=max_ops; {try again}
end;
exit: end;
@ @<Globals...@>=
@!pat: array[dot_type] of internal_code; {current pattern}
@!pat_len: dot_type; {pattern length}
@ Now that we have provided the necessary routines for manipulating the
dynamic packed trie, here is a procedure that inserts a pattern of length
|pat_len|, stored in the |pat| array, into the pattern trie. It also adds
a new output.
@p procedure insert_pattern(@!val: val_type; @!dot: dot_type);
var i: dot_type; @!s, @!t: trie_pointer;
begin i:=1;
s:=trie_root+pat[i]; t:=trie_link(s);
while (t>0) and (i<pat_len) do {follow existing trie}
begin incr(i); Incr(t)(pat[i]);
if so(trie_char(t))<>pat[i] then
@<Insert critical transition, possibly repacking@>;
s:=t; t:=trie_link(s);
end;
q_link(1):=0; q_outp(1):=0; qmax:=1;
while i<pat_len do {insert rest of pattern}
begin incr(i); q_char(1):=pat[i];
t:=first_fit;
trie_link(s):=t;
s:=t+pat[i];
incr(trie_count);
end;
trie_outp(s):=new_trie_op(val,dot,trie_outp(s));
end;
@ We have accessed a transition not in the trie. We insert it, repacking
the state if necessary.
@<Insert critical transition, possibly repacking@>=
begin if trie_char(t)=min_packed then
begin {we're lucky, no repacking needed}
trie_link(trie_back(t)):=trie_link(t);
trie_back(trie_link(t)):=trie_back(t);@/
trie_char(t):=si(pat[i]);
trie_link(t):=0;
trie_outp(t):=0;
if t>trie_max then trie_max:=t;
end
else begin {whoops, have to repack}
unpack(t-pat[i]);@/
q_char(qmax):=pat[i];
q_link(qmax):=0;
q_outp(qmax):=0;@/
t:=first_fit;
trie_link(s):=t;
Incr(t)(pat[i]);
end;
incr(trie_count);
end
@* Routines for pattern count trie.
The pattern count trie is used to store the set of patterns considered in
the current pass, along with the counts of good and bad instances. The
fields of this trie are the same as the pattern trie, except that there is
no output field, and leaf nodes are also used to store counts
(|triec_good| and |triec_bad|). Except where noted, the following
routines are analogous to the pattern trie routines.
@<Globals...@>=
@!triec_max, @!triec_bmax, @!triec_count: triec_pointer; {same as for pattern
trie}
@!triec_kmax: triec_pointer; {shows growth of trie during pass}
@!pat_count: integer; {number of patterns in count trie}
@ [See |init_pattern_trie|.] The variable |triec_kmax| always contains
the size of the count trie rounded up to the next multiple of 4096, and is
used to show the growth of the trie during each pass.
@d triec_root=1
@p procedure init_count_trie;
var c: internal_code;
begin for c:=0 to last_ASCII_code do
begin triec_char(triec_root+c):=si(c);@/
triec_link(triec_root+c):=0;
triec_back(triec_root+c):=0;
triec_base_used(triec_root+c):=false;
end;
triec_base_used(triec_root):=true;
triec_bmax:=triec_root; triec_max:=triec_root+last_ASCII_code;
triec_count:=num_ASCII_codes; triec_kmax:=4096;@/
triec_link(0):=triec_max+1; triec_back(triec_max+1):=0;@/
pat_count:=0;
end;
@ [See |first_fit|.]
@p function firstc_fit: triec_pointer;
label found, not_found;
var a, @!b: triec_pointer; @!q: q_index;
begin @<Set |b| to the count trie base location at which this state should
be packed@>;
for q:=1 to qmax do {pack it}
begin a:=b+q_char(q);@/
triec_link(triec_back(a)):=triec_link(a);
triec_back(triec_link(a)):=triec_back(a);@/
triec_char(a):=si(q_char(q));
triec_link(a):=q_link(q);
triec_back(a):=q_back(q);
if a>triec_max then triec_max:=a;
end;
triec_base_used(b):=true;
firstc_fit:=b
end;
@ The threshold for attempting a first-fit packing is 3 transitions, which
is lower than for the pattern trie because speed is more important here.
@<Set |b| to the count trie base location...@>=
if qmax>3 then a:=triec_back(triec_max+1) @+else a:=0;
loop begin a:=triec_link(a); b:=a-q_char(1);@/
@<Ensure |triec| linked up to |b+num_ASCII_codes|@>;
if triec_base_used(b) then goto not_found;
for q:=qmax downto 2 do
if triec_char(b+q_char(q))<>min_packed then goto not_found;
goto found;
not_found: end;
found:
@ @<Ensure |triec| linked up to |b+num_ASCII_codes|@>=
if b>triec_kmax-num_ASCII_codes then
begin if triec_kmax=triec_size then
overflow(triec_size:1,' count trie nodes');
print(triec_kmax div 1024:1, 'K ');
if triec_kmax>triec_size-4096 then triec_kmax:=triec_size
else Incr(triec_kmax)(4096);
end;
while triec_bmax<b do
begin incr(triec_bmax);
triec_base_used(triec_bmax):=false;
triec_char(triec_bmax+last_ASCII_code):=min_packed;
triec_link(triec_bmax+last_ASCII_code):=triec_bmax+num_ASCII_codes;
triec_back(triec_bmax+num_ASCII_codes):=triec_bmax+last_ASCII_code;
end
@ [See |unpack|.]
@p procedure unpackc(@!b: triec_pointer);
var c: internal_code; @!a: triec_pointer;
begin qmax:=1;
for c:=cmin to cmax do {search for transitions belonging to this state}
begin a:=b+c;
if so(triec_char(a))=c then {found one}
begin q_char(qmax):=c;
q_link(qmax):=triec_link(a);
q_back(qmax):=triec_back(a);
incr(qmax);@/
triec_back(triec_link(0)):=a;
triec_link(a):=triec_link(0);
triec_link(0):=a; triec_back(a):=0;
triec_char(a):=min_packed;
end;
end;
triec_base_used(b):=false;
end;
@ [See |insert_pattern|.] Patterns being inserted into the count trie are
always substrings of the current word, so they are contained in the array
|word| with length |pat_len| and finishing position |fpos|.
@p function insertc_pat(@!fpos: word_index): triec_pointer;
var spos: word_index; @!a, @!b: triec_pointer;
begin spos:=fpos-pat_len; {starting position of pattern}
incr(spos); b:=triec_root+word[spos]; a:=triec_link(b);
while (a>0) and (spos<fpos) do {follow existing trie}
begin incr(spos); Incr(a)(word[spos]);
if so(triec_char(a))<>word[spos] then
@<Insert critical count transition, possibly repacking@>;
b:=a; a:=triec_link(a);
end;
q_link(1):=0; q_back(1):=0; qmax:=1;
while spos<fpos do {insert rest of pattern}
begin incr(spos); q_char(1):=word[spos];
a:=firstc_fit;
triec_link(b):=a;
b:=a+word[spos];
incr(triec_count);