-
Notifications
You must be signed in to change notification settings - Fork 125
/
formatting.py
1863 lines (1452 loc) · 71 KB
/
formatting.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
"""
Defines functions useful for formatting code or text according to SE standards, for calculating
several text-level statistics like reading ease, and for adding semantics.
"""
from copy import deepcopy
import html.entities
import math
import string
import unicodedata
from pathlib import Path
from typing import Dict, Union, List, Tuple, Optional
import regex
import roman
import tinycss2
from lxml import etree
from titlecase import titlecase as pip_titlecase
from unidecode import unidecode
import se
from se.easy_xml import EasyXmlTree, EasyXmlElement
# This list of phrasing tags is not intended to be exhaustive. The list is only used
# to resolve the uncommon situation where there is no plain text in a paragraph. The
# span and br tags are explicitly omitted because of how they are used in poetry formatting,
# which differs from the normal formatting.
# Tags are in Clark notation ({namespace}localname) so they can be compared
# directly against lxml element .tag values.
PHRASING_TAGS = [
	"{http://www.w3.org/1999/xhtml}a",
	"{http://www.w3.org/1999/xhtml}abbr",
	"{http://www.w3.org/1999/xhtml}b",
	"{http://www.w3.org/1999/xhtml}cite",
	"{http://www.w3.org/1999/xhtml}em",
	"{http://www.w3.org/1999/xhtml}i",
	"{http://www.w3.org/1999/xhtml}strong",
]
def semanticate(xhtml: str) -> str:
	"""
	Add semantics to well-formed XHTML

	INPUTS
	xhtml: A string of well-formed XHTML

	OUTPUTS
	A string of XHTML with semantics added.
	"""

	tree = EasyXmlTree(xhtml)

	# Walk every top-level element of the body; the recursion tracks the
	# ancestor tag names so rules can be suppressed inside e.g. <abbr>
	for top_level_element in tree.xpath("/html/body/*"):
		_semanticate_element(top_level_element, ancestors=["html", "body"])

	return tree.to_string()
def _semanticate_element(el: EasyXmlElement, ancestors: List[str]):
	"""
	Recursive helper function to semanticate an XML element and its descendants.

	INPUTS
	el: The element to process.
	ancestors: Tag names of the elements enclosing el. Mutated in place: el's
	own tag is pushed on entry and popped before returning.

	OUTPUTS
	None. The element tree is modified in place.
	"""

	# Keep track of ancestors (to exclude rules inside <abbr>, etc.)
	ancestors.append(el.tag)

	def sub_fn(pattern: str, repl: str, **kwargs):
		# Finds pattern in the text of this element and the tail of its children
		# Replaces with repl, which may introduce new XML elements
		# Additional arguments are passed to regex.sub
		if el.text:
			# Apply the regex to the text before the first child
			new_text, new_els = _sub_elements(pattern, repl, el.text, **kwargs)
			el.text = new_text
			# Insert new elements before the first child; iterating in reverse
			# preserves their document order when inserting at index 0
			for new_el in reversed(new_els):
				el.lxml_element.insert(0, new_el.lxml_element)
		for child in el.children:
			if child.tail:
				# Apply the regex to the text after this child
				new_tail, new_els = _sub_elements(pattern, repl, child.tail, **kwargs)
				if child.tail != new_tail or new_els:
					child.tail = new_tail
					# Insert new elements after this child
					# NOTE(review): inserting forward at a fixed index appears to
					# reverse the document order of multiple new elements (contrast
					# the reversed() loop above, which compensates for this at
					# index 0) — confirm with a replacement that emits 2+ elements
					for new_el in new_els:
						el.lxml_element.insert(el.lxml_element.index(child.lxml_element) + 1, new_el.lxml_element)

	# Run the semanticate rules, using sub_fn as a callback to make the replacements
	_semanticate_rules(el, ancestors, sub_fn)

	# Recursively process all children
	for child in el.children:
		_semanticate_element(child, ancestors)

	ancestors.pop()
# Translation table mapping the five XML special characters to their
# predefined entity references. The scraped source mapped each character to
# itself (and contained a stray triple quote), which is both useless and a
# syntax error; the table must produce entities so that text run through
# regex substitution can be re-parsed as XML by _sub_elements().
table = str.maketrans({
	"<": "&lt;",
	">": "&gt;",
	"&": "&amp;",
	"'": "&apos;",
	'"': "&quot;",
})

def xmlescape(txt):
	"""
	Escape the five XML special characters in txt with their entity references.

	INPUTS
	txt: The plain-text string to escape.

	OUTPUTS
	The escaped string, safe to embed in an XML document.
	"""
	return txt.translate(table)
def _sub_elements(pattern: str, repl: str, text: str, **kwargs) -> Tuple[str, List[EasyXmlElement]]:
	"""
	Helper function to apply a regex to a plain-text string, where the
	replacement may introduce new XML elements.

	INPUTS
	pattern: The regex pattern to search for.
	repl: The replacement string, which may contain XML markup.
	text: The plain (unescaped) text to operate on.
	kwargs: Additional arguments passed through to regex.sub.

	OUTPUTS
	A 2-tuple of (leading text, list of new EasyXmlElement nodes). If the
	pattern made no change, the original text and an empty list are returned.
	Note: the previous annotation (List[Union[str, EasyXmlElement]]) was
	wrong — the function has always returned a 2-tuple.
	"""

	# Escape the text so the substitution result can be parsed back as XML
	escaped_text = xmlescape(text)
	new_text = regex.sub(pattern, repl, escaped_text, **kwargs)

	if new_text != escaped_text:
		# Parse the substituted markup under a temporary root so any elements the
		# replacement introduced become real nodes; the epub namespace must be
		# declared for epub:type attributes in the replacement to parse
		root = EasyXmlElement(f"""<root xmlns:epub="http://www.idpf.org/2007/ops">{new_text}</root>""")
		return root.text, root.children

	# No change: return the original (unescaped) text and no new elements
	return text, []
def _semanticate_rules(el, ancestors, sub):
	"""
	Helper function to apply the semanticate rules to an XML element.

	INPUTS
	el: The element whose text is being semanticated.
	ancestors: Tag names of the elements enclosing el; used to suppress rules
	inside <abbr> and <span> elements.
	sub: A callback with the signature of regex.sub (minus the target string)
	that applies each substitution to el's text and its children's tails.

	OUTPUTS
	None. All changes are applied through the sub callback.
	"""

	if "abbr" not in ancestors:
		# Some common abbreviations
		sub(r"(?<!(?:\.|\B))(\L<titles>\.)", r"""<abbr epub:type="z3998:name-title">\1</abbr>""", titles=[
			"Capt",
			"Col",
			"Dr",
			"Drs",
			"Esq",
			"Fr",
			"Hon",
			"Lieut",
			"Lt",
			"MM",
			"Mdlle",
			"Messers",
			"Messrs",
			"Mlle",
			"Mlles",
			"Mme",
			"Mmes",
			"Mon",
			"Mr",
			"Mrs",
			"Ms",
			"Prof",
			"Pvt",
			"Rev",
		])
		# NOTE(review): regex-module named lists (\L<…>) match their items as
		# literal strings, so items containing regex syntax like "[Vv]ols?" may
		# not behave as patterns — confirm intended behavior
		sub(r"(?<!(?:\.|\B))(\L<abbreviations>\.)", r"""<abbr>\1</abbr>""", abbreviations=[
			"Bros",
			"Mt",
			"[Vv]ols?",
			"Co",
			"Inc",
			"Ltd",
			"St",
			"[Gg]ov",
			"MSS?",
			"[Vv]iz",
			"etc",
			"[Cc]f", # Fixed: was "[Cc])f" — the stray `)` was a typo (cf. `([Cc])f\.` in semanticate_old)
			"ed",
			"(?:Jan\.|Feb\.|Mar\.|Apr\.|Jun\.|Jul\.|Aug\.|Sep\.|Sept\.|Oct\.|Nov\.|Dec\.)",
			"[Vv]s",
			"[Ff]f", # ff. typically used in footnotes, means "and following"
			"[Ll]ib", # Lib. = Liber = Book
		])
		sub(r"(?<!(?:\.|\B))(No\.)(\s+[0-9]+)", r"<abbr>\1</abbr>\2")
		sub(r"(?<!(?:\.|\B))([Cc])hap\. ([0-9])", r"<abbr>\1hap.</abbr> \2") # The number allows us to avoid phrases like `Hello, old chap.`
		sub(r"(?<!(?:\.|\B))(P\.(?:P\.)?S\.(?:S\.)?\B)", r"""<abbr epub:type="z3998:initialism">\1</abbr>""")
		sub(r"(?<!(?:\.|\B))inst\.", r"""<abbr xml:lang="la">inst.</abbr>""") # `inst.` is short for `instante mense` but it is not italicized
		sub(r"(?<!(?:\.|\B))([Ii])\.e\.", r"""<abbr epub:type="z3998:initialism">\1.e.</abbr>""")
		sub(r"(?<!(?:\.|\B))([Ee])\.g\.", r"""<abbr epub:type="z3998:initialism">\1.g.</abbr>""")
		sub(r"(?<!(?:\.|\B))\bN\.?B\.\B", r"""<abbr epub:type="z3998:initialism">N.B.</abbr>""")
		sub(r"(?<!(?:\.|\B))Ph\.?\s*D\.?", r"""<abbr epub:type="z3998:name-title">Ph. D.</abbr>""")
		sub(r"(?<!(?:\.|\B))(?:IOU(?:\.|\b)|I\.O\.U\.)", r"""<abbr epub:type="z3998:initialism">I.O.U.</abbr>""")
		sub(r"(?<!(?:\.|\B))\b([1-4]D)\b", r"""<abbr epub:type="z3998:initialism">\1</abbr>""")
		sub(r"(?<!(?:\.|\B))(Thos\.|Jas\.|Chas\.|Wm\.)", r"""<abbr epub:type="z3998:given-name">\1</abbr>""")
		sub(r"(?<!(?:\.|\B))([ap])\.\s?m\.", r"<abbr>\1.m.</abbr>")
		sub(r"(?<!(?:\.|\B))(4to|8vo|12mo|16mo|18mo|32mo|48mo|64mo)(?:\.(\s+\p{Lowercase_Letter}))?", r"<abbr>\1</abbr>\2") # Book sizes
		sub(r"(?<!(?:\.|\B))([0-9]{1,2})\s?[Aa]\.?\s?[Mm](?:\.|\b)", r"\1 <abbr>a.m.</abbr>")
		sub(r"(?<!(?:\.|\B))([0-9]{1,2})\s?[Pp]\.?\s?[Mm](?:\.|\b)", r"\1 <abbr>p.m.</abbr>")
		# this should be placed after the am/pm test, to prevent tagging just the p. in "p. m."
		sub(r"(?<!(?:\.|\B))p(p?)\.([\s0-9])", r"<abbr>p\1.</abbr>\2")

		# keep a period after TV that terminates a clause
		if el.tag == "p":
			sub(r"(?<!(?:\.|\B))T\.?V\.([”’]?)$", r"""<abbr epub:type="z3998:initialism">TV</abbr>.\1""")
			sub(r"(?<!(?:\.|\B))T\.?V\.(\s+[“‘]?[\p{Uppercase_Letter}])", r"""<abbr epub:type="z3998:initialism">TV</abbr>.\1""")
		# otherwise, get rid of any periods in TV
		sub(r"(?<!(?:\.|\B))(?:TV\b|T\.V\.\B)", r"""<abbr epub:type="z3998:initialism">TV</abbr>""")

		# keep a period after AD/BC that terminates a clause
		if el.tag == "p":
			sub(r"(?<!(?:\.|\B))A\.?D\.([”’]?)$", r"""<abbr epub:type="se:era">AD</abbr>.\1""")
			sub(r"(?<!(?:\.|\B))B\.?C\.([”’]?)$", r"""<abbr epub:type="se:era">BC</abbr>.\1""")
			sub(r"(?<!(?:\.|\B))A\.?D\.([”’]?</p>|\s+[“‘]?[\p{Uppercase_Letter}])", r"""<abbr epub:type="se:era">AD</abbr>.\1""")
			sub(r"(?<!(?:\.|\B))B\.?C\.([”’]?</p>|\s+[“‘]?[\p{Uppercase_Letter}])", r"""<abbr epub:type="se:era">BC</abbr>.\1""")
		# otherwise, get rid of any periods in AD/BC
		sub(r"(?<!(?:\.|\B))(?:AD\b|A\.D\.\B)", r"""<abbr epub:type="se:era">AD</abbr>""")
		sub(r"(?<!(?:\.|\B))(?:BC\b|B\.C\.\B)", r"""<abbr epub:type="se:era">BC</abbr>""")

		# Wrap £sd shorthand
		sub(r"([0-9½¼¾⅙⅚⅛⅜⅝⅞]+)([sd]\.)", r"\1<abbr>\2</abbr>")

		# Add abbrevations around some SI measurements
		sub(r"([0-9]+)\s*([cmk][mgl])\b", fr"\1{se.NO_BREAK_SPACE}<abbr>\2</abbr>")

		# Add abbrevations around Imperial measurements
		sub(r"(?<![\$£0-9,])([0-9½¼⅙⅚⅛⅜⅝⅞]+)\s*(ft|yd|mi|pt|qt|gal|oz|lbs)\.?\b", fr"\1{se.NO_BREAK_SPACE}<abbr>\2.</abbr>")

		# Handle `in.` separately to require a period, because with an optional period there are too many false positives
		sub(r"(?<![\$£0-9,])([0-9½¼⅙⅚⅛⅜⅝⅞]+)\s*in\.(\b|\s)", fr"\1{se.NO_BREAK_SPACE}<abbr>in.</abbr>")

		# Tweak some other Imperial measurements
		sub(r"([0-9]+)\s*m\.?p\.?h\.?", fr"\1{se.NO_BREAK_SPACE}<abbr>mph</abbr>", flags=regex.IGNORECASE)
		sub(r"([0-9]+)\s*h\.?p\.?", fr"\1{se.NO_BREAK_SPACE}<abbr>hp</abbr>", flags=regex.IGNORECASE)

	if el.tag == "abbr":
		# add eoc (End Of Clause) class
		eoc = False
		# sub(r"<abbr>etc\.</abbr>([”’]?(?:</p>|\s+[“‘]?[\p{Uppercase_Letter}]))", r"""<abbr class="eoc">etc.</abbr>\1""")
		# sub(r"""<abbr( epub:type="[^"]+")?>([^<]+\.)</abbr>([”’]?</p>)""", r"""<abbr class="eoc"\1>\2</abbr>\3""")
		# `etc.` followed by a capitalized word ends a clause
		if el.text == "etc." and not el.children:
			if el.tail and regex.match(r"[”’]?\s+[“‘]?[\p{Uppercase_Letter}]", el.tail):
				eoc = True
		# An abbreviation ending in a period that is the last thing in a <p> ends a clause
		if el.text.endswith(".") and not el.children:
			if el.parent.tag == "p" and el.next is None and (not el.tail or el.tail in "”’"):
				eoc = True
		if eoc and "eoc" not in (el.get_attr("class") or ""):
			el.add_attr_value("class", "eoc")
		# sort attributes
		# NOTE(review): dict() preserves insertion order and does not sort —
		# confirm whether sorted attributes were actually intended here
		el.attrs = dict(el.attrs)

	if "abbr" not in ancestors and "span" not in ancestors:
		# Get Roman numerals >= 2 characters
		# Ignore "numerals" followed by a dash, as they are more likely something like `x-ray` or `v-shaped`
		# Note that `j` may occur only at the end of a numeral as an old-fashioned terminal `i`, like int `ij` (2), `vij` (7)
		sub(r"([^\p{Letter}])([ixvIXV]{2,}j?)(\b[^\-]|st\b|nd\b|rd\b|th\b)", r"""\1<span epub:type="z3998:roman">\2</span>\3""")

		# Get Roman numerals that are X or V and single characters. We can't do I for obvious reasons.
		sub(r"""([^\p{Letter}\"])([vxVX])(\b[^\-]|st\b|nd\b|rd\b|th\b)""", r"""\1<span epub:type="z3998:roman">\2</span>\3""")

		# We can assume a lowercase i is always a Roman numeral unless followed by ’
		sub(r"""([^\p{Letter}<>/\"])i\b(?!’)""", r"""\1<span epub:type="z3998:roman">i</span>""")
def _semanticate_todo():
	"""
	Temporary holding place for unconverted functions from the old semanticate.

	NOTE(review): this function is inert scratch code — it reads an `xhtml`
	variable that is never defined or passed in, so calling it raises
	NameError. It exists only to hold rules not yet ported to the
	element-based semanticate; do not call it.
	"""

	# Fix obscured names starting with I, V, or X
	xhtml = regex.sub(fr"""<span epub:type="z3998:roman">([IVX])</span>{se.WORD_JOINER}⸺""", fr"""\1{se.WORD_JOINER}⸺""", xhtml)

	# Fix some possible errors introduced by the above
	xhtml = regex.sub(fr"((?:[Nn]o\.|[Nn]umber)\s[0-9]+){se.NO_BREAK_SPACE}<abbr>in\.</abbr>", r"\1 in", xhtml)

	# We may have added HTML tags within title tags. Remove those here
	matches = regex.findall(r"<title>.+?</title>", xhtml)
	if matches:
		xhtml = regex.sub(r"<title>.+?</title>", f"<title>{se.formatting.remove_tags(matches[0])}</title>", xhtml)

	return xhtml
def semanticate_old(xhtml: str) -> str:
	"""
	Add semantics to well-formed XHTML

	This is the older, string-based implementation: each rule is a regex.sub
	over the serialized XHTML, and the negative lookbehind in most patterns
	prevents re-wrapping text that is already inside an <abbr> tag. Rule
	order is significant (e.g. the p./pp. rule must follow the a.m./p.m.
	rules).

	INPUTS
	xhtml: A string of well-formed XHTML

	OUTPUTS
	A string of XHTML with semantics added.
	"""

	# Some common abbreviations
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))(\L<titles>\.)", r"""<abbr epub:type="z3998:name-title">\1</abbr>""", xhtml, titles=[
		"Capt",
		"Col",
		"Dr",
		"Drs",
		"Esq",
		"Fr",
		"Hon",
		"Lieut",
		"Lt",
		"MM",
		"Mdlle",
		"Messers",
		"Messrs",
		"Mlle",
		"Mlles",
		"Mme",
		"Mmes",
		"Mon",
		"Mr",
		"Mrs",
		"Ms",
		"Prof",
		"Pvt",
		"Rev",
	])
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))Bros\.", r"<abbr>Bros.</abbr>", xhtml)
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))Mt\.", r"<abbr>Mt.</abbr>", xhtml)
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))([Vv])ol(s?)\.", r"<abbr>\1ol\2.</abbr>", xhtml)
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))([Cc])hap\. ([0-9])", r"<abbr>\1hap.</abbr> \2", xhtml) # The number allows us to avoid phrases like `Hello, old chap.`
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>)|\.)(P\.(?:P\.)?S\.(?:S\.)?\B)", r"""<abbr epub:type="z3998:initialism">\1</abbr>""", xhtml)
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))Co\.", r"<abbr>Co.</abbr>", xhtml)
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))Inc\.", r"<abbr>Inc.</abbr>", xhtml)
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))Ltd\.", r"<abbr>Ltd.</abbr>", xhtml)
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))St\.", r"<abbr>St.</abbr>", xhtml)
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))([Gg])ov\.", r"<abbr>\1ov.</abbr>", xhtml)
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))MS(S?)\.", r"""<abbr>MS\1.</abbr>""", xhtml)
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))([Vv])iz\.", r"<abbr>\1iz.</abbr>", xhtml)
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))etc\.", r"<abbr>etc.</abbr>", xhtml)
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))inst\.", r"""<abbr xml:lang="la">inst.</abbr>""", xhtml) # `inst.` is short for `instante mense` but it is not italicized
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))([Cc])f\.", r"<abbr>\1f.</abbr>", xhtml)
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))ed\.", r"<abbr>ed.</abbr>", xhtml)
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))(Jan\.|Feb\.|Mar\.|Apr\.|Jun\.|Jul\.|Aug\.|Sep\.|Sept\.|Oct\.|Nov\.|Dec\.)", r"<abbr>\1</abbr>", xhtml)
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))No\.(\s+[0-9]+)", r"<abbr>No.</abbr>\1", xhtml)
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))([Vv])s\.", r"<abbr>\1s.</abbr>", xhtml)
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))([Ff])f\.", r"<abbr>\1f.</abbr>", xhtml) # ff. typically used in footnotes, means "and following"
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))([Ll])ib\.", r"<abbr>\1ib.</abbr>", xhtml) # Lib. = Liber = Book
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))([Ii])\.e\.", r"""<abbr epub:type="z3998:initialism">\1.e.</abbr>""", xhtml)
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))([Ee])\.g\.", r"""<abbr epub:type="z3998:initialism">\1.g.</abbr>""", xhtml)
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))\bN\.?B\.\B", r"""<abbr epub:type="z3998:initialism">N.B.</abbr>""", xhtml)
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))Ph\.?\s*D\.?", r"""<abbr epub:type="z3998:name-title">Ph. D.</abbr>""", xhtml)
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))(?:IOU(?:\.|\b)|I\.O\.U\.)", r"""<abbr epub:type="z3998:initialism">I.O.U.</abbr>""", xhtml)
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))\b([1-4]D)\b", r"""<abbr epub:type="z3998:initialism">\1</abbr>""", xhtml)
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))(Thos\.|Jas\.|Chas\.|Wm\.)", r"""<abbr epub:type="z3998:given-name">\1</abbr>""", xhtml)
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))([ap])\.\s?m\.", r"<abbr>\1.m.</abbr>", xhtml)
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))(4to|8vo|12mo|16mo|18mo|32mo|48mo|64mo)(?:\.(\s+\p{Lowercase_Letter}))?", r"<abbr>\1</abbr>\2", xhtml) # Book sizes
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))([0-9]{1,2})\s?[Aa]\.?\s?[Mm](?:\.|\b)", r"\1 <abbr>a.m.</abbr>", xhtml)
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))([0-9]{1,2})\s?[Pp]\.?\s?[Mm](?:\.|\b)", r"\1 <abbr>p.m.</abbr>", xhtml)

	# this should be placed after the am/pm test, to prevent tagging just the p. in "p. m."
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))p(p?)\.([\s0-9])", r"<abbr>p\1.</abbr>\2", xhtml)

	# keep a period after TV that terminates a clause
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))T\.?V\.([”’]?</p>|\s+[“‘]?[\p{Uppercase_Letter}])", r"""<abbr epub:type="z3998:initialism">TV</abbr>.\1""", xhtml)

	# otherwise, get rid of any periods in TV
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))(?:TV\b|T\.V\.\B)", r"""<abbr epub:type="z3998:initialism">TV</abbr>""", xhtml)

	# keep a period after AD/BC that terminates a clause
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))A\.?D\.([”’]?</p>|\s+[“‘]?[\p{Uppercase_Letter}])", r"""<abbr epub:type="se:era">AD</abbr>.\1""", xhtml)
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))B\.?C\.([”’]?</p>|\s+[“‘]?[\p{Uppercase_Letter}])", r"""<abbr epub:type="se:era">BC</abbr>.\1""", xhtml)

	# otherwise, get rid of any periods in AD/BC
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))(?:AD\b|A\.D\.\B)", r"""<abbr epub:type="se:era">AD</abbr>""", xhtml)
	xhtml = regex.sub(r"(?<!(?:\.|\B|\<abbr[^>]*?\>))(?:BC\b|B\.C\.\B)", r"""<abbr epub:type="se:era">BC</abbr>""", xhtml)

	# Wrap £sd shorthand
	xhtml = regex.sub(r"([0-9½¼¾⅙⅚⅛⅜⅝⅞]+)([sd]\.)", r"\1<abbr>\2</abbr>", xhtml)

	# add eoc (End Of Clause) class
	xhtml = regex.sub(r"<abbr>etc\.</abbr>([”’]?(?:</p>|\s+[“‘]?[\p{Uppercase_Letter}]))", r"""<abbr class="eoc">etc.</abbr>\1""", xhtml)
	xhtml = regex.sub(r"""<abbr( epub:type="[^"]+")?>([^<]+\.)</abbr>([”’]?</p>)""", r"""<abbr class="eoc"\1>\2</abbr>\3""", xhtml)

	# We may have added eoc classes twice, so remove duplicates here
	xhtml = regex.sub(r"""<abbr class="(.*) eoc(\s+eoc)+">""", r"""<abbr class="\1 eoc">""", xhtml)

	# Clean up nesting errors
	xhtml = regex.sub(r"""<abbr class="eoc"><abbr>([^<]+)</abbr></abbr>""", r"""<abbr class="eoc">\1</abbr>""", xhtml)
	xhtml = regex.sub(r"""class="eoc eoc""", r"""class="eoc""", xhtml)

	# Get Roman numerals >= 2 characters
	# We only wrap these if they're standalone (i.e. not already wrapped in a tag) to prevent recursion in multiple runs
	# Ignore "numerals" followed by a dash, as they are more likely something like `x-ray` or `v-shaped`
	# Note that `j` may occur only at the end of a numeral as an old-fashioned terminal `i`, like int `ij` (2), `vij` (7)
	xhtml = regex.sub(r"([^\p{Letter}>])([ixvIXV]{2,}j?)(\b[^\-]|st\b|nd\b|rd\b|th\b)", r"""\1<span epub:type="z3998:roman">\2</span>\3""", xhtml)

	# Get Roman numerals that are X or V and single characters. We can't do I for obvious reasons.
	xhtml = regex.sub(r"""([^\p{Letter}>\"])([vxVX])(\b[^\-]|st\b|nd\b|rd\b|th\b)""", r"""\1<span epub:type="z3998:roman">\2</span>\3""", xhtml)

	# We can assume a lowercase i is always a Roman numeral unless followed by ’
	xhtml = regex.sub(r"""([^\p{Letter}<>/\"])i\b(?!’)""", r"""\1<span epub:type="z3998:roman">i</span>""", xhtml)

	# Fix obscured names starting with I, V, or X
	xhtml = regex.sub(fr"""<span epub:type="z3998:roman">([IVX])</span>{se.WORD_JOINER}⸺""", fr"""\1{se.WORD_JOINER}⸺""", xhtml)

	# Add abbrevations around some SI measurements
	xhtml = regex.sub(r"([0-9]+)\s*([cmk][mgl])\b", fr"\1{se.NO_BREAK_SPACE}<abbr>\2</abbr>", xhtml)

	# Add abbrevations around Imperial measurements
	xhtml = regex.sub(r"(?<![\$£0-9,])([0-9½¼⅙⅚⅛⅜⅝⅞]+)\s*(ft|yd|mi|pt|qt|gal|oz|lbs)\.?\b", fr"\1{se.NO_BREAK_SPACE}<abbr>\2.</abbr>", xhtml)

	# Handle `in.` separately to require a period, because with an optional period there are too many false positives
	xhtml = regex.sub(r"(?<![\$£0-9,])([0-9½¼⅙⅚⅛⅜⅝⅞]+)\s*in\.(\b|\s)", fr"\1{se.NO_BREAK_SPACE}<abbr>in.</abbr>", xhtml)

	# Fix some possible errors introduced by the above
	xhtml = regex.sub(fr"((?:[Nn]o\.|[Nn]umber)\s[0-9]+){se.NO_BREAK_SPACE}<abbr>in\.</abbr>", r"\1 in", xhtml)

	# Tweak some other Imperial measurements
	xhtml = regex.sub(r"([0-9]+)\s*m\.?p\.?h\.?", fr"\1{se.NO_BREAK_SPACE}<abbr>mph</abbr>", xhtml, flags=regex.IGNORECASE)
	xhtml = regex.sub(r"([0-9]+)\s*h\.?p\.?", fr"\1{se.NO_BREAK_SPACE}<abbr>hp</abbr>", xhtml, flags=regex.IGNORECASE)

	# We may have added HTML tags within title tags. Remove those here
	matches = regex.findall(r"<title>.+?</title>", xhtml)
	if matches:
		xhtml = regex.sub(r"<title>.+?</title>", f"<title>{se.formatting.remove_tags(matches[0])}</title>", xhtml)

	return xhtml
def get_flesch_reading_ease(xhtml: str) -> float:
	"""
	Get the Flesch reading ease of some XHTML.

	INPUTS
	xhtml: A string of XHTML to calculate the reading ease of.

	OUTPUTS
	A float representing the Flesch reading ease of the text.
	"""

	# Free-form poetry (e.g. Mina Loy's) often lacks sentence-ending punctuation,
	# which can drive the score extremely low; terminate such paragraphs with a
	# full stop before scoring
	xhtml = regex.sub(r"([A-Za-z])(<\/span>\n)*\s*</p>", r"\1.\2</p>", xhtml)

	# Strip the <title> element and then all remaining markup
	stripped = regex.sub(r"<title>.+?</title>", " ", xhtml)
	stripped = regex.sub(r"<.+?>", " ", stripped, flags=regex.DOTALL)

	# Keep only letters plus whitespace, digits, and sentence-ending punctuation
	included_characters = list(string.whitespace) + list(string.digits) + [":", ";", ".", "?", "!"]
	processed_text = regex.sub(r"[—–\n]", " ", stripped.lower())
	processed_text = "".join(c for c in processed_text if c.isalpha() or c in included_characters).strip()

	# Strip combining accents
	processed_text = "".join(c for c in unicodedata.normalize("NFD", processed_text) if unicodedata.category(c) != "Mn")

	# Word count, floored at 1 to avoid division by zero
	word_count = max(se.formatting.get_word_count(processed_text), 1)

	# Average sentence length, ignoring "sentences" of two words or fewer
	sentences = regex.split(r" *[\.\?!]['\"\)\]]* *", processed_text)
	ignored = sum(1 for sentence in sentences if se.formatting.get_word_count(sentence) <= 2)
	sentence_count = max(len(sentences) - ignored, 1)
	average_sentence_length = round(float(word_count) / float(sentence_count), 1)

	# Average syllables per word
	syllable_count = sum(_get_syllable_count(word) for word in processed_text.split())
	average_syllables_per_word = round(float(syllable_count) / float(word_count), 1)

	# The standard Flesch reading-ease formula
	return round(206.835 - float(1.015 * average_sentence_length) - float(84.6 * average_syllables_per_word), 2)
def _get_syllable_count(word: str) -> int:
	"""
	Helper function to get the syllable count of a word.

	Rule-based heuristic: count the vowels in the word, discard silent or
	merged vowels, then adjust for known special cases. Callers (see
	get_flesch_reading_ease) pass lowercased, accent-stripped words.
	Rule order matters; rules are numbered in the comments below.

	INPUTS
	word: The word to count syllables of.

	OUTPUTS
	The estimated number of syllables, as an int.
	"""

	# See http://eayd.in/?p=232
	exception_add = ["serious", "crucial"] # Words the rules under-count; add one syllable
	exception_del = ["fortunately", "unfortunately"] # Words the rules over-count; remove one syllable
	co_one = ["cool", "coach", "coat", "coal", "count", "coin", "coarse", "coup", "coif", "cook", "coign", "coiffe", "coof", "court"] # co- prefixes pronounced as one syllable
	co_two = ["coapt", "coed", "coinci"] # co- prefixes pronounced as two syllables
	pre_one = ["preach"] # pre- prefixes pronounced as one syllable

	syls = 0 # Added syllable number
	disc = 0 # Discarded syllable number

	# 1) if letters < 3: return 1
	if len(word) <= 3:
		syls = 1
		return syls

	# 2) if doesn't end with "ted" or "tes" or "ses" or "ied" or "ies", discard "es" and "ed" at the end.
	# if it has only 1 vowel or 1 set of consecutive vowels, discard. (like "speed", "fled" etc.)
	if word[-2:] == "es" or word[-2:] == "ed":
		double_and_triple_1 = len(regex.findall(r"[eaoui][eaoui]", word))
		if double_and_triple_1 > 1 or len(regex.findall(r"[eaoui][^eaoui]", word)) > 1:
			if word[-3:] == "ted" or word[-3:] == "tes" or word[-3:] == "ses" or word[-3:] == "ied" or word[-3:] == "ies":
				pass
			else:
				disc += 1

	# 3) discard trailing "e", except where ending is "le"
	le_except = ["whole", "mobile", "pole", "male", "female", "hale", "pale", "tale", "sale", "aisle", "whale", "while"]
	if word[-1:] == "e":
		if word[-2:] == "le" and word not in le_except:
			pass
		else:
			disc += 1

	# 4) check if consecutive vowels exists, triplets or pairs, count them as one.
	# Note: regex.findall counts non-overlapping matches, so e.g. "aaa" counts
	# as one pair plus one triple here
	double_and_triple = len(regex.findall(r"[eaoui][eaoui]", word))
	tripple = len(regex.findall(r"[eaoui][eaoui][eaoui]", word))
	disc += double_and_triple + tripple

	# 5) count remaining vowels in word.
	num_vowels = len(regex.findall(r"[eaoui]", word))

	# 6) add one if starts with "mc"
	if word[:2] == "mc":
		syls += 1

	# 7) add one if ends with "y" but is not surrouned by vowel
	if word[-1:] == "y" and word[-2] not in "aeoui":
		syls += 1

	# 8) add one if "y" is surrounded by non-vowels and is not in the last word.
	for i, j in enumerate(word):
		if j == "y":
			if (i != 0) and (i != len(word) - 1): # pylint: disable=consider-using-in
				if word[i - 1] not in "aeoui" and word[i + 1] not in "aeoui":
					syls += 1

	# 9) if starts with "tri-" or "bi-" and is followed by a vowel, add one.
	# (Safe to index word[3]/word[2] here: rule 1 guarantees len(word) >= 4)
	if word[:3] == "tri" and word[3] in "aeoui":
		syls += 1
	if word[:2] == "bi" and word[2] in "aeoui":
		syls += 1

	# 10) if ends with "-ian", should be counted as two syllables, except for "-tian" and "-cian"
	if word[-3:] == "ian":
		# and (word[-4:] != "cian" or word[-4:] != "tian"):
		if word[-4:] == "cian" or word[-4:] == "tian":
			pass
		else:
			syls += 1

	# 11) if starts with "co-" and is followed by a vowel, check if exists in the double syllable dictionary, if not, check if in single dictionary and act accordingly.
	if word[:2] == "co" and word[2] in "eaoui":
		if word[:4] in co_two or word[:5] in co_two or word[:6] in co_two:
			syls += 1
		elif word[:4] in co_one or word[:5] in co_one or word[:6] in co_one:
			pass
		else:
			syls += 1

	# 12) if starts with "pre-" and is followed by a vowel, check if exists in the double syllable dictionary, if not, check if in single dictionary and act accordingly.
	if word[:3] == "pre" and word[3] in "eaoui":
		if word[:6] in pre_one:
			pass
		else:
			syls += 1

	# 13) check for "-n't" and cross match with dictionary to add syllable.
	negative = ["doesn't", "isn't", "shouldn't", "couldn't", "wouldn't", "doesn’t", "isn’t", "shouldn’t", "couldn’t", "wouldn’t"]
	if word[-3:] == "n't" or word[-3:] == "n’t":
		if word in negative:
			syls += 1
		else:
			pass

	# 14) Handling the exceptional words.
	if word in exception_del:
		disc += 1
	if word in exception_add:
		syls += 1

	# Calculate the output
	return num_vowels - disc + syls
def get_word_count(xhtml: str) -> int:
	"""
	Get the word count from an XHTML string.

	INPUTS
	xhtml: A string of XHTML

	OUTPUTS
	The number of words in the XHTML string.
	"""

	# Strip MathML, then the <title> element, then all remaining markup
	text = regex.sub(r"<(m:)?math.+?</(m:)?math>", " ", xhtml)
	text = regex.sub(r"<title[^>]*?>.+?</title>", " ", text)
	text = regex.sub(r"<.+?>", " ", text, flags=regex.DOTALL)

	# Treat various punctuation and space characters as plain word separators
	text = regex.sub(r"[…–—― ‘’“”\{\}\(\)]", " ", text)

	# Remove word-connecting dashes, apostrophes, commas, and slashes (and/or), they count as a word boundry but they shouldn't
	text = regex.sub(fr"[\p{{Letter}}0-9][\-\'\,\.\/{se.NO_BREAK_HYPHEN}{se.SHY_HYPHEN}][\p{{Letter}}0-9]", "aa", text)

	# Collapse runs of whitespace to single spaces
	text = regex.sub(r"\s+", " ", text)

	# Count the remaining words
	return len(regex.findall(r"\b\w+\b", text))
def _replace_character_references(match_object) -> str:
"""Replace most XML character references with literal characters.
This function excludes ", ', &, >, and < (&, <, and >), since
un-escaping them would create an invalid document.
"""
entity = match_object.group(0).lower()
retval = entity
# Explicitly whitelist the six (nine) essential character references
try:
if entity in [">", "<", "&", """, "'", ">", "<", "&", ">", "<", "&"]:
retval = entity
# Convert base 16 references
elif entity.startswith("&#x"):
retval = chr(int(entity[3:-1], 16))
# Convert base 10 references
elif entity.startswith("&#"):
retval = chr(int(entity[2:-1]))
# Convert named references
else:
retval = html.entities.html5[entity[1:]]
except (ValueError, KeyError):
pass
return retval
def _indent(tree, space="\t"):
	"""
	Indent an lxml tree using the given space characters.

	INPUTS
	tree: The root lxml element, modified in place.
	space: The string used for one level of indentation.

	OUTPUTS
	None.
	"""

	if len(tree) == 0:
		# No children: the root just gets a trailing newline
		tree.text = "\n"
		return

	# Seed the indentation cache with levels 0 and 1 and recurse
	newline = "\n"
	_indent_children(tree, 1, space, [newline, newline + space])
def _indent_children(elem, level, one_space, indentations, has_child_tails=False):
	"""
	Recursive helper function implementing indent levels for lxml tree.

	INPUTS
	elem: The element whose children are being indented; modified in place.
	level: The current indentation depth.
	one_space: The string for a single indentation level.
	indentations: Cache of indentation strings by level, grown as needed.
	has_child_tails: True when elem contains mixed content (significant text
	between children), in which case children stay on the same line.

	OUTPUTS
	None.
	"""

	# Reuse indentation strings for speed.
	if len(indentations) <= level:
		indentations.append(indentations[-1] + one_space)

	# Start a new indentation level for the first child.
	child_indentation = indentations[level]

	# Check if any children have tail content
	if not has_child_tails:
		if len(elem) > 0 and elem.text and not regex.match(r"^[\n\t ]+$", elem.text):
			has_child_tails = True
		else:
			for child in elem:
				if (child.tail and not regex.match(r"^[\n\t ]+$", child.tail)):
					has_child_tails = True
					break

	# If elem text is empty, start a new indentation level
	if not elem.text or regex.match(r"^[\n\t ]+$", elem.text):
		if has_child_tails:
			elem.text = ""
		else:
			elem.text = child_indentation
	else:
		_unwrap_text(elem, remove_trailing_space=False)

	# Recursively indent all children.
	for child in elem:
		if len(child) > 0:
			# Mixed content stays at the current level; otherwise descend one level
			if has_child_tails:
				next_level = level
			else:
				next_level = level + 1
			_indent_children(child, next_level, one_space, indentations, has_child_tails)

		next_child = child.getnext()

		# Remove line wraps and extra whitespace from child text (except meta tags)
		if child.text and not regex.match(r"^[\n\t ]+$", child.text):
			if child.tag is etree.Comment:
				# Re-indent each wrapped line of a comment to the child level
				child.text = regex.sub(r" *\n[\n\t ]*", child_indentation, child.text)
			elif child.tag != "{http://www.idpf.org/2007/opf}meta":
				_unwrap_text(child, remove_trailing_space=True)
				child.text = regex.sub(r"[\t ]+", " ", child.text)

		# Handle different cases for indentation in child tail content
		if not child.tail or regex.match(r"^[\n\t ]+$", child.tail):
			if next_child is None:
				# Last child: its tail is the closing tag's indentation (one level up)
				if has_child_tails:
					child.tail = ""
				else:
					child_indentation = indentations[level - 1]
					child.tail = child_indentation
			elif child.tag == "{http://www.w3.org/1999/xhtml}br":
				# NOTE(review): indentation reconstructed from a scrape that lost
				# leading whitespace — confirm the dedent applies only in mixed content
				if has_child_tails:
					child_indentation = indentations[level - 1]
				child.tail = child_indentation
			elif not has_child_tails and next_child.tag == "{http://www.w3.org/1999/xhtml}br":
				child.tail = child_indentation
			elif not has_child_tails and not child.tail and next_child.tag in PHRASING_TAGS:
				# Adjacent phrasing elements with no whitespace between them stay adjacent
				child.tail = ""
			elif has_child_tails:
				if not child.tail or next_child.tag == "{http://www.w3.org/1999/xhtml}br":
					child.tail = ""
				else:
					child.tail = " "
			else:
				child.tail = child_indentation
		else:
			# Remove line wraps and extra whitespace in child tail
			_unwrap_tail(child, remove_trailing_space=next_child is None)
			child.tail = regex.sub(r"[\t ]+", " ", child.tail)

			# Add special indentation for br tag with non-empty tail
			if child.tag == "{http://www.w3.org/1999/xhtml}br":
				child_indentation = indentations[level - 1]
				child.tail = child_indentation + child.tail
def _unwrap_text(elem: etree.Element, remove_trailing_space: bool):
"""
Remove line wraps from text content of element.
"""
elem.text = regex.sub(r"^\n[\n\t ]*", "", elem.text)
if remove_trailing_space:
elem.text = regex.sub(r"\n[\n\t ]*$", "", elem.text)
elem.text = regex.sub(r" *\n[\n\t ]*", " ", elem.text)
def _unwrap_tail(elem: etree.Element, remove_trailing_space: bool):
"""
Remove line wraps from tail content of element.
"""
if elem.tag == "{http://www.w3.org/1999/xhtml}br":
elem.tail = regex.sub(r"^\n[\n\t ]*", "", elem.tail)
else:
elem.tail = regex.sub(r"^\n[\n\t ]*", " ", elem.tail)
if remove_trailing_space:
elem.tail = regex.sub(r"\n[\n\t ]*$", "", elem.tail)
elem.tail = regex.sub(r" *\n[\n\t ]*", " ", elem.tail)
def format_xml_file(filename: Path) -> None:
	"""
	Pretty-print well-formed XML and save to file.

	Detects if the filename is XHTML, SVG, OPF, or plain XML and adjusts formatting accordingly.

	INPUTS
	filename: A file containing well-formed XML

	OUTPUTS
	None.
	"""
	# Pick the formatter matching the file type, defaulting to plain XML.
	formatters = {
		".xhtml": se.formatting.format_xhtml,
		".svg": se.formatting.format_svg,
		".opf": se.formatting.format_opf,
	}
	formatter = formatters.get(filename.suffix, se.formatting.format_xml)
	with open(filename, "r+", encoding="utf-8") as file:
		original_xml = file.read()
		formatted_xml = formatter(original_xml)
		# Only rewrite the file when formatting actually changed something.
		if formatted_xml != original_xml:
			file.seek(0)
			file.write(formatted_xml)
			file.truncate()
def _format_style_elements(tree: etree.ElementTree):
	"""
	Find <style> elements in an XML etree, and pretty-print the CSS inside of them.

	The passed tree is modified in-place.

	INPUTS
	tree: An XML etree.

	OUTPUTS
	None.

	RAISES
	se.InvalidCssException: if the CSS inside a <style> element can't be parsed.
	"""
	try:
		# NOTE(review): only svg:style is matched even though the xhtml namespace is declared here; confirm xhtml <style> elements are handled elsewhere
		for node in tree.xpath("//svg:style", namespaces={"xhtml": "http://www.w3.org/1999/xhtml", "svg": "http://www.w3.org/2000/svg"}):
			# Guard against an empty <style/> element
			css = format_css(node.text or "")
			# Get the <style> element's indentation from the text node just before it;
			# a <style> with no preceding text node gets no indentation
			preceding_text = node.xpath("./preceding-sibling::text()[1]")
			indent = preceding_text[0].replace("\n", "") if preceding_text else ""
			# Indent the CSS one level deeper than the <style> element
			css = "".join(indent + "\t" + line + "\n" for line in css.splitlines())
			css = css.strip("\n")
			css = regex.sub(r"^\s+$", "", css, flags=regex.MULTILINE) # Remove indents from lines that are just white space
			node.text = "\n" + css + "\n" + indent
	except se.InvalidCssException:
		# Already a meaningful error; propagate it unchanged
		raise
	except Exception as ex:
		# Chain the original exception so the root cause isn't lost
		raise se.InvalidCssException(f"Couldn’t parse CSS. Exception: {ex}") from ex
def _format_xml_str(xml: str) -> etree.ElementTree:
	"""
	Given a string of well-formed XML, return a pretty-printed etree.

	INPUTS
	xml: A string of well-formed XML.

	OUTPUTS
	An etree representing the pretty-printed XML.
	"""
	# huge_tree allows XML files of arbitrary size, like Ulysses S. Grant
	parser = etree.XMLParser(huge_tree=True)
	tree = etree.fromstring(xml.encode(), parser=parser)
	# Round-trip through canonical (c14n) form to normalize the document
	canonical_bytes = etree.tostring(tree, method="c14n")
	tree = etree.fromstring(canonical_bytes, parser=parser)
	_indent(tree, space="\t")
	# Strip white space surrounding attribute values
	whitespace_query = "//*[attribute::*[re:test(., '^\\s+') or re:test(., '\\s+$')]]"
	for node in tree.xpath(whitespace_query, namespaces={"re": "http://exslt.org/regular-expressions"}):
		for name in node.keys():
			node.set(name, regex.sub(r"^\s+|\s+$", "", node.get(name)))
	return tree
def _xml_tree_to_string(tree: etree.ElementTree, doctype: Optional[str] = None) -> str:
	"""
	Given an XML etree, return a string representing the etree's XML.

	INPUTS
	tree: An XML etree.
	doctype: An optional doctype string to include in the serialized output.

	OUTPUTS
	A string representing the etree's XML.
	"""
	declaration = """<?xml version="1.0" encoding="utf-8"?>\n"""
	serialized = etree.tostring(tree, encoding="unicode", doctype=doctype)
	# Normalize unicode characters
	return unicodedata.normalize("NFC", declaration + serialized + "\n")
def format_xml(xml: str) -> str:
	"""
	Pretty-print well-formed XML.

	INPUTS
	xml: A string of well-formed XML.

	OUTPUTS
	A string of pretty-printed XML.

	RAISES
	se.InvalidXmlException: if the XML can't be parsed.
	"""
	try:
		tree = _format_xml_str(xml)
	except Exception as ex:
		# Chain the original exception so the underlying parse error isn't lost
		raise se.InvalidXmlException(f"Couldn’t parse XML file. Exception: {ex}") from ex
	# Pull out the doctype if there is one, as etree seems to eat it
	doctypes = regex.search(r"<!doctype[^>]+?>", xml, flags=regex.IGNORECASE)
	return _xml_tree_to_string(tree, doctypes.group(0) if doctypes else None)
def format_xhtml(xhtml: str) -> str:
	"""
	Pretty-print well-formed XHTML.

	INPUTS
	xhtml: A string of well-formed XHTML

	OUTPUTS
	A string of pretty-printed XHTML.

	RAISES
	se.InvalidXhtmlException: if the XHTML can't be parsed.
	"""
	namespaces = {"xhtml": "http://www.w3.org/1999/xhtml", "epub": "http://www.idpf.org/2007/ops", "re": "http://exslt.org/regular-expressions"} # re enables regular expressions in xpath
	# Epub3 doesn't allow named entities, so convert them to their unicode equivalents
	# But, don't unescape the metadata file long-description accidentally
	xhtml = regex.sub(r"&#?\w+;", _replace_character_references, xhtml)
	# Remove unnecessary doctypes which can cause xmllint to hang
	xhtml = regex.sub(r"<!DOCTYPE[^>]+?>", "", xhtml)
	# Remove white space between opening/closing tag and text nodes
	# We do this first so that we can still format line breaks after <br/>
	# Exclude comments
	xhtml = regex.sub(r"(<(?:[^!/][^>]*?[^/]|[a-z])>)\s+([^\s<])", r"\1\2", xhtml, flags=regex.IGNORECASE)
	xhtml = regex.sub(r"([^\s>])\s+(</[^>]+?>)", r"\1\2", xhtml)
	try:
		tree = _format_xml_str(xhtml)
	except Exception as ex:
		# Chain the original exception so the underlying parse error isn't lost
		raise se.InvalidXhtmlException(f"Couldn’t parse XHTML file. Exception: {ex}") from ex
	# Lowercase attribute names
	for node in tree.xpath("//*[attribute::*[re:test(local-name(), '[A-Z]')]]", namespaces=namespaces):
		for key, value in node.items(): # Iterate over attributes
			node.attrib.pop(key) # Remove the attribute
			node.attrib[key.lower()] = value # Re-add the attribute, lowercased
	# Sort classes alphabetically, except the "eoc" class always comes last
	for node in tree.xpath("//*[re:test(@class, '\\s')]", namespaces=namespaces):
		# Sort class elements
		classes = regex.split(r"\s+", node.get("class"))
		classes = sorted(classes, key=str.lower)
		# Move eoc to the end, if it exists
		if "eoc" in classes:
			classes += [classes.pop(classes.index("eoc"))]
		# Set the new class value
		node.set("class", " ".join(classes))
	# Lowercase tag names
	for node in tree.xpath("//*[re:test(local-name(), '[A-Z]')]", namespaces=namespaces):
		node.tag = node.tag.lower()
	# Format <style> elements
	_format_style_elements(tree)
	# Remove white space between non-tags and <br/>
	xhtml = regex.sub(r"([^>\s])\s+<br/>", r"\1<br/>", _xml_tree_to_string(tree))
	return xhtml
def format_opf(xml: str) -> str:
	"""
	Pretty-print well-formed OPF XML.

	INPUTS
	xml: A string of well-formed OPF XML

	OUTPUTS
	A string of pretty-printed XML.

	RAISES
	se.InvalidXmlException: if the OPF XML can't be parsed.
	"""
	# Replace html entities in the long description so we can clean it too.
	# We re-establish them later (lxml re-escapes them when serializing the node text).
	# Don't use html.unescape because that will unescape things like &amp; which would
	# make an invalid XML document. (A bare & may appear in translator info, or other
	# parts of the metadata that are not the long description.)
	xml = xml.replace("&lt;", "<")
	xml = xml.replace("&gt;", ">")
	xml = xml.replace("&amp;amp;", "&amp;") # Unescape escaped ampersands, which appear in the long description only
	# Canonicalize and format XML
	try:
		tree = _format_xml_str(xml)
	except Exception as ex:
		# Chain the original exception so the underlying parse error isn't lost
		raise se.InvalidXmlException(f"Couldn’t parse OPF file. Exception: {ex}") from ex
	# Format the long description, then escape it
	for node in tree.xpath("/opf:package/opf:metadata/opf:meta[@property='se:long-description']", namespaces={"opf": "http://www.idpf.org/2007/opf"}):
		# Convert the node contents to escaped text.
		xhtml = node.text # This preserves the initial newline and indentation
		if xhtml is None:
			xhtml = ""
		for child in node:
			xhtml += etree.tostring(child, encoding="unicode")
		# After composing the string, lxml adds namespaces to every tag. The only way to remove them is with regex.
		xhtml = regex.sub(r"\sxmlns(:.+?)?=\"[^\"]+?\"", "", xhtml)
		# Make some easy fixes
		xhtml = regex.sub(r"<p>\s+", "<p>", xhtml)
		xhtml = regex.sub(r"\s+</p>", "</p>", xhtml)
		# Remove the children so that we can replace them with the escaped xhtml.
		# Iterate over a copy: removing children while iterating the live element
		# skips every other child.
		for child in list(node):
			node.remove(child)
		node.text = xhtml
	return _xml_tree_to_string(tree)
def format_svg(svg: str) -> str:
"""
Pretty-print well-formed SVG XML.