forked from standardebooks/tools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
simplify-tags
executable file
·184 lines (142 loc) · 8.32 KB
/
simplify-tags
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#!/usr/bin/env python3
import argparse
import sys
import fnmatch
import os
import regex
import lxml.cssselect
import lxml.etree as etree
#Namespace prefix -> URI map handed to lxml both when compiling CSS selectors and when evaluating the resulting XPath expressions
XHTML_NAMESPACES = {"xhtml": "http://www.w3.org/1999/xhtml", "epub": "http://www.idpf.org/2007/ops", "z3998": "http://www.daisy.org/z3998/2012/vocab/structure/", "se": "http://standardebooks.org/vocab/1.0"}
def namespace_to_class(selector):
    """
    Turn a CSS attribute/namespace selector into the equivalent class-selector text.

    For example, [epub|type~="z3998:salutation"] becomes .epub-type-z3998-salutation

    selector: the selector text to convert.
    Returns the converted text; characters not listed below pass through untouched.
    """
    #Applied strictly in this order; none of the replacement texts can create a new match for a later rule
    substitutions = (
        (":", "-"),
        ("|", "-"),
        ("~=", "-"),
        ("[", "."),
        ("]", ""),
        ("\"", ""),
    )

    class_selector = selector
    for old, new in substitutions:
        class_selector = class_selector.replace(old, new)

    return class_selector
def simplify_css(css):
    """
    Rewrite a stylesheet into a form that less capable reading systems (ADE) cope with.

    The steps run in a fixed order:
    1. Every line containing a :first-child selector gets a .first-child class variant added in front of it.
    2. Every occurrence of the text "abbr" becomes "span".
    3. The margin shorthand (one to four values) is expanded to the four longhand properties.
    4. all-small-caps is replaced by small-caps plus a lowercase text-transform.
    5. [epub|type~="..."] attribute selectors are replaced by class selectors.

    css: the stylesheet text.
    Returns the simplified stylesheet text.
    """
    #ADE doesn't handle :first-child, so offer a first-child class as an alternative.
    #The original selector is kept (the class variant is *added*, not substituted) because lxml
    #sometimes fails to evaluate the xpath it generates for first-child selectors.
    simplified = regex.sub(r"((.+)\:first\-child(.*))", "\\2.first-child\\3,\n\\1", css)

    #The substitution above leaves "{," or ",," behind, depending on how the matched line ended; tidy those up
    simplified = simplified.replace("{,", ",").replace(",,", ",")

    #ADE screws up with unrecognized elements, so abbr styles are pointed at spans instead
    simplified = simplified.replace("abbr", "span")

    #ADE also screws up shorthand CSS; expand margin into longhand for the 1, 2, 3 and 4 value forms in turn
    margin_rewrites = (
        (r"margin:\s*([^\s]+?)\s*;", "margin-top: \\1;\n\tmargin-right: \\1;\n\tmargin-bottom: \\1;\n\tmargin-left: \\1;"),
        (r"margin:\s*([^\s]+?)\s+([^\s]+?)\s*;", "margin-top: \\1;\n\tmargin-right: \\2;\n\tmargin-bottom: \\1;\n\tmargin-left: \\2;"),
        (r"margin:\s*([^\s]+?)\s+([^\s]+?)\s+([^\s]+?)\s*;", "margin-top: \\1;\n\tmargin-right: \\2;\n\tmargin-bottom: \\3;\n\tmargin-left: \\2;"),
        (r"margin:\s*([^\s]+?)\s+([^\s]+?)\s+([^\s]+?)\s+([^\s]+?)\s*;", "margin-top: \\1;\n\tmargin-right: \\2;\n\tmargin-bottom: \\3;\n\tmargin-left: \\4;"),
    )
    for shorthand_pattern, longhand in margin_rewrites:
        simplified = regex.sub(shorthand_pattern, longhand, simplified)

    #Another poorly-supported property value
    simplified = simplified.replace("all-small-caps;", "small-caps;\n\ttext-transform: lowercase;")

    #Swap namespace attribute selectors for classes,
    #e.g. p[epub|type~="z3998:salutation"] becomes p.epub-type-z3998-salutation
    for attribute_selector in regex.findall(r"\[epub\|type\~\=\"[^\"]*?\"\]", simplified):
        simplified = simplified.replace(attribute_selector, namespace_to_class(attribute_selector))

    return simplified
def _append_class(element, new_class):
    """
    Add `new_class` to the class attribute of `element` unless it is already one of its classes.

    Classes already on the element are always kept. Membership is tested against the
    whitespace-separated class tokens, not as a substring of the attribute text.
    """
    current_class = element.get("class")

    if not current_class:
        element.set("class", new_class)
    elif new_class not in current_class.split():
        element.set("class", current_class + " " + new_class)


def main():
    """
    Command-line entry point.

    For each directory given on the command line:
    1. Every *.css file found under it is rewritten in place with simplify_css().
    2. The selectors of the original (un-simplified) CSS are collected.
    3. Every *.xhtml file except toc.xhtml is updated to match the simplified CSS: <abbr> elements
       matched by a selector become <span>, elements matched by :first-child selectors get a
       "first-child" class, elements matched by [epub|type~="..."] selectors get the corresponding
       class, and any remaining <abbr> tags are stripped.

    Exits with status 1 if an argument is not a directory.
    """
    parser = argparse.ArgumentParser(description="Simplify some HTML and CSS to be more compatible with crappier reading systems (ADE I'm looking at you...).")
    parser.add_argument("-v", "--verbose", action="store_true", help="increase output verbosity")
    parser.add_argument("directories", metavar="DIRECTORY", nargs="+", help="a Standard Ebooks source directory")
    args = parser.parse_args()

    for directory in args.directories:
        directory = os.path.abspath(directory)

        if not os.path.isdir(directory):
            print("Error: Not a directory: {}".format(directory), file=sys.stderr)
            #sys.exit rather than the bare exit(), which is only injected by the site module
            sys.exit(1)

        if args.verbose:
            print("Processing {} ...".format(directory), end="", flush=True)

        total_css = ""

        #Simplify the CSS first. Later we'll update the document to match our simplified selectors.
        #While we're doing this, we store the original css into a single variable so we can extract the original selectors later.
        for root, _, filenames in os.walk(directory):
            for filename in fnmatch.filter(filenames, "*.css"):
                with open(os.path.join(root, filename), "r+", encoding="utf-8") as file:
                    css = file.read()

                    #Before we do anything, we process a special case in core.css
                    if "core.css" in filename:
                        css = regex.sub(r"abbr{.+?}", "", css, flags=regex.MULTILINE | regex.DOTALL)

                    total_css = total_css + css + "\n"
                    file.seek(0)
                    file.write(simplify_css(css))
                    file.truncate()

        #Now get a list of original selectors
        #Remove CSS rules
        total_css = regex.sub(r"{[^}]+}", "", total_css, flags=regex.MULTILINE)

        #Remove trailing commas
        total_css = regex.sub(r",", "", total_css)

        #Remove comments
        total_css = regex.sub(r"/\*.+?\*/", "", total_css, flags=regex.DOTALL)

        #Remove @ defines
        total_css = regex.sub(r"^@.+", "", total_css, flags=regex.MULTILINE)

        #Construct a set of the original selectors, one per remaining non-empty line
        selectors = {line for line in total_css.splitlines() if line != ""}

        #Compile each selector once per directory instead of once per file
        compiled_selectors = []
        for selector in selectors:
            try:
                compiled_selectors.append((selector, lxml.cssselect.CSSSelector(selector, translator="xhtml", namespaces=XHTML_NAMESPACES)))
            except lxml.cssselect.ExpressionError:
                #This gets thrown if we use pseudo-elements, which lxml doesn't support
                continue

        #Get a list of .xhtml files to search
        for root, _, filenames in os.walk(directory):
            for filename in fnmatch.filter(filenames, "*.xhtml"):
                #Don't mess with the ToC, since if we have ol/li > first-child selectors we could screw it up
                if filename == "toc.xhtml":
                    continue

                filename = os.path.join(root, filename)

                with open(filename, "r+", encoding="utf-8") as file:
                    #We have to remove the default namespace declaration from our document, otherwise
                    #xpath won't find anything at all. See http://stackoverflow.com/questions/297239/why-doesnt-xpath-work-when-processing-an-xhtml-document-with-lxml-in-python
                    xhtml = file.read().replace(" xmlns=\"http://www.w3.org/1999/xhtml\"", "")
                    processed_xhtml = xhtml
                    tree = etree.fromstring(str.encode(xhtml))

                    #Pass 1: convert <abbr> to <span> for every selector that mentions abbr.
                    #This works on the document *text* and then re-parses it, so it has to be finished
                    #before any class is added to the tree: a re-parse would throw those classes away,
                    #and an element serialized with an added class would no longer be found in the text.
                    for selector, sel in compiled_selectors:
                        if "abbr" not in selector:
                            continue

                        xhtml_before_selector = processed_xhtml

                        for element in tree.xpath(sel.path, namespaces=XHTML_NAMESPACES):
                            #The tail is output by default, which we don't want here
                            raw_string = etree.tostring(element, encoding=str, with_tail=False)

                            #lxml includes a bunch of namespace information in every element we print.
                            #Remove it here.
                            raw_string = raw_string.replace(" xmlns=\"http://www.w3.org/1999/xhtml\"", "")
                            raw_string = raw_string.replace(" xmlns:epub=\"http://www.idpf.org/2007/ops\"", "")

                            #Do a straight up text replace to turn this into a span
                            processed_string = raw_string.replace("<abbr", "<span")
                            processed_string = processed_string.replace("</abbr", "</span")

                            #Now we have a nice, fixed string; write it back into the document text ourselves
                            processed_xhtml = processed_xhtml.replace(raw_string, processed_string)

                        #Only re-parse when this selector actually changed something
                        if processed_xhtml != xhtml_before_selector:
                            tree = etree.fromstring(str.encode(processed_xhtml))

                    #Pass 2: add a "first-child" class to elements that match any :first-child selectors
                    for selector, sel in compiled_selectors:
                        if ":first-child" in selector:
                            for element in tree.xpath(sel.path, namespaces=XHTML_NAMESPACES):
                                _append_class(element, "first-child")

                    #Pass 3: we've already replaced attribute/namespace selectors with classes in the CSS, now add those classes to the matching elements
                    for selector in selectors:
                        if "[epub|type" in selector:
                            for namespace_selector in regex.findall(r"\[epub\|type\~\=\"[^\"]*?\"\]", selector):
                                sel = lxml.cssselect.CSSSelector(namespace_selector, translator="xhtml", namespaces=XHTML_NAMESPACES)
                                #namespace_to_class() yields ".epub-type-..."; drop the leading dot to get the bare class name
                                new_class = regex.sub(r"^\.", "", namespace_to_class(namespace_selector))

                                for element in tree.xpath(sel.path, namespaces=XHTML_NAMESPACES):
                                    _append_class(element, new_class)

                    processed_xhtml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" + etree.tostring(tree, encoding=str, pretty_print=True)

                    #Now we just remove all stray abbr tags that were not styled by CSS
                    processed_xhtml = regex.sub(r"</?abbr[^>]*?>", "", processed_xhtml)

                    tree = etree.fromstring(str.encode(processed_xhtml))

                    if processed_xhtml != xhtml:
                        file.seek(0)
                        #Put back the default namespace declaration that was stripped when the file was read
                        file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" + etree.tostring(tree, encoding=str, pretty_print=True).replace("<html", "<html xmlns=\"http://www.w3.org/1999/xhtml\""))
                        file.truncate()

        if args.verbose:
            print(" OK")
#Only run the command-line entry point when executed as a script, not when imported as a module
if __name__ == "__main__":
    main()