-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdb2rst.py
executable file
·579 lines (462 loc) · 16.1 KB
/
db2rst.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
DocBook to ReST converter
=========================
This script may not work out of the box, but is easy to extend.
If you extend it, please send me a patch: wojdyr at gmail.
Docbook has >400 elements, most of them are not supported (yet).
``pydoc db2rst`` shows the list of supported elements.
In reST, inline markup can not be nested (major deficiency of reST).
Since it is not clear what to do with, say,
<subscript><emphasis>x</emphasis></subscript>
the script outputs incorrect (nested) reST (:sub:`*x*`)
and it is up to user to decide how to change it.
Usage: db2rst.py file.xml > file.rst
:copyright: 2009 by Marcin Wojdyr.
:license: BSD.
"""
# If this option is True, XML comments are discarded. Otherwise, they are
# converted to ReST comments.
# Note that ReST doesn't support inline comments. XML comments
# are converted to ReST comment blocks, which may break paragraphs.
# The value is passed to the XML parser as its remove_comments flag.
REMOVE_COMMENTS = False
# id attributes of DocBook elements are translated to ReST labels.
# If this option is False, only labels that are used in links are generated.
WRITE_UNUSED_LABELS = True
import sys
import re
import lxml.etree as ET
# to avoid duplicate error reports: tags already warned about as unsupported
_not_handled_tags = set()
# to remember which id/labels are really needed (targets of xref/link)
_linked_ids = set()
# to avoid duplicate substitutions: image filerefs already defined
_substitutions = set()
# buffer that is flushed after the end of paragraph,
# used for ReST substitutions
_buffer = ""
def _main():
    """Command-line entry point: convert one DocBook file to reST.

    Expects exactly one argument, the input XML file.  When the argument
    count is wrong, or -h/--help is given, the module docstring is written
    to stderr as usage text and the process exits.  Progress messages and
    warnings go to stderr; the UTF-8 encoded reST goes to stdout.
    """
    if len(sys.argv) != 2 or sys.argv[1] in ('-h', '--help'):
        sys.stderr.write(__doc__)
        sys.exit()
    input_file = sys.argv[1]
    sys.stderr.write("Parsing XML file `%s'...\n" % input_file)
    parser = ET.XMLParser(remove_comments=REMOVE_COMMENTS)
    tree = ET.parse(input_file, parser=parser)
    # Remember every id that is the target of a link, so _concat() can
    # decide which labels are needed.  iter() replaces the deprecated
    # getiterator().
    for elem in tree.iter():
        if elem.tag in ("xref", "link"):
            linkend = elem.get("linkend")
            # an xref/link without linkend has no target to remember
            if linkend is not None:
                _linked_ids.add(linkend)
    output = TreeRoot(tree.getroot()).encode('utf-8')
    # Write the encoded bytes followed by a newline, as the print statement
    # did.  Where stdout is a text stream with a .buffer attribute the bytes
    # go to that underlying binary stream.
    stream = getattr(sys.stdout, "buffer", sys.stdout)
    stream.write(output + b"\n")
def _warn(s):
sys.stderr.write("WARNING: %s\n" % s)
def _supports_only(el, tags):
    "warn about every child of el whose tag is not listed in tags"
    parent_tag = el.tag
    unexpected = [child.tag for child in el.getchildren()
                  if child.tag not in tags]
    for child_tag in unexpected:
        _warn("%s/%s skipped." % (parent_tag, child_tag))
def _what(el):
    "describe a node for diagnostics: <tag> for elements, Comment for comments"
    tag = el.tag
    if isinstance(tag, basestring):
        return "<%s>" % tag
    if isinstance(el, ET._Comment):
        return "Comment"
    # anything else: fall back to the node's own string form
    return str(el)
def _has_only_text(el):
    "warn when el has children, naming each child that gets skipped"
    children = el.getchildren()
    if not children:
        return
    location = _get_path(el)
    skipped = ", ".join(_what(child) for child in children)
    _warn("children of %s are skipped: %s" % (location, skipped))
def _has_no_text(el):
    "warn about non-blank text directly in el or trailing any of its children"
    text = el.text
    if not (text is None or text.isspace()):
        _warn("skipping text of <%s>: %s" % (_get_path(el), text))
    for child in el.getchildren():
        tail = child.tail
        if tail is None or tail.isspace():
            continue
        _warn("skipping tail of <%s>: %s" % (_get_path(child), tail))
def _conv(el):
    """Convert one node to reST text, dispatching on its tag name.

    A module-level callable whose name equals the element's tag is used as
    the handler.  XML comments go to Comment() (blank comments yield "").
    Any other node is reported once per tag as unsupported and rendered
    with _concat(), i.e. without markup of its own.

    Raises AssertionError when a handler returns None (a handler that
    forgot its return statement).  An empty string is a valid result, so an
    empty element no longer aborts the conversion.
    """
    if isinstance(el, ET._Comment):
        return Comment(el) if (el.text and not el.text.isspace()) else ""
    handler = globals().get(el.tag)
    # Only public callables are element handlers.  This keeps tags such as
    # <sys>, <re> or <_warn> from being dispatched to imported modules or
    # to the script's private helpers.
    if callable(handler) and not el.tag.startswith("_"):
        s = handler(el)
        # raised explicitly so that the check survives "python -O"
        if s is None:
            raise AssertionError("Error: %s -> None\n" % _get_path(el))
        return s
    if el.tag not in _not_handled_tags:
        _warn("Don't know how to handle <%s>" % el.tag)
        #_warn(" ... from path: %s" % _get_path(el))
        _not_handled_tags.add(el.tag)
    return _concat(el)
def _no_special_markup(el):
    "render el without markup of its own: just its text and converted children"
    plain = _concat(el)
    return plain
def _remove_indent_and_escape(s):
"remove indentation from the string s, escape some of the special chars"
s = "\n".join(i.lstrip().replace("\\", "\\\\") for i in s.splitlines())
# escape inline mark-up start-string characters (even if there is no
# end-string, docutils show warning if the start-string is not escaped)
# TODO: handle also Unicode: ‘ “ ’ « ¡ ¿ as preceding chars
s = re.sub(r"([\s'\"([{</:-])" # start-string is preceded by one of these
r"([|*`[])" # the start-string
r"(\S)", # start-string is followed by non-whitespace
r"\1\\\2\3", # insert backslash
s)
return s
def _concat(el):
    "join el.text with each converted child and its tail; emit a label for el's id"
    out = ""
    label = el.get("id")
    if label is not None:
        if WRITE_UNUSED_LABELS or label in _linked_ids:
            out += "\n\n.. _%s:\n\n" % label
    text = el.text
    if text is not None:
        out += _remove_indent_and_escape(text)
    for child in el.getchildren():
        out += _conv(child)
        tail = child.tail
        if tail is None:
            continue
        # _remove_indent_and_escape() strips leading blanks, so keep the one
        # blank that separates the child's output from the text after it
        if out and not out[-1].isspace() and tail[0] in " \t":
            out += tail[0]
        out += _remove_indent_and_escape(tail)
    return out
def _original_xml(el):
    "serialize el back to XML, leaving out the text that follows it"
    xml = ET.tostring(el, with_tail=False)
    return xml
def _no_markup(el):
    "serialize el, replace every tag with a space and squeeze blank runs"
    xml = ET.tostring(el, with_tail=False)
    # remove tags
    without_tags = re.sub(r"<.+?>", " ", xml)
    # replace all blanks with single space
    return re.sub(r"\s+", " ", without_tags)
def _get_level(el):
"return number of ancestors"
return sum(1 for i in el.iterancestors())
def _get_path(el):
t = [el] + list(el.iterancestors())
return "/".join(str(i.tag) for i in reversed(t))
def _make_title(t, level):
if level == 1:
return "\n\n" + "=" * len(t) + "\n" + t + "\n" + "=" * len(t)
char = ["#", "=", "-", "~", "^", "." ]
return "\n\n" + t + "\n" + char[level-2] * len(t)
def _join_children(el, sep):
    "convert each child of el and join the results with sep; text in el is warned about"
    _has_no_text(el)
    converted = [_conv(child) for child in el.getchildren()]
    return sep.join(converted)
def _block_separated_with_blank_line(el):
    "render el as a block preceded by a blank line, then flush the pending _buffer"
    global _buffer
    # converting the content first matters: it may add to _buffer
    block = "\n\n" + _concat(el)
    if not _buffer:
        return block
    pending, _buffer = _buffer, ""
    return block + "\n\n" + pending
def _indent(el, indent, first_line=None, suppress_blank_line=False):
    """Return the content of el as an indented block.

    el                  -- element whose _concat() output is indented
    indent              -- number of spaces put before every line
    first_line          -- optional prefix (a bullet or a directive) that
                           replaces the indentation of the first line
    suppress_blank_line -- when true, the block is not preceded by the
                           usual "\\n\\n"

    Blank lines in the content are dropped.  When el has no non-blank
    content and first_line is given, the block consists of the prefix
    alone; previously this case raised IndexError.
    """
    start = "" if suppress_blank_line else "\n\n"
    pad = " " * indent
    lines = [pad + i for i in _concat(el).splitlines()
             if i and not i.isspace()]
    if first_line is not None:
        if lines:
            # replace indentation of the first line with prefix `first_line'
            lines[0] = first_line + lines[0][indent:]
        else:
            # empty element: keep the bullet/directive, without its
            # trailing blank
            lines = [first_line.rstrip()]
    return start + "\n".join(lines)
def _normalize_whitespace(s):
return " ".join(s.split())
################### DocBook elements #####################
# special "elements"
def TreeRoot(el):
    "convert the tree rooted at el and tidy the whitespace of the result"
    cleanups = (
        (r"[ \t]+\n", "\n"),   # remove trailing whitespace
        (r"\n{3,}", "\n\n"),   # leave only one blank line
    )
    output = _conv(el)
    for pattern, replacement in cleanups:
        output = re.sub(pattern, replacement, output)
    return output
def Comment(el):
    "render an XML comment as an indented block starting with '.. COMMENT: '"
    prefix = ".. COMMENT: "
    # continuation lines are indented by the width of the prefix
    return _indent(el, len(prefix), prefix)
# general inline elements
def emphasis(el):
    "wrap the stripped content of el in single asterisks"
    content = _concat(el).strip()
    return "*" + content + "*"


# rendered exactly like <emphasis>
phrase = emphasis
citetitle = emphasis
def firstterm(el):
    "render the text of el with the :dfn: role; children are skipped with a warning"
    _has_only_text(el)
    term = el.text
    return ":dfn:`%s`" % term


# acronyms get no markup of their own
acronym = _no_special_markup
# links
def ulink(el):
    "render an external link: image target, bare URL, or named hyperlink"
    url = el.get("url")
    text = _concat(el).strip()
    # a linked image: attach the URL to the image directive
    if text.startswith(".. image::"):
        return "%s\n :target: %s\n\n" % (text, url)
    # link text equal to the URL: the bare URL is enough
    if url == text:
        return text
    if text:
        return "`%s <%s>`_" % (text, url)
    return "`<%s>`_" % (url)
# TODO:
# put labels where referenced ids are
# e.g. <appendix id="license"> -> .. _license:\n<appendix>
# if the label is not before title, we need to give explicit title:
# :ref:`Link title <label-name>`
# (in DocBook was: the section called “Variables”)
def xref(el):
    "render a cross-reference as a :ref: role naming the linkend attribute"
    target = el.get("linkend")
    return ":ref:`%s`" % target
def link(el):
    "render an internal link as a :ref: role with explicit link text"
    caption = _concat(el).strip()
    target = el.get("linkend")
    return ":ref:`%s <%s>`" % (caption, target)
# math and media
# the DocBook syntax to embed equations is sick. Usually, (inline)equation is
# a (inline)mediaobject, which is imageobject + textobject
def inlineequation(el):
    "render an inline equation; only inlinemediaobject children are expected"
    _supports_only(el, ("inlinemediaobject",))
    rendered = _concat(el)
    return rendered.strip()
def informalequation(el):
    "render an untitled equation; only mediaobject children are expected"
    _supports_only(el, ("mediaobject",))
    rendered = _concat(el)
    return rendered
def equation(el):
    "render a titled equation: bold title line followed by each mediaobject"
    _supports_only(el, ("title", "mediaobject"))
    parts = []
    heading = el.find("title")
    if heading is not None:
        parts.append("\n\n**%s:**" % _concat(heading).strip())
    for media in el.findall("mediaobject"):
        parts.append("\n" + _conv(media))
    return "".join(parts)
def mediaobject(el, substitute=False):
    """Render the images of a (inline)mediaobject as image directives.

    The phrases of all textobject children, joined with "; ", become the
    :alt: text of every image.

    substitute -- when false, return the directives as one string.
                  When true, return a tuple (definitions, symbols):
                  each image is written as a substitution definition
                  named after its fileref (only the first time that
                  fileref is seen) and symbols lists the names to be
                  referenced as |name|.

    A textobject without <phrase> is ignored; an imageobject without
    <imagedata>, or whose imagedata has no fileref attribute, is skipped
    with a warning instead of crashing or emitting "image:: None".
    """
    _supports_only(el, ("imageobject", "textobject"))
    # i guess the most common case is one imageobject and one (or none)
    alt = ""
    for txto in el.findall("textobject"):
        _supports_only(txto, ("phrase",))
        phrase_el = txto.find("phrase")
        if phrase_el is None:
            continue
        if alt:
            alt += "; "
        alt += _normalize_whitespace(_concat(phrase_el))
    symbols = []
    img = ""
    for imgo in el.findall("imageobject"):
        _supports_only(imgo, ("imagedata",))
        imagedata = imgo.find("imagedata")
        fileref = imagedata.get("fileref") if imagedata is not None else None
        if fileref is None:
            _warn("%s has no imagedata/fileref, skipped." % _get_path(imgo))
            continue
        s = "\n\n.. image:: %s" % fileref
        if alt:
            s += "\n :alt: %s" % alt
        if substitute:
            # define each substitution once, but reference it every time
            if fileref not in _substitutions:
                img += s[:4] + " |%s|" % fileref + s[4:]  # insert |symbol|
                _substitutions.add(fileref)
            symbols.append(fileref)
        else:
            img += s
    img += "\n\n"
    if substitute:
        return img, symbols
    return img
def inlinemediaobject(el):
    "queue substitution definitions in _buffer and return the |symbol| references"
    global _buffer
    definitions, names = mediaobject(el, substitute=True)
    # the definitions are written out after the end of the paragraph
    _buffer += definitions
    references = ["|%s|" % name for name in names]
    return "".join(references)
def subscript(el):
    "render el with the :sub: role, preceded by an escaped space"
    content = _concat(el).strip()
    return "\\ :sub:`%s`" % content
def superscript(el):
    "render el with the :sup: role, preceded by an escaped space"
    content = _concat(el).strip()
    return "\\ :sup:`%s`" % content
# GUI elements
def menuchoice(el):
    """Render a menu path with the :menuselection: role.

    The role is used only when el has children and all of them are
    guimenu/guimenuitem elements; their stripped texts are joined with
    " --> ".  An empty menu item contributes an empty string instead of
    raising TypeError.  Any other content, including a menuchoice that
    holds only text, is rendered with _concat() so nothing is lost.
    """
    children = el.getchildren()
    if children and all(i.tag in ("guimenu", "guimenuitem")
                        for i in children):
        _has_no_text(el)
        return ":menuselection:`%s`" % \
            " --> ".join((i.text or "").strip() for i in children)
    return _concat(el)
def guilabel(el):
    "render the stripped text of el with the :guilabel: role"
    _has_only_text(el)
    label = el.text.strip()
    return ":guilabel:`%s`" % label


# other GUI elements are rendered like <guilabel>
guiicon = guilabel
guimenu = guilabel
guimenuitem = guilabel
# mouse button names get no markup
mousebutton = _no_special_markup
# system elements
def keycap(el):
    "render the text of el with the :kbd: role"
    _has_only_text(el)
    key = el.text
    return ":kbd:`%s`" % key
def application(el):
    "render the stripped text of el with the :program: role"
    _has_only_text(el)
    program = el.text.strip()
    return ":program:`%s`" % program
def userinput(el):
    "render the stripped content of el as an inline literal"
    content = _concat(el).strip()
    return "``%s``" % content


# rendered as inline literals, like <userinput>
systemitem = userinput
prompt = userinput
def filename(el):
    "render the text of el with the :file: role"
    _has_only_text(el)
    path = el.text
    return ":file:`%s`" % path
def command(el):
    "render the stripped content of el with the :command: role"
    content = _concat(el).strip()
    return ":command:`%s`" % content
def parameter(el):
    "render el as :option: when it has a class attribute, else as emphasis"
    if not el.get("class"):
        return emphasis(el)
    # this hack is specific for fityk manual
    return ":option:`%s`" % _concat(el).strip()


# placeholders are emphasized
replaceable = emphasis
def cmdsynopsis(el):
    "emit the synopsis as a tag-free 'CMDSYN:' line, to be fixed up by hand later"
    plain = _no_markup(el)
    return "\n\nCMDSYN: %s\n" % plain
# programming elements
def function(el):
    "render the stripped content of el as an inline literal"
    # An inline literal is used here rather than the :func: role.
    content = _concat(el).strip()
    return "``%s``" % content
def constant(el):
    "render the stripped text of el as an inline literal"
    _has_only_text(el)
    value = el.text.strip()
    return "``%s``" % value


# variable names are rendered like constants
varname = constant
# popular block elements
def title(el):
    """Render a title as a reST section title.

    The underline character is chosen from the number of ancestors of the
    title element.  Titles in some elements may be handled from the
    title's parent instead of through this function.
    """
    # The former lookup of the parent's tag was never used and raised
    # AttributeError for a title without a parent, so it is gone.
    t = _concat(el).strip()
    level = _get_level(el)
    return _make_title(t, level)
def screen(el):
    "render el as a literal block: '::' followed by the content indented by 4"
    # NOTE(review): the content goes through _indent() -> _concat(), which
    # strips per-line indentation and adds escapes -- confirm that this is
    # wanted inside a literal block.
    return "\n::\n%s\n" % _indent(el, 4)


# rendered as a literal block, like <screen>
literallayout = screen
def blockquote(el, first_line=None, suppress_blank_line=False):
    "render el as a block indented by 4 spaces; options are passed on to _indent()"
    return _indent(el, 4, first_line=first_line,
                   suppress_blank_line=suppress_blank_line)
# Document roots: rendered as their content, with no markup of their own.
book = _no_special_markup
article = _no_special_markup
# Block-level containers: rendered as their content preceded by a blank
# line; pending substitution definitions in _buffer are flushed after them.
para = _block_separated_with_blank_line
section = _block_separated_with_blank_line
appendix = _block_separated_with_blank_line
chapter = _block_separated_with_blank_line
# lists
def itemizedlist(el, bullet="-"):
    "render every child of el as a list item introduced by bullet"
    # ItemizedList ::= (ListItem+)
    width = len(bullet) + 1
    prefix = bullet + " "
    items = [_indent(item, width, prefix) for item in el.getchildren()]
    return "".join(items) + "\n\n"
def orderedlist(el):
    "render el as an auto-numbered list, using '#.' as the bullet"
    # OrderedList ::= (ListItem+)
    return itemizedlist(el, "#.")
def simplelist(el):
    "render an inline simplelist comma-separated, any other kind as a '+' bullet list"
    # SimpleList ::= (Member+)
    # The simplelist is the most complicated one. There are 3 kinds of
    # SimpleList: Inline, Horiz and Vert.
    if el.get("type") != "inline":
        # members should be rendered in tabular fashion, with number
        # of columns equal el[columns]
        # but we simply transform it to bullet list
        return itemizedlist(el, bullet="+")
    return _join_children(el, ", ")
def variablelist(el):
    "render a variablelist as a definition list: terms line, then indented body"
    #VariableList ::= ((Title,TitleAbbrev?)?, VarListEntry+)
    #VarListEntry ::= (Term+,ListItem)
    _supports_only(el, ("title", "varlistentry"))
    out = []
    heading = el.find("title")
    if heading is not None:
        out.append(_conv(heading))
    for entry in el.findall("varlistentry"):
        terms = [_concat(term).strip() for term in entry.findall("term")]
        body = _indent(entry.find("listitem"), 4)
        # drop one of the two leading newlines: the definition must follow
        # the term line directly
        out.append("\n\n" + ", ".join(terms) + body[1:])
    return "".join(out)
# admonition directives
def note(el):
    "render el as a '.. note::' directive"
    directive = ".. note:: "
    return _indent(el, 3, directive)
def caution(el):
    "render el as a '.. caution::' directive"
    directive = ".. caution:: "
    return _indent(el, 3, directive)
def important(el):
    "render el as a '.. important::' directive"
    directive = ".. important:: "
    return _indent(el, 3, directive)
def tip(el):
    "render el as a '.. tip::' directive"
    directive = ".. tip:: "
    return _indent(el, 3, directive)
def warning(el):
    "render el as a '.. warning::' directive"
    directive = ".. warning:: "
    return _indent(el, 3, directive)
# bibliography
def author(el):
    """Return the author's first name and surname separated by a space.

    Only <firstname> and <surname> children are used; other children are
    reported as skipped.  A missing or empty name part is left out, so an
    author with only a surname no longer raises TypeError.
    """
    _supports_only(el, ("firstname", "surname"))
    # findtext() returns None for a missing child
    names = [el.findtext(tag) for tag in ("firstname", "surname")]
    return " ".join(name for name in names if name)


# editors are rendered like authors
editor = author
def authorgroup(el):
    "render the members of the group separated by commas"
    separator = ", "
    return _join_children(el, separator)
def biblioentry(el):
    """Render one bibliography entry as a single line of text.

    The parts are emitted in this order, each only when present:
    [abbrev], author group or author, editor, *title*, address
    (its <otheraddr>), publisher name and publication date.

    Elements that are present but empty contribute an empty string
    instead of the text "None"; an empty <title> no longer raises
    AttributeError.
    """
    _supports_only(el, ("abbrev", "authorgroup", "author", "editor", "title",
                        "publishername", "pubdate", "address"))
    s = "\n"
    abbrev = el.find("abbrev")
    if abbrev is not None:
        _has_only_text(abbrev)
        s += "[%s] " % (abbrev.text or "")
    # an authorgroup takes precedence over a single author
    auth = el.find("authorgroup")
    if auth is None:
        auth = el.find("author")
    if auth is not None:
        s += "%s. " % _conv(auth)
    editor_el = el.find("editor")
    if editor_el is not None:
        s += "%s. " % _conv(editor_el)
    title_el = el.find("title")
    if title_el is not None:
        _has_only_text(title_el)
        s += "*%s*. " % (title_el.text or "").strip()
    address = el.find("address")
    if address is not None:
        _supports_only(address, ("otheraddr",))
        s += "%s " % (address.findtext("otheraddr") or "")
    publishername = el.find("publishername")
    if publishername is not None:
        _has_only_text(publishername)
        s += "%s. " % (publishername.text or "")
    pubdate_el = el.find("pubdate")
    if pubdate_el is not None:
        _has_only_text(pubdate_el)
        s += "%s. " % (pubdate_el.text or "")
    return s
def bibliography(el):
    "render a 'Bibliography' heading followed by one entry per line"
    _supports_only(el, ("biblioentry",))
    heading = _make_title("Bibliography", 2)
    entries = _join_children(el, "\n")
    return heading + "\n" + entries
def attribution(el):
    "render el emphasized and followed by a colon and a space"
    who = emphasis(el)
    return who + ": "
def email(el):
    "render el as plain text"
    # Leave for now to see if they get autolinked
    address = _concat(el)
    return address
def glossdiv(el):
    "render only the content of el"
    # Ignore - <title> gets handled anyway
    content = _concat(el)
    return content
def glossentry(el):
    "render a glossary entry: term line with optional (acronym), then indented definition"
    _supports_only(el, ("glossterm", "glossdef", "acronym"))
    suffix = ""
    acronym_el = el.find("acronym")
    if acronym_el is not None and acronym_el.text is not None:
        _has_only_text(acronym_el)
        suffix = " (" + acronym_el.text + ")"
    entry = "\n"
    term_el = el.find("glossterm")
    if term_el is not None and term_el.text is not None:
        _has_only_text(term_el)
        entry += term_el.text + suffix + "\n"
    definition = el.find("glossdef")
    if definition is not None:
        # no blank line: the definition must directly follow the term
        entry += blockquote(definition, None, True)
    return entry
# Tags without markup of their own: the tag itself is ignored and only its
# content (text and converted children) is output.
keyword = _concat
keywordset = _concat
abstract = _concat
bookinfo = _concat
corpauthor = _concat
example = _concat
glossary = _concat
figure = _concat
# Tags rendered by reusing the handler of another element.
computeroutput = userinput
literal = userinput
option = userinput
procedure = orderedlist
productname = emphasis
programlisting = screen
pubdate = emphasis
quote = userinput
simpara = _block_separated_with_blank_line
# Run the converter only when executed as a script, not when imported.
if __name__ == '__main__':
    _main()