diff --git a/html2docx/html2docx.py b/html2docx/html2docx.py index 2470bb0..29e76bf 100644 --- a/html2docx/html2docx.py +++ b/html2docx/html2docx.py @@ -2,9 +2,13 @@ from html.parser import HTMLParser from typing import Any, Dict, Iterator, List, Optional, Tuple +import webcolors from docx import Document from docx.enum.text import WD_ALIGN_PARAGRAPH -from docx.shared import Pt +from docx.oxml import parse_xml +from docx.oxml.ns import nsdecls +from docx.oxml.shared import OxmlElement, qn +from docx.shared import Pt, RGBColor from docx.text.paragraph import Paragraph from docx.text.run import Run from tinycss2 import parse_declaration_list @@ -70,6 +74,9 @@ def __init__(self, title: str): self.doc.core_properties.title = title self.list_style: List[str] = [] self.href = "" + self.anchor = "" + self.style = "" + self.tag: Optional[str] = None self._reset() def _reset(self) -> None: @@ -96,11 +103,53 @@ def init_p(self, attrs: List[Tuple[str, Optional[str]]]) -> None: elif style_decl["name"] == "padding-left" and style_decl["unit"] == "px": self.padding_left = Pt(style_decl["value"]) + def init_table(self, attrs: List[Tuple[str, Optional[str]]]) -> None: + self.table_data: List[List[Tuple[str, str]]] = [] + def finish_p(self) -> None: if self.r is not None: self.r.text = self.r.text.rstrip() self._reset() + def finish_table(self) -> None: + if self.table_data: + # remove empty header + header = True + if not self.table_data[0]: + del self.table_data[0] + header = False + + # create table + rows = len(self.table_data) + cols = len(self.table_data[-1]) + table = self.doc.add_table(rows=rows, cols=cols) + + # copy data + for row in range(rows): + for col in range(cols): + cell = table.cell(row, col) + text, style = self.table_data[row][col] + cell.text = text + if style: + for style_decl in style_to_css(style): + if style_decl["name"] == "background": + rgb = webcolors.name_to_hex(style_decl["value"])[1:] + shading = parse_xml( + r''.format(nsdecls("w"), rgb) + ) + cell._tc.get_or_add_tcPr().append(shading) + + elif style_decl["name"] == "color": + rgb = webcolors.name_to_rgb(style_decl["value"]) + for p in cell.paragraphs: + for r in p.runs: + r.font.color.rgb = RGBColor(*rgb) + + if header and row == 0: + for p in cell.paragraphs: + for r in p.runs: + r.font.bold = True + def init_run(self, attrs: List[Tuple[str, Any]]) -> None: self.attrs.append(attrs) if attrs: @@ -124,7 +173,67 @@ def add_text(self, data: str) -> None: for attrs in self.attrs: for font_attr, value in attrs: setattr(self.r.font, font_attr, value) - self.r.add_text(data) + if self.href: + self.add_hyperlink(self.href, data) + elif self.anchor: + self.add_bookmark(self.anchor, data) + else: + self.r.add_text(data) + + def add_hyperlink(self, href: str, text: str) -> None: + if not href.startswith("#"): # TODO external links + if text.endswith(" "): + text += href + " " + else: + text += " " + href + if self.r: + self.r.add_text(text) + return + + hyperlink = OxmlElement("w:hyperlink") + hyperlink.set(qn("w:anchor"), href[1:]) + + new_run = OxmlElement("w:r") + + rPr = OxmlElement("w:rPr") + + rColor = OxmlElement("w:color") + rColor.set(qn("w:val"), "000080") + rPr.append(rColor) + + rU = OxmlElement("w:u") + rU.set(qn("w:val"), "single") + rPr.append(rU) + + new_run.append(rPr) + new_run.text = text + + hyperlink.append(new_run) + + if self.p: + self.p._p.append(hyperlink) + self.r = None + + def add_bookmark(self, anchor: str, text: str) -> None: + if self.r: + tag = self.r._r + start = OxmlElement("w:bookmarkStart") + start.set(qn("w:id"), "0") + start.set(qn("w:name"), anchor) + tag.addprevious(start) + end = OxmlElement("w:bookmarkEnd") + end.set(qn("w:id"), "0") + tag.addnext(end) + + self.r.add_text(self.anchor + " " + text) + + def add_code(self, data: str) -> None: + lines = data.splitlines() + for linenr, line in enumerate(lines): + self.add_text(line.strip()) + if linenr < len(lines) - 1: + if self.r: + self.r.add_break() def add_list_style(self, name: str) -> None: self.finish_p() @@ -149,8 +258,10 @@ def add_picture(self, attrs: List[Tuple[str, Optional[str]]]) -> None: run.add_picture(image_buffer, **size) def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None: + self.tag = tag if tag == "a": self.href = get_attr(attrs, "href") + self.anchor = get_attr(attrs, "id") self.init_run([]) elif tag in ["b", "strong"]: self.init_run([("bold", True)]) @@ -183,28 +294,48 @@ def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> N self.init_run([("underline", True)]) elif tag == "ul": self.add_list_style("List Bullet") + elif tag == "table": + self.init_table(attrs) + elif tag == "tr": + self.table_data.append([]) + elif tag == "td": + styles = [b for a, b in attrs if a == "style" and b] + if styles: + self.style = styles[0] def handle_data(self, data: str) -> None: + if self.tag == "style": + return + elif self.tag in ("td", "th"): + if self.table_data: + self.table_data[-1].append((data, self.style)) + return if not self.pre: data = re.sub(WHITESPACE_RE, " ", data) if self.collapse_space: data = data.lstrip() if data: - if self.href: - if data.endswith(" "): - data += self.href + " " - else: - data += " " + self.href - self.href = "" self.collapse_space = data.endswith(" ") - self.add_text(data) + + if self.tag == "code": + self.add_code(data) + else: + self.add_text(data) def handle_endtag(self, tag: str) -> None: if tag in ["a", "b", "code", "em", "i", "span", "strong", "sub", "sup", "u"]: self.finish_run() + if tag == "a": + self.href = "" + self.anchor = "" + elif tag in ["td", "tr"]: + self.style = "" elif tag in ["h1", "h2", "h3", "h4", "h5", "h6", "li", "ol", "p", "pre", "ul"]: self.finish_p() if tag in ["ol", "ul"]: del self.list_style[-1] elif tag == "pre": self.pre = False + elif tag == "table": + self.finish_table() + self.tag = None diff --git a/tests/data/code-multiline.html b/tests/data/code-multiline.html new file mode 100644 index 0000000..8e84cff --- /dev/null +++ b/tests/data/code-multiline.html @@ -0,0 +1,5 @@ +

+value = get_value(arg)
+do_something(value)
+return
+
diff --git a/tests/data/code-multiline.json b/tests/data/code-multiline.json new file mode 100644 index 0000000..496be70 --- /dev/null +++ b/tests/data/code-multiline.json @@ -0,0 +1,11 @@ +[ + { + "text": "value = get_value(arg)\ndo_something(value)\nreturn", + "runs": [ + { + "text": "value = get_value(arg)\ndo_something(value)\nreturn", + "name": "Mono" + } + ] + } +] diff --git a/tests/data/style.html b/tests/data/style.html new file mode 100644 index 0000000..2d5dc57 --- /dev/null +++ b/tests/data/style.html @@ -0,0 +1 @@ + diff --git a/tests/data/style.json b/tests/data/style.json new file mode 100644 index 0000000..fe51488 --- /dev/null +++ b/tests/data/style.json @@ -0,0 +1 @@ +[] diff --git a/tests/links.html b/tests/links.html new file mode 100644 index 0000000..6e61c73 --- /dev/null +++ b/tests/links.html @@ -0,0 +1,15 @@ +

+link to bookmark +

+ +

some text

+

some text

+

some text

+

some text

+

some text

+ +

+

Bookmark

+

+ +

more text

diff --git a/tests/table.html b/tests/table.html new file mode 100644 index 0000000..9d6b5b5 --- /dev/null +++ b/tests/table.html @@ -0,0 +1,5 @@ + + + + +
column1column2
12
34
diff --git a/tests/test_links.py b/tests/test_links.py new file mode 100644 index 0000000..20a03d0 --- /dev/null +++ b/tests/test_links.py @@ -0,0 +1,51 @@ +import os + +import docx + +from html2docx import html2docx + +from .utils import TEST_DIR + + +def strip_ns(x): + pos = x.find("}") + 1 + return x[pos:] + + +def attrib(d, key): + for k, v in d.items(): + if k.endswith("}" + key): + return v + + +def test_links(): + html_path = os.path.join(TEST_DIR, "links.html") + html = open(html_path).read() + buf = html2docx(html, title="links") + + doc = docx.Document(buf) + + assert len(doc.paragraphs) == 9 + + # check hyperlink + run = doc.paragraphs[0]._p[1] + assert strip_ns(run.tag) == "hyperlink" + assert attrib(run.attrib, "anchor") == "1.1" + + children = run.getchildren() + assert len(children) == 1 + + wR = children[0] + assert strip_ns(wR.tag) == "r" + assert wR.text == "link to bookmark" + + # check bookmark + run = doc.paragraphs[6]._p[1] + assert strip_ns(run.tag) == "bookmarkStart" + + run = doc.paragraphs[6]._p[2] + assert strip_ns(run.tag) == "r" + assert run.text == "1.1 Bookmark" + + run = doc.paragraphs[6]._p[3] + assert strip_ns(run.tag) == "bookmarkEnd" diff --git a/tests/test_table.py b/tests/test_table.py new file mode 100644 index 0000000..b6662ab --- /dev/null +++ b/tests/test_table.py @@ -0,0 +1,31 @@ +import os + +import docx + +from html2docx import html2docx + +from .utils import TEST_DIR + + +def test_table(): + html_path = os.path.join(TEST_DIR, "table.html") + html = open(html_path).read() + buf = html2docx(html, title="table") + + doc = docx.Document(buf) + + assert len(doc.tables) == 1 + table = doc.tables[0] + + assert len(table.rows) == 3 + assert len(table.columns) == 2 + + contents = [ + ("column1", "column2"), + ("1", "2"), + ("3", "4"), + ] + + for r, row in enumerate(contents): + for c, text in enumerate(row): + assert table.cell(r, c).text == text diff --git a/tox.ini b/tox.ini index 3d964cd..a790096 100644 --- a/tox.ini +++ b/tox.ini @@ -12,6 +12,7 @@ commands = pytest {posargs} deps = Pillow pytest + webcolors [testenv:black] commands = black --target-version=py36 --check --diff . @@ -24,7 +25,7 @@ deps = flake8 skip_install = true [testenv:isort] -commands = isort --recursive --check-only --diff +commands = isort --check-only --diff --verbose . deps = isort skip_install = true