diff --git a/html2docx/html2docx.py b/html2docx/html2docx.py
index 2470bb0..29e76bf 100644
--- a/html2docx/html2docx.py
+++ b/html2docx/html2docx.py
@@ -2,9 +2,13 @@
from html.parser import HTMLParser
from typing import Any, Dict, Iterator, List, Optional, Tuple
+import webcolors
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
-from docx.shared import Pt
+from docx.oxml import parse_xml
+from docx.oxml.ns import nsdecls
+from docx.oxml.shared import OxmlElement, qn
+from docx.shared import Pt, RGBColor
from docx.text.paragraph import Paragraph
from docx.text.run import Run
from tinycss2 import parse_declaration_list
@@ -70,6 +74,9 @@ def __init__(self, title: str):
self.doc.core_properties.title = title
self.list_style: List[str] = []
self.href = ""
+ self.anchor = ""
+ self.style = ""
+ self.tag: Optional[str] = None
self._reset()
def _reset(self) -> None:
@@ -96,11 +103,53 @@ def init_p(self, attrs: List[Tuple[str, Optional[str]]]) -> None:
elif style_decl["name"] == "padding-left" and style_decl["unit"] == "px":
self.padding_left = Pt(style_decl["value"])
+    def init_table(self, attrs: List[Tuple[str, Optional[str]]]) -> None:
+        """Start collecting a new table; ``attrs`` is accepted but not used."""
+        # One inner list per row, each cell stored as a (text, style) pair.
+        collected_rows: List[List[Tuple[str, str]]] = []
+        self.table_data = collected_rows
+
def finish_p(self) -> None:
+ # Finish the current paragraph: strip trailing whitespace from the text of
+ # the active run (when there is one) and call self._reset().
if self.r is not None:
self.r.text = self.r.text.rstrip()
self._reset()
+    def finish_table(self) -> None:
+        """Render the rows collected in ``self.table_data`` as a docx table.
+
+        Each collected cell is a ``(text, style)`` pair. The text becomes the
+        cell text; a ``background`` declaration in the style becomes cell
+        shading and a ``color`` declaration becomes the font colour of the
+        cell's runs. Unless the first collected row is empty (in which case it
+        is dropped), the first row is treated as a header and set in bold.
+
+        Nothing is added to the document when no rows were collected.
+        """
+        if not self.table_data:
+            return
+
+        # remove empty header
+        header = True
+        if not self.table_data[0]:
+            del self.table_data[0]
+            header = False
+            if not self.table_data:
+                # The empty row was the only one: there is nothing to render.
+                return
+
+        # create table
+        rows = len(self.table_data)
+        # Size the table for the widest row so ragged rows neither raise
+        # IndexError nor lose cells; missing cells are simply left empty.
+        cols = max(len(row_data) for row_data in self.table_data)
+        table = self.doc.add_table(rows=rows, cols=cols)
+
+        # copy data
+        for row, row_data in enumerate(self.table_data):
+            for col, (text, style) in enumerate(row_data):
+                cell = table.cell(row, col)
+                cell.text = text
+                if style:
+                    for style_decl in style_to_css(style):
+                        # NOTE(review): both lookups go through webcolors name
+                        # conversion, so values are presumably CSS colour
+                        # names - confirm how other colour forms should behave.
+                        if style_decl["name"] == "background":
+                            rgb = webcolors.name_to_hex(style_decl["value"])[1:]
+                            # Cell shading is a w:shd element whose w:fill
+                            # holds the hex colour without the leading "#".
+                            shading = parse_xml(
+                                r'<w:shd {} w:fill="{}"/>'.format(nsdecls("w"), rgb)
+                            )
+                            cell._tc.get_or_add_tcPr().append(shading)
+
+                        elif style_decl["name"] == "color":
+                            rgb = webcolors.name_to_rgb(style_decl["value"])
+                            for p in cell.paragraphs:
+                                for r in p.runs:
+                                    r.font.color.rgb = RGBColor(*rgb)
+
+                if header and row == 0:
+                    for p in cell.paragraphs:
+                        for r in p.runs:
+                            r.font.bold = True
+
def init_run(self, attrs: List[Tuple[str, Any]]) -> None:
self.attrs.append(attrs)
if attrs:
@@ -124,7 +173,67 @@ def add_text(self, data: str) -> None:
for attrs in self.attrs:
for font_attr, value in attrs:
setattr(self.r.font, font_attr, value)
- self.r.add_text(data)
+ if self.href:
+ self.add_hyperlink(self.href, data)
+ elif self.anchor:
+ self.add_bookmark(self.anchor, data)
+ else:
+ self.r.add_text(data)
+
+    def add_hyperlink(self, href: str, text: str) -> None:
+        """Emit ``text`` as a link to ``href``.
+
+        Only in-document targets (``href`` starting with ``#``) become a
+        ``w:hyperlink`` element whose anchor is the part after the ``#``.
+        Any other target is written to the current run as plain text followed
+        by the URL.
+        """
+        if not href.startswith("#"):  # TODO external links
+            if text.endswith(" "):
+                labelled = text + href + " "
+            else:
+                labelled = text + " " + href
+            if self.r:
+                self.r.add_text(labelled)
+            return
+
+        # Run properties: navy colour and a single underline.
+        run_props = OxmlElement("w:rPr")
+        for prop_tag, prop_val in (("w:color", "000080"), ("w:u", "single")):
+            prop = OxmlElement(prop_tag)
+            prop.set(qn("w:val"), prop_val)
+            run_props.append(prop)
+
+        link_run = OxmlElement("w:r")
+        link_run.append(run_props)
+        link_run.text = text
+
+        link = OxmlElement("w:hyperlink")
+        link.set(qn("w:anchor"), href[1:])
+        link.append(link_run)
+
+        if self.p:
+            self.p._p.append(link)
+        # Drop the current run; the link text lives in its own w:r element.
+        self.r = None
+
+    def add_bookmark(self, anchor: str, text: str) -> None:
+        """Wrap the current run in a bookmark named ``anchor`` and add text.
+
+        A ``w:bookmarkStart`` element is inserted directly before the current
+        run and a ``w:bookmarkEnd`` directly after it; the run then receives
+        ``anchor``, a space and ``text``. Nothing happens when there is no
+        current run.
+        """
+        if not self.r:
+            return
+
+        # Bookmark ids have to be unique within a document, so number them per
+        # converter instance instead of reusing "0" for every bookmark. The
+        # first bookmark still gets id "0".
+        bookmark_id = str(getattr(self, "_next_bookmark_id", 0))
+        self._next_bookmark_id = int(bookmark_id) + 1
+
+        tag = self.r._r
+        start = OxmlElement("w:bookmarkStart")
+        start.set(qn("w:id"), bookmark_id)
+        start.set(qn("w:name"), anchor)
+        tag.addprevious(start)
+        end = OxmlElement("w:bookmarkEnd")
+        # Start and end are paired through the same id.
+        end.set(qn("w:id"), bookmark_id)
+        tag.addnext(end)
+
+        # Use the ``anchor`` argument rather than re-reading ``self.anchor``.
+        self.r.add_text(anchor + " " + text)
+
+    def add_code(self, data: str) -> None:
+        """Add ``data`` line by line, separating the lines with run breaks.
+
+        Every line is stripped of surrounding whitespace and passed to
+        ``add_text``; a break is added after each line except the last one,
+        provided there is a current run.
+        """
+        code_lines = data.splitlines()
+        last_index = len(code_lines) - 1
+        for index, code_line in enumerate(code_lines):
+            self.add_text(code_line.strip())
+            if index < last_index and self.r:
+                self.r.add_break()
def add_list_style(self, name: str) -> None:
self.finish_p()
@@ -149,8 +258,10 @@ def add_picture(self, attrs: List[Tuple[str, Optional[str]]]) -> None:
run.add_picture(image_buffer, **size)
def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
+ self.tag = tag
if tag == "a":
self.href = get_attr(attrs, "href")
+ self.anchor = get_attr(attrs, "id")
self.init_run([])
elif tag in ["b", "strong"]:
self.init_run([("bold", True)])
@@ -183,28 +294,48 @@ def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> N
self.init_run([("underline", True)])
elif tag == "ul":
self.add_list_style("List Bullet")
+ elif tag == "table":
+ self.init_table(attrs)
+ elif tag == "tr":
+ self.table_data.append([])
+ elif tag == "td":
+ styles = [b for a, b in attrs if a == "style" and b]
+ if styles:
+ self.style = styles[0]
def handle_data(self, data: str) -> None:
+ # Route a chunk of character data according to self.tag, the start tag most
+ # recently recorded by handle_starttag (handle_endtag sets it back to None).
+ # Text following a <style> start tag is ignored; text following <td>/<th> is
+ # stored with the current cell style in the row that was opened last.
+ # NOTE(review): in the visible code table_data is only assigned by
+ # init_table, so a <td>/<th> outside any <table> may raise AttributeError
+ # here - confirm against __init__.
+ if self.tag == "style":
+ return
+ elif self.tag in ("td", "th"):
+ if self.table_data:
+ self.table_data[-1].append((data, self.style))
+ return
+ # Unless self.pre is set, whitespace matching WHITESPACE_RE is replaced by
+ # a single space.
if not self.pre:
data = re.sub(WHITESPACE_RE, " ", data)
if self.collapse_space:
data = data.lstrip()
if data:
- if self.href:
- if data.endswith(" "):
- data += self.href + " "
- else:
- data += " " + self.href
- self.href = ""
self.collapse_space = data.endswith(" ")
- self.add_text(data)
+
+ # Text directly after a <code> start tag is added line by line via
+ # add_code; everything else goes through add_text.
+ if self.tag == "code":
+ self.add_code(data)
+ else:
+ self.add_text(data)
def handle_endtag(self, tag: str) -> None:
+ # Close whatever the end tag belongs to: inline tags finish the current run,
+ # block-level tags finish the paragraph, and </table> renders the table.
if tag in ["a", "b", "code", "em", "i", "span", "strong", "sub", "sup", "u"]:
self.finish_run()
+ # The link target and bookmark name only apply while inside <a>.
+ if tag == "a":
+ self.href = ""
+ self.anchor = ""
+ # Forget the cell style once the cell or row closes.
+ elif tag in ["td", "tr"]:
+ self.style = ""
elif tag in ["h1", "h2", "h3", "h4", "h5", "h6", "li", "ol", "p", "pre", "ul"]:
self.finish_p()
if tag in ["ol", "ul"]:
del self.list_style[-1]
elif tag == "pre":
self.pre = False
+ elif tag == "table":
+ self.finish_table()
+ # Clear the start tag recorded by handle_starttag.
+ self.tag = None
diff --git a/tests/data/code-multiline.html b/tests/data/code-multiline.html
new file mode 100644
index 0000000..8e84cff
--- /dev/null
+++ b/tests/data/code-multiline.html
@@ -0,0 +1,5 @@
+
+value = get_value(arg)
+do_something(value)
+return
+
diff --git a/tests/data/code-multiline.json b/tests/data/code-multiline.json
new file mode 100644
index 0000000..496be70
--- /dev/null
+++ b/tests/data/code-multiline.json
@@ -0,0 +1,11 @@
+[
+ {
+ "text": "value = get_value(arg)\ndo_something(value)\nreturn",
+ "runs": [
+ {
+ "text": "value = get_value(arg)\ndo_something(value)\nreturn",
+ "name": "Mono"
+ }
+ ]
+ }
+]
diff --git a/tests/data/style.html b/tests/data/style.html
new file mode 100644
index 0000000..2d5dc57
--- /dev/null
+++ b/tests/data/style.html
@@ -0,0 +1 @@
+
diff --git a/tests/data/style.json b/tests/data/style.json
new file mode 100644
index 0000000..fe51488
--- /dev/null
+++ b/tests/data/style.json
@@ -0,0 +1 @@
+[]
diff --git a/tests/links.html b/tests/links.html
new file mode 100644
index 0000000..6e61c73
--- /dev/null
+++ b/tests/links.html
@@ -0,0 +1,15 @@
+
+link to bookmark
+
+
+some text
+some text
+some text
+some text
+some text
+
+
+Bookmark
+
+
+more text
diff --git a/tests/table.html b/tests/table.html
new file mode 100644
index 0000000..9d6b5b5
--- /dev/null
+++ b/tests/table.html
@@ -0,0 +1,5 @@
+
+ column1 | column2 |
+ 1 | 2 |
+ 3 | 4 |
+
diff --git a/tests/test_links.py b/tests/test_links.py
new file mode 100644
index 0000000..20a03d0
--- /dev/null
+++ b/tests/test_links.py
@@ -0,0 +1,51 @@
+import os
+
+import docx
+
+from html2docx import html2docx
+
+from .utils import TEST_DIR
+
+
+def strip_ns(x):
+    """Return ``x`` without its leading ``{namespace}`` part, if it has one."""
+    # Everything after the first "}" - or the whole string when there is none.
+    return x.split("}", 1)[-1]
+
+
+def attrib(d, key):
+    """Return the first value in ``d`` whose namespaced key ends in ``key``.
+
+    A key matches when it ends with ``"}" + key``; ``None`` is returned when
+    no key matches.
+    """
+    suffix = "}" + key
+    return next((v for k, v in d.items() if k.endswith(suffix)), None)
+
+
+def test_links():
+    """Convert links.html and check the internal hyperlink and its bookmark."""
+    html_path = os.path.join(TEST_DIR, "links.html")
+    # Read through a context manager so the file handle is closed again.
+    with open(html_path) as html_file:
+        html = html_file.read()
+    buf = html2docx(html, title="links")
+
+    doc = docx.Document(buf)
+
+    assert len(doc.paragraphs) == 9
+
+    # check hyperlink
+    run = doc.paragraphs[0]._p[1]
+    assert strip_ns(run.tag) == "hyperlink"
+    assert attrib(run.attrib, "anchor") == "1.1"
+
+    # list(element) replaces lxml's deprecated getchildren().
+    children = list(run)
+    assert len(children) == 1
+
+    wR = children[0]
+    assert strip_ns(wR.tag) == "r"
+    assert wR.text == "link to bookmark"
+
+    # check bookmark
+    run = doc.paragraphs[6]._p[1]
+    assert strip_ns(run.tag) == "bookmarkStart"
+
+    run = doc.paragraphs[6]._p[2]
+    assert strip_ns(run.tag) == "r"
+    assert run.text == "1.1 Bookmark"
+
+    run = doc.paragraphs[6]._p[3]
+    assert strip_ns(run.tag) == "bookmarkEnd"
diff --git a/tests/test_table.py b/tests/test_table.py
new file mode 100644
index 0000000..b6662ab
--- /dev/null
+++ b/tests/test_table.py
@@ -0,0 +1,31 @@
+import os
+
+import docx
+
+from html2docx import html2docx
+
+from .utils import TEST_DIR
+
+
+def test_table():
+    """Convert table.html and check the table's dimensions and cell texts."""
+    html_path = os.path.join(TEST_DIR, "table.html")
+    # Read through a context manager so the file handle is closed again.
+    with open(html_path) as html_file:
+        html = html_file.read()
+    buf = html2docx(html, title="table")
+
+    doc = docx.Document(buf)
+
+    assert len(doc.tables) == 1
+    table = doc.tables[0]
+
+    assert len(table.rows) == 3
+    assert len(table.columns) == 2
+
+    contents = [
+        ("column1", "column2"),
+        ("1", "2"),
+        ("3", "4"),
+    ]
+
+    for r, row in enumerate(contents):
+        for c, text in enumerate(row):
+            assert table.cell(r, c).text == text
diff --git a/tox.ini b/tox.ini
index 3d964cd..a790096 100644
--- a/tox.ini
+++ b/tox.ini
@@ -12,6 +12,7 @@ commands = pytest {posargs}
deps =
Pillow
pytest
+ webcolors
[testenv:black]
commands = black --target-version=py36 --check --diff .
@@ -24,7 +25,7 @@ deps = flake8
skip_install = true
[testenv:isort]
-commands = isort --recursive --check-only --diff
+commands = isort --check-only --diff --verbose .
deps = isort
skip_install = true