Skip to content

Commit

Permalink
Add support for paragraph offset
Browse files Browse the repository at this point in the history
  • Loading branch information
roman-danilov committed Dec 2, 2019
1 parent 1972d06 commit a9ef888
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 13 deletions.
39 changes: 27 additions & 12 deletions html2docx/html2docx.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import re
from html.parser import HTMLParser
from typing import Iterator, List, Optional, Tuple
from typing import Any, Dict, Iterator, List, Optional, Tuple

from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt
from docx.text.paragraph import Paragraph
from docx.text.run import Run
from tinycss2 import parse_declaration_list
from tinycss2.ast import IdentToken
from tinycss2.ast import DimensionToken, IdentToken

WHITESPACE_RE = re.compile(r"\s+")

Expand All @@ -26,11 +27,17 @@ def get_attr(attrs: List[Tuple[str, Optional[str]]], attr_name: str) -> str:
return value


def style_to_css(style: str) -> Iterator[Dict[str, Any]]:
    """Yield the interesting values found in an inline CSS ``style`` string.

    One dict is produced per dimension or identifier token of every
    declaration; all other token types are ignored.

    Each dict has the keys:

    * ``"name"``  - lower-cased property name of the declaration.
    * ``"value"`` - numeric value for a dimension (``40`` for ``40px``) or
      the lower-cased identifier (``"underline"``).
    * ``"unit"``  - lower-cased unit for a dimension, ``None`` for an
      identifier, so consumers can always read the key.
    """
    for declaration in parse_declaration_list(
        style, skip_comments=True, skip_whitespace=True
    ):
        # Malformed input (e.g. ``style="color"``) and at-rules come back as
        # nodes without ``value``/``lower_name``; skip them instead of
        # failing with AttributeError on untrusted HTML.
        if declaration.type != "declaration":
            continue
        for value in declaration.value:
            if isinstance(value, DimensionToken):
                yield {
                    "name": declaration.lower_name,
                    "value": value.value,
                    "unit": value.lower_unit,
                }
            elif isinstance(value, IdentToken):
                yield {
                    "name": declaration.lower_name,
                    "value": value.lower_value,
                    "unit": None,
                }


def html_attrs_to_font_style(attrs: List[Tuple[str, Optional[str]]]) -> List[str]:
Expand All @@ -44,10 +51,11 @@ def html_attrs_to_font_style(attrs: List[Tuple[str, Optional[str]]]) -> List[str
styles = []
style = get_attr(attrs, "style")
for style_decl in style_to_css(style):
if style_decl == ("text-decoration", "underline"):
styles.append("underline")
elif style_decl == ("text-decoration", "line-through"):
styles.append("strike")
if style_decl["name"] == "text-decoration":
if style_decl["value"] == "underline":
styles.append("underline")
elif style_decl["value"] == "line-through":
styles.append("strike")
return styles


Expand All @@ -67,14 +75,19 @@ def _reset(self) -> None:
# Formatting options
self.pre = False
self.alignment: Optional[int] = None
self.padding_left: Optional[Pt] = None
self.attrs: List[List[str]] = []
self.collapse_space = True

def init_p(self, attrs: List[Tuple[str, Optional[str]]]) -> None:
    """Record paragraph formatting found in the tag's ``style`` attribute.

    Sets ``self.alignment`` from ``text-align`` (unknown values fall back to
    left alignment) and ``self.padding_left`` from a ``padding-left`` given
    in ``px``. Other declarations are ignored.

    :param attrs: ``(name, value)`` attribute pairs of the opening tag.
    """
    style = get_attr(attrs, "style")
    for style_decl in style_to_css(style):
        name = style_decl["name"]
        if name == "text-align":
            self.alignment = ALIGNMENTS.get(
                style_decl["value"], WD_ALIGN_PARAGRAPH.LEFT
            )
        # Identifier values such as ``padding-left: inherit`` carry no
        # "unit" key, so use .get() rather than indexing to avoid KeyError.
        elif name == "padding-left" and style_decl.get("unit") == "px":
            # NOTE(review): the pixel count is passed to Pt unchanged (no
            # px -> pt scaling); the offset test fixture expects exactly this.
            self.padding_left = Pt(style_decl["value"])

def finish_p(self) -> None:
if self.r is not None:
Expand All @@ -97,6 +110,8 @@ def add_text(self, data: str) -> None:
self.p = self.doc.add_paragraph(style=style)
if self.alignment is not None:
self.p.alignment = self.alignment
if self.padding_left:
self.p.paragraph_format.left_indent = self.padding_left
if self.r is None:
self.r = self.p.add_run()
for attrs in self.attrs:
Expand Down
1 change: 1 addition & 0 deletions tests/data/offset.html
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<p style="padding-left: 40px;">Simple text with offset.</p>
11 changes: 11 additions & 0 deletions tests/data/offset.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[
{
"text": "Simple text with offset.",
"left_indent": 40,
"runs": [
{
"text": "Simple text with offset."
}
]
}
]
9 changes: 8 additions & 1 deletion tests/test_html2docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import docx
import pytest
from docx.shared import Pt

from html2docx import html2docx

Expand Down Expand Up @@ -37,8 +38,14 @@ def test_html2docx(html_path, spec_path):
for p, p_spec in zip(doc.paragraphs, spec):
assert p.text == p_spec["text"]
assert p.style.name == p_spec.get("style", "Normal")
if p.alignment:
if p_spec.get("alignment"):
assert int(p.alignment) == p_spec["alignment"]
else:
assert p.alignment is None
if p_spec.get("left_indent"):
assert p.paragraph_format.left_indent == Pt(p_spec["left_indent"])
else:
assert p.paragraph_format.left_indent is None

runs_spec = p_spec["runs"]
assert len(p.runs) == len(runs_spec)
Expand Down

0 comments on commit a9ef888

Please sign in to comment.