Skip to content

Commit

Permalink
Basic support for inline images in documents
Browse files Browse the repository at this point in the history
Broken image from https://freesvg.org/broken-icon, licensed under CC0
(Creative commons, “No Rights Reserved”).

Reviewed-by: Jon Dufresne
Reviewed-by: Roman Danilov
  • Loading branch information
francoisfreitag committed Dec 3, 2019
1 parent a9ef888 commit 0f26c0d
Show file tree
Hide file tree
Showing 13 changed files with 256 additions and 4 deletions.
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
include LICENSE
include README.md
include tox.ini
include html2docx/image-broken.png
include html2docx/py.typed
recursive-include tests *.html *.json *.py
11 changes: 10 additions & 1 deletion html2docx/html2docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
from tinycss2 import parse_declaration_list
from tinycss2.ast import DimensionToken, IdentToken

from .image import load_image

WHITESPACE_RE = re.compile(r"\s+")

ALIGNMENTS = {
Expand Down Expand Up @@ -126,9 +128,14 @@ def add_list_style(self, name: str) -> None:
suffix = f" {level}" if level > 1 else ""
self.list_style.append(f"{name}{suffix}")

def add_picture(self, attrs: List[Tuple[str, Optional[str]]]) -> None:
    """Insert the image referenced by a tag's ``src`` attribute.

    ``attrs`` is the list of ``(name, value)`` pairs received for the tag.
    The ``src`` value is handed to ``load_image`` and the buffer it returns
    is added to the document as a picture.
    """
    self.doc.add_picture(load_image(get_attr(attrs, "src")))

def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
if tag == "a":
self.href = str(next((val for name, val in attrs if name == "href"), ""))
self.href = get_attr(attrs, "href")
self.init_run([])
elif tag in ["b", "strong"]:
self.init_run(["bold"])
Expand All @@ -140,6 +147,8 @@ def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> N
elif tag in ["h1", "h2", "h3", "h4", "h5", "h6"]:
level = int(tag[-1])
self.p = self.doc.add_heading(level=level)
elif tag == "img":
self.add_picture(attrs)
elif tag == "ol":
self.add_list_style("List Number")
elif tag == "p":
Expand Down
Binary file added html2docx/image-broken.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
51 changes: 51 additions & 0 deletions html2docx/image.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import http
import io
import pathlib
import time
import urllib.error
import urllib.request

from docx.image.exceptions import UnrecognizedImageError
from docx.image.image import Image

MAX_IMAGE_SIZE = 10 * 1024 * 1024 # 10 MiB


def load_image(src: str) -> io.BytesIO:
    """Fetch the image at *src* and return its content in a buffer.

    The download is attempted up to three times when a transient network
    error (``URLError``) occurs, sleeping one second between attempts.
    Invalid URLs, HTTP errors and malformed responses are not retried.

    Whenever the image cannot be obtained, is larger than
    ``MAX_IMAGE_SIZE``, or is not recognized as an image by python-docx,
    the bundled ``image-broken.png`` placeholder is returned instead.

    :param src: URL of the image to download.
    :return: A buffer holding either the downloaded image or the placeholder.
    """
    image_buffer = None
    retry = 3
    while retry and not image_buffer:
        try:
            # NOTE(review): src is opened with whatever scheme urlopen accepts;
            # if the HTML is untrusted, confirm that non-HTTP schemes are
            # acceptable. Non-HTTP responses may also not provide getheader().
            with urllib.request.urlopen(src) as response:
                size = response.getheader("Content-Length")
                if size and int(size) > MAX_IMAGE_SIZE:
                    # Announced size is over the limit: do not download.
                    break
                # Read up to MAX_IMAGE_SIZE when response does not contain
                # the Content-Length header. The extra byte avoids an extra read to
                # check whether the EOF was reached.
                data = response.read(MAX_IMAGE_SIZE + 1)
        except (ValueError, http.client.HTTPException, urllib.error.HTTPError):
            # ValueError: Invalid URL or non-integer Content-Length.
            # HTTPException: Server does not speak HTTP properly.
            # HTTPError: Server could not perform request.
            retry = 0
        except urllib.error.URLError:
            # URLError: Transient network error, e.g. DNS request failed.
            retry -= 1
            if retry:
                time.sleep(1)
        else:
            if len(data) > MAX_IMAGE_SIZE:
                # The body is over the limit even though no (or a smaller)
                # Content-Length was announced. Downloading again would give
                # the same result, so stop instead of looping forever.
                break
            image_buffer = io.BytesIO(data)

    if image_buffer:
        try:
            # Only used to validate that the payload is a supported image.
            Image.from_blob(image_buffer.getbuffer())
        except UnrecognizedImageError:
            image_buffer = None

    if not image_buffer:
        broken_img_path = pathlib.Path(__file__).parent / "image-broken.png"
        image_buffer = io.BytesIO(broken_img_path.read_bytes())

    return image_buffer
Empty file added tests/__init__.py
Empty file.
103 changes: 103 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import http.server
import os
import sys
import threading

import pytest

from .utils import TEST_DIR


class CountingHTTPServer(http.server.HTTPServer):
    """HTTP server that counts how many requests it has finished handling."""

    # Class-level default; the first increment creates a per-instance counter.
    request_count = 0

    def finish_request(self, *args, **kwargs):
        """Bump the counter, then delegate to the regular implementation."""
        self.request_count = self.request_count + 1
        outcome = super().finish_request(*args, **kwargs)
        return outcome


class HttpServerThread(threading.Thread):
    """Thread that runs a ``CountingHTTPServer`` bound to localhost.

    ``is_ready`` is set once the server is listening or once startup failed;
    in the latter case the exception is stored in ``error``.
    """

    def __init__(self, handler):
        super().__init__()
        self.handler = handler
        self.error = None
        self.is_ready = threading.Event()

    def run(self):
        """Create the server, publish its base URL and serve until shutdown."""
        try:
            # Port 0 is requested; the actual port is read back from the server.
            self.httpd = CountingHTTPServer(("localhost", 0), self.handler)
            self.base_url = "http://localhost:{}/".format(
                self.httpd.server_address[1]
            )
            self.is_ready.set()
            self.httpd.serve_forever(poll_interval=0.01)
        except Exception as exc:
            self.error = exc
            # Wake up the waiter even on failure so it does not block forever.
            self.is_ready.set()

    def terminate(self):
        """Stop the server if it was created, then wait for the thread to end."""
        server = getattr(self, "httpd", None)
        if server is not None:
            server.shutdown()
            server.server_close()
        self.join()


class ImageHandler(http.server.SimpleHTTPRequestHandler):
    """Request handler that serves files from the ``images`` test directory."""

    def __init__(self, *args, directory=None, **kwargs):
        # The caller-supplied ``directory`` is ignored; the test images
        # directory is always used when the base class accepts the argument.
        version = sys.version_info
        if version >= (3, 7):
            images_dir = TEST_DIR / "images"
            # From 3.9 the path object is passed as is; 3.7 and 3.8 get a str.
            kwargs["directory"] = (
                images_dir if version >= (3, 9) else os.fspath(images_dir)
            )
        super().__init__(*args, **kwargs)

    def translate_path(self, path):
        if sys.version_info >= (3, 7):
            return super().translate_path(path)
        # No ``directory`` is passed on older versions: temporarily switch the
        # working directory so that paths resolve inside the images directory.
        previous_cwd = os.getcwd()
        try:
            os.chdir(TEST_DIR / "images")
            return super().translate_path(path)
        finally:
            os.chdir(previous_cwd)


def http_server_thread(handler):
    """Run an HTTP server thread for *handler* and yield it once it is ready.

    After the consumer resumes the generator, any error recorded by the
    thread is re-raised and the thread is always terminated.
    """
    thread = HttpServerThread(handler)
    thread.daemon = True
    thread.start()
    # Block until the server is listening (or failed to start).
    thread.is_ready.wait()
    yield thread
    try:
        failure = thread.error
        if failure:
            raise failure
    finally:
        thread.terminate()


@pytest.fixture(scope="function")
def image_server():
    """
    Start an HTTP server serving test images.

    Yields the server thread created by ``http_server_thread`` with
    ``ImageHandler`` as request handler.
    """
    yield from http_server_thread(ImageHandler)


class BadContentHandler(http.server.SimpleHTTPRequestHandler):
    """Handler whose GET sends no response and only flags the connection to be closed."""

    def do_GET(self):
        # No status line, headers or body are written.
        self.close_connection = True


@pytest.fixture(scope="function")
def bad_server():
    """Start an HTTP server whose requests are handled by ``BadContentHandler``."""
    yield from http_server_thread(BadContentHandler)


class BadContentLengthHandler(http.server.SimpleHTTPRequestHandler):
    """Handler answering GET with a 200 status and a non-numeric Content-Length."""

    def do_GET(self):
        status = http.HTTPStatus.OK
        self.send_response(status)
        # Deliberately not an integer, to exercise the client's header parsing.
        self.send_header("Content-Length", "invalid")
        self.end_headers()


@pytest.fixture(scope="function")
def bad_content_length_server():
    """Start an HTTP server whose requests are handled by ``BadContentLengthHandler``."""
    yield from http_server_thread(BadContentLengthHandler)
Binary file added tests/data/1x1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions tests/data/img.html
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<img src="https://via.placeholder.com/1x1.png">
17 changes: 17 additions & 0 deletions tests/data/img.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
[
{
"text": "",
"runs": [
{
"text": "",
"shapes": [
{
"type": 3,
"width": 9525,
"height": 9525
}
]
}
]
}
]
Binary file added tests/images/1x1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
12 changes: 9 additions & 3 deletions tests/test_html2docx.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
import json
import pathlib

import docx
import pytest
from docx.shared import Pt

from html2docx import html2docx

TEST_DIR = pathlib.Path(__file__).parent.resolve(strict=True)
PROJECT_DIR = TEST_DIR.parent
from .utils import PROJECT_DIR, TEST_DIR

FONT_ATTRS = ["bold", "italic", "strike", "subscript", "superscript", "underline"]

Expand Down Expand Up @@ -51,10 +49,18 @@ def test_html2docx(html_path, spec_path):
assert len(p.runs) == len(runs_spec)
for run, run_spec in zip(p.runs, runs_spec):
assert run.text == run_spec.pop("text")
shapes_spec = run_spec.pop("shapes", None)
unknown = set(run_spec).difference(FONT_ATTRS)
assert not unknown, "Unknown attributes in {}: {}".format(
spec_rel_path, ", ".join(unknown)
)
for attr in FONT_ATTRS:
msg = f"Wrong {attr} for text '{run.text}' in {html_rel_path}"
assert getattr(run.font, attr) is run_spec.get(attr), msg
if shapes_spec:
shapes = run.part.inline_shapes
assert len(shapes) == len(shapes_spec)
for shape, shape_spec in zip(shapes, shapes_spec):
assert shape.type == shape_spec["type"]
assert shape.width == shape_spec["width"]
assert shape.height == shape_spec["height"]
60 changes: 60 additions & 0 deletions tests/test_load_image.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import urllib.error
import urllib.request
from unittest import mock

from html2docx.image import load_image

from .utils import PROJECT_DIR, TEST_DIR

# Placeholder image shipped in the html2docx package; its bytes are what the
# tests below expect whenever load_image cannot return the requested image.
broken_image = PROJECT_DIR / "html2docx" / "image-broken.png"
broken_image_bytes = broken_image.read_bytes()


def test_basic(image_server):
    """An image served by the test server is returned byte for byte."""
    image_data = load_image(image_server.base_url + "1x1.png")
    expected = TEST_DIR / "data" / "1x1.png"
    assert image_data.getbuffer() == expected.read_bytes()


def test_non_image(image_server):
    """Requesting the server's base URL instead of an image file yields the placeholder."""
    image_data = load_image(image_server.base_url)
    assert image_data.getbuffer() == broken_image_bytes


def test_bad_url():
    """A string that is not a URL yields the placeholder."""
    image_data = load_image("bad")
    assert image_data.getbuffer() == broken_image_bytes


def test_transient_network_error_retries():
    """A persistent URLError triggers three attempts, two sleeps, then the placeholder."""
    url = "https://transient.network.issue.com/image.png"
    # urlopen always fails with URLError; sleep is patched so the test is fast.
    with mock.patch(
        "html2docx.image.urllib.request.urlopen",
        autospec=True,
        side_effect=urllib.error.URLError(
            reason="[Errno -2] Name or service not known"
        ),
    ) as url_mock:
        with mock.patch("html2docx.image.time.sleep", autospec=True) as time_mock:
            image_data = load_image(url)
    assert time_mock.mock_calls == [mock.call(1)] * 2
    assert url_mock.call_args_list == [mock.call(url)] * 3
    assert image_data.getbuffer() == broken_image_bytes


def test_404(image_server):
    """A path the server does not have yields the placeholder after a single request."""
    image_data = load_image(image_server.base_url + "nonexistent")
    assert image_data.getbuffer() == broken_image_bytes
    # Exactly one request reached the server: the failure was not retried.
    assert image_server.httpd.request_count == 1


def test_bad_server(bad_server):
    """A server that sends no response yields the placeholder after a single request."""
    image_data = load_image(bad_server.base_url)
    assert image_data.getbuffer() == broken_image_bytes
    # Exactly one request reached the server: the failure was not retried.
    assert bad_server.httpd.request_count == 1


def test_bad_content_length(bad_content_length_server):
    """A non-integer Content-Length yields the placeholder after a single request."""
    image_data = load_image(bad_content_length_server.base_url)
    assert image_data.getbuffer() == broken_image_bytes
    # Exactly one request reached the server: the failure was not retried.
    assert bad_content_length_server.httpd.request_count == 1
4 changes: 4 additions & 0 deletions tests/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
import pathlib

# Absolute path of the directory containing this module; strict=True makes
# resolve() raise if the path does not exist.
TEST_DIR = pathlib.Path(__file__).parent.resolve(strict=True)
# Parent directory of TEST_DIR.
PROJECT_DIR = TEST_DIR.parent

0 comments on commit 0f26c0d

Please sign in to comment.