diff --git a/MANIFEST.in b/MANIFEST.in
index 786e967..9d4319a 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,5 +1,6 @@
 include LICENSE
 include README.md
 include tox.ini
+include html2docx/image-broken.png
 include html2docx/py.typed
 recursive-include tests *.html *.json *.py
diff --git a/html2docx/html2docx.py b/html2docx/html2docx.py
index 96ee4b7..cc0c47d 100644
--- a/html2docx/html2docx.py
+++ b/html2docx/html2docx.py
@@ -10,6 +10,8 @@
 from tinycss2 import parse_declaration_list
 from tinycss2.ast import DimensionToken, IdentToken
 
+from .image import load_image
+
 WHITESPACE_RE = re.compile(r"\s+")
 
 ALIGNMENTS = {
@@ -126,9 +128,14 @@ def add_list_style(self, name: str) -> None:
         suffix = f" {level}" if level > 1 else ""
         self.list_style.append(f"{name}{suffix}")
 
+    def add_picture(self, attrs: List[Tuple[str, Optional[str]]]) -> None:
+        src = get_attr(attrs, "src")
+        image_buffer = load_image(src)
+        self.doc.add_picture(image_buffer)
+
     def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
         if tag == "a":
-            self.href = str(next((val for name, val in attrs if name == "href"), ""))
+            self.href = get_attr(attrs, "href")
             self.init_run([])
         elif tag in ["b", "strong"]:
             self.init_run(["bold"])
@@ -140,6 +147,8 @@ def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> N
         elif tag in ["h1", "h2", "h3", "h4", "h5", "h6"]:
             level = int(tag[-1])
             self.p = self.doc.add_heading(level=level)
+        elif tag == "img":
+            self.add_picture(attrs)
         elif tag == "ol":
             self.add_list_style("List Number")
         elif tag == "p":
diff --git a/html2docx/image-broken.png b/html2docx/image-broken.png
new file mode 100644
index 0000000..b0c7e12
Binary files /dev/null and b/html2docx/image-broken.png differ
diff --git a/html2docx/image.py b/html2docx/image.py
new file mode 100644
index 0000000..f760686
--- /dev/null
+++ b/html2docx/image.py
@@ -0,0 +1,61 @@
+import http.client
+import io
+import pathlib
+import time
+import urllib.error
+import urllib.request
+
+from docx.image.exceptions import UnrecognizedImageError
+from docx.image.image import Image
+
+MAX_IMAGE_SIZE = 10 * 1024 * 1024  # 10 MiB
+
+
+def load_image(src: str) -> io.BytesIO:
+    """
+    Download the image at ``src`` and return its content as a buffer.
+
+    The bundled "broken image" placeholder is returned when the image cannot
+    be fetched, is larger than MAX_IMAGE_SIZE or is not recognized as an image.
+    """
+    image_buffer = None
+    retry = 3
+    while retry and not image_buffer:
+        try:
+            with urllib.request.urlopen(src) as response:
+                size = response.getheader("Content-Length")
+                if size and int(size) > MAX_IMAGE_SIZE:
+                    break
+                # Read up to MAX_IMAGE_SIZE when response does not contain
+                # the Content-Length header. The extra byte avoids an extra read to
+                # check whether the EOF was reached.
+                data = response.read(MAX_IMAGE_SIZE + 1)
+        except (ValueError, http.client.HTTPException, urllib.error.HTTPError):
+            # ValueError: Invalid URL or non-integer Content-Length.
+            # HTTPException: Server does not speak HTTP properly.
+            # HTTPError: Server could not perform request.
+            retry = 0
+        except urllib.error.URLError:
+            # URLError: Transient network error, e.g. DNS request failed.
+            retry -= 1
+            if retry:
+                time.sleep(1)
+        else:
+            if len(data) <= MAX_IMAGE_SIZE:
+                image_buffer = io.BytesIO(data)
+            else:
+                # Oversized body: stop retrying, otherwise the unchanged loop
+                # condition would download the same data again forever.
+                retry = 0
+
+    if image_buffer:
+        try:
+            Image.from_blob(image_buffer.getbuffer())
+        except UnrecognizedImageError:
+            image_buffer = None
+
+    if not image_buffer:
+        broken_img_path = pathlib.Path(__file__).parent / "image-broken.png"
+        image_buffer = io.BytesIO(broken_img_path.read_bytes())
+
+    return image_buffer
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..8b414aa
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,103 @@
+import http.server
+import os
+import sys
+import threading
+
+import pytest
+
+from .utils import TEST_DIR
+
+
+class CountingHTTPServer(http.server.HTTPServer):
+    request_count = 0
+
+    def finish_request(self, *args, **kwargs):
+        self.request_count += 1
+        return super().finish_request(*args, **kwargs)
+
+
+class HttpServerThread(threading.Thread):
+    def __init__(self, handler):
+        super().__init__()
+        self.is_ready = threading.Event()
+        self.handler = handler
+        self.error = None
+
+    def run(self):
+        try:
+            self.httpd = CountingHTTPServer(("localhost", 0), self.handler)
+            port = self.httpd.server_address[1]
+            self.base_url = f"http://localhost:{port}/"
+            self.is_ready.set()
+            self.httpd.serve_forever(poll_interval=0.01)
+        except Exception as e:
+            self.error = e
+            self.is_ready.set()
+
+    def terminate(self):
+        if hasattr(self, "httpd"):
+            self.httpd.shutdown()
+            self.httpd.server_close()
+        self.join()
+
+
+class ImageHandler(http.server.SimpleHTTPRequestHandler):
+    def __init__(self, *args, directory=None, **kwargs):
+        if sys.version_info >= (3, 9):
+            kwargs["directory"] = TEST_DIR / "images"
+        elif sys.version_info >= (3, 7):
+            kwargs["directory"] = os.fspath(TEST_DIR / "images")
+        super().__init__(*args, **kwargs)
+
+    def translate_path(self, path):
+        if sys.version_info < (3, 7):
+            cwd = os.getcwd()
+            try:
+                os.chdir(TEST_DIR / "images")
+                return super().translate_path(path)
+            finally:
+                os.chdir(cwd)
+        return super().translate_path(path)
+
+
+def http_server_thread(handler):
+    server_thread = HttpServerThread(handler)
+    server_thread.daemon = True
+    server_thread.start()
+    server_thread.is_ready.wait()
+    yield server_thread
+    try:
+        if server_thread.error:
+            raise server_thread.error
+    finally:
+        server_thread.terminate()
+
+
+@pytest.fixture(scope="function")
+def image_server():
+    """
+    Start an HTTP server serving test images.
+    """
+    yield from http_server_thread(ImageHandler)
+
+
+class BadContentHandler(http.server.SimpleHTTPRequestHandler):
+    def do_GET(self):
+        self.close_connection = True
+
+
+@pytest.fixture(scope="function")
+def bad_server():
+    yield from http_server_thread(BadContentHandler)
+
+
+class BadContentLengthHandler(http.server.SimpleHTTPRequestHandler):
+    def do_GET(self):
+        self.send_response(http.HTTPStatus.OK)
+        self.send_header("Content-Length", "invalid")
+        self.end_headers()
+
+
+@pytest.fixture(scope="function")
+def bad_content_length_server():
+    yield from http_server_thread(BadContentLengthHandler)
diff --git a/tests/data/1x1.png b/tests/data/1x1.png
new file mode 100644
index 0000000..90a7589
Binary files /dev/null and b/tests/data/1x1.png differ
diff --git a/tests/data/img.html b/tests/data/img.html
new file mode 100644
index 0000000..e8d1f45
--- /dev/null
+++ b/tests/data/img.html
@@ -0,0 +1 @@
+
diff --git a/tests/data/img.json b/tests/data/img.json
new file mode 100644
index 0000000..2f47314
--- /dev/null
+++ b/tests/data/img.json
@@ -0,0 +1,17 @@
+[
+  {
+    "text": "",
+    "runs": [
+      {
+        "text": "",
+        "shapes": [
+          {
+            "type": 3,
+            "width": 9525,
+            "height": 9525
+          }
+        ]
+      }
+    ]
+  }
+]
diff --git a/tests/images/1x1.png b/tests/images/1x1.png
new file mode 100644
index 0000000..90a7589
Binary files /dev/null and b/tests/images/1x1.png differ
diff --git a/tests/test_html2docx.py b/tests/test_html2docx.py
index 27c475e..3cef917 100644
--- a/tests/test_html2docx.py
+++ b/tests/test_html2docx.py
@@ -1,5 +1,4 @@
 import json
-import pathlib
 
 import docx
 import pytest
@@ -7,8 +6,7 @@
 
 from html2docx import html2docx
 
-TEST_DIR = pathlib.Path(__file__).parent.resolve(strict=True)
-PROJECT_DIR = TEST_DIR.parent
+from .utils import PROJECT_DIR, TEST_DIR
 
 FONT_ATTRS = ["bold", "italic", "strike", "subscript", "superscript", "underline"]
 
@@ -51,6 +49,7 @@ def test_html2docx(html_path, spec_path):
         assert len(p.runs) == len(runs_spec)
         for run, run_spec in zip(p.runs, runs_spec):
             assert run.text == run_spec.pop("text")
+            shapes_spec = run_spec.pop("shapes", None)
             unknown = set(run_spec).difference(FONT_ATTRS)
             assert not unknown, "Unknown attributes in {}: {}".format(
                 spec_rel_path, ", ".join(unknown)
@@ -58,3 +57,10 @@ def test_html2docx(html_path, spec_path):
             for attr in FONT_ATTRS:
                 msg = f"Wrong {attr} for text '{run.text}' in {html_rel_path}"
                 assert getattr(run.font, attr) is run_spec.get(attr), msg
+            if shapes_spec:
+                shapes = run.part.inline_shapes
+                assert len(shapes) == len(shapes_spec)
+                for shape, shape_spec in zip(shapes, shapes_spec):
+                    assert shape.type == shape_spec["type"]
+                    assert shape.width == shape_spec["width"]
+                    assert shape.height == shape_spec["height"]
diff --git a/tests/test_load_image.py b/tests/test_load_image.py
new file mode 100644
index 0000000..7212cab
--- /dev/null
+++ b/tests/test_load_image.py
@@ -0,0 +1,60 @@
+import urllib.error
+import urllib.request
+from unittest import mock
+
+from html2docx.image import load_image
+
+from .utils import PROJECT_DIR, TEST_DIR
+
+broken_image = PROJECT_DIR / "html2docx" / "image-broken.png"
+broken_image_bytes = broken_image.read_bytes()
+
+
+def test_basic(image_server):
+    image_data = load_image(image_server.base_url + "1x1.png")
+    expected = TEST_DIR / "data" / "1x1.png"
+    assert image_data.getbuffer() == expected.read_bytes()
+
+
+def test_non_image(image_server):
+    image_data = load_image(image_server.base_url)
+    assert image_data.getbuffer() == broken_image_bytes
+
+
+def test_bad_url():
+    image_data = load_image("bad")
+    assert image_data.getbuffer() == broken_image_bytes
+
+
+def test_transient_network_error_retries():
+    url = "https://transient.network.issue.com/image.png"
+    with mock.patch(
+        "html2docx.image.urllib.request.urlopen",
+        autospec=True,
+        side_effect=urllib.error.URLError(
+            reason="[Errno -2] Name or service not known"
+        ),
+    ) as url_mock:
+        with mock.patch("html2docx.image.time.sleep", autospec=True) as time_mock:
+            image_data = load_image(url)
+    assert time_mock.mock_calls == [mock.call(1)] * 2
+    assert url_mock.call_args_list == [mock.call(url)] * 3
+    assert image_data.getbuffer() == broken_image_bytes
+
+
+def test_404(image_server):
+    image_data = load_image(image_server.base_url + "nonexistent")
+    assert image_data.getbuffer() == broken_image_bytes
+    assert image_server.httpd.request_count == 1
+
+
+def test_bad_server(bad_server):
+    image_data = load_image(bad_server.base_url)
+    assert image_data.getbuffer() == broken_image_bytes
+    assert bad_server.httpd.request_count == 1
+
+
+def test_bad_content_length(bad_content_length_server):
+    image_data = load_image(bad_content_length_server.base_url)
+    assert image_data.getbuffer() == broken_image_bytes
+    assert bad_content_length_server.httpd.request_count == 1
diff --git a/tests/utils.py b/tests/utils.py
new file mode 100644
index 0000000..916f68e
--- /dev/null
+++ b/tests/utils.py
@@ -0,0 +1,4 @@
+import pathlib
+
+TEST_DIR = pathlib.Path(__file__).parent.resolve(strict=True)
+PROJECT_DIR = TEST_DIR.parent