Stream pages in containers (merge isolation providers)

Merge the Qubes and Containers isolation providers into a superclass called "ProcessBasedIsolationProviders" by streaming pages in containers as well, exclusively in the first conversion process. The commit is rather large due to the multiple interdependencies of the code, which make it difficult to split into several commits.

The main conversion method (_convert), now in the superclass, simply calls two methods:
- doc_to_pixels()
- pixels_to_pdf()

Critically, doc_to_pixels is implemented in the superclass, diverging only in a specialized method called "get_doc_to_pixels_proc()". This method obtains the process that communicates with the isolation provider (container / disp VM): via `podman/docker` on Containers and via qrexec on Qubes.

Known regressions:
- progress reports stopped working on containers

Fixes #443
freedomofpress · Nov 22, 2023 · 1370f26 · 1370f26
1 parent f547d05
commit 1370f26
Show file tree

Hide file tree

Showing 11 changed files with 293 additions and 429 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -54,10 +54,5 @@ COPY conversion /opt/dangerzone/dangerzone/conversion
 RUN adduser -s /bin/sh -D dangerzone
 USER dangerzone
 
-# /tmp/input_file is where the first convert expects the input file to be, and
-# /tmp where it will write the pixel files
-#
-# /dangerzone is where the second script expects files to be put by the first one
-#
-# /safezone is where the wrapper eventually moves the sanitized files.
-VOLUME /dangerzone /tmp/input_file /safezone
+# /safezone is a directory through which Pixels to PDF receives files
+VOLUME /safezone
diff --git a/dangerzone/conversion/common.py b/dangerzone/conversion/common.py
@@ -10,7 +10,7 @@
 import sys
 import time
 from abc import abstractmethod
-from typing import Callable, Dict, List, Optional, Tuple, Union
+from typing import Callable, Dict, List, Optional, TextIO, Tuple, Union
 
 TIMEOUT_PER_PAGE: float = 30  # (seconds)
 TIMEOUT_PER_MB: float = 30  # (seconds)
@@ -57,6 +57,49 @@ def __init__(self, progress_callback: Optional[Callable] = None) -> None:
         self.progress_callback = progress_callback
         self.captured_output: bytes = b""
 
+    @classmethod
+    def _read_bytes(cls) -> bytes:
+        """Read bytes from the stdin."""
+        data = sys.stdin.buffer.read()
+        if data is None:
+            raise EOFError
+        return data
+
+    @classmethod
+    def _write_bytes(cls, data: bytes, file: TextIO = sys.stdout) -> None:
+        file.buffer.write(data)
+
+    @classmethod
+    def _write_text(cls, text: str, file: TextIO = sys.stdout) -> None:
+        cls._write_bytes(text.encode(), file=file)
+
+    @classmethod
+    def _write_int(cls, num: int, file: TextIO = sys.stdout) -> None:
+        # Write `num` as a 2-byte unsigned big-endian integer. The byte order is
+        # passed explicitly because it is a required argument before Python 3.11.
+        cls._write_bytes(num.to_bytes(2, "big", signed=False), file=file)
+
+    # ==== ASYNC METHODS ====
+    # We run sync methods in async wrappers, because pure async methods are more difficult:
+    # https://stackoverflow.com/a/52702646
+    #
+    # In practice, because they are I/O bound and we don't have many running concurrently,
+    # they shouldn't cause a problem.
+
+    @classmethod
+    async def read_bytes(cls) -> bytes:
+        return await asyncio.to_thread(cls._read_bytes)
+
+    @classmethod
+    async def write_bytes(cls, data: bytes, file: TextIO = sys.stdout) -> None:
+        return await asyncio.to_thread(cls._write_bytes, data, file=file)
+
+    @classmethod
+    async def write_text(cls, text: str, file: TextIO = sys.stdout) -> None:
+        return await asyncio.to_thread(cls._write_text, text, file=file)
+
+    @classmethod
+    async def write_int(cls, num: int, file: TextIO = sys.stdout) -> None:
+        return await asyncio.to_thread(cls._write_int, num, file=file)
+
     async def read_stream(
         self, sr: asyncio.StreamReader, callback: Optional[Callable] = None
     ) -> bytes:
@@ -149,13 +192,4 @@ async def convert(self) -> None:
         pass
 
     def update_progress(self, text: str, *, error: bool = False) -> None:
-        if running_on_qubes():
-            if self.progress_callback:
-                self.progress_callback(error, text, int(self.percentage))
-        else:
-            print(
-                json.dumps(
-                    {"error": error, "text": text, "percentage": int(self.percentage)}
-                )
-            )
-            sys.stdout.flush()
+        pass
diff --git a/dangerzone/conversion/doc_to_pixels.py b/dangerzone/conversion/doc_to_pixels.py
@@ -14,7 +14,7 @@
 import re
 import shutil
 import sys
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, TextIO
 
 import fitz
 import magic
@@ -24,26 +24,17 @@
 
 
 class DocumentToPixels(DangerzoneConverter):
-    # XXX: These functions write page data and metadata to a separate file. For now,
-    # they act as an anchor point for Qubes to stream back page data/metadata in
-    # real time. In the future, they will be completely replaced by their streaming
-    # counterparts. See:
-    #
-    # https://github.com/freedomofpress/dangerzone/issues/443
     async def write_page_count(self, count: int) -> None:
-        pass
+        return await self.write_int(count)
 
     async def write_page_width(self, width: int, filename: str) -> None:
-        with open(filename, "w") as f:
-            f.write(str(width))
+        return await self.write_int(width)
 
     async def write_page_height(self, height: int, filename: str) -> None:
-        with open(filename, "w") as f:
-            f.write(str(height))
+        return await self.write_int(height)
 
     async def write_page_data(self, data: bytes, filename: str) -> None:
-        with open(filename, "wb") as f:
-            f.write(data)
+        return await self.write_bytes(data)
 
     async def convert(self) -> None:
         conversions: Dict[str, Dict[str, Optional[str]]] = {
@@ -259,20 +250,6 @@ async def convert(self) -> None:
             await self.write_page_height(pix.height, height_filename)
             await self.write_page_data(rgb_buf, rgb_filename)
 
-        final_files = (
-            glob.glob("/tmp/page-*.rgb")
-            + glob.glob("/tmp/page-*.width")
-            + glob.glob("/tmp/page-*.height")
-        )
-
-        # XXX: Sanity check to avoid situations like #560.
-        if not running_on_qubes() and len(final_files) != 3 * doc.page_count:
-            raise errors.PageCountMismatch()
-
-        # Move converted files into /tmp/dangerzone
-        for filename in final_files:
-            shutil.move(filename, "/tmp/dangerzone")
-
         self.update_progress("Converted document to pixels")
 
     async def install_libreoffice_ext(self, libreoffice_ext: str) -> None:
@@ -304,18 +281,28 @@ def detect_mime_type(self, path: str) -> str:
         return mime_type
 
 
-async def main() -> int:
-    converter = DocumentToPixels()
+async def main() -> None:
+    try:
+        data = await DocumentToPixels.read_bytes()
+    except EOFError:
+        sys.exit(1)
+
+    with open("/tmp/input_file", "wb") as f:
+        f.write(data)
 
     try:
+        converter = DocumentToPixels()
         await converter.convert()
-        error_code = 0  # Success!
-    except errors.ConversionException as e:  # Expected Errors
-        error_code = e.error_code
+    except errors.ConversionException as e:
+        await DocumentToPixels.write_bytes(str(e).encode(), file=sys.stderr)
+        sys.exit(e.error_code)
     except Exception as e:
-        converter.update_progress(str(e), error=True)
+        await DocumentToPixels.write_bytes(str(e).encode(), file=sys.stderr)
         error_code = errors.UnexpectedConversionError.error_code
-    return error_code
+        sys.exit(error_code)
+
+    # Write debug information
+    await DocumentToPixels.write_bytes(converter.captured_output, file=sys.stderr)
 
 
 if __name__ == "__main__":

diff --git a/dangerzone/conversion/doc_to_pixels_qubes_wrapper.py b/dangerzone/conversion/doc_to_pixels_qubes_wrapper.py
diff --git a/dangerzone/conversion/pixels_to_pdf.py b/dangerzone/conversion/pixels_to_pdf.py
@@ -24,17 +24,17 @@ async def convert(
     ) -> None:
         self.percentage = 50.0
         if tempdir is None:
-            tempdir = "/tmp"
+            tempdir = "/safezone"
 
-        num_pages = len(glob.glob(f"{tempdir}/dangerzone/page-*.rgb"))
+        num_pages = len(glob.glob(f"{tempdir}/pixels/page-*.rgb"))
         total_size = 0.0
 
         safe_doc = fitz.Document()
 
         # Convert RGB files to PDF files
         percentage_per_page = 45.0 / num_pages
         for page_num in range(1, num_pages + 1):
-            filename_base = f"{tempdir}/dangerzone/page-{page_num}"
+            filename_base = f"{tempdir}/pixels/page-{page_num}"
             rgb_filename = f"{filename_base}.rgb"
             width_filename = f"{filename_base}.width"
             height_filename = f"{filename_base}.height"
@@ -96,6 +96,18 @@ async def convert(
 
         safe_doc.save(safe_pdf_path, deflate_images=True)
 
+    def update_progress(self, text: str, *, error: bool = False) -> None:
+        if running_on_qubes():
+            if self.progress_callback:
+                self.progress_callback(error, text, int(self.percentage))
+        else:
+            print(
+                json.dumps(
+                    {"error": error, "text": text, "percentage": int(self.percentage)}
+                )
+            )
+            sys.stdout.flush()
+
 
 async def main() -> int:
     ocr_lang = os.environ.get("OCR_LANGUAGE") if os.environ.get("OCR") == "1" else None