From 36a0c74d14ef891a72f1c39d9c83bb4e36ab35ba Mon Sep 17 00:00:00 2001 From: mara004 Date: Mon, 27 Nov 2023 07:59:57 +0100 Subject: [PATCH] Allow transferring binary data from JS to Python without serialization (CC #67) (#103) * Transfer blobs without serialization (CC #67) * nits * rework print * style improvements, add debug() calls * example/pdfjs: allow URL input It sometimes works, sometimes not. See below for examples: Working: https://cinelerra-gg.org/download/CinelerraGG_Manual.pdf https://pikepdf.readthedocs.io/_/downloads/en/latest/pdf/ Failing: https://www.gemeinde-grindelwald.ch/wp-content/uploads/2020/06/Broschuere_Landwirtschaft_und_Tourismus_Franzoesisch.pdf -> getHexString reports invalid characters. File renders when downloaded, though. Further, the document below produces an out of memory error, but also when rendering locally https://bro.jrtag.ch/Gri/so_guide/de_en/files/downloads/Infoguide_Sommer_23-WEB.pdf * Add test case for surrounding newlines specifically * further polish test cases * make blob debug output less confusing * test: fix wrong timing endpoint * prefer consistent spelling (com vs comm) * nit: use consistent test message * test nits - match test name letter case - add print and 2 more test values - use list comp --- README.md | 33 ++++++------- examples/python/pdfjs.py | 68 +++++++++++++++++++++++++++ src/javascript/connection.py | 82 ++++++++++++++++++++++----------- src/javascript/js/bridge.js | 12 ++++- src/javascript/proxy.py | 21 ++++++--- test/javascript/test_general.py | 57 +++++++++++++++++++++-- 6 files changed, 219 insertions(+), 54 deletions(-) create mode 100644 examples/python/pdfjs.py diff --git a/README.md b/README.md index a3cab1c..f9195d9 100644 --- a/README.md +++ b/README.md @@ -301,24 +301,25 @@ for await (const file of files) { ## Details -* When doing a function call, any foreign objects will be sent to you as a reference. 
For example, - if you're in JavaScript and do a function call to Python that returns an array, you won't get a - JS array back, but you will get a reference to the Python array. You can still access the array - normally with the [] notation, as long as you use await. If you would like the bridge to turn - the foreign refrence to something native, you can request a primitive value by calling `.valueOf()` - on the Python array. This would give you a JS array. It works the same the other way around. -* The above behavior makes it very fast to pipe data from one function onto another, avoiding costly - conversions. -* This above behavior is not present for callbacks and function parameters. The bridge will try to - serialize what it can, and will give you a foreign reference if it's unable to serialize something. - So if you pass a JS object, you'll get a Python dict, but if the dict contains something like a class, - you'll get a reference in its place. - -#### Notable details + +* When doing a function call, any returned foreign objects will be sent to you as a reference. For example, if you're in JavaScript and do a function call to Python that returns an array, you won't get a JS array back, but you will get a reference to the Python array. You can still access the array normally with the [] notation, as long as you use await. + +* This behavior makes it very fast to pass objects directly between same-language functions, avoiding costly cross-language data transfers. + +* However, this does not apply with callbacks or non-native function input parameters. The bridge will try to serialize what it can, and will give you a foreign reference if it's unable to serialize something. So if you pass a JS object, you'll get a Python dict, but if the dict contains something like a class, you'll get a reference in its place. 
+ +* If you would like the bridge to turn a foreign reference to something native, you can use `.valueOf()` to transfer an object via JSON serialization, or `.blobValueOf()` to write an object into the communication pipe directly. + - `.valueOf()` can be used on any JSON-serializable object, but may be very slow for big data. + - `.blobValueOf()` can be used on any pipe-writeable object implementing the `length` property (e.g. `Buffer`). It can be massively faster by circumventing the JSON+UTF8 encode/decode layer, which is inefficient for large byte arrays. + +* You can use custom Node.js/Python binary paths by setting the `NODE_BIN` or `PYTHON_BIN` environment variables before importing the library. Otherwise, the `node` and `python3` or `python` binaries will be called relative to your PATH environment variable. + +* The inter-process communication can be inspected by setting the `DEBUG` env var to `jspybridge`. + +#### Limitations * The `ffid` keyword is reserved. You cannot use it in variable names, object keys or values as this is used to internlly track objects. -* On the bridge to call JavaScript from Python, due to the limiatations of Python and cross-platform IPC, we currently communicate over standard error which means that JSON output in JS standard error can interfere with the bridge. The same issue exists on Windows with python. You are however very unlikely to have issues with this. -* You can set the Node.js/Python binary paths by setting the `NODE_BIN` or `PYTHON_BIN` enviornment variables before importing the library. Otherwise, the `node` and `python3` or `python` binaries will be called relative to your PATH enviornment variable. +* On the bridge to call JavaScript from Python, due to the limitations of Python and cross-platform IPC, we currently communicate over standard error which means that specific output in JS standard error can interfere with the bridge (as of this writing, the prefixes `{"r"` and `blob!` are reserved). 
A similar issue exists on Windows with Python. You are however very unlikely to have issues with this. * Function calls will timeout after 100000 ms and throw a `BridgeException` error. That default value can be overridden by defining the new value of `REQ_TIMEOUT` in an environment variable. diff --git a/examples/python/pdfjs.py b/examples/python/pdfjs.py new file mode 100644 index 0000000..bdd5779 --- /dev/null +++ b/examples/python/pdfjs.py @@ -0,0 +1,68 @@ +# SPDX-FileCopyrightText: 2023 mara004 aka geisserml +# SPDX-License-Identifier: CC-BY-4.0 OR Apache-2.0 + +# See also https://gist.github.com/mara004/87276da4f8be31c80c38036c6ab667d7 + +# Py-Depends: Pillow, JsPyBridge itself +# Js-Depends: pdfjs-dist, canvas +# Use `python -m pip install` and `python -m javascript --install` + +import argparse +from pathlib import Path +import PIL.Image +import javascript + +# NOTE canvas must be the build pdfjs is linked against, otherwise it'll fail with type error +pdfjs = javascript.require("pdfjs-dist") +libcanvas = javascript.require("canvas") + + +def render_pdf(input, outdir, scale): + + pdf = pdfjs.getDocument(input).promise + n_pages = pdf.numPages + n_digits = len(str(n_pages)) + + for i in range(1, n_pages+1): + + page = pdf.getPage(i) + viewport = page.getViewport({"scale": scale}) + w, h = int(viewport.width), int(viewport.height) + + canvas = libcanvas.createCanvas(w, h) + context = canvas.getContext("2d") + page.render({"canvasContext": context, "viewport": viewport}).promise + + # note that blobValueOf() is much faster than valueOf()["data"] for large byte buffers + js_buffer = canvas.toBuffer("raw") + py_buffer = js_buffer.blobValueOf() + + pil_image = PIL.Image.frombuffer("RGBX", (w, h), py_buffer, "raw", "BGRX", 0, 1) + pil_image.save(outdir / f"out_{i:0{n_digits}d}.jpg") + + pdf.destroy() + + +def main(): + + parser = argparse.ArgumentParser( + description="Render a PDF file with Mozilla pdf.js via JsPyBridge.\n" + + "Known issues: - URL support is 
buggy; - certain PDFs may hit memory limits.", + ) + path_type = lambda p: Path(p).expanduser().resolve() + input_type = lambda p: p if p.startswith("http") else str(path_type(p)) + parser.add_argument( + "input", type=input_type, + help="Input file path or URL.", + ) + parser.add_argument("--outdir", "-o", type=path_type) + parser.add_argument("--scale", type=float, default=4) + + args = parser.parse_args() + if not args.outdir.exists(): + args.outdir.mkdir(parents=True, exist_ok=True) + + render_pdf(args.input, args.outdir, scale=args.scale) + + +main() diff --git a/src/javascript/connection.py b/src/javascript/connection.py index ff3554e..57bab91 100644 --- a/src/javascript/connection.py +++ b/src/javascript/connection.py @@ -59,23 +59,44 @@ def supports_color(): proc = com_thread = stdout_thread = None -def read_stderr(stderrs): - ret = [] - for stderr in stderrs: - inp = stderr.decode("utf-8") - for line in inp.split("\n"): - if not len(line): - continue - if not line.startswith('{"r"'): - print("[JSE]", line) - continue - try: - d = json.loads(line) - debug("[js -> py]", int(time.time() * 1000), line) - ret.append(d) - except ValueError as e: - print("[JSE]", line) - return ret +def readComItem(stream): + + line = stream.readline() + if not line: + return + + if line.startswith(b"blob!"): + + _, d, blob = line.split(b"!", maxsplit=2) + d = json.loads(d.decode("utf-8")) + + # blobs may contain any value, including b"\n", so we track length and fetch possible remaining data + # note that either initial_len or fetch_len will include space for a trailing \n + target_len = d.pop("len") + initial_len = len(blob) + fetch_len = (target_len - initial_len) + 1 + debug(f"[js -> py] blob r:{d['r']}: target_len {target_len}, initial_len {initial_len}, fetch_len {fetch_len}") + if fetch_len > 0: + blob += stream.read(fetch_len) + + # must end with \n (added by bridge) to separate the next IPC call, which will be received via .readline() + assert blob.endswith(b"\n") + 
d["blob"] = blob[:-1] + assert len(d["blob"]) == target_len + debug(f"[js -> py] blob r:{d['r']}: {d['blob'][:20]} ... (truncated)") + + return d + + line = line.decode("utf-8") + if not line.startswith('{"r"'): + print("[JSE]", line) + return + try: + d = json.loads(line) + debug("[js -> py]", int(time.time() * 1000), line) + return d + except ValueError as e: + print("[JSE]", line) sendQ = [] @@ -100,14 +121,15 @@ def writeAll(objs): break -stderr_lines = [] +com_items = [] # Reads from the socket, in this case it's standard error. Returns an array -# of responses from the server. +# of parsed responses from the server. def readAll(): - ret = read_stderr(stderr_lines) - stderr_lines.clear() - return ret + global com_items + capture = com_items + com_items = [] + return capture def com_io(): @@ -139,21 +161,25 @@ def com_io(): for send in sendQ: proc.stdin.write(send) proc.stdin.flush() - + + # FIXME untested if notebook: stdout_thread = threading.Thread(target=stdout_read, args=(), daemon=True) stdout_thread.start() - while proc.poll() == None: - stderr_lines.append(proc.stderr.readline()) - if config.event_loop != None: - config.event_loop.queue.put("stdin") + while proc.poll() is None: + item = readComItem(proc.stderr) + if item: + com_items.append(item) + if config.event_loop != None: + config.event_loop.queue.put("stdin") stop() +# FIXME untested def stdout_read(): while proc.poll() is None: - print(proc.stdout.readline().decode("utf-8")) + os.write(sys.stdout.fileno(), proc.stdout.readline()) def start(): diff --git a/src/javascript/js/bridge.js b/src/javascript/js/bridge.js index 998da39..f99371c 100644 --- a/src/javascript/js/bridge.js +++ b/src/javascript/js/bridge.js @@ -174,7 +174,12 @@ class Bridge { const v = await this.m[ffid] this.ipc.send({ r, val: v.valueOf() }) } - + + async blob (r, ffid) { + const v = await this.m[ffid] + this.ipc.sendBlob(v, r) + } + async keys (r, ffid) { const v = await this.m[ffid] const keys = Object.getOwnPropertyNames(v) 
@@ -252,6 +257,11 @@ const ipc = { debug('js -> py', data) process.stderr.write(JSON.stringify(data) + '\n') }, + sendBlob: (data, r) => { + process.stderr.write('blob!{"r":'+r+',"len":'+data.length+'}!') + process.stderr.write(data) + process.stderr.write('\n') + }, writeRaw: (data, r, cb) => { debug('js -> py', data) handlers[r] = cb diff --git a/src/javascript/proxy.py b/src/javascript/proxy.py index 21d8054..8eaa7c4 100644 --- a/src/javascript/proxy.py +++ b/src/javascript/proxy.py @@ -15,21 +15,26 @@ def __init__(self, loop): self.bridge = self.loop.pyi def ipc(self, action, ffid, attr, args=None): + # NOTE The actions here translate to function calls in bridge.js self.i += 1 r = self.i # unique request ts, acts as ID for response l = None # the lock if action == "get": # return obj[prop] l = self.queue(r, {"r": r, "action": "get", "ffid": ffid, "key": attr}) - if action == "init": # return new obj[prop] + elif action == "init": # return new obj[prop] l = self.queue(r, {"r": r, "action": "init", "ffid": ffid, "key": attr, "args": args}) - if action == "inspect": # return require('util').inspect(obj[prop]) + elif action == "inspect": # return require('util').inspect(obj[prop]) l = self.queue(r, {"r": r, "action": "inspect", "ffid": ffid, "key": attr}) - if action == "serialize": # return JSON.stringify(obj[prop]) + elif action == "serialize": # return JSON.stringify(obj[prop]) l = self.queue(r, {"r": r, "action": "serialize", "ffid": ffid}) - if action == "set": + elif action == "blob": + l = self.queue(r, {"r": r, "action": "blob", "ffid": ffid}) + elif action == "set": l = self.queue(r, {"r": r, "action": "set", "ffid": ffid, "key": attr, "args": args}) - if action == "keys": + elif action == "keys": l = self.queue(r, {"r": r, "action": "keys", "ffid": ffid}) + else: + assert False, f"Unhandled action '{action}'" if not l.wait(10): if not config.event_thread: @@ -265,7 +270,11 @@ def __contains__(self, key): def valueOf(self): ser = self._exe.ipc("serialize", 
self.ffid, "") return ser["val"] - + + def blobValueOf(self): + blob = self._exe.ipc("blob", self.ffid, "") + return blob["blob"] + def __str__(self): return self._exe.inspect(self.ffid, "str") diff --git a/test/javascript/test_general.py b/test/javascript/test_general.py index 720daf4..cf226c3 100644 --- a/test/javascript/test_general.py +++ b/test/javascript/test_general.py @@ -1,4 +1,6 @@ -from javascript import require, console, On, Once, off, once, eval_js +import time +from pathlib import Path +from javascript import require, console, On, Once, off, once, eval_js, globalThis def assertEquals(cond, val): assert cond == val @@ -53,7 +55,6 @@ def handler(this, fn, num, obj): def onceIncrement(this, *args): print("Hey, I'm only called once !") - demo.increment() def test_arrays(): @@ -79,6 +80,54 @@ def test_valueOf(): assert a[2] == 3 print("Array", demo.arr.valueOf()) + +def test_blobValueOf_generalValue(): + + # use this file itself as test data for simplicity + fs = require("fs") + FILE = Path(__file__).resolve() + js_buffer = fs.readFileSync(str(FILE), {"encoding": None}) + + t_start = time.time() + blob_value = js_buffer.blobValueOf() + t_blob = time.time() - t_start + assert isinstance(blob_value, bytes) + assert b"\n" in blob_value + + t_start = time.time() + json_value = js_buffer.valueOf() + t_json = time.time() - t_start + assert json_value["type"] == "Buffer" + assert isinstance(json_value["data"], list) + + # confirm both transfer strategies return the same data, and transferred data matches with natively reproduced data + native_value = FILE.read_bytes() + assert blob_value == bytes(json_value["data"]) == native_value + + # don't actually assert to avoid time dependent test case + # note, the performance difference is much more pronounced for bigger values (see examples/pdfjs.py) + print(f"blobValueOf() faster? 
{t_blob < t_json} (t_blob: {t_blob}, t_json {t_json})") + + +def test_blobValueOf_specificValues(): + test_values = [ + "Value without newline", + "Value with \nembedded\n newlines", + "\nValue with single enclosing newlines\n", + "\n\nValue with double enclosing newlines\n\n", + # test an empty string and various amounts of newlines only + "", *["\n"*c for c in (1, 2, 3, 10)] + ] + for val in test_values: + print(f"blobValueOf() {val!r}") + # 'from' is a reserved keyword in python, so use dict getitem as a workaround + js_buffer = globalThis.Buffer["from"](val, "utf-8") + blob_value = js_buffer.blobValueOf() + json_value = js_buffer.valueOf() + assert json_value["type"] == "Buffer" + assert blob_value == bytes(json_value["data"]) == bytes(val, "utf-8") + + def test_once(): demo.wait() once(demo, "done") @@ -122,8 +171,10 @@ def test_nullFromJsReturnsNone(): test_arrays() test_errors() test_valueOf() +test_blobValueOf_generalValue() +test_blobValueOf_specificValues() test_once() test_assignment() test_eval() test_bigint() -test_nullFromJsReturnsNone() \ No newline at end of file +test_nullFromJsReturnsNone()