Skip to content

Commit

Permalink
feat: use "diffoscope --diff-mask" to filter out stray entries
Browse files Browse the repository at this point in the history
Naïvely diffing the regenerated .mo files works unless a given catalog
has entries in the "fuzzy" and "obsolete" states (let's call them
"stale"), which will be lost in the round trip .mo → .po → .mo.  Rather
than insist that Weblate strip these entries upstream[1], here we build
a list of them in each catalog and use "diffoscope --diff-mask" to
ignore them, plus a little grepping to make sure only real diffs are
left.

[1]: WeblateOrg/weblate#3350
  • Loading branch information
cfm committed Sep 21, 2023
1 parent 911fd9b commit e61d670
Show file tree
Hide file tree
Showing 7 changed files with 164 additions and 62 deletions.
3 changes: 1 addition & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,6 @@ $(POT): securedrop_client

.PHONY: verify-mo
verify-mo: ## Verify that all gettext machine objects (.mo) are reproducible from their catalogs (.po).
@scripts/reproduce-mo.py "${LOCALE_DIR}/*"
@git diff --quiet "${LOCALE_DIR}/**/*.mo"
@scripts/verify-mo.py "${LOCALE_DIR}/*"
@# All good; now clean up.
@git restore "${LOCALE_DIR}/**/*.po"
11 changes: 11 additions & 0 deletions requirements/dev-bookworm-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,9 @@ defusedxml==0.7.1 \
--hash=sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69 \
--hash=sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61
# via semgrep
diffoscope==249 \
--hash=sha256:bc4d8cb3198025013784ef7e3fa61b7a642de39e5b790c45d7c29d153306fbdd
# via -r requirements/dev-sdw-requirements.in
easyprocess==1.1 \
--hash=sha256:82eed523a0a5eb12a81fa4eacd9f342caeb3f900eb4b798740e6696ad07e63f9 \
--hash=sha256:885898302a57aab948973e8b5d32a4229392b9fb2d986ab1d4ffd590e5ba90ec
Expand Down Expand Up @@ -291,6 +294,10 @@ jsonschema-specifications==2023.7.1 \
--hash=sha256:05adf340b659828a004220a9613be00fa3f223f2b82002e273dee62fd50524b1 \
--hash=sha256:c91a50404e88a1f6ba40636778e2ee08f6e24c5613fe4c53ac24578a5a7f72bb
# via jsonschema
libarchive-c==5.0 \
--hash=sha256:3ed7ee9b7d7d6fc200aecce63cee2084754cb6c00e946f6d007b80236e662bff \
--hash=sha256:d673f56673d87ec740d1a328fa205cafad1d60f5daca4685594deb039d32b159
# via diffoscope
lxml==4.9.3 \
--hash=sha256:05186a0f1346ae12553d66df1cfce6f251589fea3ad3da4f3ef4e34b2d58c6a3 \
--hash=sha256:075b731ddd9e7f68ad24c635374211376aa05a281673ede86cbe1d1b3455279d \
Expand Down Expand Up @@ -812,6 +819,10 @@ python-lsp-jsonrpc==1.0.0 \
--hash=sha256:079b143be64b0a378bdb21dff5e28a8c1393fe7e8a654ef068322d754e545fc7 \
--hash=sha256:7bec170733db628d3506ea3a5288ff76aa33c70215ed223abdb0d95e957660bd
# via semgrep
python-magic==0.4.27 \
--hash=sha256:c1ba14b08e4a5f5c31a302b7721239695b2f0f058d125bd5ce1ee36b9d9d3c3b \
--hash=sha256:c212960ad306f700aa0d01e5d7a325d20548ff97eb9920dcd29513174f0294d3
# via diffoscope
python3-xlib==0.15 \
--hash=sha256:dc4245f3ae4aa5949c1d112ee4723901ade37a96721ba9645f2bfa56e5b383f8
# via
Expand Down
11 changes: 11 additions & 0 deletions requirements/dev-bullseye-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,9 @@ defusedxml==0.7.1 \
--hash=sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69 \
--hash=sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61
# via semgrep
diffoscope==249 \
--hash=sha256:bc4d8cb3198025013784ef7e3fa61b7a642de39e5b790c45d7c29d153306fbdd
# via -r requirements/dev-sdw-requirements.in
easyprocess==1.1 \
--hash=sha256:82eed523a0a5eb12a81fa4eacd9f342caeb3f900eb4b798740e6696ad07e63f9 \
--hash=sha256:885898302a57aab948973e8b5d32a4229392b9fb2d986ab1d4ffd590e5ba90ec
Expand Down Expand Up @@ -291,6 +294,10 @@ jsonschema-specifications==2023.7.1 \
--hash=sha256:05adf340b659828a004220a9613be00fa3f223f2b82002e273dee62fd50524b1 \
--hash=sha256:c91a50404e88a1f6ba40636778e2ee08f6e24c5613fe4c53ac24578a5a7f72bb
# via jsonschema
libarchive-c==5.0 \
--hash=sha256:3ed7ee9b7d7d6fc200aecce63cee2084754cb6c00e946f6d007b80236e662bff \
--hash=sha256:d673f56673d87ec740d1a328fa205cafad1d60f5daca4685594deb039d32b159
# via diffoscope
lxml==4.9.3 \
--hash=sha256:05186a0f1346ae12553d66df1cfce6f251589fea3ad3da4f3ef4e34b2d58c6a3 \
--hash=sha256:075b731ddd9e7f68ad24c635374211376aa05a281673ede86cbe1d1b3455279d \
Expand Down Expand Up @@ -806,6 +813,10 @@ python-lsp-jsonrpc==1.0.0 \
--hash=sha256:079b143be64b0a378bdb21dff5e28a8c1393fe7e8a654ef068322d754e545fc7 \
--hash=sha256:7bec170733db628d3506ea3a5288ff76aa33c70215ed223abdb0d95e957660bd
# via semgrep
python-magic==0.4.27 \
--hash=sha256:c1ba14b08e4a5f5c31a302b7721239695b2f0f058d125bd5ce1ee36b9d9d3c3b \
--hash=sha256:c212960ad306f700aa0d01e5d7a325d20548ff97eb9920dcd29513174f0294d3
# via diffoscope
python3-xlib==0.15 \
--hash=sha256:dc4245f3ae4aa5949c1d112ee4723901ade37a96721ba9645f2bfa56e5b383f8
# via
Expand Down
1 change: 1 addition & 0 deletions requirements/dev-sdw-requirements.in
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
-r requirements.in
babel
black
diffoscope
flake8
flaky
isort
Expand Down
11 changes: 11 additions & 0 deletions requirements/dev-sdw-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,9 @@ defusedxml==0.7.1 \
--hash=sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69 \
--hash=sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61
# via semgrep
diffoscope==249 \
--hash=sha256:bc4d8cb3198025013784ef7e3fa61b7a642de39e5b790c45d7c29d153306fbdd
# via -r requirements/dev-sdw-requirements.in
easyprocess==1.1 \
--hash=sha256:82eed523a0a5eb12a81fa4eacd9f342caeb3f900eb4b798740e6696ad07e63f9 \
--hash=sha256:885898302a57aab948973e8b5d32a4229392b9fb2d986ab1d4ffd590e5ba90ec
Expand Down Expand Up @@ -291,6 +294,10 @@ jsonschema-specifications==2023.7.1 \
--hash=sha256:05adf340b659828a004220a9613be00fa3f223f2b82002e273dee62fd50524b1 \
--hash=sha256:c91a50404e88a1f6ba40636778e2ee08f6e24c5613fe4c53ac24578a5a7f72bb
# via jsonschema
libarchive-c==5.0 \
--hash=sha256:3ed7ee9b7d7d6fc200aecce63cee2084754cb6c00e946f6d007b80236e662bff \
--hash=sha256:d673f56673d87ec740d1a328fa205cafad1d60f5daca4685594deb039d32b159
# via diffoscope
lxml==4.9.3 \
--hash=sha256:05186a0f1346ae12553d66df1cfce6f251589fea3ad3da4f3ef4e34b2d58c6a3 \
--hash=sha256:075b731ddd9e7f68ad24c635374211376aa05a281673ede86cbe1d1b3455279d \
Expand Down Expand Up @@ -774,6 +781,10 @@ python-lsp-jsonrpc==1.0.0 \
--hash=sha256:079b143be64b0a378bdb21dff5e28a8c1393fe7e8a654ef068322d754e545fc7 \
--hash=sha256:7bec170733db628d3506ea3a5288ff76aa33c70215ed223abdb0d95e957660bd
# via semgrep
python-magic==0.4.27 \
--hash=sha256:c1ba14b08e4a5f5c31a302b7721239695b2f0f058d125bd5ce1ee36b9d9d3c3b \
--hash=sha256:c212960ad306f700aa0d01e5d7a325d20548ff97eb9920dcd29513174f0294d3
# via diffoscope
python3-xlib==0.15 \
--hash=sha256:dc4245f3ae4aa5949c1d112ee4723901ade37a96721ba9645f2bfa56e5b383f8
# via
Expand Down
60 changes: 0 additions & 60 deletions scripts/reproduce-mo.py

This file was deleted.

129 changes: 129 additions & 0 deletions scripts/verify-mo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
#!/usr/bin/env python3
"""
Verify the reproducibility of gettext machine objects (.mo) from catalogs
(.po).
Due to tool- and library-level idiosyncrasies, this happens in three stages:
1. Via polib: Overwrite metadata .mo → .po.
2. Via translate: Recompile the entire catalog .po → .mo.
3. Via diffoscope: Diff the new .mo against the old, heavily masked and
filtered to avoid false positives from stray entries in the "fuzzy"
and "obsolete" states.
In other words, the new .mo file should be identical (modulo stray entries) to
the original, meaning that the original .po/.mo pair differed only in their
metadata.
"""

import argparse
import os
import shlex
import subprocess
from collections.abc import Iterator
from pathlib import Path
from types import TracebackType
from typing import Optional, Set

import polib
from translate.tools.pocompile import convertmo

# Command-line interface.  The summary is passed as `description=`: the first
# positional parameter of ArgumentParser is `prog`, so passing the sentence
# positionally would make it the program name in the usage line instead.
parser = argparse.ArgumentParser(
    description="""Verify the reproducibility of gettext machine objects (.mo) from catalogs (.po)."""
)
parser.add_argument(
    "locale",
    nargs="+",
    help="""one or more locale directories, each of which must contain an "LC_MESSAGES" directory""",
)
parser.add_argument(
    "--domain", default="messages", help="""the gettext domain to load (defaults to "messages")"""
)
# Parsed at module level; the driver loop at the bottom of the script reads
# `args.locale` and `args.domain`.
args = parser.parse_args()


class CatalogVerifier:
    """Wrapper class for proving .mo → .po → .mo reproducibility.

    Use it as a context manager: entering picks the path of the regenerated
    .mo file, and exiting deletes that file again.
    """

    def __init__(self, path: Path, domain: str):
        """Load the .po/.mo pair named after `domain` from `path`/LC_MESSAGES."""
        self.path = path
        self.po = polib.pofile(str(path / "LC_MESSAGES" / f"{domain}.po"))
        self.mo = polib.mofile(str(path / "LC_MESSAGES" / f"{domain}.mo"))

    def __enter__(self) -> "CatalogVerifier":
        """Choose where the regenerated .mo file will be written ("<original>.new")."""
        self.mo_target = Path(f"{self.mo.fpath}.new")
        return self

    def __exit__(
        self,
        exc_type: Optional[type[BaseException]],
        exc_value: Optional[BaseException],
        traceback: Optional[TracebackType],
    ) -> None:
        """Delete the regenerated .mo file; exceptions are not suppressed."""
        # missing_ok: the file does not exist if reproduce() never ran.
        self.mo_target.unlink(missing_ok=True)

    @property
    def diffoscope_args(self) -> Iterator[str]:
        """Yield the pieces of a shell pipeline that diffs the old and new .mo files.

        The pipeline masks empty lines and every stray line, drops the masked
        lines, and finally keeps only added/removed msgid/msgstr lines.
        """
        # The joined string is executed by a shell (see verify()), so the paths
        # must be quoted just like the masks are; otherwise a directory name
        # containing spaces or shell metacharacters would split or be expanded.
        old_mo = shlex.quote(str(self.mo.fpath))
        new_mo = shlex.quote(str(self.mo_target))
        yield f"diffoscope {old_mo} {new_mo}"
        yield "--diff-mask '^$'"  # tell diffoscope to mask empty lines
        for stray in self.strays:
            yield f"--diff-mask {shlex.quote(stray)}"  # tell diffoscope to mask strays
        yield "| grep -Fv '[masked]'"  # ignore things we've masked
        # Raw string: "\+" is not a valid Python escape sequence.
        yield r"| grep -E '│ (-|\+)msg(id|str)'"  # ignore context; we only care about real diffs

    @property
    def diffoscope_cmd(self) -> str:
        """Return `diffoscope_args` joined with spaces into one shell command."""
        return " ".join(self.diffoscope_args)

    @property
    def strays(self) -> Set[str]:
        """Return the set of stray (fuzzy or obsolete) lines to mask when diffing this catalog.

        Each element is one line of a fuzzy or obsolete entry, stripped of its
        marker and prefixed with "^" so that it is anchored at the line start
        when used as a `--diff-mask` pattern.
        """
        # NOTE(review): the lines are used as patterns without escaping; an
        # entry containing regex metacharacters may not mask as intended.
        fuzzy = {
            f"^{line.replace('#| ', '')}"  # strip fuzzy marker
            for e in self.po.fuzzy_entries()
            for line in str(e).splitlines()
        }
        obsolete = {
            f"^{line.replace('#~ ', '')}"  # strip obsolete marker
            for e in self.po.obsolete_entries()
            for line in str(e).splitlines()
        }

        return fuzzy | obsolete

    def reproduce(self) -> None:
        """Overwrite metadata .mo → .po.  Then rewrite the entire file .po → .mo.

        The .po file on disk is modified in place; the recompiled catalog is
        written to `self.mo_target`, so this must run inside the `with` block.
        """
        self.po.metadata = self.mo.metadata
        self.po.save(self.po.fpath)

        with open(self.mo_target, "wb") as mo_target:
            # NOTE(review): the third argument is passed as an empty string;
            # presumably an unused template input — confirm against translate.
            convertmo(self.po.fpath, mo_target, "")

    def verify(self) -> None:
        """Run diffoscope for this catalog and error if there's any unmasked diff.

        Raises:
            Exception: carrying the filtered diff, if the pipeline printed
                anything to stdout.
        """
        # NOTE(review): only stdout is inspected.  The pipeline's exit status
        # and stderr are ignored, so a diffoscope that fails to start produces
        # empty stdout and looks the same as a clean diff — confirm whether
        # that is acceptable.
        result = subprocess.run(  # nosemgrep: python.lang.security.audit.subprocess-shell-true.subprocess-shell-true
            self.diffoscope_cmd,
            capture_output=True,
            env=os.environ,
            shell=True,
        )
        print(f"--> Verifying {self.path}: {result.args}")
        if len(result.stdout) > 0:
            raise Exception(result.stdout.decode("utf-8"))


# Driver: reproduce and verify each locale directory named on the command
# line, skipping any argument that does not resolve to a directory.
print(f"--> Reproducing {len(args.locale)} path(s)")
for locale_dir in (Path(raw).resolve() for raw in args.locale):
    if locale_dir.is_dir():
        # The context manager removes the temporary ".new" file afterwards,
        # whether or not verification raised.
        with CatalogVerifier(locale_dir, args.domain) as catalog:
            catalog.reproduce()
            catalog.verify()
    else:
        print(f'--> Skipping "{locale_dir}"')

0 comments on commit e61d670

Please sign in to comment.