diff --git a/build/python/backend/requirements.txt b/build/python/backend/requirements.txt
index 71c69612..0013760e 100644
--- a/build/python/backend/requirements.txt
+++ b/build/python/backend/requirements.txt
@@ -33,6 +33,8 @@ opentelemetry-exporter-otlp-proto-grpc==1.15.0
 opentelemetry-exporter-otlp-proto-http==1.15.0
 opentelemetry-sdk==1.15.0
 pefile==2022.5.30
+pillow-avif-plugin==1.3.1
+pillow-heif==0.9.3
 pgpdump3==1.5.2
 pre-commit==3.0.1
 py-tlsh==4.7.2
diff --git a/configs/python/backend/backend.yaml b/configs/python/backend/backend.yaml
index 46488306..ca7e05a6 100644
--- a/configs/python/backend/backend.yaml
+++ b/configs/python/backend/backend.yaml
@@ -57,6 +57,9 @@ scanners:
         flavors:
           - 'image/x-ms-bmp'
           - 'bmp_file'
+      negative:
+        source:
+          - 'ScanTranscode'
       priority: 5
   'ScanBzip2':
     - positive:
@@ -212,6 +215,9 @@ scanners:
           - 'application/x-shockwave-flash'
           - 'fws_file'
           - 'image/webp'
+          - 'image/avif'
+          - 'image/heic'
+          - 'image/heif'
       priority: 5
       options:
         keys:
@@ -329,6 +335,9 @@ scanners:
         flavors:
           - 'image/jpeg'
           - 'jpeg_file'
+      negative:
+        source:
+          - 'ScanTranscode'
       priority: 5
   'ScanJson':
     - positive:
@@ -366,6 +375,9 @@ scanners:
           - 'image/x-ms-bmp'
           - 'bmp_file'
           - 'image/webp'
+      negative:
+        source:
+          - 'ScanTranscode'
       priority: 5
   'ScanLzma':
     - positive:
@@ -431,6 +443,9 @@ scanners:
           - 'image/x-ms-bmp'
           - 'bmp_file'
           - 'image/webp'
+      negative:
+        source:
+          - 'ScanTranscode'
       priority: 5
   'ScanOcr':
     - positive:
@@ -535,6 +550,9 @@ scanners:
         flavors:
           - 'image/png'
           - 'png_file'
+      negative:
+        source:
+          - 'ScanTranscode'
       priority: 5
   'ScanQr':
     - positive:
@@ -621,6 +639,15 @@ scanners:
           - 'application/vnd.ms-tnef'
           - 'tnef_file'
       priority: 5
+  'ScanTranscode':
+    - positive:
+        flavors:
+          - 'image/avif'
+          - 'image/heic'
+          - 'image/heif'
+      priority: 5
+      options:
+        output_format: jpeg # "gif", "webp", "jpeg", "bmp", "png", "tiff"
   'ScanUpx':
     - positive:
         flavors:
diff --git a/docs/README.md b/docs/README.md
index 673bae15..c021911b 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -652,6 +652,7 @@ The table below describes each scanner and its options. Each scanner has the hid
 | ScanTar | Extract files from tar archives | `limit` -- maximum number of files to extract (defaults to `1000`) |
 | ScanTlsh | Scans and compares a file's TLSH hash with a list of TLSH hashes | "location" -- location of the TLSH rules file or directory (defaults to "/etc/tlsh/")<br>"score" -- Score comparison threshold for matches (lower = closer match) |
 | ScanTnef | Collects metadata and extract files from TNEF files | N/A |
+| ScanTranscode | Converts uncommon image formats to a widely supported format (JPEG by default) to ease support in other scanners | `output_format` -- one of `gif`, `webp`, `jpeg`, `bmp`, `png`, `tiff` (defaults to `jpeg`) | [Ryan O'Horo](https://github.com/ryanohoro) |
 | ScanUpx | Decompresses UPX packed files | `tempfile_directory` -- location where `tempfile` will write temporary files (defaults to `/tmp/`) |
 | ScanUrl | Collects URLs from files | `regex` -- dictionary entry that establishes the regular expression pattern used for URL parsing (defaults to a widely scoped regex) |
 | ScanVb | Collects metadata from Visual Basic script files | N/A |
diff --git a/src/python/requirements.txt b/src/python/requirements.txt
index 71c69612..0013760e 100644
--- a/src/python/requirements.txt
+++ b/src/python/requirements.txt
@@ -33,6 +33,8 @@ opentelemetry-exporter-otlp-proto-grpc==1.15.0
 opentelemetry-exporter-otlp-proto-http==1.15.0
 opentelemetry-sdk==1.15.0
 pefile==2022.5.30
+pillow-avif-plugin==1.3.1
+pillow-heif==0.9.3
 pgpdump3==1.5.2
 pre-commit==3.0.1
 py-tlsh==4.7.2
diff --git a/src/python/strelka/scanners/scan_transcode.py b/src/python/strelka/scanners/scan_transcode.py
new file mode 100644
index 00000000..4e518415
--- /dev/null
+++ b/src/python/strelka/scanners/scan_transcode.py
@@ -0,0 +1,60 @@
+import io
+import logging
+
+import pillow_avif
+from PIL import Image, UnidentifiedImageError
+from pillow_heif import register_heif_opener
+
+from strelka import strelka
+
+logging.getLogger("PIL").setLevel(logging.WARNING)
+
+# Must be imported as a plugin, doesn't need to be used
+_ = pillow_avif.AvifImagePlugin
+
+register_heif_opener()
+
+# Pixel modes Pillow can write to JPEG without conversion
+JPEG_MODES = ("1", "L", "RGB", "CMYK")
+
+
+class ScanTranscode(strelka.Scanner):
+    """
+    Converts supported images for easier scanning
+
+    The converted image is emitted back to Strelka under the original
+    file name.
+
+    Typical supported output options:
+    gif webp jpeg bmp png tiff
+
+    Options:
+        output_format: Pillow format name to convert to (defaults to "jpeg")
+
+    Flags:
+        transcoded: Image was converted and handed to emit_file
+        unidentified_image: Pillow could not identify the input image
+    """
+
+    def scan(self, data, file, options, expire_at):
+        output_format = options.get("output_format", "jpeg")
+
+        def convert(im):
+            # JPEG cannot store alpha or palette data; Pillow raises OSError
+            # when asked to save such modes (e.g. RGBA, P) directly
+            if output_format.lower() == "jpeg" and im.mode not in JPEG_MODES:
+                im = im.convert("RGB")
+            with io.BytesIO() as f:
+                im.save(f, format=output_format, quality=90)
+                return f.getvalue()
+
+        try:
+            with Image.open(io.BytesIO(data)) as im:
+                converted_image = convert(im)
+        except UnidentifiedImageError:
+            self.flags.append("unidentified_image")
+            return
+
+        # Send extracted file back to Strelka
+        self.emit_file(converted_image, name=file.name)
+        self.flags.append("transcoded")
diff --git a/src/python/strelka/strelka.py b/src/python/strelka/strelka.py
index 08fab9fd..f27df388 100644
--- a/src/python/strelka/strelka.py
+++ b/src/python/strelka/strelka.py
@@ -542,10 +542,10 @@ def match_scanner(
             positives = mapping.get("positive", {})
             neg_flavors = negatives.get("flavors", [])
             neg_filename = negatives.get("filename", None)
-            neg_source = negatives.get("source", None)
+            neg_source = negatives.get("source", [])
             pos_flavors = positives.get("flavors", [])
             pos_filename = positives.get("filename", None)
-            pos_source = positives.get("source", None)
+            pos_source = positives.get("source", [])
             assigned = {
                 "name": scanner,
                 "priority": mapping.get("priority", 5),
@@ -555,22 +555,24 @@ def match_scanner(
             for neg_flavor in neg_flavors:
                 if neg_flavor in itertools.chain(*file.flavors.values()):
                     return {}
-            if neg_filename is not None:
-                if re.search(neg_filename, file.name) is not None:
+            if neg_filename:
+                if re.search(neg_filename, file.name):
                     return {}
-            if neg_source is not None:
-                if re.search(neg_source, file.source) is not None:
+            if neg_source:
+                # Sources are configured as lists of scanner names, so match
+                # by exact membership rather than by regex
+                if file.source in neg_source:
                     return {}
             for pos_flavor in pos_flavors:
                 if (
                     pos_flavor == "*" and not ignore_wildcards
                 ) or pos_flavor in itertools.chain(*file.flavors.values()):
                     return assigned
-            if pos_filename is not None:
-                if re.search(pos_filename, file.name) is not None:
+            if pos_filename:
+                if re.search(pos_filename, file.name):
                     return assigned
-            if pos_source is not None:
-                if re.search(pos_source, file.source) is not None:
+            if pos_source:
+                if file.source in pos_source:
                     return assigned
 
         return {}
@@ -753,28 +755,46 @@ def emit_file(
         self, data: bytes, name: str = "", flavors: Optional[list[str]] = None
     ) -> None:
-        """Re-ingest extracted file"""
+        """Re-ingest extracted file
+
+        Args:
+            data: Bytes of the extracted file.
+            name: Name recorded on the extracted file.
+            flavors: Optional flavors registered under the "external" key.
+
+        Any failure is logged and flagged as "failed_to_emit_file" instead
+        of being raised to the calling scanner.
+        """
+
         with self.tracer.start_as_current_span("emit_file") as current_span:
-            extract_file = File(
-                name=name,
-                source=self.name,
-            )
-            if flavors:
-                extract_file.add_flavors({"external": flavors})
-
-            current_span.set_attribute(f"{__namespace__}.file.name", name)
-            current_span.set_attribute(f"{__namespace__}.file.size", len(data))
-            current_span.set_attribute(f"{__namespace__}.file.source", self.name)
-
-            if self.coordinator:
-                for c in chunk_string(data):
-                    self.upload_to_coordinator(
-                        extract_file.pointer,
-                        c,
-                        self.expire_at,
-                    )
-            else:
-                extract_file.data = data
-            self.files.append(extract_file)
+            try:
+                extract_file = File(
+                    name=name,
+                    source=self.name,
+                )
+                if flavors:
+                    extract_file.add_flavors({"external": flavors})
+
+                current_span.set_attribute(f"{__namespace__}.file.name", name)
+                current_span.set_attribute(f"{__namespace__}.file.size", len(data))
+                current_span.set_attribute(f"{__namespace__}.file.source", self.name)
+
+                if self.coordinator:
+                    for c in chunk_string(data):
+                        self.upload_to_coordinator(
+                            extract_file.pointer,
+                            c,
+                            self.expire_at,
+                        )
+                else:
+                    extract_file.data = data
+
+                # Register the file exactly once, after its data is stored
+                self.files.append(extract_file)
+
+            except Exception:
+                # Best effort: record the failure instead of raising
+                logging.exception("failed to emit file")
+                self.flags.append("failed_to_emit_file")
 
     def upload_to_coordinator(self, pointer, chunk, expire_at) -> None:
         """Uploads data to coordinator.
diff --git a/src/python/strelka/tests/fixtures/test_broken.heic b/src/python/strelka/tests/fixtures/test_broken.heic
new file mode 100644
index 00000000..44d3858b
Binary files /dev/null and b/src/python/strelka/tests/fixtures/test_broken.heic differ
diff --git a/src/python/strelka/tests/fixtures/test_qr.avif b/src/python/strelka/tests/fixtures/test_qr.avif
new file mode 100755
index 00000000..d16147fd
Binary files /dev/null and b/src/python/strelka/tests/fixtures/test_qr.avif differ
diff --git a/src/python/strelka/tests/fixtures/test_qr.heic b/src/python/strelka/tests/fixtures/test_qr.heic
new file mode 100755
index 00000000..2d875f0b
Binary files /dev/null and b/src/python/strelka/tests/fixtures/test_qr.heic differ
diff --git a/src/python/strelka/tests/fixtures/test_qr.heif b/src/python/strelka/tests/fixtures/test_qr.heif
new file mode 100755
index 00000000..80ac9f3f
Binary files /dev/null and b/src/python/strelka/tests/fixtures/test_qr.heif differ
diff --git a/src/python/strelka/tests/test_scan_transcode.py b/src/python/strelka/tests/test_scan_transcode.py
new file mode 100644
index 00000000..77d6466d
--- /dev/null
+++ b/src/python/strelka/tests/test_scan_transcode.py
@@ -0,0 +1,80 @@
+from pathlib import Path
+from unittest import TestCase, mock
+
+import pytest
+from strelka.scanners.scan_transcode import ScanTranscode as ScanUnderTest
+from strelka.tests import run_test_scan
+
+output_formats = ["gif", "webp", "jpeg", "bmp", "png", "tiff"]
+
+
+def _assert_flags(mocker, fixture, expected_flags, **scan_kwargs) -> None:
+    """
+    Scans a fixture and asserts that the scanner event has the expected flags.
+    """
+
+    expected_event = {"elapsed": mock.ANY, "flags": expected_flags}
+
+    actual_event = run_test_scan(
+        mocker=mocker,
+        scan_class=ScanUnderTest,
+        fixture_path=Path(__file__).parent / fixture,
+        **scan_kwargs,
+    )
+
+    TestCase.maxDiff = None
+    TestCase().assertDictEqual(expected_event, actual_event)
+
+
+@pytest.mark.parametrize("output_format", output_formats)
+def test_scan_transcode_avif(mocker, output_format) -> None:
+    """
+    Pass: Scanner event for the AVIF fixture only carries the transcoded flag.
+    Failure: Unable to load file or scanner event fails to match.
+    """
+
+    _assert_flags(
+        mocker,
+        "fixtures/test_qr.avif",
+        ["transcoded"],
+        options={"output_format": output_format},
+    )
+
+
+@pytest.mark.parametrize("output_format", output_formats)
+def test_scan_transcode_heic(mocker, output_format) -> None:
+    """
+    Pass: Scanner event for the HEIC fixture only carries the transcoded flag.
+    Failure: Unable to load file or scanner event fails to match.
+    """
+
+    _assert_flags(
+        mocker,
+        "fixtures/test_qr.heic",
+        ["transcoded"],
+        options={"output_format": output_format},
+    )
+
+
+@pytest.mark.parametrize("output_format", output_formats)
+def test_scan_transcode_heif(mocker, output_format) -> None:
+    """
+    Pass: Scanner event for the HEIF fixture only carries the transcoded flag.
+    Failure: Unable to load file or scanner event fails to match.
+    """
+
+    _assert_flags(
+        mocker,
+        "fixtures/test_qr.heif",
+        ["transcoded"],
+        options={"output_format": output_format},
+    )
+
+
+def test_scan_transcode_broken_heic(mocker) -> None:
+    """
+    Pass: Scanner event for the broken HEIC fixture is flagged unidentified_image.
+    Failure: Unable to load file or scanner event fails to match.
+    """
+
+    _assert_flags(mocker, "fixtures/test_broken.heic", ["unidentified_image"])