Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ScanTranscode - Convert New/Uncommon Image Formats #324

Merged
merged 11 commits into from
Feb 15, 2023
Merged
2 changes: 2 additions & 0 deletions build/python/backend/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ opencv-contrib-python==4.6.0.66
opencv-python==4.6.0.66
openpyxl==3.0.9
pefile==2022.5.30
pillow-avif-plugin==1.3.1
pillow-heif==0.9.3
pgpdump3==1.5.2
pre-commit==3.0.1
py-tlsh==4.7.2
Expand Down
26 changes: 26 additions & 0 deletions configs/python/backend/backend.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ scanners:
flavors:
- 'image/x-ms-bmp'
- 'bmp_file'
negative:
source:
- 'ScanTranscode'
priority: 5
'ScanBzip2':
- positive:
Expand Down Expand Up @@ -196,6 +199,9 @@ scanners:
- 'application/x-shockwave-flash'
- 'fws_file'
- 'image/webp'
- 'image/avif'
- 'image/heic'
- 'image/heif'
priority: 5
options:
keys:
Expand Down Expand Up @@ -313,6 +319,9 @@ scanners:
flavors:
- 'image/jpeg'
- 'jpeg_file'
negative:
source:
- 'ScanTranscode'
priority: 5
'ScanJson':
- positive:
Expand Down Expand Up @@ -350,6 +359,9 @@ scanners:
- 'image/x-ms-bmp'
- 'bmp_file'
- 'image/webp'
negative:
source:
- 'ScanTranscode'
priority: 5
'ScanLzma':
- positive:
Expand Down Expand Up @@ -415,6 +427,9 @@ scanners:
- 'image/x-ms-bmp'
- 'bmp_file'
- 'image/webp'
negative:
source:
- 'ScanTranscode'
priority: 5
'ScanOcr':
- positive:
Expand Down Expand Up @@ -519,6 +534,9 @@ scanners:
flavors:
- 'image/png'
- 'png_file'
negative:
source:
- 'ScanTranscode'
priority: 5
'ScanQr':
- positive:
Expand Down Expand Up @@ -605,6 +623,14 @@ scanners:
- 'application/vnd.ms-tnef'
- 'tnef_file'
priority: 5
'ScanTranscode':
- positive:
flavors:
- 'image/avif'
- 'image/heic'
- 'image/heif'
options:
output_format: jpeg # "gif", "webp", "jpeg", "bmp", "png", "tiff"
'ScanUpx':
- positive:
flavors:
Expand Down
1 change: 1 addition & 0 deletions docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -612,6 +612,7 @@ The table below describes each scanner and its options. Each scanner has the hid
| ScanTar | Extract files from tar archives | `limit` -- maximum number of files to extract (defaults to `1000`) |
| ScanTlsh | Scans and compares a file's TLSH hash with a list of TLSH hashes | "location" -- location of the TLSH rules file or directory (defaults to "/etc/tlsh/")<br>"score" -- Score comparison threshold for matches (lower = closer match) |
| ScanTnef | Collects metadata and extract files from TNEF files | N/A |
| ScanTranscode | Converts uncommon image formats to a more widely supported format to ease support in other scanners | `output_format` -- target image format, one of `gif`, `webp`, `jpeg`, `bmp`, `png`, `tiff` (defaults to `jpeg`) | [Ryan O'Horo](https://github.com/ryanohoro) |
| ScanUpx | Decompresses UPX packed files | `tempfile_directory` -- location where `tempfile` will write temporary files (defaults to `/tmp/`) |
| ScanUrl | Collects URLs from files | `regex` -- dictionary entry that establishes the regular expression pattern used for URL parsing (defaults to a widely scoped regex) |
| ScanVb | Collects metadata from Visual Basic script files | N/A |
Expand Down
2 changes: 2 additions & 0 deletions src/python/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ opencv-contrib-python==4.6.0.66
opencv-python==4.6.0.66
openpyxl==3.0.9
pefile==2022.5.30
pillow-avif-plugin==1.3.1
pillow-heif==0.9.3
pgpdump3==1.5.2
pre-commit==3.0.1
py-tlsh==4.7.2
Expand Down
37 changes: 37 additions & 0 deletions src/python/strelka/scanners/scan_transcode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import io
import logging

import pillow_avif
from PIL import Image, UnidentifiedImageError
from pillow_heif import register_heif_opener

from strelka import strelka

# Raise the "PIL" logger threshold so only WARNING and above are emitted
logging.getLogger("PIL").setLevel(logging.WARNING)

# Must be imported as a plugin, doesn't need to be used
# NOTE(review): the bare attribute reference presumably exists only so the
# import is not reported as unused -- confirm before removing
_ = pillow_avif.AvifImagePlugin

# NOTE(review): presumably registers pillow_heif's HEIF/HEIC opener with
# Pillow so Image.open can read those formats -- confirm against pillow_heif docs
register_heif_opener()


class ScanTranscode(strelka.Scanner):
    """
    Converts supported images to a common format for easier scanning.

    The input is opened with Pillow, re-encoded in the configured output
    format, and emitted back to Strelka under the original file name.

    Options:
        output_format: Pillow format name to encode to (defaults to "jpeg").

    Typical supported output options:
    gif webp jpeg bmp png tiff

    Flags:
        transcoded: The image was converted and emitted.
        unidentified_image: Pillow could not identify the data as an image.
    """

    # Image modes Pillow's JPEG encoder can write directly; any other mode
    # (e.g. RGBA, LA, P) makes Image.save raise "cannot write mode ... as JPEG".
    _JPEG_MODES = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

    def scan(self, data, file, options, expire_at):
        output_format = str(options.get("output_format", "jpeg"))

        def convert(im):
            # JPEG cannot store alpha or palette data, so normalize such
            # images to RGB instead of failing the whole conversion.
            if output_format.lower() == "jpeg" and im.mode not in self._JPEG_MODES:
                im = im.convert("RGB")

            with io.BytesIO() as f:
                im.save(f, format=output_format, quality=90)
                return f.getvalue()

        try:
            converted = convert(Image.open(io.BytesIO(data)))
        except UnidentifiedImageError:
            # Broken, truncated, or non-image data: flag it and emit nothing.
            self.flags.append("unidentified_image")
            return

        # Send extracted file back to Strelka. Deliberately outside the try
        # block so emit_file failures are never swallowed here.
        self.emit_file(converted, name=file.name)

        self.flags.append("transcoded")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know we discussed global exception handling - curious how you want to approach adding exceptions here. My first thought would be something like...

    def scan(self, data, file, options, expire_at):
        output_format = options.get("output_format", "jpeg")

        def convert(im):
            with io.BytesIO() as f:
                try:
                    im.save(f, format=f"{output_format}", quality=90)
                    return f.getvalue()
                except ValueError:
                  self.flags.append(f"{self.__class__.__name__} Exception:  Invalid format or quality.")
                except OSError:
                   self.flags.append(f"{self.__class__.__name__} Exception:  Unsupported format or invalid image file.")
                except AttributeError:
                   self.flags.append(f"{self.__class__.__name__} Exception:  Data is not a bytes-like object.")
                except Exception as e:
                   self.flags.append(f"{self.__class__.__name__} Exception: {str(e)[:50]}")

        # Send extracted file back to Strelka
        try:
            self.emit_file(convert(Image.open(io.BytesIO(data))), name=file.name)
        except Exception as e:
            self.flags.append(f"{self.__class__.__name__} Exception: Failed to emit file")
            return

        self.flags.append("transcoded")

Too much? Too specific to scanner?

Copy link
Collaborator Author

@ryanohoro ryanohoro Feb 15, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I catch UnidentifiedImageError now, which is what's thrown when a broken image is passed to Image.open. That's fair to do if we expect a lot of exceptions from badly formatted or truncated image files. I'd like to add in other specific exceptions as they come up while running Strelka.

Added a test for a broken image.

I'm not keen on a broader catch. If emit_file() itself fails, I think that should be handled inside of emit_file(). I added code to add a flag when emit_file fails, and log the exception. If emit_file fails, it's likely due to a coordinator connectivity problem, and we shouldn't suppress those exceptions.

21 changes: 11 additions & 10 deletions src/python/strelka/strelka.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,10 +447,10 @@ def match_scanner(
positives = mapping.get("positive", {})
neg_flavors = negatives.get("flavors", [])
neg_filename = negatives.get("filename", None)
neg_source = negatives.get("source", None)
neg_source = negatives.get("source", [])
pos_flavors = positives.get("flavors", [])
pos_filename = positives.get("filename", None)
pos_source = positives.get("source", None)
pos_source = positives.get("source", [])
assigned = {
"name": scanner,
"priority": mapping.get("priority", 5),
Expand All @@ -460,22 +460,23 @@ def match_scanner(
for neg_flavor in neg_flavors:
if neg_flavor in itertools.chain(*file.flavors.values()):
return {}
if neg_filename is not None:
if re.search(neg_filename, file.name) is not None:
if neg_filename:
if re.search(neg_filename, file.name):
return {}
if neg_source is not None:
if re.search(neg_source, file.source) is not None:
if neg_source:
print(file.source, neg_source)
if file.source in neg_source:
return {}
for pos_flavor in pos_flavors:
if (
pos_flavor == "*" and not ignore_wildcards
) or pos_flavor in itertools.chain(*file.flavors.values()):
return assigned
if pos_filename is not None:
if re.search(pos_filename, file.name) is not None:
if pos_filename:
if re.search(pos_filename, file.name):
return assigned
if pos_source is not None:
if re.search(pos_source, file.source) is not None:
if pos_source:
if file.source in pos_source:
return assigned

return {}
Expand Down
Binary file added src/python/strelka/tests/fixtures/test_qr.avif
Binary file not shown.
Binary file added src/python/strelka/tests/fixtures/test_qr.heic
Binary file not shown.
Binary file added src/python/strelka/tests/fixtures/test_qr.heif
Binary file not shown.
68 changes: 68 additions & 0 deletions src/python/strelka/tests/test_scan_transcode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
from pathlib import Path
from unittest import TestCase, mock

import pytest
from strelka.scanners.scan_transcode import ScanTranscode as ScanUnderTest
from strelka.tests import run_test_scan

# Output formats exercised by the parametrized transcode tests in this module;
# each value is passed to the scanner as the "output_format" option.
output_formats = ["gif", "webp", "jpeg", "bmp", "png", "tiff"]


@pytest.mark.parametrize("output_format", output_formats)
def test_scan_transcode_avif(mocker, output_format) -> None:
    """
    Transcode the AVIF fixture into the parametrized output format.

    Pass: The scanner event equals the expected event.
    Failure: The fixture cannot be loaded or the scanner event differs.
    """

    expected_event = {"elapsed": mock.ANY, "flags": ["transcoded"]}
    fixture = Path(__file__).parent / "fixtures/test_qr.avif"
    scan_options = {"output_format": output_format}

    actual_event = run_test_scan(
        mocker=mocker,
        scan_class=ScanUnderTest,
        fixture_path=fixture,
        options=scan_options,
    )

    # Lift the diff size limit so a mismatch prints both dicts in full
    TestCase.maxDiff = None
    TestCase().assertDictEqual(expected_event, actual_event)


@pytest.mark.parametrize("output_format", output_formats)
def test_scan_transcode_heic(mocker, output_format) -> None:
    """
    Transcode the HEIC fixture into the parametrized output format.

    Pass: The scanner event equals the expected event.
    Failure: The fixture cannot be loaded or the scanner event differs.
    """

    expected_event = {"elapsed": mock.ANY, "flags": ["transcoded"]}
    fixture = Path(__file__).parent / "fixtures/test_qr.heic"
    scan_options = {"output_format": output_format}

    actual_event = run_test_scan(
        mocker=mocker,
        scan_class=ScanUnderTest,
        fixture_path=fixture,
        options=scan_options,
    )

    # Lift the diff size limit so a mismatch prints both dicts in full
    TestCase.maxDiff = None
    TestCase().assertDictEqual(expected_event, actual_event)


@pytest.mark.parametrize("output_format", output_formats)
def test_scan_transcode_heif(mocker, output_format) -> None:
    """
    Transcode the HEIF fixture into the parametrized output format.

    Pass: The scanner event equals the expected event.
    Failure: The fixture cannot be loaded or the scanner event differs.
    """

    expected_event = {"elapsed": mock.ANY, "flags": ["transcoded"]}
    fixture = Path(__file__).parent / "fixtures/test_qr.heif"
    scan_options = {"output_format": output_format}

    actual_event = run_test_scan(
        mocker=mocker,
        scan_class=ScanUnderTest,
        fixture_path=fixture,
        options=scan_options,
    )

    # Lift the diff size limit so a mismatch prints both dicts in full
    TestCase.maxDiff = None
    TestCase().assertDictEqual(expected_event, actual_event)