Skip to content

Commit

Permalink
Merge pull request #324 from ryanohoro/transcode
Browse files Browse the repository at this point in the history
ScanTranscode - Convert New/Uncommon Image Formats
  • Loading branch information
phutelmyer authored Feb 15, 2023
2 parents f334811 + 5297706 commit f5a85c4
Show file tree
Hide file tree
Showing 11 changed files with 210 additions and 31 deletions.
2 changes: 2 additions & 0 deletions build/python/backend/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ opentelemetry-exporter-otlp-proto-grpc==1.15.0
opentelemetry-exporter-otlp-proto-http==1.15.0
opentelemetry-sdk==1.15.0
pefile==2022.5.30
pillow-avif-plugin==1.3.1
pillow-heif==0.9.3
pgpdump3==1.5.2
pre-commit==3.0.1
py-tlsh==4.7.2
Expand Down
26 changes: 26 additions & 0 deletions configs/python/backend/backend.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ scanners:
flavors:
- 'image/x-ms-bmp'
- 'bmp_file'
negative:
source:
- 'ScanTranscode'
priority: 5
'ScanBzip2':
- positive:
Expand Down Expand Up @@ -212,6 +215,9 @@ scanners:
- 'application/x-shockwave-flash'
- 'fws_file'
- 'image/webp'
- 'image/avif'
- 'image/heic'
- 'image/heif'
priority: 5
options:
keys:
Expand Down Expand Up @@ -329,6 +335,9 @@ scanners:
flavors:
- 'image/jpeg'
- 'jpeg_file'
negative:
source:
- 'ScanTranscode'
priority: 5
'ScanJson':
- positive:
Expand Down Expand Up @@ -366,6 +375,9 @@ scanners:
- 'image/x-ms-bmp'
- 'bmp_file'
- 'image/webp'
negative:
source:
- 'ScanTranscode'
priority: 5
'ScanLzma':
- positive:
Expand Down Expand Up @@ -431,6 +443,9 @@ scanners:
- 'image/x-ms-bmp'
- 'bmp_file'
- 'image/webp'
negative:
source:
- 'ScanTranscode'
priority: 5
'ScanOcr':
- positive:
Expand Down Expand Up @@ -535,6 +550,9 @@ scanners:
flavors:
- 'image/png'
- 'png_file'
negative:
source:
- 'ScanTranscode'
priority: 5
'ScanQr':
- positive:
Expand Down Expand Up @@ -621,6 +639,14 @@ scanners:
- 'application/vnd.ms-tnef'
- 'tnef_file'
priority: 5
'ScanTranscode':
- positive:
flavors:
- 'image/avif'
- 'image/heic'
- 'image/heif'
options:
output_format: jpeg # "gif", "webp", "jpeg", "bmp", "png", "tiff"
'ScanUpx':
- positive:
flavors:
Expand Down
1 change: 1 addition & 0 deletions docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -652,6 +652,7 @@ The table below describes each scanner and its options. Each scanner has the hid
| ScanTar | Extract files from tar archives | `limit` -- maximum number of files to extract (defaults to `1000`) |
| ScanTlsh | Scans and compares a file's TLSH hash with a list of TLSH hashes | "location" -- location of the TLSH rules file or directory (defaults to "/etc/tlsh/")<br>"score" -- Score comparison threshold for matches (lower = closer match) |
| ScanTnef | Collects metadata and extract files from TNEF files | N/A |
| ScanTranscode | Converts uncommon image formats (AVIF, HEIC, HEIF) to a common image format to ease support in other scanners | `output_format` -- output image format, one of `gif`, `webp`, `jpeg`, `bmp`, `png`, `tiff` (defaults to `jpeg`) | [Ryan O'Horo](https://github.com/ryanohoro) |
| ScanUpx | Decompresses UPX packed files | `tempfile_directory` -- location where `tempfile` will write temporary files (defaults to `/tmp/`) |
| ScanUrl | Collects URLs from files | `regex` -- dictionary entry that establishes the regular expression pattern used for URL parsing (defaults to a widely scoped regex) |
| ScanVb | Collects metadata from Visual Basic script files | N/A |
Expand Down
2 changes: 2 additions & 0 deletions src/python/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ opentelemetry-exporter-otlp-proto-grpc==1.15.0
opentelemetry-exporter-otlp-proto-http==1.15.0
opentelemetry-sdk==1.15.0
pefile==2022.5.30
pillow-avif-plugin==1.3.1
pillow-heif==0.9.3
pgpdump3==1.5.2
pre-commit==3.0.1
py-tlsh==4.7.2
Expand Down
43 changes: 43 additions & 0 deletions src/python/strelka/scanners/scan_transcode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import io
import logging

import pillow_avif
from PIL import Image, UnidentifiedImageError
from pillow_heif import register_heif_opener

from strelka import strelka

# Raise the threshold of the "PIL" logger so only WARNING and above are emitted.
logging.getLogger("PIL").setLevel(logging.WARNING)

# Must be imported as a plugin, doesn't need to be used.
# The throwaway assignment references the module so the import is not unused.
_ = pillow_avif.AvifImagePlugin

# Runs once at import time.
# NOTE(review): presumably registers HEIC/HEIF decoding with Pillow so that
# Image.open() can read those formats -- confirm against pillow_heif docs.
register_heif_opener()


class ScanTranscode(strelka.Scanner):
    """
    Converts supported images to a more common format for easier scanning.

    The re-encoded image is emitted back to Strelka under the original
    file name.

    Options:
        output_format: Pillow format name used for the re-encoded image
            (defaults to "jpeg").
            Typical supported output options:
            gif webp jpeg bmp png tiff

    Flags:
        transcoded: The image was re-encoded and emitted.
        unidentified_image: Pillow could not identify the input as an image;
            nothing is emitted.
    """

    # Image modes Pillow's JPEG encoder can write directly. Any other mode
    # (e.g. "RGBA", "LA", "P") must be converted first or save() raises
    # "OSError: cannot write mode ... as JPEG".
    _JPEG_MODES = frozenset({"1", "L", "RGB", "RGBX", "CMYK", "YCbCr"})

    def scan(self, data, file, options, expire_at):
        output_format = str(options.get("output_format", "jpeg"))

        try:
            # The context manager releases the decoder's resources even when
            # decoding or encoding fails.
            with Image.open(io.BytesIO(data)) as source_image:
                image = source_image
                if (
                    output_format.lower() == "jpeg"
                    and source_image.mode not in self._JPEG_MODES
                ):
                    # JPEG has no alpha channel or palette support; flatten
                    # to RGB so images with transparency can still be saved.
                    image = source_image.convert("RGB")

                with io.BytesIO() as buffer:
                    # Encoders that have no "quality" setting ignore it.
                    image.save(buffer, format=output_format, quality=90)
                    converted_image = buffer.getvalue()
        except UnidentifiedImageError:
            self.flags.append("unidentified_image")
            return

        # Send extracted file back to Strelka
        self.emit_file(converted_image, name=file.name)

        self.flags.append("transcoded")
81 changes: 50 additions & 31 deletions src/python/strelka/strelka.py
Original file line number Diff line number Diff line change
Expand Up @@ -542,10 +542,10 @@ def match_scanner(
positives = mapping.get("positive", {})
neg_flavors = negatives.get("flavors", [])
neg_filename = negatives.get("filename", None)
neg_source = negatives.get("source", None)
neg_source = negatives.get("source", [])
pos_flavors = positives.get("flavors", [])
pos_filename = positives.get("filename", None)
pos_source = positives.get("source", None)
pos_source = positives.get("source", [])
assigned = {
"name": scanner,
"priority": mapping.get("priority", 5),
Expand All @@ -555,22 +555,23 @@ def match_scanner(
for neg_flavor in neg_flavors:
if neg_flavor in itertools.chain(*file.flavors.values()):
return {}
if neg_filename is not None:
if re.search(neg_filename, file.name) is not None:
if neg_filename:
if re.search(neg_filename, file.name):
return {}
if neg_source is not None:
if re.search(neg_source, file.source) is not None:
if neg_source:
print(file.source, neg_source)
if file.source in neg_source:
return {}
for pos_flavor in pos_flavors:
if (
pos_flavor == "*" and not ignore_wildcards
) or pos_flavor in itertools.chain(*file.flavors.values()):
return assigned
if pos_filename is not None:
if re.search(pos_filename, file.name) is not None:
if pos_filename:
if re.search(pos_filename, file.name):
return assigned
if pos_source is not None:
if re.search(pos_source, file.source) is not None:
if pos_source:
if file.source in pos_source:
return assigned

return {}
Expand Down Expand Up @@ -753,28 +754,46 @@ def emit_file(
self, data: bytes, name: str = "", flavors: Optional[list[str]] = None
) -> None:
"""Re-ingest extracted file"""

with self.tracer.start_as_current_span("emit_file") as current_span:
extract_file = File(
name=name,
source=self.name,
)
if flavors:
extract_file.add_flavors({"external": flavors})

current_span.set_attribute(f"{__namespace__}.file.name", name)
current_span.set_attribute(f"{__namespace__}.file.size", len(data))
current_span.set_attribute(f"{__namespace__}.file.source", self.name)

if self.coordinator:
for c in chunk_string(data):
self.upload_to_coordinator(
extract_file.pointer,
c,
self.expire_at,
)
else:
extract_file.data = data
self.files.append(extract_file)
try:
extract_file = File(
name=name,
source=self.name,
)
if flavors:
extract_file.add_flavors({"external": flavors})

current_span.set_attribute(f"{__namespace__}.file.name", name)
current_span.set_attribute(f"{__namespace__}.file.size", len(data))
current_span.set_attribute(f"{__namespace__}.file.source", self.name)

if self.coordinator:
for c in chunk_string(data):
self.upload_to_coordinator(
extract_file.pointer,
c,
self.expire_at,
)
else:
extract_file.data = data

self.files.append(extract_file)

if self.coordinator:
for c in chunk_string(data):
self.upload_to_coordinator(
extract_file.pointer,
c,
self.expire_at,
)
else:
extract_file.data = data
self.files.append(extract_file)

except Exception:
logging.exception("failed to emit file")
self.flags.append("failed_to_emit_file")

def upload_to_coordinator(self, pointer, chunk, expire_at) -> None:
"""Uploads data to coordinator.
Expand Down
Binary file not shown.
Binary file added src/python/strelka/tests/fixtures/test_qr.avif
Binary file not shown.
Binary file added src/python/strelka/tests/fixtures/test_qr.heic
Binary file not shown.
Binary file added src/python/strelka/tests/fixtures/test_qr.heif
Binary file not shown.
86 changes: 86 additions & 0 deletions src/python/strelka/tests/test_scan_transcode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
from pathlib import Path
from unittest import TestCase, mock

import pytest
from strelka.scanners.scan_transcode import ScanTranscode as ScanUnderTest
from strelka.tests import run_test_scan

# Output formats each parametrized transcode test below is run against.
output_formats = ["gif", "webp", "jpeg", "bmp", "png", "tiff"]


@pytest.mark.parametrize("output_format", output_formats)
def test_scan_transcode_avif(mocker, output_format) -> None:
    """
    Transcode the AVIF fixture to the parametrized output format.

    Pass: Scanner event equals the expected event (only the "transcoded" flag).
    Failure: Unable to load file or the scanner event differs.
    """

    fixture = Path(__file__).parent / "fixtures/test_qr.avif"
    expected_event = {"elapsed": mock.ANY, "flags": ["transcoded"]}

    actual_event = run_test_scan(
        mocker=mocker,
        scan_class=ScanUnderTest,
        fixture_path=fixture,
        options={"output_format": output_format},
    )

    # Show the full diff when the dictionaries do not match.
    TestCase.maxDiff = None
    TestCase().assertDictEqual(expected_event, actual_event)


@pytest.mark.parametrize("output_format", output_formats)
def test_scan_transcode_heic(mocker, output_format) -> None:
    """
    Transcode the HEIC fixture to the parametrized output format.

    Pass: Scanner event equals the expected event (only the "transcoded" flag).
    Failure: Unable to load file or the scanner event differs.
    """

    fixture = Path(__file__).parent / "fixtures/test_qr.heic"
    expected_event = {"elapsed": mock.ANY, "flags": ["transcoded"]}

    actual_event = run_test_scan(
        mocker=mocker,
        scan_class=ScanUnderTest,
        fixture_path=fixture,
        options={"output_format": output_format},
    )

    # Show the full diff when the dictionaries do not match.
    TestCase.maxDiff = None
    TestCase().assertDictEqual(expected_event, actual_event)


@pytest.mark.parametrize("output_format", output_formats)
def test_scan_transcode_heif(mocker, output_format) -> None:
    """
    Transcode the HEIF fixture to the parametrized output format.

    Pass: Scanner event equals the expected event (only the "transcoded" flag).
    Failure: Unable to load file or the scanner event differs.
    """

    fixture = Path(__file__).parent / "fixtures/test_qr.heif"
    expected_event = {"elapsed": mock.ANY, "flags": ["transcoded"]}

    actual_event = run_test_scan(
        mocker=mocker,
        scan_class=ScanUnderTest,
        fixture_path=fixture,
        options={"output_format": output_format},
    )

    # Show the full diff when the dictionaries do not match.
    TestCase.maxDiff = None
    TestCase().assertDictEqual(expected_event, actual_event)


def test_scan_transcode_broken_heic(mocker) -> None:
    """
    Scan the broken HEIC fixture with default options.

    Pass: Scanner event equals the expected event (only the
        "unidentified_image" flag).
    Failure: Unable to load file or the scanner event differs.
    """

    fixture = Path(__file__).parent / "fixtures/test_broken.heic"
    expected_event = {"elapsed": mock.ANY, "flags": ["unidentified_image"]}

    actual_event = run_test_scan(
        mocker=mocker,
        scan_class=ScanUnderTest,
        fixture_path=fixture,
    )

    # Show the full diff when the dictionaries do not match.
    TestCase.maxDiff = None
    TestCase().assertDictEqual(expected_event, actual_event)

0 comments on commit f5a85c4

Please sign in to comment.