Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhancements to Strelka Email Scanner / Adding Broken Email Test #450

Merged
merged 3 commits into from
Apr 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion configs/python/backend/backend.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
version: 2024.02.01.01
version: 2024.04.02.01
logging_cfg: '/etc/strelka/logging.yaml'
limits:
max_files: 5000
Expand Down Expand Up @@ -107,6 +107,7 @@ scanners:
- 'application/vnd.ms-outlook'
- 'message/rfc822'
- 'email_file'
- 'email_file_broad'
priority: 5
options:
create_thumbnail: True
Expand Down
17 changes: 17 additions & 0 deletions configs/python/backend/taste/taste.yara
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,23 @@ rule email_file {
$e in (0..2048)
}

rule email_file_broad
{
meta:
type = "email"
strings:
$ = "Received: "
$ = "Origin-messageId: "
$ = "Return-Path: "
$ = "From: "
$ = "To: "
$ = "Subject: "
$ = "Date: "
condition:
magic.mime_type() == "message/rfc822" or
all of them
}

rule tnef_file {
meta:
description = "Transport Neutral Encapsulation Format"
Expand Down
270 changes: 103 additions & 167 deletions src/python/strelka/scanners/scan_email.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,7 @@ def scan(self, data, file, options, expire_at):
thumbnail_header = options.get("thumbnail_header", False)
thumbnail_size = options.get("thumbnail_size", (500, 500))

# ----------------
# Thumbnail
# ----------------
# Create a thumbnail from the image.
# Stores as a base64 value in the key: base64_thumbnail
# Attempt to create a thumbnail from the email
if create_thumbnail:
try:
image = self.create_email_thumbnail(data, thumbnail_header)
Expand All @@ -67,184 +63,124 @@ def scan(self, data, file, options, expire_at):
self.event["base64_thumbnail"] = base64_image
else:
self.flags.append(
f"{self.__class__.__name__}: image_thumbnail_error: Could not generate thumbnail."
f"{self.__class__.__name__}: image_thumbnail_error: Could not generate thumbnail. No HTML found."
)
except Exception as e:
self.flags.append(
f"{self.__class__.__name__}: image_thumbnail_error: {str(e)[:50]}"
)

# ----------------
# Parse Email Contents
# -------------------
# Parse email contents
try:
# Open and parse email byte string
# If fail to open, return.
try:
ep = eml_parser.EmlParser(
include_attachment_data=True, include_raw_body=True
)
parsed_eml = ep.decode_email_bytes(data)
except strelka.ScannerTimeout:
raise
except Exception as e:
self.flags.append(
f"{self.__class__.__name__}: email_parse_error: {str(e)[:50]}"
)
ep = eml_parser.EmlParser(
include_attachment_data=True, include_raw_body=True
)
parsed_eml = ep.decode_email_bytes(data)

# Check if email was parsed properly and attempt to deconflict and reload.
# If fail to reparse, return.
try:
if not (
parsed_eml["header"]["subject"] and parsed_eml["header"]["header"]
):
if b"\nReceived: from " in data:
data = (
data.rpartition(b"\nReceived: from ")[1]
+ data.rpartition(b"\nReceived: from ")[2]
)[1:]
elif b"Start mail input; end with <CRLF>.<CRLF>\n" in data:
data = data.rpartition(
b"Start mail input; end with <CRLF>.<CRLF>\n"
)[2]
parsed_eml = ep.decode_email_bytes(data)
if not (
parsed_eml["header"]["subject"]
and parsed_eml["header"]["header"]
):
self.flags.append(
f"{self.__class__.__name__}: email_parse_error"
)
return
except strelka.ScannerTimeout:
raise
except Exception as e:
self.flags.append(
f"{self.__class__.__name__}: email_parse_error: {str(e)[:50]}"
)
if not (parsed_eml["header"]["subject"] and parsed_eml["header"]["header"]):
if b"\nReceived: from " in data:
data = (
data.rpartition(b"\nReceived: from ")[1]
+ data.rpartition(b"\nReceived: from ")[2]
)[1:]
elif b"Start mail input; end with <CRLF>.<CRLF>\n" in data:
data = data.rpartition(
b"Start mail input; end with <CRLF>.<CRLF>\n"
)[2]
parsed_eml = ep.decode_email_bytes(data)

# Body
# If body exists in email, collect partial message contents and domains
try:
if "body" in parsed_eml:
for body in parsed_eml["body"]:
if "content_type" in body:
if body["content_type"] == "text/plain":
if len(body["content"]) <= 200:
self.event["body"] = body["content"]
else:
self.event["body"] = (
body["content"][:100]
+ "..."
+ body["content"][-100:]
)
else:
self.event["body"] = (
body["content"][:100] + "..." + body["content"][-100:]
)
if "domain" in body:
if "domain" in self.event:
self.event["domains"] += body["domain"]
# Extract body content and domains
if "body" in parsed_eml:
for body in parsed_eml["body"]:
if "content_type" in body:
if body["content_type"] == "text/plain":
if len(body["content"]) <= 200:
self.event["body"] = body["content"]
else:
self.event["domains"] = body["domain"]
except strelka.ScannerTimeout:
raise
except Exception as e:
self.flags.append(
f"{self.__class__.__name__}: email_parse_body_error: {str(e)[:50]}"
)

# Attachments
# If attachments exist in email, collect attachment details and raw data to be resubmitted to pipeline.
try:
if "attachment" in parsed_eml:
self.event["attachments"] = {}
self.event["attachments"]["filenames"] = []
self.event["attachments"]["hashes"] = []
self.event["attachments"]["totalsize"] = 0
for attachment in parsed_eml["attachment"]:
self.event["attachments"]["filenames"].append(
attachment["filename"]
)
self.event["attachments"]["hashes"].append(
attachment["hash"]["md5"]
)
self.event["attachments"]["totalsize"] += attachment["size"]
attachments.append(
{
"name": attachment["filename"],
"content-type": attachment["content_header"][
"content-type"
][0],
"raw": base64.b64decode(attachment["raw"]),
}
self.event["body"] = (
body["content"][:100]
+ "..."
+ body["content"][-100:]
)
else:
self.event["body"] = (
body["content"][:100] + "..." + body["content"][-100:]
)
except strelka.ScannerTimeout:
raise
except Exception as e:
self.flags.append(
f"{self.__class__.__name__}: email_parse_attachment_error: {str(e)[:50]}"
)
if "domain" in body:
if "domain" in self.event:
self.event["domains"] += body["domain"]
else:
self.event["domains"] = body["domain"]

# Extract attachment details and raw data
if "attachment" in parsed_eml:
self.event["attachments"] = {
"filenames": [],
"hashes": [],
"totalsize": 0,
}
for attachment in parsed_eml["attachment"]:
self.event["attachments"]["filenames"].append(
attachment["filename"]
)
self.event["attachments"]["hashes"].append(
attachment["hash"]["md5"]
)
self.event["attachments"]["totalsize"] += attachment["size"]
attachments.append(
{
"name": attachment["filename"],
"content-type": attachment["content_header"][
"content-type"
][0],
"raw": base64.b64decode(attachment["raw"]),
}
)

# Header
# Collect email header information
try:
self.event["subject"] = parsed_eml["header"]["subject"]
self.event["to"] = parsed_eml["header"]["to"]
self.event["from"] = parsed_eml["header"]["from"]
# Extract email header information
self.event["subject"] = parsed_eml["header"].get("subject", "")
self.event["to"] = parsed_eml["header"].get("to", "")
self.event["from"] = parsed_eml["header"].get("from", "")
date_header = parsed_eml["header"].get("date")
if date_header:
self.event["date_utc"] = (
parsed_eml["header"]["date"].astimezone(pytz.utc).isoformat()[:-6]
+ ".000Z"
date_header.astimezone(pytz.utc).isoformat()[:-6] + ".000Z"
)
self.event["message_id"] = str(
parsed_eml["header"]["header"]["message-id"][0]
.lstrip("<")
.rstrip(">")
)
if "received_domain" in parsed_eml["header"]:
self.event["received_domain"] = parsed_eml["header"][
"received_domain"
]
if "received_ip" in parsed_eml["header"]:
self.event["received_ip"] = parsed_eml["header"]["received_ip"]
except strelka.ScannerTimeout:
raise
except Exception as e:
self.flags.append(
f"{self.__class__.__name__}: email_parse_header_error: {str(e)[:50]}"
)

# If attachments were found, submit back into pipeline
try:
if attachments:
for attachment in attachments:
self.event["total"]["attachments"] += 1

name = attachment["name"]
try:
flavors = [
attachment["content-type"]
.encode("utf-8")
.partition(b";")[0]
]
except Exception as e:
self.flags.append(
f"{self.__class__.__name__}: email_extract_attachment_error: {str(e)[:50]}"
)

# Send extracted file back to Strelka
self.emit_file(attachment["raw"], name=name, flavors=flavors)

self.event["total"]["extracted"] += 1
except strelka.ScannerTimeout:
raise
except Exception as e:
self.flags.append(
f"{self.__class__.__name__}: email_extract_attachment_error: {str(e)[:50]}"
)

except AssertionError:
self.flags.append(f"{self.__class__.__name__}: email_assertion_error")
header = parsed_eml.get("header", {}).get("header", {})
message_id = header.get("message-id", [])[0] if header else None
self.event["message_id"] = (
str(message_id.lstrip("<").rstrip(">")) if message_id else ""
)
self.event["received_domain"] = parsed_eml["header"].get(
"received_domain", []
)
self.event["received_ip"] = parsed_eml["header"].get("received_ip", [])

# Process attachments
if attachments:
for attachment in attachments:
self.event["total"]["attachments"] += 1
name = attachment["name"]
try:
flavors = [
attachment["content-type"]
.encode("utf-8")
.partition(b";")[0]
]
except Exception as e:
self.flags.append(
f"{self.__class__.__name__}: email_extract_attachment_error: {str(e)[:50]}"
)
# Send extracted file back to Strelka
self.emit_file(attachment["raw"], name=name, flavors=flavors)
self.event["total"]["extracted"] += 1

except Exception as e:
self.flags.append(
f"{self.__class__.__name__}: email_parse_error: {str(e)[:50]}"
)

def create_email_thumbnail(self, data, show_header):
"""
Expand Down
33 changes: 33 additions & 0 deletions src/python/strelka/tests/fixtures/test_broken.eml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
Hi Placeholder,

Can I have access?

Thanks,
John


From: Placeholder Smith <[email protected]<mailto:[email protected]>>
Date: Thursday, March 28, 2024 at 1:45 PM
To: "Jane.Doe" <[email protected]>
Subject: Fwd: [EXTERNAL] Folder shared with you: "Strelka Details"

Begin forwarded message:
From: "Placeholder Smith (via Acme Share)" <[email protected]>
Date: March 27, 2024 at 6:47:31 PM EST
To: "Jane.Doe" <[email protected]>
Cc: "John.Doe" <[email protected]>
Subject: [EXTERNAL] Folder shared with you: "Strelka Details"
Reply-To: Placeholder Smith ([email protected]<mailto:[email protected]>

Placeholder shared a folder
Placeholder Smith ([email protected]<mailto:[email protected]>) added you as an editor. Verify your email to securely start contributing to this folder. You will need to verify your email every 7 days.
Hello, attached is the shared folder.
Best,
Placeholder

Open<share.acme.com>

Use is subject to the Google Privacy Policy<https://acme.com>.

ACME LLC, 123 Fake Street, USA
You have received this email because [email protected]<mailto:[email protected]> shared a file or folder located in Acme Share with you. Delete visitor session<https://acme.com>
Loading
Loading