target · phutelmyer · Apr 19, 2024 · Apr 2, 2024 · Apr 2, 2024 · Apr 2, 2024
diff --git a/configs/python/backend/backend.yaml b/configs/python/backend/backend.yaml
@@ -1,4 +1,4 @@
-version: 2024.02.01.01
+version: 2024.04.02.01
 logging_cfg: '/etc/strelka/logging.yaml'
 limits:
   max_files: 5000
@@ -107,6 +107,7 @@ scanners:
           - 'application/vnd.ms-outlook'
           - 'message/rfc822'
           - 'email_file'
+          - 'email_file_broad'
       priority: 5
       options:
         create_thumbnail: True

diff --git a/configs/python/backend/taste/taste.yara b/configs/python/backend/taste/taste.yara
@@ -464,6 +464,23 @@ rule email_file {
         $e in (0..2048)
 }
 
+rule email_file_broad
+{
+    meta:
+        type = "email"
+    strings:
+        $ = "Received: "
+        $ = "Origin-messageId: "
+        $ = "Return-Path: "
+        $ = "From: "
+        $ = "To: "
+        $ = "Subject: "
+        $ = "Date: "
+    condition:
+        magic.mime_type() == "message/rfc822" or
+        all of them
+}
+
 rule tnef_file {
     meta:
         description = "Transport Neutral Encapsulation Format"

diff --git a/src/python/strelka/scanners/scan_email.py b/src/python/strelka/scanners/scan_email.py
@@ -51,11 +51,7 @@ def scan(self, data, file, options, expire_at):
         thumbnail_header = options.get("thumbnail_header", False)
         thumbnail_size = options.get("thumbnail_size", (500, 500))
 
-        # ----------------
-        # Thumbnail
-        # ----------------
-        # Create a thumbnail from the image.
-        # Stores as a base64 value in the key: base64_thumbnail
+        # Attempt to create a thumbnail from the email
         if create_thumbnail:
             try:
                 image = self.create_email_thumbnail(data, thumbnail_header)
@@ -67,184 +63,124 @@ def scan(self, data, file, options, expire_at):
                     self.event["base64_thumbnail"] = base64_image
                 else:
                     self.flags.append(
-                        f"{self.__class__.__name__}: image_thumbnail_error: Could not generate thumbnail."
+                        f"{self.__class__.__name__}: image_thumbnail_error: Could not generate thumbnail. No HTML found."
                     )
             except Exception as e:
                 self.flags.append(
                     f"{self.__class__.__name__}: image_thumbnail_error: {str(e)[:50]}"
                 )
 
-        # ----------------
-        # Parse Email Contents
-        # -------------------
+        # Parse email contents
         try:
             # Open and parse email byte string
-            # If fail to open, return.
-            try:
-                ep = eml_parser.EmlParser(
-                    include_attachment_data=True, include_raw_body=True
-                )
-                parsed_eml = ep.decode_email_bytes(data)
-            except strelka.ScannerTimeout:
-                raise
-            except Exception as e:
-                self.flags.append(
-                    f"{self.__class__.__name__}: email_parse_error: {str(e)[:50]}"
-                )
+            ep = eml_parser.EmlParser(
+                include_attachment_data=True, include_raw_body=True
+            )
+            parsed_eml = ep.decode_email_bytes(data)
 
             # Check if email was parsed properly and attempt to deconflict and reload.
-            # If fail to reparse, return.
-            try:
-                if not (
-                    parsed_eml["header"]["subject"] and parsed_eml["header"]["header"]
-                ):
-                    if b"\nReceived: from " in data:
-                        data = (
-                            data.rpartition(b"\nReceived: from ")[1]
-                            + data.rpartition(b"\nReceived: from ")[2]
-                        )[1:]
-                    elif b"Start mail input; end with <CRLF>.<CRLF>\n" in data:
-                        data = data.rpartition(
-                            b"Start mail input; end with <CRLF>.<CRLF>\n"
-                        )[2]
-                    parsed_eml = ep.decode_email_bytes(data)
-                    if not (
-                        parsed_eml["header"]["subject"]
-                        and parsed_eml["header"]["header"]
-                    ):
-                        self.flags.append(
-                            f"{self.__class__.__name__}: email_parse_error"
-                        )
-                        return
-            except strelka.ScannerTimeout:
-                raise
-            except Exception as e:
-                self.flags.append(
-                    f"{self.__class__.__name__}: email_parse_error: {str(e)[:50]}"
-                )
+            if not (parsed_eml["header"]["subject"] and parsed_eml["header"]["header"]):
+                if b"\nReceived: from " in data:
+                    data = (
+                        data.rpartition(b"\nReceived: from ")[1]
+                        + data.rpartition(b"\nReceived: from ")[2]
+                    )[1:]
+                elif b"Start mail input; end with <CRLF>.<CRLF>\n" in data:
+                    data = data.rpartition(
+                        b"Start mail input; end with <CRLF>.<CRLF>\n"
+                    )[2]
+                parsed_eml = ep.decode_email_bytes(data)
 
-            # Body
-            # If body exists in email, collect partial message contents and domains
-            try:
-                if "body" in parsed_eml:
-                    for body in parsed_eml["body"]:
-                        if "content_type" in body:
-                            if body["content_type"] == "text/plain":
-                                if len(body["content"]) <= 200:
-                                    self.event["body"] = body["content"]
-                                else:
-                                    self.event["body"] = (
-                                        body["content"][:100]
-                                        + "..."
-                                        + body["content"][-100:]
-                                    )
-                        else:
-                            self.event["body"] = (
-                                body["content"][:100] + "..." + body["content"][-100:]
-                            )
-                        if "domain" in body:
-                            if "domain" in self.event:
-                                self.event["domains"] += body["domain"]
+            # Extract body content and domains
+            if "body" in parsed_eml:
+                for body in parsed_eml["body"]:
+                    if "content_type" in body:
+                        if body["content_type"] == "text/plain":
+                            if len(body["content"]) <= 200:
+                                self.event["body"] = body["content"]
                             else:
-                                self.event["domains"] = body["domain"]
-            except strelka.ScannerTimeout:
-                raise
-            except Exception as e:
-                self.flags.append(
-                    f"{self.__class__.__name__}: email_parse_body_error: {str(e)[:50]}"
-                )
-
-            # Attachments
-            # If attachments exist in email, collect attachment details and raw data to be resubmitted to pipeline.
-            try:
-                if "attachment" in parsed_eml:
-                    self.event["attachments"] = {}
-                    self.event["attachments"]["filenames"] = []
-                    self.event["attachments"]["hashes"] = []
-                    self.event["attachments"]["totalsize"] = 0
-                    for attachment in parsed_eml["attachment"]:
-                        self.event["attachments"]["filenames"].append(
-                            attachment["filename"]
-                        )
-                        self.event["attachments"]["hashes"].append(
-                            attachment["hash"]["md5"]
-                        )
-                        self.event["attachments"]["totalsize"] += attachment["size"]
-                        attachments.append(
-                            {
-                                "name": attachment["filename"],
-                                "content-type": attachment["content_header"][
-                                    "content-type"
-                                ][0],
-                                "raw": base64.b64decode(attachment["raw"]),
-                            }
+                                self.event["body"] = (
+                                    body["content"][:100]
+                                    + "..."
+                                    + body["content"][-100:]
+                                )
+                    else:
+                        self.event["body"] = (
+                            body["content"][:100] + "..." + body["content"][-100:]
                         )
-            except strelka.ScannerTimeout:
-                raise
-            except Exception as e:
-                self.flags.append(
-                    f"{self.__class__.__name__}: email_parse_attachment_error: {str(e)[:50]}"
-                )
+                    if "domain" in body:
+                        if "domain" in self.event:
+                            self.event["domains"] += body["domain"]
+                        else:
+                            self.event["domains"] = body["domain"]
+
+            # Extract attachment details and raw data
+            if "attachment" in parsed_eml:
+                self.event["attachments"] = {
+                    "filenames": [],
+                    "hashes": [],
+                    "totalsize": 0,
+                }
+                for attachment in parsed_eml["attachment"]:
+                    self.event["attachments"]["filenames"].append(
+                        attachment["filename"]
+                    )
+                    self.event["attachments"]["hashes"].append(
+                        attachment["hash"]["md5"]
+                    )
+                    self.event["attachments"]["totalsize"] += attachment["size"]
+                    attachments.append(
+                        {
+                            "name": attachment["filename"],
+                            "content-type": attachment["content_header"][
+                                "content-type"
+                            ][0],
+                            "raw": base64.b64decode(attachment["raw"]),
+                        }
+                    )
 
-            # Header
-            # Collect email header information
-            try:
-                self.event["subject"] = parsed_eml["header"]["subject"]
-                self.event["to"] = parsed_eml["header"]["to"]
-                self.event["from"] = parsed_eml["header"]["from"]
+            # Extract email header information
+            self.event["subject"] = parsed_eml["header"].get("subject", "")
+            self.event["to"] = parsed_eml["header"].get("to", "")
+            self.event["from"] = parsed_eml["header"].get("from", "")
+            date_header = parsed_eml["header"].get("date")
+            if date_header:
                 self.event["date_utc"] = (
-                    parsed_eml["header"]["date"].astimezone(pytz.utc).isoformat()[:-6]
-                    + ".000Z"
+                    date_header.astimezone(pytz.utc).isoformat()[:-6] + ".000Z"
                 )
-                self.event["message_id"] = str(
-                    parsed_eml["header"]["header"]["message-id"][0]
-                    .lstrip("<")
-                    .rstrip(">")
-                )
-                if "received_domain" in parsed_eml["header"]:
-                    self.event["received_domain"] = parsed_eml["header"][
-                        "received_domain"
-                    ]
-                if "received_ip" in parsed_eml["header"]:
-                    self.event["received_ip"] = parsed_eml["header"]["received_ip"]
-            except strelka.ScannerTimeout:
-                raise
-            except Exception as e:
-                self.flags.append(
-                    f"{self.__class__.__name__}: email_parse_header_error: {str(e)[:50]}"
-                )
-
-            # If attachments were found, submit back into pipeline
-            try:
-                if attachments:
-                    for attachment in attachments:
-                        self.event["total"]["attachments"] += 1
-
-                        name = attachment["name"]
-                        try:
-                            flavors = [
-                                attachment["content-type"]
-                                .encode("utf-8")
-                                .partition(b";")[0]
-                            ]
-                        except Exception as e:
-                            self.flags.append(
-                                f"{self.__class__.__name__}: email_extract_attachment_error: {str(e)[:50]}"
-                            )
-
-                        # Send extracted file back to Strelka
-                        self.emit_file(attachment["raw"], name=name, flavors=flavors)
-
-                        self.event["total"]["extracted"] += 1
-            except strelka.ScannerTimeout:
-                raise
-            except Exception as e:
-                self.flags.append(
-                    f"{self.__class__.__name__}: email_extract_attachment_error: {str(e)[:50]}"
-                )
-
-        except AssertionError:
-            self.flags.append(f"{self.__class__.__name__}: email_assertion_error")
+            header = parsed_eml.get("header", {}).get("header", {})
+            message_id = header.get("message-id", [])[0] if header else None
+            self.event["message_id"] = (
+                str(message_id.lstrip("<").rstrip(">")) if message_id else ""
+            )
+            self.event["received_domain"] = parsed_eml["header"].get(
+                "received_domain", []
+            )
+            self.event["received_ip"] = parsed_eml["header"].get("received_ip", [])
+
+            # Process attachments
+            if attachments:
+                for attachment in attachments:
+                    self.event["total"]["attachments"] += 1
+                    name = attachment["name"]
+                    try:
+                        flavors = [
+                            attachment["content-type"]
+                            .encode("utf-8")
+                            .partition(b";")[0]
+                        ]
+                    except Exception as e:
+                        self.flags.append(
+                            f"{self.__class__.__name__}: email_extract_attachment_error: {str(e)[:50]}"
+                        )
+                    # Send extracted file back to Strelka
+                    self.emit_file(attachment["raw"], name=name, flavors=flavors)
+                    self.event["total"]["extracted"] += 1
+
+        except Exception as e:
+            self.flags.append(
+                f"{self.__class__.__name__}: email_parse_error: {str(e)[:50]}"
+            )
 
     def create_email_thumbnail(self, data, show_header):
         """

diff --git a/src/python/strelka/tests/fixtures/test_broken.eml b/src/python/strelka/tests/fixtures/test_broken.eml
@@ -0,0 +1,33 @@
+Hi Placeholder,
+
+Can I have access?
+
+Thanks,
+John
+
+
+From: Placeholder Smith  <[email protected]<mailto:[email protected]>>
+Date: Thursday, March 28, 2024 at 1:45 PM
+To: "Jane.Doe" <[email protected]>
+Subject: Fwd: [EXTERNAL] Folder shared with you: "Strelka Details"
+
+Begin forwarded message:
+From: "Placeholder Smith (via Acme Share)" <[email protected]>
+Date: March 27, 2024 at 6:47:31 PM EST
+To: "Jane.Doe" <[email protected]>
+Cc: "John.Doe" <[email protected]>
+Subject: [EXTERNAL] Folder shared with you: "Strelka Details"
+Reply-To: Placeholder Smith ([email protected]<mailto:[email protected]>
+
+Placeholder shared a folder
+Placeholder Smith ([email protected]<mailto:[email protected]>) added you as an editor. Verify your email to securely start contributing to this folder. You will need to verify your email every 7 days.
+Hello, attached is the shared folder.
+Best,
+Placeholder
+
+Open<share.acme.com>
+
+Use is subject to the Google Privacy Policy<https://acme.com>.
+
+ACME LLC, 123 Fake Street, USA
+You have received this email because [email protected]<mailto:[email protected]> shared a file or folder located in Acme Share with you. Delete visitor session<https://acme.com>