From 08bdd9d4ad7072e26fd64b9e57e03eb02bc1d490 Mon Sep 17 00:00:00 2001
From: Paul Hutelmyer <paul.hutelmyer@gmail.com>
Date: Tue, 2 Jan 2024 15:09:29 -0500
Subject: [PATCH 1/9] Adding Thumbnail Functionality

---
 configs/python/backend/backend.yaml       |  8 ++-
 src/python/strelka/scanners/scan_ocr.py   | 76 +++++++++++++++------
 src/python/strelka/tests/test_scan_ocr.py | 83 +++++++++++++++++++++++
 3 files changed, 142 insertions(+), 25 deletions(-)

diff --git a/configs/python/backend/backend.yaml b/configs/python/backend/backend.yaml
index 018f1f5b..7d2919ac 100644
--- a/configs/python/backend/backend.yaml
+++ b/configs/python/backend/backend.yaml
@@ -1,4 +1,4 @@
-version: 2023-07-07-update
+version: 2024.02.01.01
 logging_cfg: '/etc/strelka/logging.yaml'
 limits:
   max_files: 5000
@@ -380,9 +380,11 @@ scanners:
       priority: 5
       options:
         extract_text: False
-        split_words: True
-        tmp_directory: '/dev/shm/'
+        tmp_directory: "/dev/shm/"
         pdf_to_png: True
+        remove_formatting: True
+        create_thumbnail: True
+        thumbnail_size: [250, 250]
   'ScanOle':
     - positive:
         flavors:
diff --git a/src/python/strelka/scanners/scan_ocr.py b/src/python/strelka/scanners/scan_ocr.py
index e55ed9da..08ad8c61 100644
--- a/src/python/strelka/scanners/scan_ocr.py
+++ b/src/python/strelka/scanners/scan_ocr.py
@@ -1,33 +1,65 @@
 import os
 import subprocess
 import tempfile
-
+import io
 import fitz
+import base64
+from PIL import Image
 
 from strelka import strelka
 
 
 class ScanOcr(strelka.Scanner):
-    """Collects metadata and extracts optical text from image files.
+    """Extracts optical text from image files and creates a thumbnail.
+
+    This scanner extracts text from image files using OCR (Optical Character Recognition) and
+    can optionally generate a base64-encoded thumbnail. It supports direct image files and
+    converting the first page of a PDF to a PNG image for OCR.
 
     Options:
-        extract_text: Boolean that determines if optical text should be
-            extracted as a child file.
-            Defaults to False.
-        tmp_directory: Location where tempfile writes temporary files.
-            Defaults to '/tmp/'.
+        extract_text: If True, extracted text is emitted as a child file. (default: False)
+        split_words: If True, splits the OCR text into words and stores an array. (default: True)
+        remove_formatting: If True, strips carriage returns, newlines, and form feeds. Ignored when split_words is True. (default: True)
+        tmp_directory: Directory for temporary files. (default: '/tmp/')
+        pdf_to_png: If True, converts the first page of a PDF to PNG for OCR. (default: False)
+        create_thumbnail: If True, stores a base64-encoded WEBP thumbnail in the event. (default: False)
+        thumbnail_size: Size passed to Image.thumbnail when creating the thumbnail. (default: (250, 250))
     """
 
     def scan(self, data, file, options, expire_at):
         extract_text = options.get("extract_text", False)
         split_words = options.get("split_words", True)
+        remove_formatting = options.get("remove_formatting", True)
         tmp_directory = options.get("tmp_directory", "/tmp/")
         pdf_to_png = options.get("pdf_to_png", False)
+        create_thumbnail = options.get("create_thumbnail", False)
+        thumbnail_size = options.get("thumbnail_size", (250, 250))
 
+        # Convert PDF to PNG if required.
         if pdf_to_png and "application/pdf" in file.flavors.get("mime", []):
-            doc = fitz.open(stream=data, filetype="pdf")
-            data = doc.get_page_pixmap(0).tobytes("png")
-
+            try:
+                doc = fitz.open(stream=data, filetype="pdf")
+                data = doc.get_page_pixmap(0).tobytes("png")
+            except Exception as e:
+                self.flags.append(
+                    f"{self.__class__.__name__}: image_pdf_error: {str(e)[:50]}"
+                )
+
+        # Create a thumbnail from the image.
+        # Stores as a base64 value in the key: base64_thumbnail
+        if create_thumbnail:
+            try:
+                image = Image.open(io.BytesIO(data))
+                image.thumbnail(thumbnail_size, Image.Resampling.BILINEAR)
+                buffered = io.BytesIO()
+                image.save(buffered, format="WEBP", quality=70, optimize=True)
+                base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
+                self.event["base64_thumbnail"] = base64_image
+            except Exception as e:
+                self.flags.append(
+                    f"{self.__class__.__name__}: image_thumbnail_error: {str(e)[:50]}"
+                )
+        # Perform OCR on the image data.
         with tempfile.NamedTemporaryFile(dir=tmp_directory) as tmp_data:
             tmp_data.write(data)
             tmp_data.flush()
@@ -35,28 +67,26 @@ def scan(self, data, file, options, expire_at):
             with tempfile.NamedTemporaryFile(dir=tmp_directory) as tmp_tess:
                 try:
                     tess_txt_name = f"{tmp_tess.name}.txt"
-
-                    completed_process = subprocess.run(
+                    subprocess.run(
                         ["tesseract", tmp_data.name, tmp_tess.name],
                         capture_output=True,
                         check=True,
                     )
 
-                    _ = completed_process
-
                     with open(tess_txt_name, "rb") as tess_txt:
                         ocr_file = tess_txt.read()
-
                         if ocr_file:
                             if split_words:
                                 self.event["text"] = ocr_file.split()
                             else:
-                                self.event["text"] = (
-                                    ocr_file.replace(b"\r", b"")
-                                    .replace(b"\n", b"")
-                                    .replace(b"\f", b"")
-                                )
-
+                                if remove_formatting:
+                                    self.event["text"] = (
+                                        ocr_file.replace(b"\r", b"")
+                                        .replace(b"\n", b"")
+                                        .replace(b"\f", b"")
+                                    )
+                                else:
+                                    self.event["text"] = ocr_file
                             if extract_text:
                                 # Send extracted file back to Strelka
                                 self.emit_file(ocr_file, name="text")
@@ -64,5 +94,7 @@ def scan(self, data, file, options, expire_at):
                     os.remove(tess_txt_name)
 
                 except subprocess.CalledProcessError as e:
-                    self.flags.append("tesseract_process_error")
+                    self.flags.append(
+                        f"{self.__class__.__name__}: tesseract_process_error: {str(e)[:50]}"
+                    )
                     raise strelka.ScannerException(e.stderr)
diff --git a/src/python/strelka/tests/test_scan_ocr.py b/src/python/strelka/tests/test_scan_ocr.py
index 57b16fe6..b0bc3f2a 100644
--- a/src/python/strelka/tests/test_scan_ocr.py
+++ b/src/python/strelka/tests/test_scan_ocr.py
@@ -1,3 +1,4 @@
+import difflib
 from pathlib import Path
 from unittest import TestCase, mock
 
@@ -420,3 +421,85 @@ def test_scan_ocr_webp(mocker):
 
     TestCase.maxDiff = None
     TestCase().assertDictEqual(test_scan_event, scanner_event)
+
+
+def test_scan_ocr_keep_formatting(mocker):
+    """
+    Pass: OCR text, with formatting characters kept, is more than 99% similar to the sample text.
+    Failure: Unable to load file or the OCR text differs too much from the sample text.
+    """
+
+    test_scan_event = {
+        "elapsed": mock.ANY,
+        "flags": [],
+        "text": b"Lorem Ipsum\n\nLorem ipsum dolor sit amet, consectetur adipiscing elit. Cras lobortis sem dui. "
+        b"Morbi at magna quis ligula faucibus\nconsectetur feugiat at purus. Sed nec lorem nibh. Nam vel "
+        b"libero odio. Vivamus tempus non enim egestas pretium.\nVestibulum turpis arcu, maximus nec libero "
+        b"quis, imperdiet suscipit purus. Vestibulum blandit quis lacus non\nsollicitudin. Nullam non "
+        b"convallis dui, et aliquet risus. Sed accumsan ullamcorper vehicula. Proin non urna facilisis,"
+        b"\ncondimentum eros quis, suscipit purus. Morbi euismod imperdiet neque fermentum dictum. Integer "
+        b"aliquam, erat sit\namet fringilla tempus, mauris ligula blandit sapien, et varius sem mauris eu "
+        b"diam. Sed fringilla neque est, in laoreet\nfelis tristique in. Donec luctus velit a posuere "
+        b"posuere. Suspendisse sodales pellentesque quam.\n",
+    }
+
+    scanner_event = run_test_scan(
+        mocker=mocker,
+        scan_class=ScanUnderTest,
+        fixture_path=Path(__file__).parent / "fixtures/test_text.webp",
+        options=({"split_words": False, "remove_formatting": False}),
+    )
+
+    TestCase.maxDiff = None
+
+    # Output string formatting may result in slightly different results.
+    # Requiring a similarity ratio above 0.99 is good enough.
+    similarity = difflib.SequenceMatcher(
+        None, test_scan_event["text"], scanner_event["text"]
+    ).ratio()
+    assert similarity > 0.99
+
+
+def test_scan_ocr_thumbnail(mocker):
+    """
+    Pass: Sample event matches output of scanner.
+    Failure: Unable to load file or sample event fails to match.
+    """
+
+    test_scan_event = {
+        "elapsed": mock.ANY,
+        "flags": [],
+        "base64_thumbnail": "UklGRpoWAABXRUJQVlA4II4WAAAQUwCdASr6AJ8APp1EnEolo6KhqhUr6LATiWlu3WBpKP1U/6LwT/Ivon9X/f+IR2D5j/aH+h/iPObvj+IH+P6gvs3/Pel39D2n2wf5L0CPZ77T5kv1fmj9mfYE/Mz1m/8Pha/gv+h7An6e9I3QS+4eoT5dHsv9DlBZKO/3p+CYGIsxxoyLSfFJnk9qJs7xKOZiJEUM4voTPRt7q24KjWPzjP9LJSsOJXXHFklGiUlVhswwkwckKAJmkikmoUTlFySc2DYEAqA7oS65VGhxIchghhf/BGmBkrpssZjVSyMRC+nii9V5sve+Bg3cATPHNMkM/w6CnGg9glAqCdZjjqy0zamZLK642ZKCBc69CE4x1Cf0sCPXNLTS3mMTyUHftv4G9G1eDUlyTvICoabo0wKuqre97OFv755jJO70CvHC+HFkrKT9+V7VLhJab1LO1GaOub6hK1/oDTgqmRB2m6NL1kkSpOp43ZtJL7c0XJtRQr8E7m1N7zN2Gx1r1gDI/T9AyjPLGYj1eAiUzJZXW+HeQea7qMc9KneNWye5XTJmtxSRfzUUlqLLgRhoofNYvT/Kkt+VUVA5mV99OdE5R9JPWWzYIDOxV3fcS4D8faN3wisidC/76Cm4+9rP3jtmS+oz75VdxfzkRAE4vFcn56lQgWbtpI+lSKQY+LcRr9kAx1cgK5hvmF8k69iQaRIRUVKrczSS7kesuYQqO2cdcjlYdBpHhG5zzTioUyOCHCT2fOlXTZULVXUoDD9B0uTrYUMqVDJQP5II/eUTKByPmUWp/ZYL89ARn5nnFHJzPMCactNH/UsKcdzPArdZDEHVIjdngV1Rwk5Z0zTuGZEDdkMJAnD+CzqzRgw+hKRxiuvNOOXBZuItulW6ldy7uc0nVDkgAP77IeznaeNmVR+M7qCRlQfd4Czy8iTZqCCTFppPckMktMofwJIK4WofztLf4ehG5ucxbND1FiaHERAsW+P1E16YE3pEEnVyy78VQxYWA1inV6C3WmStm8YLlee80nnTNQ7tD2tqVS4awu/S0zeHTMW0Dv6PpxaD+cR88QpFPvhn4zgXFNfM+4IX5dctNjnwvhtvfi+frNNdGBmoJMTfZwuhXEBiDNlM93UFIhueTIJKO7noJ/1VcSd3vI+E+x5obCs1Y7K+2+3cOx+3/4Ju9nh6Q8urNc99XNm3aDWzOPufkp8QdGXjlcr/QApKXWpAtdC3w8VkNwZ92UeLaybH/y6g+8XfgfagTDqk59fDRbUIFj6tBelaEqV5l9zRe9hX940VJn/8PgVTjJIaeTdUQTdAH/tHf/bvvQVx18qvzyb3zTHCTa5X6FfTnwNhMkQ3O9XqOHgE8d/xOfitRHWsXccr/Dno2Ts+rN/X+aJ1HkXmlEVd/DMnmPfbkY79cx1vC37nk4btceed5M6AKcC96Otdr0M48tOpXRNJhpZrSrq8J2jhf5pJYNrD8+kfvl7mspxkUCOeFMMuCxJqEAgFu+vUvqVz97Nlr57p++IhmmBI32bdPpLP1Izg17IhWrqDf1EQp0kk3KFmLEs6BncDDZUWPwzcj0/aetD1tq4d5LFubRTi7b63w7J5vmAe5JSgwtn7fJ3TZ1/VgFoX3LMEgHmBpYjEjBsToibxoP7vDMCXf/a/RZzpA/jjT/GHomhs0DsPs63ba+Cb2blZ3qHrA6g3EKf7Lhapyk9LMj8YWb8z3P/L407laboZlj3BJdqaLQvX32KzVnFeYzcdutStRWHDYpeBO0iL2jju48wvxYGsP8Rv99bbSohwDXg/5Nf/zc50KuZPsIHpzEiwUeZqnq11XgH4nUpC3q+c/P05XSyf5Sy7GGxH5WI/s/PnLAA/hLwJsZnfCVMVB0nrN1CareL3oBX8MI5pacIVee1Nc3Fu6UaN/kqnjM7rsjOA7Yq/8RggDoUATJNtPioDeb0akR
WNg/ZNQMtjrT6H/2+DjM+5w/aU+fT10annXH0pOlydUVODcN8dbUoJDTapuGPMYvoR1eMsetqJ933/Wqwfqvep8lFAeTKCJeK5gsum8WnXZ31Pw1lwPDGAhcvdkQleFV+WorMqp527ZXA8hxjBH8PyvmIwZZzu6iNzeEZULuk0YA4i+P1OYsztDn/Ait6d7VU3p/olPEMgq/kzV1K0eiyVKTUqHKL0D26nkS6uOLPgPeYbEGFYz4a3CEyDvj56zTLl3qtH49rZErA1rte9D5c18Bp09AC+vXOZVc2b28TIn4c8sI8kfHduHC7uhFlTqsJNMm54DOZctMWFD8EAFPJn+CXxU8OY72Q0Lo+LR6QBGqM+6eGdpM6OkCbG200mpIG1DSnCf7LMbB0LvfJyN6jzAQGZZq8klyEzmZIoCg8QlpCtoTJQ+AIqFWl9z4ywir7j1eaE69Ac68vNxMkj5pNtORvzKEFxIICioxjKBLO82BuwAy/4y3NkP/kvq7O0B6NrO/KtZXcqSwuhvgQoyIdKBpUcaZFbUWEMeHUelwLukHWTyZJGl4qHH1R7ngxSiyHI4u6Fjm+AC6yw6TlOrwbwbPtHElveg8aCXiTUVxNB1wRsvP4xpHeG4atg6flIeX4sgjtS1Ow4z96jJWi2HFjH6n/SVI/utmZZT+pDmQhSwd4ki7+L2q47FT/mB3ae13xPZbX/PRrg0oZG3Fx6RS8ZuohpFqX33DcSPE12Jtq9AA1CgCTh5cURyE0/yc2XI04tsdbsFxq4/jnn5u3NuXGgZWCU1JEiIQr446AAyRgNrBEVC1oiYPcxe1wnd17M1Z60V/RUd5/h0UKUOufwn3LSQLq0JvYr0Z4hY6fC1rZU+BmbhiW7P7X2AE0Rq6WHYc5baeeesKS6HJawuV9rBIZu7WLeNmvLnQhPF5UAKOaVq7W19l/gOq7nPxvhgOYNHzwq/53cTre0P7ScopX20PgD9tt/tL0oErJyGRVoBKoRuFLJ+c1Qk/6pEZY0GREKAFwUf+/MQvJjI4JQNh5qctnk0hBPwrMLxJ7E8wetG4kRrEApm6V58S4UJT9IbmrMe6FKBlzPZ+4g4zaf+ajS+/c3nQ3brRh+4e6cZif4Y5uIYZC8MYCBtJ2DD8FiC95HAHzQaAA+sxNeOA8u/0UqKqmCLo1AyxU9xIzZlMJYT3dH8OgFaZ+n4TOXKBzx7ddS2p8yT400Pg6Wdt81SbWf13/U6ra+4TxEl/6Ep5+q9kc/xHvGeqb0O8g5iEnLfU/08FF3nfkkgtOHKhhwmT1+RcWd3sMbPLoBxh8n0J/Q4NWMkgc//XzOtcbISz7a+dN328J2kLyfeK0662DDECirQdQMb8Sz+gsauD5e1cDcFgwcM86gOtURB9gb8v/JnNjsJaqFIizZOP4GTG+amm+vsUPYIsq4bBbqYPcgM6i5RT/0VEe9oQY1Cm0MTEwnaONIWens6HzNZF2CxKaWinwk3C7Gqqme9d+zkMCvqu1k+KFQbQwm9A8f8zWyClmwGmhT3n4b6s7MxKDagHzNqrsh7CyinVf/X2ULqhPHy2gs39BEGj00NsueWpDgYg7DuZM3rt08kRykqri4n5xbDYPCzsHpk+oxrvzKK3YPHEfs169xkuH9woGwI+o7HNl77LBogvxaUghXQvjsLL8hQTSFnCTsaT58BwWUAuPbthNiDeTxpjLabqY0QsyvNt6jTaEwyZD1bF9e34HIdlsI4pXO+Y8PqliWruxYKju47ey0iiD2C6xoe7dGpnNew7jcZ2XRDV/qhwTlXwjF4P1XRzTcxUIjyg+BgPmd2DRX8346onMOlWdNsM6G+/3zKMLGt5Waeptuv2qIfqDF/TT5mqMoKL+ZdepAnMoJL8cLNGb35K+ykMbhZVaWoQPVwCec/dR3tEcQkrc3xNCBVmyZuso9N1aySnjnMG/7b+tA9TQgdbhVwx/f00WDEcXCQ7Z9iB8XJ99V+UzaSjLGXk5HRH
ecwo343+nKg2ryjUZAhTn44dFq4c/vWjNyalTIgGiotFzHbGDsAlAzJic+zVGYHw7HKwnxlYZguMzeVciCU+ksBF2v0DN0OA/OgBzRWDaVvh/fAPDUMRy7dNoWx3N1LIdhNYf6pK7+o+lIWSjlhOO4oe+nFX4DL2dOgV7n1QaB5POTCahVp6SRDl5/76GMgSm1G8rlrKlzSsgqsemwSQMCLGayrYC+92Q5MEYbLzb/uePhP6DSBTKc0teQMOFJn4FcrZTA0zCjILhxspRsPLVDoexbIiws16BCbzG/ClSVOdX3aSE7ns8UgtdWppWBQOASdVRlJKYIAcktwldaNyOBL0rVt8el3TJGArDt8awRBmHR99jXi3uyVpkGAQD/eu8Fgvx8dFNPN1oDxv0xXrA262qnvWPC8ck1iPMehtDpjlODu8woy5RQqmwgslsObv7JKAm3T899tz7278s2NN7zD0//CHMfJxhh9bDfAsN/Q2Q47HG+HpP8YuzD3Pa1ckqjW0xImxSaxxgkpxygewAE22tcQAHvy9Oq3zwNZ97VRBw/eK/jS1ViAR6vHNCJn35IKLxLSfZpsT0ARYlQ/r7/Sc3t0R0IGR9LdamA3z6Csk1uH/B6/uIBIzsTQHTZvQuR4ygEzb9ug5ZCiYc5EqUKRgGPmXkLD+VSkI1W9TO0jCjl/tLM4zUs2psRXCC2bNMbq0w1zFOLEMWzJf+klbutHP+j991N1e2orlWLKpzI+Qi3fJTzdfsbLP5KkdB5xBblDcd24dopvX1ZKGbqfXffZ0J6bMhb0id8wwiy6ETph0NN9KOISzyR+hu2Tk6GCzcCZJo5P5zbBNDlGkueYE+9PP+B+Ul+QWAeUu9CTTdudfReT5GLbO+ShTm2AirzLYiEFnOk82j6N7kARTgOQsHT51JQSF2ymhqY3f7sS9FD0WMBYacIbxPa0g7PeeTwPTeZZHpUnP+sI4P0tLZvrGCCSdtEPOOKof8faPVstuUPWFbIL/N/FKpTP8/mJjBW80HjGVfWTy+h28fkw7fQ8BG5BJBGQBjtLlKiFlc/3OaVoEPscXpnSlkC8tr1uoRc/AOTp7E7N/EzA285d5ozPyNFHJDG/gio4DVZMjpbEwPLdr0ORIES7qz5gOFOlf7n+v7EtrjVN4nFpSsX9qlqeznybqxRb51H5EPjuzO+EDvsx85zF9khtI5K9Ny1IND71U5YkbDoi5ZdYf+vFWGfMiaVs4YJROuebwueh+If62NoRSa8X+opRsKmMZdrYyO9yu7/4dbWcA1nA5pqzgv9wQdyJ9BW0aGA4rd1aTOVvmWJcXgDiC0Rn3AUTqITVNWmP2Muvv503udJ4zjeNdzNVx6W2D1biis2GzwygkLsBemBSwuXEMEENIE/2pXmo9BsiCpCGRzQX0rJ7x6Cbqs3JHNDTIgTBd+I/TO9ElC2tnHIo+Rr4VjUPfpQr/lFrlBkNdO+MgRV+ad9XTJStOeMbDTOcJXoix76/4Fv04VSyM2fHxJAy1r1HWuw3fbZrE5B6BZoidIWvf8Hcj6iZz9Gv6YeX1wA6ADEHh93PUa9mupMOD6cTwCdmyGYxzdS9RgWE2q4QmjxT4b9Y8ZBA5PBaNFQWR/QCWU8YXRo3FEdz/pPUxuRPIMd+4WW+9jxlTt1QEusvFPxt6/hWKtXi8LVCa+ebrcXVWSL8nWCPDXEkcExr0bchCSyRgbqm0LcGkZr+13EpdfD1arvM2vwcfbNTH4BF0xWO2+rNEVDj0qUIJ1PhtdzVuS1N5O9rP5b3XREc00ixxUb4nJ78P8AuC1aK9YHsSMfj9Pl7J9qFbObM5KeMBRL10+5cUAMlTfxsaEefcT/hPPtFc+/zS+WDKGTAcHnRZHi5DTyCJJK7UEZ2t4TJ4OUwLwXgkAFTGC76cRmg35zpHjrCu+ILjXusMOJbgpTE9DuD2284KKJv0boVwr131oU1ofbQcTmqUtbNmnlQDRTiB/03Lkh3KpHLi
s7QNQCzh2ywYW6uAEPQvV/objZSTrJlDk/M2OWV7XsggCZUeGmr46UPJjBMA8f8wXvHfWkjZaXF0aS+rtGV2qb2iBMs4ePDS/wM9nN3DZcRi9FpfCPMDgD6uoLv0OCeFwRU/lUuOPDMLSjOUBJlezmhpMI0LGy9q1Mi82td/uf4GSyGEw1XQdmGUfPpUa3ItLPSjO9qeBrmU2peEJJaWSuYNvM3JwPcgTR3fO4kPSAUJLiaVC1izUMnM8oiBHSECBfjYoOphmfgr/j4OBWZKK3r/CfNzNJbSmFPXb/ZPEcu73o1Wd9/GiJXPE7hRQx2kgzjimKPsvQxQOPZlnXKnWnHYOalc0pwy1JQxHP4fgyVxXt1wDSFFlPCfek1L/QsaxdJsQQwFXMlaGzRPcA6yFN0W4sGrO5EjZFIUYXlCCblylQDH+ty8AuLEEHY9RM1BpXJSuE2fUMDhL1rUmXLjJx3/kHPbprIoTmUJgoWDuDGUzIXDKWyL8KLl/vED4NgimT6VSVQ4AT23lNE7fKUonJWxwmPzMN9580tTdaJihp9WGgaArfOvwn9eAjnOi9dVfQ49mucgFcUCkVEoI6+iZAkudHWw/l4eT70SWfb0LAhSyhfo9McWCJtpC5/HqMRSx6AiiZ7+D2fRSuqyGmGyCA3Qd1t479b8n7qG6ljhhEHMKvsqeCT/dB8pBUQ1RTApTmViS+veSvb3+MBSd6j3dyt9wQpf5QPc0IcQoATYG17fLjWyr074CwUAWCp1CXYGtluJW1VV4P0DKs1qqFfIp6JOLJV4gmPm7mrupp/CiNG1GKk/dWnxZSBdOHTgqaVktLUjKiWI8rLkQ2oXm1l8YtYQZJqLT9lNsqNADixlT4UkIbQ/tl7SEuu1aKMorDUJwSEyuhsZC8IjolmRqL40DhPZaaBSBoiG0Ke586ZZu8WyjlmZfcPsTKAAPHiwHIX2I617DYLEWXgNa6vyiAxCSg9MqPZcrwmlyLebaWT5FbkprjSyhtQXra0p9zi8T7od9oimYT24AatKNo18ZssACU5ibS0BqXtqQbWdRf2r5DXoRPT8xmH9gkd1WoBRwrzHuXACz+SLNrIEOyH7/gKSGdnpOob/XW7ObHYxiK+ZynqVbkXkD3gCU/rNpA0RrrroMcy0a61fDwJiRpFnw12fVG8RNep+rHTUWi8T1vgV/43aq1vVe4y3LHHUp/fT11CfUvWazLwhyBYqMOruCFDdSFib8EGfGtBv5jQ/iLEDl5WvnRV8yEdKTN4tjeBym0pxUwhBSdfbXOq+OFrp1CCppnjKFrPjJR673Y4qvtn1jGiSbsDV0SNlGGGagDUiApsbqoyOwJzktcT5qlENkoUjqZs4JYsY4WqiWcU4ETAHAtrhs02fzi//xI7jIDs0qS1/TuPzitRx39o8Ua54XH89suslDN9OCYthReGRCXSzNqw64cwChmFVOF4UHRdfuRm/gPwNEy2+u772NOlaksET7iS5J/39/5h/vIN+xCbShCcIDgHRNBG1NOvIy0Qz5rUj/9kBVkvRHLZ4fF07jSkLuUh8IbboafJwrV1s1d35pcJUJfdnUzhF3slyJutnJN1LqxndeL3vhU82hb/xfWK2Kf/8yE9NTBAFXRJ3O3gNyIYKb/HTUSCCFZVqDvoSmDk8B5uPUCty9biTB4cVClwL0vuE+1lPbRZqEV+IT0qNgu7TUxJjhQQYSVJS3pKfMG1BWCfIIK+UUIghw/v094kvlcYAAAAA==",
+        "text": b"Lorem Ipsum\n\nLorem ipsum dolor sit amet, consectetur adipiscing elit. Cras lobortis sem dui. "
+        b"Morbi at magna quis ligula faucibus\nconsectetur feugiat at purus. Sed nec lorem nibh. Nam vel "
+        b"libero odio. Vivamus tempus non enim egestas pretium.\nVestibulum turpis arcu, maximus nec libero "
+        b"quis, imperdiet suscipit purus. Vestibulum blandit quis lacus non\nsollicitudin. Nullam non "
+        b"convallis dui, et aliquet risus. Sed accumsan ullamcorper vehicula. Proin non urna facilisis,"
+        b"\ncondimentum eros quis, suscipit purus. Morbi euismod imperdiet neque fermentum dictum. Integer "
+        b"aliquam, erat sit\namet fringilla tempus, mauris ligula blandit sapien, et varius sem mauris eu "
+        b"diam. Sed fringilla neque est, in laoreet\nfelis tristique in. Donec luctus velit a posuere "
+        b"posuere. Suspendisse sodales pellentesque quam.\n",
+    }
+
+    scanner_event = run_test_scan(
+        mocker=mocker,
+        scan_class=ScanUnderTest,
+        fixture_path=Path(__file__).parent / "fixtures/test_text.webp",
+        options=(
+            {"split_words": False, "remove_formatting": False, "create_thumbnail": True}
+        ),
+    )
+
+    TestCase.maxDiff = None
+
+    # Output string formatting may result in slightly different results.
+    # Requiring a similarity ratio above 0.99 is good enough.
+    similarity = difflib.SequenceMatcher(
+        None, test_scan_event["text"], scanner_event["text"]
+    ).ratio()
+    assert similarity > 0.99
+
+    # Ensure the thumbnail conversion works properly.
+    TestCase().assertEqual(
+        test_scan_event["base64_thumbnail"], scanner_event["base64_thumbnail"]
+    )

From 623d7a11223bd8a60b48d7e1be665ac1a7b8be4b Mon Sep 17 00:00:00 2001
From: Paul Hutelmyer <paul.hutelmyer@gmail.com>
Date: Tue, 2 Jan 2024 15:09:38 -0500
Subject: [PATCH 2/9] Removing duplicates from IOCs

---
 src/python/strelka/strelka.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/python/strelka/strelka.py b/src/python/strelka/strelka.py
index c13998c3..07580952 100644
--- a/src/python/strelka/strelka.py
+++ b/src/python/strelka/strelka.py
@@ -739,7 +739,7 @@ def scan(self, data, file, options, expire_at) -> None:
 
     def scan_wrapper(
         self, data: bytes, file: File, options: dict, expire_at: int
-    ) -> Tuple[list[File], dict]:
+    ) -> Tuple[list[File], dict, list]:
         """Sets up scan attributes and calls scan method.
 
         Scanning code is wrapped in try/except for error handling.
@@ -801,6 +801,11 @@ def scan_wrapper(
                 **{"flags": self.flags},
                 **self.event,
             }
+
+            # Removes duplicate IOC entries, keeping first-seen order. IOCs are dicts (unhashable), so key on their items.
+            unique_iocs = {tuple(sorted(x.items())): x for x in self.iocs}
+            self.iocs = list(unique_iocs.values())
+
             return self.files, {self.key: self.event}, self.iocs
 
     def emit_file(

From f649bebf022eea6208215ea29b478a2539a2075c Mon Sep 17 00:00:00 2001
From: Paul Hutelmyer <paul.hutelmyer@gmail.com>
Date: Tue, 2 Jan 2024 15:17:16 -0500
Subject: [PATCH 3/9] Updating IOC extractors

---
 src/python/strelka/strelka.py | 141 +++++++++++++++++-----------------
 1 file changed, 69 insertions(+), 72 deletions(-)

diff --git a/src/python/strelka/strelka.py b/src/python/strelka/strelka.py
index 07580952..b8c93d28 100644
--- a/src/python/strelka/strelka.py
+++ b/src/python/strelka/strelka.py
@@ -1,6 +1,5 @@
 import glob
 import importlib
-import ipaddress
 import itertools
 import json
 import logging
@@ -16,13 +15,14 @@
 from typing import Generator, Optional, Tuple
 
 import inflection
-import magic  # type: ignore
+import magic
 import redis
-import validators  # type: ignore
-import yara  # type: ignore
-from boltons import iterutils  # type: ignore
+import validators
+import yara
+from boltons import iterutils
 from opentelemetry import context, trace
-from tldextract import TLDExtract  # type: ignore
+from tldextract import TLDExtract
+from urllib.parse import urlparse
 
 from . import __namespace__
 from .telemetry.traces import get_tracer
@@ -861,107 +861,104 @@ def upload_to_coordinator(self, pointer, chunk, expire_at) -> None:
             p.expireat(f"data:{pointer}", expire_at)
             p.execute()
 
-    def process_ioc(
-        self, ioc, ioc_type, scanner_name, description="", malicious=False
-    ) -> None:
+    def process_ioc(self, ioc, scanner_name) -> None:
+        """
+        Processes an Indicator of Compromise (IOC) and appends it to the scanner's IOC list.
+        
+        This method takes an IOC (such as a URL, domain, IP address, or email) and categorizes it
+        into an appropriate type. It validates the IOC using various validators and regular expressions,
+        then appends a dictionary containing the IOC, its type, and the scanner name to the scanner's IOC list.
+        If the IOC does not match any valid type, a warning is logged, and the IOC is not added.
+        
+        Args:
+            ioc (str or bytes): The IOC to be processed. Can be a string or bytes.
+                                If bytes, it will be decoded to a string.
+            scanner_name (str): The name of the scanner processing the IOC. This is used to tag the IOC
+                                in the appended dictionary.
+        
+        Note:
+            - The method internally handles different formats and types of IOCs (like URLs, domains, IPs, and emails).
+            - If the IOC is invalid or does not match a known pattern, a warning is logged and the IOC is not added.
+        """
         if not ioc:
             return
-        if ioc_type == "url":
-            if validators.ipv4(self.extract(ioc).domain):
+
+        if validators.url(ioc):
+            ioc_type = "url"
+            netloc = urlparse(ioc).netloc
+
+            if validators.ipv4(netloc):
                 self.process_ioc(
-                    self.extract(ioc).domain, "ip", scanner_name, description, malicious
+                    netloc,
+                    scanner_name,
                 )
-            else:
+            elif validators.ipv6(netloc):
                 self.process_ioc(
-                    self.extract(ioc).registered_domain,
-                    "domain",
+                    netloc,
                     scanner_name,
-                    description,
-                    malicious,
                 )
-            if not validators.url(ioc):
-                logging.warning(f"{ioc} is not a valid url")
-                return
-        elif ioc_type == "ip":
-            try:
-                ipaddress.ip_address(ioc)
-            except ValueError:
-                logging.warning(f"{ioc} is not a valid IP")
-                return
-        elif ioc_type == "domain":
-            if not validators.domain(ioc):
-                logging.warning(f"{ioc} is not a valid domain")
-                return
-        elif ioc_type == "email":
-            if not validators.email(ioc):
-                logging.warning(f"{ioc} is not a valid email")
-                return
-
-        if malicious:
-            self.iocs.append(
-                {
-                    "ioc": ioc,
-                    "ioc_type": ioc_type,
-                    "scanner": scanner_name,
-                    "description": description,
-                    "malicious": True,
-                }
-            )
+            elif validators.domain(netloc):
+                self.process_ioc(
+                    netloc,
+                    scanner_name,
+                )
+        elif validators.domain(ioc):
+            ioc_type = "domain"
+        elif re.match(r"^[\w\.\-]{2,62}\.[a-zA-Z]{2,5}:\d{1,5}$", ioc):
+            ioc_type = "domain"
+            ioc = ioc.split(":")[0]  # domain:port -> keep only the domain
+        elif validators.ipv4(ioc):
+            ioc_type = "ip"
+        elif validators.ipv6(ioc):
+            ioc_type = "ip"
+        elif validators.email(ioc):
+            ioc_type = "email"
+        elif re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}$", ioc):
+            ioc_type = "ip"
+            ioc = ioc.split(":")[0]  # ip:port -> keep only the address
         else:
-            self.iocs.append(
-                {
-                    "ioc": ioc,
-                    "ioc_type": ioc_type,
-                    "scanner": scanner_name,
-                    "description": description,
-                }
-            )
+            logging.warning(f"{ioc} does not match a valid IOC type")
+            return
 
-    def add_iocs(self, ioc, ioc_type, description="", malicious=False) -> None:
+        self.iocs.append(
+            {
+                "ioc": ioc,
+                "ioc_type": ioc_type,
+                "scanner": scanner_name,
+            }
+        )
+
+    def add_iocs(self, ioc) -> None:
         """Adds ioc to the iocs.
         :param ioc: The IOC or list of IOCs to be added. All iocs must be of the same type. Must be type String or Bytes.
-        :param ioc_type: Must be one of md5, sha1, sha256, domain, url, email, ip, either as string or type object (e.g. self.type.domain).
-        :param description (Optional): Description of the IOCs.
-        :param malicious (Optional): Reasonable determination whether the indicator is or would be used maliciously. Example:
-          Malware Command and Control. Should not be used solely for determining maliciousness since testing values may be present.
         """
         try:
-            accepted_iocs = ["md5", "sha1", "sha256", "domain", "url", "email", "ip"]
-            if ioc_type not in accepted_iocs:
-                logging.warning(
-                    f"{ioc_type} not in accepted range. Acceptable ioc types are: {accepted_iocs}"
-                )
-                return
             if isinstance(ioc, list):
                 for i in ioc:
                     if isinstance(i, bytes):
                         i = i.decode()
                     if not isinstance(i, str):
                         logging.warning(
-                            f"Could not process {i} from {self.name}: Type {type(i)} is not type Bytes or String"
+                            f"Could not process {i} from {self.name}: Type {type(i)} is"
+                            " not type Bytes or String"
                         )
                         continue
                     self.process_ioc(
                         i,
-                        ioc_type,
                         self.name,
-                        description=description,
-                        malicious=malicious,
                     )
             else:
                 if isinstance(ioc, bytes):
                     ioc = ioc.decode()
                 if not isinstance(ioc, str):
                     logging.warning(
-                        f"Could not process {ioc} from {self.name}: Type {type(ioc)} is not type Bytes or String"
+                        f"Could not process {ioc} from {self.name}: Type {type(ioc)} is"
+                        " not type Bytes or String"
                     )
                     return
                 self.process_ioc(
                     ioc,
-                    ioc_type,
                     self.name,
-                    description=description,
-                    malicious=malicious,
                 )
         except Exception as e:
             logging.error(f"Failed to add {ioc} from {self.name}: {e}")

From d0a34d36a491922e74fc67a05d2479f330526b95 Mon Sep 17 00:00:00 2001
From: Paul Hutelmyer <paul.hutelmyer@gmail.com>
Date: Tue, 2 Jan 2024 15:18:06 -0500
Subject: [PATCH 4/9] Formatting

---
 src/python/strelka/strelka.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/python/strelka/strelka.py b/src/python/strelka/strelka.py
index b8c93d28..0b6c4c0c 100644
--- a/src/python/strelka/strelka.py
+++ b/src/python/strelka/strelka.py
@@ -864,18 +864,18 @@ def upload_to_coordinator(self, pointer, chunk, expire_at) -> None:
     def process_ioc(self, ioc, scanner_name) -> None:
         """
         Processes an Indicator of Compromise (IOC) and appends it to the scanner's IOC list.
-        
+
         This method takes an IOC (such as a URL, domain, IP address, or email) and categorizes it
         into an appropriate type. It validates the IOC using various validators and regular expressions,
         then appends a dictionary containing the IOC, its type, and the scanner name to the scanner's IOC list.
         If the IOC does not match any valid type, a warning is logged, and the IOC is not added.
-        
+
         Args:
             ioc (str or bytes): The IOC to be processed. Can be a string or bytes.
                                 If bytes, it will be decoded to a string.
             scanner_name (str): The name of the scanner processing the IOC. This is used to tag the IOC
                                 in the appended dictionary.
-        
+
         Note:
             - The method internally handles different formats and types of IOCs (like URLs, domains, IPs, and emails).
             - If the IOC is invalid or does not match a known pattern, a warning is logged and the IOC is not added.

From b42b78126dcf2d2d9a3f2538d347d0adc6c7e52d Mon Sep 17 00:00:00 2001
From: Paul Hutelmyer <paul.hutelmyer@gmail.com>
Date: Tue, 2 Jan 2024 15:26:01 -0500
Subject: [PATCH 5/9] Modifying IOC Supported Scanners

---
 src/python/strelka/scanners/scan_iqy.py   | 72 +++++++++++++----------
 src/python/strelka/scanners/scan_xl4ma.py | 62 +++++++++++--------
 2 files changed, 79 insertions(+), 55 deletions(-)

diff --git a/src/python/strelka/scanners/scan_iqy.py b/src/python/strelka/scanners/scan_iqy.py
index 5223e42e..24322f1a 100644
--- a/src/python/strelka/scanners/scan_iqy.py
+++ b/src/python/strelka/scanners/scan_iqy.py
@@ -1,4 +1,4 @@
-# Description #
+#### Description ####
 # This scanner is looking for iqy files used with excel.
 #
 # author: Tasha Taylor
@@ -11,52 +11,60 @@
 
 class ScanIqy(strelka.Scanner):
     """
-    Extract URLs from IQY files.
+    Strelka scanner for extracting URLs from IQY (Excel Web Query Internet Inquire) files.
 
-    IQY files, or Excel Web Query Internet Inquire files, are typically created from a VBA Web Query output.
-    The following is a typical format:
-        WEB
-        1
-        [URL]
-        [optional parameters]
-    Additional properties can be found at: https://learn.microsoft.com/en-us/office/vba/api/excel.querytable
+    IQY files are typically used to import data into Excel from the web. They often contain URLs
+    that specify the data source. This scanner aims to extract these URLs and process them for IOCs.
+
+    The following is a typical format of an IQY file:
+    WEB
+    1
+    [URL]
+    [optional parameters]
+
+    Reference for IQY file format: https://learn.microsoft.com/en-us/office/vba/api/excel.querytable
     """
 
     def scan(self, data, file, options, expire_at):
+        """
+        Processes the provided IQY data to extract URLs.
+
+        Attempts to decode the data and applies a regex pattern to identify and extract URLs.
+        Extracted URLs are added to the scanner's IOC list.
+
+        Args:
+            data (bytes): Data associated with the IQY file to be scanned.
+            file (strelka.File): File object associated with the data.
+            options (dict): Options to be applied during the scan.
+            expire_at (int): Expiration timestamp for extracted files.
+        """
         try:
-            # Regular expression for detecting a URL-like pattern
+            # Compile regex pattern for URL detection
             address_pattern = re.compile(
                 r"\b(?:http|https|ftp|ftps|file|smb)://\S+|"
                 r"\\{2}\w+\\(?:[\w$]+\\)*[\w$]+",
                 re.IGNORECASE,
             )
 
-            # Attempt UTF-8 decoding first, fall back to latin-1 if necessary
+            # Attempt to decode the data
             try:
-                data = data.decode("utf-8")
+                decoded_data = data.decode("utf-8")
             except UnicodeDecodeError:
-                data = data.decode("latin-1")
+                decoded_data = data.decode("latin-1")
 
-            # Split lines to review each record separately
-            data_lines = data.splitlines()
-
-            addresses = set()
-            # For each line, check if the line matches the address pattern.
-            # In a typical IQY file, the "WEB" keyword is at the beginning of the file,
-            # and what follows is usually just one URL with optional additional parameters.
-            # However, because we are iterating lines anyway, lets check for additional addresses anyway.
-            for entry in data_lines[1:]:
-                match = address_pattern.search(entry)
-                if match:
-                    address = match.group().strip()
-                    if address:
-                        addresses.add(address)
-
-            # Evaluate if any addresses were found and assign the boolean result.
-            self.event["address_found"] = bool(addresses)
+            # Extract addresses from the data
+            addresses = set(
+                match.group().strip()
+                for line in decoded_data.splitlines()
+                if (match := address_pattern.search(line))
+            )
 
-            # Send all addresses to the IOC parser.
-            self.add_iocs(list(addresses), self.type.url)
+            # Add extracted URLs to the scanner's IOC list
+            if addresses:
+                self.event["address_found"] = True
+                self.add_iocs(list(addresses))
+            else:
+                self.event["address_found"] = False
 
         except UnicodeDecodeError as e:
             self.flags.append(f"Unicode decoding error: {e}")
diff --git a/src/python/strelka/scanners/scan_xl4ma.py b/src/python/strelka/scanners/scan_xl4ma.py
index 6bca5eb9..02c46872 100644
--- a/src/python/strelka/scanners/scan_xl4ma.py
+++ b/src/python/strelka/scanners/scan_xl4ma.py
@@ -1,35 +1,51 @@
-from strelka import strelka
 from strelka.auxiliary.xl4ma import analyzer
+from strelka import strelka
 
 
 class ScanXl4ma(strelka.Scanner):
-    """Extracts Excel 4 cell contents and attempts to extract IOCs"""
+    """
+    Strelka scanner for extracting Excel 4 cell contents and IOCs.
+
+    This scanner uses the xl4ma analyzer to extract data from Excel files.
+    It attempts to decode Excel 4 cell contents and extract any potential IOCs.
+    Extracted data is added to the scanner's event, and IOCs are processed
+    using the scanner's IOC processing capabilities.
+
+    Attributes inherited from strelka.Scanner:
+        - name (str): Name of the scanner class.
+        - key (str): Metadata key used to identify scanner metadata in scan results.
+        - event (dict): Dictionary containing the result of the scan.
+        - flags (list): List of flags raised during scanning.
+        - iocs (list): List of IOCs extracted during scanning.
+    """
 
     def scan(self, data, file, options, expire_at):
-        results = {}
+        """
+        Overridable scan method from strelka.Scanner.
+
+        Processes the provided data using the xl4ma analyzer and extracts
+        relevant information and IOCs.
 
-        # Attempt to process Excel data using analyzer
+        Args:
+            data (bytes): Data associated with the file to be scanned.
+            file (strelka.File): File object associated with the data.
+            options (dict): Options to be applied during the scan.
+            expire_at (int): Expiration timestamp for extracted files.
+        """
+        # Attempt to process Excel data using the xl4ma analyzer
         try:
+            # Process Excel data and store the results
             results = analyzer.process_data(data=data, filename=file.name)
+
+            # Check whether decoded content and IOCs are present in the results
+            if "decoded" in results:
+                self.event["decoded"] = results["decoded"]
+            if "iocs" in results:
+                self.event["iocs"] = results["iocs"]
+                self.add_iocs(results["iocs"])
         except strelka.ScannerTimeout:
+            # Propagate the timeout exception
             raise
         except Exception as e:
-            self.flags.append(str(e))
-            print(str(e))
-            return
-
-        # If processing successful, extract keys and apply to IOC scanner.
-        if results:
-            self.event["decoded"] = results.get("decoded", [])
-            self.event["iocs"] = results.get("iocs", [])
-
-            try:
-                self.add_iocs(
-                    results.get("iocs", []),
-                    self.type.url,
-                    description="extracted from excel4 macro",
-                )
-            except strelka.ScannerTimeout:
-                raise
-            except Exception:
-                self.flags.append("xl4ma_ioc_processing_error")
+            # Append exception message to flags for diagnostic purposes
+            self.flags.append(f"xl4ma_processing_exception: {str(e)}")

From 299f4748fbd09e5b25dc476a1d8c8864059ac98a Mon Sep 17 00:00:00 2001
From: Paul Hutelmyer <paul.hutelmyer@gmail.com>
Date: Wed, 3 Jan 2024 11:33:32 -0500
Subject: [PATCH 6/9] Removing Leading Docstring

---
 src/python/strelka/scanners/scan_iqy.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/python/strelka/scanners/scan_iqy.py b/src/python/strelka/scanners/scan_iqy.py
index 24322f1a..c0cf03d6 100644
--- a/src/python/strelka/scanners/scan_iqy.py
+++ b/src/python/strelka/scanners/scan_iqy.py
@@ -1,9 +1,3 @@
-#### Description ####
-# This scanner is looking for iqy files used with excel.
-#
-# author: Tasha Taylor
-# date: 10/30/2023
-
 import re
 
 from strelka import strelka

From 0693a1838038a8c8bcda352932c10b609b915e57 Mon Sep 17 00:00:00 2001
From: Paul Hutelmyer <paul.hutelmyer@gmail.com>
Date: Wed, 3 Jan 2024 11:35:10 -0500
Subject: [PATCH 7/9] Fixing import formatting

---
 src/python/strelka/scanners/scan_ocr.py   | 5 +++--
 src/python/strelka/scanners/scan_xl4ma.py | 2 +-
 src/python/strelka/strelka.py             | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/python/strelka/scanners/scan_ocr.py b/src/python/strelka/scanners/scan_ocr.py
index 08ad8c61..d44f4c5d 100644
--- a/src/python/strelka/scanners/scan_ocr.py
+++ b/src/python/strelka/scanners/scan_ocr.py
@@ -1,9 +1,10 @@
+import base64
+import io
 import os
 import subprocess
 import tempfile
-import io
+
 import fitz
-import base64
 from PIL import Image
 
 from strelka import strelka
diff --git a/src/python/strelka/scanners/scan_xl4ma.py b/src/python/strelka/scanners/scan_xl4ma.py
index 02c46872..840b2274 100644
--- a/src/python/strelka/scanners/scan_xl4ma.py
+++ b/src/python/strelka/scanners/scan_xl4ma.py
@@ -1,5 +1,5 @@
-from strelka.auxiliary.xl4ma import analyzer
 from strelka import strelka
+from strelka.auxiliary.xl4ma import analyzer
 
 
 class ScanXl4ma(strelka.Scanner):
diff --git a/src/python/strelka/strelka.py b/src/python/strelka/strelka.py
index 0b6c4c0c..9bb643c6 100644
--- a/src/python/strelka/strelka.py
+++ b/src/python/strelka/strelka.py
@@ -13,6 +13,7 @@
 import uuid
 from types import FrameType
 from typing import Generator, Optional, Tuple
+from urllib.parse import urlparse
 
 import inflection
 import magic
@@ -22,7 +23,6 @@
 from boltons import iterutils
 from opentelemetry import context, trace
 from tldextract import TLDExtract
-from urllib.parse import urlparse
 
 from . import __namespace__
 from .telemetry.traces import get_tracer

From 7a30ed193b9c084339668b59808fab06b5345a5e Mon Sep 17 00:00:00 2001
From: Paul Hutelmyer <paul.hutelmyer@gmail.com>
Date: Wed, 3 Jan 2024 11:39:02 -0500
Subject: [PATCH 8/9] Changing regex to raw strings

---
 src/python/strelka/strelka.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/python/strelka/strelka.py b/src/python/strelka/strelka.py
index 9bb643c6..a3fb5c1d 100644
--- a/src/python/strelka/strelka.py
+++ b/src/python/strelka/strelka.py
@@ -904,7 +904,7 @@ def process_ioc(self, ioc, scanner_name) -> None:
                 )
         elif validators.domain(ioc):
             ioc_type = "domain"
-        elif re.match("^[\w\.\-]{2,62}\.[a-zA-Z]{2,5}:\d{1,5}$", ioc):
+        elif re.match(r"^[\w\.\-]{2,62}\.[a-zA-Z]{2,5}:\d{1,5}$", ioc):
             ioc_type = "domain"
             ioc = ioc.split(":")[0]
         elif validators.ipv4(ioc):
@@ -913,7 +913,7 @@ def process_ioc(self, ioc, scanner_name) -> None:
             ioc_type = "ip"
         elif validators.email(ioc):
             ioc_type = "email"
-        elif re.match("^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}$", ioc):
+        elif re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}$", ioc):
             ioc_type = "ip"
             ioc = ioc.split(":")[0]
         else:

From 95ebbf41b868342c0689a5143c083c1ffa80dab9 Mon Sep 17 00:00:00 2001
From: Paul Hutelmyer <paul.hutelmyer@gmail.com>
Date: Wed, 3 Jan 2024 12:08:26 -0500
Subject: [PATCH 9/9] Updating IOC duplicate removal / fixing tests

---
 src/python/strelka/strelka.py               | 12 +++++++++++-
 src/python/strelka/tests/test_scan_iqy.py   |  2 --
 src/python/strelka/tests/test_scan_xl4ma.py |  8 +-------
 3 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/src/python/strelka/strelka.py b/src/python/strelka/strelka.py
index a3fb5c1d..48521951 100644
--- a/src/python/strelka/strelka.py
+++ b/src/python/strelka/strelka.py
@@ -803,8 +803,18 @@ def scan_wrapper(
             }
 
             # Removes duplicate entries from IOC list
+            unique_iocs = []
             seen = set()
-            self.iocs = [x for x in self.iocs if x not in seen and not seen.add(x)]
+            for ioc in self.iocs:
+                identifier = (
+                    ioc["ioc"],
+                    ioc["ioc_type"],
+                )  # Unique identifier based on 'ioc' and 'ioc_type'
+                if identifier not in seen:
+                    seen.add(identifier)
+                    unique_iocs.append(ioc)
+
+            self.iocs = unique_iocs
 
             return self.files, {self.key: self.event}, self.iocs
 
diff --git a/src/python/strelka/tests/test_scan_iqy.py b/src/python/strelka/tests/test_scan_iqy.py
index fad64fbd..7fa9047c 100644
--- a/src/python/strelka/tests/test_scan_iqy.py
+++ b/src/python/strelka/tests/test_scan_iqy.py
@@ -17,13 +17,11 @@ def test_scan_iqy(mocker):
         "address_found": True,
         "iocs": [
             {
-                "description": "",
                 "ioc": "github.com",
                 "ioc_type": "domain",
                 "scanner": "ScanIqy",
             },
             {
-                "description": "",
                 "ioc": "https://github.com/target/strelka/blob/master/docs/index.html",
                 "ioc_type": "url",
                 "scanner": "ScanIqy",
diff --git a/src/python/strelka/tests/test_scan_xl4ma.py b/src/python/strelka/tests/test_scan_xl4ma.py
index 18fec287..2d52a712 100644
--- a/src/python/strelka/tests/test_scan_xl4ma.py
+++ b/src/python/strelka/tests/test_scan_xl4ma.py
@@ -26,17 +26,11 @@ def test_scan_xl4ma(mocker):
             ]
         ),
         "iocs": [
-            {
-                "ioc": "example.com",
-                "ioc_type": "domain",
-                "scanner": "ScanXl4ma",
-                "description": "extracted from excel4 macro",
-            },
+            {"ioc": "www.example.com", "ioc_type": "domain", "scanner": "ScanXl4ma"},
             {
                 "ioc": "https://www.example.com/path/to/resource",
                 "ioc_type": "url",
                 "scanner": "ScanXl4ma",
-                "description": "extracted from excel4 macro",
             },
         ],
     }