From 08bdd9d4ad7072e26fd64b9e57e03eb02bc1d490 Mon Sep 17 00:00:00 2001 From: Paul Hutelmyer Date: Tue, 2 Jan 2024 15:09:29 -0500 Subject: [PATCH 1/9] Adding Thumbnail Functionality --- configs/python/backend/backend.yaml | 8 ++- src/python/strelka/scanners/scan_ocr.py | 76 +++++++++++++++------ src/python/strelka/tests/test_scan_ocr.py | 83 +++++++++++++++++++++++ 3 files changed, 142 insertions(+), 25 deletions(-) diff --git a/configs/python/backend/backend.yaml b/configs/python/backend/backend.yaml index 018f1f5b..7d2919ac 100644 --- a/configs/python/backend/backend.yaml +++ b/configs/python/backend/backend.yaml @@ -1,4 +1,4 @@ -version: 2023-07-07-update +version: 2024.02.01.01 logging_cfg: '/etc/strelka/logging.yaml' limits: max_files: 5000 @@ -380,9 +380,11 @@ scanners: priority: 5 options: extract_text: False - split_words: True - tmp_directory: '/dev/shm/' + tmp_directory: "/dev/shm/" pdf_to_png: True + remove_formatting: True + create_thumbnail: True + thumbnail_size: [250, 250] 'ScanOle': - positive: flavors: diff --git a/src/python/strelka/scanners/scan_ocr.py b/src/python/strelka/scanners/scan_ocr.py index e55ed9da..08ad8c61 100644 --- a/src/python/strelka/scanners/scan_ocr.py +++ b/src/python/strelka/scanners/scan_ocr.py @@ -1,33 +1,65 @@ import os import subprocess import tempfile - +import io import fitz +import base64 +from PIL import Image from strelka import strelka class ScanOcr(strelka.Scanner): - """Collects metadata and extracts optical text from image files. + """Extracts optical text from image files and creates a thumbnail. + + This scanner extracts text from image files using OCR (Optical Character Recognition) and + generates a base64-encoded thumbnail. It supports direct image files and converting PDFs + to images for OCR. Options: - extract_text: Boolean that determines if optical text should be - extracted as a child file. - Defaults to False. - tmp_directory: Location where tempfile writes temporary files. - Defaults to '/tmp/'. + extract_text: If True, extracted text is emitted as a child file. (default: False) + split_words: If True, splits the OCR text into words and stores an array. (default: True) + remove_formatting: If True, removes formatting characters (e.g., \r). Overridden by split_words. (default: True) + tmp_directory: Directory for temporary files. (default: '/tmp/') + pdf_to_png: If True, converts PDFs to PNG for OCR. (default: False) + create_thumbnail: If True, creates a thumbnail for the image. (default: False) + thumbnail_size: Size of the thumbnail to create. (default: (250, 250)) """ def scan(self, data, file, options, expire_at): extract_text = options.get("extract_text", False) split_words = options.get("split_words", True) + remove_formatting = options.get("remove_formatting", True) tmp_directory = options.get("tmp_directory", "/tmp/") pdf_to_png = options.get("pdf_to_png", False) + create_thumbnail = options.get("create_thumbnail", False) + thumbnail_size = options.get("thumbnail_size", (250, 250)) + # Convert PDF to PNG if required. if pdf_to_png and "application/pdf" in file.flavors.get("mime", []): - doc = fitz.open(stream=data, filetype="pdf") - data = doc.get_page_pixmap(0).tobytes("png") - + try: + doc = fitz.open(stream=data, filetype="pdf") + data = doc.get_page_pixmap(0).tobytes("png") + except Exception as e: + self.flags.append( + f"{self.__class__.__name__}: image_pdf_error: {str(e)[:50]}" + ) + + # Create a thumbnail from the image. + # Stores as a base64 value in the key: base64_thumbnail + if create_thumbnail: + try: + image = Image.open(io.BytesIO(data)) + image.thumbnail(thumbnail_size, Image.Resampling.BILINEAR) + buffered = io.BytesIO() + image.save(buffered, format="WEBP", quality=70, optimize=True) + base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8") + self.event["base64_thumbnail"] = base64_image + except Exception as e: + self.flags.append( + f"{self.__class__.__name__}: image_thumbnail_error: {str(e)[:50]}" + ) + # Perform OCR on the image data. with tempfile.NamedTemporaryFile(dir=tmp_directory) as tmp_data: tmp_data.write(data) tmp_data.flush() @@ -35,28 +67,26 @@ def scan(self, data, file, options, expire_at): with tempfile.NamedTemporaryFile(dir=tmp_directory) as tmp_tess: try: tess_txt_name = f"{tmp_tess.name}.txt" - - completed_process = subprocess.run( + subprocess.run( ["tesseract", tmp_data.name, tmp_tess.name], capture_output=True, check=True, ) - _ = completed_process - with open(tess_txt_name, "rb") as tess_txt: ocr_file = tess_txt.read() - if ocr_file: if split_words: self.event["text"] = ocr_file.split() else: - self.event["text"] = ( - ocr_file.replace(b"\r", b"") - .replace(b"\n", b"") - .replace(b"\f", b"") - ) - + if remove_formatting: + self.event["text"] = ( + ocr_file.replace(b"\r", b"") + .replace(b"\n", b"") + .replace(b"\f", b"") + ) + else: + self.event["text"] = ocr_file if extract_text: # Send extracted file back to Strelka self.emit_file(ocr_file, name="text") @@ -64,5 +94,7 @@ def scan(self, data, file, options, expire_at): os.remove(tess_txt_name) except subprocess.CalledProcessError as e: - self.flags.append("tesseract_process_error") + self.flags.append( + f"{self.__class__.__name__}: tesseract_process_error: {str(e)[:50]}" + ) raise strelka.ScannerException(e.stderr) diff --git a/src/python/strelka/tests/test_scan_ocr.py b/src/python/strelka/tests/test_scan_ocr.py index 57b16fe6..b0bc3f2a 100644 --- a/src/python/strelka/tests/test_scan_ocr.py +++ b/src/python/strelka/tests/test_scan_ocr.py @@ -1,3 +1,4 @@ +import difflib from pathlib import Path from unittest import TestCase, mock @@ -420,3 +421,85 @@ def test_scan_ocr_webp(mocker): TestCase.maxDiff = None TestCase().assertDictEqual(test_scan_event, scanner_event) + + +def test_scan_ocr_keep_formatting(mocker): + """ + Pass: Sample event matches output of scanner. + Failure: Unable to load file or sample event fails to match. + """ + + test_scan_event = { + "elapsed": mock.ANY, + "flags": [], + "text": b"Lorem Ipsum\n\nLorem ipsum dolor sit amet, consectetur adipiscing elit. Cras lobortis sem dui. " + b"Morbi at magna quis ligula faucibus\nconsectetur feugiat at purus. Sed nec lorem nibh. Nam vel " + b"libero odio. Vivamus tempus non enim egestas pretium.\nVestibulum turpis arcu, maximus nec libero " + b"quis, imperdiet suscipit purus. Vestibulum blandit quis lacus non\nsollicitudin. Nullam non " + b"convallis dui, et aliquet risus. Sed accumsan ullamcorper vehicula. Proin non urna facilisis," + b"\ncondimentum eros quis, suscipit purus. Morbi euismod imperdiet neque fermentum dictum. Integer " + b"aliquam, erat sit\namet fringilla tempus, mauris ligula blandit sapien, et varius sem mauris eu " + b"diam. Sed fringilla neque est, in laoreet\nfelis tristique in. Donec luctus velit a posuere " + b"posuere. Suspendisse sodales pellentesque quam.\n", + } + + scanner_event = run_test_scan( + mocker=mocker, + scan_class=ScanUnderTest, + fixture_path=Path(__file__).parent / "fixtures/test_text.webp", + options=({"split_words": False, "remove_formatting": False}), + ) + + TestCase.maxDiff = None + + # Output string formatting may result in slightly different results. + # Comparing similarity to the 99% percentile is good enough. + similarity = difflib.SequenceMatcher( + None, test_scan_event["text"], scanner_event["text"] + ).ratio() + assert similarity > 0.99 + + +def test_scan_ocr_thumbnail(mocker): + """ + Pass: Sample event matches output of scanner. + Failure: Unable to load file or sample event fails to match. + """ + + test_scan_event = { + "elapsed": mock.ANY, + "flags": [], + "base64_thumbnail": "UklGRpoWAABXRUJQVlA4II4WAAAQUwCdASr6AJ8APp1EnEolo6KhqhUr6LATiWlu3WBpKP1U/6LwT/Ivon9X/f+IR2D5j/aH+h/iPObvj+IH+P6gvs3/Pel39D2n2wf5L0CPZ77T5kv1fmj9mfYE/Mz1m/8Pha/gv+h7An6e9I3QS+4eoT5dHsv9DlBZKO/3p+CYGIsxxoyLSfFJnk9qJs7xKOZiJEUM4voTPRt7q24KjWPzjP9LJSsOJXXHFklGiUlVhswwkwckKAJmkikmoUTlFySc2DYEAqA7oS65VGhxIchghhf/BGmBkrpssZjVSyMRC+nii9V5sve+Bg3cATPHNMkM/w6CnGg9glAqCdZjjqy0zamZLK642ZKCBc69CE4x1Cf0sCPXNLTS3mMTyUHftv4G9G1eDUlyTvICoabo0wKuqre97OFv755jJO70CvHC+HFkrKT9+V7VLhJab1LO1GaOub6hK1/oDTgqmRB2m6NL1kkSpOp43ZtJL7c0XJtRQr8E7m1N7zN2Gx1r1gDI/T9AyjPLGYj1eAiUzJZXW+HeQea7qMc9KneNWye5XTJmtxSRfzUUlqLLgRhoofNYvT/Kkt+VUVA5mV99OdE5R9JPWWzYIDOxV3fcS4D8faN3wisidC/76Cm4+9rP3jtmS+oz75VdxfzkRAE4vFcn56lQgWbtpI+lSKQY+LcRr9kAx1cgK5hvmF8k69iQaRIRUVKrczSS7kesuYQqO2cdcjlYdBpHhG5zzTioUyOCHCT2fOlXTZULVXUoDD9B0uTrYUMqVDJQP5II/eUTKByPmUWp/ZYL89ARn5nnFHJzPMCactNH/UsKcdzPArdZDEHVIjdngV1Rwk5Z0zTuGZEDdkMJAnD+CzqzRgw+hKRxiuvNOOXBZuItulW6ldy7uc0nVDkgAP77IeznaeNmVR+M7qCRlQfd4Czy8iTZqCCTFppPckMktMofwJIK4WofztLf4ehG5ucxbND1FiaHERAsW+P1E16YE3pEEnVyy78VQxYWA1inV6C3WmStm8YLlee80nnTNQ7tD2tqVS4awu/S0zeHTMW0Dv6PpxaD+cR88QpFPvhn4zgXFNfM+4IX5dctNjnwvhtvfi+frNNdGBmoJMTfZwuhXEBiDNlM93UFIhueTIJKO7noJ/1VcSd3vI+E+x5obCs1Y7K+2+3cOx+3/4Ju9nh6Q8urNc99XNm3aDWzOPufkp8QdGXjlcr/QApKXWpAtdC3w8VkNwZ92UeLaybH/y6g+8XfgfagTDqk59fDRbUIFj6tBelaEqV5l9zRe9hX940VJn/8PgVTjJIaeTdUQTdAH/tHf/bvvQVx18qvzyb3zTHCTa5X6FfTnwNhMkQ3O9XqOHgE8d/xOfitRHWsXccr/Dno2Ts+rN/X+aJ1HkXmlEVd/DMnmPfbkY79cx1vC37nk4btceed5M6AKcC96Otdr0M48tOpXRNJhpZrSrq8J2jhf5pJYNrD8+kfvl7mspxkUCOeFMMuCxJqEAgFu+vUvqVz97Nlr57p++IhmmBI32bdPpLP1Izg17IhWrqDf1EQp0kk3KFmLEs6BncDDZUWPwzcj0/aetD1tq4d5LFubRTi7b63w7J5vmAe5JSgwtn7fJ3TZ1/VgFoX3LMEgHmBpYjEjBsToibxoP7vDMCXf/a/RZzpA/jjT/GHomhs0DsPs63ba+Cb2blZ3qHrA6g3EKf7Lhapyk9LMj8YWb8z3P/L407laboZlj3BJdqaLQvX32KzVnFeYzcdutStRWHDYpeBO0iL2jju48wvxYGsP8Rv99bbSohwDXg/5Nf/zc50KuZPsIHpzEiwUeZqnq11XgH4nUpC3q+c/P05XSyf5Sy7GGxH5WI/s/PnLAA/hLwJsZnfCVMVB0nrN1CareL3oBX8MI5pacIVee1Nc3Fu6UaN/kqnjM7rsjOA7Yq/8RggDoUATJNtPioDeb0akRWNg/ZNQMtjrT6H/2+DjM+5w/aU+fT10annXH0pOlydUVODcN8dbUoJDTapuGPMYvoR1eMsetqJ933/Wqwfqvep8lFAeTKCJeK5gsum8WnXZ31Pw1lwPDGAhcvdkQleFV+WorMqp527ZXA8hxjBH8PyvmIwZZzu6iNzeEZULuk0YA4i+P1OYsztDn/Ait6d7VU3p/olPEMgq/kzV1K0eiyVKTUqHKL0D26nkS6uOLPgPeYbEGFYz4a3CEyDvj56zTLl3qtH49rZErA1rte9D5c18Bp09AC+vXOZVc2b28TIn4c8sI8kfHduHC7uhFlTqsJNMm54DOZctMWFD8EAFPJn+CXxU8OY72Q0Lo+LR6QBGqM+6eGdpM6OkCbG200mpIG1DSnCf7LMbB0LvfJyN6jzAQGZZq8klyEzmZIoCg8QlpCtoTJQ+AIqFWl9z4ywir7j1eaE69Ac68vNxMkj5pNtORvzKEFxIICioxjKBLO82BuwAy/4y3NkP/kvq7O0B6NrO/KtZXcqSwuhvgQoyIdKBpUcaZFbUWEMeHUelwLukHWTyZJGl4qHH1R7ngxSiyHI4u6Fjm+AC6yw6TlOrwbwbPtHElveg8aCXiTUVxNB1wRsvP4xpHeG4atg6flIeX4sgjtS1Ow4z96jJWi2HFjH6n/SVI/utmZZT+pDmQhSwd4ki7+L2q47FT/mB3ae13xPZbX/PRrg0oZG3Fx6RS8ZuohpFqX33DcSPE12Jtq9AA1CgCTh5cURyE0/yc2XI04tsdbsFxq4/jnn5u3NuXGgZWCU1JEiIQr446AAyRgNrBEVC1oiYPcxe1wnd17M1Z60V/RUd5/h0UKUOufwn3LSQLq0JvYr0Z4hY6fC1rZU+BmbhiW7P7X2AE0Rq6WHYc5baeeesKS6HJawuV9rBIZu7WLeNmvLnQhPF5UAKOaVq7W19l/gOq7nPxvhgOYNHzwq/53cTre0P7ScopX20PgD9tt/tL0oErJyGRVoBKoRuFLJ+c1Qk/6pEZY0GREKAFwUf+/MQvJjI4JQNh5qctnk0hBPwrMLxJ7E8wetG4kRrEApm6V58S4UJT9IbmrMe6FKBlzPZ+4g4zaf+ajS+/c3nQ3brRh+4e6cZif4Y5uIYZC8MYCBtJ2DD8FiC95HAHzQaAA+sxNeOA8u/0UqKqmCLo1AyxU9xIzZlMJYT3dH8OgFaZ+n4TOXKBzx7ddS2p8yT400Pg6Wdt81SbWf13/U6ra+4TxEl/6Ep5+q9kc/xHvGeqb0O8g5iEnLfU/08FF3nfkkgtOHKhhwmT1+RcWd3sMbPLoBxh8n0J/Q4NWMkgc//XzOtcbISz7a+dN328J2kLyfeK0662DDECirQdQMb8Sz+gsauD5e1cDcFgwcM86gOtURB9gb8v/JnNjsJaqFIizZOP4GTG+amm+vsUPYIsq4bBbqYPcgM6i5RT/0VEe9oQY1Cm0MTEwnaONIWens6HzNZF2CxKaWinwk3C7Gqqme9d+zkMCvqu1k+KFQbQwm9A8f8zWyClmwGmhT3n4b6s7MxKDagHzNqrsh7CyinVf/X2ULqhPHy2gs39BEGj00NsueWpDgYg7DuZM3rt08kRykqri4n5xbDYPCzsHpk+oxrvzKK3YPHEfs169xkuH9woGwI+o7HNl77LBogvxaUghXQvjsLL8hQTSFnCTsaT58BwWUAuPbthNiDeTxpjLabqY0QsyvNt6jTaEwyZD1bF9e34HIdlsI4pXO+Y8PqliWruxYKju47ey0iiD2C6xoe7dGpnNew7jcZ2XRDV/qhwTlXwjF4P1XRzTcxUIjyg+BgPmd2DRX8346onMOlWdNsM6G+/3zKMLGt5Waeptuv2qIfqDF/TT5mqMoKL+ZdepAnMoJL8cLNGb35K+ykMbhZVaWoQPVwCec/dR3tEcQkrc3xNCBVmyZuso9N1aySnjnMG/7b+tA9TQgdbhVwx/f00WDEcXCQ7Z9iB8XJ99V+UzaSjLGXk5HRHecwo343+nKg2ryjUZAhTn44dFq4c/vWjNyalTIgGiotFzHbGDsAlAzJic+zVGYHw7HKwnxlYZguMzeVciCU+ksBF2v0DN0OA/OgBzRWDaVvh/fAPDUMRy7dNoWx3N1LIdhNYf6pK7+o+lIWSjlhOO4oe+nFX4DL2dOgV7n1QaB5POTCahVp6SRDl5/76GMgSm1G8rlrKlzSsgqsemwSQMCLGayrYC+92Q5MEYbLzb/uePhP6DSBTKc0teQMOFJn4FcrZTA0zCjILhxspRsPLVDoexbIiws16BCbzG/ClSVOdX3aSE7ns8UgtdWppWBQOASdVRlJKYIAcktwldaNyOBL0rVt8el3TJGArDt8awRBmHR99jXi3uyVpkGAQD/eu8Fgvx8dFNPN1oDxv0xXrA262qnvWPC8ck1iPMehtDpjlODu8woy5RQqmwgslsObv7JKAm3T899tz7278s2NN7zD0//CHMfJxhh9bDfAsN/Q2Q47HG+HpP8YuzD3Pa1ckqjW0xImxSaxxgkpxygewAE22tcQAHvy9Oq3zwNZ97VRBw/eK/jS1ViAR6vHNCJn35IKLxLSfZpsT0ARYlQ/r7/Sc3t0R0IGR9LdamA3z6Csk1uH/B6/uIBIzsTQHTZvQuR4ygEzb9ug5ZCiYc5EqUKRgGPmXkLD+VSkI1W9TO0jCjl/tLM4zUs2psRXCC2bNMbq0w1zFOLEMWzJf+klbutHP+j991N1e2orlWLKpzI+Qi3fJTzdfsbLP5KkdB5xBblDcd24dopvX1ZKGbqfXffZ0J6bMhb0id8wwiy6ETph0NN9KOISzyR+hu2Tk6GCzcCZJo5P5zbBNDlGkueYE+9PP+B+Ul+QWAeUu9CTTdudfReT5GLbO+ShTm2AirzLYiEFnOk82j6N7kARTgOQsHT51JQSF2ymhqY3f7sS9FD0WMBYacIbxPa0g7PeeTwPTeZZHpUnP+sI4P0tLZvrGCCSdtEPOOKof8faPVstuUPWFbIL/N/FKpTP8/mJjBW80HjGVfWTy+h28fkw7fQ8BG5BJBGQBjtLlKiFlc/3OaVoEPscXpnSlkC8tr1uoRc/AOTp7E7N/EzA285d5ozPyNFHJDG/gio4DVZMjpbEwPLdr0ORIES7qz5gOFOlf7n+v7EtrjVN4nFpSsX9qlqeznybqxRb51H5EPjuzO+EDvsx85zF9khtI5K9Ny1IND71U5YkbDoi5ZdYf+vFWGfMiaVs4YJROuebwueh+If62NoRSa8X+opRsKmMZdrYyO9yu7/4dbWcA1nA5pqzgv9wQdyJ9BW0aGA4rd1aTOVvmWJcXgDiC0Rn3AUTqITVNWmP2Muvv503udJ4zjeNdzNVx6W2D1biis2GzwygkLsBemBSwuXEMEENIE/2pXmo9BsiCpCGRzQX0rJ7x6Cbqs3JHNDTIgTBd+I/TO9ElC2tnHIo+Rr4VjUPfpQr/lFrlBkNdO+MgRV+ad9XTJStOeMbDTOcJXoix76/4Fv04VSyM2fHxJAy1r1HWuw3fbZrE5B6BZoidIWvf8Hcj6iZz9Gv6YeX1wA6ADEHh93PUa9mupMOD6cTwCdmyGYxzdS9RgWE2q4QmjxT4b9Y8ZBA5PBaNFQWR/QCWU8YXRo3FEdz/pPUxuRPIMd+4WW+9jxlTt1QEusvFPxt6/hWKtXi8LVCa+ebrcXVWSL8nWCPDXEkcExr0bchCSyRgbqm0LcGkZr+13EpdfD1arvM2vwcfbNTH4BF0xWO2+rNEVDj0qUIJ1PhtdzVuS1N5O9rP5b3XREc00ixxUb4nJ78P8AuC1aK9YHsSMfj9Pl7J9qFbObM5KeMBRL10+5cUAMlTfxsaEefcT/hPPtFc+/zS+WDKGTAcHnRZHi5DTyCJJK7UEZ2t4TJ4OUwLwXgkAFTGC76cRmg35zpHjrCu+ILjXusMOJbgpTE9DuD2284KKJv0boVwr131oU1ofbQcTmqUtbNmnlQDRTiB/03Lkh3KpHLis7QNQCzh2ywYW6uAEPQvV/objZSTrJlDk/M2OWV7XsggCZUeGmr46UPJjBMA8f8wXvHfWkjZaXF0aS+rtGV2qb2iBMs4ePDS/wM9nN3DZcRi9FpfCPMDgD6uoLv0OCeFwRU/lUuOPDMLSjOUBJlezmhpMI0LGy9q1Mi82td/uf4GSyGEw1XQdmGUfPpUa3ItLPSjO9qeBrmU2peEJJaWSuYNvM3JwPcgTR3fO4kPSAUJLiaVC1izUMnM8oiBHSECBfjYoOphmfgr/j4OBWZKK3r/CfNzNJbSmFPXb/ZPEcu73o1Wd9/GiJXPE7hRQx2kgzjimKPsvQxQOPZlnXKnWnHYOalc0pwy1JQxHP4fgyVxXt1wDSFFlPCfek1L/QsaxdJsQQwFXMlaGzRPcA6yFN0W4sGrO5EjZFIUYXlCCblylQDH+ty8AuLEEHY9RM1BpXJSuE2fUMDhL1rUmXLjJx3/kHPbprIoTmUJgoWDuDGUzIXDKWyL8KLl/vED4NgimT6VSVQ4AT23lNE7fKUonJWxwmPzMN9580tTdaJihp9WGgaArfOvwn9eAjnOi9dVfQ49mucgFcUCkVEoI6+iZAkudHWw/l4eT70SWfb0LAhSyhfo9McWCJtpC5/HqMRSx6AiiZ7+D2fRSuqyGmGyCA3Qd1t479b8n7qG6ljhhEHMKvsqeCT/dB8pBUQ1RTApTmViS+veSvb3+MBSd6j3dyt9wQpf5QPc0IcQoATYG17fLjWyr074CwUAWCp1CXYGtluJW1VV4P0DKs1qqFfIp6JOLJV4gmPm7mrupp/CiNG1GKk/dWnxZSBdOHTgqaVktLUjKiWI8rLkQ2oXm1l8YtYQZJqLT9lNsqNADixlT4UkIbQ/tl7SEuu1aKMorDUJwSEyuhsZC8IjolmRqL40DhPZaaBSBoiG0Ke586ZZu8WyjlmZfcPsTKAAPHiwHIX2I617DYLEWXgNa6vyiAxCSg9MqPZcrwmlyLebaWT5FbkprjSyhtQXra0p9zi8T7od9oimYT24AatKNo18ZssACU5ibS0BqXtqQbWdRf2r5DXoRPT8xmH9gkd1WoBRwrzHuXACz+SLNrIEOyH7/gKSGdnpOob/XW7ObHYxiK+ZynqVbkXkD3gCU/rNpA0RrrroMcy0a61fDwJiRpFnw12fVG8RNep+rHTUWi8T1vgV/43aq1vVe4y3LHHUp/fT11CfUvWazLwhyBYqMOruCFDdSFib8EGfGtBv5jQ/iLEDl5WvnRV8yEdKTN4tjeBym0pxUwhBSdfbXOq+OFrp1CCppnjKFrPjJR673Y4qvtn1jGiSbsDV0SNlGGGagDUiApsbqoyOwJzktcT5qlENkoUjqZs4JYsY4WqiWcU4ETAHAtrhs02fzi//xI7jIDs0qS1/TuPzitRx39o8Ua54XH89suslDN9OCYthReGRCXSzNqw64cwChmFVOF4UHRdfuRm/gPwNEy2+u772NOlaksET7iS5J/39/5h/vIN+xCbShCcIDgHRNBG1NOvIy0Qz5rUj/9kBVkvRHLZ4fF07jSkLuUh8IbboafJwrV1s1d35pcJUJfdnUzhF3slyJutnJN1LqxndeL3vhU82hb/xfWK2Kf/8yE9NTBAFXRJ3O3gNyIYKb/HTUSCCFZVqDvoSmDk8B5uPUCty9biTB4cVClwL0vuE+1lPbRZqEV+IT0qNgu7TUxJjhQQYSVJS3pKfMG1BWCfIIK+UUIghw/v094kvlcYAAAAA==", + "text": b"Lorem Ipsum\n\nLorem ipsum dolor sit amet, consectetur adipiscing elit. Cras lobortis sem dui. " + b"Morbi at magna quis ligula faucibus\nconsectetur feugiat at purus. Sed nec lorem nibh. Nam vel " + b"libero odio. Vivamus tempus non enim egestas pretium.\nVestibulum turpis arcu, maximus nec libero " + b"quis, imperdiet suscipit purus. Vestibulum blandit quis lacus non\nsollicitudin. Nullam non " + b"convallis dui, et aliquet risus. Sed accumsan ullamcorper vehicula. Proin non urna facilisis," + b"\ncondimentum eros quis, suscipit purus. Morbi euismod imperdiet neque fermentum dictum. Integer " + b"aliquam, erat sit\namet fringilla tempus, mauris ligula blandit sapien, et varius sem mauris eu " + b"diam. Sed fringilla neque est, in laoreet\nfelis tristique in. Donec luctus velit a posuere " + b"posuere. Suspendisse sodales pellentesque quam.\n", + } + + scanner_event = run_test_scan( + mocker=mocker, + scan_class=ScanUnderTest, + fixture_path=Path(__file__).parent / "fixtures/test_text.webp", + options=( + {"split_words": False, "remove_formatting": False, "create_thumbnail": True} + ), + ) + + TestCase.maxDiff = None + + # Output string formatting may result in slightly different results. + # Comparing similarity to the 99% percentile is good enough. + similarity = difflib.SequenceMatcher( + None, test_scan_event["text"], scanner_event["text"] + ).ratio() + assert similarity > 0.99 + + # Ensure the thumbnail conversion works properly. + TestCase().assertEqual( + test_scan_event["base64_thumbnail"], scanner_event["base64_thumbnail"] + ) From 623d7a11223bd8a60b48d7e1be665ac1a7b8be4b Mon Sep 17 00:00:00 2001 From: Paul Hutelmyer Date: Tue, 2 Jan 2024 15:09:38 -0500 Subject: [PATCH 2/9] Removing duplicates from IOCs --- src/python/strelka/strelka.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/python/strelka/strelka.py b/src/python/strelka/strelka.py index c13998c3..07580952 100644 --- a/src/python/strelka/strelka.py +++ b/src/python/strelka/strelka.py @@ -739,7 +739,7 @@ def scan(self, data, file, options, expire_at) -> None: def scan_wrapper( self, data: bytes, file: File, options: dict, expire_at: int - ) -> Tuple[list[File], dict]: + ) -> Tuple[list[File], dict, list]: """Sets up scan attributes and calls scan method. Scanning code is wrapped in try/except for error handling. @@ -801,6 +801,11 @@ def scan_wrapper( **{"flags": self.flags}, **self.event, } + + # Removes duplicate entries from IOC list + seen = set() + self.iocs = [x for x in self.iocs if x not in seen and not seen.add(x)] + return self.files, {self.key: self.event}, self.iocs def emit_file( From f649bebf022eea6208215ea29b478a2539a2075c Mon Sep 17 00:00:00 2001 From: Paul Hutelmyer Date: Tue, 2 Jan 2024 15:17:16 -0500 Subject: [PATCH 3/9] Updating IOC extractors --- src/python/strelka/strelka.py | 141 +++++++++++++++++----------------- 1 file changed, 69 insertions(+), 72 deletions(-) diff --git a/src/python/strelka/strelka.py b/src/python/strelka/strelka.py index 07580952..b8c93d28 100644 --- a/src/python/strelka/strelka.py +++ b/src/python/strelka/strelka.py @@ -1,6 +1,5 @@ import glob import importlib -import ipaddress import itertools import json import logging @@ -16,13 +15,14 @@ from typing import Generator, Optional, Tuple import inflection -import magic # type: ignore +import magic import redis -import validators # type: ignore -import yara # type: ignore -from boltons import iterutils # type: ignore +import validators +import yara +from boltons import iterutils from opentelemetry import context, trace -from tldextract import TLDExtract # type: ignore +from tldextract import TLDExtract +from urllib.parse import urlparse from . import __namespace__ from .telemetry.traces import get_tracer @@ -861,107 +861,104 @@ def upload_to_coordinator(self, pointer, chunk, expire_at) -> None: p.expireat(f"data:{pointer}", expire_at) p.execute() - def process_ioc( - self, ioc, ioc_type, scanner_name, description="", malicious=False - ) -> None: + def process_ioc(self, ioc, scanner_name) -> None: + """ + Processes an Indicator of Compromise (IOC) and appends it to the scanner's IOC list. + + This method takes an IOC (such as a URL, domain, IP address, or email) and categorizes it + into an appropriate type. It validates the IOC using various validators and regular expressions, + then appends a dictionary containing the IOC, its type, and the scanner name to the scanner's IOC list. + If the IOC does not match any valid type, a warning is logged, and the IOC is not added. + + Args: + ioc (str or bytes): The IOC to be processed. Can be a string or bytes. + If bytes, it will be decoded to a string. + scanner_name (str): The name of the scanner processing the IOC. This is used to tag the IOC + in the appended dictionary. + + Note: + - The method internally handles different formats and types of IOCs (like URLs, domains, IPs, and emails). + - If the IOC is invalid or does not match a known pattern, a warning is logged and the IOC is not added. + """ if not ioc: return - if ioc_type == "url": - if validators.ipv4(self.extract(ioc).domain): + + if validators.url(ioc): + ioc_type = "url" + netloc = urlparse(ioc).netloc + + if validators.ipv4(netloc): self.process_ioc( - self.extract(ioc).domain, "ip", scanner_name, description, malicious + netloc, + scanner_name, ) - else: + elif validators.ipv6(netloc): self.process_ioc( - self.extract(ioc).registered_domain, - "domain", + netloc, scanner_name, - description, - malicious, ) - if not validators.url(ioc): - logging.warning(f"{ioc} is not a valid url") - return - elif ioc_type == "ip": - try: - ipaddress.ip_address(ioc) - except ValueError: - logging.warning(f"{ioc} is not a valid IP") - return - elif ioc_type == "domain": - if not validators.domain(ioc): - logging.warning(f"{ioc} is not a valid domain") - return - elif ioc_type == "email": - if not validators.email(ioc): - logging.warning(f"{ioc} is not a valid email") - return - - if malicious: - self.iocs.append( - { - "ioc": ioc, - "ioc_type": ioc_type, - "scanner": scanner_name, - "description": description, - "malicious": True, - } - ) + elif validators.domain(netloc): + self.process_ioc( + netloc, + scanner_name, + ) + elif validators.domain(ioc): + ioc_type = "domain" + elif re.match("^[\w\.\-]{2,62}\.[a-zA-Z]{2,5}:\d{1,5}$", ioc): + ioc_type = "domain" + ioc = ioc.split(":")[0] + elif validators.ipv4(ioc): + ioc_type = "ip" + elif validators.ipv6(ioc): + ioc_type = "ip" + elif validators.email(ioc): + ioc_type = "email" + elif re.match("^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}$", ioc): + ioc_type = "ip" + ioc = ioc.split(":")[0] else: - self.iocs.append( - { - "ioc": ioc, - "ioc_type": ioc_type, - "scanner": scanner_name, - "description": description, - } - ) + logging.warning(f"{ioc} does not match a valid IOC type") + return - def add_iocs(self, ioc, ioc_type, description="", malicious=False) -> None: + self.iocs.append( + { + "ioc": ioc, + "ioc_type": ioc_type, + "scanner": scanner_name, + } + ) + + def add_iocs(self, ioc) -> None: """Adds ioc to the iocs. :param ioc: The IOC or list of IOCs to be added. All iocs must be of the same type. Must be type String or Bytes. - :param ioc_type: Must be one of md5, sha1, sha256, domain, url, email, ip, either as string or type object (e.g. self.type.domain). - :param description (Optional): Description of the IOCs. - :param malicious (Optional): Reasonable determination whether the indicator is or would be used maliciously. Example: - Malware Command and Control. Should not be used solely for determining maliciousness since testing values may be present. """ try: - accepted_iocs = ["md5", "sha1", "sha256", "domain", "url", "email", "ip"] - if ioc_type not in accepted_iocs: - logging.warning( - f"{ioc_type} not in accepted range. Acceptable ioc types are: {accepted_iocs}" - ) - return if isinstance(ioc, list): for i in ioc: if isinstance(i, bytes): i = i.decode() if not isinstance(i, str): logging.warning( - f"Could not process {i} from {self.name}: Type {type(i)} is not type Bytes or String" + f"Could not process {i} from {self.name}: Type {type(i)} is" + " not type Bytes or String" ) continue self.process_ioc( i, - ioc_type, self.name, - description=description, - malicious=malicious, ) else: if isinstance(ioc, bytes): ioc = ioc.decode() if not isinstance(ioc, str): logging.warning( - f"Could not process {ioc} from {self.name}: Type {type(ioc)} is not type Bytes or String" + f"Could not process {ioc} from {self.name}: Type {type(ioc)} is" + " not type Bytes or String" ) return self.process_ioc( ioc, - ioc_type, self.name, - description=description, - malicious=malicious, ) except Exception as e: logging.error(f"Failed to add {ioc} from {self.name}: {e}") From d0a34d36a491922e74fc67a05d2479f330526b95 Mon Sep 17 00:00:00 2001 From: Paul Hutelmyer Date: Tue, 2 Jan 2024 15:18:06 -0500 Subject: [PATCH 4/9] Formatting --- src/python/strelka/strelka.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/python/strelka/strelka.py b/src/python/strelka/strelka.py index b8c93d28..0b6c4c0c 100644 --- a/src/python/strelka/strelka.py +++ b/src/python/strelka/strelka.py @@ -864,18 +864,18 @@ def upload_to_coordinator(self, pointer, chunk, expire_at) -> None: def process_ioc(self, ioc, scanner_name) -> None: """ Processes an Indicator of Compromise (IOC) and appends it to the scanner's IOC list. - + This method takes an IOC (such as a URL, domain, IP address, or email) and categorizes it into an appropriate type. It validates the IOC using various validators and regular expressions, then appends a dictionary containing the IOC, its type, and the scanner name to the scanner's IOC list. If the IOC does not match any valid type, a warning is logged, and the IOC is not added. - + Args: ioc (str or bytes): The IOC to be processed. Can be a string or bytes. If bytes, it will be decoded to a string. scanner_name (str): The name of the scanner processing the IOC. This is used to tag the IOC in the appended dictionary. - + Note: - The method internally handles different formats and types of IOCs (like URLs, domains, IPs, and emails). - If the IOC is invalid or does not match a known pattern, a warning is logged and the IOC is not added. From b42b78126dcf2d2d9a3f2538d347d0adc6c7e52d Mon Sep 17 00:00:00 2001 From: Paul Hutelmyer Date: Tue, 2 Jan 2024 15:26:01 -0500 Subject: [PATCH 5/9] Modifying IOC Supported Scanners --- src/python/strelka/scanners/scan_iqy.py | 72 +++++++++++++---------- src/python/strelka/scanners/scan_xl4ma.py | 62 +++++++++++-------- 2 files changed, 79 insertions(+), 55 deletions(-) diff --git a/src/python/strelka/scanners/scan_iqy.py b/src/python/strelka/scanners/scan_iqy.py index 5223e42e..24322f1a 100644 --- a/src/python/strelka/scanners/scan_iqy.py +++ b/src/python/strelka/scanners/scan_iqy.py @@ -1,4 +1,4 @@ -# Description # +#### Description #### # This scanner is looking for iqy files used with excel. # # author: Tasha Taylor @@ -11,52 +11,60 @@ class ScanIqy(strelka.Scanner): """ - Extract URLs from IQY files. + Strelka scanner for extracting URLs from IQY (Excel Web Query Internet Inquire) files. - IQY files, or Excel Web Query Internet Inquire files, are typically created from a VBA Web Query output. - The following is a typical format: - WEB - 1 - [URL] - [optional parameters] - Additional properties can be found at: https://learn.microsoft.com/en-us/office/vba/api/excel.querytable + IQY files are typically used to import data into Excel from the web. They often contain URLs + that specify the data source. This scanner aims to extract these URLs and process them for IOCs. + + The following is a typical format of an IQY file: + WEB + 1 + [URL] + [optional parameters] + + Reference for IQY file format: https://learn.microsoft.com/en-us/office/vba/api/excel.querytable """ def scan(self, data, file, options, expire_at): + """ + Processes the provided IQY data to extract URLs. + + Attempts to decode the data and applies a regex pattern to identify and extract URLs. + Extracted URLs are added to the scanner's IOC list. + + Args: + data (bytes): Data associated with the IQY file to be scanned. + file (strelka.File): File object associated with the data. + options (dict): Options to be applied during the scan. + expire_at (int): Expiration timestamp for extracted files. + """ try: - # Regular expression for detecting a URL-like pattern + # Compile regex pattern for URL detection address_pattern = re.compile( r"\b(?:http|https|ftp|ftps|file|smb)://\S+|" r"\\{2}\w+\\(?:[\w$]+\\)*[\w$]+", re.IGNORECASE, ) - # Attempt UTF-8 decoding first, fall back to latin-1 if necessary + # Attempt to decode the data try: - data = data.decode("utf-8") + decoded_data = data.decode("utf-8") except UnicodeDecodeError: - data = data.decode("latin-1") + decoded_data = data.decode("latin-1") - # Split lines to review each record separately - data_lines = data.splitlines() - - addresses = set() - # For each line, check if the line matches the address pattern. - # In a typical IQY file, the "WEB" keyword is at the beginning of the file, - # and what follows is usually just one URL with optional additional parameters. - # However, because we are iterating lines anyway, lets check for additional addresses anyway. - for entry in data_lines[1:]: - match = address_pattern.search(entry) - if match: - address = match.group().strip() - if address: - addresses.add(address) - - # Evaluate if any addresses were found and assign the boolean result. - self.event["address_found"] = bool(addresses) + # Extract addresses from the data + addresses = set( + match.group().strip() + for line in decoded_data.splitlines() + if (match := address_pattern.search(line)) + ) - # Send all addresses to the IOC parser. - self.add_iocs(list(addresses), self.type.url) + # Add extracted URLs to the scanner's IOC list + if addresses: + self.event["address_found"] = True + self.add_iocs(list(addresses)) + else: + self.event["address_found"] = False except UnicodeDecodeError as e: self.flags.append(f"Unicode decoding error: {e}") diff --git a/src/python/strelka/scanners/scan_xl4ma.py b/src/python/strelka/scanners/scan_xl4ma.py index 6bca5eb9..02c46872 100644 --- a/src/python/strelka/scanners/scan_xl4ma.py +++ b/src/python/strelka/scanners/scan_xl4ma.py @@ -1,35 +1,51 @@ -from strelka import strelka from strelka.auxiliary.xl4ma import analyzer +from strelka import strelka class ScanXl4ma(strelka.Scanner): - """Extracts Excel 4 cell contents and attempts to extract IOCs""" + """ + Strelka scanner for extracting Excel 4 cell contents and IOCs. + + This scanner uses the xl4ma analyzer to extract data from Excel files. + It attempts to decode Excel 4 cell contents and extract any potential IOCs. + Extracted data is added to the scanner's event, and IOCs are processed + using the scanner's IOC processing capabilities. + + Attributes inherited from strelka.Scanner: + - name (str): Name of the scanner class. + - key (str): Metadata key used to identify scanner metadata in scan results. + - event (dict): Dictionary containing the result of the scan. + - flags (list): List of flags raised during scanning. + - iocs (list): List of IOCs extracted during scanning. + """ def scan(self, data, file, options, expire_at): - results = {} + """ + Overrideable scan method from strelka.Scanner. + + Processes the provided data using the xl4ma analyzer and extracts + relevant information and IOCs. - # Attempt to process Excel data using analyzer + Args: + data (bytes): Data associated with the file to be scanned. + file (strelka.File): File object associated with the data. + options (dict): Options to be applied during the scan. + expire_at (int): Expiration timestamp for extracted files. + """ + # Attempt to process Excel data using the xl4ma analyzer try: + # Process Excel data and store the results results = analyzer.process_data(data=data, filename=file.name) + + # Check if decoding and IOCs are present in the results + if "decoded" in results: + self.event["decoded"] = results["decoded"] + if "iocs" in results: + self.event["iocs"] = results["iocs"] + self.add_iocs(results["iocs"]) except strelka.ScannerTimeout: + # Propagate the timeout exception raise except Exception as e: - self.flags.append(str(e)) - print(str(e)) - return - - # If processing successful, extract keys and apply to IOC scanner. - if results: - self.event["decoded"] = results.get("decoded", []) - self.event["iocs"] = results.get("iocs", []) - - try: - self.add_iocs( - results.get("iocs", []), - self.type.url, - description="extracted from excel4 macro", - ) - except strelka.ScannerTimeout: - raise - except Exception: - self.flags.append("xl4ma_ioc_processing_error") + # Append exception message to flags for diagnostic purposes + self.flags.append(f"xl4ma_processing_exception: {str(e)}") From 299f4748fbd09e5b25dc476a1d8c8864059ac98a Mon Sep 17 00:00:00 2001 From: Paul Hutelmyer Date: Wed, 3 Jan 2024 11:33:32 -0500 Subject: [PATCH 6/9] Removing Leading Docstring --- src/python/strelka/scanners/scan_iqy.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/python/strelka/scanners/scan_iqy.py b/src/python/strelka/scanners/scan_iqy.py index 24322f1a..c0cf03d6 100644 --- a/src/python/strelka/scanners/scan_iqy.py +++ b/src/python/strelka/scanners/scan_iqy.py @@ -1,9 +1,3 @@ -#### Description #### -# This scanner is looking for iqy files used with excel. -# -# author: Tasha Taylor -# date: 10/30/2023 - import re from strelka import strelka From 0693a1838038a8c8bcda352932c10b609b915e57 Mon Sep 17 00:00:00 2001 From: Paul Hutelmyer Date: Wed, 3 Jan 2024 11:35:10 -0500 Subject: [PATCH 7/9] Fixing import formatting --- src/python/strelka/scanners/scan_ocr.py | 5 +++-- src/python/strelka/scanners/scan_xl4ma.py | 2 +- src/python/strelka/strelka.py | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/python/strelka/scanners/scan_ocr.py b/src/python/strelka/scanners/scan_ocr.py index 08ad8c61..d44f4c5d 100644 --- a/src/python/strelka/scanners/scan_ocr.py +++ b/src/python/strelka/scanners/scan_ocr.py @@ -1,9 +1,10 @@ +import base64 +import io import os import subprocess import tempfile -import io + import fitz -import base64 from PIL import Image from strelka import strelka diff --git a/src/python/strelka/scanners/scan_xl4ma.py b/src/python/strelka/scanners/scan_xl4ma.py index 02c46872..840b2274 100644 --- a/src/python/strelka/scanners/scan_xl4ma.py +++ b/src/python/strelka/scanners/scan_xl4ma.py @@ -1,5 +1,5 @@ -from strelka.auxiliary.xl4ma import analyzer from strelka import strelka +from strelka.auxiliary.xl4ma import analyzer class ScanXl4ma(strelka.Scanner): diff --git a/src/python/strelka/strelka.py b/src/python/strelka/strelka.py index 0b6c4c0c..9bb643c6 100644 --- a/src/python/strelka/strelka.py +++ b/src/python/strelka/strelka.py @@ -13,6 +13,7 @@ import uuid from types import FrameType from typing import Generator, Optional, Tuple +from urllib.parse import urlparse import inflection import magic @@ -22,7 +23,6 @@ from boltons import iterutils from opentelemetry import context, trace from tldextract import TLDExtract -from urllib.parse import urlparse from . import __namespace__ from .telemetry.traces import get_tracer From 7a30ed193b9c084339668b59808fab06b5345a5e Mon Sep 17 00:00:00 2001 From: Paul Hutelmyer Date: Wed, 3 Jan 2024 11:39:02 -0500 Subject: [PATCH 8/9] Changing regex to raw strings --- src/python/strelka/strelka.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/python/strelka/strelka.py b/src/python/strelka/strelka.py index 9bb643c6..a3fb5c1d 100644 --- a/src/python/strelka/strelka.py +++ b/src/python/strelka/strelka.py @@ -904,7 +904,7 @@ def process_ioc(self, ioc, scanner_name) -> None: ) elif validators.domain(ioc): ioc_type = "domain" - elif re.match("^[\w\.\-]{2,62}\.[a-zA-Z]{2,5}:\d{1,5}$", ioc): + elif re.match(r"^[\w\.\-]{2,62}\.[a-zA-Z]{2,5}:\d{1,5}$", ioc): ioc_type = "domain" ioc = ioc.split(":")[0] elif validators.ipv4(ioc): @@ -913,7 +913,7 @@ def process_ioc(self, ioc, scanner_name) -> None: ioc_type = "ip" elif validators.email(ioc): ioc_type = "email" - elif re.match("^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}$", ioc): + elif re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}$", ioc): ioc_type = "ip" ioc = ioc.split(":")[0] else: From 95ebbf41b868342c0689a5143c083c1ffa80dab9 Mon Sep 17 00:00:00 2001 From: Paul Hutelmyer Date: Wed, 3 Jan 2024 12:08:26 -0500 Subject: [PATCH 9/9] Updating IOC duplicate removal / fixing tests --- src/python/strelka/strelka.py | 12 +++++++++++- src/python/strelka/tests/test_scan_iqy.py | 2 -- src/python/strelka/tests/test_scan_xl4ma.py | 8 +------- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/python/strelka/strelka.py b/src/python/strelka/strelka.py index a3fb5c1d..48521951 100644 --- a/src/python/strelka/strelka.py +++ b/src/python/strelka/strelka.py @@ -803,8 +803,18 @@ def scan_wrapper( } # Removes duplicate entries from IOC list + unique_iocs = [] seen = set() - self.iocs = [x for x in self.iocs if x not in seen and not seen.add(x)] + for ioc in self.iocs: + identifier = ( + ioc["ioc"], + ioc["ioc_type"], + ) # Unique identifier based on 'ioc' and 'ioc_type' + if identifier not in seen: + seen.add(identifier) + unique_iocs.append(ioc) + + self.iocs = unique_iocs return self.files, {self.key: self.event}, self.iocs diff --git a/src/python/strelka/tests/test_scan_iqy.py b/src/python/strelka/tests/test_scan_iqy.py index fad64fbd..7fa9047c 100644 --- a/src/python/strelka/tests/test_scan_iqy.py +++ b/src/python/strelka/tests/test_scan_iqy.py @@ -17,13 +17,11 @@ def test_scan_iqy(mocker): "address_found": True, "iocs": [ { - "description": "", "ioc": "github.com", "ioc_type": "domain", "scanner": "ScanIqy", }, { - "description": "", "ioc": "https://github.com/target/strelka/blob/master/docs/index.html", "ioc_type": "url", "scanner": "ScanIqy", diff --git a/src/python/strelka/tests/test_scan_xl4ma.py b/src/python/strelka/tests/test_scan_xl4ma.py index 18fec287..2d52a712 100644 --- a/src/python/strelka/tests/test_scan_xl4ma.py +++ b/src/python/strelka/tests/test_scan_xl4ma.py @@ -26,17 +26,11 @@ def test_scan_xl4ma(mocker): ] ), "iocs": [ - { - "ioc": "example.com", - "ioc_type": "domain", - "scanner": "ScanXl4ma", - "description": "extracted from excel4 macro", - }, + {"ioc": "www.example.com", "ioc_type": "domain", "scanner": "ScanXl4ma"}, { "ioc": "https://www.example.com/path/to/resource", "ioc_type": "url", "scanner": "ScanXl4ma", - "description": "extracted from excel4 macro", }, ], }