Merge pull request #343 from morriscode/scanpdf-xref-list

Updating ScanPDF to store Xref objects in a list
target · Mar 7, 2023 · befb6b1 · befb6b1
2 parents a1b8318 + 917f26c
commit befb6b1
Show file tree

Hide file tree

Showing 3 changed files with 94 additions and 11 deletions.
diff --git a/configs/python/backend/backend.yaml b/configs/python/backend/backend.yaml
@@ -506,6 +506,7 @@ scanners:
           - 'OpenAction'
           - 'URI'
           - 'XObject'
+        max_objects: 250
   'ScanPe':
     - positive:
         flavors:

diff --git a/src/python/strelka/scanners/scan_pdf.py b/src/python/strelka/scanners/scan_pdf.py
@@ -1,5 +1,14 @@
-# https://pymupdf.readthedocs.io/en/latest/index.html
-# https://www.osti.gov/servlets/purl/1030303
+"""
+This module contains a scanner for extracting metadata and files from PDF files.
+
+Resources:
+- https://pymupdf.readthedocs.io/en/latest/index.html
+- https://www.osti.gov/servlets/purl/1030303
+
+Requirements:
+- PyMuPDF
+"""
+
 
 import io
 import re
@@ -10,19 +19,33 @@
 
 from strelka import strelka
 
-# hide PyMuPDF warnings
+# Hide PyMuPDF warnings
 fitz.TOOLS.mupdf_display_errors(False)
+
+# Regex to extract phone numbers from PDF file
 phone_numbers = re.compile(
     r"[+]?(?:\d{1,2})?\s?\(?\d{3}\)?[\s.-]\d{3}[\s.-]\d{2,4}?-?\d{2,4}?",
     flags=0,
 )
 
 
 class ScanPdf(strelka.Scanner):
-    """Collects metadata and extracts files from PDF files."""
+    """
+    A scanner that collects metadata and extracts files from PDF files.
+    """
 
     @staticmethod
     def _convert_timestamp(timestamp):
+        """
+        Converts a date string to a DateTime object, sets the timezone to UTC, and returns it as an ISO string.
+
+        Args:
+            timestamp (str): A date string in the format 'D:%Y%m%d%H%M%S%z'.
+
+        Returns:
+            str: An ISO-formatted date string in the format '%Y-%m-%dT%H:%M:%SZ'.
+        """
+
         try:
             # Date string is converted to DateTime, timezone is set to UTC, and returned as ISO string
             return (
@@ -36,17 +59,22 @@ def _convert_timestamp(timestamp):
             return
 
     def scan(self, data, file, options, expire_at):
+        # Set maximum XREF objects to be collected (default: 250)
+        max_objects = options.get("max_objects", 250)
+
+        # Set Default Variables
         self.event["images"] = 0
         self.event["lines"] = 0
         self.event["links"] = []
         self.event["words"] = 0
+        self.event.setdefault("xref_object", set())
         keys = list()
 
         try:
             with io.BytesIO(data) as pdf_io:
                 reader = fitz.open(stream=pdf_io, filetype="pdf")
 
-            # collect metadata
+            # Collect Metadata
             self.event["author"] = reader.metadata["author"]
             self.event["creator"] = reader.metadata["creator"]
             self.event["creation_date"] = self._convert_timestamp(
@@ -73,7 +101,7 @@ def scan(self, data, file, options, expire_at):
             self.event["title"] = reader.metadata["title"]
             self.event["xrefs"] = reader.xref_length() - 1
 
-            # collect phones
+            # Collect Phone Numbers
             phones = []
             for i in range(self.event["pages"]):
                 phones.extend(
@@ -87,18 +115,24 @@ def scan(self, data, file, options, expire_at):
                 )
             self.event["phones"] = list(set(phones))
 
-            # iterate through xref objects
+            # Iterate through xref objects: collect unique objects, count configured object keys, and extract URLs
+            self.event["xref_object"] = set()
             for xref in range(1, reader.xref_length()):
                 xref_object = reader.xref_object(xref, compressed=True)
+                if xref_object not in self.event["xref_object"]:
+                    self.event["xref_object"].add(xref_object)
                 for obj in options.get("objects", []):
                     pattern = f"/{obj}"
                     if pattern in xref_object:
                         keys.append(obj.lower())
-                # extract urls from xref
+                # Extract urls from xref
                 self.event["links"].extend(re.findall('"(https?://.*?)"', xref_object))
             self.event["objects"] = dict(Counter(keys))
 
-            # submit embedded files to strelka
+            # Convert unique xref_object set back to a list, keeping at most max_objects entries
+            self.event["xref_object"] = list(self.event["xref_object"])[:max_objects]
+
+            # Submit embedded files to strelka
             try:
                 for i in range(reader.embfile_count()):
                     props = reader.embfile_info(i)
@@ -111,7 +145,7 @@ def scan(self, data, file, options, expire_at):
             except Exception:
                 self.flags.append("embedded_parsing_failure")
 
-            # submit extracted images to strelka
+            # Submit extracted images to strelka
             try:
                 for i in range(len(reader)):
                     for img in reader.get_page_images(i):
@@ -126,7 +160,7 @@ def scan(self, data, file, options, expire_at):
             except Exception:
                 self.flags.append("image_parsing_failure")
 
-            # parse data from each page
+            # Parse data from each page
             try:
                 text = ""
                 for page in reader:

diff --git a/src/python/strelka/tests/test_scan_pdf.py b/src/python/strelka/tests/test_scan_pdf.py
@@ -1,6 +1,7 @@
 from pathlib import Path
 from unittest import TestCase, mock
 
+from pytest_unordered import unordered
 from strelka.scanners.scan_pdf import ScanPdf as ScanUnderTest
 from strelka.tests import run_test_scan
 
@@ -38,12 +39,59 @@ def test_scan_pdf(mocker):
         "xrefs": 40,
         "phones": [],
         "objects": {},
+        "xref_object": unordered(
+            [
+                "<</P 19 0 R/S/Span/Type/StructElem/Pg 3 0 R/K 0>>",
+                "<</P 19 0 R/S/Span/Type/StructElem/ActualText( )/K[2]/Pg 3 0 R>>",
+                "<</Filter/FlateDecode/Length 4050>>",
+                "<</Type/ExtGState/BM/Normal/ca 1>>",
+                "<</Type/FontDescriptor/FontName/TimesNewRomanPSMT/Flags 32/ItalicAngle 0/Ascent 891/Descent -216/CapHeight 693/AvgWidth 401/MaxWidth 2614/FontWeight 400/XHeight 250/Leading 42/StemV 40/FontBBox[-568 -216 2046 693]>>",
+                "<</P 18 0 R/S/P/Type/StructElem/K[5 6 7 8 9 10 11 12 13 14 15 16]/Pg 3 0 R>>",
+                "<</P 18 0 R/S/P/Type/StructElem/K[27 28 29 30 31 32 33]/Pg 3 0 R>>",
+                "<</Type/Page/Parent 2 0 R/Resources<</ExtGState<</GS5 5 0 R/GS8 8 0 R>>/Font<</F1 6 0 R/F2 10 0 R/F3 12 0 R>>/XObject<</Image9 9 0 R>>/ProcSet[/PDF/Text/ImageB/ImageC/ImageI]>>/MediaBox[0 0 612 792]/Contents 4 0 R/Group<</Type/Group/S/Transparency/CS/DeviceRGB>>/Tabs/S/StructParents 0>>",
+                "<</Type/StructTreeRoot/RoleMap 16 0 R/ParentTree 17 0 R/K[18 0 R]/ParentTreeNextKey 1>>",
+                "<</P 15 0 R/S/Part/Type/StructElem/K[19 0 R 25 0 R 28 0 R 29 0 R 30 0 R 31 0 R 32 0 R 33 0 R 34 0 R 35 0 R]>>",
+                "<</Type/Catalog/Pages 2 0 R/Lang(en-US)/StructTreeRoot 15 0 R/MarkInfo<</Marked true>>>>",
+                "<</Type/XRef/Size 40/W[1 4 2]/Root 1 0 R/Info 14 0 R/ID[<996084F03FED2848AB7A00AD5BCAA8E6><996084F03FED2848AB7A00AD5BCAA8E6>]/Filter/FlateDecode/Length 132>>",
+                "<</Type/Font/Subtype/TrueType/Name/F2/BaseFont/ABCDEE+Calibri/Encoding/WinAnsiEncoding/FontDescriptor 11 0 R/FirstChar 32/LastChar 32/Widths 37 0 R>>",
+                "<</P 18 0 R/S/P/Type/StructElem/K[59]/Pg 3 0 R>>",
+                "[20 0 R 23 0 R 24 0 R 27 0 R 26 0 R 28 0 R 28 0 R 28 0 R 28 0 R 28 0 R 28 0 R 28 0 R 28 0 R 28 0 R 28 0 R 28 0 R 28 0 R 29 0 R 29 0 R 29 0 R 29 0 R 29 0 R 29 0 R 29 0 R 29 0 R 29 0 R 29 0 R 30 0 R 30 0 R 30 0 R 30 0 R 30 0 R 30 0 R 30 0 R 31 0 R 31 0 R 31 0 R 31 0 R 31 0 R 31 0 R 31 0 R 31 0 R 31 0 R 31 0 R 31 0 R 32 0 R 32 0 R 32 0 R 32 0 R 32 0 R 32 0 R 32 0 R 32 0 R 32 0 R 32 0 R 33 0 R 33 0 R 34 0 R 34 0 R 35 0 R]",
+                "<</Type/Font/Subtype/TrueType/Name/F1/BaseFont/TimesNewRomanPSMT/Encoding/WinAnsiEncoding/FontDescriptor 7 0 R/FirstChar 32/LastChar 117/Widths 36 0 R>>",
+                "<</P 19 0 R/S/Span/Type/StructElem/ActualText(Lorem Ipsum)/K[1]/Pg 3 0 R>>",
+                "<</P 18 0 R/S/P/Type/StructElem/K[55 56]/Pg 3 0 R>>",
+                "<</P 18 0 R/S/P/Type/StructElem/K[17 18 19 20 21 22 23 24 25 26]/Pg 3 0 R>>",
+                "[278 0 0 0 0 0 0 0 0 0 0 0 278 0 278 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 667 0 722 722 0 0 0 0 278 0 0 556 833 722 0 667 778 0 667 0 0 667 0 0 0 0 0 0 0 0 0 0 556 556 500 556 556 278 556 556 222 222 0 222 833 556 556 556 556 333 500 278 556 500 0 500]",
+                "<</Type/FontDescriptor/FontName/ArialMT/Flags 32/ItalicAngle 0/Ascent 905/Descent -210/CapHeight 728/AvgWidth 441/MaxWidth 2665/FontWeight 400/XHeight 250/Leading 33/StemV 44/FontBBox[-665 -210 2000 728]>>",
+                "[226]",
+                "<</P 18 0 R/S/P/Type/StructElem/K[45 46 47 48 49 50 51 52 53 54]/Pg 3 0 R>>",
+                "<</Type/ObjStm/N 20/First 142/Filter/FlateDecode/Length 601>>",
+                "<</Type/FontDescriptor/FontName/ABCDEE+Calibri/Flags 32/ItalicAngle 0/Ascent 750/Descent -250/CapHeight 750/AvgWidth 521/MaxWidth 1743/FontWeight 400/XHeight 250/StemV 52/FontBBox[-503 -250 1240 750]/FontFile2 38 0 R>>",
+                "<</P 18 0 R/S/P/Type/StructElem/K[34 35 36 37 38 39 40 41 42 43 44]/Pg 3 0 R>>",
+                "<</Type/Pages/Count 1/Kids[3 0 R]>>",
+                "[250 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 333 0 0 611 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 444 0 0 0 0 0 0 0 778 0 500 500 0 333 389 0 500]",
+                "<</Type/ExtGState/BM/Normal/CA 1>>",
+                "<</Type/Font/Subtype/TrueType/Name/F3/BaseFont/ArialMT/Encoding/WinAnsiEncoding/FontDescriptor 13 0 R/FirstChar 32/LastChar 120/Widths 39 0 R>>",
+                "<</Filter/FlateDecode/Length 175850/Length1 537988>>",
+                "<</Footnote/Note/Endnote/Note/Textbox/Sect/Header/Sect/Footer/Sect/InlineShape/Sect/Annotation/Sect/Artifact/Sect/Workbook/Document/Worksheet/Part/Macrosheet/Part/Chartsheet/Part/Dialogsheet/Part/Slide/Part/Chart/Sect/Diagram/Figure>>",
+                "<</P 25 0 R/S/Span/Type/StructElem/Pg 3 0 R/K 4>>",
+                "<</Nums[0 21 0 R]>>",
+                "<</P 18 0 R/S/P/Type/StructElem/K[57 58]/Pg 3 0 R>>",
+                "<</P 18 0 R/S/P/Type/StructElem/K[26 0 R 27 0 R]/Pg 3 0 R>>",
+                "<</Author(Ryan.OHoro)/Creator<FEFF004D006900630072006F0073006F0066007400AE00200057006F0072006400200032003000310036>/CreationDate(D:20221216134852-06'00')/ModDate(D:20221216134852-06'00')/Producer<FEFF004D006900630072006F0073006F0066007400AE00200057006F0072006400200032003000310036>>>",
+                "<</Type/XObject/Subtype/Image/Width 340/Height 245/ColorSpace/DeviceRGB/BitsPerComponent 8/Filter/DCTDecode/Interpolate true/Length 21001>>",
+                "<</P 25 0 R/S/InlineShape/Alt()/Type/StructElem/K[3]/Pg 3 0 R>>",
+                "<</P 18 0 R/S/H1/Type/StructElem/K[20 0 R 23 0 R 24 0 R]/Pg 3 0 R>>",
+            ]
+        ),
     }
 
     scanner_event = run_test_scan(
         mocker=mocker,
         scan_class=ScanUnderTest,
         fixture_path=Path(__file__).parent / "fixtures/test.pdf",
+        options={
+            "max_objects": 250,
+        },
     )
 
     TestCase.maxDiff = None