Skip to content

Commit

Permalink
Merge pull request #343 from morriscode/scanpdf-xref-list
Browse files Browse the repository at this point in the history
Updating ScanPDF to store Xref objects in a list
  • Loading branch information
phutelmyer authored Mar 7, 2023
2 parents a1b8318 + 917f26c commit befb6b1
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 11 deletions.
1 change: 1 addition & 0 deletions configs/python/backend/backend.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -506,6 +506,7 @@ scanners:
- 'OpenAction'
- 'URI'
- 'XObject'
max_objects: 250
'ScanPe':
- positive:
flavors:
Expand Down
56 changes: 45 additions & 11 deletions src/python/strelka/scanners/scan_pdf.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
# https://pymupdf.readthedocs.io/en/latest/index.html
# https://www.osti.gov/servlets/purl/1030303
"""
This module contains a scanner for extracting metadata and files from PDF files.
Resources:
- https://pymupdf.readthedocs.io/en/latest/index.html
- https://www.osti.gov/servlets/purl/1030303
Requirements:
- PyMuPDF
"""


import io
import re
Expand All @@ -10,19 +19,33 @@

from strelka import strelka

# hide PyMuPDF warnings
# Hide PyMuPDF warnings
fitz.TOOLS.mupdf_display_errors(False)

# Regex to extract phone numbers from PDF file
phone_numbers = re.compile(
r"[+]?(?:\d{1,2})?\s?\(?\d{3}\)?[\s.-]\d{3}[\s.-]\d{2,4}?-?\d{2,4}?",
flags=0,
)


class ScanPdf(strelka.Scanner):
"""Collects metadata and extracts files from PDF files."""
"""
A scanner that collects metadata and extracts files from PDF files.
"""

@staticmethod
def _convert_timestamp(timestamp):
"""
Converts a date string to a DateTime object, sets the timezone to UTC, and returns it as an ISO string.
Args:
timestamp (str): A date string in the format 'D:%Y%m%d%H%M%S%z'.
Returns:
str: An ISO-formatted date string in the format '%Y-%m-%dT%H:%M:%SZ'.
"""

try:
# Date string is converted to DateTime, timezone is set to UTC, and returned as ISO string
return (
Expand All @@ -36,17 +59,22 @@ def _convert_timestamp(timestamp):
return

def scan(self, data, file, options, expire_at):
# Set maximum XREF objects to be collected (default: 250)
max_objects = options.get("max_objects", 250)

# Set Default Variables
self.event["images"] = 0
self.event["lines"] = 0
self.event["links"] = []
self.event["words"] = 0
self.event.setdefault("xref_object", set())
keys = list()

try:
with io.BytesIO(data) as pdf_io:
reader = fitz.open(stream=pdf_io, filetype="pdf")

# collect metadata
# Collect Metadata
self.event["author"] = reader.metadata["author"]
self.event["creator"] = reader.metadata["creator"]
self.event["creation_date"] = self._convert_timestamp(
Expand All @@ -73,7 +101,7 @@ def scan(self, data, file, options, expire_at):
self.event["title"] = reader.metadata["title"]
self.event["xrefs"] = reader.xref_length() - 1

# collect phones
# Collect Phone Numbers
phones = []
for i in range(self.event["pages"]):
phones.extend(
Expand All @@ -87,18 +115,24 @@ def scan(self, data, file, options, expire_at):
)
self.event["phones"] = list(set(phones))

# iterate through xref objects
# Iterate through xref objects: collect, count, and extract objects
self.event["xref_object"] = set()
for xref in range(1, reader.xref_length()):
xref_object = reader.xref_object(xref, compressed=True)
if xref_object not in self.event["xref_object"]:
self.event["xref_object"].add(xref_object)
for obj in options.get("objects", []):
pattern = f"/{obj}"
if pattern in xref_object:
keys.append(obj.lower())
# extract urls from xref
# Extract urls from xref
self.event["links"].extend(re.findall('"(https?://.*?)"', xref_object))
self.event["objects"] = dict(Counter(keys))

# submit embedded files to strelka
# Convert the unique xref_object set back to a list, keeping at most max_objects entries
self.event["xref_object"] = list(self.event["xref_object"])[:max_objects]

# Submit embedded files to strelka
try:
for i in range(reader.embfile_count()):
props = reader.embfile_info(i)
Expand All @@ -111,7 +145,7 @@ def scan(self, data, file, options, expire_at):
except Exception:
self.flags.append("embedded_parsing_failure")

# submit extracted images to strelka
# Submit extracted images to strelka
try:
for i in range(len(reader)):
for img in reader.get_page_images(i):
Expand All @@ -126,7 +160,7 @@ def scan(self, data, file, options, expire_at):
except Exception:
self.flags.append("image_parsing_failure")

# parse data from each page
# Parse data from each page
try:
text = ""
for page in reader:
Expand Down
48 changes: 48 additions & 0 deletions src/python/strelka/tests/test_scan_pdf.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from pathlib import Path
from unittest import TestCase, mock

from pytest_unordered import unordered
from strelka.scanners.scan_pdf import ScanPdf as ScanUnderTest
from strelka.tests import run_test_scan

Expand Down Expand Up @@ -38,12 +39,59 @@ def test_scan_pdf(mocker):
"xrefs": 40,
"phones": [],
"objects": {},
"xref_object": unordered(
[
"<</P 19 0 R/S/Span/Type/StructElem/Pg 3 0 R/K 0>>",
"<</P 19 0 R/S/Span/Type/StructElem/ActualText( )/K[2]/Pg 3 0 R>>",
"<</Filter/FlateDecode/Length 4050>>",
"<</Type/ExtGState/BM/Normal/ca 1>>",
"<</Type/FontDescriptor/FontName/TimesNewRomanPSMT/Flags 32/ItalicAngle 0/Ascent 891/Descent -216/CapHeight 693/AvgWidth 401/MaxWidth 2614/FontWeight 400/XHeight 250/Leading 42/StemV 40/FontBBox[-568 -216 2046 693]>>",
"<</P 18 0 R/S/P/Type/StructElem/K[5 6 7 8 9 10 11 12 13 14 15 16]/Pg 3 0 R>>",
"<</P 18 0 R/S/P/Type/StructElem/K[27 28 29 30 31 32 33]/Pg 3 0 R>>",
"<</Type/Page/Parent 2 0 R/Resources<</ExtGState<</GS5 5 0 R/GS8 8 0 R>>/Font<</F1 6 0 R/F2 10 0 R/F3 12 0 R>>/XObject<</Image9 9 0 R>>/ProcSet[/PDF/Text/ImageB/ImageC/ImageI]>>/MediaBox[0 0 612 792]/Contents 4 0 R/Group<</Type/Group/S/Transparency/CS/DeviceRGB>>/Tabs/S/StructParents 0>>",
"<</Type/StructTreeRoot/RoleMap 16 0 R/ParentTree 17 0 R/K[18 0 R]/ParentTreeNextKey 1>>",
"<</P 15 0 R/S/Part/Type/StructElem/K[19 0 R 25 0 R 28 0 R 29 0 R 30 0 R 31 0 R 32 0 R 33 0 R 34 0 R 35 0 R]>>",
"<</Type/Catalog/Pages 2 0 R/Lang(en-US)/StructTreeRoot 15 0 R/MarkInfo<</Marked true>>>>",
"<</Type/XRef/Size 40/W[1 4 2]/Root 1 0 R/Info 14 0 R/ID[<996084F03FED2848AB7A00AD5BCAA8E6><996084F03FED2848AB7A00AD5BCAA8E6>]/Filter/FlateDecode/Length 132>>",
"<</Type/Font/Subtype/TrueType/Name/F2/BaseFont/ABCDEE+Calibri/Encoding/WinAnsiEncoding/FontDescriptor 11 0 R/FirstChar 32/LastChar 32/Widths 37 0 R>>",
"<</P 18 0 R/S/P/Type/StructElem/K[59]/Pg 3 0 R>>",
"[20 0 R 23 0 R 24 0 R 27 0 R 26 0 R 28 0 R 28 0 R 28 0 R 28 0 R 28 0 R 28 0 R 28 0 R 28 0 R 28 0 R 28 0 R 28 0 R 28 0 R 29 0 R 29 0 R 29 0 R 29 0 R 29 0 R 29 0 R 29 0 R 29 0 R 29 0 R 29 0 R 30 0 R 30 0 R 30 0 R 30 0 R 30 0 R 30 0 R 30 0 R 31 0 R 31 0 R 31 0 R 31 0 R 31 0 R 31 0 R 31 0 R 31 0 R 31 0 R 31 0 R 31 0 R 32 0 R 32 0 R 32 0 R 32 0 R 32 0 R 32 0 R 32 0 R 32 0 R 32 0 R 32 0 R 33 0 R 33 0 R 34 0 R 34 0 R 35 0 R]",
"<</Type/Font/Subtype/TrueType/Name/F1/BaseFont/TimesNewRomanPSMT/Encoding/WinAnsiEncoding/FontDescriptor 7 0 R/FirstChar 32/LastChar 117/Widths 36 0 R>>",
"<</P 19 0 R/S/Span/Type/StructElem/ActualText(Lorem Ipsum)/K[1]/Pg 3 0 R>>",
"<</P 18 0 R/S/P/Type/StructElem/K[55 56]/Pg 3 0 R>>",
"<</P 18 0 R/S/P/Type/StructElem/K[17 18 19 20 21 22 23 24 25 26]/Pg 3 0 R>>",
"[278 0 0 0 0 0 0 0 0 0 0 0 278 0 278 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 667 0 722 722 0 0 0 0 278 0 0 556 833 722 0 667 778 0 667 0 0 667 0 0 0 0 0 0 0 0 0 0 556 556 500 556 556 278 556 556 222 222 0 222 833 556 556 556 556 333 500 278 556 500 0 500]",
"<</Type/FontDescriptor/FontName/ArialMT/Flags 32/ItalicAngle 0/Ascent 905/Descent -210/CapHeight 728/AvgWidth 441/MaxWidth 2665/FontWeight 400/XHeight 250/Leading 33/StemV 44/FontBBox[-665 -210 2000 728]>>",
"[226]",
"<</P 18 0 R/S/P/Type/StructElem/K[45 46 47 48 49 50 51 52 53 54]/Pg 3 0 R>>",
"<</Type/ObjStm/N 20/First 142/Filter/FlateDecode/Length 601>>",
"<</Type/FontDescriptor/FontName/ABCDEE+Calibri/Flags 32/ItalicAngle 0/Ascent 750/Descent -250/CapHeight 750/AvgWidth 521/MaxWidth 1743/FontWeight 400/XHeight 250/StemV 52/FontBBox[-503 -250 1240 750]/FontFile2 38 0 R>>",
"<</P 18 0 R/S/P/Type/StructElem/K[34 35 36 37 38 39 40 41 42 43 44]/Pg 3 0 R>>",
"<</Type/Pages/Count 1/Kids[3 0 R]>>",
"[250 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 333 0 0 611 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 444 0 0 0 0 0 0 0 778 0 500 500 0 333 389 0 500]",
"<</Type/ExtGState/BM/Normal/CA 1>>",
"<</Type/Font/Subtype/TrueType/Name/F3/BaseFont/ArialMT/Encoding/WinAnsiEncoding/FontDescriptor 13 0 R/FirstChar 32/LastChar 120/Widths 39 0 R>>",
"<</Filter/FlateDecode/Length 175850/Length1 537988>>",
"<</Footnote/Note/Endnote/Note/Textbox/Sect/Header/Sect/Footer/Sect/InlineShape/Sect/Annotation/Sect/Artifact/Sect/Workbook/Document/Worksheet/Part/Macrosheet/Part/Chartsheet/Part/Dialogsheet/Part/Slide/Part/Chart/Sect/Diagram/Figure>>",
"<</P 25 0 R/S/Span/Type/StructElem/Pg 3 0 R/K 4>>",
"<</Nums[0 21 0 R]>>",
"<</P 18 0 R/S/P/Type/StructElem/K[57 58]/Pg 3 0 R>>",
"<</P 18 0 R/S/P/Type/StructElem/K[26 0 R 27 0 R]/Pg 3 0 R>>",
"<</Author(Ryan.OHoro)/Creator<FEFF004D006900630072006F0073006F0066007400AE00200057006F0072006400200032003000310036>/CreationDate(D:20221216134852-06'00')/ModDate(D:20221216134852-06'00')/Producer<FEFF004D006900630072006F0073006F0066007400AE00200057006F0072006400200032003000310036>>>",
"<</Type/XObject/Subtype/Image/Width 340/Height 245/ColorSpace/DeviceRGB/BitsPerComponent 8/Filter/DCTDecode/Interpolate true/Length 21001>>",
"<</P 25 0 R/S/InlineShape/Alt()/Type/StructElem/K[3]/Pg 3 0 R>>",
"<</P 18 0 R/S/H1/Type/StructElem/K[20 0 R 23 0 R 24 0 R]/Pg 3 0 R>>",
]
),
}

scanner_event = run_test_scan(
mocker=mocker,
scan_class=ScanUnderTest,
fixture_path=Path(__file__).parent / "fixtures/test.pdf",
options={
"max_objects": 250,
},
)

TestCase.maxDiff = None
Expand Down

0 comments on commit befb6b1

Please sign in to comment.