Skip to content

Commit

Permalink
Merge pull request #234 from Derekt2/patch-1
Browse files Browse the repository at this point in the history
Update scan_pdf.py
  • Loading branch information
phutelmyer authored Nov 21, 2022
2 parents 2089d54 + 718ae15 commit c986f59
Showing 1 changed file with 20 additions and 1 deletion.
21 changes: 20 additions & 1 deletion src/python/strelka/scanners/scan_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,10 @@

# hide PyMuPDF warnings
fitz.TOOLS.mupdf_display_errors(False)

# Pre-compiled matcher for phone-number-like strings: an optional "+", an
# optional 1-2 digit prefix, optional whitespace, three digits (optionally
# wrapped in parentheses), a separator (whitespace, "." or "-"), three more
# digits, another separator, then a trailing run of digits that may be split
# by a hyphen.
#
# The pattern is a raw string so that the regex escapes (\+, \d, \s, \(, ...)
# reach the regex engine verbatim; in a normal string literal they are
# invalid escape sequences and trigger a warning on modern Python versions.
phone_numbers = re.compile(
    r"\+?(?:\d{1,2})?\s?\(?\d{3}\)?[\s.-]\d{3}[\s.-]\d{2,4}?\-?\d{2,4}?"
)

class ScanPdf(strelka.Scanner):
"""Collects metadata and extracts files from PDF files."""
Expand Down Expand Up @@ -55,6 +58,22 @@ def scan(self, data, file, options, expire_at):
self.event['subject'] = reader.metadata['subject']
self.event['title'] = reader.metadata['title']
self.event['xrefs'] = reader.xref_length() - 1

# Collect phone numbers from each page's text, stripped to digits only and de-duplicated
phones = []
for i in range(self.event["pages"]):
phones.extend(
[
re.sub("[^0-9]", "", x)
for x in re.findall(
phone_numbers,
reader.get_page_text(i).replace(
"\t", " "
),
)
]
)
self.event["phones"] = list(set(phones))

# iterate through xref objects
for xref in range(1, reader.xref_length()):
Expand Down

0 comments on commit c986f59

Please sign in to comment.