diff --git a/src/python/strelka/scanners/scan_pdf.py b/src/python/strelka/scanners/scan_pdf.py
index dddb1b49..bcbc0dd0 100644
--- a/src/python/strelka/scanners/scan_pdf.py
+++ b/src/python/strelka/scanners/scan_pdf.py
@@ -10,7 +10,14 @@
 
 # hide PyMuPDF warnings
 fitz.TOOLS.mupdf_display_errors(False)
 
+# Loose matcher for phone-number-like strings: an optional "+" and 1-2 digit
+# prefix, a 3-digit group (optionally parenthesised), then further digit
+# groups separated by whitespace, "." or "-".
+phone_numbers = re.compile(
+    r"\+?(?:\d{1,2})?\s?\(?\d{3}\)?[\s.-]\d{3}[\s.-]\d{2,4}?\-?\d{2,4}?"
+)
+
 
 class ScanPdf(strelka.Scanner):
     """Collects metadata and extracts files from PDF files."""
@@ -55,6 +62,19 @@ def scan(self, data, file, options, expire_at):
             self.event['subject'] = reader.metadata['subject']
             self.event['title'] = reader.metadata['title']
             self.event['xrefs'] = reader.xref_length() - 1
+
+            # collect phone numbers from the text of every page; each match is
+            # reduced to its digits so differently formatted copies of the
+            # same number collapse into a single entry
+            phones = set()
+            for i in range(self.event["pages"]):
+                page_text = reader.get_page_text(i)
+                phones.update(
+                    re.sub("[^0-9]", "", match)
+                    for match in phone_numbers.findall(page_text)
+                )
+            # sorted so the reported list is stable from run to run
+            self.event["phones"] = sorted(phones)
 
             # iterate through xref objects
             for xref in range(1, reader.xref_length()):