
Merge pull request #1743 from intezer/add-pdf-miner-integration
Add pdfminer integration
doomedraven authored Sep 13, 2023
2 parents bdc499f + b120035 commit a30f947
Showing 3 changed files with 67 additions and 0 deletions.
1 change: 1 addition & 0 deletions extra/optional_dependencies.txt
@@ -22,3 +22,4 @@ regex
ruff
scp
urlextract==1.5.0
pdfminer==20191125
2 changes: 2 additions & 0 deletions lib/cuckoo/common/integrations/parse_pdf.py
@@ -6,6 +6,7 @@
import logging
from typing import Any, Dict

from lib.cuckoo.common.integrations.pdfminer import pdfminer_parse
from lib.cuckoo.common.integrations.peepdf import peepdf_parse
from lib.cuckoo.common.path_utils import path_exists

@@ -59,6 +60,7 @@ def _parse(self, filepath: str) -> Dict[str, Any]:
"Keywords": {str(keyword["name"]): keyword["count"] for keyword in pdfid_data["pdfid"]["keywords"]["keyword"]},
}
pdfresult = peepdf_parse(self.file_path, pdfresult)
pdfresult = pdfminer_parse(self.file_path, pdfresult)

return pdfresult
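The change to this file is two lines: import the new integration and chain it after peepdf_parse, so each parser receives the accumulating pdfresult dict and returns it with its own keys merged in. A minimal sketch of that contract outside the sandbox (the sample path is hypothetical, and the empty dict stands in for whatever pdfid/peepdf produced upstream):

from lib.cuckoo.common.integrations.pdfminer import pdfminer_parse

pdfresult = {}  # upstream parsers (pdfid, peepdf) would normally populate this
pdfresult = pdfminer_parse("/tmp/sample.pdf", pdfresult)  # hypothetical path
print(pdfresult.get("All_URLs", []))  # URLs mined from /URI entries, if any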

64 changes: 64 additions & 0 deletions lib/cuckoo/common/integrations/pdfminer.py
@@ -0,0 +1,64 @@
import logging
from typing import Any, Dict, Iterable, Set

# pdfminer is an optional dependency (see extra/optional_dependencies.txt);
# degrade gracefully when it is not installed.
try:
    from pdfminer import pdfdocument, pdfparser, pdftypes

    HAVE_PDFMINER = True
except ImportError:
    HAVE_PDFMINER = False

log = logging.getLogger(__name__)


def _search_for_url(obj: Any) -> Iterable[str]:
    """Recursively walk a decoded PDF object, yielding every /URI value found."""
    if obj is None:
        return

    if isinstance(obj, pdftypes.PDFStream):
        # A stream's dictionary can itself reference actions/annotations.
        yield from _search_for_url(obj.attrs)
    elif isinstance(obj, list):
        for v in obj:
            yield from _search_for_url(v)
    elif isinstance(obj, dict):
        for key, value in obj.items():
            if key == 'URI':
                # /URI values may arrive as raw bytes; normalize to str.
                yield value.decode() if isinstance(value, bytes) else value
                continue

            yield from _search_for_url(value)


def _mine_for_urls(file_path: str) -> Set[str]:
    """Enumerate every object in the PDF's xref tables and collect /URI links."""
    urls = set()
    try:
        with open(file_path, 'rb') as f:
            parser = pdfparser.PDFParser(f)
            doc = pdfdocument.PDFDocument(parser)

            for xref in doc.xrefs:
                for object_id in xref.get_objids():
                    try:
                        obj = doc.getobj(object_id)
                        urls.update(_search_for_url(obj))
                    except Exception as ex:
                        # A single malformed object should not abort the whole scan.
                        log.error(ex, exc_info=True)
    except Exception as ex:
        log.error(ex, exc_info=True)

    return urls


def pdfminer_parse(filepath: str, pdfresult: Dict[str, Any]) -> Dict[str, Any]:
    """Augment an existing pdfresult dict with URLs mined by pdfminer."""
    if not HAVE_PDFMINER:
        return pdfresult

    urls = _mine_for_urls(filepath)
    pdfresult["All_URLs"] = list(urls)
    return pdfresult
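To see the recursion in _search_for_url concretely: it treats any nested dict the way it treats a decoded PDF annotation, so the traversal can be sanity-checked without a real sample. The dict below is a hand-built stand-in for a decoded link annotation, not genuine pdfminer output, and running it still requires pdfminer to be installed, since the PDFStream isinstance check touches pdftypes:

# Hand-built stand-in for a decoded /Annots entry carrying a /URI action.
fake_annotation = {
    "Type": "Annot",
    "Subtype": "Link",
    "A": {"S": "URI", "URI": b"http://example.com/landing"},
}

# Bytes values are decoded to str; nested dicts/lists are walked recursively.
assert set(_search_for_url(fake_annotation)) == {"http://example.com/landing"}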
