Skip to content

Commit

Permalink
feat(pdf): use JtR to automatically unlock PDFs
Browse files Browse the repository at this point in the history
  • Loading branch information
benjamin-awd committed Sep 20, 2023
1 parent e495a3d commit d61bc43
Show file tree
Hide file tree
Showing 10 changed files with 815 additions and 7 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ __pycache__
*.tfstate.backup
credentials.json
token.json
.hash
.pot
_ignore*
*.tfvars
!tests/fixtures/*/**
Expand Down
1 change: 1 addition & 0 deletions Brewfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ brew "tesseract"
brew "terraform"
brew "precommit"
brew "make"
brwe "john-jumbo"
3 changes: 2 additions & 1 deletion monopoly/banks/hsbc.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ class Hsbc(BankBase):
)

pdf_config = PdfConfig(
password=settings.hsbc_pdf_password,
password=settings.hsbc_pdf_password_prefix,
page_range=(0, -1),
page_bbox=(0, 0, 379, 842),
brute_force_mask="?d?d?d?d?d?d",
)
2 changes: 1 addition & 1 deletion monopoly/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class Settings(BaseSettings):
secret_id: str = ""
gcs_bucket: str = ""
ocbc_pdf_password: str = ""
hsbc_pdf_password: str = ""
hsbc_pdf_password_prefix: str = ""
trusted_user_emails: list = []

model_config = SettingsConfigDict(env_file=".env", extra="allow")
Expand Down
355 changes: 355 additions & 0 deletions monopoly/helpers/pdf2john.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,355 @@
# pylint: skip-file
#!/usr/bin/env python

# Copyright (c) 2013 Shane Quigley, < shane at softwareontheside.info >

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import os
import re
import sys
from xml.dom import minidom

PY3 = sys.version_info[0] == 3


class PdfHashExtractor:
def __init__(self, file_name):
self.file_name = file_name
f = open(file_name, "rb")
self.encrypted = f.read()
f.close()
self.process = True
psr = re.compile(b"PDF-\d\.\d")
try:
self.pdf_spec = psr.findall(self.encrypted)[0]
except IndexError:
sys.stderr.write("%s is not a PDF file!\n" % file_name)
self.process = False

def parse(self):
if not self.process:
return

try:
trailer = self.get_trailer()
except RuntimeError:
e = sys.exc_info()[1]
sys.stderr.write("%s : %s\n" % (self.file_name, str(e)))
return
# print >> sys.stderr, trailer
object_id = self.get_object_id(b"Encrypt", trailer)
# print >> sys.stderr, object_id
if len(object_id) == 0:
raise RuntimeError("Could not find object id")
encryption_dictionary = self.get_encryption_dictionary(object_id)
# print >> sys.stderr, encryption_dictionary
dr = re.compile(b"\d+")
vr = re.compile(b"\/V \d")
rr = re.compile(b"\/R \d")
try:
v = dr.findall(vr.findall(encryption_dictionary)[0])[0]
except IndexError:
raise RuntimeError("Could not find /V")
r = dr.findall(rr.findall(encryption_dictionary)[0])[0]
lr = re.compile(b"\/Length \d+")
longest = 0
# According to the docs:
# Length : (Optional; PDF 1.4; only if V is 2 or 3). Default value: 40
length = b"40"
for le in lr.findall(encryption_dictionary):
if int(dr.findall(le)[0]) > longest:
longest = int(dr.findall(le)[0])
length = dr.findall(le)[0]
pr = re.compile(b"\/P -?\d+")
try:
p = pr.findall(encryption_dictionary)[0]
except IndexError:
# print >> sys.stderr, "** dict:", encryption_dictionary
raise RuntimeError("Could not find /P")
pr = re.compile(b"-?\d+")
p = pr.findall(p)[0]
meta = "1" if self.is_meta_data_encrypted(encryption_dictionary) else "0"
idr = re.compile(b"\/ID\s*\[\s*<\w+>\s*<\w+>\s*\]")
try:
i_d = idr.findall(trailer)[0] # id key word
except IndexError:
# some pdf files use () instead of <>
idr = re.compile(b"\/ID\s*\[\s*\(\w+\)\s*\(\w+\)\s*\]")
try:
i_d = idr.findall(trailer)[0] # id key word
except IndexError:
# print >> sys.stderr, "** idr:", idr
# print >> sys.stderr, "** trailer:", trailer
raise RuntimeError("Could not find /ID tag")

idr = re.compile(b"<\w+>")
try:
i_d = idr.findall(trailer)[0]
except IndexError:
idr = re.compile(b"\(\w+\)")
i_d = idr.findall(trailer)[0]
i_d = i_d.replace(b"<", b"")
i_d = i_d.replace(b">", b"")
i_d = i_d.lower()
passwords = self.get_passwords_for_JtR(encryption_dictionary)
output = (
"$pdf$"
+ v.decode("ascii")
+ "*"
+ r.decode("ascii")
+ "*"
+ length.decode("ascii")
+ "*"
)
output += p.decode("ascii") + "*" + meta + "*"
output += str(int(len(i_d) / 2)) + "*" + i_d.decode("ascii") + "*" + passwords
print(output)
return output

def get_passwords_for_JtR(self, encryption_dictionary):
output = ""
letters = [b"U", b"O"]
if b"1.7" in self.pdf_spec:
letters = [b"U", b"O", b"UE", b"OE"]
for let in letters:
pr_str = b"\/" + let + b"\s*\([^)]+\)"
pr = re.compile(pr_str)
pas = pr.findall(encryption_dictionary)
if len(pas) > 0:
pas = pr.findall(encryption_dictionary)[0]
# because regexs in python suck <=== LOL
while pas[-2] == b"\\":
pr_str += b"[^)]+\)"
pr = re.compile(pr_str)
# print >> sys.stderr, "pr_str:", pr_str
# print >> sys.stderr, encryption_dictionary
try:
pas = pr.findall(encryption_dictionary)[0]
except IndexError:
break
output += self.get_password_from_byte_string(pas) + "*"
else:
pr = re.compile(let + b"\s*<\w+>")
pas = pr.findall(encryption_dictionary)
if not pas:
continue
pas = pas[0]
pr = re.compile(b"<\w+>")
pas = pr.findall(pas)[0]
pas = pas.replace(b"<", b"")
pas = pas.replace(b">", b"")
if PY3:
output += (
str(int(len(pas) / 2)) + "*" + str(pas.lower(), "ascii") + "*"
)
else:
output += str(int(len(pas) / 2)) + "*" + pas.lower() + "*"
return output[:-1]

def is_meta_data_encrypted(self, encryption_dictionary):
mr = re.compile(b"\/EncryptMetadata\s\w+")
if len(mr.findall(encryption_dictionary)) > 0:
wr = re.compile(b"\w+")
is_encrypted = wr.findall(mr.findall(encryption_dictionary)[0])[-1]
if is_encrypted == b"false":
return False
else:
return True
else:
return True

def parse_meta_data(self, trailer):
root_object_id = self.get_object_id(b"Root", trailer)
root_object = self.get_pdf_object(root_object_id)
object_id = self.get_object_id(b"Metadata", root_object)
xmp_metadata_object = self.get_pdf_object(object_id)
return self.get_xmp_values(xmp_metadata_object)

def get_xmp_values(self, xmp_metadata_object):
xmp_metadata_object = xmp_metadata_object.partition(b"stream")[2]
xmp_metadata_object = xmp_metadata_object.partition(b"endstream")[0]
try:
xml_metadata = minidom.parseString(xmp_metadata_object)
except:
return ""
values = []
values.append(self.get_dc_value("title", xml_metadata))
values.append(self.get_dc_value("creator", xml_metadata))
values.append(self.get_dc_value("description", xml_metadata))
values.append(self.get_dc_value("subject", xml_metadata))
created_year = xml_metadata.getElementsByTagName("xmp:CreateDate")
if len(created_year) > 0:
created_year = created_year[0].firstChild.data[0:4]
values.append(str(created_year))
return " ".join(values).replace(":", "")

def get_dc_value(self, value, xml_metadata):
output = xml_metadata.getElementsByTagName("dc:" + value)
if len(output) > 0:
output = output[0]
output = output.getElementsByTagName("rdf:li")[0]
if output.firstChild:
output = output.firstChild.data
return output
return ""

def get_encryption_dictionary(self, object_id):
encryption_dictionary = self.get_pdf_object(object_id)
for o in encryption_dictionary.split(b"endobj"):
if object_id + b" obj" in o:
encryption_dictionary = o
return encryption_dictionary

def get_object_id(self, name, trailer):
oir = re.compile(b"\/" + name + b"\s\d+\s\d\sR")
try:
object_id = oir.findall(trailer)[0]
except IndexError:
# print >> sys.stderr, " ** get_object_id: name \"", name, "\", trailer ", trailer
return ""
oir = re.compile(b"\d+ \d")
object_id = oir.findall(object_id)[0]
return object_id

def get_pdf_object(self, object_id):
output = (
object_id
+ b" obj"
+ self.encrypted.partition(b"\r" + object_id + b" obj")[2]
)
if output == object_id + b" obj":
output = (
object_id
+ b" obj"
+ self.encrypted.partition(b"\n" + object_id + b" obj")[2]
)
output = output.partition(b"endobj")[0] + b"endobj"
# print >> sys.stderr, output
return output

def get_trailer(self):
trailer = self.get_data_between(b"trailer", b">>", b"/ID")
if trailer == b"":
trailer = self.get_data_between(b"DecodeParms", b"stream", b"")
if trailer == "":
raise RuntimeError("Can't find trailer")
if trailer != "" and trailer.find(b"Encrypt") == -1:
# print >> sys.stderr, trailer
raise RuntimeError("File not encrypted")
return trailer

def get_data_between(self, s1, s2, tag):
output = b""
inside_first = False
lines = re.split(b"\n|\r", self.encrypted)
for line in lines:
inside_first = inside_first or line.find(s1) != -1
if inside_first:
output += line
if line.find(s2) != -1:
if tag == b"" or output.find(tag) != -1:
break
else:
output = b""
inside_first = False
return output

def get_hex_byte(self, o_or_u, i):
if PY3:
return hex(o_or_u[i]).replace("0x", "")
else:
return hex(ord(o_or_u[i])).replace("0x", "")

def get_password_from_byte_string(self, o_or_u):
pas = ""
escape_seq = False
escapes = 0
excluded_indexes = [0, 1, 2]
# For UE & OE in 1.7 spec
if not PY3:
if o_or_u[2] != "(":
excluded_indexes.append(3)
else:
if o_or_u[2] != 40:
excluded_indexes.append(3)
for i in range(len(o_or_u)):
if i not in excluded_indexes:
if len(self.get_hex_byte(o_or_u, i)) == 1 and o_or_u[i] != "\\"[0]:
pas += "0" # need to be 2 digit hex numbers
is_back_slash = True
if not PY3:
is_back_slash = o_or_u[i] != "\\"[0]
else:
is_back_slash = o_or_u[i] != 92
if is_back_slash or escape_seq:
if escape_seq:
if not PY3:
esc = "\\" + o_or_u[i]
else:
esc = "\\" + chr(o_or_u[i])
esc = self.unescape(esc)
if len(hex(ord(esc[0])).replace("0x", "")) == 1:
pas += "0"
pas += hex(ord(esc[0])).replace("0x", "")
escape_seq = False
else:
pas += self.get_hex_byte(o_or_u, i)
else:
escape_seq = True
escapes += 1
output = len(o_or_u) - (len(excluded_indexes) + 1) - escapes
return str(output) + "*" + pas[:-2]

def unescape(self, esc):
escape_seq_map = {
"\\n": "\n",
"\\s": "\s",
"\\e": "\e",
"\\r": "\r",
"\\t": "\t",
"\\v": "\v",
"\\f": "\f",
"\\b": "\b",
"\\a": "\a",
"\\)": ")",
"\\(": "(",
"\\\\": "\\",
"\\0": "\0",
}

return escape_seq_map[esc]


if __name__ == "__main__":
if len(sys.argv) < 2:
sys.stderr.write("Usage: %s <PDF file(s)>\n" % os.path.basename(sys.argv[0]))
sys.exit(-1)
for j in range(1, len(sys.argv)):
if not PY3:
filename = sys.argv[j].decode("UTF-8")
else:
filename = sys.argv[j]
# sys.stderr.write("Analyzing %s\n" % sys.argv[j].decode('UTF-8'))
unlocker = PdfHashExtractor(filename)
try:
unlocker.parse()
except RuntimeError:
e = sys.exc_info()[1]
sys.stderr.write("%s : %s\n" % (filename, str(e)))
Loading

0 comments on commit d61bc43

Please sign in to comment.