Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

swap pycryptodome to the faster, smaller, and industry standard crypto… #456

Merged
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
### Changed
- Hiding fallback xref by default from dumppdf.py output ([#431](https://github.com/pdfminer/pdfminer.six/pull/431))
- Raise a warning instead of an error when extracting text from a non-extractable PDF ([#350](https://github.com/pdfminer/pdfminer.six/issues/350))
- Switched from the pycryptodome package to the cryptography package for AES decryption ([#456](https://github.com/pdfminer/pdfminer.six/pull/456))

## [20200517]

Expand Down
68 changes: 39 additions & 29 deletions pdfminer/pdfdocument.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
import hashlib as md5
import logging
import re
import struct
from hashlib import sha256, md5

try:
from Crypto.Cipher import ARC4, AES
from Crypto.Hash import SHA256
except ImportError:
AES = SHA256 = None
from . import arcfour as ARC4
from cryptography.hazmat.primitives.ciphers import (Cipher, algorithms,
modes)
from cryptography.hazmat.backends import default_backend

from . import arcfour as ARC4
from .psparser import PSEOF, literal_name, LIT, KWD
from . import settings
from .pdftypes import PDFException, uint_value, PDFTypeError, PDFStream, \
Expand Down Expand Up @@ -328,7 +327,7 @@ def compute_u(self, key):
return ARC4.new(key).encrypt(self.PASSWORD_PADDING) # 2
else:
# Algorithm 3.5
hash = md5.md5(self.PASSWORD_PADDING) # 2
hash = md5(self.PASSWORD_PADDING) # 2
hash.update(self.docid[0]) # 3
result = ARC4.new(key).encrypt(hash.digest()) # 4
for i in range(1, 20): # 5
Expand All @@ -340,7 +339,7 @@ def compute_u(self, key):
def compute_encryption_key(self, password):
# Algorithm 3.2
password = (password + self.PASSWORD_PADDING)[:32] # 1
hash = md5.md5(password) # 2
hash = md5(password) # 2
hash.update(self.o) # 3
# See https://github.com/pdfminer/pdfminer.six/issues/186
hash.update(struct.pack('<L', self.p)) # 4
Expand All @@ -353,7 +352,7 @@ def compute_encryption_key(self, password):
if self.r >= 3:
n = self.length // 8
for _ in range(50):
result = md5.md5(result[:n]).digest()
result = md5(result[:n]).digest()
return result[:n]

def authenticate(self, password):
Expand All @@ -380,10 +379,10 @@ def verify_encryption_key(self, key):
def authenticate_owner_password(self, password):
# Algorithm 3.7
password = (password + self.PASSWORD_PADDING)[:32]
hash = md5.md5(password)
hash = md5(password)
if self.r >= 3:
for _ in range(50):
hash = md5.md5(hash.digest())
hash = md5(hash.digest())
n = 5
if self.r >= 3:
n = self.length // 8
Expand All @@ -403,7 +402,7 @@ def decrypt(self, objid, genno, data, attrs=None):
def decrypt_rc4(self, objid, genno, data):
key = self.key + struct.pack('<L', objid)[:3] \
+ struct.pack('<L', genno)[:2]
hash = md5.md5(key)
hash = md5(key)
key = hash.digest()[:min(len(key), 16)]
return ARC4.new(key).decrypt(data)

Expand Down Expand Up @@ -459,9 +458,14 @@ def decrypt_identity(self, objid, genno, data):
def decrypt_aes128(self, objid, genno, data):
key = self.key + struct.pack('<L', objid)[:3] \
+ struct.pack('<L', genno)[:2] + b'sAlT'
hash = md5.md5(key)
hash = md5(key)
key = hash.digest()[:min(len(key), 16)]
return AES.new(key, mode=AES.MODE_CBC, IV=data[:16]).decrypt(data[16:])
initialization_vector = data[:16]
ciphertext = data[16:]
cipher = Cipher(algorithms.AES(key),
modes.CBC(initialization_vector),
backend=default_backend())
return cipher.decryptor().update(ciphertext)


class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
Expand Down Expand Up @@ -489,27 +493,35 @@ def get_cfm(self, name):

def authenticate(self, password):
password = password.encode('utf-8')[:127]
hash = SHA256.new(password)
hash = sha256(password)
hash.update(self.o_validation_salt)
hash.update(self.u)
if hash.digest() == self.o_hash:
hash = SHA256.new(password)
hash = sha256(password)
hash.update(self.o_key_salt)
hash.update(self.u)
return AES.new(hash.digest(), mode=AES.MODE_CBC, IV=b'\x00' * 16)\
.decrypt(self.oe)
hash = SHA256.new(password)
cipher = Cipher(algorithms.AES(hash.digest()),
modes.CBC(b'\0' * 16),
backend=default_backend())
return cipher.decryptor().update(self.oe)
hash = sha256(password)
hash.update(self.u_validation_salt)
if hash.digest() == self.u_hash:
hash = SHA256.new(password)
hash = sha256(password)
hash.update(self.u_key_salt)
return AES.new(hash.digest(), mode=AES.MODE_CBC, IV=b'\x00' * 16)\
.decrypt(self.ue)
cipher = Cipher(algorithms.AES(hash.digest()),
modes.CBC(b'\0' * 16),
backend=default_backend())
return cipher.decryptor().update(self.ue)
return None

def decrypt_aes256(self, objid, genno, data):
return AES.new(self.key, mode=AES.MODE_CBC, IV=data[:16])\
.decrypt(data[16:])
initialization_vector = data[:16]
ciphertext = data[16:]
cipher = Cipher(algorithms.AES(self.key),
modes.CBC(initialization_vector),
backend=default_backend())
return cipher.decryptor().update(ciphertext)


class PDFDocument:
Expand All @@ -528,11 +540,9 @@ class PDFDocument:
security_handler_registry = {
1: PDFStandardSecurityHandler,
2: PDFStandardSecurityHandler,
4: PDFStandardSecurityHandlerV4,
5: PDFStandardSecurityHandlerV5,
}
if AES is not None:
security_handler_registry[4] = PDFStandardSecurityHandlerV4
if SHA256 is not None:
security_handler_registry[5] = PDFStandardSecurityHandlerV5

def __init__(self, parser, password='', caching=True, fallback=True):
"Set the document to use a given PDFParser object."
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
package_data={'pdfminer': ['cmap/*.pickle.gz']},
install_requires=[
'chardet ; python_version > "3.0"',
'pycryptodome',
'cryptography',
pietermarsman marked this conversation as resolved.
Show resolved Hide resolved
'sortedcontainers',
],
extras_require={
Expand Down