Skip to content

Commit

Permalink
oleobj: smarter way to create dump filename
Browse files Browse the repository at this point in the history
Sofar we have only looked at the `filename` attribute but in malware
samples the path has been empty and windows used src_path or tmp_path
to determine dumped file type.

Look at all 3 filenames/paths, try to preserve suffix but still limit
length of resulting file name. Deal with multiple objects of same
resulting filename by offering random file names
  • Loading branch information
christian-intra2net committed May 28, 2019
1 parent e8dfd16 commit 98851d8
Showing 1 changed file with 115 additions and 19 deletions.
134 changes: 115 additions & 19 deletions oletools/oleobj.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
import sys
import io
from zipfile import is_zipfile
import random

import olefile

Expand Down Expand Up @@ -227,6 +228,12 @@ def enable_logging():
'worksheet'
]

# Save maximum length of a filename
MAX_FILENAME_LENGTH = 255

# Max attempts at generating a non-existent random file name
MAX_FILENAME_ATTEMPTS = 100

# === FUNCTIONS ===============================================================


Expand Down Expand Up @@ -491,11 +498,40 @@ def parse(self, data):
self.extra_data = data[index+self.data_size:]


def sanitize_filename(filename, replacement='_', max_length=200):
"""compute basename of filename. Replaces all non-whitelisted characters.
The returned filename is always a ascii basename of the file."""
def shorten_filename(fname, max_len):
"""Create filename shorter than max_len, trying to preserve suffix."""
# simple cases:
if not max_len:
return fname
name_len = len(fname)
if name_len < max_len:
return fname

idx = fname.rfind('.')
if idx == -1:
return fname[:max_len]

suffix_len = name_len - idx # length of suffix including '.'
if suffix_len > max_len:
return fname[:max_len]

# great, can preserve suffix
return fname[:max_len-suffix_len] + fname[idx:]


def sanitize_filename(filename, replacement='_',
max_len=MAX_FILENAME_LENGTH):
"""
Return filename that is save to work with.
Removes path components, replaces all non-whitelisted characters (so output
is always a pure-ascii string), replaces '..' and ' ' and shortens to
given max length, trying to preserve suffix.
Might return empty string
"""
basepath = os.path.basename(filename).strip()
sane_fname = re.sub(u'[^a-zA-Z0-9.\\-_ ]', replacement, basepath)
sane_fname = re.sub(u'[^a-zA-Z0-9.\-_ ]', replacement, basepath)
sane_fname = str(sane_fname) # py3: does nothing; py2: unicode --> str

while ".." in sane_fname:
Expand All @@ -504,14 +540,71 @@ def sanitize_filename(filename, replacement='_', max_length=200):
while " " in sane_fname:
sane_fname = sane_fname.replace(' ', ' ')

if not filename:
sane_fname = 'NONAME'
# limit filename length, try to preserve suffix
return shorten_filename(sane_fname, max_len)


# limit filename length
if max_length:
sane_fname = sane_fname[:max_length]
def get_sane_embedded_filenames(filename, src_path, tmp_path, max_len,
noname_index):
"""
Get some sane filenames out of path information, preserving file suffix.
Returns several canddiates, first with suffix, then without, then random
with suffix and finally one last attempt ignoring max_len using arg
`noname_index`.
In some malware examples, filename (on which we relied sofar exclusively
for this) is empty or " ", but src_path and tmp_path contain paths with
proper file names. Try to extract filename from any of those.
Preservation of suffix is especially important since that controls how
windoze treats the file.
"""
suffixes = []
candidates_without_suffix = [] # remember these as fallback
for candidate in (filename, src_path, tmp_path):
# remove path component. Could be from linux, mac or windows
idx = max(candidate.rfind('/'), candidate.rfind('\\'))
candidate = candidate[idx+1:].strip()

# sanitize
candidate = sanitize_filename(candidate, max_len=max_len)

if not candidate:
continue # skip whitespace-only

# identify suffix. Dangerous suffixes are all short
idx = candidate.rfind('.')
if idx is -1:
candidates_without_suffix.append(candidate)
continue
elif idx < len(candidate)-5:
candidates_without_suffix.append(candidate)
continue

# remember suffix
suffixes.append(candidate[idx:])

yield candidate

return sane_fname
# parts with suffix not good enough? try those without one
for candidate in candidates_without_suffix:
yield candidate

# then try random
suffixes.append('') # ensure there is something in there
for _ in range(MAX_FILENAME_ATTEMPTS):
for suffix in suffixes:
leftover_len = max_len - len(suffix)
if leftover_len < 1:
continue
name = ''.join(random.sample('abcdefghijklmnopqrstuvwxyz',
min(26, leftover_len)))
yield name + suffix

# still not returned? Then we have to make up a name ourselves
# do not care any more about max_len (maybe it was 0 or negative)
yield 'oleobj_%03d' % noname_index


def find_ole_in_ppt(filename):
Expand Down Expand Up @@ -663,7 +756,8 @@ def find_ole(filename, data, xml_parser=None):
if xml_parser is None:
xml_parser = XmlParser(arg_for_zip)
# force iteration so XmlParser.iter_non_xml() returns data
[x for x in xml_parser.iter_xml()]
for _ in xml_parser.iter_xml():
pass

log.info('is zip file: ' + filename)
# we looped through the XML files before, now we can
Expand Down Expand Up @@ -732,16 +826,17 @@ def process_file(filename, data, output_dir=None):
If output_dir is given and does not exist, it is created. If it is not
given, data is saved to same directory as the input file.
"""
# sanitize filename, leave space for embedded filename part
sane_fname = sanitize_filename(filename, max_len=MAX_FILENAME_LENGTH-5) or\
'NONAME'
if output_dir:
if not os.path.isdir(output_dir):
log.info('creating output directory %s', output_dir)
os.mkdir(output_dir)

fname_prefix = os.path.join(output_dir,
sanitize_filename(filename))
fname_prefix = os.path.join(output_dir, sane_fname)
else:
base_dir = os.path.dirname(filename)
sane_fname = sanitize_filename(filename)
fname_prefix = os.path.join(base_dir, sane_fname)

# TODO: option to extract objects to files (false by default)
Expand Down Expand Up @@ -797,11 +892,12 @@ def process_file(filename, data, output_dir=None):
print(u'Filename = "%s"' % opkg.filename)
print(u'Source path = "%s"' % opkg.src_path)
print(u'Temp path = "%s"' % opkg.temp_path)
if opkg.filename:
fname = '%s_%s' % (fname_prefix,
sanitize_filename(opkg.filename))
else:
fname = '%s_object_%03d.noname' % (fname_prefix, index)
for embedded_fname in get_sane_embedded_filenames(
opkg.filename, opkg.src_path, opkg.temp_path,
MAX_FILENAME_LENGTH - len(sane_fname) - 1, index):
fname = fname_prefix + '_' + embedded_fname
if not os.path.isfile(fname):
break

# dump
try:
Expand Down

0 comments on commit 98851d8

Please sign in to comment.