# src/python/strelka/scanners/scan_email.py
#
# Dependency changes shipped with this scanner rewrite
# (build/python/backend/requirements.txt):
#   + eml-parser>=1.17
#   + pytz>=2022.1
#   ~ PyMuPDF 1.18.0 -> 1.19.6
import base64

import eml_parser
import pytz

from strelka import strelka

# A body longer than 2 * _PREVIEW_EDGE characters is reported as
# head + '...' + tail, each _PREVIEW_EDGE characters long.
_PREVIEW_EDGE = 100

# Everything before the last "Received: from" header line is discarded when
# the first parse attempt yields no usable headers.
_RECEIVED_MARKER = b'\nReceived: from '

# Fallback markers: the message is taken to start right after one of these.
# NOTE(review): the second literal is the one visible in the reviewed change;
# it looks like it may have lost "<CRLF>" tokens somewhere along the way (the
# RFC 5321 text for reply 354 is "Start mail input; end with <CRLF>.<CRLF>").
# Both spellings are accepted so either form of transcript is handled --
# confirm which one real inputs contain.
_SMTP_DATA_MARKERS = (
    b'Start mail input; end with <CRLF>.<CRLF>\n',
    b'Start mail input; end with .\n',
)


def _has_required_headers(parsed_eml):
    """Return True when the parse result has a subject and a raw header map."""
    header = parsed_eml['header']
    return bool(header['subject'] and header['header'])


def _strip_transport_preamble(data):
    """Drop bytes that precede the actual message, if a known marker exists.

    Returns ``data`` unchanged when no marker is present.
    """
    if _RECEIVED_MARKER in data:
        tail = data.rpartition(_RECEIVED_MARKER)[2]
        # Keep the header name itself, minus the newline that preceded it.
        return _RECEIVED_MARKER[1:] + tail
    for marker in _SMTP_DATA_MARKERS:
        if marker in data:
            return data.rpartition(marker)[2]
    return data


def _body_preview(content):
    """Return ``content`` whole if short, else its head and tail around '...'."""
    if len(content) <= 2 * _PREVIEW_EDGE:
        return content
    return content[:_PREVIEW_EDGE] + '...' + content[-_PREVIEW_EDGE:]


def _utc_timestamp(moment):
    """Format a datetime as ``YYYY-MM-DDTHH:MM:SS.000Z`` in UTC."""
    # Dropping tzinfo and microseconds makes isoformat() emit exactly the
    # seconds-resolution form, instead of relying on slicing off an offset.
    as_utc = moment.astimezone(pytz.utc).replace(tzinfo=None, microsecond=0)
    return as_utc.isoformat() + '.000Z'


def _normalize_message_id(raw):
    """Return the Message-ID text without surrounding angle brackets, if any."""
    message_id = str(raw).strip()
    if message_id.startswith('<') and message_id.endswith('>'):
        return message_id[1:-1]
    return message_id


class ScanEmail(strelka.Scanner):
    """Collects metadata and extracts files from email messages.

    Event fields written by ``scan``:
        total: ``{'attachments': int, 'extracted': int}`` counters.
        body: preview of a body part (the last part that has a
            ``content_type`` wins).
        domains: concatenation of the ``domain`` lists of all body parts.
        attachments: ``{'filenames': [...], 'hashes': [md5, ...],
            'totalsize': int}``.
        subject, to, from, date_utc, message_id: header values.
        received_domain, received_ip: copied when the parser provides them.

    Flags that may be appended:
        parse_load_error, parse_manual_email_error, parse_body_error,
        parse_attachment_error, parse_header_error,
        extract_attachment_error, assertion_error.
    """

    def scan(self, data, file, options, expire_at):
        """Parse ``data`` as an email, record metadata, resubmit attachments.

        Args:
            data: Raw message bytes.
            file: Unused by this scanner.
            options: Unused by this scanner.
            expire_at: Passed through to ``upload_to_coordinator`` for every
                uploaded attachment chunk.

        Each stage is best-effort: a failure appends the stage's flag and the
        remaining stages still run, except for the two parsing stages, which
        return early because nothing else can be done without a parse result.
        """
        attachments = []
        self.event['total'] = {'attachments': 0, 'extracted': 0}

        try:

            # Parse the raw bytes. Without a parse result there is nothing
            # further to report.
            try:
                ep = eml_parser.EmlParser(
                    include_attachment_data=True,
                    include_raw_body=True,
                )
                parsed_eml = ep.decode_email_bytes(data)
            except Exception:
                self.flags.append('parse_load_error')
                return

            # If the parse produced no subject/headers, trim any leading
            # non-message bytes and try once more.
            try:
                if not _has_required_headers(parsed_eml):
                    data = _strip_transport_preamble(data)
                    parsed_eml = ep.decode_email_bytes(data)
                    if not _has_required_headers(parsed_eml):
                        self.flags.append('parse_manual_email_error')
                        return
            except Exception:
                self.flags.append('parse_manual_email_error')
                return

            # Body: keep a short preview and the domains seen in each part.
            try:
                if 'body' in parsed_eml:
                    for body in parsed_eml['body']:
                        if 'content_type' in body:
                            # Short content is stored whole for every content
                            # type; previously only text/plain was, and short
                            # content of other types was duplicated around
                            # the '...'.
                            self.event['body'] = _body_preview(body['content'])
                        if 'domain' in body:
                            # Accumulate under the same key that is written
                            # ('domains'); testing 'domain' here would make
                            # every part overwrite the previous list.
                            if 'domains' in self.event:
                                self.event['domains'] += body['domain']
                            else:
                                # Copy so later += does not mutate the
                                # parser's own list.
                                self.event['domains'] = list(body['domain'])
            except Exception:
                self.flags.append('parse_body_error')

            # Attachments: record summary details and keep the decoded
            # payloads so they can be resubmitted to the pipeline below.
            try:
                if 'attachment' in parsed_eml:
                    self.event['attachments'] = {
                        'filenames': [],
                        'hashes': [],
                        'totalsize': 0,
                    }
                    summary = self.event['attachments']
                    for attachment in parsed_eml['attachment']:
                        summary['filenames'].append(attachment['filename'])
                        summary['hashes'].append(attachment['hash']['md5'])
                        summary['totalsize'] += attachment['size']
                        attachments.append({
                            'name': attachment['filename'],
                            'content-type': attachment['content_header']['content-type'][0],
                            'raw': base64.b64decode(attachment['raw']),
                        })
            except Exception:
                self.flags.append('parse_attachment_error')

            # Header: copy the fields of interest into the event.
            try:
                header = parsed_eml['header']
                self.event['subject'] = header['subject']
                self.event['to'] = header['to']
                self.event['from'] = header['from']
                self.event['date_utc'] = _utc_timestamp(header['date'])
                self.event['message_id'] = _normalize_message_id(
                    header['header']['message-id'][0]
                )
                for optional_key in ('received_domain', 'received_ip'):
                    if optional_key in header:
                        self.event[optional_key] = header[optional_key]
            except Exception:
                self.flags.append('parse_header_error')

            # Submit every collected attachment back into the pipeline.
            try:
                for attachment in attachments:
                    self.event['total']['attachments'] += 1
                    extract_file = strelka.File(
                        name=attachment['name'],
                        source=self.name,
                    )
                    # Only the media type is used as flavor; parameters after
                    # ';' (e.g. name=...) are dropped.
                    media_type = attachment['content-type'].partition(';')[0]
                    extract_file.add_flavors({'external': [media_type]})

                    for c in strelka.chunk_string(attachment['raw']):
                        self.upload_to_coordinator(
                            extract_file.pointer,
                            c,
                            expire_at,
                        )

                    self.files.append(extract_file)
                    self.event['total']['extracted'] += 1
            except Exception:
                self.flags.append('extract_attachment_error')

        except AssertionError:
            # Safety net kept from the previous implementation; the stage
            # handlers above already cover assertion failures raised inside
            # the stages themselves.
            self.flags.append('assertion_error')