From cf2941aa4eee506f2ab3df6fc75050e0d1396570 Mon Sep 17 00:00:00 2001 From: Georges Toth Date: Thu, 4 Jan 2024 01:35:34 +0100 Subject: [PATCH] cleanup & fix linter issues --- .flake8 | 5 - .pylintrc | 363 ---------------------------------------- OUT_FORMAT | 33 ---- eml_parser/decode.py | 12 +- eml_parser/parser.py | 39 ++--- eml_parser/routing.py | 3 - tests/test_emlparser.py | 4 +- tests/test_regexes.py | 4 +- 8 files changed, 24 insertions(+), 439 deletions(-) delete mode 100644 .flake8 delete mode 100644 .pylintrc delete mode 100644 OUT_FORMAT diff --git a/.flake8 b/.flake8 deleted file mode 100644 index fc88872..0000000 --- a/.flake8 +++ /dev/null @@ -1,5 +0,0 @@ -[flake8] -max-line-length = 240 -docstring-convention = google - -ignore = E111,E114,E121,E123,E125,E126,S101,E131,E122,E501,E127,W503,W504 diff --git a/.pylintrc b/.pylintrc deleted file mode 100644 index 505b768..0000000 --- a/.pylintrc +++ /dev/null @@ -1,363 +0,0 @@ -[MASTER] - -# Specify a configuration file. -#rcfile= - -# Python code to execute, usually for sys.path manipulation such as -# pygtk.require(). -#init-hook= - -# Add files or directories to the blacklist. They should be base names, not -# paths. -ignore=.git - -# Add files or directories matching the regex patterns to the blacklist. The -# regex matches against base names, not paths. -ignore-patterns= - -# Pickle collected data for later comparisons. -persistent=yes - -# List of plugins (as comma separated values of python modules names) to load, -# usually to register additional checkers. -load-plugins= - -# Use multiple processes to speed up Pylint. -jobs=1 - -# Allow loading of arbitrary C extensions. Extensions are imported into the -# active Python interpreter and may run arbitrary code. -unsafe-load-any-extension=no - -# A comma-separated list of package or module names from where C extensions may -# be loaded. Extensions are loading into the active Python interpreter and may -# run arbitrary code -extension-pkg-whitelist=falcon,psycopg2 - - -[MESSAGES CONTROL] - -# Only show warnings with the listed confidence levels. Leave empty to show -# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED -confidence= - -# Enable the message, report, category or checker with the given id(s). You can -# either give multiple identifier separated by comma (,) or put this option -# multiple time (only on the command line, not in the configuration file where -# it should appear only once). See also the "--disable" option for examples. -#enable= - -# Disable the message, report, category or checker with the given id(s). You -# can either give multiple identifiers separated by comma (,) or put this -# option multiple times (only on the command line, not in the configuration -# file where it should appear only once).You can also use "--disable=all" to -# disable everything first and then reenable specific checks. For example, if -# you want to run only the similarities checker, you can use "--disable=all -# --enable=similarities". If you want to run only the classes checker, but have -# no Warning level messages displayed, use"--disable=all --enable=classes -# --disable=W" -#disable=import-star-module-level,old-octal-literal,oct-method,print-statement,unpacking-in-except,parameter-unpacking,backtick,old-raise-syntax,old-ne-operator,long-suffix,dict-view-method,dict-iter-method,metaclass-assignment,next-method-called,raising-string,indexing-exception,raw_input-builtin,long-builtin,file-builtin,execfile-builtin,coerce-builtin,cmp-builtin,buffer-builtin,basestring-builtin,apply-builtin,filter-builtin-not-iterating,using-cmp-argument,useless-suppression,range-builtin-not-iterating,suppressed-message,no-absolute-import,old-division,cmp-method,reload-builtin,zip-builtin-not-iterating,intern-builtin,unichr-builtin,reduce-builtin,standarderror-builtin,unicode-builtin,xrange-builtin,coerce-method,delslice-method,getslice-method,setslice-method,input-builtin,round-builtin,hex-method,nonzero-method,map-builtin-not-iterating -disable=C0103,R0914,R0912,R0915,I0011,W1202,R0801,too-many-lines,too-many-nested-blocks,too-many-instance-attributes - - -[REPORTS] - -# Set the output format. Available formats are text, parseable, colorized, msvs -# (visual studio) and html. You can also give a reporter class, eg -# mypackage.mymodule.MyReporterClass. -output-format=parseable - -# Put messages in a separate file for each module / package specified on the -# command line instead of printing them on stdout. Reports (if any) will be -# written in a file name "pylint_global.[txt|html]". This option is deprecated -# and it will be removed in Pylint 2.0. -#files-output=pylint.log - -# Tells whether to display a full report or only the messages -reports=no - -# Python expression which should return a note less than 10 (10 is the highest -# note). You have access to the variables errors warning, statement which -# respectively contain the number of errors / warnings messages and the total -# number of statements analyzed. This is used by the global evaluation report -# (RP0004). -evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) - -# Template used to display messages. This is a python new-style format string -# used to format the message information. See doc for all details -#msg-template= - - -[BASIC] - -# Good variable names which should always be accepted, separated by a comma -good-names=i,j,k,ex,Run,_ - -# Bad variable names which should always be refused, separated by a comma -bad-names=foo,bar,baz,toto,tutu,tata - -# Colon-delimited sets of names that determine each other's naming style when -# the name regexes allow several styles. -name-group= - -# Include a hint for the correct naming format with invalid-name -include-naming-hint=no - -# List of decorators that produce properties, such as abc.abstractproperty. Add -# to this list to register other decorators that produce valid properties. -property-classes=abc.abstractproperty - -# Regular expression matching correct function names -function-rgx=[a-z_][a-z0-9_]{2,30}$ - -# Regular expression matching correct variable names -variable-rgx=[a-z_][a-z0-9_]{2,30}$ - -# Regular expression matching correct constant names -const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ - -# Regular expression matching correct attribute names -attr-rgx=[a-z_][a-z0-9_]{2,30}$ - -# Regular expression matching correct argument names -argument-rgx=[a-z_][a-z0-9_]{2,30}$ - -# Regular expression matching correct class attribute names -class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ - -# Regular expression matching correct inline iteration names -inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ - -# Regular expression matching correct class names -class-rgx=[A-Z_][a-zA-Z0-9]+$ - -# Regular expression matching correct module names -module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ - -# Regular expression matching correct method names -method-rgx=[a-z_][a-z0-9_]{2,30}$ - -# Regular expression which should only match function or class names that do -# not require a docstring. -no-docstring-rgx=^_ - -# Minimum line length for functions/classes that require docstrings, shorter -# ones are exempt. -docstring-min-length=-1 - - -[ELIF] - -# Maximum number of nested blocks for function / method body -max-nested-blocks=5 - - -[FORMAT] - -# Maximum number of characters on a single line. -max-line-length=240 - -# Regexp for a line that is allowed to be longer than the limit. -ignore-long-lines=^\s*(# )??$ - -# Allow the body of an if to be on the same line as the test if there is no -# else. -single-line-if-stmt=no - -# Maximum number of lines in a module -max-module-lines=1000 - -# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 -# tab). -indent-string=' ' - -# Number of spaces of indent required inside a hanging or continued line. -indent-after-paren=4 - -# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. -expected-line-ending-format=LF - - -[LOGGING] - -# Logging modules to check that the string format arguments are in logging -# function parameter format -logging-modules=logging - - -[MISCELLANEOUS] - -# List of note tags to take in consideration, separated by a comma. -notes=FIXME,XXX,TODO - - -[SIMILARITIES] - -# Minimum lines number of a similarity. -min-similarity-lines=4 - -# Ignore comments when computing similarities. -ignore-comments=yes - -# Ignore docstrings when computing similarities. -ignore-docstrings=yes - -# Ignore imports when computing similarities. -ignore-imports=no - - -[TYPECHECK] - -# Tells whether missing members accessed in mixin class should be ignored. A -# mixin class is detected if its name ends with "mixin" (case insensitive). -ignore-mixin-members=yes - -# List of module names for which member attributes should not be checked -# (useful for modules/projects where namespaces are manipulated during runtime -# and thus existing member attributes cannot be deduced by static analysis. It -# supports qualified module names, as well as Unix pattern matching. -ignored-modules= - -# List of class names for which member attributes should not be checked (useful -# for classes with dynamically set attributes). This supports the use of -# qualified names. -ignored-classes=optparse.Values,thread._local,_thread._local - -# List of members which are set dynamically and missed by pylint inference -# system, and so shouldn't trigger E1101 when accessed. Python regular -# expressions are accepted. -generated-members=query, rollback, commit, add, delete - -# List of decorators that produce context managers, such as -# contextlib.contextmanager. Add to this list to register other decorators that -# produce valid context managers. -contextmanager-decorators=contextlib.contextmanager - - -[VARIABLES] - -# Tells whether we should check for unused import in __init__ files. -init-import=no - -# A regular expression matching the name of dummy variables (i.e. expectedly -# not used). -dummy-variables-rgx=(_+[a-zA-Z0-9]*?$)|dummy - -# List of additional names supposed to be defined in builtins. Remember that -# you should avoid to define new builtins when possible. -additional-builtins= - -# List of strings which can identify a callback function by name. A callback -# name must start or end with one of those strings. -callbacks=cb_,_cb - -# List of qualified module names which can have objects that can redefine -# builtins. -redefining-builtins-modules=six.moves,future.builtins - - -[SPELLING] - -# Spelling dictionary name. Available dictionaries: none. To make it working -# install python-enchant package. -spelling-dict= - -# List of comma separated words that should not be checked. -spelling-ignore-words= - -# A path to a file that contains private dictionary; one word per line. -spelling-private-dict-file= - -# Tells whether to store unknown words to indicated private dictionary in -# --spelling-private-dict-file option instead of raising a message. -spelling-store-unknown-words=no - - -[CLASSES] - -# List of method names used to declare (i.e. assign) instance attributes. -defining-attr-methods=__init__,__new__,setUp - -# List of valid names for the first argument in a class method. -valid-classmethod-first-arg=cls - -# List of valid names for the first argument in a metaclass class method. -valid-metaclass-classmethod-first-arg=mcs - -# List of member names, which should be excluded from the protected access -# warning. -exclude-protected=_asdict,_fields,_replace,_source,_make - - -[DESIGN] - -# Maximum number of arguments for function / method -max-args=10 - -# Argument names that match this expression will be ignored. Default to name -# with leading underscore -ignored-argument-names=_.* - -# Maximum number of locals for function / method body -max-locals=15 - -# Maximum number of return / yield for function / method body -max-returns=6 - -# Maximum number of branch for function / method body -max-branches=12 - -# Maximum number of statements in function / method body -max-statements=50 - -# Maximum number of parents for a class (see R0901). -max-parents=7 - -# Maximum number of attributes for a class (see R0902). -max-attributes=7 - -# Minimum number of public methods for a class (see R0903). -min-public-methods=1 - -# Maximum number of public methods for a class (see R0904). -max-public-methods=20 - -# Maximum number of boolean expressions in a if statement -max-bool-expr=5 - - -[IMPORTS] - -# Deprecated modules which should not be used, separated by a comma -deprecated-modules=regsub,TERMIOS,Bastion,rexec - -# Create a graph of every (i.e. internal and external) dependencies in the -# given file (report RP0402 must not be disabled) -import-graph= - -# Create a graph of external dependencies in the given file (report RP0402 must -# not be disabled) -ext-import-graph= - -# Create a graph of internal dependencies in the given file (report RP0402 must -# not be disabled) -int-import-graph= - -# Force import order to recognize a module as part of the standard -# compatibility libraries. -known-standard-library= - -# Force import order to recognize a module as part of a third party library. -known-third-party=enchant - -# Analyse import fallback blocks. This can be used to support both Python 2 and -# 3 compatible code, which means that the block might have code that exists -# only in one or another interpreter, leading to false positives when analysed. -analyse-fallback-blocks=no - - -[EXCEPTIONS] - -# Exceptions that will emit a warning when being caught. Defaults to -# "Exception" -overgeneral-exceptions=Exception diff --git a/OUT_FORMAT b/OUT_FORMAT deleted file mode 100644 index 25e2d50..0000000 --- a/OUT_FORMAT +++ /dev/null @@ -1,33 +0,0 @@ -Mail Part,Name,Preprocessing,Value,Data type,Purpose,Status -body,hash,None,sha256,string,Hash of the whole body,Ok -body,content_header,None,Plain,dict of list of string,All content headers and associated values,Ok -body,content_type,none,Plain,string,Given mime type of the body part,Ok -body,domain-hash,Lower,sha256,list of string,FQDN found in the body part,Ok -body,email-hash,Lower,sha256,list of string,Emails found in the body part,Ok -body,uri-hash,None,sha256,list of string,Uris found in the body part,Ok -body,ip-hash,Lower,sha256,list of string,Ips found in the body part,Todo -,,,,,, -,,,,,, -attachment,mime_type,None,Plain,string,Real mime type of the body part,Ok -attachment,filename,None,Plain,string,name of the file,Ok -attachment,extension,None,Plain,string,extention of the file,Ok -attachment,content_header,None,Plain,dict of list of string,All attachement headers and associated values,Ok -attachment,hash ,None,mhash,list of string,"Contains sha 1,256,512 and md5 of the file",Ok -attachment,size,None,Plain,integer,len of the file,Ok -attachment,mime_type_short,None,Plain,string,real mime type of the attachement,Ok -,,,,,, -,,,,,, -header,received,trim newlines,Plain,list of string,Line of routing informations,Ok -header,delivered_to,None,Plain,list of string,Header delivered-to data ,Ok -header,message_id,None,Plain,string,Header message-ip data,Ok -header,from,None,Plain,string,Header from data,Ok -header,to,None,Plain,list of string,All destination,Ok -header,header,None,Plain,dict of list of string,All mail headers and associated values,Ok -header,parse_date,None,Plain,datetime,Time of parsing by mailm0n,Ok -header,date,None,Plain,datetime,Time of mail in the header,Ok -header,received_email,Utc conv,Plain,string,All email parsed in routing headers,Ok -header,received_domain,Utc conv,Plain,string,All domains parsed in routing headers,Ok -header,subject,None,Plain,string,Subject of the email,Ok -header,received_ip,Lower,Plain,string,All IP parsed in routing headers,Todo -header,cc ,None,Plain,list of string,All blind destinationt,Ok -header,defect,None,Plain,string,Email Parsing error,Ok diff --git a/eml_parser/decode.py b/eml_parser/decode.py index fa73c9f..ac66d8d 100644 --- a/eml_parser/decode.py +++ b/eml_parser/decode.py @@ -1,8 +1,5 @@ -# pylint: disable=line-too-long - """This module contains various string import, check and parse methods.""" -from __future__ import annotations import datetime import email @@ -76,12 +73,11 @@ def decode_field(field: str) -> str: for _text, charset in _decoded: if charset: string += decode_string(_text, charset) - else: + elif isinstance(_text, bytes): # @TODO might be an idea to check with charset-normalizer here - if isinstance(_text, bytes): - string += _text.decode('utf-8', 'ignore') - else: - string += _text + string += _text.decode('utf-8', 'ignore') + else: + string += _text return string diff --git a/eml_parser/parser.py b/eml_parser/parser.py index 011b2b9..1895adf 100644 --- a/eml_parser/parser.py +++ b/eml_parser/parser.py @@ -1,10 +1,7 @@ -# pylint: disable=line-too-long - """eml_parser serves as a python module for parsing eml files and returning various\ information found in the e-mail as well as computed information. """ -from __future__ import annotations import base64 import binascii @@ -20,6 +17,7 @@ import ipaddress import logging import os.path +import pathlib import re import typing import urllib.parse @@ -186,7 +184,7 @@ def __init__( self.msg: typing.Optional[email.message.Message] = None - def decode_email(self, eml_file: os.PathLike[str], ignore_bad_start: bool = False) -> dict: + def decode_email(self, eml_file: os.PathLike, ignore_bad_start: bool = False) -> dict: """Function for decoding an EML file into an easily parsable structure. Some intelligence is applied while parsing the file in order to work around @@ -367,7 +365,7 @@ def parse_email(self) -> dict: for by_item in parsed_routing.get('by', []): for byhostentry_ in self.pconf['byhostentry']: byhostentry = byhostentry_.lower() - # print ("%s %s" % (byhostentry, by_item)) + if byhostentry in by_item: # Save the last Found.. ( most external ) headers_struc['received_src'] = parsed_routing.get('from') @@ -547,18 +545,19 @@ def parse_email(self) -> dict: # We are using replace . to : for avoiding issue in mongo k = k.lower().replace('.', ':') # Lot of lowers, pre-compute :) . - # print(v) + if multipart: if k in ch: ch[k].append(v) else: ch[k] = [v] - else: # if not multipart, store only content-xx related header with part - if k.startswith('content'): # otherwise, we got all header headers - if k in ch: - ch[k].append(v) - else: - ch[k] = [v] + elif k.startswith('content'): # otherwise, we got all header headers + # if not multipart, store only content-xx related header with part + if k in ch: + ch[k].append(v) + else: + ch[k] = [v] + bodie['content_header'] = ch # Store content headers dict if self.include_raw_body: @@ -777,7 +776,7 @@ def clean_found_uri(self, url: str) -> typing.Optional[str]: return None # let's try to be smart by stripping of noisy bogus parts - url = re.split(r"""[', ")}\\]""", url, 1)[0] + url = re.split(r"""[', ")}\\]""", url, maxsplit=1)[0] # filter bogus URLs if url.endswith('://'): @@ -1037,12 +1036,11 @@ def prepare_multipart_part_attachment(self, msg: email.message.Message, counter: attachment[file_id]['filename'] = filename attachment[file_id]['size'] = file_size - # os.path always returns the extension as second element - # in case there is no extension it returns an empty string - extension = os.path.splitext(filename)[1].lower() + # in case there is no extension pathlib.Path(filename).suffix returns an empty string + extension = pathlib.Path(filename).suffix if extension: - # strip leading dot - attachment[file_id]['extension'] = extension[1:] + # strip leading dot and lower-case + attachment[file_id]['extension'] = extension[1:].lower() attachment[file_id]['hash'] = self.get_file_hash(data) @@ -1052,9 +1050,8 @@ def prepare_multipart_part_attachment(self, msg: email.message.Message, counter: attachment[file_id]['mime_type'] = mime_type # attachments[file_id]['mime_type_short'] = attachments[file_id]['mime_type'].split(",")[0] attachment[file_id]['mime_type_short'] = mime_type_short - else: - if magic is not None: - logger.warning('Error determining attachment mime-type - "%s"', str(file_id)) + elif magic is not None: + logger.warning('Error determining attachment mime-type - "%s"', str(file_id)) if self.include_attachment_data: attachment[file_id]['raw'] = base64.b64encode(data) diff --git a/eml_parser/routing.py b/eml_parser/routing.py index ae8805b..ec25ba6 100644 --- a/eml_parser/routing.py +++ b/eml_parser/routing.py @@ -1,8 +1,5 @@ -# pylint: disable=line-too-long - """This module is used for parsing the received lines into a machine readable structure.""" -from __future__ import annotations import re import typing diff --git a/tests/test_emlparser.py b/tests/test_emlparser.py index a5d2bce..1973bcf 100644 --- a/tests/test_emlparser.py +++ b/tests/test_emlparser.py @@ -1,6 +1,4 @@ -# -*- coding: utf-8 -*- -# pylint: disable=line-too-long -from __future__ import annotations + import datetime import email.policy diff --git a/tests/test_regexes.py b/tests/test_regexes.py index 04a3d97..76f6756 100644 --- a/tests/test_regexes.py +++ b/tests/test_regexes.py @@ -1,6 +1,4 @@ -# -*- coding: utf-8 -*- -# pylint: disable=line-too-long -from __future__ import annotations + import pathlib from eml_parser.regexes import *