From cf2941aa4eee506f2ab3df6fc75050e0d1396570 Mon Sep 17 00:00:00 2001
From: Georges Toth <georges@trypill.org>
Date: Thu, 4 Jan 2024 01:35:34 +0100
Subject: [PATCH] cleanup & fix linter issues

---
 .flake8                 |   5 -
 .pylintrc               | 363 ----------------------------------------
 OUT_FORMAT              |  33 ----
 eml_parser/decode.py    |  12 +-
 eml_parser/parser.py    |  39 ++---
 eml_parser/routing.py   |   3 -
 tests/test_emlparser.py |   4 +-
 tests/test_regexes.py   |   4 +-
 8 files changed, 24 insertions(+), 439 deletions(-)
 delete mode 100644 .flake8
 delete mode 100644 .pylintrc
 delete mode 100644 OUT_FORMAT

diff --git a/.flake8 b/.flake8
deleted file mode 100644
index fc88872..0000000
--- a/.flake8
+++ /dev/null
@@ -1,5 +0,0 @@
-[flake8]
-max-line-length = 240
-docstring-convention = google
-
-ignore = E111,E114,E121,E123,E125,E126,S101,E131,E122,E501,E127,W503,W504
diff --git a/.pylintrc b/.pylintrc
deleted file mode 100644
index 505b768..0000000
--- a/.pylintrc
+++ /dev/null
@@ -1,363 +0,0 @@
-[MASTER]
-
-# Specify a configuration file.
-#rcfile=
-
-# Python code to execute, usually for sys.path manipulation such as
-# pygtk.require().
-#init-hook=
-
-# Add files or directories to the blacklist. They should be base names, not
-# paths.
-ignore=.git
-
-# Add files or directories matching the regex patterns to the blacklist. The
-# regex matches against base names, not paths.
-ignore-patterns=
-
-# Pickle collected data for later comparisons.
-persistent=yes
-
-# List of plugins (as comma separated values of python modules names) to load,
-# usually to register additional checkers.
-load-plugins=
-
-# Use multiple processes to speed up Pylint.
-jobs=1
-
-# Allow loading of arbitrary C extensions. Extensions are imported into the
-# active Python interpreter and may run arbitrary code.
-unsafe-load-any-extension=no
-
-# A comma-separated list of package or module names from where C extensions may
-# be loaded. Extensions are loading into the active Python interpreter and may
-# run arbitrary code
-extension-pkg-whitelist=falcon,psycopg2
-
-
-[MESSAGES CONTROL]
-
-# Only show warnings with the listed confidence levels. Leave empty to show
-# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
-confidence=
-
-# Enable the message, report, category or checker with the given id(s). You can
-# either give multiple identifier separated by comma (,) or put this option
-# multiple time (only on the command line, not in the configuration file where
-# it should appear only once). See also the "--disable" option for examples.
-#enable=
-
-# Disable the message, report, category or checker with the given id(s). You
-# can either give multiple identifiers separated by comma (,) or put this
-# option multiple times (only on the command line, not in the configuration
-# file where it should appear only once).You can also use "--disable=all" to
-# disable everything first and then reenable specific checks. For example, if
-# you want to run only the similarities checker, you can use "--disable=all
-# --enable=similarities". If you want to run only the classes checker, but have
-# no Warning level messages displayed, use"--disable=all --enable=classes
-# --disable=W"
-#disable=import-star-module-level,old-octal-literal,oct-method,print-statement,unpacking-in-except,parameter-unpacking,backtick,old-raise-syntax,old-ne-operator,long-suffix,dict-view-method,dict-iter-method,metaclass-assignment,next-method-called,raising-string,indexing-exception,raw_input-builtin,long-builtin,file-builtin,execfile-builtin,coerce-builtin,cmp-builtin,buffer-builtin,basestring-builtin,apply-builtin,filter-builtin-not-iterating,using-cmp-argument,useless-suppression,range-builtin-not-iterating,suppressed-message,no-absolute-import,old-division,cmp-method,reload-builtin,zip-builtin-not-iterating,intern-builtin,unichr-builtin,reduce-builtin,standarderror-builtin,unicode-builtin,xrange-builtin,coerce-method,delslice-method,getslice-method,setslice-method,input-builtin,round-builtin,hex-method,nonzero-method,map-builtin-not-iterating
-disable=C0103,R0914,R0912,R0915,I0011,W1202,R0801,too-many-lines,too-many-nested-blocks,too-many-instance-attributes
-
-
-[REPORTS]
-
-# Set the output format. Available formats are text, parseable, colorized, msvs
-# (visual studio) and html. You can also give a reporter class, eg
-# mypackage.mymodule.MyReporterClass.
-output-format=parseable
-
-# Put messages in a separate file for each module / package specified on the
-# command line instead of printing them on stdout. Reports (if any) will be
-# written in a file name "pylint_global.[txt|html]". This option is deprecated
-# and it will be removed in Pylint 2.0.
-#files-output=pylint.log
-
-# Tells whether to display a full report or only the messages
-reports=no
-
-# Python expression which should return a note less than 10 (10 is the highest
-# note). You have access to the variables errors warning, statement which
-# respectively contain the number of errors / warnings messages and the total
-# number of statements analyzed. This is used by the global evaluation report
-# (RP0004).
-evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
-
-# Template used to display messages. This is a python new-style format string
-# used to format the message information. See doc for all details
-#msg-template=
-
-
-[BASIC]
-
-# Good variable names which should always be accepted, separated by a comma
-good-names=i,j,k,ex,Run,_
-
-# Bad variable names which should always be refused, separated by a comma
-bad-names=foo,bar,baz,toto,tutu,tata
-
-# Colon-delimited sets of names that determine each other's naming style when
-# the name regexes allow several styles.
-name-group=
-
-# Include a hint for the correct naming format with invalid-name
-include-naming-hint=no
-
-# List of decorators that produce properties, such as abc.abstractproperty. Add
-# to this list to register other decorators that produce valid properties.
-property-classes=abc.abstractproperty
-
-# Regular expression matching correct function names
-function-rgx=[a-z_][a-z0-9_]{2,30}$
-
-# Regular expression matching correct variable names
-variable-rgx=[a-z_][a-z0-9_]{2,30}$
-
-# Regular expression matching correct constant names
-const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$
-
-# Regular expression matching correct attribute names
-attr-rgx=[a-z_][a-z0-9_]{2,30}$
-
-# Regular expression matching correct argument names
-argument-rgx=[a-z_][a-z0-9_]{2,30}$
-
-# Regular expression matching correct class attribute names
-class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
-
-# Regular expression matching correct inline iteration names
-inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$
-
-# Regular expression matching correct class names
-class-rgx=[A-Z_][a-zA-Z0-9]+$
-
-# Regular expression matching correct module names
-module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
-
-# Regular expression matching correct method names
-method-rgx=[a-z_][a-z0-9_]{2,30}$
-
-# Regular expression which should only match function or class names that do
-# not require a docstring.
-no-docstring-rgx=^_
-
-# Minimum line length for functions/classes that require docstrings, shorter
-# ones are exempt.
-docstring-min-length=-1
-
-
-[ELIF]
-
-# Maximum number of nested blocks for function / method body
-max-nested-blocks=5
-
-
-[FORMAT]
-
-# Maximum number of characters on a single line.
-max-line-length=240
-
-# Regexp for a line that is allowed to be longer than the limit.
-ignore-long-lines=^\s*(# )?<?https?://\S+>?$
-
-# Allow the body of an if to be on the same line as the test if there is no
-# else.
-single-line-if-stmt=no
-
-# Maximum number of lines in a module
-max-module-lines=1000
-
-# String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
-# tab).
-indent-string='    '
-
-# Number of spaces of indent required inside a hanging  or continued line.
-indent-after-paren=4
-
-# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
-expected-line-ending-format=LF
-
-
-[LOGGING]
-
-# Logging modules to check that the string format arguments are in logging
-# function parameter format
-logging-modules=logging
-
-
-[MISCELLANEOUS]
-
-# List of note tags to take in consideration, separated by a comma.
-notes=FIXME,XXX,TODO
-
-
-[SIMILARITIES]
-
-# Minimum lines number of a similarity.
-min-similarity-lines=4
-
-# Ignore comments when computing similarities.
-ignore-comments=yes
-
-# Ignore docstrings when computing similarities.
-ignore-docstrings=yes
-
-# Ignore imports when computing similarities.
-ignore-imports=no
-
-
-[TYPECHECK]
-
-# Tells whether missing members accessed in mixin class should be ignored. A
-# mixin class is detected if its name ends with "mixin" (case insensitive).
-ignore-mixin-members=yes
-
-# List of module names for which member attributes should not be checked
-# (useful for modules/projects where namespaces are manipulated during runtime
-# and thus existing member attributes cannot be deduced by static analysis. It
-# supports qualified module names, as well as Unix pattern matching.
-ignored-modules=
-
-# List of class names for which member attributes should not be checked (useful
-# for classes with dynamically set attributes). This supports the use of
-# qualified names.
-ignored-classes=optparse.Values,thread._local,_thread._local
-
-# List of members which are set dynamically and missed by pylint inference
-# system, and so shouldn't trigger E1101 when accessed. Python regular
-# expressions are accepted.
-generated-members=query, rollback, commit, add, delete
-
-# List of decorators that produce context managers, such as
-# contextlib.contextmanager. Add to this list to register other decorators that
-# produce valid context managers.
-contextmanager-decorators=contextlib.contextmanager
-
-
-[VARIABLES]
-
-# Tells whether we should check for unused import in __init__ files.
-init-import=no
-
-# A regular expression matching the name of dummy variables (i.e. expectedly
-# not used).
-dummy-variables-rgx=(_+[a-zA-Z0-9]*?$)|dummy
-
-# List of additional names supposed to be defined in builtins. Remember that
-# you should avoid to define new builtins when possible.
-additional-builtins=
-
-# List of strings which can identify a callback function by name. A callback
-# name must start or end with one of those strings.
-callbacks=cb_,_cb
-
-# List of qualified module names which can have objects that can redefine
-# builtins.
-redefining-builtins-modules=six.moves,future.builtins
-
-
-[SPELLING]
-
-# Spelling dictionary name. Available dictionaries: none. To make it working
-# install python-enchant package.
-spelling-dict=
-
-# List of comma separated words that should not be checked.
-spelling-ignore-words=
-
-# A path to a file that contains private dictionary; one word per line.
-spelling-private-dict-file=
-
-# Tells whether to store unknown words to indicated private dictionary in
-# --spelling-private-dict-file option instead of raising a message.
-spelling-store-unknown-words=no
-
-
-[CLASSES]
-
-# List of method names used to declare (i.e. assign) instance attributes.
-defining-attr-methods=__init__,__new__,setUp
-
-# List of valid names for the first argument in a class method.
-valid-classmethod-first-arg=cls
-
-# List of valid names for the first argument in a metaclass class method.
-valid-metaclass-classmethod-first-arg=mcs
-
-# List of member names, which should be excluded from the protected access
-# warning.
-exclude-protected=_asdict,_fields,_replace,_source,_make
-
-
-[DESIGN]
-
-# Maximum number of arguments for function / method
-max-args=10
-
-# Argument names that match this expression will be ignored. Default to name
-# with leading underscore
-ignored-argument-names=_.*
-
-# Maximum number of locals for function / method body
-max-locals=15
-
-# Maximum number of return / yield for function / method body
-max-returns=6
-
-# Maximum number of branch for function / method body
-max-branches=12
-
-# Maximum number of statements in function / method body
-max-statements=50
-
-# Maximum number of parents for a class (see R0901).
-max-parents=7
-
-# Maximum number of attributes for a class (see R0902).
-max-attributes=7
-
-# Minimum number of public methods for a class (see R0903).
-min-public-methods=1
-
-# Maximum number of public methods for a class (see R0904).
-max-public-methods=20
-
-# Maximum number of boolean expressions in a if statement
-max-bool-expr=5
-
-
-[IMPORTS]
-
-# Deprecated modules which should not be used, separated by a comma
-deprecated-modules=regsub,TERMIOS,Bastion,rexec
-
-# Create a graph of every (i.e. internal and external) dependencies in the
-# given file (report RP0402 must not be disabled)
-import-graph=
-
-# Create a graph of external dependencies in the given file (report RP0402 must
-# not be disabled)
-ext-import-graph=
-
-# Create a graph of internal dependencies in the given file (report RP0402 must
-# not be disabled)
-int-import-graph=
-
-# Force import order to recognize a module as part of the standard
-# compatibility libraries.
-known-standard-library=
-
-# Force import order to recognize a module as part of a third party library.
-known-third-party=enchant
-
-# Analyse import fallback blocks. This can be used to support both Python 2 and
-# 3 compatible code, which means that the block might have code that exists
-# only in one or another interpreter, leading to false positives when analysed.
-analyse-fallback-blocks=no
-
-
-[EXCEPTIONS]
-
-# Exceptions that will emit a warning when being caught. Defaults to
-# "Exception"
-overgeneral-exceptions=Exception
diff --git a/OUT_FORMAT b/OUT_FORMAT
deleted file mode 100644
index 25e2d50..0000000
--- a/OUT_FORMAT
+++ /dev/null
@@ -1,33 +0,0 @@
-Mail Part,Name,Preprocessing,Value,Data type,Purpose,Status
-body,hash,None,sha256,string,Hash of the whole body,Ok
-body,content_header,None,Plain,dict of list of string,All content headers and associated values,Ok
-body,content_type,none,Plain,string,Given mime type of the body part,Ok
-body,domain-hash,Lower,sha256,list of string,FQDN found in the body part,Ok
-body,email-hash,Lower,sha256,list of string,Emails found in the body part,Ok
-body,uri-hash,None,sha256,list of string,Uris found in the body part,Ok
-body,ip-hash,Lower,sha256,list of string,Ips found in the body part,Todo
-,,,,,,
-,,,,,,
-attachment,mime_type,None,Plain,string,Real mime type of the body part,Ok
-attachment,filename,None,Plain,string,name of the file,Ok
-attachment,extension,None,Plain,string,extention of the file,Ok
-attachment,content_header,None,Plain,dict of list of string,All attachement headers and associated values,Ok
-attachment,hash  ,None,mhash,list of string,"Contains sha 1,256,512 and md5 of the file",Ok
-attachment,size,None,Plain,integer,len of the file,Ok
-attachment,mime_type_short,None,Plain,string,real mime type of the attachement,Ok
-,,,,,,
-,,,,,,
-header,received,trim newlines,Plain,list of string,Line of routing informations,Ok
-header,delivered_to,None,Plain,list of string,Header delivered-to data ,Ok
-header,message_id,None,Plain,string,Header message-ip data,Ok
-header,from,None,Plain,string,Header from data,Ok
-header,to,None,Plain,list of string,All destination,Ok
-header,header,None,Plain,dict of list of string,All mail headers and associated values,Ok
-header,parse_date,None,Plain,datetime,Time of parsing by mailm0n,Ok
-header,date,None,Plain,datetime,Time of mail in the header,Ok
-header,received_email,Utc conv,Plain,string,All email parsed in routing headers,Ok
-header,received_domain,Utc conv,Plain,string,All domains parsed in routing headers,Ok
-header,subject,None,Plain,string,Subject of the email,Ok
-header,received_ip,Lower,Plain,string,All IP parsed in routing headers,Todo
-header,cc ,None,Plain,list of string,All blind destinationt,Ok
-header,defect,None,Plain,string,Email Parsing error,Ok
diff --git a/eml_parser/decode.py b/eml_parser/decode.py
index fa73c9f..ac66d8d 100644
--- a/eml_parser/decode.py
+++ b/eml_parser/decode.py
@@ -1,8 +1,5 @@
-# pylint: disable=line-too-long
-
 """This module contains various string import, check and parse methods."""
 
-from __future__ import annotations
 
 import datetime
 import email
@@ -76,12 +73,11 @@ def decode_field(field: str) -> str:
     for _text, charset in _decoded:
         if charset:
             string += decode_string(_text, charset)
-        else:
+        elif isinstance(_text, bytes):
             # @TODO might be an idea to check with charset-normalizer here
-            if isinstance(_text, bytes):
-                string += _text.decode('utf-8', 'ignore')
-            else:
-                string += _text
+            string += _text.decode('utf-8', 'ignore')
+        else:
+            string += _text
 
     return string
 
diff --git a/eml_parser/parser.py b/eml_parser/parser.py
index 011b2b9..1895adf 100644
--- a/eml_parser/parser.py
+++ b/eml_parser/parser.py
@@ -1,10 +1,7 @@
-# pylint: disable=line-too-long
-
 """eml_parser serves as a python module for parsing eml files and returning various\
 information found in the e-mail as well as computed information.
 """
 
-from __future__ import annotations
 
 import base64
 import binascii
@@ -20,6 +17,7 @@
 import ipaddress
 import logging
 import os.path
+import pathlib
 import re
 import typing
 import urllib.parse
@@ -186,7 +184,7 @@ def __init__(
 
         self.msg: typing.Optional[email.message.Message] = None
 
-    def decode_email(self, eml_file: os.PathLike[str], ignore_bad_start: bool = False) -> dict:
+    def decode_email(self, eml_file: os.PathLike, ignore_bad_start: bool = False) -> dict:
         """Function for decoding an EML file into an easily parsable structure.
 
         Some intelligence is applied while parsing the file in order to work around
@@ -367,7 +365,7 @@ def parse_email(self) -> dict:
                     for by_item in parsed_routing.get('by', []):
                         for byhostentry_ in self.pconf['byhostentry']:
                             byhostentry = byhostentry_.lower()
-                            # print ("%s %s" % (byhostentry, by_item))
+
                             if byhostentry in by_item:
                                 # Save the last Found.. ( most external )
                                 headers_struc['received_src'] = parsed_routing.get('from')
@@ -547,18 +545,19 @@ def parse_email(self) -> dict:
 
                 # We are using replace . to : for avoiding issue in mongo
                 k = k.lower().replace('.', ':')  # Lot of lowers, pre-compute :) .
-                # print(v)
+
                 if multipart:
                     if k in ch:
                         ch[k].append(v)
                     else:
                         ch[k] = [v]
-                else:  # if not multipart, store only content-xx related header with part
-                    if k.startswith('content'):  # otherwise, we got all header headers
-                        if k in ch:
-                            ch[k].append(v)
-                        else:
-                            ch[k] = [v]
+                elif k.startswith('content'):  # otherwise, we got all header headers
+                    # if not multipart, store only content-xx related header with part
+                    if k in ch:
+                        ch[k].append(v)
+                    else:
+                        ch[k] = [v]
+
             bodie['content_header'] = ch  # Store content headers dict
 
             if self.include_raw_body:
@@ -777,7 +776,7 @@ def clean_found_uri(self, url: str) -> typing.Optional[str]:
             return None
 
         # let's try to be smart by stripping of noisy bogus parts
-        url = re.split(r"""[', ")}\\]""", url, 1)[0]
+        url = re.split(r"""[', ")}\\]""", url, maxsplit=1)[0]
 
         # filter bogus URLs
         if url.endswith('://'):
@@ -1037,12 +1036,11 @@ def prepare_multipart_part_attachment(self, msg: email.message.Message, counter:
             attachment[file_id]['filename'] = filename
             attachment[file_id]['size'] = file_size
 
-            # os.path always returns the extension as second element
-            # in case there is no extension it returns an empty string
-            extension = os.path.splitext(filename)[1].lower()
+            # pathlib.Path(filename).suffix is an empty string when the filename has no extension
+            extension = pathlib.Path(filename).suffix
             if extension:
-                # strip leading dot
-                attachment[file_id]['extension'] = extension[1:]
+                # strip leading dot and lower-case
+                attachment[file_id]['extension'] = extension[1:].lower()
 
             attachment[file_id]['hash'] = self.get_file_hash(data)
 
@@ -1052,9 +1050,8 @@ def prepare_multipart_part_attachment(self, msg: email.message.Message, counter:
                 attachment[file_id]['mime_type'] = mime_type
                 # attachments[file_id]['mime_type_short'] = attachments[file_id]['mime_type'].split(",")[0]
                 attachment[file_id]['mime_type_short'] = mime_type_short
-            else:
-                if magic is not None:
-                    logger.warning('Error determining attachment mime-type - "%s"', str(file_id))
+            elif magic is not None:
+                logger.warning('Error determining attachment mime-type - "%s"', str(file_id))
 
             if self.include_attachment_data:
                 attachment[file_id]['raw'] = base64.b64encode(data)
diff --git a/eml_parser/routing.py b/eml_parser/routing.py
index ae8805b..ec25ba6 100644
--- a/eml_parser/routing.py
+++ b/eml_parser/routing.py
@@ -1,8 +1,5 @@
-# pylint: disable=line-too-long
-
 """This module is used for parsing the received lines into a machine readable structure."""
 
-from __future__ import annotations
 
 import re
 import typing
diff --git a/tests/test_emlparser.py b/tests/test_emlparser.py
index a5d2bce..1973bcf 100644
--- a/tests/test_emlparser.py
+++ b/tests/test_emlparser.py
@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-# pylint: disable=line-too-long
-from __future__ import annotations
+
 
 import datetime
 import email.policy
diff --git a/tests/test_regexes.py b/tests/test_regexes.py
index 04a3d97..76f6756 100644
--- a/tests/test_regexes.py
+++ b/tests/test_regexes.py
@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-# pylint: disable=line-too-long
-from __future__ import annotations
+
 
 import pathlib
 from eml_parser.regexes import *