diff --git a/data/mbox-sample1/1/Headers.pickle b/data/mbox-sample1/1/Headers.pickle index 88f57ab..f7cb7c2 100644 Binary files a/data/mbox-sample1/1/Headers.pickle and b/data/mbox-sample1/1/Headers.pickle differ diff --git a/data/mbox-sample1/1/Message.pickle b/data/mbox-sample1/1/Message.pickle index d3ad047..ab57f7f 100644 Binary files a/data/mbox-sample1/1/Message.pickle and b/data/mbox-sample1/1/Message.pickle differ diff --git a/data/mbox-sample1/2/Headers.pickle b/data/mbox-sample1/2/Headers.pickle index 3e9cecd..53dacb6 100644 Binary files a/data/mbox-sample1/2/Headers.pickle and b/data/mbox-sample1/2/Headers.pickle differ diff --git a/data/msg-Digitization Archiving Solutions/1/Date.txt b/data/msg-Digitization Archiving Solutions/1/Date.txt index cf7332a..ee5927a 100644 --- a/data/msg-Digitization Archiving Solutions/1/Date.txt +++ b/data/msg-Digitization Archiving Solutions/1/Date.txt @@ -1 +1 @@ -Thu, 03 Feb 2022 15:01:10 -0500 \ No newline at end of file +2022-02-03 15:01:10-05:00 \ No newline at end of file diff --git a/data/msg-Digitization Archiving Solutions/1/HTML_Body.txt b/data/msg-Digitization Archiving Solutions/1/HTML_Body.txt index a792eda..cfc5d1f 100644 --- a/data/msg-Digitization Archiving Solutions/1/HTML_Body.txt +++ b/data/msg-Digitization Archiving Solutions/1/HTML_Body.txt @@ -1,747 +1,362 @@ - - + + - - - +o\:* {behavior:url(#default#VML);} +w\:* {behavior:url(#default#VML);} +.shape {behavior:url(#default#VML);} + - -
-

Good Afternoon! - - -

-

- - -

-

My name is Matthew McCabe and I’d like to introduce myself as the VP, Sales -& Marketing for - The Crowley Company - (Crowley). -  If you’re not familiar with Crowley, we provide a full range of digitization solutions ranging from - front-end capture -, - ECM integration -, as well as “best in class” - -cultural heritage scanners - (offering up to - -FADGI - 4-star quality). -  - - -

-

-  - - -

-

With - more than two decades - in the records management industry, I’m excited about the opportunity to offer you brand name production-level scan systems with built-in software that -adds maximum image quality, enhanced productivity, and increased efficiency that are all user friendly and packed with functionality. - - -

-

-  - - -

-

Crowley’s capture offerings include: - - -

-

- - • - -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  - - - - document scanners - - - -

-

- - • - -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  - - - - microform scanners - (microfilm, microfiche, aperture card) - - -

-

- - • - -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  - - - - book and large format scanners - - - -

-

- - • - -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  - - - - archive writers - - - -

-

- - -

-

We understand that you may be in the planning and preparation phase for FY ’22 and beyond and are thinking about your future digitization projects. -  The next time you consider purchasing scanning equipment, we’d appreciate your consideration -and encourage you to review Crowley’s Digitization Solutions. -  That way should you need to ramp up production, replace an aging unit, or want to see the latest in digitization technology, you will be able to make an informed decision. - - -

-

- - -

-

If you would like any additional information or would like to set up a time to speak more in detail about your collection and evolving requirements, then please don’t hesitate to ask. - - -

-

- -  - -

-

Sincerely, - - -

-

- -  - -

-

- - - - - -

-

- - - -  - - - -

-

- Matthew J. McCabe - - - -

-

- VP, Sales -& Marketing - - - -

-

- - The Crowley Company - - - - -

-

- 5111 Pegasus Court, Suite M - - - -

-

- Frederick, MD -  21704 - - - -

-

- 240-215-0224 ext. 210 - - - -

-

- 240-447-6642 mobile - - - -

-

- 240-215-0234 fax - - - -

-

- - - - - mattm@thecrowleycompany.com - - - - - - - - - - - -

-

- - - - - www.thecrowleycompany.com - - - - - - - - - -

-

- - - -  - - - -

- - -

- FOLLOW US ON - - - - LINKEDIN - - - - - - - -

-

- -  - -

- + +
+

Good Afternoon!

+

+

My name is Matthew McCabe and I’d like to introduce myself as the VP, Sales & Marketing for +The Crowley Company (Crowley).  If you’re not familiar with Crowley, we provide a full range of digitization solutions ranging from +front-end capture, +ECM integration, as well as “best in class” + +cultural heritage scanners (offering up to +FADGI 4-star quality). 

+

 

+

With more than two decades in the records management industry, I’m excited about the opportunity to offer you brand name production-level scan systems with built-in software that + adds maximum image quality, enhanced productivity, and increased efficiency that are all user friendly and packed with functionality. +

+

 

+

Crowley’s capture offerings include:

+

+                     +document scanners

+

+                     +microform scanners (microfilm, microfiche, aperture card)

+

+                     +book and large format scanners

+

+                     +archive writers

+

+

We understand that you may be in the planning and preparation phase for FY ’22 and beyond and are thinking about your future digitization projects.  The next time you consider purchasing scanning equipment, we’d appreciate your consideration + and encourage you to review Crowley’s Digitization Solutions.  That way should you need to ramp up production, replace an aging unit, or want to see the latest in digitization technology, you will be able to make an informed decision.

+

+

If you would like any additional information or would like to set up a time to speak more in detail about your collection and evolving requirements, then please don’t hesitate to ask.

+

 

+

Sincerely,

+

 

+

+

 

+

Matthew J. McCabe

+

VP, Sales & Marketing

+

The Crowley Company

+

5111 Pegasus Court, Suite M

+

Frederick, MD  21704

+

240-215-0224 ext. 210

+

240-447-6642 mobile

+

240-215-0234 fax

+

mattm@thecrowleycompany.com

+

www.thecrowleycompany.com

+

 

+ +

FOLLOW US ON LINKEDIN

+

 

+
- - + -
-

- - - -  - - - -

-

- - Hardware Financing Option - - - - $ - - - - -Available -  - - - - - - - - -

-

- -  - -

-

- -  Click - - - - - here - - - -to discover end-of-year tax incentives, financing purchase options or receive a leasing quote on Crowley - - - digitization products - - - . - - - -

-

- -  - - - -

-

- -  -  Call - (240) 215-0224 - or - - - click - - - - - to connect with your Crowley representative for additional information - - - -

-

- -  -  and a scanner quote. - - - -

-

- - -  - - -

+
+

 

+

Hardware Financing Option$ + Available 

+

 

+

  Click +here + to discover end-of-year tax incentives, financing purchase options or receive a leasing quote on Crowley +digitization products.

+

  +

+

  Call +(240) 215-0224 or click +to connect with your Crowley representative for additional information +

+

  and a scanner quote. +

+

 

-

- -  - -

-

- -  - -

-
+ +

 

+

 

+ - \ No newline at end of file + diff --git a/data/msg-Digitization Archiving Solutions/1/Headers.pickle b/data/msg-Digitization Archiving Solutions/1/Headers.pickle index a35d924..b9476af 100644 Binary files a/data/msg-Digitization Archiving Solutions/1/Headers.pickle and b/data/msg-Digitization Archiving Solutions/1/Headers.pickle differ diff --git a/docs/Gemfile b/docs/Gemfile index 574efc8..eaee490 100644 --- a/docs/Gemfile +++ b/docs/Gemfile @@ -22,7 +22,7 @@ gem "rake" # If you have any plugins, put them here! group :jekyll_plugins do - gem "jekyll-feed", "~> 0.6" + gem "jekyll-feed", "~> 0.15.1" gem "jekyll-remote-theme" gem "jekyll-seo-tag" gem "jekyll-sitemap" diff --git a/mailbagit/__init__.py b/mailbagit/__init__.py index 717b329..88ab1eb 100644 --- a/mailbagit/__init__.py +++ b/mailbagit/__init__.py @@ -1,7 +1,7 @@ # __init__.py # Version of the mailbagit package -__version__ = "0.6.3" +__version__ = "0.7.0" import os from pathlib import Path diff --git a/mailbagit/formats/msg.py b/mailbagit/formats/msg.py index ac4448d..5a6f246 100644 --- a/mailbagit/formats/msg.py +++ b/mailbagit/formats/msg.py @@ -9,7 +9,7 @@ import mailbagit.helper.format as format import mailbagit.helper.common as common import mailbagit.globals as globals -from extract_msg import attachment +import chardet import uuid log = get_logger() @@ -91,10 +91,15 @@ def messages(self, iteration_only=False): text_body = None html_encoding = None text_encoding = None + # encoding check priorities + encodings = {1: {"name": "cp1252", "label": "Windows 1252"}, 2: {"name": "utf-8", "label": "utf-8"}} try: - if mail.htmlBody: - html_body = mail.htmlBody.decode("utf-8").strip() - html_encoding = "utf-8" + try: + if mail.htmlBody: + html_body, html_encoding, errors = format.safely_decode("HTML", mail.htmlBody, encodings, errors) + except Exception as e: + desc = "Error parsing HTML body" + errors = common.handle_error(errors, e, desc) if mail.body: text_body = mail.body text_encoding = mail.stringEncoding @@ -155,7 +160,7 @@ def messages(self, iteration_only=False): contentID = None try: - contentID = mailAttachment.contendId + contentID = mailAttachment.contentId except Exception as e: desc = "Error reading ContentID, creating an ID instead" errors = common.handle_error(errors, e, desc, "warn") @@ -181,7 +186,7 @@ def messages(self, iteration_only=False): Original_File=originalFile, Message_Path=messagePath, Derivatives_Path=derivativesPath, - Date=mail.date, + Date=str(mail.date), From=mail.sender, To=mail.to, Cc=mail.cc, diff --git a/mailbagit/formats/pst.py b/mailbagit/formats/pst.py index 6917424..b30b889 100644 --- a/mailbagit/formats/pst.py +++ b/mailbagit/formats/pst.py @@ -2,7 +2,7 @@ import email from pathlib import Path import chardet -from extract_msg.constants import CODE_PAGES +from extract_msg.encoding import _CODE_PAGES from RTFDE.deencapsulate import DeEncapsulator from mailbagit.loggerx import get_logger from mailbagit.email_account import EmailAccount @@ -105,12 +105,12 @@ def folders(self, folder, path, originalFile, errors, iteration_only=False): if entry.data: value = entry.get_data_as_integer() # Use the extract_msg code page in constants.py - encodings[1] = {"name": CODE_PAGES[value], "label": "PidTagInternetCodepage"} + encodings[1] = {"name": _CODE_PAGES[value], "label": "PidTagInternetCodepage"} if entry.entry_type == LIBPFF_ENTRY_TYPE_MESSAGE_CODEPAGE: if entry.data: value = entry.get_data_as_integer() # Use the extract_msg code page in constants.py - encodings[2] = {"name": CODE_PAGES[value], "label": "PidTagMessageCodepage"} + encodings[2] = {"name": _CODE_PAGES[value], "label": "PidTagMessageCodepage"} # messageObj.html_body sometimes fails. This seems to often be the case for email in "Deleted Items" try: if messageObj.html_body: diff --git a/setup.py b/setup.py index de459e4..bdba6d5 100755 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setuptools.setup( name="mailbagit", - version="0.6.3", + version="0.7.0", author="Gregory Wiedeman", author_email="gwiedeman@albany.edu", description="A tool for preserving email in multiple preservation formats.", @@ -25,7 +25,7 @@ "beautifulsoup4>=4.11.1,<5", "black>=22.1.0,<23", "jsonmodels>=2.2,<=2.5.0", - "extract_msg>=0.34.3,<0.42.0", + "extract_msg>=0.42.0", "structlog>=21.1.0,<22", "packaging>=21.0,<21.3", "python-json-logger>=2.0.2,<3", @@ -45,5 +45,5 @@ "pst": ["libpff-python==20211114"], "dev": ["pyinstaller==5.0.1,<6"], }, - python_requires=">=3.7", + python_requires=">=3.8", )