From c9d99a7c31065e286f62087a704e720efedaaaee Mon Sep 17 00:00:00 2001 From: Gregory Wiedeman Date: Wed, 15 Nov 2023 15:46:05 -0500 Subject: [PATCH 1/3] Addressed MSG decode errors --- docs/Gemfile | 2 +- mailbagit/formats/msg.py | 15 +++++--- mailbagit/formats/pst.py | 2 +- setup.py | 4 +-- test.py | 74 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 88 insertions(+), 9 deletions(-) create mode 100644 test.py diff --git a/docs/Gemfile b/docs/Gemfile index 574efc8..eaee490 100644 --- a/docs/Gemfile +++ b/docs/Gemfile @@ -22,7 +22,7 @@ gem "rake" # If you have any plugins, put them here! group :jekyll_plugins do - gem "jekyll-feed", "~> 0.6" + gem "jekyll-feed", "~> 0.15.1" gem "jekyll-remote-theme" gem "jekyll-seo-tag" gem "jekyll-sitemap" diff --git a/mailbagit/formats/msg.py b/mailbagit/formats/msg.py index ac4448d..eeab551 100644 --- a/mailbagit/formats/msg.py +++ b/mailbagit/formats/msg.py @@ -9,7 +9,7 @@ import mailbagit.helper.format as format import mailbagit.helper.common as common import mailbagit.globals as globals -from extract_msg import attachment +import chardet import uuid log = get_logger() @@ -91,10 +91,15 @@ def messages(self, iteration_only=False): text_body = None html_encoding = None text_encoding = None + # encoding check priorities + encodings = {1: {"name": "cp1252", "label": "Windows 1252"}, 2: {"name": "utf-8", "label": "utf-8"}} try: - if mail.htmlBody: - html_body = mail.htmlBody.decode("utf-8").strip() - html_encoding = "utf-8" + try: + if mail.htmlBody: + html_body, html_encoding, errors = format.safely_decode("HTML", mail.htmlBody, encodings, errors) + except Exception as e: + desc = "Error parsing HTML body" + errors = common.handle_error(errors, e, desc) if mail.body: text_body = mail.body text_encoding = mail.stringEncoding @@ -181,7 +186,7 @@ def messages(self, iteration_only=False): Original_File=originalFile, Message_Path=messagePath, Derivatives_Path=derivativesPath, - Date=mail.date, + Date=str(mail.date), From=mail.sender, To=mail.to, Cc=mail.cc, diff --git a/mailbagit/formats/pst.py b/mailbagit/formats/pst.py index 6917424..063e58c 100644 --- a/mailbagit/formats/pst.py +++ b/mailbagit/formats/pst.py @@ -2,7 +2,7 @@ import email from pathlib import Path import chardet -from extract_msg.constants import CODE_PAGES +from extract_msg.encoding import _CODE_PAGES from RTFDE.deencapsulate import DeEncapsulator from mailbagit.loggerx import get_logger from mailbagit.email_account import EmailAccount diff --git a/setup.py b/setup.py index de459e4..31e8169 100755 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ "beautifulsoup4>=4.11.1,<5", "black>=22.1.0,<23", "jsonmodels>=2.2,<=2.5.0", - "extract_msg>=0.34.3,<0.42.0", + "extract_msg>=0.42.0", "structlog>=21.1.0,<22", "packaging>=21.0,<21.3", "python-json-logger>=2.0.2,<3", @@ -45,5 +45,5 @@ "pst": ["libpff-python==20211114"], "dev": ["pyinstaller==5.0.1,<6"], }, - python_requires=">=3.7", + python_requires=">=3.8", ) diff --git a/test.py b/test.py new file mode 100644 index 0000000..774872c --- /dev/null +++ b/test.py @@ -0,0 +1,74 @@ +import extract_msg +import chardet +import os +import bs4 +import RTFDE +from striprtf.striprtf import rtf_to_text +from rtfparse.parser import Rtf_Parser +from rtfparse.renderers.de_encapsulate_html import De_encapsulate_HTML + +# filePath = "/data/msg_bugs/issues/Creation of a Committee to Create Additional Funding Opportunities for UAlbany.msg" +filePath = "/data/msg_bugs/issues/FW SUNY-wide transfer in the major (1).msg" + +# filePath = "/data/msg_bugs/new" +""" +for msg in os.listdir(filePath): + if msg.endswith("msg"): + print (msg) + mail = extract_msg.openMsg(os.path.join(filePath, msg)) + body = mail.rtfBody + print (chardet.detect(body)) + deencapsultor = RTFDE.DeEncapsulator(body) + deencapsultor.deencapsulate() +""" +mail = extract_msg.openMsg(os.path.join(filePath)) +# print (mail.sender) +# print (mail.to) + +# print (mail.areStringsUnicode) + +# print (mail.rtfBody) +# body = mail.rtfBody + +""" +with open("/data/msg_bugs/issues/test-default.rtf", "wb") as f: + f.write(body) +with open("/data/msg_bugs/issues/test-1252.rtf", "w") as f: + f.write(body.decode("cp1252")) +with open("/data/msg_bugs/issues/test-950.rtf", "w") as f: + f.write(body.decode("cp950")) +""" + + +# enc = chardet.detect(body) +# print (chardet.detect(body)) + +parser = Rtf_Parser(rtf_path="/data/msg_bugs/issues/rtf/test-default.rtf") +parsed = parser.parse_file() + +renderer = De_encapsulate_HTML() +with open("/data/msg_bugs/issues/rtf/out.html", mode="w", encoding="utf-8") as html_file: + renderer.render(parsed, html_file) + +# deencapsultor = RTFDE.DeEncapsulator(body) +# deencapsultor.deencapsulate() + + +""" +if (mail.htmlBody): + enc = chardet.detect(mail.htmlBody) + print ("\t--> " + enc["encoding"]) + print (mail.htmlBody) + #body = mail.htmlBody.decode('utf-8') + #soup = bs4.BeautifulSoup(mail.htmlBody, 'html.parser') + #meta = soup.find("meta") + #print (meta) + #try: + # body = mail.htmlBody.decode('cp1252') + #except: + # body = mail.htmlBody.decode('utf-8') + #print (body) + #print (mail.stringEncoding) + #print (mail.overrideEncoding) + #print (dir(mail)) +""" From 031a7cb9cccef8fb400a7031cc873643d0d538f6 Mon Sep 17 00:00:00 2001 From: Gregory Wiedeman Date: Wed, 15 Nov 2023 15:48:34 -0500 Subject: [PATCH 2/3] bumped version --- mailbagit/__init__.py | 2 +- setup.py | 2 +- test.py | 74 ------------------------------------------- 3 files changed, 2 insertions(+), 76 deletions(-) delete mode 100644 test.py diff --git a/mailbagit/__init__.py b/mailbagit/__init__.py index 717b329..88ab1eb 100644 --- a/mailbagit/__init__.py +++ b/mailbagit/__init__.py @@ -1,7 +1,7 @@ # __init__.py # Version of the mailbagit package -__version__ = "0.6.3" +__version__ = "0.7.0" import os from pathlib import Path diff --git a/setup.py b/setup.py index 31e8169..bdba6d5 100755 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setuptools.setup( name="mailbagit", - version="0.6.3", + version="0.7.0", author="Gregory Wiedeman", author_email="gwiedeman@albany.edu", description="A tool for preserving email in multiple preservation formats.", diff --git a/test.py b/test.py deleted file mode 100644 index 774872c..0000000 --- a/test.py +++ /dev/null @@ -1,74 +0,0 @@ -import extract_msg -import chardet -import os -import bs4 -import RTFDE -from striprtf.striprtf import rtf_to_text -from rtfparse.parser import Rtf_Parser -from rtfparse.renderers.de_encapsulate_html import De_encapsulate_HTML - -# filePath = "/data/msg_bugs/issues/Creation of a Committee to Create Additional Funding Opportunities for UAlbany.msg" -filePath = "/data/msg_bugs/issues/FW SUNY-wide transfer in the major (1).msg" - -# filePath = "/data/msg_bugs/new" -""" -for msg in os.listdir(filePath): - if msg.endswith("msg"): - print (msg) - mail = extract_msg.openMsg(os.path.join(filePath, msg)) - body = mail.rtfBody - print (chardet.detect(body)) - deencapsultor = RTFDE.DeEncapsulator(body) - deencapsultor.deencapsulate() -""" -mail = extract_msg.openMsg(os.path.join(filePath)) -# print (mail.sender) -# print (mail.to) - -# print (mail.areStringsUnicode) - -# print (mail.rtfBody) -# body = mail.rtfBody - -""" -with open("/data/msg_bugs/issues/test-default.rtf", "wb") as f: - f.write(body) -with open("/data/msg_bugs/issues/test-1252.rtf", "w") as f: - f.write(body.decode("cp1252")) -with open("/data/msg_bugs/issues/test-950.rtf", "w") as f: - f.write(body.decode("cp950")) -""" - - -# enc = chardet.detect(body) -# print (chardet.detect(body)) - -parser = Rtf_Parser(rtf_path="/data/msg_bugs/issues/rtf/test-default.rtf") -parsed = parser.parse_file() - -renderer = De_encapsulate_HTML() -with open("/data/msg_bugs/issues/rtf/out.html", mode="w", encoding="utf-8") as html_file: - renderer.render(parsed, html_file) - -# deencapsultor = RTFDE.DeEncapsulator(body) -# deencapsultor.deencapsulate() - - -""" -if (mail.htmlBody): - enc = chardet.detect(mail.htmlBody) - print ("\t--> " + enc["encoding"]) - print (mail.htmlBody) - #body = mail.htmlBody.decode('utf-8') - #soup = bs4.BeautifulSoup(mail.htmlBody, 'html.parser') - #meta = soup.find("meta") - #print (meta) - #try: - # body = mail.htmlBody.decode('cp1252') - #except: - # body = mail.htmlBody.decode('utf-8') - #print (body) - #print (mail.stringEncoding) - #print (mail.overrideEncoding) - #print (dir(mail)) -""" From 028acadcc6b4d60f201b8d8e2578441579eeb8ac Mon Sep 17 00:00:00 2001 From: Gregory Wiedeman Date: Wed, 15 Nov 2023 16:05:38 -0500 Subject: [PATCH 3/3] updated attribute caught by tests and re-ran test data so test pass --- data/mbox-sample1/1/Headers.pickle | Bin 15652 -> 15650 bytes data/mbox-sample1/1/Message.pickle | Bin 15580 -> 15579 bytes data/mbox-sample1/2/Headers.pickle | Bin 36567 -> 36566 bytes .../1/Date.txt | 2 +- .../1/HTML_Body.txt | 1093 ++++++----------- .../1/Headers.pickle | Bin 16017 -> 15928 bytes mailbagit/formats/msg.py | 2 +- mailbagit/formats/pst.py | 4 +- 8 files changed, 358 insertions(+), 743 deletions(-) diff --git a/data/mbox-sample1/1/Headers.pickle b/data/mbox-sample1/1/Headers.pickle index 88f57abdeae77936058396676fdeaab5fc481074..f7cb7c2de9604c57221a904e3e30a31d056a99e7 100644 GIT binary patch delta 36 scmZ2dwWx}vfn}=rL>2=^vx%l=8*^6L@G){t$;gkLl2MpZRGOp*0Ozp`mH+?% delta 39 vcmZ2fwWNxrfn}=XL>2=^^NFTrj9eSDSK9D1adAz_$d8?pQJ7IwnxqE+`^yZk diff --git a/data/mbox-sample1/1/Message.pickle b/data/mbox-sample1/1/Message.pickle index d3ad04759c185c1026491a4ae013bfa0b60a8264..ab57f7f4010a4111ba2d893f00b41e45d70cd660 100644 GIT binary patch delta 28 kcmcapdApLOfo1B2jVvc^_!zmSWMsuo$;ipbEltt`0J3unZvX%Q delta 29 lcmcazd8d-4fo1B&jVvc^_?fu4retKrPRYp0$SqCM0|2>^3w!_o diff --git a/data/mbox-sample1/2/Headers.pickle b/data/mbox-sample1/2/Headers.pickle index 3e9cecd7c323dfb3855eeb8ec135435872f65171..53dacb6c6b8e4370eb1ac5966c97babb2f1a9420 100644 GIT binary patch delta 23 fcmcaUm+9JECYA=4si!Bh7%-YmG&S3pliddZZpaAY delta 25 hcmcaMm+AUkCYA=4sb?m#7%-YoG&N)7+L)c)2LODl2<`v? diff --git a/data/msg-Digitization Archiving Solutions/1/Date.txt b/data/msg-Digitization Archiving Solutions/1/Date.txt index cf7332a..ee5927a 100644 --- a/data/msg-Digitization Archiving Solutions/1/Date.txt +++ b/data/msg-Digitization Archiving Solutions/1/Date.txt @@ -1 +1 @@ -Thu, 03 Feb 2022 15:01:10 -0500 \ No newline at end of file +2022-02-03 15:01:10-05:00 \ No newline at end of file diff --git a/data/msg-Digitization Archiving Solutions/1/HTML_Body.txt b/data/msg-Digitization Archiving Solutions/1/HTML_Body.txt index a792eda..cfc5d1f 100644 --- a/data/msg-Digitization Archiving Solutions/1/HTML_Body.txt +++ b/data/msg-Digitization Archiving Solutions/1/HTML_Body.txt @@ -1,747 +1,362 @@ - - + + - - - +o\:* {behavior:url(#default#VML);} +w\:* {behavior:url(#default#VML);} +.shape {behavior:url(#default#VML);} + - -
-

Good Afternoon! - - -

-

- - -

-

My name is Matthew McCabe and I’d like to introduce myself as the VP, Sales -& Marketing for - The Crowley Company - (Crowley). -  If you’re not familiar with Crowley, we provide a full range of digitization solutions ranging from - front-end capture -, - ECM integration -, as well as “best in class” - -cultural heritage scanners - (offering up to - -FADGI - 4-star quality). -  - - -

-

-  - - -

-

With - more than two decades - in the records management industry, I’m excited about the opportunity to offer you brand name production-level scan systems with built-in software that -adds maximum image quality, enhanced productivity, and increased efficiency that are all user friendly and packed with functionality. - - -

-

-  - - -

-

Crowley’s capture offerings include: - - -

-

- - • - -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  - - - - document scanners - - - -

-

- - • - -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  - - - - microform scanners - (microfilm, microfiche, aperture card) - - -

-

- - • - -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  - - - - book and large format scanners - - - -

-

- - • - -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  - - - - archive writers - - - -

-

- - -

-

We understand that you may be in the planning and preparation phase for FY ’22 and beyond and are thinking about your future digitization projects. -  The next time you consider purchasing scanning equipment, we’d appreciate your consideration -and encourage you to review Crowley’s Digitization Solutions. -  That way should you need to ramp up production, replace an aging unit, or want to see the latest in digitization technology, you will be able to make an informed decision. - - -

-

- - -

-

If you would like any additional information or would like to set up a time to speak more in detail about your collection and evolving requirements, then please don’t hesitate to ask. - - -

-

- -  - -

-

Sincerely, - - -

-

- -  - -

-

- - - - - -

-

- - - -  - - - -

-

- Matthew J. McCabe - - - -

-

- VP, Sales -& Marketing - - - -

-

- - The Crowley Company - - - - -

-

- 5111 Pegasus Court, Suite M - - - -

-

- Frederick, MD -  21704 - - - -

-

- 240-215-0224 ext. 210 - - - -

-

- 240-447-6642 mobile - - - -

-

- 240-215-0234 fax - - - -

-

- - - - - mattm@thecrowleycompany.com - - - - - - - - - - - -

-

- - - - - www.thecrowleycompany.com - - - - - - - - - -

-

- - - -  - - - -

- - -

- FOLLOW US ON - - - - LINKEDIN - - - - - - - -

-

- -  - -

- + +
+

Good Afternoon!

+

+

My name is Matthew McCabe and I’d like to introduce myself as the VP, Sales & Marketing for +The Crowley Company (Crowley).  If you’re not familiar with Crowley, we provide a full range of digitization solutions ranging from +front-end capture, +ECM integration, as well as “best in class” + +cultural heritage scanners (offering up to +FADGI 4-star quality). 

+

 

+

With more than two decades in the records management industry, I’m excited about the opportunity to offer you brand name production-level scan systems with built-in software that + adds maximum image quality, enhanced productivity, and increased efficiency that are all user friendly and packed with functionality. +

+

 

+

Crowley’s capture offerings include:

+

+                     +document scanners

+

+                     +microform scanners (microfilm, microfiche, aperture card)

+

+                     +book and large format scanners

+

+                     +archive writers

+

+

We understand that you may be in the planning and preparation phase for FY ’22 and beyond and are thinking about your future digitization projects.  The next time you consider purchasing scanning equipment, we’d appreciate your consideration + and encourage you to review Crowley’s Digitization Solutions.  That way should you need to ramp up production, replace an aging unit, or want to see the latest in digitization technology, you will be able to make an informed decision.

+

+

If you would like any additional information or would like to set up a time to speak more in detail about your collection and evolving requirements, then please don’t hesitate to ask.

+

 

+

Sincerely,

+

 

+

+

 

+

Matthew J. McCabe

+

VP, Sales & Marketing

+

The Crowley Company

+

5111 Pegasus Court, Suite M

+

Frederick, MD  21704

+

240-215-0224 ext. 210

+

240-447-6642 mobile

+

240-215-0234 fax

+

mattm@thecrowleycompany.com

+

www.thecrowleycompany.com

+

 

+ +

FOLLOW US ON LINKEDIN

+

 

+
- - + -
-

- - - -  - - - -

-

- - Hardware Financing Option - - - - $ - - - - -Available -  - - - - - - - - -

-

- -  - -

-

- -  Click - - - - - here - - - -to discover end-of-year tax incentives, financing purchase options or receive a leasing quote on Crowley - - - digitization products - - - . - - - -

-

- -  - - - -

-

- -  -  Call - (240) 215-0224 - or - - - click - - - - - to connect with your Crowley representative for additional information - - - -

-

- -  -  and a scanner quote. - - - -

-

- - -  - - -

+
+

 

+

Hardware Financing Option$ + Available 

+

 

+

  Click +here + to discover end-of-year tax incentives, financing purchase options or receive a leasing quote on Crowley +digitization products.

+

  +

+

  Call +(240) 215-0224 or click +to connect with your Crowley representative for additional information +

+

  and a scanner quote. +

+

 

-

- -  - -

-

- -  - -

-
+ +

 

+

 

+ - \ No newline at end of file + diff --git a/data/msg-Digitization Archiving Solutions/1/Headers.pickle b/data/msg-Digitization Archiving Solutions/1/Headers.pickle index a35d924e8eceb5ad08174c9c934835ed39a03ac2..b9476afcc57d98f33cfcceb2b6b3d0d0a0902821 100644 GIT binary patch delta 66 zcmbPOyQ7Asfn}=hMwS-a$(wEMWc+&AQ&Q7XlS_)H#7^nqNzF~n%+X6ND#|Y^o-*0N WPMf=jB_}g4b&6kcQsU%7I|Tq%b{QN1 delta 134 zcmdl{GqIMXfn{pjMwS*^jUJYi#FEr0J@O$Lr8)`*#tLq!NeV^=Mn(#Trd9@qR)z)& zx(22O29sND9ZfSLde~D^(^8X5il@X*(dgkx%}vbA(Mv5V$}cLO(jy&Ql2}yYlwX>c kl2}yfmtW!rVov^VE5VfUV)7$fZMKZJe#J?tlO^mF0N?*Ff&c&j diff --git a/mailbagit/formats/msg.py b/mailbagit/formats/msg.py index eeab551..5a6f246 100644 --- a/mailbagit/formats/msg.py +++ b/mailbagit/formats/msg.py @@ -160,7 +160,7 @@ def messages(self, iteration_only=False): contentID = None try: - contentID = mailAttachment.contendId + contentID = mailAttachment.contentId except Exception as e: desc = "Error reading ContentID, creating an ID instead" errors = common.handle_error(errors, e, desc, "warn") diff --git a/mailbagit/formats/pst.py b/mailbagit/formats/pst.py index 063e58c..b30b889 100644 --- a/mailbagit/formats/pst.py +++ b/mailbagit/formats/pst.py @@ -105,12 +105,12 @@ def folders(self, folder, path, originalFile, errors, iteration_only=False): if entry.data: value = entry.get_data_as_integer() # Use the extract_msg code page in constants.py - encodings[1] = {"name": CODE_PAGES[value], "label": "PidTagInternetCodepage"} + encodings[1] = {"name": _CODE_PAGES[value], "label": "PidTagInternetCodepage"} if entry.entry_type == LIBPFF_ENTRY_TYPE_MESSAGE_CODEPAGE: if entry.data: value = entry.get_data_as_integer() # Use the extract_msg code page in constants.py - encodings[2] = {"name": CODE_PAGES[value], "label": "PidTagMessageCodepage"} + encodings[2] = {"name": _CODE_PAGES[value], "label": "PidTagMessageCodepage"} # messageObj.html_body sometimes fails. This seems to often be the case for email in "Deleted Items" try: if messageObj.html_body: