From 5f49fc2fa727750fa9fef3cb73da384528f64f87 Mon Sep 17 00:00:00 2001 From: Mohamed Hamed Date: Fri, 24 Sep 2021 18:08:44 +0200 Subject: [PATCH] Fix encoding --- lib/griddler/email.rb | 43 ++++++++++++------------------------------- 1 file changed, 12 insertions(+), 31 deletions(-) diff --git a/lib/griddler/email.rb b/lib/griddler/email.rb index 64b32c6..31fc492 100644 --- a/lib/griddler/email.rb +++ b/lib/griddler/email.rb @@ -46,14 +46,6 @@ def extract_subject clean_invalid_utf8_bytes(params[:subject]) end - - ##to delete - def extract_body - text = EmailParser.extract_reply_body(text_or_sanitized_html) - text = clean_raw_text(text) if params.fetch(:text, '').presence - text - end - def extract_headers if params[:headers].is_a?(Hash) deep_clean_invalid_utf8_bytes(params[:headers]) @@ -71,28 +63,9 @@ def text_or_sanitized_html text.presence || clean_raw_html(params.fetch(:html, '')).presence end - - ##to delete - def clean_raw_text(text) - cleaned_text = clean_invalid_utf8_bytes(text) - full_sanitizer = Rails::Html::FullSanitizer.new - cleaned_text = full_sanitizer.sanitize(cleaned_text) - cleaned_text = HTMLEntities.new.decode(cleaned_text) - cleaned_text - end - - ##to delete - def clean_raw_html(html) - Loofah::HTML5::WhiteList::ACCEPTABLE_PROTOCOLS.add('cid') - Loofah::HTML5::WhiteList::ACCEPTABLE_PROTOCOLS.add('data') - cleaned_html = clean_invalid_utf8_bytes(html) - cleaned_html = sanitize(cleaned_html) - cleaned_html = HTMLEntities.new.decode(cleaned_html) - cleaned_html - end - def clean_raw_utf8(text) - clean_invalid_utf8_bytes(text) || '' + source_encoding = 'EUC-KR' + clean_invalid_utf8_bytes(text, source_encoding: source_encoding) || '' end def deep_clean_invalid_utf8_bytes(object) @@ -111,8 +84,16 @@ def deep_clean_invalid_utf8_bytes(object) end end - def clean_invalid_utf8_bytes(text) - text.encode!('UTF-8', 'EUC-KR') + def clean_invalid_utf8_bytes(text, source_encoding: nil) + if source_encoding + text.force_encoding(source_encoding).encode!('UTF-8', source_encoding) + else + if text && !text.valid_encoding? + text.force_encoding('ISO-8859-1').encode!('UTF-8') + else + text + end + end end end end