From 681d10243d0ce59cb760fe9a9dc2cd3ead3422fc Mon Sep 17 00:00:00 2001 From: Nahue Date: Tue, 2 Jun 2020 20:27:59 -0300 Subject: [PATCH] Auto-detect when the text is latin1 or ascii and decode it as UTF-8 so no strange chars are stored or used for the entry version comparisons --- diffengine/__init__.py | 3 ++- diffengine/{text_builder.py => text.py} | 14 ++++++++++++ diffengine/twitter.py | 2 +- test_diffengine.py | 29 ++++++++++++++++++------- 4 files changed, 38 insertions(+), 10 deletions(-) rename diffengine/{text_builder.py => text.py} (83%) diff --git a/diffengine/__init__.py b/diffengine/__init__.py index b18cf3c..c0c155b 100755 --- a/diffengine/__init__.py +++ b/diffengine/__init__.py @@ -37,6 +37,7 @@ from diffengine.exceptions.webdriver import UnknownWebdriverError from diffengine.exceptions.twitter import ConfigNotFoundError, TwitterError +from diffengine.text import to_utf8 from diffengine.twitter import TwitterHandler from diffengine.exceptions.sendgrid import ( ConfigNotFoundError as SGConfigNotFoundError, @@ -169,7 +170,7 @@ def get_latest(self): logging.warn("Got %s when fetching %s", resp.status_code, self.url) return None - doc = readability.Document(resp.text) + doc = readability.Document(to_utf8(resp.text)) title = doc.title() summary = doc.summary(html_partial=True) summary = bleach.clean(summary, tags=["p"], strip=True) diff --git a/diffengine/text_builder.py b/diffengine/text.py similarity index 83% rename from diffengine/text_builder.py rename to diffengine/text.py index 5f1bda1..f064642 100644 --- a/diffengine/text_builder.py +++ b/diffengine/text.py @@ -58,3 +58,17 @@ def build_with_default_content(diff): text = text[0:225] + "…" text += " " + diff.url return text + + +def to_utf8(text): + for encoding in ["latin1", "ascii"]: + try: + result = text.encode(encoding).decode("utf8", "strict") + break + except (UnicodeEncodeError, UnicodeDecodeError): + result = None + + if result is None: + return text + + return result diff --git
a/diffengine/twitter.py b/diffengine/twitter.py index c49e007..702a8ee 100644 --- a/diffengine/twitter.py +++ b/diffengine/twitter.py @@ -3,7 +3,7 @@ from datetime import datetime -from diffengine.text_builder import build_text +from diffengine.text import build_text from diffengine.exceptions.twitter import ( AlreadyTweetedError, ConfigNotFoundError, diff --git a/test_diffengine.py b/test_diffengine.py index ad1fe15..e01b1a5 100644 --- a/test_diffengine.py +++ b/test_diffengine.py @@ -33,7 +33,7 @@ AlreadyEmailedError as SGAlreadyEmailedError, ArchiveUrlNotFoundError as SGArchiveNotFoundError, ) -from diffengine.text_builder import build_text +from diffengine.text import build_text from diffengine.exceptions.twitter import ( ConfigNotFoundError, TokenNotFoundError, @@ -619,8 +619,8 @@ def get_mocked_diff(with_archive_urls=True): class TextBuilderTest(TestCase): @patch("logging.warning") - @patch("diffengine.text_builder.build_with_lang") - @patch("diffengine.text_builder.build_with_default_content") + @patch("diffengine.text.build_with_lang") + @patch("diffengine.text.build_with_default_content") def test_build_with_default_content_when_no_lang_given( self, mocked_build_with_default_content, mocked_build_from_lang, mocked_warning ): @@ -635,8 +635,8 @@ def test_build_with_default_content_when_no_lang_given( mocked_build_from_lang.assert_not_called() @patch("logging.warning") - @patch("diffengine.text_builder.build_with_lang") - @patch("diffengine.text_builder.build_with_default_content") + @patch("diffengine.text.build_with_lang") + @patch("diffengine.text.build_with_default_content") def test_build_with_default_content_when_lang_is_incomplete( self, mocked_build_with_default_content, mocked_build_from_lang, mocked_warning ): @@ -656,8 +656,8 @@ def test_build_with_default_content_when_lang_is_incomplete( mocked_build_from_lang.assert_not_called() @patch("logging.warning") - @patch("diffengine.text_builder.build_with_lang") - 
@patch("diffengine.text_builder.build_with_default_content") + @patch("diffengine.text.build_with_lang") + @patch("diffengine.text.build_with_default_content") def test_build_with_lang_when_lang_given( self, mocked_build_with_default_content, mocked_build_from_lang, mocked_warning ): @@ -678,7 +678,7 @@ def test_build_with_lang_when_lang_given( mocked_build_with_default_content.assert_not_called() mocked_build_from_lang.assert_called_once() - @patch("diffengine.text_builder.build_with_lang") + @patch("diffengine.text.build_with_lang") def test_default_content_text(self, mocked_build_from_lang): diff = get_mocked_diff() type(diff.new).title = "Test" @@ -760,3 +760,16 @@ def test_lang_content_text(self): self.assertEqual( text, "change in the URL, the title and the summary\n%s" % diff.url ) + + +class EncodingTest(TestCase): + def test_utf8_do_nothingg(self): + text_utf8 = "Me preocupa más la parte futbolística" + result = to_utf8(text_utf8) + self.assertEquals(result, text_utf8) + + def test_latin1_to_utf8(self): + text_latin = "Me preocupa más la parte futbolística" + text_utf8 = "Me preocupa más la parte futbolística" + result = to_utf8(text_latin) + self.assertEquals(result, text_utf8)