Skip to content

Commit

Permalink
Merge pull request #88 from nahuelhds/feature/encoding-autodetection
Browse files Browse the repository at this point in the history
Encoding autodetection
  • Loading branch information
edsu authored Jun 3, 2020
2 parents 44b8f87 + 681d102 commit 505aa2a
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 10 deletions.
3 changes: 2 additions & 1 deletion diffengine/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@

from diffengine.exceptions.webdriver import UnknownWebdriverError
from diffengine.exceptions.twitter import ConfigNotFoundError, TwitterError
from diffengine.text import to_utf8
from diffengine.twitter import TwitterHandler
from diffengine.exceptions.sendgrid import (
ConfigNotFoundError as SGConfigNotFoundError,
Expand Down Expand Up @@ -169,7 +170,7 @@ def get_latest(self):
logging.warn("Got %s when fetching %s", resp.status_code, self.url)
return None

doc = readability.Document(resp.text)
doc = readability.Document(to_utf8(resp.text))
title = doc.title()
summary = doc.summary(html_partial=True)
summary = bleach.clean(summary, tags=["p"], strip=True)
Expand Down
14 changes: 14 additions & 0 deletions diffengine/text_builder.py → diffengine/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,17 @@ def build_with_default_content(diff):
text = text[0:225] + "…"
text += " " + diff.url
return text


def to_utf8(text):
    """Try to undo a wrong single-byte decoding of UTF-8 text.

    The string is re-encoded with each candidate codec and the resulting
    bytes are strictly decoded as UTF-8. The first candidate for which both
    steps succeed provides the return value.

    :param text: the string to inspect.
    :return: the re-decoded string, or ``text`` itself when no candidate
        round-trips cleanly.
    """
    for codec in ("latin1", "ascii"):
        try:
            raw = text.encode(codec)
            return raw.decode("utf8", "strict")
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Either the text has characters outside this codec, or the
            # bytes are not valid UTF-8; move on to the next candidate.
            continue

    # No candidate worked: hand the input back untouched.
    return text
2 changes: 1 addition & 1 deletion diffengine/twitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from datetime import datetime

from diffengine.text_builder import build_text
from diffengine.text import build_text
from diffengine.exceptions.twitter import (
AlreadyTweetedError,
ConfigNotFoundError,
Expand Down
29 changes: 21 additions & 8 deletions test_diffengine.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
AlreadyEmailedError as SGAlreadyEmailedError,
ArchiveUrlNotFoundError as SGArchiveNotFoundError,
)
from diffengine.text_builder import build_text
from diffengine.text import build_text
from diffengine.exceptions.twitter import (
ConfigNotFoundError,
TokenNotFoundError,
Expand Down Expand Up @@ -619,8 +619,8 @@ def get_mocked_diff(with_archive_urls=True):

class TextBuilderTest(TestCase):
@patch("logging.warning")
@patch("diffengine.text_builder.build_with_lang")
@patch("diffengine.text_builder.build_with_default_content")
@patch("diffengine.text.build_with_lang")
@patch("diffengine.text.build_with_default_content")
def test_build_with_default_content_when_no_lang_given(
self, mocked_build_with_default_content, mocked_build_from_lang, mocked_warning
):
Expand All @@ -635,8 +635,8 @@ def test_build_with_default_content_when_no_lang_given(
mocked_build_from_lang.assert_not_called()

@patch("logging.warning")
@patch("diffengine.text_builder.build_with_lang")
@patch("diffengine.text_builder.build_with_default_content")
@patch("diffengine.text.build_with_lang")
@patch("diffengine.text.build_with_default_content")
def test_build_with_default_content_when_lang_is_incomplete(
self, mocked_build_with_default_content, mocked_build_from_lang, mocked_warning
):
Expand All @@ -656,8 +656,8 @@ def test_build_with_default_content_when_lang_is_incomplete(
mocked_build_from_lang.assert_not_called()

@patch("logging.warning")
@patch("diffengine.text_builder.build_with_lang")
@patch("diffengine.text_builder.build_with_default_content")
@patch("diffengine.text.build_with_lang")
@patch("diffengine.text.build_with_default_content")
def test_build_with_lang_when_lang_given(
self, mocked_build_with_default_content, mocked_build_from_lang, mocked_warning
):
Expand All @@ -678,7 +678,7 @@ def test_build_with_lang_when_lang_given(
mocked_build_with_default_content.assert_not_called()
mocked_build_from_lang.assert_called_once()

@patch("diffengine.text_builder.build_with_lang")
@patch("diffengine.text.build_with_lang")
def test_default_content_text(self, mocked_build_from_lang):
diff = get_mocked_diff()
type(diff.new).title = "Test"
Expand Down Expand Up @@ -760,3 +760,16 @@ def test_lang_content_text(self):
self.assertEqual(
text, "change in the URL, the title and the summary\n%s" % diff.url
)


class EncodingTest(TestCase):
    """Tests for ``to_utf8``."""

    def test_utf8_do_nothingg(self):
        """A string that is already correct is returned unchanged."""
        text_utf8 = "Me preocupa más la parte futbolística"
        result = to_utf8(text_utf8)
        self.assertEqual(result, text_utf8)

    def test_latin1_to_utf8(self):
        """UTF-8 text that was mis-decoded as Latin-1 is repaired."""
        text_utf8 = "Me preocupa más la parte futbolística"
        # Derive the garbled input instead of embedding it literally: the
        # mis-decoded form contains an invisible soft hyphen (U+00AD) that
        # editors and copy/paste tend to drop or normalise away.
        text_latin = text_utf8.encode("utf8").decode("latin1")
        # Guard against a vacuous test where input and expectation coincide.
        self.assertNotEqual(text_latin, text_utf8)
        result = to_utf8(text_latin)
        self.assertEqual(result, text_utf8)

0 comments on commit 505aa2a

Please sign in to comment.