Merge pull request #4 from Qordobacode/ENG-6931--smart-quotes-bad

Fix smart quote handling
writer · Feb 24, 2020 · 55fed90 · 55fed90
2 parents ee29fd8 + 809f5c4
commit 55fed90
Show file tree

Hide file tree

Showing 7 changed files with 266 additions and 8 deletions.
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-0.14.0
+0.16.0
diff --git a/build_release.sh b/build_release.sh
@@ -40,6 +40,7 @@ check_credential_file_exists ()
 update_deps ()
 {
     pip3 install -U setuptools wheel
+    pip3 install -r requirements-dev.txt
     pip3 install -r requirements.txt
 }
 

diff --git a/marian_client/__init__.py b/marian_client/__init__.py
@@ -15,6 +15,8 @@
     WebSocketAddressException,
 )
 
+from marian_client.quote_manager import Quotes
+
 GENERIC_WEBSOCKET_ERROR_CODE = 469
 
 
@@ -142,8 +144,11 @@ def _send_message(self, tokenized_sentence: str):
         # since there is no hope if we can't connect
         self._check_connection()
 
+        orignal_message = Quotes(tokenized_sentence)
+
         try:
-            self.ws.send(tokenized_sentence)
+            # need to strip smart quotes before sending
+            self.ws.send(orignal_message.simplified)
         except (
             WebSocketConnectionClosedException,
             WebSocketAddressException,
@@ -178,16 +183,20 @@ def _send_message(self, tokenized_sentence: str):
             self.ws.connected = False
 
         assert r is not None, "If r isn't set by here, we didn't send a request"
-        return success, r
+        # need to add back in smart quotes
+        corrected_message_requoted = orignal_message.requote_modified_string(r)
+        return success, corrected_message_requoted
 
     def __call__(self, tokenized_sentence: str):
 
         success, r = self._send_message(tokenized_sentence)
 
-        if self.debug and r is not None:
-            print(r.status_code, r.reason)
-
         if success:
             return True, r, (None, None)
         else:
-            return False, None, (r.status_code, r.reason)
+            if r is not None:
+                if self.debug:
+                    print(r.status_code, r.reason)
+                return False, None, (r.status_code, r.reason)
+            else:
+                return False, None, (GENERIC_WEBSOCKET_ERROR_CODE, "Something went wrong")
diff --git a/marian_client/quote_manager.py b/marian_client/quote_manager.py
@@ -0,0 +1,181 @@
+from typing import List, Tuple
+
+from diff_match_patch import diff_match_patch
+
+
+dmp = diff_match_patch()
+
+
+class Quotes:
+    """Swap typographic quotes for ASCII ones and restore them afterwards.
+
+    An instance records every quote-like character of the original string, in
+    order, and exposes ``simplified``: the same text with each single-quote
+    variant replaced by ``'`` and each double-quote variant by ``"``.  Once the
+    simplified text has been edited, ``requote_modified_string`` puts the
+    recorded quote characters back on a best-effort basis.
+
+    Attributes:
+        orig: the string passed to the constructor, untouched.
+        simplified: ``orig`` with every quote variant replaced by ASCII.
+        quote_positions: the i-th entry is the original character of the
+            i-th quote found in ``orig``.
+    """
+
+    # Characters treated as a single quote (the ASCII apostrophe included).
+    singles = [
+        "｀",
+        "΄",
+        "＇",
+        "ˈ",
+        "ˊ",
+        "ᑊ",
+        "ˋ",
+        "ꞌ",
+        "ᛌ",
+        "𖽒",
+        "𖽑",
+        "‘",
+        "’",
+        "י",
+        "՚",
+        "‛",
+        "՝",
+        "`",
+        "'",
+        "′",
+        "׳",
+        "´",
+        "ʹ",
+        "˴",
+        "ߴ",
+        "‵",
+        "ߵ",
+        "ʻ",
+        "ʼ",
+        "᾽",
+        "ʽ",
+        "῾",
+        "ʾ",
+        "᾿",
+    ]
+    # Characters treated as a double quote (the ASCII one included).
+    doubles = ['"', "＂", "〃", "ˮ", "ײ", "″", "״", "‶", "˶", "ʺ", "“", "”", "˝", "‟"]
+    ascii_single = "'"
+    ascii_double = '"'
+
+    def __init__(self, orignal_string: str):
+        """Record the quotes of ``orignal_string`` and build ``simplified``."""
+        self.orig = orignal_string
+        # List of quotes, where index of i means the ith quote in the string
+        # is quote_positions[i] type of quote
+        self.quote_positions: List[str] = []
+        # A list of characters that will be joined into the simplified string
+        simplified_split: List[str] = []
+
+        for char in self.orig:
+            if char in Quotes.singles:
+                replacement = Quotes.ascii_single
+            elif char in Quotes.doubles:
+                replacement = Quotes.ascii_double
+            else:
+                simplified_split.append(char)
+                continue
+            self.quote_positions.append(char)
+            simplified_split.append(replacement)
+
+        self.simplified = "".join(simplified_split)
+
+    @staticmethod
+    def is_quote(char: str) -> bool:
+        """Return True if ``char`` is any known single or double quote."""
+        return char in Quotes.singles or char in Quotes.doubles
+
+    @staticmethod
+    def count_quotes(text: str) -> Tuple[int, int]:
+        """Return ``(single_count, double_count)`` for ``text``."""
+        single_count = sum(char in Quotes.singles for char in text)
+        double_count = sum(char in Quotes.doubles for char in text)
+        return single_count, double_count
+
+    def requote_same_quote_count(self, modified_string: str) -> str:
+        """Restore quotes positionally.
+
+        The i-th quote of ``modified_string`` becomes the i-th recorded
+        original quote.  Quotes beyond the number recorded are left as they
+        are instead of raising ``IndexError``.
+        """
+        originals = iter(self.quote_positions)
+        return "".join(
+            next(originals, char) if Quotes.is_quote(char) else char
+            for char in modified_string
+        )
+
+    def requote_different_quote_count(self, modified_string: str) -> str:
+        """
+        Best-effort requoting when the number of quotes has changed.
+
+        dmp is used to diff the simplified original against the modified text,
+        and the recorded quotes are walked in step with that diff:
+
+        * unchanged text gets its original quotes back
+        * deleted text consumes its own quotes, so later quotes stay aligned
+        * inserted text that replaces a deletion holding the same number of
+          single and double quotes reuses the deleted quotes, in order
+        * any other inserted text is kept exactly as received (plain quotes)
+
+        In [7]: dmp.diff_main("Yeah, he'd've done something 'intelligent' I
+        ...: guess.", "Yeah, he's done something 'intelligent'")
+        Out[7]:
+        [(0, "Yeah, he'"),
+        (-1, "d've"),
+        (1, 's'),
+        (0, " done something 'intelligent'"),
+        (-1, ' I guess.')]
+        """
+        diff = dmp.diff_main(self.simplified, modified_string)
+        quotes = self.quote_positions
+        cursor = 0  # index of the next original quote not yet consumed
+        pending: List[str] = []  # original quotes of the deletion just seen
+        pieces: List[str] = []
+
+        for kind, substr in diff:
+            if kind == 0:
+                # this part of the string has not been changed; whatever was
+                # deleted before it is gone for good, so a later insertion
+                # must not be mistaken for its replacement
+                pending = []
+                for char in substr:
+                    if Quotes.is_quote(char):
+                        pieces.append(quotes[cursor])
+                        cursor += 1
+                    else:
+                        pieces.append(char)
+            elif kind == -1:
+                # the deleted text owns the next quotes in the sequence:
+                # take them out now so the following text stays in step
+                removed = sum(Quotes.count_quotes(substr))
+                pending.extend(quotes[cursor : cursor + removed])
+                cursor += removed
+            elif kind == 1:
+                same_count = Quotes.count_quotes(substr) == Quotes.count_quotes(
+                    "".join(pending)
+                )
+                if pending and same_count:
+                    # the edit didn't change the quote count: reuse the
+                    # quotes of the text that was replaced
+                    reused = iter(pending)
+                    pieces.extend(
+                        next(reused, char) if Quotes.is_quote(char) else char
+                        for char in substr
+                    )
+                else:
+                    # pure addition, or a replacement whose quotes cannot be
+                    # matched up: keep the new text, leave as dumb quote
+                    pieces.append(substr)
+                pending = []
+            # kind should only ever be 0, -1, or 1; anything else is ignored
+        return "".join(pieces)
+
+    def requote_modified_string(self, modified_string: str) -> str:
+        """Return ``modified_string`` with the original quotes put back.
+
+        ``modified_string`` is expected to be an edited version of
+        ``self.simplified``.
+        """
+        if modified_string == self.simplified:
+            # the easiest case
+            return self.orig
+        if Quotes.count_quotes(modified_string) == Quotes.count_quotes(
+            self.simplified
+        ):
+            # assume that if the count of single- and double-quotes hasn't changed
+            # then this is not a coincidence
+            # What if GEC deleted one quote but added another?
+            # Potential bug, but seems so unlikely
+            return self.requote_same_quote_count(modified_string)
+        return self.requote_different_quote_count(modified_string)
+
diff --git a/marian_client/version.py b/marian_client/version.py
@@ -1,2 +1,2 @@
 # CHANGES HERE HAVE NO EFFECT: ../VERSION is the source of truth
-__version__ = "0.14.0"
+__version__ = "0.16.0"
diff --git a/requirements.txt b/requirements.txt
@@ -1 +1,2 @@
+diff-match-patch==20181111
 websocket-client==0.56.0
diff --git a/tests/test_quotes.py b/tests/test_quotes.py
@@ -0,0 +1,66 @@
+from typing import Tuple
+
+import pytest
+
+from marian_client.quote_manager import Quotes
+
+
+# Sentences written with typographic quotes; these are the "before" texts
+# handed to Quotes in the tests below.
+smart_quotes = [
+    "We’ve sent you a couple of emails, but we haven’t heard back.",
+    "Yeah, he’d’ve done something ʻsmartʼ I guess.",
+    "One of these customers calls in, saying, ‘I’m upset about a bad Yelp review I got and also I don’t understand how this part of my ads program is working.’",
+    "You get another phone call and it’s a business that says something.",
+    "They say, ‘Hey, I’d like to actually grow my business more and I’d like to spend more money.’",
+]
+
+# ASCII-quoted rewrites of smart_quotes, index for index: some entries change
+# the wording, but each keeps the same number of quotes as its original.
+change_words_not_quotes = [
+    "We've sent you a couple of emails, but we haven't heard back.",
+    "Yeah, she'll've said something 'intelligent' I suppose.",
+    "A customers calls in and says, 'I'm not happy with a Yelp review I got and also I don't understand how this part of my ads program is working.'",
+    "You get a second phone call and it's a company that says something.",
+    "They say, 'I'd like to grow my business and I'd like to spend more money.'",
+]
+
+# Expected result of requoting each change_words_not_quotes entry with the
+# quotes of the matching smart_quotes entry.
+requoted = [
+    "We’ve sent you a couple of emails, but we haven’t heard back.",
+    "Yeah, she’ll’ve said something ʻintelligentʼ I suppose.",
+    "A customers calls in and says, ‘I’m not happy with a Yelp review I got and also I don’t understand how this part of my ads program is working.’",
+    "You get a second phone call and it’s a company that says something.",
+    "They say, ‘I’d like to grow my business and I’d like to spend more money.’",
+]
+
+
+@pytest.mark.parametrize("text", smart_quotes)
+def test_requote_no_change(text: str):
+    """Requoting the untouched simplified text must give back the original."""
+    quotes = Quotes(text)
+    round_trip = quotes.requote_modified_string(quotes.simplified)
+    assert round_trip == quotes.orig
+
+
+@pytest.mark.parametrize(
+    "before_after_correct", list(zip(smart_quotes, change_words_not_quotes, requoted))
+)
+def test_text_change_but_same_quote_count(before_after_correct: Tuple[str, str, str]):
+    """Edits that keep the quote count must get the original quotes back."""
+    original, edited, expected = before_after_correct
+    restored = Quotes(original).requote_modified_string(edited)
+    assert restored == expected
+
+
+# (before, after, expected) triples in which the edit removes a quote, so the
+# number of quotes in "after" differs from the number in "before".
+changed_quote_cases = [
+    (
+        "Yeah , he’d’ve done something “intelligent”",
+        'Yeah , he\'s done something "intelligent"',
+        "Yeah , he’s done something “intelligent”",
+    ),
+    (
+        "There are many storeʼs like that one , which Iʼm a fan of",
+        "There are many stores like that one , which I'm a fan of",
+        "There are many stores like that one , which Iʼm a fan of",
+    ),
+]
+
+
+@pytest.mark.parametrize("before_after_correct", changed_quote_cases)
+def test_change_quote_count(before_after_correct: Tuple[str, str, str]):
+    """Edits that change the quote count are requoted on a best-effort basis."""
+    original, edited, expected = before_after_correct
+    restored = Quotes(original).requote_modified_string(edited)
+    assert restored == expected
Original file line number	Diff line number	Diff line change
		@@ -1 +1,2 @@
		diff-match-patch==20181111
		websocket-client==0.56.0