Skip to content

Commit

Permalink
Merge pull request #4 from Qordobacode/ENG-6931--smart-quotes-bad
Browse files Browse the repository at this point in the history
Fix smart quote handling
  • Loading branch information
sam-writer authored Feb 24, 2020
2 parents ee29fd8 + 809f5c4 commit 55fed90
Show file tree
Hide file tree
Showing 7 changed files with 266 additions and 8 deletions.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.14.0
0.16.0
1 change: 1 addition & 0 deletions build_release.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ check_credential_file_exists ()
# Install the Python packages this script relies on, using pip3.
update_deps ()
{
# Upgrade the packaging tools themselves (-U upgrades already-installed packages).
pip3 install -U setuptools wheel
# Install everything listed in requirements-dev.txt ...
pip3 install -r requirements-dev.txt
# ... and then everything listed in requirements.txt.
pip3 install -r requirements.txt
}

Expand Down
21 changes: 15 additions & 6 deletions marian_client/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
WebSocketAddressException,
)

from marian_client.quote_manager import Quotes

GENERIC_WEBSOCKET_ERROR_CODE = 469


Expand Down Expand Up @@ -142,8 +144,11 @@ def _send_message(self, tokenized_sentence: str):
# since there is no hope if we can't connect
self._check_connection()

orignal_message = Quotes(tokenized_sentence)

try:
self.ws.send(tokenized_sentence)
# need to strip smart quotes before sending
self.ws.send(orignal_message.simplified)
except (
WebSocketConnectionClosedException,
WebSocketAddressException,
Expand Down Expand Up @@ -178,16 +183,20 @@ def _send_message(self, tokenized_sentence: str):
self.ws.connected = False

assert r is not None, "If r isn't set by here, we didn't send a request"
return success, r
# need to add back in smart quotes
corrected_message_requoted = orignal_message.requote_modified_string(r)
return success, corrected_message_requoted

def __call__(self, tokenized_sentence: str):

success, r = self._send_message(tokenized_sentence)

if self.debug and r is not None:
print(r.status_code, r.reason)

if success:
return True, r, (None, None)
else:
return False, None, (r.status_code, r.reason)
if r is not None:
if self.debug:
print(r.status_code, r.reason)
return False, None, (r.status_code, r.reason)
else:
return False, None, (GENERIC_WEBSOCKET_ERROR_CODE, "Something went wrong")
181 changes: 181 additions & 0 deletions marian_client/quote_manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
from typing import List, Tuple

from diff_match_patch import diff_match_patch


dmp = diff_match_patch()


class Quotes:
    """Replace quote look-alikes with ASCII quotes and restore them later.

    On construction the original string is scanned once.  Every character
    found in ``singles`` becomes ``'`` and every character found in
    ``doubles`` becomes ``"``; the result is stored in ``simplified``.  The
    original quote characters are remembered, in order, in
    ``quote_positions`` so that ``requote_modified_string`` can put them back
    into an edited version of the simplified text.

    Attributes:
        orig: The string passed to the constructor, unchanged.
        simplified: ``orig`` with every recognised quote replaced by its
            ASCII equivalent.
        quote_positions: The original quote characters in order of
            appearance; entry ``i`` is the ``i``-th quote of ``orig``.
    """

    # NOTE(review): in the copy of this file under review, one entry of
    # ``doubles`` was an unparseable bare quote and ``singles`` listed "`"
    # and "'" twice.  That pattern looks like fullwidth forms folded to
    # ASCII, so they are written below as escapes (U+FF40, U+FF07, U+FF02).
    # Confirm against the upstream file.
    singles = [
        "`",
        "΄",
        "'",
        "ˈ",
        "ˊ",
        "ᑊ",
        "ˋ",
        "ꞌ",
        "ᛌ",
        "𖽒",
        "𖽑",
        "‘",
        "’",
        "י",
        "՚",
        "‛",
        "՝",
        "\uff40",
        "\uff07",
        "′",
        "׳",
        "´",
        "ʹ",
        "˴",
        "ߴ",
        "‵",
        "ߵ",
        "ʻ",
        "ʼ",
        "᾽",
        "ʽ",
        "῾",
        "ʾ",
        "᾿",
    ]
    doubles = [
        '"',
        "\uff02",
        "〃",
        "ˮ",
        "ײ",
        "″",
        "״",
        "‶",
        "˶",
        "ʺ",
        "“",
        "”",
        "˝",
        "‟",
    ]
    ascii_single = "'"
    ascii_double = '"'

    def __init__(self, orignal_string: str):
        """Record the quotes of ``orignal_string`` and build ``simplified``.

        Args:
            orignal_string: Text that may contain non-ASCII quote characters.
        """
        self.orig = orignal_string
        # List of quotes, where index i means the ith quote in the string
        # is of type quote_positions[i].
        self.quote_positions: List[str] = []
        # Characters that will be joined into the simplified string.
        simplified_split: List[str] = []

        for char in self.orig:
            if char in Quotes.singles:
                self.quote_positions.append(char)
                simplified_split.append(Quotes.ascii_single)
            elif char in Quotes.doubles:
                self.quote_positions.append(char)
                simplified_split.append(Quotes.ascii_double)
            else:
                simplified_split.append(char)

        self.simplified = "".join(simplified_split)

    @staticmethod
    def is_quote(char: str) -> bool:
        """Return True if ``char`` is listed in ``singles`` or ``doubles``."""
        return char in Quotes.singles or char in Quotes.doubles

    @staticmethod
    def count_quotes(text: str) -> Tuple[int, int]:
        """Return ``(single_count, double_count)`` for the quotes in ``text``."""
        single_count = sum(char in Quotes.singles for char in text)
        double_count = sum(char in Quotes.doubles for char in text)
        return single_count, double_count

    def requote_same_quote_count(self, modified_string: str) -> str:
        """Restore original quotes positionally.

        The i-th quote found in ``modified_string`` is replaced by the i-th
        quote of the original string.  If ``modified_string`` holds more
        quotes than were recorded, the surplus quotes are left untouched
        rather than raising.

        Args:
            modified_string: Edited version of ``simplified``.

        Returns:
            ``modified_string`` with its quotes replaced by the originals.
        """
        quotes = iter(self.quote_positions)
        return "".join(
            next(quotes, char) if Quotes.is_quote(char) else char
            for char in modified_string
        )

    def requote_different_quote_count(self, modified_string: str) -> str:
        """Best-effort requoting when the edit changed the number of quotes.

        ``dmp`` is used to diff ``simplified`` against ``modified_string``,
        for example::

            dmp.diff_main("Yeah, he'd've done something 'intelligent' I guess.",
                          "Yeah, he's done something 'intelligent'")
            [(0, "Yeah, he'"),
             (-1, "d've"),
             (1, 's'),
             (0, " done something 'intelligent'"),
             (-1, ' I guess.')]

        The unchanged (0) and deleted (-1) segments together spell out
        ``simplified``, so walking them in order and consuming one recorded
        quote per quote character keeps the recorded quotes aligned with the
        text:

        * unchanged text: each quote gets its original character back;
        * deleted text: the original quotes it contained are consumed and
          remembered in case the next segment replaces them;
        * inserted text: if it directly follows a deletion with the same
          (single, double) quote counts, the deleted quotes are reused in
          order; otherwise the inserted text is kept exactly as received.

        Args:
            modified_string: Edited version of ``simplified``.

        Returns:
            ``modified_string`` with original quotes restored where they
            could be matched.
        """
        diff = dmp.diff_main(self.simplified, modified_string)
        quotes = iter(self.quote_positions)

        requoted: List[str] = []
        # Original quotes consumed by the deletion(s) immediately preceding
        # the current segment; cleared once an insertion or unchanged
        # segment has been processed.
        deleted_quotes: List[str] = []

        for kind, substr in diff:
            if kind == 0:
                # this part of the string has not been changed
                requoted.extend(
                    next(quotes, char) if Quotes.is_quote(char) else char
                    for char in substr
                )
                deleted_quotes = []
            elif kind == -1:
                # Drop the deleted quotes from the queue so that the quotes
                # after this point still line up with the text.
                deleted_quotes.extend(
                    next(quotes, char)
                    for char in substr
                    if Quotes.is_quote(char)
                )
            elif kind == 1:
                deleted_count = Quotes.count_quotes("".join(deleted_quotes))
                if deleted_count == Quotes.count_quotes(substr):
                    # the edit didn't change the quote count: reuse the
                    # styles of the quotes that were replaced
                    reusable = iter(deleted_quotes)
                    requoted.extend(
                        next(reusable, char) if Quotes.is_quote(char) else char
                        for char in substr
                    )
                else:
                    # Pure addition, or a replacement that changed the quote
                    # count: keep the new text (and its dumb quotes) as is.
                    requoted.append(substr)
                deleted_quotes = []
            # kind should only ever be 0, -1, or 1; anything else is ignored

        return "".join(requoted)

    def requote_modified_string(self, modified_string: str) -> str:
        """Put the original quote characters back into ``modified_string``.

        Args:
            modified_string: Text derived from ``simplified`` (possibly
                identical to it).

        Returns:
            The requoted text.  When nothing changed this is exactly
            ``orig``.
        """
        if self.simplified == modified_string:
            # the easiest case
            return self.orig

        if Quotes.count_quotes(modified_string) == Quotes.count_quotes(
            self.simplified
        ):
            # assume that if the count of single- and double-quotes hasn't
            # changed then this is not a coincidence.
            # An edit that deleted one quote but added another would be
            # requoted positionally here; considered unlikely.
            return self.requote_same_quote_count(modified_string)

        return self.requote_different_quote_count(modified_string)

2 changes: 1 addition & 1 deletion marian_client/version.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
# CHANGES HERE HAVE NO EFFECT: ../VERSION is the source of truth
__version__ = "0.14.0"
__version__ = "0.16.0"
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
diff-match-patch==20181111
websocket-client==0.56.0
66 changes: 66 additions & 0 deletions tests/test_quotes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from typing import Tuple

import pytest

from marian_client.quote_manager import Quotes


# Three parallel lists; entries at the same index belong together.

# Input sentences written with non-ASCII quote characters.
smart_quotes = [
"We’ve sent you a couple of emails, but we haven’t heard back.",
"Yeah, he’d’ve done something ʻsmartʼ I guess.",
"One of these customers calls in, saying, ‘I’m upset about a bad Yelp review I got and also I don’t understand how this part of my ads program is working.’",
"You get another phone call and it’s a business that says something.",
"They say, ‘Hey, I’d like to actually grow my business more and I’d like to spend more money.’",
]

# ASCII-quoted edits of the sentences above: some words differ, but the
# number of quotes in each sentence is the same as in its counterpart.
change_words_not_quotes = [
"We've sent you a couple of emails, but we haven't heard back.",
"Yeah, she'll've said something 'intelligent' I suppose.",
"A customers calls in and says, 'I'm not happy with a Yelp review I got and also I don't understand how this part of my ads program is working.'",
"You get a second phone call and it's a company that says something.",
"They say, 'I'd like to grow my business and I'd like to spend more money.'",
]

# Expected results: the edited sentences with the original quote characters
# from smart_quotes put back in.
requoted = [
"We’ve sent you a couple of emails, but we haven’t heard back.",
"Yeah, she’ll’ve said something ʻintelligentʼ I suppose.",
"A customers calls in and says, ‘I’m not happy with a Yelp review I got and also I don’t understand how this part of my ads program is working.’",
"You get a second phone call and it’s a company that says something.",
"They say, ‘I’d like to grow my business and I’d like to spend more money.’",
]


@pytest.mark.parametrize("text", smart_quotes)
def test_requote_no_change(text: str):
    """Requoting the untouched simplified text must give back the original."""
    quotes = Quotes(text)
    round_tripped = quotes.requote_modified_string(quotes.simplified)
    assert round_tripped == quotes.orig


@pytest.mark.parametrize(
    "before_after_correct", list(zip(smart_quotes, change_words_not_quotes, requoted))
)
def test_text_change_but_same_quote_count(before_after_correct: Tuple[str, str, str]):
    """Edits that keep the quote count get the original quotes back in order."""
    original, edited, expected = before_after_correct
    assert Quotes(original).requote_modified_string(edited) == expected


# (before, after, expected) triples in which the edit removes a quote, so the
# number of quotes in `after` differs from the number in `before`.
changed_quote_cases = [
(
"Yeah , he’d’ve done something “intelligent”",
'Yeah , he\'s done something "intelligent"',
"Yeah , he’s done something “intelligent”",
),
(
"There are many storeʼs like that one , which Iʼm a fan of",
"There are many stores like that one , which I'm a fan of",
"There are many stores like that one , which Iʼm a fan of",
),
]


@pytest.mark.parametrize("before_after_correct", changed_quote_cases)
def test_change_quote_count(before_after_correct: Tuple[str, str, str]):
    """Edits that change the quote count are still requoted as expected."""
    original, edited, expected = before_after_correct
    assert Quotes(original).requote_modified_string(edited) == expected

0 comments on commit 55fed90

Please sign in to comment.