-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #4 from Qordobacode/ENG-6931--smart-quotes-bad
Fix smart quote handling
- Loading branch information
Showing
7 changed files
with
266 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
0.14.0 | ||
0.16.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,181 @@ | ||
from typing import List, Tuple | ||
|
||
from diff_match_patch import diff_match_patch | ||
|
||
|
||
# Module-level diff_match_patch instance; Quotes.requote_different_quote_count
# calls its diff_main() to diff the simplified text against the edited text.
dmp = diff_match_patch() | ||
|
||
|
||
class Quotes:
    """Remember the typographic quotes of a string so they can be restored.

    The constructor builds ``simplified``, a copy of the input in which every
    quote look-alike is replaced by plain ASCII ``'`` or ``"``, and records the
    original quote characters in order of appearance.  After the simplified
    text has been edited elsewhere, ``requote_modified_string`` puts the
    original quote characters back on a best-effort basis.

    Attributes:
        orig: the string passed to the constructor, untouched.
        simplified: ``orig`` with every quote look-alike replaced by ASCII.
        quote_positions: the quote characters of ``orig`` in order; entry
            ``i`` is the glyph used by the ``i``-th quote of the string.
    """

    # Characters treated as single-quote look-alikes.
    singles = [
        "`",
        "΄",
        "'",
        "ˈ",
        "ˊ",
        "ᑊ",
        "ˋ",
        "ꞌ",
        "ᛌ",
        "𖽒",
        "𖽑",
        "‘",
        "’",
        "י",
        "՚",
        "‛",
        "՝",
        # NOTE(review): these two entries showed up as exact duplicates of the
        # ASCII "`" and "'" above, which looks like compatibility
        # normalisation of the fullwidth forms U+FF40 / U+FF07 -- confirm.
        "\uff40",
        "\uff07",
        "′",
        "׳",
        "´",
        "ʹ",
        "˴",
        "ߴ",
        "‵",
        "ߵ",
        "ʻ",
        "ʼ",
        "᾽",
        "ʽ",
        "῾",
        "ʾ",
        "᾿",
    ]
    # Characters treated as double-quote look-alikes.
    doubles = [
        '"',
        # NOTE(review): this entry showed up as a bare ASCII double quote
        # between two double quotes (not a valid string literal); presumably
        # it was the fullwidth quotation mark U+FF02 -- confirm.
        "\uff02",
        "〃",
        "ˮ",
        "ײ",
        "″",
        "״",
        "‶",
        "˶",
        "ʺ",
        "“",
        "”",
        "˝",
        "‟",
    ]
    ascii_single = "'"
    ascii_double = '"'

    def __init__(self, orignal_string: str):
        """Record the quotes of *orignal_string* and build its ASCII form.

        The misspelt parameter name is kept so keyword callers keep working.
        """
        self.orig = orignal_string
        # quote_positions[i] is the glyph of the i-th quote in the string.
        self.quote_positions: List[str] = [
            char for char in orignal_string if Quotes.is_quote(char)
        ]
        self.simplified = "".join(
            Quotes._to_ascii(char) for char in orignal_string
        )

    @staticmethod
    def _to_ascii(char: str) -> str:
        """Map a quote look-alike to its ASCII quote; return anything else as is."""
        if char in Quotes.singles:
            return Quotes.ascii_single
        if char in Quotes.doubles:
            return Quotes.ascii_double
        return char

    @staticmethod
    def _restore(text: str, glyphs: List[str]) -> str:
        """Replace each quote in *text* by the next unused glyph of its kind.

        Single quotes draw from the single-quote glyphs in *glyphs*, double
        quotes from the double-quote glyphs, each in order.  A quote for which
        no glyph of its kind is left is kept unchanged.
        """
        single_glyphs = iter([g for g in glyphs if g in Quotes.singles])
        double_glyphs = iter([g for g in glyphs if g in Quotes.doubles])
        restored: List[str] = []
        for char in text:
            if char in Quotes.singles:
                restored.append(next(single_glyphs, char))
            elif char in Quotes.doubles:
                restored.append(next(double_glyphs, char))
            else:
                restored.append(char)
        return "".join(restored)

    @staticmethod
    def is_quote(char: str) -> bool:
        """Return True when *char* is a single- or double-quote look-alike."""
        return char in Quotes.singles or char in Quotes.doubles

    @staticmethod
    def count_quotes(text: str) -> Tuple[int, int]:
        """Return ``(single_count, double_count)`` for the quotes in *text*."""
        single_count = sum(char in Quotes.singles for char in text)
        double_count = sum(char in Quotes.doubles for char in text)
        return single_count, double_count

    def requote_same_quote_count(self, modified_string: str) -> str:
        """Restore quotes when the edit kept the number of quotes of each kind.

        The n-th single quote of *modified_string* receives the n-th original
        single-quote glyph, and likewise for double quotes, so a double quote
        can never end up with a single-quote glyph even if the edit reordered
        them.  Surplus quotes (no glyph left) are kept as they are.
        """
        return Quotes._restore(modified_string, self.quote_positions)

    def requote_different_quote_count(self, modified_string: str) -> str:
        """Best-effort restore when the edit changed the number of quotes.

        The simplified original is diffed against *modified_string* and the
        resulting segments are re-assembled by ``_requote_from_diff``.  For
        example, diffing ``"Yeah, he'd've done something 'intelligent' I
        guess."`` against ``"Yeah, he's done something 'intelligent'"`` yields::

            [(0, "Yeah, he'"),
             (-1, "d've"),
             (1, 's'),
             (0, " done something 'intelligent'"),
             (-1, ' I guess.')]
        """
        diff = dmp.diff_main(self.simplified, modified_string)
        return self._requote_from_diff(diff)

    def _requote_from_diff(self, diff: List[Tuple[int, str]]) -> str:
        """Rebuild the edited text from *diff*, restoring original glyphs.

        *diff* is a list of ``(kind, text)`` pairs going from ``simplified``
        to the edited string, with kind ``0`` = unchanged, ``-1`` = deleted
        and ``1`` = inserted.  Unchanged and deleted segments together spell
        out ``simplified``, so every quote in them lines up with the next
        entry of ``quote_positions``:

        * unchanged text gets its original glyphs back;
        * deleted text consumes (discards) its glyphs, keeping later quotes
          aligned;
        * inserted text that directly follows a deletion with the same number
          of single and double quotes reuses the deleted glyphs; any other
          inserted text is emitted untouched, leaving its quotes as typed.
        """
        remaining = iter(self.quote_positions)
        pieces: List[str] = []
        # Glyphs removed by the deletion immediately preceding the current
        # segment; only such a deletion can turn an insertion into a
        # replacement.
        deleted: List[str] = []
        for kind, substr in diff:
            if kind == 0:
                deleted = []
                pieces.extend(
                    next(remaining, char) if Quotes.is_quote(char) else char
                    for char in substr
                )
            elif kind == -1:
                deleted.extend(
                    next(remaining, char)
                    for char in substr
                    if Quotes.is_quote(char)
                )
            elif kind == 1:
                same_counts = Quotes.count_quotes(substr) == Quotes.count_quotes(
                    "".join(deleted)
                )
                if deleted and same_counts:
                    pieces.append(Quotes._restore(substr, deleted))
                else:
                    pieces.append(substr)
                deleted = []
            # Any other kind is not produced by the diff engine; ignore it.
        return "".join(pieces)

    def requote_modified_string(self, modified_string: str) -> str:
        """Return *modified_string* with the original quote glyphs restored.

        Three cases, from cheapest to most involved: the text is unchanged
        (return the original), the number of single and double quotes is
        unchanged (restore by position), or the counts differ (restore via a
        diff).  Equal counts are assumed not to be a coincidence; an edit that
        deletes one quote and adds another of the same kind is not detected.
        """
        if modified_string == self.simplified:
            return self.orig
        if Quotes.count_quotes(modified_string) == Quotes.count_quotes(
            self.simplified
        ):
            return self.requote_same_quote_count(modified_string)
        return self.requote_different_quote_count(modified_string)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,2 @@ | ||
# CHANGES HERE HAVE NO EFFECT: ../VERSION is the source of truth | ||
__version__ = "0.14.0" | ||
__version__ = "0.16.0" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
diff-match-patch==20181111 | ||
websocket-client==0.56.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
from typing import Tuple | ||
|
||
import pytest | ||
|
||
from marian_client.quote_manager import Quotes | ||
|
||
|
||
# Source sentences containing typographic quotes. Used directly by
# test_requote_no_change and as the "before" column of the
# same-quote-count test.
smart_quotes = [ | ||
"We’ve sent you a couple of emails, but we haven’t heard back.", | ||
"Yeah, he’d’ve done something ʻsmartʼ I guess.", | ||
"One of these customers calls in, saying, ‘I’m upset about a bad Yelp review I got and also I don’t understand how this part of my ads program is working.’", | ||
"You get another phone call and it’s a business that says something.", | ||
"They say, ‘Hey, I’d like to actually grow my business more and I’d like to spend more money.’", | ||
] | ||
|
||
# ASCII-quoted edits of the sentences in smart_quotes (same index): words
# differ but the number of single and double quotes is the same. Used as the
# "after" column of the same-quote-count test.
change_words_not_quotes = [ | ||
"We've sent you a couple of emails, but we haven't heard back.", | ||
"Yeah, she'll've said something 'intelligent' I suppose.", | ||
"A customers calls in and says, 'I'm not happy with a Yelp review I got and also I don't understand how this part of my ads program is working.'", | ||
"You get a second phone call and it's a company that says something.", | ||
"They say, 'I'd like to grow my business and I'd like to spend more money.'", | ||
] | ||
|
||
# Expected results (same index as change_words_not_quotes): the edited wording
# with the typographic quotes of the matching smart_quotes entry put back.
requoted = [ | ||
"We’ve sent you a couple of emails, but we haven’t heard back.", | ||
"Yeah, she’ll’ve said something ʻintelligentʼ I suppose.", | ||
"A customers calls in and says, ‘I’m not happy with a Yelp review I got and also I don’t understand how this part of my ads program is working.’", | ||
"You get a second phone call and it’s a company that says something.", | ||
"They say, ‘I’d like to grow my business and I’d like to spend more money.’", | ||
] | ||
|
||
|
||
@pytest.mark.parametrize("text", smart_quotes)
def test_requote_no_change(text: str):
    """Requoting the untouched simplified text must give back the original."""
    manager = Quotes(text)
    restored = manager.requote_modified_string(manager.simplified)
    assert restored == manager.orig
|
||
|
||
@pytest.mark.parametrize(
    "before_after_correct", list(zip(smart_quotes, change_words_not_quotes, requoted))
)
def test_text_change_but_same_quote_count(before_after_correct: Tuple[str, str, str]):
    """Edits that keep the quote counts must get every original glyph back."""
    original_text, edited_text, expected_text = before_after_correct
    restored = Quotes(original_text).requote_modified_string(edited_text)
    assert restored == expected_text
|
||
|
||
# (original, edited, expected) triples in which the edit removes a quote, so
# the quote counts of the original and the edited text differ.
# NOTE(review): in the second triple both original quotes are the same glyph,
# so that case cannot tell which of the two original quotes was restored.
changed_quote_cases = [ | ||
( | ||
"Yeah , he’d’ve done something “intelligent”", | ||
'Yeah , he\'s done something "intelligent"', | ||
"Yeah , he’s done something “intelligent”", | ||
), | ||
( | ||
"There are many storeʼs like that one , which Iʼm a fan of", | ||
"There are many stores like that one , which I'm a fan of", | ||
"There are many stores like that one , which Iʼm a fan of", | ||
), | ||
] | ||
|
||
|
||
@pytest.mark.parametrize("before_after_correct", changed_quote_cases)
def test_change_quote_count(before_after_correct: Tuple[str, str, str]):
    """Edits that change the number of quotes are requoted on a best-effort basis."""
    original_text, edited_text, expected_text = before_after_correct
    restored = Quotes(original_text).requote_modified_string(edited_text)
    assert restored == expected_text