diff --git a/app/lib/formatter.rb b/app/lib/formatter.rb index aa737549f7c41f..ac912fc7b7dd16 100644 --- a/app/lib/formatter.rb +++ b/app/lib/formatter.rb @@ -204,12 +204,22 @@ def rewrite(text, entities) result.flatten.join end + UNICODE_ESCAPE_BLACKLIST_RE = /\p{Z}|\p{P}/ + def utf8_friendly_extractor(text, options = {}) old_to_new_index = [0] escaped = text.chars.map do |c| - output = c.ord.to_s(16).length > 2 ? CGI.escape(c) : c + output = begin + if c.ord.to_s(16).length > 2 && UNICODE_ESCAPE_BLACKLIST_RE.match(c).nil? + CGI.escape(c) + else + c + end + end + old_to_new_index << old_to_new_index.last + output.length + output end.join diff --git a/config/initializers/twitter_regex.rb b/config/initializers/twitter_regex.rb index 0e8f5bfeb47353..0ddbbee9828bfd 100644 --- a/config/initializers/twitter_regex.rb +++ b/config/initializers/twitter_regex.rb @@ -1,7 +1,7 @@ module Twitter class Regex - REGEXEN[:valid_general_url_path_chars] = /[^\p{White_Space}\(\)\?]/iou - REGEXEN[:valid_url_path_ending_chars] = /[^\p{White_Space}\(\)\?!\*';:=\,\.\$%\[\]~&\|@]|(?:#{REGEXEN[:valid_url_balanced_parens]})/iou + REGEXEN[:valid_general_url_path_chars] = /[^\p{White_Space}<>\(\)\?]/iou + REGEXEN[:valid_url_path_ending_chars] = /[^\p{White_Space}\(\)\?!\*"'「」<>;:=\,\.\$%\[\]~&\|@]|(?:#{REGEXEN[:valid_url_balanced_parens]})/iou REGEXEN[:valid_url_balanced_parens] = / \( (?: diff --git a/spec/lib/formatter_spec.rb b/spec/lib/formatter_spec.rb index 8fb6695a9bf12c..96d2fc7e06c247 100644 --- a/spec/lib/formatter_spec.rb +++ b/spec/lib/formatter_spec.rb @@ -115,6 +115,22 @@ end end + context 'given a URL in quotation marks' do + let(:text) { '"https://example.com/"' } + + it 'does not match the quotation marks' do + is_expected.to include 'href="https://example.com/"' + end + end + + context 'given a URL in angle brackets' do + let(:text) { '' } + + it 'does not match the angle brackets' do + is_expected.to include 'href="https://example.com/"' + end + end + context 'given a URL with Japanese path string' do let(:text) { 'https://ja.wikipedia.org/wiki/日本' } @@ -131,6 +147,22 @@ end end + context 'given a URL with a full-width space' do + let(:text) { 'https://example.com/ abc123' } + + it 'does not match the full-width space' do + is_expected.to include 'href="https://example.com/"' + end + end + + context 'given a URL in Japanese quotation marks' do + let(:text) { '「[https://example.org/」' } + + it 'does not match the quotation marks' do + is_expected.to include 'href="https://example.org/"' + end + end + context 'given a URL with Simplified Chinese path string' do let(:text) { 'https://baike.baidu.com/item/中华人民共和国' } @@ -150,7 +182,11 @@ context 'given a URL containing unsafe code (XSS attack, visible part)' do let(:text) { %q{http://example.com/bb} } - it 'escapes the HTML in the URL' do + it 'does not include the HTML in the URL' do + is_expected.to include '"http://example.com/b"' + end + + it 'escapes the HTML' do is_expected.to include '<del>b</del>' end end @@ -158,7 +194,11 @@ context 'given a URL containing unsafe code (XSS attack, invisible part)' do let(:text) { %q{http://example.com/blahblahblahblah/a} } - it 'escapes the HTML in the URL' do + it 'does not include the HTML in the URL' do + is_expected.to include '"http://example.com/blahblahblahblah/a"' + end + + it 'escapes the HTML' do is_expected.to include '<script>alert("Hello")</script>' end end