Skip to content

Commit

Permalink
Fix URL linkifier grabbing full-width spaces and quotations (mastodon…
Browse files Browse the repository at this point in the history
  • Loading branch information
Gargron authored and hiyuki2578 committed Oct 2, 2019
1 parent 80adc73 commit 4b80e5a
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 5 deletions.
12 changes: 11 additions & 1 deletion app/lib/formatter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -204,12 +204,22 @@ def rewrite(text, entities)
result.flatten.join
end

# Matches a single character belonging to the Unicode Separator (\p{Z}) or
# Punctuation (\p{P}) general categories. utf8_friendly_extractor checks each
# character against it and leaves matching characters as they are instead of
# passing them through CGI.escape.
UNICODE_ESCAPE_BLACKLIST_RE = /\p{Z}|\p{P}/

def utf8_friendly_extractor(text, options = {})
old_to_new_index = [0]

escaped = text.chars.map do |c|
output = c.ord.to_s(16).length > 2 ? CGI.escape(c) : c
output = begin
if c.ord.to_s(16).length > 2 && UNICODE_ESCAPE_BLACKLIST_RE.match(c).nil?
CGI.escape(c)
else
c
end
end

old_to_new_index << old_to_new_index.last + output.length

output
end.join

Expand Down
4 changes: 2 additions & 2 deletions config/initializers/twitter_regex.rb
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
module Twitter
class Regex
REGEXEN[:valid_general_url_path_chars] = /[^\p{White_Space}\(\)\?]/iou
REGEXEN[:valid_url_path_ending_chars] = /[^\p{White_Space}\(\)\?!\*';:=\,\.\$%\[\]~&\|@]|(?:#{REGEXEN[:valid_url_balanced_parens]})/iou
REGEXEN[:valid_general_url_path_chars] = /[^\p{White_Space}<>\(\)\?]/iou
REGEXEN[:valid_url_path_ending_chars] = /[^\p{White_Space}\(\)\?!\*"'「」<>;:=\,\.\$%\[\]~&\|@]|(?:#{REGEXEN[:valid_url_balanced_parens]})/iou
REGEXEN[:valid_url_balanced_parens] = /
\(
(?:
Expand Down
44 changes: 42 additions & 2 deletions spec/lib/formatter_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,22 @@
end
end

# Input is a URL wrapped in ASCII double quotes. The expected output contains
# an href whose value is exactly the URL, i.e. the surrounding quotation marks
# are not part of the link target. The subject under test is defined outside
# this excerpt.
context 'given a URL in quotation marks' do
let(:text) { '"https://example.com/"' }

it 'does not match the quotation marks' do
is_expected.to include 'href="https://example.com/"'
end
end

# Input is a URL wrapped in "<" and ">". The expected output contains an href
# whose value is exactly the URL, i.e. neither angle bracket is part of the
# link target. The subject under test is defined outside this excerpt.
context 'given a URL in angle brackets' do
let(:text) { '<https://example.com/>' }

it 'does not match the angle brackets' do
is_expected.to include 'href="https://example.com/"'
end
end

context 'given a URL with Japanese path string' do
let(:text) { 'https://ja.wikipedia.org/wiki/日本' }

Expand All @@ -131,6 +147,22 @@
end
end

# Input is a URL immediately followed by U+3000 IDEOGRAPHIC SPACE (the
# "full-width space") and further text. The expected output contains an href
# whose value is exactly the URL, i.e. the link stops before the space.
# The subject under test is defined outside this excerpt.
context 'given a URL with a full-width space' do
  # The full-width space is written as an explicit \u escape so it cannot be
  # confused with, or silently normalised to, an ordinary ASCII space; with a
  # plain space this example would not exercise the case its name describes.
  let(:text) { "https://example.com/\u3000abc123" }

  it 'does not match the full-width space' do
    is_expected.to include 'href="https://example.com/"'
  end
end

# Input is a URL enclosed in the Japanese corner brackets 「 and 」. The
# expected output contains an href whose value is exactly the URL, i.e. the
# closing bracket is not part of the link target. The subject under test is
# defined outside this excerpt.
# NOTE(review): the sample text also has a "[" between the opening bracket and
# the URL — unclear from here whether that is intentional; confirm.
context 'given a URL in Japanese quotation marks' do
let(:text) { '「[https://example.org/」' }

it 'does not match the quotation marks' do
is_expected.to include 'href="https://example.org/"'
end
end

context 'given a URL with Simplified Chinese path string' do
let(:text) { 'https://baike.baidu.com/item/中华人民共和国' }

Expand All @@ -150,15 +182,23 @@
context 'given a URL containing unsafe code (XSS attack, visible part)' do
let(:text) { %q{http://example.com/b<del>b</del>} }

it 'escapes the HTML in the URL' do
it 'does not include the HTML in the URL' do
is_expected.to include '"http://example.com/b"'
end

it 'escapes the HTML' do
is_expected.to include '&lt;del&gt;b&lt;/del&gt;'
end
end

context 'given a URL containing unsafe code (XSS attack, invisible part)' do
let(:text) { %q{http://example.com/blahblahblahblah/a<script>alert("Hello")</script>} }

it 'escapes the HTML in the URL' do
it 'does not include the HTML in the URL' do
is_expected.to include '"http://example.com/blahblahblahblah/a"'
end

it 'escapes the HTML' do
is_expected.to include '&lt;script&gt;alert(&quot;Hello&quot;)&lt;/script&gt;'
end
end
Expand Down

0 comments on commit 4b80e5a

Please sign in to comment.