Commit cb55bc3c authored by Sean McGivern's avatar Sean McGivern

Match Rinku's behaviour for closing punctuation in links

Rinku 2.0.0 (the version we use) will remove the last character of a link if
it's a closing part of a punctuation pair (different types of parentheses and
quotes), unless both of the below are true:

1. The matching pair has different start and end characters.
2. There are equal numbers of both in the matched string (they don't have to be
   balanced).
parent 1a09d5cd
...@@ -25,7 +25,7 @@ module Banzai ...@@ -25,7 +25,7 @@ module Banzai
# period or comma for punctuation without those characters being included # period or comma for punctuation without those characters being included
# in the generated link. # in the generated link.
# #
# Rubular: http://rubular.com/r/cxjPyZc7Sb # Rubular: http://rubular.com/r/JzPhi6DCZp
LINK_PATTERN = %r{([a-z][a-z0-9\+\.-]+://[^\s>]+)(?<!,|\.)} LINK_PATTERN = %r{([a-z][a-z0-9\+\.-]+://[^\s>]+)(?<!,|\.)}
# Text matching LINK_PATTERN inside these elements will not be linked # Text matching LINK_PATTERN inside these elements will not be linked
...@@ -37,23 +37,17 @@ module Banzai ...@@ -37,23 +37,17 @@ module Banzai
and contains(., '://') and contains(., '://')
]).freeze ]).freeze
PUNCTUATION_PAIRS = {
"'" => "'",
'"' => '"',
')' => '(',
']' => '[',
'}' => '{'
}.freeze
def call def call
return doc if context[:autolink] == false return doc if context[:autolink] == false
text_parse
end
private
# Return true if any of the UNSAFE_PROTOCOLS strings are included in the URI scheme
def contains_unsafe?(scheme)
return false unless scheme
scheme = scheme.strip.downcase
Banzai::Filter::SanitizationFilter::UNSAFE_PROTOCOLS.any? { |protocol| scheme.include?(protocol) }
end
def text_parse
doc.xpath(TEXT_QUERY).each do |node| doc.xpath(TEXT_QUERY).each do |node|
content = node.to_html content = node.to_html
...@@ -69,6 +63,16 @@ module Banzai ...@@ -69,6 +63,16 @@ module Banzai
doc doc
end end
private
# Return true if any of the UNSAFE_PROTOCOLS strings are included in the URI scheme
def contains_unsafe?(scheme)
return false unless scheme
scheme = scheme.strip.downcase
Banzai::Filter::SanitizationFilter::UNSAFE_PROTOCOLS.any? { |protocol| scheme.include?(protocol) }
end
def autolink_match(match) def autolink_match(match)
# start by stripping out dangerous links # start by stripping out dangerous links
begin begin
...@@ -84,6 +88,22 @@ module Banzai ...@@ -84,6 +88,22 @@ module Banzai
match.gsub!(/((?:&[\w#]+;)+)\z/, '') match.gsub!(/((?:&[\w#]+;)+)\z/, '')
dropped = ($1 || '').html_safe dropped = ($1 || '').html_safe
# To match the behaviour of Rinku, if the matched link ends with a
# closing part of a matched pair of punctuation, we remove that trailing
# character unless there are an equal number of closing and opening
# characters in the link.
if match.end_with?(*PUNCTUATION_PAIRS.keys)
close_character = match[-1]
close_count = match.count(close_character)
open_character = PUNCTUATION_PAIRS[close_character]
open_count = match.count(open_character)
if open_count != close_count || open_character == close_character
dropped += close_character
match = match[0..-2]
end
end
options = link_options.merge(href: match) options = link_options.merge(href: match)
content_tag(:a, match.html_safe, options) + dropped content_tag(:a, match.html_safe, options) + dropped
end end
......
...@@ -4,6 +4,7 @@ describe Banzai::Filter::AutolinkFilter do ...@@ -4,6 +4,7 @@ describe Banzai::Filter::AutolinkFilter do
include FilterSpecHelper include FilterSpecHelper
let(:link) { 'http://about.gitlab.com/' } let(:link) { 'http://about.gitlab.com/' }
let(:quotes) { ['"', "'"] }
it 'does nothing when :autolink is false' do it 'does nothing when :autolink is false' do
exp = act = link exp = act = link
...@@ -15,16 +16,6 @@ describe Banzai::Filter::AutolinkFilter do ...@@ -15,16 +16,6 @@ describe Banzai::Filter::AutolinkFilter do
expect(filter(act).to_html).to eq exp expect(filter(act).to_html).to eq exp
end end
context 'when the input contains no links' do
it 'does not parse_html back the rinku returned value' do
act = HTML::Pipeline.parse('<p>This text contains no links to autolink</p>')
expect_any_instance_of(described_class).not_to receive(:parse_html)
filter(act).to_html
end
end
context 'Various schemes' do context 'Various schemes' do
it 'autolinks http' do it 'autolinks http' do
doc = filter("See #{link}") doc = filter("See #{link}")
...@@ -141,6 +132,45 @@ describe Banzai::Filter::AutolinkFilter do ...@@ -141,6 +132,45 @@ describe Banzai::Filter::AutolinkFilter do
expect(doc.at_css('a').text).to eq link expect(doc.at_css('a').text).to eq link
end end
it 'includes trailing punctuation when part of a balanced pair' do
described_class::PUNCTUATION_PAIRS.each do |close, open|
next if open.in?(quotes)
balanced_link = "#{link}#{open}abc#{close}"
balanced_actual = filter("See #{balanced_link}...")
unbalanced_link = "#{link}#{close}"
unbalanced_actual = filter("See #{unbalanced_link}...")
expect(balanced_actual.at_css('a').text).to eq(balanced_link)
expect(unescape(balanced_actual.to_html)).to eq(Rinku.auto_link("See #{balanced_link}..."))
expect(unbalanced_actual.at_css('a').text).to eq(link)
expect(unescape(unbalanced_actual.to_html)).to eq(Rinku.auto_link("See #{unbalanced_link}..."))
end
end
it 'removes trailing quotes' do
quotes.each do |quote|
balanced_link = "#{link}#{quote}abc#{quote}"
balanced_actual = filter("See #{balanced_link}...")
unbalanced_link = "#{link}#{quote}"
unbalanced_actual = filter("See #{unbalanced_link}...")
expect(balanced_actual.at_css('a').text).to eq(balanced_link[0...-1])
expect(unescape(balanced_actual.to_html)).to eq(Rinku.auto_link("See #{balanced_link}..."))
expect(unbalanced_actual.at_css('a').text).to eq(link)
expect(unescape(unbalanced_actual.to_html)).to eq(Rinku.auto_link("See #{unbalanced_link}..."))
end
end
it 'removes one closing punctuation mark when the punctuation in the link is unbalanced' do
complicated_link = "(#{link}(a'b[c'd]))'"
expected_complicated_link = %Q{(<a href="#{link}(a'b[c'd]))">#{link}(a'b[c'd]))</a>'}
actual = unescape(filter(complicated_link).to_html)
expect(actual).to eq(Rinku.auto_link(complicated_link))
expect(actual).to eq(expected_complicated_link)
end
it 'does not include trailing HTML entities' do it 'does not include trailing HTML entities' do
doc = filter("See &lt;&lt;&lt;#{link}&gt;&gt;&gt;") doc = filter("See &lt;&lt;&lt;#{link}&gt;&gt;&gt;")
...@@ -162,16 +192,27 @@ describe Banzai::Filter::AutolinkFilter do ...@@ -162,16 +192,27 @@ describe Banzai::Filter::AutolinkFilter do
end end
context 'when the link is inside a tag' do context 'when the link is inside a tag' do
it 'renders text after the link correctly for http' do %w[http rdar].each do |protocol|
doc = filter(ERB::Util.html_escape_once("<http://link><another>")) it "renders text after the link correctly for #{protocol}" do
doc = filter(ERB::Util.html_escape_once("<#{protocol}://link><another>"))
expect(doc.children.last.text).to include('<another>') expect(doc.children.last.text).to include('<another>')
end
end end
end
it 'renders text after the link correctly for not other protocol' do # Rinku does not escape these characters in HTML attributes, but content_tag
doc = filter(ERB::Util.html_escape_once("<rdar://link><another>")) # does. We don't care about that difference for these specs, though.
def unescape(html)
%w([ ] { }).each do |cgi_escape|
html.sub!(CGI.escape(cgi_escape), cgi_escape)
end
expect(doc.children.last.text).to include('<another>') quotes.each do |html_escape|
html.sub!(CGI.escape_html(html_escape), html_escape)
html.sub!(CGI.escape(html_escape), CGI.escape_html(html_escape))
end end
html
end end
end end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment