Match Rinku's behaviour for closing punctuation in links

Rinku 2.0.0 (the version we use) will remove the last character of a link if it's a closing part of a punctuation pair (different types of parentheses and quotes), unless both of the below are true: 1. The matching pair has different start and end characters. 2. There are equal numbers of both in the matched string (they don't have to be balanced).

Match Rinku's behaviour for closing punctuation in links
Rinku 2.0.0 (the version we use) will remove the last character of a link if it's a closing part of a punctuation pair (different types of parentheses and quotes), unless both of the below are true: 1. The matching pair has different start and end characters. 2. There are equal numbers of both in the matched string (they don't have to be balanced).
cb55bc3c · Sean McGivern · 1a09d5cd · cb55bc3c · cb55bc3c
Commit cb55bc3c authored Feb 22, 2018 by Sean McGivern
Showing with 92 additions and 31 deletions

lib/banzai/filter/autolink_filter.rb lib/banzai/filter/autolink_filter.rb +35 -15

spec/lib/banzai/filter/autolink_filter_spec.rb spec/lib/banzai/filter/autolink_filter_spec.rb +57 -16

No files found.
--- a/lib/banzai/filter/autolink_filter.rb
+++ b/lib/banzai/filter/autolink_filter.rb
@@ -25,7 +25,7 @@ module Banzai
      # period or comma for punctuation without those characters being included
      # in the generated link.
      #
-      # Rubular: http://rubular.com/r/cxjPyZc7Sb
+      # Rubular: http://rubular.com/r/JzPhi6DCZp
      LINK_PATTERN = %r{([a-z][a-z0-9\+\.-]+://[^\s>]+)(?<!,|\.)}

      # Text matching LINK_PATTERN inside these elements will not be linked
@@ -37,23 +37,17 @@ module Banzai
        and contains(., '://')
      ]).freeze

+      PUNCTUATION_PAIRS = {
+        "'" => "'",
+        '"' => '"',
+        ')' => '(',
+        ']' => '[',
+        '}' => '{'
+      }.freeze
+
      def call
        return doc if context[:autolink] == false

-        text_parse
-      end
-
-      private
-
-      # Return true if any of the UNSAFE_PROTOCOLS strings are included in the URI scheme
-      def contains_unsafe?(scheme)
-        return false unless scheme
-
-        scheme = scheme.strip.downcase
-        Banzai::Filter::SanitizationFilter::UNSAFE_PROTOCOLS.any? { |protocol| scheme.include?(protocol) }
-      end
-
-      def text_parse
        doc.xpath(TEXT_QUERY).each do |node|
          content = node.to_html

@@ -69,6 +63,16 @@ module Banzai
        doc
      end

+      private
+
+      # Return true if any of the UNSAFE_PROTOCOLS strings are included in the URI scheme
+      def contains_unsafe?(scheme)
+        return false unless scheme
+
+        scheme = scheme.strip.downcase
+        Banzai::Filter::SanitizationFilter::UNSAFE_PROTOCOLS.any? { |protocol| scheme.include?(protocol) }
+      end
+
      def autolink_match(match)
        # start by stripping out dangerous links
        begin
@@ -84,6 +88,22 @@ module Banzai
        match.gsub!(/((?:&[\w#]+;)+)\z/, '')
        dropped = ($1 || '').html_safe

+        # To match the behaviour of Rinku, if the matched link ends with a
+        # closing part of a matched pair of punctuation, we remove that trailing
+        # character unless there are an equal number of closing and opening
+        # characters in the link.
+        if match.end_with?(*PUNCTUATION_PAIRS.keys)
+          close_character = match[-1]
+          close_count = match.count(close_character)
+          open_character = PUNCTUATION_PAIRS[close_character]
+          open_count = match.count(open_character)
+
+          if open_count != close_count || open_character == close_character
+            dropped += close_character
+            match = match[0..-2]
+          end
+        end
+
        options = link_options.merge(href: match)
        content_tag(:a, match.html_safe, options) + dropped
      end

--- a/spec/lib/banzai/filter/autolink_filter_spec.rb
+++ b/spec/lib/banzai/filter/autolink_filter_spec.rb
@@ -4,6 +4,7 @@ describe Banzai::Filter::AutolinkFilter do
  include FilterSpecHelper

  let(:link) { 'http://about.gitlab.com/' }
+  let(:quotes) { ['"', "'"] }

  it 'does nothing when :autolink is false' do
    exp = act = link
@@ -15,16 +16,6 @@ describe Banzai::Filter::AutolinkFilter do
    expect(filter(act).to_html).to eq exp
  end

-  context 'when the input contains no links' do
-    it 'does not parse_html back the rinku returned value' do
-      act = HTML::Pipeline.parse('<p>This text contains no links to autolink</p>')
-
-      expect_any_instance_of(described_class).not_to receive(:parse_html)
-
-      filter(act).to_html
-    end
-  end
-
  context 'Various schemes' do
    it 'autolinks http' do
      doc = filter("See #{link}")
@@ -141,6 +132,45 @@ describe Banzai::Filter::AutolinkFilter do
      expect(doc.at_css('a').text).to eq link
    end

+    it 'includes trailing punctuation when part of a balanced pair' do
+      described_class::PUNCTUATION_PAIRS.each do |close, open|
+        next if open.in?(quotes)
+
+        balanced_link = "#{link}#{open}abc#{close}"
+        balanced_actual = filter("See #{balanced_link}...")
+        unbalanced_link = "#{link}#{close}"
+        unbalanced_actual = filter("See #{unbalanced_link}...")
+
+        expect(balanced_actual.at_css('a').text).to eq(balanced_link)
+        expect(unescape(balanced_actual.to_html)).to eq(Rinku.auto_link("See #{balanced_link}..."))
+        expect(unbalanced_actual.at_css('a').text).to eq(link)
+        expect(unescape(unbalanced_actual.to_html)).to eq(Rinku.auto_link("See #{unbalanced_link}..."))
+      end
+    end
+
+    it 'removes trailing quotes' do
+      quotes.each do |quote|
+        balanced_link = "#{link}#{quote}abc#{quote}"
+        balanced_actual = filter("See #{balanced_link}...")
+        unbalanced_link = "#{link}#{quote}"
+        unbalanced_actual = filter("See #{unbalanced_link}...")
+
+        expect(balanced_actual.at_css('a').text).to eq(balanced_link[0...-1])
+        expect(unescape(balanced_actual.to_html)).to eq(Rinku.auto_link("See #{balanced_link}..."))
+        expect(unbalanced_actual.at_css('a').text).to eq(link)
+        expect(unescape(unbalanced_actual.to_html)).to eq(Rinku.auto_link("See #{unbalanced_link}..."))
+      end
+    end
+
+    it 'removes one closing punctuation mark when the punctuation in the link is unbalanced' do
+      complicated_link = "(#{link}(a'b[c'd]))'"
+      expected_complicated_link = %Q{(<a href="#{link}(a'b[c'd]))">#{link}(a'b[c'd]))</a>'}
+      actual = unescape(filter(complicated_link).to_html)
+
+      expect(actual).to eq(Rinku.auto_link(complicated_link))
+      expect(actual).to eq(expected_complicated_link)
+    end
+
    it 'does not include trailing HTML entities' do
      doc = filter("See &lt;&lt;&lt;#{link}&gt;&gt;&gt;")

@@ -162,16 +192,27 @@ describe Banzai::Filter::AutolinkFilter do
  end

  context 'when the link is inside a tag' do
-    it 'renders text after the link correctly for http' do
-      doc = filter(ERB::Util.html_escape_once("<http://link><another>"))
+    %w[http rdar].each do |protocol|
+      it "renders text after the link correctly for #{protocol}" do
+        doc = filter(ERB::Util.html_escape_once("<#{protocol}://link><another>"))

-      expect(doc.children.last.text).to include('<another>')
+        expect(doc.children.last.text).to include('<another>')
+      end
    end
+  end

-    it 'renders text after the link correctly for not other protocol' do
-      doc = filter(ERB::Util.html_escape_once("<rdar://link><another>"))
+  # Rinku does not escape these characters in HTML attributes, but content_tag
+  # does. We don't care about that difference for these specs, though.
+  def unescape(html)
+    %w([ ] { }).each do |cgi_escape|
+      html.sub!(CGI.escape(cgi_escape), cgi_escape)
+    end

-      expect(doc.children.last.text).to include('<another>')
+    quotes.each do |html_escape|
+      html.sub!(CGI.escape_html(html_escape), html_escape)
+      html.sub!(CGI.escape(html_escape), CGI.escape_html(html_escape))
    end
+
+    html
  end
 end