Resolve "Wiki: links with spaces in the url render incorrectly with CommonMark"

198f8a2c · Brett Walker · Sean McGivern · 5dabe6d1 · 198f8a2c · 198f8a2c
Commit 198f8a2c authored Aug 21, 2018 by Brett Walker; committed by Sean McGivern on Aug 21, 2018
4 changed files
--- a/changelogs/unreleased/48869-wiki-slugs-with-spaces.yml
+++ b/changelogs/unreleased/48869-wiki-slugs-with-spaces.yml
+---
+title: Allow spaces in wiki markdown links when using CommonMark
+merge_request: 20417
+author:
+type: fixed
--- a/lib/banzai/filter/spaced_link_filter.rb
+++ b/lib/banzai/filter/spaced_link_filter.rb
+# frozen_string_literal: true
+
+require 'uri'
+
+module Banzai
+  module Filter
+    # HTML filter that makes markdown links with spaces in their URL work
+    # when the markdown engine is not Redcarpet.
+    #
+    # Based on Banzai::Filter::AutolinkFilter
+    #
+    # CommonMark does not allow spaces in the URL portion of a link.
+    # For example, `[example](page slug)` is not valid.  However,
+    # in our wikis, we support (via RedCarpet) this type of link, allowing
+    # wiki pages to be easily linked by their title.  This filter adds that functionality.
+    # The intent is for this to only be used in Wikis - in general, we want
+    # to adhere to CommonMark's spec.
+    #
+    # How it works: text nodes that contain a literal `](` and are not inside
+    # one of IGNORE_PARENTS are scanned for LINK_PATTERN.  Each match has the
+    # spaces in its URL replaced by `%20` and is rendered again through
+    # Banzai::Filter::MarkdownFilter; the resulting HTML replaces the text node.
+    #
+    class SpacedLinkFilter < HTML::Pipeline::Filter
+      # NOTE(review): no TagHelper method is called anywhere in this class as
+      # shown here - confirm whether this include is still needed.
+      include ActionView::Helpers::TagHelper
+
+      # Pattern to match a standard markdown link: `[text](url "title")`
+      #
+      # Capture groups:
+      #   1 - link text: one or more characters other than `]`
+      #   2 - URL: one or more characters other than `)` and `"`
+      #   3 - optional title: the text between double quotes, which must be
+      #       preceded by a single space
+      #
+      # Groups 1 and 2 require at least one character, so `[](slug)` and
+      # `[text]()` do not match.
+      #
+      # Rubular: http://rubular.com/r/z9EAHxYmKI
+      LINK_PATTERN = /\[([^\]]+)\]\(([^)"]+)(?: \"([^\"]+)\")?\)/
+
+      # Text matching LINK_PATTERN inside these elements will not be linked
+      IGNORE_PARENTS = %w(a code kbd pre script style).to_set
+
+      # The XPath query to use for finding text nodes to parse.
+      #
+      # Selects text nodes that have none of IGNORE_PARENTS as an ancestor and
+      # that contain the substring `](`.  The backslash in `\(` only escapes
+      # the parenthesis inside the %Q(...) literal so the delimiters stay
+      # balanced; it is not part of the resulting query string.
+      TEXT_QUERY = %Q(descendant-or-self::text()[
+        not(#{IGNORE_PARENTS.map { |p| "ancestor::#{p}" }.join(' or ')})
+        and contains(., ']\(')
+      ]).freeze
+
+      # Entry point of the filter.
+      #
+      # Returns `doc`.  The document is left untouched when
+      # `context[:markdown_engine]` is `:redcarpet`; otherwise every text node
+      # selected by TEXT_QUERY whose content changes under
+      # `spaced_link_filter` is replaced by the rewritten HTML.
+      def call
+        return doc if context[:markdown_engine] == :redcarpet
+
+        doc.xpath(TEXT_QUERY).each do |node|
+          # Work on the serialized (HTML-escaped) form of the text node
+          content = node.to_html
+
+          # TEXT_QUERY only checks for `](`; skip nodes without a full link
+          next unless content.match(LINK_PATTERN)
+
+          html = spaced_link_filter(content)
+
+          # Nothing was rewritten, so leave the node as it is
+          next if html == content
+
+          node.replace(html)
+        end
+
+        doc
+      end
+
+      private
+
+      # Renders a single markdown link whose URL may contain spaces.
+      #
+      # link - the matched markdown link text, e.g. `[example](page slug)`
+      #
+      # Returns `link` unchanged when it does not match LINK_PATTERN or when
+      # the text or URL group is missing.  Otherwise returns the result of
+      # Banzai::Filter::MarkdownFilter for the escaped link, with the first
+      # `<p>` and a trailing `</p>` removed.
+      def spaced_link_match(link)
+        match = LINK_PATTERN.match(link)
+        return link unless match && match[1] && match[2]
+
+        # escape the spaces in the url so that it's a valid markdown link,
+        # then run it through the markdown processor again, let it do its magic
+        text     = match[1]
+        new_link = match[2].gsub(' ', '%20')
+        # Re-attach the optional title in its original ` "title"` form
+        title    = match[3] ? " \"#{match[3]}\"" : ''
+        html     = Banzai::Filter::MarkdownFilter.call("[#{text}](#{new_link}#{title})", context)
+
+        # link is wrapped in a <p>, so strip that off
+        # NOTE(review): `chomp('</p>')` only removes the tag when the output
+        # ends exactly with `</p>` (no trailing newline) - confirm against
+        # MarkdownFilter's output.
+        html.sub('<p>', '').chomp('</p>')
+      end
+
+      # Replaces every LINK_PATTERN match in `text` with the value returned by
+      # `spaced_link_match`.
+      #
+      # text - the serialized HTML of a text node
+      #
+      # The marker is given both the HTML-unescaped text and the original
+      # text marked `html_safe`; presumably it matches against the former and
+      # applies the replacements to the latter - verify against
+      # Gitlab::StringRegexMarker.  The `left:` and `right:` keywords yielded
+      # by `mark` are not used here.
+      def spaced_link_filter(text)
+        Gitlab::StringRegexMarker.new(CGI.unescapeHTML(text), text.html_safe).mark(LINK_PATTERN) do |link, left:, right:|
+          spaced_link_match(link)
+        end
+      end
+    end
+  end
+end
--- a/lib/banzai/pipeline/wiki_pipeline.rb
+++ b/lib/banzai/pipeline/wiki_pipeline.rb
@@ -5,6 +5,7 @@ module Banzai
        @filters ||= begin
          super.insert_after(Filter::TableOfContentsFilter, Filter::GollumTagsFilter)
               .insert_before(Filter::TaskListFilter, Filter::WikiLinkFilter)
+               .insert_before(Filter::WikiLinkFilter, Filter::SpacedLinkFilter)
        end
      end
    end

--- a/spec/lib/banzai/filter/spaced_link_filter_spec.rb
+++ b/spec/lib/banzai/filter/spaced_link_filter_spec.rb
+require 'spec_helper'
+
+# Specs for Banzai::Filter::SpacedLinkFilter: markdown links whose URL
+# contains spaces are turned into anchors with the spaces encoded as `%20`.
+# `filter` is not defined in this file; presumably it comes from
+# FilterSpecHelper and runs the described filter over the given HTML.
+describe Banzai::Filter::SpacedLinkFilter do
+  include FilterSpecHelper
+
+  # Default fixture: a link whose URL portion contains a space
+  let(:link) { '[example](page slug)' }
+
+  it 'converts slug with spaces to a link' do
+    doc = filter("See #{link}")
+
+    expect(doc.at_css('a').text).to eq 'example'
+    expect(doc.at_css('a')['href']).to eq 'page%20slug'
+    # The <p> wrapper around the re-rendered link must have been stripped
+    expect(doc.at_css('p')).to eq nil
+  end
+
+  it 'converts slug with spaces and a title to a link' do
+    link = '[example](page slug "title")'
+    doc  = filter("See #{link}")
+
+    expect(doc.at_css('a').text).to eq 'example'
+    expect(doc.at_css('a')['href']).to eq 'page%20slug'
+    expect(doc.at_css('a')['title']).to eq 'title'
+    # The <p> wrapper around the re-rendered link must have been stripped
+    expect(doc.at_css('p')).to eq nil
+  end
+
+  # With the Redcarpet engine the output must equal the input
+  it 'does nothing when markdown_engine is redcarpet' do
+    exp = act = link
+    expect(filter(act, markdown_engine: :redcarpet).to_html).to eq exp
+  end
+
+  # Empty link text is not converted to a link
+  it 'does nothing with empty text' do
+    link = '[](page slug)'
+    doc  = filter("See #{link}")
+
+    expect(doc.at_css('a')).to eq nil
+  end
+
+  # An empty URL is not converted to a link
+  it 'does nothing with an empty slug' do
+    link = '[example]()'
+    doc  = filter("See #{link}")
+
+    expect(doc.at_css('a')).to eq nil
+  end
+
+  # Several links in the same text are each converted, for both a
+  # relative slug and an absolute URL
+  it 'converts multiple URLs' do
+    link1 = '[first](slug one)'
+    link2 = '[second](http://example.com/slug two)'
+    doc   = filter("See #{link1} and #{link2}")
+
+    found_links = doc.css('a')
+
+    expect(found_links.size).to eq(2)
+    expect(found_links[0].text).to eq 'first'
+    expect(found_links[0]['href']).to eq 'slug%20one'
+    expect(found_links[1].text).to eq 'second'
+    expect(found_links[1]['href']).to eq 'http://example.com/slug%20two'
+  end
+
+  # Generates one example per element listed in IGNORE_PARENTS: link-like
+  # text inside such an element must come out of the filter unchanged
+  described_class::IGNORE_PARENTS.each do |elem|
+    it "ignores valid links contained inside '#{elem}' element" do
+      exp = act = "<#{elem}>See #{link}</#{elem}>"
+
+      expect(filter(act).to_html).to eq exp
+    end
+  end
+end