Commit 2e3ebe7f authored by Vasilii Iakliushin's avatar Vasilii Iakliushin

Optimize Nokogiri search for post-processing pipeline

Contributes to https://gitlab.com/gitlab-org/gitlab/-/issues/271242

**Problem**

doc.search(...) operation has a bad performance for complex html
documents with many elements.

**Solution**

Avoid unnecessary doc.search calls.

1. Combine multiple search operations
2. Share calculated result between several filters
parent b5962d43
---
name: optimize_linkable_attributes
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/59983
rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/328696
milestone: '13.12'
type: development
group: group::source code
default_enabled: false
...@@ -10,19 +10,16 @@ module Banzai ...@@ -10,19 +10,16 @@ module Banzai
protected protected
def linkable_attributes def linkable_attributes
strong_memoize(:linkable_attributes) do if Feature.enabled?(:optimize_linkable_attributes, project, default_enabled: :yaml)
attrs = [] # Nokorigi Nodeset#search performs badly for documents with many nodes
#
attrs += doc.search('a:not(.gfm)').map do |el| # Here we store fetched attributes in the shared variable "result"
el.attribute('href') # This variable is passed through the chain of filters and can be
end # accessed by them
result[:linkable_attributes] ||= fetch_linkable_attributes
attrs += doc.search('img:not(.gfm), video:not(.gfm), audio:not(.gfm)').flat_map do |el| else
[el.attribute('src'), el.attribute('data-src')] strong_memoize(:linkable_attributes) do
end fetch_linkable_attributes
attrs.reject do |attr|
attr.blank? || attr.value.start_with?('//')
end end
end end
end end
...@@ -40,6 +37,16 @@ module Banzai ...@@ -40,6 +37,16 @@ module Banzai
def unescape_and_scrub_uri(uri) def unescape_and_scrub_uri(uri)
Addressable::URI.unescape(uri).scrub.delete("\0") Addressable::URI.unescape(uri).scrub.delete("\0")
end end
def fetch_linkable_attributes
attrs = []
attrs += doc.search('a:not(.gfm), img:not(.gfm), video:not(.gfm), audio:not(.gfm)').flat_map do |el|
[el.attribute('href'), el.attribute('src'), el.attribute('data-src')]
end
attrs.reject { |attr| attr.blank? || attr.value.start_with?('//') }
end
end end
end end
end end
...@@ -15,8 +15,16 @@ module Banzai ...@@ -15,8 +15,16 @@ module Banzai
def call def call
return doc if context[:system_note] return doc if context[:system_note]
linkable_attributes.each do |attr| if Feature.enabled?(:optimize_linkable_attributes, project, default_enabled: :yaml)
process_link_to_upload_attr(attr) # We exclude processed upload links from the linkable attributes to
# prevent further modifications by RepositoryLinkFilter
linkable_attributes.reject! do |attr|
process_link_to_upload_attr(attr)
end
else
linkable_attributes.each do |attr|
process_link_to_upload_attr(attr)
end
end end
doc doc
......
...@@ -3,24 +3,56 @@ ...@@ -3,24 +3,56 @@
require 'spec_helper' require 'spec_helper'
RSpec.describe Banzai::Pipeline::PostProcessPipeline do RSpec.describe Banzai::Pipeline::PostProcessPipeline do
context 'when a document only has upload links' do subject { described_class.call(doc, context) }
it 'does not make any Gitaly calls', :request_store do
markdown = <<-MARKDOWN.strip_heredoc let_it_be(:project) { create(:project, :public, :repository) }
[Relative Upload Link](/uploads/e90decf88d8f96fe9e1389afc2e4a91f/test.jpg)
![Relative Upload Image](/uploads/e90decf88d8f96fe9e1389afc2e4a91f/test.jpg) let(:context) { { project: project, ref: 'master' } }
MARKDOWN
context = { context 'when a document only has upload links' do
project: create(:project, :public, :repository), let(:doc) do
ref: 'master' <<-HTML.strip_heredoc
} <a href="/uploads/e90decf88d8f96fe9e1389afc2e4a91f/test.jpg">Relative Upload Link</a>
<img src="/uploads/e90decf88d8f96fe9e1389afc2e4a91f/test.jpg">
HTML
end
it 'does not make any Gitaly calls', :request_store do
Gitlab::GitalyClient.reset_counts Gitlab::GitalyClient.reset_counts
described_class.call(markdown, context) subject
expect(Gitlab::GitalyClient.get_request_count).to eq(0) expect(Gitlab::GitalyClient.get_request_count).to eq(0)
end end
end end
context 'when both upload and repository links are present' do
let(:html) do
<<-HTML.strip_heredoc
<a href="/uploads/e90decf88d8f96fe9e1389afc2e4a91f/test.jpg">Relative Upload Link</a>
<img src="/uploads/e90decf88d8f96fe9e1389afc2e4a91f/test.jpg">
<a href="/test.jpg">Just a link</a>
HTML
end
let(:doc) { HTML::Pipeline.parse(html) }
it 'searches for attributes only once' do
expect(doc).to receive(:search).once.and_call_original
subject
end
context 'when "optimize_linkable_attributes" is disabled' do
before do
stub_feature_flags(optimize_linkable_attributes: false)
end
it 'searches for attributes twice' do
expect(doc).to receive(:search).twice.and_call_original
subject
end
end
end
end end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment