Commit 2e3ebe7f authored by Vasilii Iakliushin's avatar Vasilii Iakliushin

Optimize Nokogiri search for post-processing pipeline

Contributes to https://gitlab.com/gitlab-org/gitlab/-/issues/271242

**Problem**

The doc.search(...) operation performs poorly on complex HTML
documents with many elements.

**Solution**

Avoid unnecessary doc.search calls.

1. Combine multiple search operations
2. Share calculated result between several filters
parent b5962d43
---
name: optimize_linkable_attributes
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/59983
rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/328696
milestone: '13.12'
type: development
group: group::source code
default_enabled: false
......@@ -10,19 +10,16 @@ module Banzai
protected
# Returns the attributes of the document that this filter may rewrite.
#
# With the :optimize_linkable_attributes flag enabled, the list is computed
# once by fetch_linkable_attributes and cached in the shared `result` hash;
# otherwise it is memoized via strong_memoize(:linkable_attributes).
#
# NOTE(review): the else branch below appears to interleave pre-change lines
# (the inline doc.search calls) with the post-change fetch_linkable_attributes
# call from a diff view - confirm against the real file.
def linkable_attributes
if Feature.enabled?(:optimize_linkable_attributes, project, default_enabled: :yaml)
# Nokogiri's NodeSet#search performs badly for documents with many nodes.
#
# Here we store the fetched attributes in the shared variable "result".
# This variable is passed through the chain of filters and can be
# accessed by them.
result[:linkable_attributes] ||= fetch_linkable_attributes
else
strong_memoize(:linkable_attributes) do
attrs = []
attrs += doc.search('a:not(.gfm)').map do |el|
el.attribute('href')
end
attrs += doc.search('img:not(.gfm), video:not(.gfm), audio:not(.gfm)').flat_map do |el|
[el.attribute('src'), el.attribute('data-src')]
end
attrs.reject do |attr|
attr.blank? || attr.value.start_with?('//')
fetch_linkable_attributes
end
end
end
......@@ -40,6 +37,16 @@ module Banzai
def unescape_and_scrub_uri(uri)
Addressable::URI.unescape(uri).scrub.delete("\0")
end
# Collects the href, src and data-src attributes of every non-.gfm
# <a>, <img>, <video> and <audio> element using a single doc.search call.
#
# Missing (blank) attributes and attributes whose value starts with "//"
# are dropped from the result.
#
# @return [Array] the remaining attribute objects, in search order
def fetch_linkable_attributes
  selector = 'a:not(.gfm), img:not(.gfm), video:not(.gfm), audio:not(.gfm)'

  candidates = doc.search(selector).flat_map do |node|
    [node.attribute('href'), node.attribute('src'), node.attribute('data-src')]
  end

  candidates.reject do |candidate|
    candidate.blank? || candidate.value.start_with?('//')
  end
end
end
end
end
......@@ -15,9 +15,17 @@ module Banzai
# Runs process_link_to_upload_attr over every linkable attribute and
# returns the document.
#
# Returns the document untouched when context[:system_note] is set.
#
# @return [Object] doc
def call
  return doc if context[:system_note]

  optimized = Feature.enabled?(:optimize_linkable_attributes, project, default_enabled: :yaml)

  if optimized
    # We exclude processed upload links from the linkable attributes to
    # prevent further modifications by RepositoryLinkFilter: reject! drops
    # every attribute for which process_link_to_upload_attr is truthy.
    linkable_attributes.reject! { |link_attr| process_link_to_upload_attr(link_attr) }
  else
    linkable_attributes.each { |link_attr| process_link_to_upload_attr(link_attr) }
  end

  doc
end
......
......@@ -3,24 +3,56 @@
require 'spec_helper'
# Specs for Banzai::Pipeline::PostProcessPipeline: checks that upload-only
# documents trigger no Gitaly requests and counts how many times doc.search
# is invoked with the optimize_linkable_attributes flag on and off.
#
# NOTE(review): this block appears to interleave pre-change and post-change
# lines of a diff (e.g. new `subject`/`let_it_be` lines sit inside the old
# MARKDOWN heredoc) - confirm against the real spec file.
RSpec.describe Banzai::Pipeline::PostProcessPipeline do
context 'when a document only has upload links' do
it 'does not make any Gitaly calls', :request_store do
markdown = <<-MARKDOWN.strip_heredoc
[Relative Upload Link](/uploads/e90decf88d8f96fe9e1389afc2e4a91f/test.jpg)
subject { described_class.call(doc, context) }
let_it_be(:project) { create(:project, :public, :repository) }
![Relative Upload Image](/uploads/e90decf88d8f96fe9e1389afc2e4a91f/test.jpg)
MARKDOWN
let(:context) { { project: project, ref: 'master' } }
context = {
project: create(:project, :public, :repository),
ref: 'master'
}
context 'when a document only has upload links' do
let(:doc) do
<<-HTML.strip_heredoc
<a href="/uploads/e90decf88d8f96fe9e1389afc2e4a91f/test.jpg">Relative Upload Link</a>
<img src="/uploads/e90decf88d8f96fe9e1389afc2e4a91f/test.jpg">
HTML
end
it 'does not make any Gitaly calls', :request_store do
Gitlab::GitalyClient.reset_counts
described_class.call(markdown, context)
subject
expect(Gitlab::GitalyClient.get_request_count).to eq(0)
end
end
context 'when both upload and repository links are present' do
let(:html) do
<<-HTML.strip_heredoc
<a href="/uploads/e90decf88d8f96fe9e1389afc2e4a91f/test.jpg">Relative Upload Link</a>
<img src="/uploads/e90decf88d8f96fe9e1389afc2e4a91f/test.jpg">
<a href="/test.jpg">Just a link</a>
HTML
end
let(:doc) { HTML::Pipeline.parse(html) }
# Presumably runs with the flag enabled (not stubbed here - verify the
# suite default): the whole pipeline should call doc.search a single time.
it 'searches for attributes only once' do
expect(doc).to receive(:search).once.and_call_original
subject
end
context 'when "optimize_linkable_attributes" is disabled' do
before do
stub_feature_flags(optimize_linkable_attributes: false)
end
# With the flag stubbed off, doc.search is expected to run twice.
it 'searches for attributes twice' do
expect(doc).to receive(:search).twice.and_call_original
subject
end
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment