Commit 55f05e6e authored by Stan Hu's avatar Stan Hu

Merge branch 'kamil-improve-import-export' into 'master'

Improve performance and (memory usage (a little)) of project export

See merge request gitlab-org/gitlab-ce!32423
parents 383f3635 0e56c1e7
---
title: Reduce N+1 when doing project export
merge_request: 32423
author:
type: performance
......@@ -8,6 +8,7 @@ module Gitlab
@included_attributes = config[:included_attributes] || {}
@excluded_attributes = config[:excluded_attributes] || {}
@methods = config[:methods] || {}
@preloads = config[:preloads] || {}
end
def find_root(model_key)
......@@ -29,10 +30,26 @@ module Gitlab
only: @included_attributes[model_key],
except: @excluded_attributes[model_key],
methods: @methods[model_key],
include: resolve_model_tree(model_tree)
include: resolve_model_tree(model_tree),
preload: resolve_preloads(model_key, model_tree)
}.compact
end
def resolve_preloads(model_key, model_tree)
model_tree
.map { |submodel_key, submodel_tree| resolve_preload(model_key, submodel_key, submodel_tree) }
.compact
.to_h
.deep_merge(@preloads[model_key].to_h)
.presence
end
def resolve_preload(parent_model_key, model_key, model_tree)
return if @methods[parent_model_key]&.include?(model_key)
[model_key, resolve_preloads(model_key, model_tree)]
end
def resolve_model_tree(model_tree)
return unless model_tree
......
# frozen_string_literal: true
# ActiveModel::Serialization (https://github.com/rails/rails/blob/v5.0.7/activemodel/lib/active_model/serialization.rb#L184)
# is simple in that it recursively calls `as_json` on each object to
# serialize everything. However, for a model like a Project, this can
# generate a query for every single association, which can add up to tens
# of thousands of queries and lead to memory bloat.
#
# To improve this, we can do several things:
# 1. Use the option tree in http://api.rubyonrails.org/classes/ActiveModel/Serializers/JSON.html
# to generate the necessary preload clauses.
#
# 2. We observe that a single project has many issues, merge requests,
# etc. Instead of serializing everything at once, which could lead to
# database timeouts and high memory usage, we take each top-level
# association and serialize the data in batches.
#
# For example, we serialize the first 100 issues and preload all of
# their associated events, notes, etc. before moving onto the next
# batch. When we're done, we serialize merge requests in the same way.
# We repeat this pattern for the remaining associations specified in
# import_export.yml.
module Gitlab
module ImportExport
class FastHashSerializer
attr_reader :subject, :tree
BATCH_SIZE = 100
def initialize(subject, tree, batch_size: BATCH_SIZE)
@subject = subject
@batch_size = batch_size
@tree = tree
end
# Serializes the subject into a Hash for the given option tree
# (e.g. Project#as_json)
def execute
simple_serialize.merge(serialize_includes)
end
private
def simple_serialize
subject.as_json(
tree.merge(include: nil, preloads: nil))
end
def serialize_includes
return {} unless includes
includes
.map(&method(:serialize_include_definition))
.compact
.to_h
end
# definition:
# { labels: { includes: ... } }
def serialize_include_definition(definition)
raise ArgumentError, 'definition needs to be Hash' unless definition.is_a?(Hash)
raise ArgumentError, 'definition needs to have exactly one Hash element' unless definition.one?
key = definition.first.first
options = definition.first.second
record = subject.public_send(key) # rubocop: disable GitlabSecurity/PublicSend
return unless record
serialized_record = serialize_record(key, record, options)
return unless serialized_record
# `#as_json` always returns keys as `strings`
[key.to_s, serialized_record]
end
def serialize_record(key, record, options)
unless record.respond_to?(:as_json)
raise "Invalid type of #{key} is #{record.class}"
end
# no has-many relation
unless record.is_a?(ActiveRecord::Relation)
return record.as_json(options)
end
# has-many relation
data = []
record.in_batches(of: @batch_size) do |batch| # rubocop:disable Cop/InBatches
batch = batch.preload(preloads[key]) if preloads&.key?(key)
data += batch.as_json(options)
end
data
end
def includes
tree[:include]
end
def preloads
tree[:preload]
end
end
end
end
......@@ -231,6 +231,16 @@ methods:
ci_pipelines:
- :notes
preloads:
statuses:
# TODO: We cannot preload tags, as they are not part of `GenericCommitStatus`
# tags: # needed by tag_list
project: # deprecated: needed by coverage_regex of Ci::Build
merge_requests:
source_project: # needed by source_branch_sha and diff_head_sha
target_project: # needed by target_branch_sha
assignees: # needed by assigne_id that is implemented by DeprecatedAssignee
# EE specific relationships and settings to include. All of this will be merged
# into the previous structures if EE is used.
ee:
......
......@@ -41,8 +41,14 @@ module Gitlab
end
def serialize_project_tree
if Feature.enabled?(:export_fast_serialize, default_enabled: true)
Gitlab::ImportExport::FastHashSerializer
.new(@project, reader.project_tree)
.execute
else
@project.as_json(reader.project_tree)
end
end
def reader
@reader ||= Gitlab::ImportExport::Reader.new(shared: @shared)
......
......@@ -20,20 +20,41 @@ describe Gitlab::ImportExport::AttributesFinder do
except: [:iid],
include: [
{ merge_request_diff: {
include: []
include: [],
preload: { source_project: nil }
} },
{ merge_request_test: { include: [] } }
],
only: [:id]
only: [:id],
preload: {
merge_request_diff: { source_project: nil },
merge_request_test: nil
}
} },
{ commit_statuses: {
include: [{ commit: { include: [] } }]
include: [{ commit: { include: [] } }],
preload: { commit: nil }
} },
{ project_members: {
include: [{ user: { include: [],
only: [:email] } }]
only: [:email] } }],
preload: { user: nil }
} }
]
],
preload: {
commit_statuses: {
commit: nil
},
issues: nil,
labels: nil,
merge_requests: {
merge_request_diff: { source_project: nil },
merge_request_test: nil
},
project_members: {
user: nil
}
}
}
end
......@@ -50,7 +71,8 @@ describe Gitlab::ImportExport::AttributesFinder do
setup_yaml(tree: { project: [:issues] })
is_expected.to match(
include: [{ issues: { include: [] } }]
include: [{ issues: { include: [] } }],
preload: { issues: nil }
)
end
......@@ -58,7 +80,8 @@ describe Gitlab::ImportExport::AttributesFinder do
setup_yaml(tree: { project: [:project_feature] })
is_expected.to match(
include: [{ project_feature: { include: [] } }]
include: [{ project_feature: { include: [] } }],
preload: { project_feature: nil }
)
end
......@@ -67,7 +90,8 @@ describe Gitlab::ImportExport::AttributesFinder do
is_expected.to match(
include: [{ issues: { include: [] } },
{ snippets: { include: [] } }]
{ snippets: { include: [] } }],
preload: { issues: nil, snippets: nil }
)
end
......@@ -75,7 +99,9 @@ describe Gitlab::ImportExport::AttributesFinder do
setup_yaml(tree: { project: [issues: [:notes]] })
is_expected.to match(
include: [{ issues: { include: [{ notes: { include: [] } }] } }]
include: [{ issues: { include: [{ notes: { include: [] } }],
preload: { notes: nil } } }],
preload: { issues: { notes: nil } }
)
end
......@@ -85,7 +111,9 @@ describe Gitlab::ImportExport::AttributesFinder do
is_expected.to match(
include: [{ merge_requests:
{ include: [{ notes: { include: [] } },
{ merge_request_diff: { include: [] } }] } }]
{ merge_request_diff: { include: [] } }],
preload: { merge_request_diff: nil, notes: nil } } }],
preload: { merge_requests: { merge_request_diff: nil, notes: nil } }
)
end
......@@ -94,8 +122,11 @@ describe Gitlab::ImportExport::AttributesFinder do
is_expected.to match(
include: [{ merge_requests: {
include: [{ notes: { include: [{ author: { include: [] } }] } }]
} }]
include: [{ notes: { include: [{ author: { include: [] } }],
preload: { author: nil } } }],
preload: { notes: { author: nil } }
} }],
preload: { merge_requests: { notes: { author: nil } } }
)
end
......@@ -105,7 +136,8 @@ describe Gitlab::ImportExport::AttributesFinder do
is_expected.to match(
include: [{ issues: { include: [],
only: [:name, :description] } }]
only: [:name, :description] } }],
preload: { issues: nil }
)
end
......@@ -115,7 +147,8 @@ describe Gitlab::ImportExport::AttributesFinder do
is_expected.to match(
include: [{ issues: { except: [:name],
include: [] } }]
include: [] } }],
preload: { issues: nil }
)
end
......@@ -127,7 +160,8 @@ describe Gitlab::ImportExport::AttributesFinder do
is_expected.to match(
include: [{ issues: { except: [:name],
include: [],
only: [:description] } }]
only: [:description] } }],
preload: { issues: nil }
)
end
......@@ -137,7 +171,8 @@ describe Gitlab::ImportExport::AttributesFinder do
is_expected.to match(
include: [{ issues: { include: [],
methods: [:name] } }]
methods: [:name] } }],
preload: { issues: nil }
)
end
......
......@@ -25,7 +25,7 @@ describe Gitlab::ImportExport::Config do
expect { subject }.not_to raise_error
expect(subject).to be_a(Hash)
expect(subject.keys).to contain_exactly(
:tree, :excluded_attributes, :included_attributes, :methods)
:tree, :excluded_attributes, :included_attributes, :methods, :preloads)
end
end
end
......@@ -55,6 +55,10 @@ describe Gitlab::ImportExport::Config do
events:
- :action
preloads:
statuses:
project:
ee:
tree:
project:
......@@ -71,6 +75,9 @@ describe Gitlab::ImportExport::Config do
- :type_ee
events_ee:
- :action_ee
preloads:
statuses:
bridge_ee:
EOF
end
......@@ -111,6 +118,11 @@ describe Gitlab::ImportExport::Config do
methods: {
labels: [:type],
events: [:action]
},
preloads: {
statuses: {
project: nil
}
}
}
)
......@@ -150,6 +162,12 @@ describe Gitlab::ImportExport::Config do
labels: [:type, :type_ee],
events: [:action],
events_ee: [:action_ee]
},
preloads: {
statuses: {
project: nil,
bridge_ee: nil
}
}
}
)
......
require 'spec_helper'
describe Gitlab::ImportExport::FastHashSerializer do
subject { described_class.new(project, tree).execute }
let!(:project) { setup_project }
let(:user) { create(:user) }
let(:shared) { project.import_export_shared }
let(:reader) { Gitlab::ImportExport::Reader.new(shared: shared) }
let(:tree) { reader.project_tree }
before do
project.add_maintainer(user)
allow_any_instance_of(MergeRequest).to receive(:source_branch_sha).and_return('ABCD')
allow_any_instance_of(MergeRequest).to receive(:target_branch_sha).and_return('DCBA')
end
it 'saves the correct hash' do
is_expected.to include({ 'description' => 'description', 'visibility_level' => 20 })
end
it 'has approvals_before_merge set' do
expect(subject['approvals_before_merge']).to eq(1)
end
it 'has milestones' do
expect(subject['milestones']).not_to be_empty
end
it 'has merge requests' do
expect(subject['merge_requests']).not_to be_empty
end
it 'has merge request\'s milestones' do
expect(subject['merge_requests'].first['milestone']).not_to be_empty
end
it 'has merge request\'s source branch SHA' do
expect(subject['merge_requests'].first['source_branch_sha']).to eq('ABCD')
end
it 'has merge request\'s target branch SHA' do
expect(subject['merge_requests'].first['target_branch_sha']).to eq('DCBA')
end
it 'has events' do
expect(subject['merge_requests'].first['milestone']['events']).not_to be_empty
end
it 'has snippets' do
expect(subject['snippets']).not_to be_empty
end
it 'has snippet notes' do
expect(subject['snippets'].first['notes']).not_to be_empty
end
it 'has releases' do
expect(subject['releases']).not_to be_empty
end
it 'has no author on releases' do
expect(subject['releases'].first['author']).to be_nil
end
it 'has the author ID on releases' do
expect(subject['releases'].first['author_id']).not_to be_nil
end
it 'has issues' do
expect(subject['issues']).not_to be_empty
end
it 'has issue comments' do
notes = subject['issues'].first['notes']
expect(notes).not_to be_empty
expect(notes.first['type']).to eq('DiscussionNote')
end
it 'has issue assignees' do
expect(subject['issues'].first['issue_assignees']).not_to be_empty
end
it 'has author on issue comments' do
expect(subject['issues'].first['notes'].first['author']).not_to be_empty
end
it 'has project members' do
expect(subject['project_members']).not_to be_empty
end
it 'has merge requests diffs' do
expect(subject['merge_requests'].first['merge_request_diff']).not_to be_empty
end
it 'has merge request diff files' do
expect(subject['merge_requests'].first['merge_request_diff']['merge_request_diff_files']).not_to be_empty
end
it 'has merge request diff commits' do
expect(subject['merge_requests'].first['merge_request_diff']['merge_request_diff_commits']).not_to be_empty
end
it 'has merge requests comments' do
expect(subject['merge_requests'].first['notes']).not_to be_empty
end
it 'has author on merge requests comments' do
expect(subject['merge_requests'].first['notes'].first['author']).not_to be_empty
end
it 'has pipeline stages' do
expect(subject.dig('ci_pipelines', 0, 'stages')).not_to be_empty
end
it 'has pipeline statuses' do
expect(subject.dig('ci_pipelines', 0, 'stages', 0, 'statuses')).not_to be_empty
end
it 'has pipeline builds' do
builds_count = subject
.dig('ci_pipelines', 0, 'stages', 0, 'statuses')
.count { |hash| hash['type'] == 'Ci::Build' }
expect(builds_count).to eq(1)
end
it 'has no when YML attributes but only the DB column' do
allow_any_instance_of(Ci::Pipeline)
.to receive(:ci_yaml_file)
.and_return(File.read(Rails.root.join('spec/support/gitlab_stubs/gitlab_ci.yml')))
expect_any_instance_of(Gitlab::Ci::YamlProcessor).not_to receive(:build_attributes)
subject
end
it 'has pipeline commits' do
expect(subject['ci_pipelines']).not_to be_empty
end
it 'has ci pipeline notes' do
expect(subject['ci_pipelines'].first['notes']).not_to be_empty
end
it 'has labels with no associations' do
expect(subject['labels']).not_to be_empty
end
it 'has labels associated to records' do
expect(subject['issues'].first['label_links'].first['label']).not_to be_empty
end
it 'has project and group labels' do
label_types = subject['issues'].first['label_links'].map { |link| link['label']['type'] }
expect(label_types).to match_array(%w(ProjectLabel GroupLabel))
end
it 'has priorities associated to labels' do
priorities = subject['issues'].first['label_links'].flat_map { |link| link['label']['priorities'] }
expect(priorities).not_to be_empty
end
it 'has issue resource label events' do
expect(subject['issues'].first['resource_label_events']).not_to be_empty
end
it 'has merge request resource label events' do
expect(subject['merge_requests'].first['resource_label_events']).not_to be_empty
end
it 'saves the correct service type' do
expect(subject['services'].first['type']).to eq('CustomIssueTrackerService')
end
it 'saves the properties for a service' do
expect(subject['services'].first['properties']).to eq('one' => 'value')
end
it 'has project feature' do
project_feature = subject['project_feature']
expect(project_feature).not_to be_empty
expect(project_feature["issues_access_level"]).to eq(ProjectFeature::DISABLED)
expect(project_feature["wiki_access_level"]).to eq(ProjectFeature::ENABLED)
expect(project_feature["builds_access_level"]).to eq(ProjectFeature::PRIVATE)
end
it 'has custom attributes' do
expect(subject['custom_attributes'].count).to eq(2)
end
it 'has badges' do
expect(subject['project_badges'].count).to eq(2)
end
it 'does not complain about non UTF-8 characters in MR diff files' do
ActiveRecord::Base.connection.execute("UPDATE merge_request_diff_files SET diff = '---\n- :diff: !binary |-\n LS0tIC9kZXYvbnVsbAorKysgYi9pbWFnZXMvbnVjb3IucGRmCkBAIC0wLDAg\n KzEsMTY3OSBAQAorJVBERi0xLjUNJeLjz9MNCisxIDAgb2JqDTw8L01ldGFk\n YXR'")
expect(subject['merge_requests'].first['merge_request_diff']).not_to be_empty
end
context 'project attributes' do
it 'does not contain the runners token' do
expect(subject).not_to include("runners_token" => 'token')
end
end
it 'has a board and a list' do
expect(subject['boards'].first['lists']).not_to be_empty
end
def setup_project
issue = create(:issue, assignees: [user])
snippet = create(:project_snippet)
release = create(:release)
group = create(:group)
project = create(:project,
:public,
:repository,
:issues_disabled,
:wiki_enabled,
:builds_private,
description: 'description',
issues: [issue],
snippets: [snippet],
releases: [release],
group: group,
approvals_before_merge: 1
)
project_label = create(:label, project: project)
group_label = create(:group_label, group: group)
create(:label_link, label: project_label, target: issue)
create(:label_link, label: group_label, target: issue)
create(:label_priority, label: group_label, priority: 1)
milestone = create(:milestone, project: project)
merge_request = create(:merge_request, source_project: project, milestone: milestone)
ci_build = create(:ci_build, project: project, when: nil)
ci_build.pipeline.update(project: project)
create(:commit_status, project: project, pipeline: ci_build.pipeline)
create(:milestone, project: project)
create(:discussion_note, noteable: issue, project: project)
create(:note, noteable: merge_request, project: project)
create(:note, noteable: snippet, project: project)
create(:note_on_commit,
author: user,
project: project,
commit_id: ci_build.pipeline.sha)
create(:resource_label_event, label: project_label, issue: issue)
create(:resource_label_event, label: group_label, merge_request: merge_request)
create(:event, :created, target: milestone, project: project, author: user)
create(:service, project: project, type: 'CustomIssueTrackerService', category: 'issue_tracker', properties: { one: 'value' })
create(:project_custom_attribute, project: project)
create(:project_custom_attribute, project: project)
create(:project_badge, project: project)
create(:project_badge, project: project)
board = create(:board, project: project, name: 'TestBoard')
create(:list, board: board, position: 0, label: project_label)
project
end
end
......@@ -23,12 +23,65 @@ describe Gitlab::ImportExport::ProjectTreeSaver do
expect(project_tree_saver.save).to be true
end
context ':export_fast_serialize feature flag checks' do
before do
expect(Gitlab::ImportExport::Reader).to receive(:new).with(shared: shared).and_return(reader)
expect(reader).to receive(:project_tree).and_return(project_tree)
end
let(:serializer) { instance_double('Gitlab::ImportExport::FastHashSerializer') }
let(:reader) { instance_double('Gitlab::ImportExport::Reader') }
let(:project_tree) do
{
include: [{ issues: { include: [] } }],
preload: { issues: nil }
}
end
context 'when :export_fast_serialize feature is enabled' do
before do
stub_feature_flags(export_fast_serialize: true)
end
it 'uses FastHashSerializer' do
expect(Gitlab::ImportExport::FastHashSerializer)
.to receive(:new)
.with(project, project_tree)
.and_return(serializer)
expect(serializer).to receive(:execute)
project_tree_saver.save
end
end
context 'when :export_fast_serialize feature is disabled' do
before do
stub_feature_flags(export_fast_serialize: false)
end
it 'is serialized via built-in `as_json`' do
expect(project).to receive(:as_json).with(project_tree)
project_tree_saver.save
end
end
end
# It is mostly duplicated in
# `spec/lib/gitlab/import_export/fast_hash_serializer_spec.rb`
# except:
# context 'with description override' do
# context 'group members' do
# ^ These are specific for the ProjectTreeSaver
context 'JSON' do
let(:saved_project_json) do
project_tree_saver.save
project_json(project_tree_saver.full_path)
end
# It is not duplicated in
# `spec/lib/gitlab/import_export/fast_hash_serializer_spec.rb`
context 'with description override' do
let(:params) { { description: 'Foo Bar' } }
let(:project_tree_saver) { described_class.new(project: project, current_user: user, shared: shared, params: params) }
......
......@@ -13,6 +13,10 @@ tree:
group_members:
- :user
preloads:
merge_request_diff:
source_project:
included_attributes:
merge_requests:
- :id
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment