Commit af4b0bd1 authored by Simon Schrottner

Change Bitbucket Pull Request Import to a Batch Process

Because the Bitbucket importer fetched all Pull Requests upfront, before
importing them, memory consumption grew very high. It grew so high that
bigger projects could never be fully imported, as they had too many Pull
Requests.

This change reduces the amount of occupied memory, as Pull Requests are
now fetched and imported in batches of 100, the same batch size that was
previously used only for creating pull request branches.
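
For illustration, the batching pattern looks roughly like the sketch
below; `fetch_page` and `import` are hypothetical stand-ins, not the
importer's real method names:

    BATCH_SIZE = 100

    page = 0
    loop do
      # Fetch one page of Pull Requests instead of the whole collection upfront.
      batch = fetch_page(page_offset: page, limit: BATCH_SIZE) # hypothetical fetcher
      break if batch.empty?

      batch.each { |pull_request| import(pull_request) } # hypothetical per-PR import
      page += 1
    end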

relates to https://gitlab.com/gitlab-org/gitlab/-/issues/23586
parent 369065d7
---
title: Add Batch Support for Importing Pull Requests from Bitbucket
merge_request: 46696
author: Simon Schrottner
type: performance
@@ -8,9 +8,9 @@ module BitbucketServer
       @connection = Connection.new(options)
     end

-    def pull_requests(project_key, repo)
+    def pull_requests(project_key, repo, page_offset: 0, limit: nil)
       path = "/projects/#{project_key}/repos/#{repo}/pull-requests?state=ALL"
-      get_collection(path, :pull_request)
+      get_collection(path, :pull_request, page_offset: page_offset, limit: limit)
     end

     def activities(project_key, repo, pull_request_id)
......
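
For illustration, a caller can now request a single page. This is a
sketch: `'PROJ'` and `'my-repo'` are made-up values, `options` stands for
whatever Connection expects, and the page-number semantics of
`page_offset` are assumed from how the importer passes its loop counter:

    client = BitbucketServer::Client.new(options)
    # Fetch the second page of up to 100 Pull Requests.
    second_page = client.pull_requests('PROJ', 'my-repo', page_offset: 1, limit: 100).to_a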
@@ -177,16 +177,24 @@ module Gitlab
       # on the remote server. Then we have to issue a `git fetch` to download these
       # branches.
       def import_pull_requests
-        log_info(stage: 'import_pull_requests', message: 'starting')
-        pull_requests = client.pull_requests(project_key, repository_slug).to_a
+        page = 0
+
+        log_info(stage: 'import_pull_requests', message: "starting")
+
+        loop do
+          log_debug(stage: 'import_pull_requests', message: "importing page #{page} and batch-size #{BATCH_SIZE} from #{page * BATCH_SIZE} to #{(page + 1) * BATCH_SIZE}")
+
+          pull_requests = client.pull_requests(project_key, repository_slug, page_offset: page, limit: BATCH_SIZE).to_a
+
+          break if pull_requests.empty?

           # Creating branches on the server and fetching the newly-created branches
-          # may take a number of network round-trips. Do this in batches so that we can
-          # avoid doing a git fetch for every new branch.
-          pull_requests.each_slice(BATCH_SIZE) do |batch|
-            restore_branches(batch) if recover_missing_commits
+          # may take a number of network round-trips. This used to be done in batches
+          # to avoid doing a git fetch for every new branch; now that the whole import
+          # is batched, we no longer need to batch separately here.
+          restore_branches(pull_requests) if recover_missing_commits

-          batch.each do |pull_request|
+          pull_requests.each do |pull_request|
             if already_imported?(pull_request)
               log_info(stage: 'import_pull_requests', message: 'already imported', iid: pull_request.iid)
             else
@@ -201,6 +209,9 @@ module Gitlab
               backtrace = Gitlab::BacktraceCleaner.clean_backtrace(e.backtrace)
               errors << { type: :pull_request, iid: pull_request.iid, errors: e.message, backtrace: backtrace.join("\n"), raw_response: pull_request.raw }
             end
+
+          log_debug(stage: 'import_pull_requests', message: "finished page #{page} and batch-size #{BATCH_SIZE}")
+          page += 1
         end
       end
@@ -416,6 +427,10 @@ module Gitlab
         }
       end

+      def log_debug(details)
+        logger.debug(log_base_data.merge(details))
+      end
+
       def log_info(details)
         logger.info(log_base_data.merge(details))
       end
......
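
For illustration, `log_debug` merges the importer-wide context into each
entry. The exact contents of `log_base_data` below are an assumption:

    # Hypothetical base data: { class: 'BitbucketServerImport::Importer', project_id: 42 }
    log_debug(stage: 'import_pull_requests', message: 'finished page 0')
    # The logger then receives the merged hash:
    # { class: 'BitbucketServerImport::Importer', project_id: 42,
    #   stage: 'import_pull_requests', message: 'finished page 0' }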
@@ -19,6 +19,15 @@ RSpec.describe BitbucketServer::Client do
       subject.pull_requests(project, repo_slug)
     end

+    it 'requests a collection with offset and limit' do
+      offset = 10
+      limit = 100
+
+      expect(BitbucketServer::Paginator).to receive(:new).with(anything, path, :pull_request, page_offset: offset, limit: limit)
+
+      subject.pull_requests(project, repo_slug, page_offset: offset, limit: limit)
+    end
   end

   describe '#activities' do
......
......@@ -112,7 +112,7 @@ RSpec.describe Gitlab::BitbucketServerImport::Importer do
allow(subject).to receive(:delete_temp_branches)
allow(subject).to receive(:restore_branches)
allow(subject.client).to receive(:pull_requests).and_return([pull_request])
allow(subject.client).to receive(:pull_requests).and_return([pull_request], [])
end
# As we are using Caching with redis, it is best to clean the cache after each test run, else we need to wait for
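
The trailing empty array in these stubs matters because the batched
importer keeps calling client.pull_requests until a page comes back
empty. RSpec's and_return with several arguments returns them on
consecutive calls, so the stub yields one page of data and then an empty
page. A minimal standalone illustration with a made-up double:

    client = double(:client)
    allow(client).to receive(:pull_requests).and_return([:a_pull_request], [])

    client.pull_requests # => [:a_pull_request]  (first page)
    client.pull_requests # => []                 (empty page stops the import loop)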
@@ -499,7 +499,7 @@ RSpec.describe Gitlab::BitbucketServerImport::Importer do
     before do
       Gitlab::Cache::Import::Caching.set_add(subject.already_imported_cache_key, pull_request_already_imported.iid)

-      allow(subject.client).to receive(:pull_requests).and_return([pull_request_to_be_imported, pull_request_already_imported])
+      allow(subject.client).to receive(:pull_requests).and_return([pull_request_to_be_imported, pull_request_already_imported], [])
     end

     it 'only imports one Merge Request, as the other one is in the cache' do
@@ -535,7 +535,7 @@ RSpec.describe Gitlab::BitbucketServerImport::Importer do
         updated_at: Time.now,
         merged?: true)

-      expect(subject.client).to receive(:pull_requests).and_return([pull_request])
+      expect(subject.client).to receive(:pull_requests).and_return([pull_request], [])
       expect(subject.client).to receive(:activities).and_return([])
       expect(subject).to receive(:import_repository).twice
     end
......