Commit f60fd4e3 authored by Kassio Borges's avatar Kassio Borges

GithubImporter: Optimize Pull Request Review Importer

= Problem

The Github API does not provide a way to fetch all the pull request
reviews of a project (repo), as it does for comments; instead, we
have to fetch the reviews per Pull Request.

For this reason, the
Gitlab::GithubImport::Importer::PullRequestsReviewsImporter¹ has to
iterate over the imported pull requests and, for each one, request the
reviews, which might span more than one page.

If the importer hits a rate limit, the process restarts, and the
imported pull requests are skipped², but the importer goes over all the
review pages again.

In other words, for some projects with a large number of pull requests
and a large number of reviews per pull request, we might end up with
duplicated reviews and unnecessary API requests, which would lead to
longer importing times.

= Proposed solution

- To avoid duplicated comments, besides caching the Pull Requests ids,
  also cache the review ids and skip the already processed ones.

- To avoid unnecessary API requests, use the PageCounter to only request
  pages that weren't yet imported.

Related to: https://gitlab.com/gitlab-org/gitlab/-/issues/331315
Changelog: changed
MR: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/62036
parent 4756edbe
---
name: github_review_importer_query_only_unimported_merge_requests
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/62036
rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/332982
milestone: '14.0'
type: development
group: group::import
default_enabled: true
......@@ -113,6 +113,17 @@ module Gitlab
end
end
# Fetches every member stored in the Redis set identified by the key.
#
# raw_key - The key of the set to read.
def self.values_from_set(raw_key)
  full_key = cache_key_for(raw_key)

  Redis::Cache.with { |redis| redis.smembers(full_key) }
end
# Sets multiple keys to given values.
#
# mapping - A Hash mapping the cache keys to their values.
......
......@@ -6,6 +6,13 @@ module Gitlab
class PullRequestsReviewsImporter
include ParallelScheduling
# Builds the importer and derives the Redis cache key used to track
# which merge requests already had all of their reviews imported.
def initialize(...)
  super

  cache_key = "github-importer/merge_request/already-imported/#{project.id}"
  @merge_requests_already_imported_cache_key = cache_key
end
# The class used to import a single pull request review.
def importer_class
  PullRequestReviewImporter
end
......@@ -22,11 +29,31 @@ module Gitlab
:pull_request_reviews
end
def id_for_already_imported_cache(merge_request)
merge_request.id
# Returns the value used to track the given review in the
# "already imported" cache (reviews, not merge requests, are the unit
# of deduplication here).
def id_for_already_imported_cache(review)
  review.id
end
# Yields every review that still needs to be imported, picking the
# optimized or the legacy iteration strategy based on the feature flag.
def each_object_to_import(&block)
  return each_merge_request_to_import(&block) if use_github_review_importer_query_only_unimported_merge_requests?

  each_merge_request_skipping_imported(&block)
end
def each_object_to_import
private
attr_reader :merge_requests_already_imported_cache_key
# Feature-flag gate for the optimized importer path.
# https://gitlab.com/gitlab-org/gitlab/-/merge_requests/62036#note_587181108
def use_github_review_importer_query_only_unimported_merge_requests?
  Feature.enabled?(:github_review_importer_query_only_unimported_merge_requests, default_enabled: :yaml)
end
def each_merge_request_skipping_imported
project.merge_requests.find_each do |merge_request|
next if already_imported?(merge_request)
......@@ -40,6 +67,67 @@ module Gitlab
mark_as_imported(merge_request)
end
end
# The worker can be interrupted (by a rate limit, for instance) at
# different points. To avoid requesting already imported data when the
# worker restarts:
# - before importing all reviews of a merge request:
#   the reviews page is cached with the `PageCounter`, per merge request.
# - before importing all merge requests reviews:
#   merge requests with all reviews imported are cached via
#   `mark_merge_request_reviews_imported`.
def each_merge_request_to_import
  each_review_page do |page, merge_request|
    page.objects.each do |review|
      unless already_imported?(review)
        review.merge_request_id = merge_request.id
        yield(review)
        mark_as_imported(review)
      end
    end
  end
end
# Iterates over each not-yet-imported page of reviews for every merge
# request that still has reviews left to import, yielding
# (page, merge_request) pairs to the block.
def each_review_page
  merge_requests_to_import.find_each do |merge_request|
    # The page counter needs to be scoped by merge request to avoid skipping
    # pages of reviews from already imported merge requests.
    page_counter = PageCounter.new(project, page_counter_id(merge_request))
    repo = project.import_source
    # Resume from the last cached page instead of page 1.
    options = collection_options.merge(page: page_counter.current)
    client.each_page(collection_method, repo, merge_request.iid, options) do |page|
      # NOTE(review): `set` presumably returns false for pages already
      # processed, so cached pages are skipped — confirm against PageCounter.
      next unless page_counter.set(page.number)
      yield(page, merge_request)
    end
    # Avoid unnecessary Redis cache keys after the work is done.
    page_counter.expire!
    mark_merge_request_reviews_imported(merge_request)
  end
end
# Returns only the merge requests that still have reviews to be imported.
def merge_requests_to_import
  imported_ids = already_imported_merge_requests

  project.merge_requests.where.not(id: imported_ids) # rubocop: disable CodeReuse/ActiveRecord
end
# IDs of the merge requests whose reviews were fully imported, read
# back from the Redis set (values come back as strings).
def already_imported_merge_requests
  Gitlab::Cache::Import::Caching.values_from_set(merge_requests_already_imported_cache_key)
end
# Cache identifier for the PageCounter, scoped by merge request so each
# merge request tracks its own review pages independently.
def page_counter_id(merge_request)
  "merge_request/#{merge_request.id}/#{collection_method}"
end
# Records the merge request as having all of its reviews imported, so
# it is excluded from `merge_requests_to_import` on subsequent runs.
def mark_merge_request_reviews_imported(merge_request)
  Gitlab::Cache::Import::Caching
    .set_add(merge_requests_already_imported_cache_key, merge_request.id)
end
end
end
end
......
......@@ -26,6 +26,10 @@ module Gitlab
# The current page number, falling back to the first page when nothing
# has been cached yet.
def current
  cached_page = Gitlab::Cache::Import::Caching.read_integer(cache_key)

  cached_page || 1
end
# Expires the counter's Redis key immediately (timeout of 0) so no
# stale page-counter keys linger once the import work is done.
def expire!
  Gitlab::Cache::Import::Caching.expire(cache_key, 0)
end
end
end
end
......@@ -88,6 +88,18 @@ RSpec.describe Gitlab::Cache::Import::Caching, :clean_gitlab_redis_cache do
end
end
describe '.values_from_set' do
  it 'returns empty list when the set is empty' do
    expect(described_class.values_from_set('foo')).to eq([])
  end

  it 'returns the set list of values' do
    # Redis SMEMBERS returns members as strings, hence '10' rather than 10.
    described_class.set_add('foo', 10)
    expect(described_class.values_from_set('foo')).to eq(['10'])
  end
end
describe '.write_multiple' do
it 'sets multiple keys when key_prefix not set' do
mapping = { 'foo' => 10, 'bar' => 20 }
......
......@@ -27,30 +27,100 @@ RSpec.describe Gitlab::GithubImport::Importer::PullRequestsReviewsImporter do
end
# Fixes: (1) the scraped hunk mashed the removed pre-MR example (missing
# its closing `end`, and duplicated verbatim inside the "disabled"
# context below) into the block, leaving it syntactically broken — this
# reconstructs the coherent post-MR spec; (2) 'fetchs' typo in two
# example descriptions.
describe '#each_object_to_import', :clean_gitlab_redis_cache do
  context 'when github_review_importer_query_only_unimported_merge_requests is enabled' do
    before do
      stub_feature_flags(github_review_importer_query_only_unimported_merge_requests: true)
    end

    let(:merge_request) do
      create(
        :merged_merge_request,
        iid: 999,
        source_project: project,
        target_project: project
      )
    end

    let(:review) { double(id: 1) }

    it 'fetches the pull requests reviews data' do
      page = double(objects: [review], number: 1)

      expect(review)
        .to receive(:merge_request_id=)
        .with(merge_request.id)

      expect(client)
        .to receive(:each_page)
        .exactly(:once) # ensure to be cached on the second call
        .with(:pull_request_reviews, 'github/repo', merge_request.iid, page: 1)
        .and_yield(page)

      expect { |b| subject.each_object_to_import(&b) }
        .to yield_with_args(review)

      subject.each_object_to_import {}
    end

    it 'skips cached pages' do
      Gitlab::GithubImport::PageCounter
        .new(project, "merge_request/#{merge_request.id}/pull_request_reviews")
        .set(2)

      expect(review).not_to receive(:merge_request_id=)

      expect(client)
        .to receive(:each_page)
        .exactly(:once) # ensure to be cached on the second call
        .with(:pull_request_reviews, 'github/repo', merge_request.iid, page: 2)

      subject.each_object_to_import {}
    end

    it 'skips cached merge requests' do
      Gitlab::Cache::Import::Caching.set_add(
        "github-importer/merge_request/already-imported/#{project.id}",
        merge_request.id
      )

      expect(review).not_to receive(:merge_request_id=)
      expect(client).not_to receive(:each_page)

      subject.each_object_to_import {}
    end
  end

  context 'when github_review_importer_query_only_unimported_merge_requests is disabled' do
    before do
      stub_feature_flags(github_review_importer_query_only_unimported_merge_requests: false)
    end

    it 'fetches the merged pull requests data' do
      merge_request = create(
        :merged_merge_request,
        iid: 999,
        source_project: project,
        target_project: project
      )

      review = double

      expect(review)
        .to receive(:merge_request_id=)
        .with(merge_request.id)

      allow(client)
        .to receive(:pull_request_reviews)
        .exactly(:once) # ensure to be cached on the second call
        .with('github/repo', merge_request.iid)
        .and_return([review])

      expect { |b| subject.each_object_to_import(&b) }
        .to yield_with_args(review)

      subject.each_object_to_import {}
    end
  end
end
end
......@@ -31,4 +31,15 @@ RSpec.describe Gitlab::GithubImport::PageCounter, :clean_gitlab_redis_cache do
expect(counter.current).to eq(2)
end
end
describe '#expire!' do
  it 'expires the current page counter' do
    counter.set(2)

    counter.expire!

    # The backing Redis key is gone and the counter falls back to page 1.
    expect(Gitlab::Cache::Import::Caching.read_integer(counter.cache_key)).to be_nil
    expect(counter.current).to eq(1)
  end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment