Commit 0377c015 authored by Stan Hu's avatar Stan Hu

Refactor GitHub Importer database helpers into helper methods

This in preparation for addressing idle-in-transaction timeouts for other importers.

Part of #50021
parent 842377ab
...@@ -10,24 +10,6 @@ module Gitlab ...@@ -10,24 +10,6 @@ module Gitlab
Client.new(token_to_use, parallel: parallel) Client.new(token_to_use, parallel: parallel)
end end
# Inserts a raw row and returns the ID of the inserted row.
#
# attributes - The attributes/columns to set.
# relation - An ActiveRecord::Relation to use for finding the ID of the row
# when using MySQL.
def self.insert_and_return_id(attributes, relation)
# We use bulk_insert here so we can bypass any queries executed by
# callbacks or validation rules, as doing this wouldn't scale when
# importing very large projects.
result = Gitlab::Database
.bulk_insert(relation.table_name, [attributes], return_ids: true)
# MySQL doesn't support returning the IDs of a bulk insert in a way that
# is not a pain, so in this case we'll issue an extra query instead.
result.first ||
relation.where(iid: attributes[:iid]).limit(1).pluck(:id).first
end
# Returns the ID of the ghost user. # Returns the ID of the ghost user.
def self.ghost_user_id def self.ghost_user_id
key = 'github-import/ghost-user-id' key = 'github-import/ghost-user-id'
......
...@@ -4,6 +4,8 @@ module Gitlab ...@@ -4,6 +4,8 @@ module Gitlab
module GithubImport module GithubImport
module Importer module Importer
class IssueImporter class IssueImporter
include Gitlab::Import::DatabaseHelpers
attr_reader :project, :issue, :client, :user_finder, :milestone_finder, attr_reader :project, :issue, :client, :user_finder, :milestone_finder,
:issuable_finder :issuable_finder
...@@ -55,7 +57,7 @@ module Gitlab ...@@ -55,7 +57,7 @@ module Gitlab
updated_at: issue.updated_at updated_at: issue.updated_at
} }
GithubImport.insert_and_return_id(attributes, project.issues).tap do |id| insert_and_return_id(attributes, project.issues).tap do |id|
# We use .insert_and_return_id which effectively disables all callbacks. # We use .insert_and_return_id which effectively disables all callbacks.
# Trigger iid logic here to make sure we track internal id values consistently. # Trigger iid logic here to make sure we track internal id values consistently.
project.issues.find(id).ensure_project_iid! project.issues.find(id).ensure_project_iid!
......
...@@ -4,6 +4,8 @@ module Gitlab ...@@ -4,6 +4,8 @@ module Gitlab
module GithubImport module GithubImport
module Importer module Importer
class PullRequestImporter class PullRequestImporter
include Gitlab::Import::MergeRequestHelpers
attr_reader :pull_request, :project, :client, :user_finder, attr_reader :pull_request, :project, :client, :user_finder,
:milestone_finder, :issuable_finder :milestone_finder, :issuable_finder
...@@ -44,81 +46,27 @@ module Gitlab ...@@ -44,81 +46,27 @@ module Gitlab
description = MarkdownText description = MarkdownText
.format(pull_request.description, pull_request.author, author_found) .format(pull_request.description, pull_request.author, author_found)
# This work must be wrapped in a transaction as otherwise we can leave attributes = {
# behind incomplete data in the event of an error. This can then lead iid: pull_request.iid,
# to duplicate key errors when jobs are retried. title: pull_request.truncated_title,
MergeRequest.transaction do description: description,
attributes = { source_project_id: project.id,
iid: pull_request.iid, target_project_id: project.id,
title: pull_request.truncated_title, source_branch: pull_request.formatted_source_branch,
description: description, target_branch: pull_request.target_branch,
source_project_id: project.id, state: pull_request.state,
target_project_id: project.id, milestone_id: milestone_finder.id_for(pull_request),
source_branch: pull_request.formatted_source_branch, author_id: author_id,
target_branch: pull_request.target_branch, assignee_id: user_finder.assignee_id_for(pull_request),
state: pull_request.state, created_at: pull_request.created_at,
milestone_id: milestone_finder.id_for(pull_request), updated_at: pull_request.updated_at
author_id: author_id, }
assignee_id: user_finder.assignee_id_for(pull_request),
created_at: pull_request.created_at,
updated_at: pull_request.updated_at
}
# When creating merge requests there are a lot of hooks that may
# run, for many different reasons. Many of these hooks (e.g. the
# ones used for rendering Markdown) are completely unnecessary and
# may even lead to transaction timeouts.
#
# To ensure importing pull requests has a minimal impact and can
# complete in a reasonable time we bypass all the hooks by inserting
# the row and then retrieving it. We then only perform the
# additional work that is strictly necessary.
merge_request_id = GithubImport
.insert_and_return_id(attributes, project.merge_requests)
merge_request = project.merge_requests.find(merge_request_id)
# We use .insert_and_return_id which effectively disables all callbacks.
# Trigger iid logic here to make sure we track internal id values consistently.
merge_request.ensure_target_project_iid!
[merge_request, false] create_merge_request_without_hooks(project, attributes, pull_request.iid)
end
rescue ActiveRecord::InvalidForeignKey
# It's possible the project has been deleted since scheduling this
# job. In this case we'll just skip creating the merge request.
[]
rescue ActiveRecord::RecordNotUnique
# It's possible we previously created the MR, but failed when updating
# the Git data. In this case we'll just continue working on the
# existing row.
[project.merge_requests.find_by(iid: pull_request.iid), true]
end end
def insert_git_data(merge_request, already_exists = false) def insert_git_data(merge_request, already_exists)
# These fields are set so we can create the correct merge request insert_or_replace_git_data(merge_request, pull_request.source_branch_sha, pull_request.target_branch_sha, already_exists)
# diffs.
merge_request.source_branch_sha = pull_request.source_branch_sha
merge_request.target_branch_sha = pull_request.target_branch_sha
merge_request.keep_around_commit
# MR diffs normally use an "after_save" hook to pull data from Git.
# All of this happens in the transaction started by calling
# create/save/etc. This in turn can lead to these transactions being
# held open for much longer than necessary. To work around this we
# first save the diff, then populate it.
diff =
if already_exists
merge_request.merge_request_diffs.take ||
merge_request.merge_request_diffs.build
else
merge_request.merge_request_diffs.build
end
diff.importing = true
diff.save
diff.save_git_content
end end
end end
end end
......
# frozen_string_literal: true
module Gitlab
module Import
module DatabaseHelpers
# Inserts a raw row and returns the ID of the inserted row.
#
# attributes - The attributes/columns to set.
# relation - An ActiveRecord::Relation to use for finding the ID of the row
# when using MySQL.
def insert_and_return_id(attributes, relation)
# We use bulk_insert here so we can bypass any queries executed by
# callbacks or validation rules, as doing this wouldn't scale when
# importing very large projects.
result = Gitlab::Database
.bulk_insert(relation.table_name, [attributes], return_ids: true)
# MySQL doesn't support returning the IDs of a bulk insert in a way that
# is not a pain, so in this case we'll issue an extra query instead.
result.first ||
relation.where(iid: attributes[:iid]).limit(1).pluck(:id).first
end
end
end
end
# frozen_string_literal: true
module Gitlab
module Import
module MergeRequestHelpers
include DatabaseHelpers
def create_merge_request_without_hooks(project, attributes, iid)
# This work must be wrapped in a transaction as otherwise we can leave
# behind incomplete data in the event of an error. This can then lead
# to duplicate key errors when jobs are retried.
MergeRequest.transaction do
# When creating merge requests there are a lot of hooks that may
# run, for many different reasons. Many of these hooks (e.g. the
# ones used for rendering Markdown) are completely unnecessary and
# may even lead to transaction timeouts.
#
# To ensure importing pull requests has a minimal impact and can
# complete in a reasonable time we bypass all the hooks by inserting
# the row and then retrieving it. We then only perform the
# additional work that is strictly necessary.
merge_request_id = insert_and_return_id(attributes, project.merge_requests)
merge_request = project.merge_requests.find(merge_request_id)
# We use .insert_and_return_id which effectively disables all callbacks.
# Trigger iid logic here to make sure we track internal id values consistently.
merge_request.ensure_target_project_iid!
[merge_request, false]
end
rescue ActiveRecord::InvalidForeignKey
# It's possible the project has been deleted since scheduling this
# job. In this case we'll just skip creating the merge request.
[]
rescue ActiveRecord::RecordNotUnique
# It's possible we previously created the MR, but failed when updating
# the Git data. In this case we'll just continue working on the
# existing row.
[project.merge_requests.find_by(iid: iid), true]
end
def insert_or_replace_git_data(merge_request, source_branch_sha, target_branch_sha, already_exists = false)
# These fields are set so we can create the correct merge request
# diffs.
merge_request.source_branch_sha = source_branch_sha
merge_request.target_branch_sha = target_branch_sha
merge_request.keep_around_commit
# MR diffs normally use an "after_save" hook to pull data from Git.
# All of this happens in the transaction started by calling
# create/save/etc. This in turn can lead to these transactions being
# held open for much longer than necessary. To work around this we
# first save the diff, then populate it.
diff =
if already_exists
merge_request.merge_request_diffs.take ||
merge_request.merge_request_diffs.build
else
merge_request.merge_request_diffs.build
end
diff.importing = true
diff.save
diff.save_git_content
end
end
end
end
...@@ -92,7 +92,7 @@ describe Gitlab::GithubImport::Importer::IssueImporter, :clean_gitlab_redis_cach ...@@ -92,7 +92,7 @@ describe Gitlab::GithubImport::Importer::IssueImporter, :clean_gitlab_redis_cach
.with(issue) .with(issue)
.and_return([user.id, true]) .and_return([user.id, true])
expect(Gitlab::GithubImport) expect(importer)
.to receive(:insert_and_return_id) .to receive(:insert_and_return_id)
.with( .with(
{ {
...@@ -121,7 +121,7 @@ describe Gitlab::GithubImport::Importer::IssueImporter, :clean_gitlab_redis_cach ...@@ -121,7 +121,7 @@ describe Gitlab::GithubImport::Importer::IssueImporter, :clean_gitlab_redis_cach
.with(issue) .with(issue)
.and_return([project.creator_id, false]) .and_return([project.creator_id, false])
expect(Gitlab::GithubImport) expect(importer)
.to receive(:insert_and_return_id) .to receive(:insert_and_return_id)
.with( .with(
{ {
...@@ -150,7 +150,7 @@ describe Gitlab::GithubImport::Importer::IssueImporter, :clean_gitlab_redis_cach ...@@ -150,7 +150,7 @@ describe Gitlab::GithubImport::Importer::IssueImporter, :clean_gitlab_redis_cach
.with(issue) .with(issue)
.and_return([user.id, true]) .and_return([user.id, true])
expect(Gitlab::GithubImport) expect(importer)
.to receive(:insert_and_return_id) .to receive(:insert_and_return_id)
.and_raise(ActiveRecord::InvalidForeignKey, 'invalid foreign key') .and_raise(ActiveRecord::InvalidForeignKey, 'invalid foreign key')
...@@ -185,7 +185,7 @@ describe Gitlab::GithubImport::Importer::IssueImporter, :clean_gitlab_redis_cach ...@@ -185,7 +185,7 @@ describe Gitlab::GithubImport::Importer::IssueImporter, :clean_gitlab_redis_cach
.and_return([user.id, true]) .and_return([user.id, true])
issue = build_stubbed(:issue, project: project) issue = build_stubbed(:issue, project: project)
allow(Gitlab::GithubImport) allow(importer)
.to receive(:insert_and_return_id) .to receive(:insert_and_return_id)
.and_return(issue.id) .and_return(issue.id)
allow(project.issues).to receive(:find).with(issue.id).and_return(issue) allow(project.issues).to receive(:find).with(issue.id).and_return(issue)
......
...@@ -80,7 +80,7 @@ describe Gitlab::GithubImport::Importer::PullRequestImporter, :clean_gitlab_redi ...@@ -80,7 +80,7 @@ describe Gitlab::GithubImport::Importer::PullRequestImporter, :clean_gitlab_redi
end end
it 'imports the pull request with the pull request author as the merge request author' do it 'imports the pull request with the pull request author as the merge request author' do
expect(Gitlab::GithubImport) expect(importer)
.to receive(:insert_and_return_id) .to receive(:insert_and_return_id)
.with( .with(
{ {
...@@ -114,7 +114,7 @@ describe Gitlab::GithubImport::Importer::PullRequestImporter, :clean_gitlab_redi ...@@ -114,7 +114,7 @@ describe Gitlab::GithubImport::Importer::PullRequestImporter, :clean_gitlab_redi
it 'triggers internal_id functionality to track greatest iids' do it 'triggers internal_id functionality to track greatest iids' do
mr = build_stubbed(:merge_request, source_project: project, target_project: project) mr = build_stubbed(:merge_request, source_project: project, target_project: project)
allow(Gitlab::GithubImport).to receive(:insert_and_return_id).and_return(mr.id) allow(importer).to receive(:insert_and_return_id).and_return(mr.id)
allow(project.merge_requests).to receive(:find).with(mr.id).and_return(mr) allow(project.merge_requests).to receive(:find).with(mr.id).and_return(mr)
expect(mr).to receive(:ensure_target_project_iid!) expect(mr).to receive(:ensure_target_project_iid!)
...@@ -135,7 +135,7 @@ describe Gitlab::GithubImport::Importer::PullRequestImporter, :clean_gitlab_redi ...@@ -135,7 +135,7 @@ describe Gitlab::GithubImport::Importer::PullRequestImporter, :clean_gitlab_redi
.with(pull_request) .with(pull_request)
.and_return(user.id) .and_return(user.id)
expect(Gitlab::GithubImport) expect(importer)
.to receive(:insert_and_return_id) .to receive(:insert_and_return_id)
.with( .with(
{ {
...@@ -181,7 +181,7 @@ describe Gitlab::GithubImport::Importer::PullRequestImporter, :clean_gitlab_redi ...@@ -181,7 +181,7 @@ describe Gitlab::GithubImport::Importer::PullRequestImporter, :clean_gitlab_redi
.to receive(:source_branch) .to receive(:source_branch)
.and_return('master') .and_return('master')
expect(Gitlab::GithubImport) expect(importer)
.to receive(:insert_and_return_id) .to receive(:insert_and_return_id)
.with( .with(
{ {
...@@ -219,7 +219,7 @@ describe Gitlab::GithubImport::Importer::PullRequestImporter, :clean_gitlab_redi ...@@ -219,7 +219,7 @@ describe Gitlab::GithubImport::Importer::PullRequestImporter, :clean_gitlab_redi
.with(pull_request) .with(pull_request)
.and_return(user.id) .and_return(user.id)
expect(Gitlab::GithubImport) expect(importer)
.to receive(:insert_and_return_id) .to receive(:insert_and_return_id)
.and_raise(ActiveRecord::InvalidForeignKey, 'invalid foreign key') .and_raise(ActiveRecord::InvalidForeignKey, 'invalid foreign key')
......
...@@ -27,39 +27,6 @@ describe Gitlab::GithubImport do ...@@ -27,39 +27,6 @@ describe Gitlab::GithubImport do
end end
end end
describe '.insert_and_return_id' do
let(:attributes) { { iid: 1, title: 'foo' } }
let(:project) { create(:project) }
context 'on PostgreSQL' do
it 'returns the ID returned by the query' do
expect(Gitlab::Database)
.to receive(:bulk_insert)
.with(Issue.table_name, [attributes], return_ids: true)
.and_return([10])
id = described_class.insert_and_return_id(attributes, project.issues)
expect(id).to eq(10)
end
end
context 'on MySQL' do
it 'uses a separate query to retrieve the ID' do
issue = create(:issue, project: project, iid: attributes[:iid])
expect(Gitlab::Database)
.to receive(:bulk_insert)
.with(Issue.table_name, [attributes], return_ids: true)
.and_return([])
id = described_class.insert_and_return_id(attributes, project.issues)
expect(id).to eq(issue.id)
end
end
end
describe '.ghost_user_id', :clean_gitlab_redis_cache do describe '.ghost_user_id', :clean_gitlab_redis_cache do
it 'returns the ID of the ghost user' do it 'returns the ID of the ghost user' do
expect(described_class.ghost_user_id).to eq(User.ghost.id) expect(described_class.ghost_user_id).to eq(User.ghost.id)
......
# frozen_string_literal: true
require 'spec_helper'
describe Gitlab::Import::DatabaseHelpers do
let(:database_helper) do
Class.new do
include Gitlab::Import::DatabaseHelpers
end
end
subject { database_helper.new }
describe '.insert_and_return_id' do
let(:attributes) { { iid: 1, title: 'foo' } }
let(:project) { create(:project) }
context 'on PostgreSQL' do
it 'returns the ID returned by the query' do
expect(Gitlab::Database)
.to receive(:bulk_insert)
.with(Issue.table_name, [attributes], return_ids: true)
.and_return([10])
id = subject.insert_and_return_id(attributes, project.issues)
expect(id).to eq(10)
end
end
context 'on MySQL' do
it 'uses a separate query to retrieve the ID' do
issue = create(:issue, project: project, iid: attributes[:iid])
expect(Gitlab::Database)
.to receive(:bulk_insert)
.with(Issue.table_name, [attributes], return_ids: true)
.and_return([])
id = subject.insert_and_return_id(attributes, project.issues)
expect(id).to eq(issue.id)
end
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment