Commit 90886008 authored by Douwe Maan's avatar Douwe Maan

Merge branch '30093-apply-bfg-object-map-to-database' into 'master'

Remove cleaned up OIDs from database and cache

Closes #30093

See merge request gitlab-org/gitlab-ce!26555
parents 0cc30909 8973f32d
...@@ -51,6 +51,10 @@ class MergeRequestDiff < ApplicationRecord ...@@ -51,6 +51,10 @@ class MergeRequestDiff < ApplicationRecord
joins(:merge_request_diff_commits).where(merge_request_diff_commits: { sha: sha }).reorder(nil) joins(:merge_request_diff_commits).where(merge_request_diff_commits: { sha: sha }).reorder(nil)
end end
scope :by_project_id, -> (project_id) do
joins(:merge_request).where(merge_requests: { target_project_id: project_id })
end
scope :recent, -> { order(id: :desc).limit(100) } scope :recent, -> { order(id: :desc).limit(100) }
scope :files_in_database, -> { where(stored_externally: [false, nil]) } scope :files_in_database, -> { where(stored_externally: [false, nil]) }
......
...@@ -7,6 +7,10 @@ class NoteDiffFile < ApplicationRecord ...@@ -7,6 +7,10 @@ class NoteDiffFile < ApplicationRecord
joins(:diff_note).where("resolved_at IS NULL OR noteable_type = 'Commit'") joins(:diff_note).where("resolved_at IS NULL OR noteable_type = 'Commit'")
end end
scope :referencing_sha, -> (oids, project_id:) do
joins(:diff_note).where(notes: { project_id: project_id, commit_id: oids })
end
delegate :original_position, :project, to: :diff_note delegate :original_position, :project, to: :diff_note
belongs_to :diff_note, inverse_of: :note_diff_file belongs_to :diff_note, inverse_of: :note_diff_file
......
...@@ -18,9 +18,6 @@ module Projects ...@@ -18,9 +18,6 @@ module Projects
# per rewritten object, with the old and new SHAs space-separated. It can be # per rewritten object, with the old and new SHAs space-separated. It can be
# used to update or remove content that references the objects that BFG has # used to update or remove content that references the objects that BFG has
# altered # altered
#
# Currently, only the project repository is modified by this service, but we
# may wish to modify other data sources in the future.
def execute def execute
apply_bfg_object_map! apply_bfg_object_map!
...@@ -41,10 +38,52 @@ module Projects ...@@ -41,10 +38,52 @@ module Projects
raise NoUploadError unless project.bfg_object_map.exists? raise NoUploadError unless project.bfg_object_map.exists?
project.bfg_object_map.open do |io| project.bfg_object_map.open do |io|
repository_cleaner.apply_bfg_object_map(io) repository_cleaner.apply_bfg_object_map_stream(io) do |response|
cleanup_diffs(response)
end
end
end
def cleanup_diffs(response)
old_commit_shas = extract_old_commit_shas(response.entries)
ActiveRecord::Base.transaction do
cleanup_merge_request_diffs(old_commit_shas)
cleanup_note_diff_files(old_commit_shas)
end end
end end
def extract_old_commit_shas(batch)
batch.lazy.select { |entry| entry.type == :COMMIT }.map(&:old_oid).force
end
def cleanup_merge_request_diffs(old_commit_shas)
merge_request_diffs = MergeRequestDiff
.by_project_id(project.id)
.by_commit_sha(old_commit_shas)
# It's important to run the ActiveRecord callbacks here
merge_request_diffs.destroy_all # rubocop:disable Cop/DestroyAll
# TODO: ensure the highlight cache is removed immediately. It's too hard
# to calculate the Redis keys at present.
#
# https://gitlab.com/gitlab-org/gitlab-ce/issues/61115
end
def cleanup_note_diff_files(old_commit_shas)
# Pluck the IDs instead of running the query twice to ensure we clear the
# cache for exactly the note diffs we remove
ids = NoteDiffFile
.referencing_sha(old_commit_shas, project_id: project.id)
.pluck_primary_key
NoteDiffFile.id_in(ids).delete_all
# A highlighted version of the diff is stored in redis. Remove it now.
Gitlab::DiscussionsDiff::HighlightCache.clear_multiple(ids)
end
def repository_cleaner def repository_cleaner
@repository_cleaner ||= Gitlab::Git::RepositoryCleaner.new(repository.raw) @repository_cleaner ||= Gitlab::Git::RepositoryCleaner.new(repository.raw)
end end
......
---
title: Remove cleaned up OIDs from database and cache
merge_request: 26555
author:
type: added
...@@ -98,6 +98,12 @@ up its own internal state, maximizing the space saved. ...@@ -98,6 +98,12 @@ up its own internal state, maximizing the space saved.
`git gc` against the repository. You will receive an email once it has `git gc` against the repository. You will receive an email once it has
completed. completed.
This process will remove some copies of the rewritten commits from GitLab's
cache and database, but there are still numerous gaps in coverage - at present,
some of the copies may persist indefinitely. [Clearing the instance cache]
(../../../administration/raketasks/maintenance.md#clear-redis-cache) may help to
remove some of them, but it should not be depended on for security purposes!
## Using `git filter-branch` ## Using `git filter-branch`
1. Navigate to your repository: 1. Navigate to your repository:
......
...@@ -52,6 +52,19 @@ module Gitlab ...@@ -52,6 +52,19 @@ module Gitlab
end end
end end
# Clears multiple cache keys at once.
#
# raw_keys - An Array of unique cache keys, without namespaces.
#
# It returns the number of cache keys cleared. Ex.: 42
def clear_multiple(raw_keys)
return [] if raw_keys.empty?
keys = raw_keys.map { |id| cache_key_for(id) }
Redis::Cache.with { |redis| redis.del(keys) }
end
def cache_key_for(raw_key) def cache_key_for(raw_key)
"#{cache_key_prefix}:#{raw_key}" "#{cache_key_prefix}:#{raw_key}"
end end
......
...@@ -12,9 +12,9 @@ module Gitlab ...@@ -12,9 +12,9 @@ module Gitlab
@repository = repository @repository = repository
end end
def apply_bfg_object_map(io) def apply_bfg_object_map_stream(io, &blk)
wrapped_gitaly_errors do wrapped_gitaly_errors do
gitaly_cleanup_client.apply_bfg_object_map(io) gitaly_cleanup_client.apply_bfg_object_map_stream(io, &blk)
end end
end end
......
...@@ -12,25 +12,32 @@ module Gitlab ...@@ -12,25 +12,32 @@ module Gitlab
@storage = repository.storage @storage = repository.storage
end end
def apply_bfg_object_map(io) def apply_bfg_object_map_stream(io, &blk)
first_request = Gitaly::ApplyBfgObjectMapRequest.new(repository: gitaly_repo) responses = GitalyClient.call(
storage,
:cleanup_service,
:apply_bfg_object_map_stream,
build_object_map_enum(io),
timeout: GitalyClient.no_timeout
)
responses.each(&blk)
end
private
enum = Enumerator.new do |y| def build_object_map_enum(io)
y.yield first_request Enumerator.new do |y|
# First request. For simplicity, doesn't include any object map data
y << Gitaly::ApplyBfgObjectMapStreamRequest.new(repository: gitaly_repo)
# Now stream the BFG object map file to gitaly in chunks
while data = io.read(RepositoryService::MAX_MSG_SIZE) while data = io.read(RepositoryService::MAX_MSG_SIZE)
y.yield Gitaly::ApplyBfgObjectMapRequest.new(object_map: data) y << Gitaly::ApplyBfgObjectMapStreamRequest.new(object_map: data)
break if io&.eof? break if io&.eof?
end end
end end
GitalyClient.call(
storage,
:cleanup_service,
:apply_bfg_object_map,
enum,
timeout: GitalyClient.no_timeout
)
end end
end end
end end
......
...@@ -3,31 +3,32 @@ ...@@ -3,31 +3,32 @@
require 'spec_helper' require 'spec_helper'
describe Gitlab::DiscussionsDiff::HighlightCache, :clean_gitlab_redis_cache do describe Gitlab::DiscussionsDiff::HighlightCache, :clean_gitlab_redis_cache do
def fake_file(offset)
{
text: 'foo',
type: 'new',
index: 2 + offset,
old_pos: 10 + offset,
new_pos: 11 + offset,
line_code: 'xpto',
rich_text: '<blips>blops</blips>'
}
end
let(:mapping) do
{
3 => [
fake_file(0),
fake_file(1)
],
4 => [
fake_file(2)
]
}
end
describe '#write_multiple' do describe '#write_multiple' do
it 'sets multiple keys serializing content as JSON' do it 'sets multiple keys serializing content as JSON' do
mapping = {
3 => [
{
text: 'foo',
type: 'new',
index: 2,
old_pos: 10,
new_pos: 11,
line_code: 'xpto',
rich_text: '<blips>blops</blips>'
},
{
text: 'foo',
type: 'new',
index: 3,
old_pos: 11,
new_pos: 12,
line_code: 'xpto',
rich_text: '<blops>blips</blops>'
}
]
}
described_class.write_multiple(mapping) described_class.write_multiple(mapping)
mapping.each do |key, value| mapping.each do |key, value|
...@@ -41,53 +42,16 @@ describe Gitlab::DiscussionsDiff::HighlightCache, :clean_gitlab_redis_cache do ...@@ -41,53 +42,16 @@ describe Gitlab::DiscussionsDiff::HighlightCache, :clean_gitlab_redis_cache do
describe '#read_multiple' do describe '#read_multiple' do
it 'reads multiple keys and serializes content into Gitlab::Diff::Line objects' do it 'reads multiple keys and serializes content into Gitlab::Diff::Line objects' do
mapping = {
3 => [
{
text: 'foo',
type: 'new',
index: 2,
old_pos: 11,
new_pos: 12,
line_code: 'xpto',
rich_text: '<blips>blops</blips>'
},
{
text: 'foo',
type: 'new',
index: 3,
old_pos: 10,
new_pos: 11,
line_code: 'xpto',
rich_text: '<blips>blops</blips>'
}
]
}
described_class.write_multiple(mapping) described_class.write_multiple(mapping)
found = described_class.read_multiple(mapping.keys) found = described_class.read_multiple(mapping.keys)
expect(found.size).to eq(1) expect(found.size).to eq(2)
expect(found.first.size).to eq(2) expect(found.first.size).to eq(2)
expect(found.first).to all(be_a(Gitlab::Diff::Line)) expect(found.first).to all(be_a(Gitlab::Diff::Line))
end end
it 'returns nil when cached key is not found' do it 'returns nil when cached key is not found' do
mapping = {
3 => [
{
text: 'foo',
type: 'new',
index: 2,
old_pos: 11,
new_pos: 12,
line_code: 'xpto',
rich_text: '<blips>blops</blips>'
}
]
}
described_class.write_multiple(mapping) described_class.write_multiple(mapping)
found = described_class.read_multiple([2, 3]) found = described_class.read_multiple([2, 3])
...@@ -95,8 +59,30 @@ describe Gitlab::DiscussionsDiff::HighlightCache, :clean_gitlab_redis_cache do ...@@ -95,8 +59,30 @@ describe Gitlab::DiscussionsDiff::HighlightCache, :clean_gitlab_redis_cache do
expect(found.size).to eq(2) expect(found.size).to eq(2)
expect(found.first).to eq(nil) expect(found.first).to eq(nil)
expect(found.second.size).to eq(1) expect(found.second.size).to eq(2)
expect(found.second).to all(be_a(Gitlab::Diff::Line)) expect(found.second).to all(be_a(Gitlab::Diff::Line))
end end
end end
describe '#clear_multiple' do
it 'removes all named keys' do
described_class.write_multiple(mapping)
described_class.clear_multiple(mapping.keys)
expect(described_class.read_multiple(mapping.keys)).to all(be_nil)
end
it 'only removed named keys' do
to_clear, to_leave = mapping.keys
described_class.write_multiple(mapping)
described_class.clear_multiple([to_clear])
cleared, left = described_class.read_multiple([to_clear, to_leave])
expect(cleared).to be_nil
expect(left).to all(be_a(Gitlab::Diff::Line))
end
end
end end
...@@ -6,55 +6,62 @@ describe Gitlab::Git::RepositoryCleaner do ...@@ -6,55 +6,62 @@ describe Gitlab::Git::RepositoryCleaner do
let(:project) { create(:project, :repository) } let(:project) { create(:project, :repository) }
let(:repository) { project.repository } let(:repository) { project.repository }
let(:head_sha) { repository.head_commit.id } let(:head_sha) { repository.head_commit.id }
let(:object_map_data) { "#{head_sha} #{'0' * 40}" } let(:object_map_data) { "#{head_sha} #{Gitlab::Git::BLANK_SHA}" }
subject(:cleaner) { described_class.new(repository.raw) } let(:clean_refs) { %W[refs/environments/1 refs/merge-requests/1 refs/keep-around/#{head_sha}] }
let(:keep_refs) { %w[refs/heads/_keep refs/tags/_keep] }
describe '#apply_bfg_object_map' do subject(:cleaner) { described_class.new(repository.raw) }
let(:clean_refs) { %W[refs/environments/1 refs/merge-requests/1 refs/keep-around/#{head_sha}] }
let(:keep_refs) { %w[refs/heads/_keep refs/tags/_keep] }
shared_examples_for '#apply_bfg_object_map_stream' do
before do before do
(clean_refs + keep_refs).each { |ref| repository.create_ref(head_sha, ref) } (clean_refs + keep_refs).each { |ref| repository.create_ref(head_sha, ref) }
end end
context 'from StringIO' do it 'removes internal references' do
let(:object_map) { StringIO.new(object_map_data) } entries = []
it 'removes internal references' do cleaner.apply_bfg_object_map_stream(object_map) do |rsp|
cleaner.apply_bfg_object_map(object_map) entries.concat(rsp.entries)
end
aggregate_failures do aggregate_failures do
clean_refs.each { |ref| expect(repository.ref_exists?(ref)).to be_falsy } clean_refs.each { |ref| expect(repository.ref_exists?(ref)).to be(false) }
keep_refs.each { |ref| expect(repository.ref_exists?(ref)).to be_truthy } keep_refs.each { |ref| expect(repository.ref_exists?(ref)).to be(true) }
end
expect(entries).to contain_exactly(
Gitaly::ApplyBfgObjectMapStreamResponse::Entry.new(
type: :COMMIT,
old_oid: head_sha,
new_oid: Gitlab::Git::BLANK_SHA
)
)
end end
end end
end
context 'from Gitlab::HttpIO' do describe '#apply_bfg_object_map_stream (from StringIO)' do
let(:url) { 'http://example.com/bfg_object_map.txt' } let(:object_map) { StringIO.new(object_map_data) }
let(:tempfile) { Tempfile.new }
let(:object_map) { Gitlab::HttpIO.new(url, object_map_data.size) }
around do |example| include_examples '#apply_bfg_object_map_stream'
tempfile.write(object_map_data) end
tempfile.close
example.run describe '#apply_bfg_object_map_stream (from Gitlab::HttpIO)' do
ensure let(:url) { 'http://example.com/bfg_object_map.txt' }
tempfile.unlink let(:tempfile) { Tempfile.new }
end let(:object_map) { Gitlab::HttpIO.new(url, object_map_data.size) }
it 'removes internal references' do around do |example|
stub_remote_url_200(url, tempfile.path) tempfile.write(object_map_data)
tempfile.close
cleaner.apply_bfg_object_map(object_map) stub_remote_url_200(url, tempfile.path)
aggregate_failures do example.run
clean_refs.each { |ref| expect(repository.ref_exists?(ref)).to be_falsy } ensure
keep_refs.each { |ref| expect(repository.ref_exists?(ref)).to be_truthy } tempfile.unlink
end
end
end end
include_examples '#apply_bfg_object_map_stream'
end end
end end
...@@ -6,14 +6,14 @@ describe Gitlab::GitalyClient::CleanupService do ...@@ -6,14 +6,14 @@ describe Gitlab::GitalyClient::CleanupService do
let(:relative_path) { project.disk_path + '.git' } let(:relative_path) { project.disk_path + '.git' }
let(:client) { described_class.new(project.repository) } let(:client) { described_class.new(project.repository) }
describe '#apply_bfg_object_map' do describe '#apply_bfg_object_map_stream' do
it 'sends an apply_bfg_object_map message' do it 'sends an apply_bfg_object_map_stream message' do
expect_any_instance_of(Gitaly::CleanupService::Stub) expect_any_instance_of(Gitaly::CleanupService::Stub)
.to receive(:apply_bfg_object_map) .to receive(:apply_bfg_object_map_stream)
.with(kind_of(Enumerator), kind_of(Hash)) .with(kind_of(Enumerator), kind_of(Hash))
.and_return(double) .and_return([])
client.apply_bfg_object_map(StringIO.new) client.apply_bfg_object_map_stream(StringIO.new)
end end
end end
end end
...@@ -10,4 +10,31 @@ describe NoteDiffFile do ...@@ -10,4 +10,31 @@ describe NoteDiffFile do
describe 'validations' do describe 'validations' do
it { is_expected.to validate_presence_of(:diff_note) } it { is_expected.to validate_presence_of(:diff_note) }
end end
describe '.referencing_sha' do
let!(:diff_note) { create(:diff_note_on_commit) }
let(:note_diff_file) { diff_note.note_diff_file }
let(:project) { diff_note.project }
it 'finds note diff files by project and sha' do
found = described_class.referencing_sha(diff_note.commit_id, project_id: project.id)
expect(found).to contain_exactly(note_diff_file)
end
it 'excludes note diff files with the wrong project' do
other_project = create(:project)
found = described_class.referencing_sha(diff_note.commit_id, project_id: other_project.id)
expect(found).to be_empty
end
it 'excludes note diff files with the wrong sha' do
found = described_class.referencing_sha(Gitlab::Git::BLANK_SHA, project_id: project.id)
expect(found).to be_empty
end
end
end end
...@@ -6,13 +6,13 @@ describe Projects::CleanupService do ...@@ -6,13 +6,13 @@ describe Projects::CleanupService do
let(:project) { create(:project, :repository, bfg_object_map: fixture_file_upload('spec/fixtures/bfg_object_map.txt')) } let(:project) { create(:project, :repository, bfg_object_map: fixture_file_upload('spec/fixtures/bfg_object_map.txt')) }
let(:object_map) { project.bfg_object_map } let(:object_map) { project.bfg_object_map }
let(:cleaner) { service.__send__(:repository_cleaner) }
subject(:service) { described_class.new(project) } subject(:service) { described_class.new(project) }
describe '#execute' do describe '#execute' do
it 'runs the apply_bfg_object_map gitaly RPC' do it 'runs the apply_bfg_object_map_stream gitaly RPC' do
expect_next_instance_of(Gitlab::Git::RepositoryCleaner) do |cleaner| expect(cleaner).to receive(:apply_bfg_object_map_stream).with(kind_of(IO))
expect(cleaner).to receive(:apply_bfg_object_map).with(kind_of(IO))
end
service.execute service.execute
end end
...@@ -37,10 +37,91 @@ describe Projects::CleanupService do ...@@ -37,10 +37,91 @@ describe Projects::CleanupService do
expect(object_map.exists?).to be_falsy expect(object_map.exists?).to be_falsy
end end
context 'with a tainted merge request diff' do
let(:merge_request) { create(:merge_request, source_project: project, target_project: project) }
let(:diff) { merge_request.merge_request_diff }
let(:entry) { build_entry(diff.commits.first.id) }
before do
allow(cleaner)
.to receive(:apply_bfg_object_map_stream)
.and_yield(Gitaly::ApplyBfgObjectMapStreamResponse.new(entries: [entry]))
end
it 'removes the tainted commit from the database' do
service.execute
expect(MergeRequestDiff.exists?(diff.id)).to be_falsy
end
it 'ignores non-commit responses from Gitaly' do
entry.type = :UNKNOWN
service.execute
expect(MergeRequestDiff.exists?(diff.id)).to be_truthy
end
end
context 'with a tainted diff note' do
let(:diff_note) { create(:diff_note_on_commit, project: project) }
let(:note_diff_file) { diff_note.note_diff_file }
let(:entry) { build_entry(diff_note.commit_id) }
let(:highlight_cache) { Gitlab::DiscussionsDiff::HighlightCache }
let(:cache_id) { note_diff_file.id }
before do
allow(cleaner)
.to receive(:apply_bfg_object_map_stream)
.and_yield(Gitaly::ApplyBfgObjectMapStreamResponse.new(entries: [entry]))
end
it 'removes the tainted commit from the database' do
service.execute
expect(NoteDiffFile.exists?(note_diff_file.id)).to be_falsy
end
it 'removes the highlight cache from redis' do
write_cache(highlight_cache, cache_id, [{}])
expect(read_cache(highlight_cache, cache_id)).not_to be_nil
service.execute
expect(read_cache(highlight_cache, cache_id)).to be_nil
end
it 'ignores non-commit responses from Gitaly' do
entry.type = :UNKNOWN
service.execute
expect(NoteDiffFile.exists?(note_diff_file.id)).to be_truthy
end
end
it 'raises an error if no object map can be found' do it 'raises an error if no object map can be found' do
object_map.remove! object_map.remove!
expect { service.execute }.to raise_error(described_class::NoUploadError) expect { service.execute }.to raise_error(described_class::NoUploadError)
end end
end end
def build_entry(old_oid)
Gitaly::ApplyBfgObjectMapStreamResponse::Entry.new(
type: :COMMIT,
old_oid: old_oid,
new_oid: Gitlab::Git::BLANK_SHA
)
end
def read_cache(cache, key)
cache.read_multiple([key]).first
end
def write_cache(cache, key, value)
cache.write_multiple(key => value)
end
end end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment