Commit 0a374212 authored by Stan Hu's avatar Stan Hu Committed by Douglas Barbosa Alexandre

Link existing LFS objects from parent fork during uploads

Previously LFS uploads would always have to be reuploaded to a fork even
if the parent already had received the LFS file, but this is
unnecessary, wasting time and bandwidth. Consider this sequence of
events:

1. Push LFS file `test.bin` to project A.
2. Fork project A to project B.
3. Push LFS file `test2.bin` to project A.
4. Push to project B.

When 4 happens, GitLab should be smart enough to realize that if the
user has access to the parent project, then we should be able to link
the LFS files in A without requesting a reupload of the file.

Relates to https://gitlab.com/gitlab-org/gitlab/-/issues/297022

Changelog: fixed
parent f5e9dc63
......@@ -76,7 +76,10 @@ module Repositories
existing_oids = project.lfs_objects_oids(oids: objects_oids)
objects.each do |object|
object[:actions] = upload_actions(object) unless existing_oids.include?(object[:oid])
next if existing_oids.include?(object[:oid])
next if should_auto_link? && oids_from_fork.include?(object[:oid]) && link_to_project!(object)
object[:actions] = upload_actions(object)
end
objects
......@@ -150,6 +153,26 @@ module Repositories
Gitlab::LfsToken.new(user).basic_encoding
end
def should_auto_link?
return false unless Feature.enabled?(:lfs_auto_link_fork_source, project)
return false unless project.forked?
# Sanity check in case for some reason the user doesn't have access to the parent
can?(user, :download_code, project.fork_source)
end
def oids_from_fork
@oids_from_fork ||= project.lfs_objects_oids_from_fork_source(oids: objects_oids)
end
def link_to_project!(object)
lfs_object = LfsObject.for_oid_and_size(object[:oid], object[:size])
return unless lfs_object
LfsObjectsProject.link_to_project!(lfs_object, project)
end
end
end
......
......@@ -13,6 +13,7 @@ class LfsObject < ApplicationRecord
scope :with_files_stored_locally, -> { where(file_store: LfsObjectUploader::Store::LOCAL) }
scope :with_files_stored_remotely, -> { where(file_store: LfsObjectUploader::Store::REMOTE) }
scope :for_oids, -> (oids) { where(oid: oids) }
scope :for_oid_and_size, -> (oid, size) { find_by(oid: oid, size: size) }
validates :oid, presence: true, uniqueness: true
......
......@@ -21,9 +21,19 @@ class LfsObjectsProject < ApplicationRecord
scope :project_id_in, ->(ids) { where(project_id: ids) }
scope :lfs_object_in, -> (lfs_objects) { where(lfs_object: lfs_objects) }
def self.link_to_project!(lfs_object, project)
# We can't use an upsert here because there is no uniqueness constraint:
# https://gitlab.com/gitlab-org/gitlab/-/issues/347466
self.safe_find_or_create_by!(lfs_object_id: lfs_object.id, project_id: project.id) # rubocop:disable Performance/ActiveRecordSubtransactionMethods
end
def self.update_statistics_for_project_id(project_id)
ProjectCacheWorker.perform_async(project_id, [], [:lfs_objects_size]) # rubocop:disable CodeReuse/Worker
end
private
def update_project_statistics
ProjectCacheWorker.perform_async(project_id, [], [:lfs_objects_size])
self.class.update_statistics_for_project_id(project_id)
end
end
......@@ -1569,6 +1569,12 @@ class Project < ApplicationRecord
oids(lfs_objects, oids: oids)
end
def lfs_objects_oids_from_fork_source(oids: [])
return [] unless forked?
oids(fork_source.lfs_objects, oids: oids)
end
def personal?
!group
end
......
---
name: lfs_auto_link_fork_source
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/75972
rollout_issue_url:
milestone: '14.6'
type: development
group: group::source code
default_enabled: false
......@@ -25,6 +25,28 @@ RSpec.describe LfsObjectsProject do
end
end
describe '#link_to_project!' do
it 'does not throw error when duplicate exists' do
subject
expect do
result = described_class.link_to_project!(subject.lfs_object, subject.project)
expect(result).to be_a(LfsObjectsProject)
end.not_to change { described_class.count }
end
it 'upserts a new entry and updates the project cache' do
new_project = create(:project)
allow(ProjectCacheWorker).to receive(:perform_async).and_call_original
expect(ProjectCacheWorker).to receive(:perform_async).with(new_project.id, [], [:lfs_objects_size])
expect { described_class.link_to_project!(subject.lfs_object, new_project) }
.to change { described_class.count }
expect(described_class.find_by(lfs_object_id: subject.lfs_object.id, project_id: new_project.id)).to be_present
end
end
describe '#update_project_statistics' do
it 'updates project statistics when the object is added' do
expect(ProjectCacheWorker).to receive(:perform_async)
......
......@@ -3581,6 +3581,29 @@ RSpec.describe Project, factory_default: :keep do
expect(project.forks).to contain_exactly(forked_project)
end
end
describe '#lfs_object_oids_from_fork_source' do
let_it_be(:lfs_object) { create(:lfs_object) }
let_it_be(:another_lfs_object) { create(:lfs_object) }
let(:oids) { [lfs_object.oid, another_lfs_object.oid] }
context 'when fork has one of two LFS objects' do
before do
create(:lfs_objects_project, lfs_object: lfs_object, project: project)
create(:lfs_objects_project, lfs_object: another_lfs_object, project: forked_project)
end
it 'returns OIDs of owned LFS objects', :aggregate_failures do
expect(forked_project.lfs_objects_oids_from_fork_source(oids: oids)).to eq([lfs_object.oid])
expect(forked_project.lfs_objects_oids(oids: oids)).to eq([another_lfs_object.oid])
end
it 'returns empty when project is not a fork' do
expect(project.lfs_objects_oids_from_fork_source(oids: oids)).to eq([])
end
end
end
end
it_behaves_like 'can housekeep repository' do
......
......@@ -518,16 +518,46 @@ RSpec.describe 'Git LFS API and storage' do
end
context 'in source of fork project' do
let(:other_project) { create(:project, :empty_repo) }
let(:project) { fork_project(other_project) }
before do
lfs_object.update!(projects: [other_project])
end
context 'when user has access to both the parent and fork' do
before do
project.add_developer(user)
other_project.add_developer(user)
end
it 'links existing LFS objects to other project' do
expect(json_response['objects']).to be_kind_of(Array)
expect(json_response['objects'].first).to include(sample_object)
expect(json_response['objects'].first).not_to have_key('actions')
expect(lfs_object.reload.projects.pluck(:id)).to match_array([other_project.id, project.id])
end
context 'when feature flag is disabled' do
before do
stub_feature_flags(lfs_auto_link_fork_source: false)
end
it_behaves_like 'batch upload with existing LFS object'
end
end
context 'when user does not have access to parent' do
before do
project.add_developer(user)
end
it_behaves_like 'batch upload with existing LFS object'
end
end
end
context 'when pushing a LFS object that does not exist' do
it_behaves_like 'pushes new LFS objects', renew_authorization: true
end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment