Commit 6f945f20 authored by Nick Thomas's avatar Nick Thomas

Foreground verification of uploads and LFS objects

parent 40c61acb
...@@ -15,4 +15,8 @@ class LfsObject < ActiveRecord::Base ...@@ -15,4 +15,8 @@ class LfsObject < ActiveRecord::Base
.where(lfs_objects_projects: { id: nil }) .where(lfs_objects_projects: { id: nil })
.destroy_all .destroy_all
end end
# Returns the hex-encoded SHA256 digest of the file found at +path+.
#
# path - filesystem path of the file to hash
#
# Raises whatever Digest::SHA256.file raises when the file cannot be read
# (e.g. Errno::ENOENT for a missing file).
def self.calculate_oid(path)
  digest = Digest::SHA256.file(path)
  digest.hexdigest
end
end end
---
title: Foreground verification of uploads and LFS objects
merge_request: 17402
author:
type: added
...@@ -78,34 +78,41 @@ Example output: ...@@ -78,34 +78,41 @@ Example output:
## Uploaded Files Integrity ## Uploaded Files Integrity
The uploads check Rake task will loop through all uploads in the database Various types of files can be uploaded to a GitLab installation by users.
and run two checks to determine the integrity of each file: Checksums are generated and stored in the database upon upload, and integrity
checks using those checksums can be run. These checks also detect missing files.
1. Check if the file exists on the file system. Currently, integrity checks are supported for the following types of files:
1. Check if the checksum of the file on the file system matches the checksum in the database.
* LFS objects
* User uploads
**Omnibus Installation** **Omnibus Installation**
``` ```
sudo gitlab-rake gitlab:lfs:check
sudo gitlab-rake gitlab:uploads:check sudo gitlab-rake gitlab:uploads:check
``` ```
**Source Installation** **Source Installation**
```bash ```bash
sudo -u git -H bundle exec rake gitlab:lfs:check RAILS_ENV=production
sudo -u git -H bundle exec rake gitlab:uploads:check RAILS_ENV=production sudo -u git -H bundle exec rake gitlab:uploads:check RAILS_ENV=production
``` ```
This task also accepts some environment variables which you can use to override These tasks also accept some environment variables which you can use to override
certain values: certain values:
Variable | Type | Description Variable | Type | Description
-------- | ---- | ----------- --------- | ------- | -----------
`BATCH` | integer | Specifies the size of the batch. Defaults to 200. `BATCH` | integer | Specifies the size of the batch. Defaults to 200.
`ID_FROM` | integer | Specifies the ID to start from, inclusive of the value. `ID_FROM` | integer | Specifies the ID to start from, inclusive of the value.
`ID_TO` | integer | Specifies the ID value to end at, inclusive of the value. `ID_TO` | integer | Specifies the ID value to end at, inclusive of the value.
`VERBOSE` | boolean | Causes failures to be listed individually, rather than being summarized.
```bash ```bash
sudo gitlab-rake gitlab:lfs:check BATCH=100 ID_FROM=50 ID_TO=250
sudo gitlab-rake gitlab:uploads:check BATCH=100 ID_FROM=50 ID_TO=250 sudo gitlab-rake gitlab:uploads:check BATCH=100 ID_FROM=50 ID_TO=250
``` ```
......
module Gitlab
  module Verify
    # Abstract base class that walks a set of records in batches and compares
    # each record's stored checksum with a freshly recalculated one.
    #
    # Subclasses must implement #name, #describe, #relation,
    # #expected_checksum and #actual_checksum.
    class BatchVerifier
      attr_reader :batch_size, :start, :finish

      # batch_size - number of records per batch (passed to in_batches as of:)
      # start      - forwarded to in_batches as start: (nil for no lower bound)
      # finish     - forwarded to in_batches as finish: (nil for no upper bound)
      def initialize(batch_size:, start: nil, finish: nil)
        @batch_size = batch_size
        @start = start
        @finish = finish
      end

      # Yields a Range of IDs and a Hash of failed verifications (object => error)
      def run_batches
        relation.in_batches(of: batch_size, start: start, finish: finish) do |batch| # rubocop: disable Cop/InBatches
          range = batch.first.id..batch.last.id
          failures = run_batch(batch)

          yield(range, failures)
        end
      end

      # Human-readable name of the kind of object being verified
      def name
        raise NotImplementedError
      end

      # Human-readable description of a single object, used when reporting
      def describe(_object)
        raise NotImplementedError
      end

      private

      # Verifies every object in the batch and returns a Hash of
      # object => error containing only the objects that failed.
      def run_batch(batch)
        batch.map { |object| verify(object) }.compact.to_h
      end

      # Returns nil when the object verifies, or an [object, error] pair when
      # it does not. Any StandardError raised while verifying (for example an
      # Errno::ENOENT for a missing file) is reported as that object's error.
      def verify(object)
        expected = expected_checksum(object)

        # Check the stored checksum first: recalculating the actual checksum
        # is the expensive step and is pointless when there is nothing to
        # compare it against.
        raise 'Checksum missing' unless expected.present?
        raise 'Checksum mismatch' unless expected == actual_checksum(object)

        nil
      rescue => err
        [object, err]
      end

      # This should return an ActiveRecord::Relation suitable for calling #in_batches on
      def relation
        raise NotImplementedError
      end

      # The checksum we expect the object to have
      def expected_checksum(_object)
        raise NotImplementedError
      end

      # The freshly-recalculated checksum of the object
      def actual_checksum(_object)
        raise NotImplementedError
      end
    end
  end
end
module Gitlab
  module Verify
    # BatchVerifier for LFS objects: the stored OID of each LfsObject is
    # compared with the OID recalculated from its file on disk.
    class LfsObjects < BatchVerifier
      # Label used when announcing what is being checked
      def name
        'LFS objects'
      end

      # Identifies a single LFS object by its OID
      def describe(object)
        oid = object.oid

        "LFS object: #{oid}"
      end

      private

      # Every LFS object is subject to verification
      def relation
        LfsObject.all
      end

      # The OID recorded on the LfsObject record
      def expected_checksum(object)
        object.oid
      end

      # The OID recomputed from the object's file path
      def actual_checksum(object)
        path = object.file.path

        LfsObject.calculate_oid(path)
      end
    end
  end
end
module Gitlab
  module Verify
    # Runs a verifier and prints a per-batch summary (plus the individual
    # failures when verbose) to standard output.
    class RakeTask
      attr_reader :verifier, :output

      # Builds a verifier of the given class from the BATCH, ID_FROM and ID_TO
      # environment variables, reads VERBOSE, and runs the task.
      #
      # verify_kls - class responding to .new(batch_size:, start:, finish:)
      def self.run!(verify_kls)
        options = {
          batch_size: ENV.fetch('BATCH', 200).to_i,
          start: ENV['ID_FROM'],
          finish: ENV['ID_TO']
        }

        task = new(verify_kls.new(**options), Gitlab::Utils.to_boolean(ENV['VERBOSE']))
        task.run!
      end

      # verifier - object responding to #name, #describe and #run_batches
      # verbose  - truthy to list each failure individually
      def initialize(verifier, verbose)
        @verifier = verifier
        @verbose = verbose
      end

      # Prints a header, one report per batch, then a completion message
      def run!
        say "Checking integrity of #{verifier.name}"

        verifier.run_batches(&method(:run_batch))

        say 'Done!'
      end

      # Always returns a strict boolean
      def verbose?
        @verbose ? true : false
      end

      private

      def say(text)
        puts(text) # rubocop:disable Rails/Output
      end

      # Prints the failure count for the batch (green when clean, red
      # otherwise) and, in verbose mode, one red line per failed object.
      def run_batch(range, failures)
        status_color =
          if failures.empty?
            :green
          else
            :red
          end

        summary = "- #{range}: Failures: #{failures.count}"
        say summary.color(status_color)

        if verbose?
          failures.each do |object, error|
            detail = " - #{verifier.describe(object)}: #{error.inspect}"
            say detail.color(:red)
          end
        end
      end
    end
  end
end
module Gitlab
  module Verify
    # BatchVerifier for uploads: the checksum stored on each Upload is
    # compared with the digest recalculated from its file on disk.
    class Uploads < BatchVerifier
      # Label used when announcing what is being checked
      def name
        'Uploads'
      end

      # Identifies a single upload by its ID
      def describe(object)
        upload_id = object.id

        "Upload: #{upload_id}"
      end

      private

      # Every upload is subject to verification
      def relation
        Upload.all
      end

      # The checksum recorded on the Upload record
      def expected_checksum(object)
        object.checksum
      end

      # The digest recomputed from the upload's absolute path
      def actual_checksum(object)
        path = object.absolute_path

        Upload.hexdigest(path)
      end
    end
  end
end
namespace(:gitlab) do
  namespace(:lfs) do
    desc 'GitLab | LFS | Check integrity of uploaded LFS objects'
    # Delegates to the shared rake front end with the LFS object verifier.
    task(check: :environment) do
      verifier_class = Gitlab::Verify::LfsObjects

      Gitlab::Verify::RakeTask.run!(verifier_class)
    end
  end
end
namespace :gitlab do
  namespace :uploads do
    desc 'GitLab | Uploads | Check integrity of uploaded files'
    # Walks every upload in batches, reporting files that are missing from
    # disk or whose recalculated checksum differs from the stored one.
    task check: :environment do
      puts 'Checking integrity of uploaded files'

      uploads_batches do |uploads|
        uploads.each do |upload|
          puts "- Checking file (#{upload.id}): #{upload.absolute_path}".color(:green)

          unless upload.exist?
            puts " * File does not exist on the file system".color(:red)
            next
          end

          check_checksum(upload)
        end
      end

      puts 'Done!'
    end

    # Batch size taken from the BATCH environment variable (200 when unset)
    def batch_size
      ENV.fetch('BATCH') { 200 }.to_i
    end

    # Hex-encoded SHA256 digest of the file at absolute_path
    def calculate_checksum(absolute_path)
      digest = Digest::SHA256.file(absolute_path)
      digest.hexdigest
    end

    # Prints a red warning when the file's checksum differs from the one
    # stored on the upload record; prints nothing when they agree.
    def check_checksum(upload)
      checksum = calculate_checksum(upload.absolute_path)
      return if checksum == upload.checksum

      puts " * File checksum (#{checksum}) does not match the one in the database (#{upload.checksum})".color(:red)
    end

    # Yields each batch of uploads, bounded by the ID_FROM / ID_TO
    # environment variables when they are set.
    def uploads_batches(&block)
      id_from = ENV['ID_FROM']
      id_to = ENV['ID_TO']

      Upload.all.in_batches(of: batch_size, start: id_from, finish: id_to) { |batch| yield batch } # rubocop: disable Cop/InBatches
    end
  end
end
namespace(:gitlab) do
  namespace(:uploads) do
    desc 'GitLab | Uploads | Check integrity of uploaded files'
    # Delegates to the shared rake front end with the uploads verifier.
    task(check: :environment) do
      verifier_class = Gitlab::Verify::Uploads

      Gitlab::Verify::RakeTask.run!(verifier_class)
    end
  end
end
...@@ -9,4 +9,10 @@ FactoryBot.define do ...@@ -9,4 +9,10 @@ FactoryBot.define do
trait :with_file do trait :with_file do
file { fixture_file_upload(Rails.root + "spec/fixtures/dk.png", "`/png") } file { fixture_file_upload(Rails.root + "spec/fixtures/dk.png", "`/png") }
end end
# The uniqueness constraint means we can't use the correct OID for all LFS
# objects, so the test needs to decide which (if any) object gets it
trait :correct_oid do
  # Block (dynamic) attribute form, matching `file { ... }` in this factory;
  # the static `oid '...'` form is deprecated in later FactoryBot releases.
  oid { 'b804383982bb89b00e828e3f44c038cc991d3d1768009fc39ba8e2c081b9fb75' }
end
end end
require 'spec_helper'
# Specs for the LFS object verifier: shared batching behaviour plus the
# pass / missing-file / mismatched-content outcomes of #run_batches.
describe Gitlab::Verify::LfsObjects do
include GitlabVerifyHelpers
# The shared examples index objects[0..2], so three records are created eagerly.
it_behaves_like 'Gitlab::Verify::BatchVerifier subclass' do
let!(:objects) { create_list(:lfs_object, 3, :with_file) }
end
describe '#run_batches' do
# `failures` is lazy: it runs the verifier only when first referenced, i.e.
# after each example has tampered with the file on disk.
let(:failures) { collect_failures }
let(:failure) { failures[lfs_object] }
# Created eagerly so the record and its file exist before the example body runs.
let!(:lfs_object) { create(:lfs_object, :with_file, :correct_oid) }
it 'passes LFS objects with the correct file' do
expect(failures).to eq({})
end
it 'fails LFS objects with a missing file' do
FileUtils.rm_f(lfs_object.file.path)
expect(failures.keys).to contain_exactly(lfs_object)
expect(failure).to be_a(Errno::ENOENT)
expect(failure.to_s).to include(lfs_object.file.path)
end
it 'fails LFS objects with a mismatched oid' do
# Emptying the file changes its digest while leaving the stored OID untouched.
File.truncate(lfs_object.file.path, 0)
expect(failures.keys).to contain_exactly(lfs_object)
expect(failure.to_s).to include('Checksum mismatch')
end
end
end
require 'spec_helper'
# Specs for the uploads verifier: shared batching behaviour plus the pass,
# missing-file, mismatched-checksum and missing-checksum outcomes of #run_batches.
describe Gitlab::Verify::Uploads do
include GitlabVerifyHelpers
# The shared examples index objects[0..2]; the uploads come from three
# projects created with avatars.
it_behaves_like 'Gitlab::Verify::BatchVerifier subclass' do
let(:projects) { create_list(:project, 3, :with_avatar) }
let!(:objects) { projects.flat_map(&:uploads) }
end
describe '#run_batches' do
let(:project) { create(:project, :with_avatar) }
# `failures` is lazy: it runs the verifier only when first referenced, i.e.
# after each example has modified the upload or its file.
let(:failures) { collect_failures }
let(:failure) { failures[upload] }
# Eager, so the project and its upload exist before the example body runs.
let!(:upload) { project.uploads.first }
it 'passes uploads with the correct file' do
expect(failures).to eq({})
end
it 'fails uploads with a missing file' do
FileUtils.rm_f(upload.absolute_path)
expect(failures.keys).to contain_exactly(upload)
expect(failure).to be_a(Errno::ENOENT)
expect(failure.to_s).to include(upload.absolute_path)
end
it 'fails uploads with a mismatched checksum' do
upload.update!(checksum: 'something incorrect')
expect(failures.keys).to contain_exactly(upload)
expect(failure.to_s).to include('Checksum mismatch')
end
it 'fails uploads with a missing precalculated checksum' do
# An empty stored checksum is reported as missing rather than as a mismatch.
upload.update!(checksum: '')
expect(failures.keys).to contain_exactly(upload)
expect(failure.to_s).to include('Checksum missing')
end
end
end
# Shared batching examples for BatchVerifier subclasses.
#
# The including context must define `objects` with at least three persisted
# records and include GitlabVerifyHelpers (for collect_ranges).
RSpec.shared_examples 'Gitlab::Verify::BatchVerifier subclass' do
describe 'batching' do
# collect_ranges uses a batch size of 1, so every batch range starts and
# ends on the same record ID.
let(:first_batch) { objects[0].id..objects[0].id }
let(:second_batch) { objects[1].id..objects[1].id }
let(:third_batch) { objects[2].id..objects[2].id }
it 'iterates through objects in batches' do
expect(collect_ranges).to eq([first_batch, second_batch, third_batch])
end
it 'allows the starting ID to be specified' do
expect(collect_ranges(start: second_batch.first)).to eq([second_batch, third_batch])
end
it 'allows the finishing ID to be specified' do
expect(collect_ranges(finish: second_batch.last)).to eq([first_batch, second_batch])
end
end
end
# Spec helpers for exercising verifier classes through #run_batches.
# They rely on `described_class` being provided by the including context.
module GitlabVerifyHelpers
  # Runs the described verifier with a batch size of 1 and returns the ID
  # Range yielded for each batch.
  #
  # args - extra keyword options for the verifier (e.g. start:, finish:)
  def collect_ranges(args = {})
    # Double-splat so the options reach the keyword-only initializer as
    # keywords; a positional Hash raises ArgumentError on Ruby 3+.
    verifier = described_class.new(**args.merge(batch_size: 1))

    collect_results(verifier).map(&:first)
  end

  # Runs the described verifier with a batch size of 1 and returns a single
  # Hash merging the failures (object => error) reported by every batch.
  def collect_failures
    verifier = described_class.new(batch_size: 1)

    collect_results(verifier).each_with_object({}) do |(_, failures), out|
      out.merge!(failures)
    end
  end

  # Returns an Array with one entry per batch, each entry being the Array of
  # arguments the verifier yielded for that batch.
  def collect_results(verifier)
    out = []

    verifier.run_batches { |*args| out << args }

    out
  end
end
require 'rake_helper'
# End-to-end specs for the gitlab:lfs:check rake task, asserting on what the
# task prints to stdout.
describe 'gitlab:lfs rake tasks' do
describe 'check' do
let!(:lfs_object) { create(:lfs_object, :with_file, :correct_oid) }
before do
Rake.application.rake_require('tasks/gitlab/lfs/check')
# VERBOSE makes the task list individual failures, which the error
# message expectations below match against.
stub_env('VERBOSE' => 'true')
end
it 'outputs the integrity check for each batch' do
expect { run_rake_task('gitlab:lfs:check') }.to output(/Failures: 0/).to_stdout
end
it 'errors out about missing files on the file system' do
FileUtils.rm_f(lfs_object.file.path)
expect { run_rake_task('gitlab:lfs:check') }.to output(/No such file.*#{Regexp.quote(lfs_object.file.path)}/).to_stdout
end
it 'errors out about invalid checksum' do
# Emptying the file changes its digest while leaving the stored OID untouched.
File.truncate(lfs_object.file.path, 0)
expect { run_rake_task('gitlab:lfs:check') }.to output(/Checksum mismatch/).to_stdout
end
end
end
...@@ -5,23 +5,24 @@ describe 'gitlab:uploads rake tasks' do ...@@ -5,23 +5,24 @@ describe 'gitlab:uploads rake tasks' do
let!(:upload) { create(:upload, path: Rails.root.join('spec/fixtures/banana_sample.gif')) } let!(:upload) { create(:upload, path: Rails.root.join('spec/fixtures/banana_sample.gif')) }
before do before do
Rake.application.rake_require 'tasks/gitlab/uploads' Rake.application.rake_require('tasks/gitlab/uploads/check')
stub_env('VERBOSE' => 'true')
end end
it 'outputs the integrity check for each uploaded file' do it 'outputs the integrity check for each batch' do
expect { run_rake_task('gitlab:uploads:check') }.to output(/Checking file \(#{upload.id}\): #{Regexp.quote(upload.absolute_path)}/).to_stdout expect { run_rake_task('gitlab:uploads:check') }.to output(/Failures: 0/).to_stdout
end end
it 'errors out about missing files on the file system' do it 'errors out about missing files on the file system' do
create(:upload) missing_upload = create(:upload)
expect { run_rake_task('gitlab:uploads:check') }.to output(/File does not exist on the file system/).to_stdout expect { run_rake_task('gitlab:uploads:check') }.to output(/No such file.*#{Regexp.quote(missing_upload.absolute_path)}/).to_stdout
end end
it 'errors out about invalid checksum' do it 'errors out about invalid checksum' do
upload.update_column(:checksum, '01a3156db2cf4f67ec823680b40b7302f89ab39179124ad219f94919b8a1769e') upload.update_column(:checksum, '01a3156db2cf4f67ec823680b40b7302f89ab39179124ad219f94919b8a1769e')
expect { run_rake_task('gitlab:uploads:check') }.to output(/File checksum \(9e697aa09fe196909813ee36103e34f721fe47a5fdc8aac0e4e4ac47b9b38282\) does not match the one in the database \(#{upload.checksum}\)/).to_stdout expect { run_rake_task('gitlab:uploads:check') }.to output(/Checksum mismatch/).to_stdout
end end
end end
end end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment