Commit 03b29ac7 authored by Michał Zając's avatar Michał Zając Committed by Adam Hegyi

Remove duplicates from vulnerability_occurrences

When trying to introduce UUIDv5 in vulnerability_occurrences table we
discovered rows that would introduce duplicate values in the UUID
column. This would violate the UNIQUE constraint we have on that column.
We need to remove those rows before recalculating UUIDs for older
findings.
parent 56315676
---
title: Remove duplicates from vulnerability_occurrences
merge_request: 49937
author:
type: other
# frozen_string_literal: true
class ScheduleRemoveDuplicateVulnerabilitiesFindings < ActiveRecord::Migration[6.0]
include Gitlab::Database::MigrationHelpers
DOWNTIME = false
INDEX_NAME = 'tmp_idx_deduplicate_vulnerability_occurrences'
MIGRATION = 'RemoveDuplicateVulnerabilitiesFindings'
DELAY_INTERVAL = 2.minutes.to_i
BATCH_SIZE = 5_000
disable_ddl_transaction!
class VulnerabilitiesFinding < ActiveRecord::Base
include ::EachBatch
self.table_name = "vulnerability_occurrences"
end
def up
add_concurrent_index :vulnerability_occurrences,
%i[project_id report_type location_fingerprint primary_identifier_id id],
name: INDEX_NAME
say "Scheduling #{MIGRATION} jobs"
queue_background_migration_jobs_by_range_at_intervals(
VulnerabilitiesFinding,
MIGRATION,
DELAY_INTERVAL,
batch_size: BATCH_SIZE
)
end
def down
remove_concurrent_index_by_name(:vulnerability_occurrences, INDEX_NAME)
end
end
322d7270e942c161cc8b50b8c3f531c93b6e6e938e415c1b6010a70b630bf82e
\ No newline at end of file
......@@ -23380,6 +23380,12 @@ CREATE INDEX temporary_index_vulnerabilities_on_id ON vulnerabilities USING btre
CREATE UNIQUE INDEX term_agreements_unique_index ON term_agreements USING btree (user_id, term_id);
CREATE INDEX tmp_build_stage_position_index ON ci_builds USING btree (stage_id, stage_idx) WHERE (stage_idx IS NOT NULL);
CREATE INDEX tmp_idx_deduplicate_vulnerability_occurrences ON vulnerability_occurrences USING btree (project_id, report_type, location_fingerprint, primary_identifier_id, id);
CREATE INDEX tmp_index_for_email_unconfirmation_migration ON emails USING btree (id) WHERE (confirmed_at IS NOT NULL);
CREATE INDEX tmp_index_oauth_applications_on_id_where_trusted ON oauth_applications USING btree (id) WHERE (trusted = true);
CREATE INDEX tmp_index_on_vulnerabilities_non_dismissed ON vulnerabilities USING btree (id) WHERE (state <> 2);
......
# frozen_string_literal: true
# rubocop: disable Style/Documentation
class Gitlab::BackgroundMigration::RemoveDuplicateVulnerabilitiesFindings
DELETE_BATCH_SIZE = 100
# rubocop:disable Gitlab/NamespacedClass
class VulnerabilitiesFinding < ActiveRecord::Base
self.table_name = "vulnerability_occurrences"
end
# rubocop:enable Gitlab/NamespacedClass
def perform(start_id, end_id)
batch = VulnerabilitiesFinding.where(id: start_id..end_id)
cte = Gitlab::SQL::CTE.new(:batch, batch.select(:report_type, :location_fingerprint, :primary_identifier_id, :project_id))
query = VulnerabilitiesFinding
.select('batch.report_type', 'batch.location_fingerprint', 'batch.primary_identifier_id', 'batch.project_id', 'array_agg(id) as ids')
.distinct
.with(cte.to_arel)
.from(cte.alias_to(Arel.sql('batch')))
.joins(
%(
INNER JOIN
vulnerability_occurrences ON
vulnerability_occurrences.report_type = batch.report_type AND
vulnerability_occurrences.location_fingerprint = batch.location_fingerprint AND
vulnerability_occurrences.primary_identifier_id = batch.primary_identifier_id AND
vulnerability_occurrences.project_id = batch.project_id
)).group('batch.report_type', 'batch.location_fingerprint', 'batch.primary_identifier_id', 'batch.project_id')
.having('COUNT(*) > 1')
ids_to_delete = []
query.to_a.each do |record|
# We want to keep the latest finding since it might have recent metadata
duplicate_ids = record.ids.uniq.sort
duplicate_ids.pop
ids_to_delete.concat(duplicate_ids)
if ids_to_delete.size == DELETE_BATCH_SIZE
VulnerabilitiesFinding.where(id: ids_to_delete).delete_all
ids_to_delete.clear
end
end
VulnerabilitiesFinding.where(id: ids_to_delete).delete_all if ids_to_delete.any?
end
end
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Gitlab::BackgroundMigration::RemoveDuplicateVulnerabilitiesFindings do
let(:namespace) { table(:namespaces).create!(name: 'user', path: 'user') }
let(:users) { table(:users) }
let(:user) { create_user! }
let(:project) { table(:projects).create!(id: 123, namespace_id: namespace.id) }
let(:scanners) { table(:vulnerability_scanners) }
let!(:scanner) { scanners.create!(project_id: project.id, external_id: 'test 1', name: 'test scanner 1') }
let!(:scanner2) { scanners.create!(project_id: project.id, external_id: 'test 2', name: 'test scanner 2') }
let!(:scanner3) { scanners.create!(project_id: project.id, external_id: 'test 3', name: 'test scanner 3') }
let!(:unrelated_scanner) { scanners.create!(project_id: project.id, external_id: 'unreleated_scanner', name: 'unrelated scanner') }
let(:vulnerabilities) { table(:vulnerabilities) }
let(:vulnerability_findings) { table(:vulnerability_occurrences) }
let(:vulnerability_identifiers) { table(:vulnerability_identifiers) }
let(:vulnerability_identifier) do
vulnerability_identifiers.create!(
project_id: project.id,
external_type: 'vulnerability-identifier',
external_id: 'vulnerability-identifier',
fingerprint: '7e394d1b1eb461a7406d7b1e08f057a1cf11287a',
name: 'vulnerability identifier')
end
let!(:first_finding) do
create_finding!(
uuid: "test1",
vulnerability_id: nil,
report_type: 0,
location_fingerprint: '2bda3014914481791847d8eca38d1a8d13b6ad76',
primary_identifier_id: vulnerability_identifier.id,
scanner_id: scanner.id,
project_id: project.id
)
end
let!(:first_duplicate) do
create_finding!(
uuid: "test2",
vulnerability_id: nil,
report_type: 0,
location_fingerprint: '2bda3014914481791847d8eca38d1a8d13b6ad76',
primary_identifier_id: vulnerability_identifier.id,
scanner_id: scanner2.id,
project_id: project.id
)
end
let!(:second_duplicate) do
create_finding!(
uuid: "test3",
vulnerability_id: nil,
report_type: 0,
location_fingerprint: '2bda3014914481791847d8eca38d1a8d13b6ad76',
primary_identifier_id: vulnerability_identifier.id,
scanner_id: scanner3.id,
project_id: project.id
)
end
let!(:unrelated_finding) do
create_finding!(
uuid: "unreleated_finding",
vulnerability_id: nil,
report_type: 1,
location_fingerprint: 'random_location_fingerprint',
primary_identifier_id: vulnerability_identifier.id,
scanner_id: unrelated_scanner.id,
project_id: project.id
)
end
subject { described_class.new.perform(first_finding.id, unrelated_finding.id) }
before do
stub_const("#{described_class}::DELETE_BATCH_SIZE", 1)
end
it "removes entries which would result in duplicate UUIDv5" do
expect(vulnerability_findings.count).to eq(4)
expect { subject }.to change { vulnerability_findings.count }.from(4).to(2)
expect(vulnerability_findings.pluck(:id)).to eq([second_duplicate.id, unrelated_finding.id])
end
private
def create_vulnerability!(project_id:, author_id:, title: 'test', severity: 7, confidence: 7, report_type: 0)
vulnerabilities.create!(
project_id: project_id,
author_id: author_id,
title: title,
severity: severity,
confidence: confidence,
report_type: report_type
)
end
# rubocop:disable Metrics/ParameterLists
def create_finding!(
vulnerability_id:, project_id:, scanner_id:, primary_identifier_id:,
name: "test", severity: 7, confidence: 7, report_type: 0,
project_fingerprint: '123qweasdzxc', location_fingerprint: 'test',
metadata_version: 'test', raw_metadata: 'test', uuid: 'test')
vulnerability_findings.create!(
vulnerability_id: vulnerability_id,
project_id: project_id,
name: name,
severity: severity,
confidence: confidence,
report_type: report_type,
project_fingerprint: project_fingerprint,
scanner_id: scanner_id,
primary_identifier_id: vulnerability_identifier.id,
location_fingerprint: location_fingerprint,
metadata_version: metadata_version,
raw_metadata: raw_metadata,
uuid: uuid
)
end
# rubocop:enable Metrics/ParameterLists
def create_user!(name: "Example User", email: "user@example.com", user_type: nil, created_at: Time.zone.now, confirmed_at: Time.zone.now)
users.create!(
name: name,
email: email,
username: name,
projects_limit: 0,
user_type: user_type,
confirmed_at: confirmed_at
)
end
end
# frozen_string_literal: true
require 'spec_helper'
require Rails.root.join('db', 'post_migrate', '20201112130710_schedule_remove_duplicate_vulnerabilities_findings.rb')
RSpec.describe ScheduleRemoveDuplicateVulnerabilitiesFindings, :migration do
let(:namespace) { table(:namespaces).create!(name: 'user', path: 'user') }
let(:users) { table(:users) }
let(:user) { create_user! }
let(:project) { table(:projects).create!(id: 123, namespace_id: namespace.id) }
let(:scanners) { table(:vulnerability_scanners) }
let!(:scanner) { scanners.create!(project_id: project.id, external_id: 'test 1', name: 'test scanner 1') }
let!(:scanner2) { scanners.create!(project_id: project.id, external_id: 'test 2', name: 'test scanner 2') }
let!(:scanner3) { scanners.create!(project_id: project.id, external_id: 'test 3', name: 'test scanner 3') }
let!(:unrelated_scanner) { scanners.create!(project_id: project.id, external_id: 'unreleated_scanner', name: 'unrelated scanner') }
let(:vulnerabilities) { table(:vulnerabilities) }
let(:vulnerability_findings) { table(:vulnerability_occurrences) }
let(:vulnerability_identifiers) { table(:vulnerability_identifiers) }
let(:vulnerability_identifier) do
vulnerability_identifiers.create!(
project_id: project.id,
external_type: 'vulnerability-identifier',
external_id: 'vulnerability-identifier',
fingerprint: '7e394d1b1eb461a7406d7b1e08f057a1cf11287a',
name: 'vulnerability identifier')
end
let!(:first_finding) do
create_finding!(
uuid: "test1",
vulnerability_id: nil,
report_type: 0,
location_fingerprint: '2bda3014914481791847d8eca38d1a8d13b6ad76',
primary_identifier_id: vulnerability_identifier.id,
scanner_id: scanner.id,
project_id: project.id
)
end
let!(:first_duplicate) do
create_finding!(
uuid: "test2",
vulnerability_id: nil,
report_type: 0,
location_fingerprint: '2bda3014914481791847d8eca38d1a8d13b6ad76',
primary_identifier_id: vulnerability_identifier.id,
scanner_id: scanner2.id,
project_id: project.id
)
end
let!(:second_duplicate) do
create_finding!(
uuid: "test3",
vulnerability_id: nil,
report_type: 0,
location_fingerprint: '2bda3014914481791847d8eca38d1a8d13b6ad76',
primary_identifier_id: vulnerability_identifier.id,
scanner_id: scanner3.id,
project_id: project.id
)
end
let!(:unrelated_finding) do
create_finding!(
uuid: "unreleated_finding",
vulnerability_id: nil,
report_type: 1,
location_fingerprint: 'random_location_fingerprint',
primary_identifier_id: vulnerability_identifier.id,
scanner_id: unrelated_scanner.id,
project_id: project.id
)
end
before do
stub_const("#{described_class}::BATCH_SIZE", 1)
end
around do |example|
freeze_time { Sidekiq::Testing.fake! { example.run } }
end
it 'schedules background migration' do
migrate!
expect(BackgroundMigrationWorker.jobs.size).to eq(4)
expect(described_class::MIGRATION).to be_scheduled_migration(first_finding.id, first_finding.id)
expect(described_class::MIGRATION).to be_scheduled_migration(first_duplicate.id, first_duplicate.id)
expect(described_class::MIGRATION).to be_scheduled_migration(second_duplicate.id, second_duplicate.id)
expect(described_class::MIGRATION).to be_scheduled_migration(unrelated_finding.id, unrelated_finding.id)
end
private
def create_vulnerability!(project_id:, author_id:, title: 'test', severity: 7, confidence: 7, report_type: 0)
vulnerabilities.create!(
project_id: project_id,
author_id: author_id,
title: title,
severity: severity,
confidence: confidence,
report_type: report_type
)
end
# rubocop:disable Metrics/ParameterLists
def create_finding!(
vulnerability_id:, project_id:, scanner_id:, primary_identifier_id:,
name: "test", severity: 7, confidence: 7, report_type: 0,
project_fingerprint: '123qweasdzxc', location_fingerprint: 'test',
metadata_version: 'test', raw_metadata: 'test', uuid: 'test')
vulnerability_findings.create!(
vulnerability_id: vulnerability_id,
project_id: project_id,
name: name,
severity: severity,
confidence: confidence,
report_type: report_type,
project_fingerprint: project_fingerprint,
scanner_id: scanner_id,
primary_identifier_id: vulnerability_identifier.id,
location_fingerprint: location_fingerprint,
metadata_version: metadata_version,
raw_metadata: raw_metadata,
uuid: uuid
)
end
# rubocop:enable Metrics/ParameterLists
def create_user!(name: "Example User", email: "user@example.com", user_type: nil, created_at: Time.now, confirmed_at: Time.now)
users.create!(
name: name,
email: email,
username: name,
projects_limit: 0,
user_type: user_type,
confirmed_at: confirmed_at
)
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment