Commit 0bfd480e authored by Sean McGivern's avatar Sean McGivern

Merge branch 'feature/gb/build-trace-malformed-size-metric' into 'master'

Detect corrupted build logs and report them by incrementing Prometheus counter

See merge request gitlab-org/gitlab!49004
parents e6bbf55e c0825011
......@@ -82,6 +82,10 @@ module Ci
unless checksum.valid?
metrics.increment_trace_operation(operation: :invalid)
if checksum.corrupted?
metrics.increment_trace_operation(operation: :corrupted)
end
next unless log_invalid_chunks?
::Gitlab::ErrorTracking.log_exception(InvalidTraceError.new,
......@@ -89,7 +93,8 @@ module Ci
build_id: build.id,
state_crc32: checksum.state_crc32,
chunks_crc32: checksum.chunks_crc32,
chunks_count: checksum.chunks_count
chunks_count: checksum.chunks_count,
chunks_corrupted: checksum.corrupted?
)
end
end
......@@ -151,13 +156,21 @@ module Ci
end
def has_checksum?
params.dig(:checksum).present?
trace_checksum.present?
end
def build_running?
build_state == 'running'
end
def trace_checksum
params.dig(:output, :checksum) || params.dig(:checksum)
end
def trace_bytesize
params.dig(:output, :bytesize)
end
def pending_state
strong_memoize(:pending_state) { ensure_pending_state }
end
......@@ -166,7 +179,8 @@ module Ci
build_state = Ci::BuildPendingState.safe_find_or_create_by(
build_id: build.id,
state: params.fetch(:state),
trace_checksum: params.fetch(:checksum),
trace_checksum: trace_checksum,
trace_bytesize: trace_bytesize,
failure_reason: params.dig(:failure_reason)
)
......
---
title: Detect corrupted build logs and report them by incrementing Prometheus counter
merge_request: 49004
author:
type: added
# frozen_string_literal: true
class AddTraceBytesizeToCiBuildPendingStates < ActiveRecord::Migration[6.0]
DOWNTIME = false
def change
add_column :ci_build_pending_states, :trace_bytesize, :bigint
end
end
9dc8d6b557198a60def4690ea06ec3dc9a29deca9082b7b03666aaed483a42f1
\ No newline at end of file
......@@ -10129,7 +10129,8 @@ CREATE TABLE ci_build_pending_states (
build_id bigint NOT NULL,
state smallint,
failure_reason smallint,
trace_checksum bytea
trace_checksum bytea,
trace_bytesize bigint
);
CREATE SEQUENCE ci_build_pending_states_id_seq
......
......@@ -176,6 +176,10 @@ module API
optional :state, type: String, desc: %q(Job's status: success, failed)
optional :checksum, type: String, desc: %q(Job's trace CRC32 checksum)
optional :failure_reason, type: String, desc: %q(Job's failure_reason)
optional :output, type: Hash, desc: %q(Build log state) do
optional :checksum, type: String, desc: %q(Job's trace CRC32 checksum)
optional :bytesize, type: Integer, desc: %q(Job's trace size in bytes)
end
end
put '/:id' do
job = authenticate_job!
......
......@@ -9,11 +9,7 @@ module Gitlab
# We assume 'value' must be mutable, given
# that frozen string is enabled.
##
# TODO We need to remove this because it is going to change checksum of
# a trace.
#
value.gsub!(token, 'x' * token.length)
value.gsub!(token, 'x' * token.bytesize)
value
end
end
......
......@@ -64,10 +64,33 @@ module Gitlab
end
end
def state_bytesize
strong_memoize(:state_bytesize) do
build.pending_state&.trace_bytesize
end
end
def trace_size
strong_memoize(:trace_size) do
trace_chunks.sum { |chunk| chunk_size(chunk) }
end
end
def corrupted?
return false unless has_bytesize?
return false if valid?
state_bytesize.to_i != trace_size.to_i
end
def chunks_count
trace_chunks.to_a.size
end
def has_bytesize?
state_bytesize.present?
end
private
def chunk_size(chunk)
......
......@@ -18,7 +18,8 @@ module Gitlab
:conflict, # runner has sent unrecognized build state details
:locked, # build trace has been locked by a different mechanism
:stalled, # failed to migrate chunk due to a worker duplication
:invalid # malformed build trace has been detected using CRC32
:invalid, # invalid build trace has been detected using CRC32
:corrupted # malformed trace found after comparing CRC32 and size
].freeze
def increment_trace_operation(operation: :unknown)
......
......@@ -22,6 +22,10 @@ RSpec.describe Gitlab::Ci::MaskSecret do
expect(mask('token', nil)).to eq('token')
end
it 'does not change a bytesize of a value' do
expect(mask('token-ü/unicode', 'token-ü').bytesize).to eq 16
end
def mask(value, token)
subject.mask!(value.dup, token)
end
......
......@@ -8,8 +8,12 @@ RSpec.describe Gitlab::Ci::Trace::Checksum do
subject { described_class.new(build) }
context 'when build pending state exists' do
let(:trace_details) do
{ trace_checksum: 'crc32:d4777540', trace_bytesize: 262161 }
end
before do
create(:ci_build_pending_state, build: build, trace_checksum: 'crc32:d4777540')
create(:ci_build_pending_state, build: build, **trace_details)
end
context 'when matching persisted trace chunks exist' do
......@@ -22,6 +26,7 @@ RSpec.describe Gitlab::Ci::Trace::Checksum do
it 'calculates combined trace chunks CRC32 correctly' do
expect(subject.chunks_crc32).to eq 3564598592
expect(subject).to be_valid
expect(subject).not_to be_corrupted
end
end
......@@ -32,8 +37,9 @@ RSpec.describe Gitlab::Ci::Trace::Checksum do
create_chunk(index: 2, data: 'ccccccccccccccccc')
end
it 'makes trace checksum invalid' do
it 'makes trace checksum invalid but not corrupted' do
expect(subject).not_to be_valid
expect(subject).not_to be_corrupted
end
end
......@@ -43,8 +49,9 @@ RSpec.describe Gitlab::Ci::Trace::Checksum do
create_chunk(index: 2, data: 'ccccccccccccccccc')
end
it 'makes trace checksum invalid' do
it 'makes trace checksum invalid and corrupted' do
expect(subject).not_to be_valid
expect(subject).to be_corrupted
end
end
......@@ -55,8 +62,9 @@ RSpec.describe Gitlab::Ci::Trace::Checksum do
create_chunk(index: 2, data: 'ccccccccccccccccc')
end
it 'makes trace checksum invalid' do
it 'makes trace checksum invalid but not corrupted' do
expect(subject).not_to be_valid
expect(subject).not_to be_corrupted
end
end
......@@ -99,6 +107,14 @@ RSpec.describe Gitlab::Ci::Trace::Checksum do
it 'returns nil' do
expect(subject.last_chunk).to be_nil
end
it 'is not a valid trace' do
expect(subject).not_to be_valid
end
it 'is not a corrupted trace' do
expect(subject).not_to be_corrupted
end
end
context 'when there are multiple chunks' do
......@@ -110,6 +126,26 @@ RSpec.describe Gitlab::Ci::Trace::Checksum do
it 'returns chunk with the highest index' do
expect(subject.last_chunk.chunk_index).to eq 1
end
it 'is not a valid trace' do
expect(subject).not_to be_valid
end
it 'is not a corrupted trace' do
expect(subject).not_to be_corrupted
end
end
end
describe '#trace_size' do
before do
create_chunk(index: 0, data: 'a' * 128.kilobytes)
create_chunk(index: 1, data: 'b' * 128.kilobytes)
create_chunk(index: 2, data: 'abcdefg-ü')
end
it 'returns total trace size in bytes' do
expect(subject.trace_size).to eq 262154
end
end
......
......@@ -61,6 +61,23 @@ RSpec.describe API::Ci::Runner, :clean_gitlab_redis_shared_state do
expect(response.header).not_to have_key('X-GitLab-Trace-Update-Interval')
end
context 'when runner sends an unrecognized field in a payload' do
##
# This test case is here to ensure that the API used to communicate
# runner with GitLab can evolve.
#
# In case of adding new features on the Runner side we do not want
# GitLab-side to reject requests containing unrecognizable fields in
# a payload, because runners can be updated before a new version of
# GitLab is installed.
#
it 'ignores unrecognized fields' do
update_job(state: 'success', 'unknown': 'something')
expect(job.reload).to be_success
end
end
context 'when failure_reason is script_failure' do
before do
update_job(state: 'failed', failure_reason: 'script_failure')
......
......@@ -80,7 +80,11 @@ RSpec.describe Ci::UpdateBuildStateService do
context 'when build has a checksum' do
let(:params) do
{ checksum: 'crc32:12345678', state: 'failed', failure_reason: 'script_failure' }
{
output: { checksum: 'crc32:12345678', bytesize: 123 },
failure_reason: 'script_failure',
state: 'failed'
}
end
context 'when build does not have associated trace chunks' do
......@@ -154,14 +158,74 @@ RSpec.describe Ci::UpdateBuildStateService do
end
context 'when trace checksum is valid' do
let(:params) { { checksum: 'crc32:ed82cd11', state: 'success' } }
let(:params) do
{ output: { checksum: 'crc32:ed82cd11', bytesize: 4 }, state: 'success' }
end
it 'does not increment invalid trace metric' do
it 'does not increment invalid or corrupted trace metric' do
execute_with_stubbed_metrics!
expect(metrics)
.not_to have_received(:increment_trace_operation)
.with(operation: :invalid)
expect(metrics)
.not_to have_received(:increment_trace_operation)
.with(operation: :corrupted)
end
context 'when using deprecated parameters' do
let(:params) do
{ checksum: 'crc32:ed82cd11', state: 'success' }
end
it 'does not increment invalid or corrupted trace metric' do
execute_with_stubbed_metrics!
expect(metrics)
.not_to have_received(:increment_trace_operation)
.with(operation: :invalid)
expect(metrics)
.not_to have_received(:increment_trace_operation)
.with(operation: :corrupted)
end
end
end
context 'when trace checksum is invalid and the log is corrupted' do
let(:params) do
{ output: { checksum: 'crc32:12345678', bytesize: 1 }, state: 'success' }
end
it 'increments invalid and corrupted trace metrics' do
execute_with_stubbed_metrics!
expect(metrics)
.to have_received(:increment_trace_operation)
.with(operation: :invalid)
expect(metrics)
.to have_received(:increment_trace_operation)
.with(operation: :corrupted)
end
end
context 'when trace checksum is invalid but the log seems fine' do
let(:params) do
{ output: { checksum: 'crc32:12345678', bytesize: 4 }, state: 'success' }
end
it 'does not increment corrupted trace metric' do
execute_with_stubbed_metrics!
expect(metrics)
.to have_received(:increment_trace_operation)
.with(operation: :invalid)
expect(metrics)
.not_to have_received(:increment_trace_operation)
.with(operation: :corrupted)
end
end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment