Commit 0bfd480e authored by Sean McGivern's avatar Sean McGivern

Merge branch 'feature/gb/build-trace-malformed-size-metric' into 'master'

Detect corrupted build logs and report them by incrementing Prometheus counter

See merge request gitlab-org/gitlab!49004
parents e6bbf55e c0825011
...@@ -82,6 +82,10 @@ module Ci ...@@ -82,6 +82,10 @@ module Ci
unless checksum.valid? unless checksum.valid?
metrics.increment_trace_operation(operation: :invalid) metrics.increment_trace_operation(operation: :invalid)
if checksum.corrupted?
metrics.increment_trace_operation(operation: :corrupted)
end
next unless log_invalid_chunks? next unless log_invalid_chunks?
::Gitlab::ErrorTracking.log_exception(InvalidTraceError.new, ::Gitlab::ErrorTracking.log_exception(InvalidTraceError.new,
...@@ -89,7 +93,8 @@ module Ci ...@@ -89,7 +93,8 @@ module Ci
build_id: build.id, build_id: build.id,
state_crc32: checksum.state_crc32, state_crc32: checksum.state_crc32,
chunks_crc32: checksum.chunks_crc32, chunks_crc32: checksum.chunks_crc32,
chunks_count: checksum.chunks_count chunks_count: checksum.chunks_count,
chunks_corrupted: checksum.corrupted?
) )
end end
end end
...@@ -151,13 +156,21 @@ module Ci ...@@ -151,13 +156,21 @@ module Ci
end end
def has_checksum? def has_checksum?
params.dig(:checksum).present? trace_checksum.present?
end end
def build_running? def build_running?
build_state == 'running' build_state == 'running'
end end
def trace_checksum
params.dig(:output, :checksum) || params.dig(:checksum)
end
def trace_bytesize
params.dig(:output, :bytesize)
end
def pending_state def pending_state
strong_memoize(:pending_state) { ensure_pending_state } strong_memoize(:pending_state) { ensure_pending_state }
end end
...@@ -166,7 +179,8 @@ module Ci ...@@ -166,7 +179,8 @@ module Ci
build_state = Ci::BuildPendingState.safe_find_or_create_by( build_state = Ci::BuildPendingState.safe_find_or_create_by(
build_id: build.id, build_id: build.id,
state: params.fetch(:state), state: params.fetch(:state),
trace_checksum: params.fetch(:checksum), trace_checksum: trace_checksum,
trace_bytesize: trace_bytesize,
failure_reason: params.dig(:failure_reason) failure_reason: params.dig(:failure_reason)
) )
......
---
title: Detect corrupted build logs and report them by incrementing Prometheus counter
merge_request: 49004
author:
type: added
# frozen_string_literal: true
class AddTraceBytesizeToCiBuildPendingStates < ActiveRecord::Migration[6.0]
DOWNTIME = false
def change
add_column :ci_build_pending_states, :trace_bytesize, :bigint
end
end
9dc8d6b557198a60def4690ea06ec3dc9a29deca9082b7b03666aaed483a42f1
\ No newline at end of file
...@@ -10129,7 +10129,8 @@ CREATE TABLE ci_build_pending_states ( ...@@ -10129,7 +10129,8 @@ CREATE TABLE ci_build_pending_states (
build_id bigint NOT NULL, build_id bigint NOT NULL,
state smallint, state smallint,
failure_reason smallint, failure_reason smallint,
trace_checksum bytea trace_checksum bytea,
trace_bytesize bigint
); );
CREATE SEQUENCE ci_build_pending_states_id_seq CREATE SEQUENCE ci_build_pending_states_id_seq
......
...@@ -176,6 +176,10 @@ module API ...@@ -176,6 +176,10 @@ module API
optional :state, type: String, desc: %q(Job's status: success, failed) optional :state, type: String, desc: %q(Job's status: success, failed)
optional :checksum, type: String, desc: %q(Job's trace CRC32 checksum) optional :checksum, type: String, desc: %q(Job's trace CRC32 checksum)
optional :failure_reason, type: String, desc: %q(Job's failure_reason) optional :failure_reason, type: String, desc: %q(Job's failure_reason)
optional :output, type: Hash, desc: %q(Build log state) do
optional :checksum, type: String, desc: %q(Job's trace CRC32 checksum)
optional :bytesize, type: Integer, desc: %q(Job's trace size in bytes)
end
end end
put '/:id' do put '/:id' do
job = authenticate_job! job = authenticate_job!
......
...@@ -9,11 +9,7 @@ module Gitlab ...@@ -9,11 +9,7 @@ module Gitlab
# We assume 'value' must be mutable, given # We assume 'value' must be mutable, given
# that frozen string is enabled. # that frozen string is enabled.
## value.gsub!(token, 'x' * token.bytesize)
# TODO We need to remove this because it is going to change checksum of
# a trace.
#
value.gsub!(token, 'x' * token.length)
value value
end end
end end
......
...@@ -64,10 +64,33 @@ module Gitlab ...@@ -64,10 +64,33 @@ module Gitlab
end end
end end
def state_bytesize
strong_memoize(:state_bytesize) do
build.pending_state&.trace_bytesize
end
end
def trace_size
strong_memoize(:trace_size) do
trace_chunks.sum { |chunk| chunk_size(chunk) }
end
end
def corrupted?
return false unless has_bytesize?
return false if valid?
state_bytesize.to_i != trace_size.to_i
end
def chunks_count def chunks_count
trace_chunks.to_a.size trace_chunks.to_a.size
end end
def has_bytesize?
state_bytesize.present?
end
private private
def chunk_size(chunk) def chunk_size(chunk)
......
...@@ -18,7 +18,8 @@ module Gitlab ...@@ -18,7 +18,8 @@ module Gitlab
:conflict, # runner has sent unrecognized build state details :conflict, # runner has sent unrecognized build state details
:locked, # build trace has been locked by a different mechanism :locked, # build trace has been locked by a different mechanism
:stalled, # failed to migrate chunk due to a worker duplication :stalled, # failed to migrate chunk due to a worker duplication
:invalid # malformed build trace has been detected using CRC32 :invalid, # invalid build trace has been detected using CRC32
:corrupted # malformed trace found after comparing CRC32 and size
].freeze ].freeze
def increment_trace_operation(operation: :unknown) def increment_trace_operation(operation: :unknown)
......
...@@ -22,6 +22,10 @@ RSpec.describe Gitlab::Ci::MaskSecret do ...@@ -22,6 +22,10 @@ RSpec.describe Gitlab::Ci::MaskSecret do
expect(mask('token', nil)).to eq('token') expect(mask('token', nil)).to eq('token')
end end
it 'does not change a bytesize of a value' do
expect(mask('token-ü/unicode', 'token-ü').bytesize).to eq 16
end
def mask(value, token) def mask(value, token)
subject.mask!(value.dup, token) subject.mask!(value.dup, token)
end end
......
...@@ -8,8 +8,12 @@ RSpec.describe Gitlab::Ci::Trace::Checksum do ...@@ -8,8 +8,12 @@ RSpec.describe Gitlab::Ci::Trace::Checksum do
subject { described_class.new(build) } subject { described_class.new(build) }
context 'when build pending state exists' do context 'when build pending state exists' do
let(:trace_details) do
{ trace_checksum: 'crc32:d4777540', trace_bytesize: 262161 }
end
before do before do
create(:ci_build_pending_state, build: build, trace_checksum: 'crc32:d4777540') create(:ci_build_pending_state, build: build, **trace_details)
end end
context 'when matching persisted trace chunks exist' do context 'when matching persisted trace chunks exist' do
...@@ -22,6 +26,7 @@ RSpec.describe Gitlab::Ci::Trace::Checksum do ...@@ -22,6 +26,7 @@ RSpec.describe Gitlab::Ci::Trace::Checksum do
it 'calculates combined trace chunks CRC32 correctly' do it 'calculates combined trace chunks CRC32 correctly' do
expect(subject.chunks_crc32).to eq 3564598592 expect(subject.chunks_crc32).to eq 3564598592
expect(subject).to be_valid expect(subject).to be_valid
expect(subject).not_to be_corrupted
end end
end end
...@@ -32,8 +37,9 @@ RSpec.describe Gitlab::Ci::Trace::Checksum do ...@@ -32,8 +37,9 @@ RSpec.describe Gitlab::Ci::Trace::Checksum do
create_chunk(index: 2, data: 'ccccccccccccccccc') create_chunk(index: 2, data: 'ccccccccccccccccc')
end end
it 'makes trace checksum invalid' do it 'makes trace checksum invalid but not corrupted' do
expect(subject).not_to be_valid expect(subject).not_to be_valid
expect(subject).not_to be_corrupted
end end
end end
...@@ -43,8 +49,9 @@ RSpec.describe Gitlab::Ci::Trace::Checksum do ...@@ -43,8 +49,9 @@ RSpec.describe Gitlab::Ci::Trace::Checksum do
create_chunk(index: 2, data: 'ccccccccccccccccc') create_chunk(index: 2, data: 'ccccccccccccccccc')
end end
it 'makes trace checksum invalid' do it 'makes trace checksum invalid and corrupted' do
expect(subject).not_to be_valid expect(subject).not_to be_valid
expect(subject).to be_corrupted
end end
end end
...@@ -55,8 +62,9 @@ RSpec.describe Gitlab::Ci::Trace::Checksum do ...@@ -55,8 +62,9 @@ RSpec.describe Gitlab::Ci::Trace::Checksum do
create_chunk(index: 2, data: 'ccccccccccccccccc') create_chunk(index: 2, data: 'ccccccccccccccccc')
end end
it 'makes trace checksum invalid' do it 'makes trace checksum invalid but not corrupted' do
expect(subject).not_to be_valid expect(subject).not_to be_valid
expect(subject).not_to be_corrupted
end end
end end
...@@ -99,6 +107,14 @@ RSpec.describe Gitlab::Ci::Trace::Checksum do ...@@ -99,6 +107,14 @@ RSpec.describe Gitlab::Ci::Trace::Checksum do
it 'returns nil' do it 'returns nil' do
expect(subject.last_chunk).to be_nil expect(subject.last_chunk).to be_nil
end end
it 'is not a valid trace' do
expect(subject).not_to be_valid
end
it 'is not a corrupted trace' do
expect(subject).not_to be_corrupted
end
end end
context 'when there are multiple chunks' do context 'when there are multiple chunks' do
...@@ -110,6 +126,26 @@ RSpec.describe Gitlab::Ci::Trace::Checksum do ...@@ -110,6 +126,26 @@ RSpec.describe Gitlab::Ci::Trace::Checksum do
it 'returns chunk with the highest index' do it 'returns chunk with the highest index' do
expect(subject.last_chunk.chunk_index).to eq 1 expect(subject.last_chunk.chunk_index).to eq 1
end end
it 'is not a valid trace' do
expect(subject).not_to be_valid
end
it 'is not a corrupted trace' do
expect(subject).not_to be_corrupted
end
end
end
describe '#trace_size' do
before do
create_chunk(index: 0, data: 'a' * 128.kilobytes)
create_chunk(index: 1, data: 'b' * 128.kilobytes)
create_chunk(index: 2, data: 'abcdefg-ü')
end
it 'returns total trace size in bytes' do
expect(subject.trace_size).to eq 262154
end end
end end
......
...@@ -61,6 +61,23 @@ RSpec.describe API::Ci::Runner, :clean_gitlab_redis_shared_state do ...@@ -61,6 +61,23 @@ RSpec.describe API::Ci::Runner, :clean_gitlab_redis_shared_state do
expect(response.header).not_to have_key('X-GitLab-Trace-Update-Interval') expect(response.header).not_to have_key('X-GitLab-Trace-Update-Interval')
end end
context 'when runner sends an unrecognized field in a payload' do
##
# This test case is here to ensure that the API used to communicate
# runner with GitLab can evolve.
#
# In case of adding new features on the Runner side we do not want
# GitLab-side to reject requests containing unrecognizable fields in
# a payload, because runners can be updated before a new version of
# GitLab is installed.
#
it 'ignores unrecognized fields' do
update_job(state: 'success', 'unknown': 'something')
expect(job.reload).to be_success
end
end
context 'when failure_reason is script_failure' do context 'when failure_reason is script_failure' do
before do before do
update_job(state: 'failed', failure_reason: 'script_failure') update_job(state: 'failed', failure_reason: 'script_failure')
......
...@@ -80,7 +80,11 @@ RSpec.describe Ci::UpdateBuildStateService do ...@@ -80,7 +80,11 @@ RSpec.describe Ci::UpdateBuildStateService do
context 'when build has a checksum' do context 'when build has a checksum' do
let(:params) do let(:params) do
{ checksum: 'crc32:12345678', state: 'failed', failure_reason: 'script_failure' } {
output: { checksum: 'crc32:12345678', bytesize: 123 },
failure_reason: 'script_failure',
state: 'failed'
}
end end
context 'when build does not have associated trace chunks' do context 'when build does not have associated trace chunks' do
...@@ -154,14 +158,74 @@ RSpec.describe Ci::UpdateBuildStateService do ...@@ -154,14 +158,74 @@ RSpec.describe Ci::UpdateBuildStateService do
end end
context 'when trace checksum is valid' do context 'when trace checksum is valid' do
let(:params) { { checksum: 'crc32:ed82cd11', state: 'success' } } let(:params) do
{ output: { checksum: 'crc32:ed82cd11', bytesize: 4 }, state: 'success' }
end
it 'does not increment invalid trace metric' do it 'does not increment invalid or corrupted trace metric' do
execute_with_stubbed_metrics! execute_with_stubbed_metrics!
expect(metrics) expect(metrics)
.not_to have_received(:increment_trace_operation) .not_to have_received(:increment_trace_operation)
.with(operation: :invalid) .with(operation: :invalid)
expect(metrics)
.not_to have_received(:increment_trace_operation)
.with(operation: :corrupted)
end
context 'when using deprecated parameters' do
let(:params) do
{ checksum: 'crc32:ed82cd11', state: 'success' }
end
it 'does not increment invalid or corrupted trace metric' do
execute_with_stubbed_metrics!
expect(metrics)
.not_to have_received(:increment_trace_operation)
.with(operation: :invalid)
expect(metrics)
.not_to have_received(:increment_trace_operation)
.with(operation: :corrupted)
end
end
end
context 'when trace checksum is invalid and the log is corrupted' do
let(:params) do
{ output: { checksum: 'crc32:12345678', bytesize: 1 }, state: 'success' }
end
it 'increments invalid and corrupted trace metrics' do
execute_with_stubbed_metrics!
expect(metrics)
.to have_received(:increment_trace_operation)
.with(operation: :invalid)
expect(metrics)
.to have_received(:increment_trace_operation)
.with(operation: :corrupted)
end
end
context 'when trace checksum is invalid but the log seems fine' do
let(:params) do
{ output: { checksum: 'crc32:12345678', bytesize: 4 }, state: 'success' }
end
it 'does not increment corrupted trace metric' do
execute_with_stubbed_metrics!
expect(metrics)
.to have_received(:increment_trace_operation)
.with(operation: :invalid)
expect(metrics)
.not_to have_received(:increment_trace_operation)
.with(operation: :corrupted)
end end
end end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment