Commit 4435cdde authored by Kamil Trzciński's avatar Kamil Trzciński

Merge branch 'safe-archiving-for-traces' into 'master'

Extra logging for new live trace architecture

See merge request gitlab-org/gitlab-ce!30892
parents 995e5b52 c2e0e689
...@@ -531,6 +531,14 @@ module Ci ...@@ -531,6 +531,14 @@ module Ci
trace.exist? trace.exist?
end end
def has_live_trace?
trace.live_trace_exist?
end
def has_archived_trace?
trace.archived_trace_exist?
end
def artifacts_file def artifacts_file
job_artifacts_archive&.file job_artifacts_archive&.file
end end
......
...@@ -176,6 +176,10 @@ module Ci ...@@ -176,6 +176,10 @@ module Ci
end end
end end
def self.archived_trace_exists_for?(job_id)
where(job_id: job_id).trace.take&.file&.file&.exists?
end
private private
def file_format_adapter_class def file_format_adapter_class
......
...@@ -2,8 +2,25 @@ ...@@ -2,8 +2,25 @@
module Ci module Ci
class ArchiveTraceService class ArchiveTraceService
def execute(job) def execute(job, worker_name:)
# TODO: Remove this logging once we confirmed new live trace architecture is functional.
# See https://gitlab.com/gitlab-com/gl-infra/infrastructure/issues/4667.
unless job.has_live_trace?
Sidekiq.logger.warn(class: worker_name,
message: 'The job does not have live trace but going to be archived.',
job_id: job.id)
return
end
job.trace.archive! job.trace.archive!
# TODO: Remove this logging once we confirmed new live trace architecture is functional.
# See https://gitlab.com/gitlab-com/gl-infra/infrastructure/issues/4667.
unless job.has_archived_trace?
Sidekiq.logger.warn(class: worker_name,
message: 'The job does not have archived trace after archiving.',
job_id: job.id)
end
rescue ::Gitlab::Ci::Trace::AlreadyArchivedError rescue ::Gitlab::Ci::Trace::AlreadyArchivedError
# It's already archived, thus we can safely ignore this exception. # It's already archived, thus we can safely ignore this exception.
rescue => e rescue => e
...@@ -11,7 +28,7 @@ module Ci ...@@ -11,7 +28,7 @@ module Ci
# If `archive!` keeps failing for over a week, that could incur data loss. # If `archive!` keeps failing for over a week, that could incur data loss.
# (See more https://docs.gitlab.com/ee/administration/job_traces.html#new-live-trace-architecture) # (See more https://docs.gitlab.com/ee/administration/job_traces.html#new-live-trace-architecture)
# In order to avoid interrupting the system, we do not raise an exception here. # In order to avoid interrupting the system, we do not raise an exception here.
archive_error(e, job) archive_error(e, job, worker_name)
end end
private private
...@@ -22,9 +39,12 @@ module Ci ...@@ -22,9 +39,12 @@ module Ci
"Counter of failed attempts of trace archiving") "Counter of failed attempts of trace archiving")
end end
def archive_error(error, job) def archive_error(error, job, worker_name)
failed_archive_counter.increment failed_archive_counter.increment
Rails.logger.error "Failed to archive trace. id: #{job.id} message: #{error.message}" # rubocop:disable Gitlab/RailsLogger
Sidekiq.logger.warn(class: worker_name,
message: "Failed to archive trace. message: #{error.message}.",
job_id: job.id)
Gitlab::Sentry Gitlab::Sentry
.track_exception(error, .track_exception(error,
......
...@@ -7,7 +7,7 @@ class ArchiveTraceWorker ...@@ -7,7 +7,7 @@ class ArchiveTraceWorker
# rubocop: disable CodeReuse/ActiveRecord # rubocop: disable CodeReuse/ActiveRecord
def perform(job_id) def perform(job_id)
Ci::Build.without_archived_trace.find_by(id: job_id).try do |job| Ci::Build.without_archived_trace.find_by(id: job_id).try do |job|
Ci::ArchiveTraceService.new.execute(job) Ci::ArchiveTraceService.new.execute(job, worker_name: self.class.name)
end end
end end
# rubocop: enable CodeReuse/ActiveRecord # rubocop: enable CodeReuse/ActiveRecord
......
...@@ -11,7 +11,7 @@ module Ci ...@@ -11,7 +11,7 @@ module Ci
# This could happen when ArchiveTraceWorker sidekiq jobs were lost by receiving SIGKILL # This could happen when ArchiveTraceWorker sidekiq jobs were lost by receiving SIGKILL
# More details in https://gitlab.com/gitlab-org/gitlab-ce/issues/36791 # More details in https://gitlab.com/gitlab-org/gitlab-ce/issues/36791
Ci::Build.finished.with_live_trace.find_each(batch_size: 100) do |build| Ci::Build.finished.with_live_trace.find_each(batch_size: 100) do |build|
Ci::ArchiveTraceService.new.execute(build) Ci::ArchiveTraceService.new.execute(build, worker_name: self.class.name)
end end
end end
# rubocop: enable CodeReuse/ActiveRecord # rubocop: enable CodeReuse/ActiveRecord
......
---
title: Extra logging for new live trace architecture
merge_request: 30892
author:
type: fixed
...@@ -63,7 +63,15 @@ module Gitlab ...@@ -63,7 +63,15 @@ module Gitlab
end end
def exist? def exist?
trace_artifact&.exists? || job.trace_chunks.any? || current_path.present? || old_trace.present? archived_trace_exist? || live_trace_exist?
end
def archived_trace_exist?
trace_artifact&.exists?
end
def live_trace_exist?
job.trace_chunks.any? || current_path.present? || old_trace.present?
end end
def read def read
...@@ -167,7 +175,7 @@ module Gitlab ...@@ -167,7 +175,7 @@ module Gitlab
def clone_file!(src_stream, temp_dir) def clone_file!(src_stream, temp_dir)
FileUtils.mkdir_p(temp_dir) FileUtils.mkdir_p(temp_dir)
Dir.mktmpdir('tmp-trace', temp_dir) do |dir_path| Dir.mktmpdir("tmp-trace-#{job.id}", temp_dir) do |dir_path|
temp_path = File.join(dir_path, "job.log") temp_path = File.join(dir_path, "job.log")
FileUtils.touch(temp_path) FileUtils.touch(temp_path)
size = IO.copy_stream(src_stream, temp_path) size = IO.copy_stream(src_stream, temp_path)
......
...@@ -166,6 +166,13 @@ module Gitlab ...@@ -166,6 +166,13 @@ module Gitlab
end end
def destroy! def destroy!
# TODO: Remove this logging once we confirmed new live trace architecture is functional.
# See https://gitlab.com/gitlab-com/gl-infra/infrastructure/issues/4667.
unless build.has_archived_trace?
Sidekiq.logger.warn(message: 'The job does not have archived trace but going to be destroyed.',
job_id: build.id)
end
trace_chunks.fast_destroy_all trace_chunks.fast_destroy_all
@tell = @size = 0 @tell = @size = 0
ensure ensure
......
...@@ -442,5 +442,15 @@ describe Gitlab::Ci::Trace::ChunkedIO, :clean_gitlab_redis_cache do ...@@ -442,5 +442,15 @@ describe Gitlab::Ci::Trace::ChunkedIO, :clean_gitlab_redis_cache do
expect(Ci::BuildTraceChunk.where(build: build).count).to eq(0) expect(Ci::BuildTraceChunk.where(build: build).count).to eq(0)
end end
context 'when the job does not have archived trace' do
it 'leaves a message in sidekiq log' do
expect(Sidekiq.logger).to receive(:warn).with(
message: 'The job does not have archived trace but going to be destroyed.',
job_id: build.id).and_call_original
subject
end
end
end end
end end
...@@ -692,6 +692,34 @@ describe Ci::Build do ...@@ -692,6 +692,34 @@ describe Ci::Build do
end end
end end
describe '#has_live_trace?' do
subject { build.has_live_trace? }
let(:build) { create(:ci_build, :trace_live) }
it { is_expected.to be_truthy }
context 'when build does not have live trace' do
let(:build) { create(:ci_build) }
it { is_expected.to be_falsy }
end
end
describe '#has_archived_trace?' do
subject { build.has_archived_trace? }
let(:build) { create(:ci_build, :trace_artifact) }
it { is_expected.to be_truthy }
context 'when build does not have archived trace' do
let(:build) { create(:ci_build) }
it { is_expected.to be_falsy }
end
end
describe '#has_job_artifacts?' do describe '#has_job_artifacts?' do
subject { build.has_job_artifacts? } subject { build.has_job_artifacts? }
......
...@@ -70,6 +70,31 @@ describe Ci::JobArtifact do ...@@ -70,6 +70,31 @@ describe Ci::JobArtifact do
end end
end end
describe '.archived_trace_exists_for?' do
subject { described_class.archived_trace_exists_for?(job_id) }
let!(:artifact) { create(:ci_job_artifact, :trace, job: job) }
let(:job) { create(:ci_build) }
context 'when the specified job_id exists' do
let(:job_id) { job.id }
it { is_expected.to be_truthy }
context 'when the job does have archived trace' do
let!(:artifact) { }
it { is_expected.to be_falsy }
end
end
context 'when the specified job_id does not exist' do
let(:job_id) { 10000 }
it { is_expected.to be_falsy }
end
end
describe 'callbacks' do describe 'callbacks' do
subject { create(:ci_job_artifact, :archive) } subject { create(:ci_job_artifact, :archive) }
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
require 'spec_helper' require 'spec_helper'
describe Ci::ArchiveTraceService, '#execute' do describe Ci::ArchiveTraceService, '#execute' do
subject { described_class.new.execute(job) } subject { described_class.new.execute(job, worker_name: ArchiveTraceWorker.name) }
context 'when job is finished' do context 'when job is finished' do
let(:job) { create(:ci_build, :success, :trace_live) } let(:job) { create(:ci_build, :success, :trace_live) }
...@@ -25,6 +25,34 @@ describe Ci::ArchiveTraceService, '#execute' do ...@@ -25,6 +25,34 @@ describe Ci::ArchiveTraceService, '#execute' do
expect { subject }.not_to change { Ci::JobArtifact.trace.count } expect { subject }.not_to change { Ci::JobArtifact.trace.count }
end end
end end
context 'when job does not have trace' do
let(:job) { create(:ci_build, :success) }
it 'leaves a warning message in sidekiq log' do
expect(Sidekiq.logger).to receive(:warn).with(
class: ArchiveTraceWorker.name,
message: 'The job does not have live trace but going to be archived.',
job_id: job.id)
subject
end
end
context 'when job failed to archive trace but did not raise an exception' do
before do
allow_any_instance_of(Gitlab::Ci::Trace).to receive(:archive!) {}
end
it 'leaves a warning message in sidekiq log' do
expect(Sidekiq.logger).to receive(:warn).with(
class: ArchiveTraceWorker.name,
message: 'The job does not have archived trace after archiving.',
job_id: job.id)
subject
end
end
end end
context 'when job is running' do context 'when job is running' do
...@@ -37,10 +65,10 @@ describe Ci::ArchiveTraceService, '#execute' do ...@@ -37,10 +65,10 @@ describe Ci::ArchiveTraceService, '#execute' do
issue_url: 'https://gitlab.com/gitlab-org/gitlab-ce/issues/51502', issue_url: 'https://gitlab.com/gitlab-org/gitlab-ce/issues/51502',
extra: { job_id: job.id } ).once extra: { job_id: job.id } ).once
expect(Rails.logger) expect(Sidekiq.logger).to receive(:warn).with(
.to receive(:error) class: ArchiveTraceWorker.name,
.with("Failed to archive trace. id: #{job.id} message: Job is not finished yet") message: "Failed to archive trace. message: Job is not finished yet.",
.and_call_original job_id: job.id).and_call_original
expect(Gitlab::Metrics) expect(Gitlab::Metrics)
.to receive(:counter) .to receive(:counter)
......
...@@ -720,6 +720,58 @@ shared_examples_for 'trace with enabled live trace feature' do ...@@ -720,6 +720,58 @@ shared_examples_for 'trace with enabled live trace feature' do
end end
end end
describe '#archived_trace_exist?' do
subject { trace.archived_trace_exist? }
context 'when trace does not exist' do
it { is_expected.to be_falsy }
end
context 'when archived trace exists' do
before do
create(:ci_job_artifact, :trace, job: build)
end
it { is_expected.to be_truthy }
end
context 'when live trace exists' do
before do
Gitlab::Ci::Trace::ChunkedIO.new(build) do |stream|
stream.write('abc')
end
end
it { is_expected.to be_falsy }
end
end
describe '#live_trace_exist?' do
subject { trace.live_trace_exist? }
context 'when trace does not exist' do
it { is_expected.to be_falsy }
end
context 'when archived trace exists' do
before do
create(:ci_job_artifact, :trace, job: build)
end
it { is_expected.to be_falsy }
end
context 'when live trace exists' do
before do
Gitlab::Ci::Trace::ChunkedIO.new(build) do |stream|
stream.write('abc')
end
end
it { is_expected.to be_truthy }
end
end
describe '#archive!' do describe '#archive!' do
subject { trace.archive! } subject { trace.archive! }
......
...@@ -11,7 +11,7 @@ describe ArchiveTraceWorker do ...@@ -11,7 +11,7 @@ describe ArchiveTraceWorker do
it 'executes service' do it 'executes service' do
expect_any_instance_of(Ci::ArchiveTraceService) expect_any_instance_of(Ci::ArchiveTraceService)
.to receive(:execute).with(job) .to receive(:execute).with(job, anything)
subject subject
end end
......
...@@ -34,7 +34,7 @@ describe Ci::ArchiveTracesCronWorker do ...@@ -34,7 +34,7 @@ describe Ci::ArchiveTracesCronWorker do
it 'executes service' do it 'executes service' do
expect_any_instance_of(Ci::ArchiveTraceService) expect_any_instance_of(Ci::ArchiveTraceService)
.to receive(:execute).with(build) .to receive(:execute).with(build, anything)
subject subject
end end
...@@ -60,7 +60,10 @@ describe Ci::ArchiveTracesCronWorker do ...@@ -60,7 +60,10 @@ describe Ci::ArchiveTracesCronWorker do
end end
it 'puts a log' do it 'puts a log' do
expect(Rails.logger).to receive(:error).with("Failed to archive trace. id: #{build.id} message: Unexpected error") expect(Sidekiq.logger).to receive(:warn).with(
class: described_class.name,
message: "Failed to archive trace. message: Unexpected error.",
job_id: build.id)
subject subject
end end
......
...@@ -7,8 +7,6 @@ describe StuckCiJobsWorker do ...@@ -7,8 +7,6 @@ describe StuckCiJobsWorker do
let!(:runner) { create :ci_runner } let!(:runner) { create :ci_runner }
let!(:job) { create :ci_build, runner: runner } let!(:job) { create :ci_build, runner: runner }
let(:trace_lease_key) { "trace:write:lock:#{job.id}" }
let(:trace_lease_uuid) { SecureRandom.uuid }
let(:worker_lease_key) { StuckCiJobsWorker::EXCLUSIVE_LEASE_KEY } let(:worker_lease_key) { StuckCiJobsWorker::EXCLUSIVE_LEASE_KEY }
let(:worker_lease_uuid) { SecureRandom.uuid } let(:worker_lease_uuid) { SecureRandom.uuid }
...@@ -16,7 +14,6 @@ describe StuckCiJobsWorker do ...@@ -16,7 +14,6 @@ describe StuckCiJobsWorker do
before do before do
stub_exclusive_lease(worker_lease_key, worker_lease_uuid) stub_exclusive_lease(worker_lease_key, worker_lease_uuid)
stub_exclusive_lease(trace_lease_key, trace_lease_uuid)
job.update!(status: status, updated_at: updated_at) job.update!(status: status, updated_at: updated_at)
end end
...@@ -195,7 +192,6 @@ describe StuckCiJobsWorker do ...@@ -195,7 +192,6 @@ describe StuckCiJobsWorker do
end end
it 'cancels exclusive leases after worker perform' do it 'cancels exclusive leases after worker perform' do
expect_to_cancel_exclusive_lease(trace_lease_key, trace_lease_uuid)
expect_to_cancel_exclusive_lease(worker_lease_key, worker_lease_uuid) expect_to_cancel_exclusive_lease(worker_lease_key, worker_lease_uuid)
worker.perform worker.perform
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment