Commit 8dbf036e authored by Sean McGivern's avatar Sean McGivern

Filter Sidekiq arguments in log files

Sidekiq arguments can contain sensitive data. As with Sentry (in the
previous commit), we should not emit sensitive arguments in log files.
parent 59721a9d
---
title: Filter potentially-sensitive Sidekiq arguments from logs and Sentry
merge_request: 33967
author:
type: changed
......@@ -29,9 +29,19 @@ Example:
gitlab_rails['env'] = {"SIDEKIQ_LOG_ARGUMENTS" => "1"}
```
Please note: It is not recommended to enable this setting in production because some
Sidekiq jobs (such as sending a password reset email) take secret arguments (for
example the password reset token).
This does not log all job arguments. To avoid logging sensitive
information (for instance, password reset tokens), it logs numeric
arguments for all workers, with overrides for some specific workers
where their arguments are not sensitive. Permitted non-numeric arguments
for specific workers can be seen in the [Sidekiq Processor source](https://gitlab.com/gitlab-org/gitlab/-/blob/master/lib/gitlab/error_tracking/processor/sidekiq_processor.rb).
Example log output:
```json
{"severity":"INFO","time":"2020-06-08T14:37:37.892Z","class":"AdminEmailsWorker","args":["[FILTERED]","[FILTERED]","[FILTERED]"],"retry":3,"queue":"admin_emails","backtrace":true,"jid":"9e35e2674ac7b12d123e13cc","created_at":"2020-06-08T14:37:37.373Z","meta.user":"root","meta.caller_id":"Admin::EmailsController#create","correlation_id":"37D3lArJmT1","uber-trace-id":"2d942cc98cc1b561:6dc94409cfdd4d77:9fbe19bdee865293:1","enqueued_at":"2020-06-08T14:37:37.410Z","pid":65011,"message":"AdminEmailsWorker JID-9e35e2674ac7b12d123e13cc: done: 0.48085 sec","job_status":"done","scheduling_latency_s":0.001012,"redis_calls":9,"redis_duration_s":0.004608,"redis_read_bytes":696,"redis_write_bytes":6141,"duration_s":0.48085,"cpu_s":0.308849,"completed_at":"2020-06-08T14:37:37.892Z","db_duration_s":0.010742}
{"severity":"INFO","time":"2020-06-08T14:37:37.894Z","class":"ActiveJob::QueueAdapters::SidekiqAdapter::JobWrapper","wrapped":"ActionMailer::DeliveryJob","queue":"mailers","args":["[FILTERED]"],"retry":3,"backtrace":true,"jid":"e47a4f6793d475378432e3c8","created_at":"2020-06-08T14:37:37.884Z","meta.user":"root","meta.caller_id":"AdminEmailsWorker","correlation_id":"37D3lArJmT1","uber-trace-id":"2d942cc98cc1b561:29344de0f966446d:5c3b0e0e1bef987b:1","enqueued_at":"2020-06-08T14:37:37.885Z","pid":65011,"message":"ActiveJob::QueueAdapters::SidekiqAdapter::JobWrapper JID-e47a4f6793d475378432e3c8: start","job_status":"start","scheduling_latency_s":0.009473}
{"severity":"INFO","time":"2020-06-08T14:39:50.648Z","class":"NewIssueWorker","args":["455","1"],"retry":3,"queue":"new_issue","backtrace":true,"jid":"a24af71f96fd129ec47f5d1e","created_at":"2020-06-08T14:39:50.643Z","meta.user":"root","meta.project":"h5bp/html5-boilerplate","meta.root_namespace":"h5bp","meta.caller_id":"Projects::IssuesController#create","correlation_id":"f9UCZHqhuP7","uber-trace-id":"28f65730f99f55a3:a5d2b62dec38dffc:48ddd092707fa1b7:1","enqueued_at":"2020-06-08T14:39:50.646Z","pid":65011,"message":"NewIssueWorker JID-a24af71f96fd129ec47f5d1e: start","job_status":"start","scheduling_latency_s":0.001144}
```
When using [Sidekiq JSON logging](../logs.md#sidekiqlog),
argument logs are limited to a maximum size of 10 kilobytes of text;
......
......@@ -28,6 +28,13 @@ module Gitlab
end
end
# Builds the list of job arguments that is safe to write to logs.
#
# The arguments are first passed through filter_arguments (defined elsewhere
# in this class), then size-limited by Gitlab::Utils::LogLimitedArray, and
# finally every remaining element is converted with #to_s.
#
# @param args [Array] the raw Sidekiq job arguments
# @param klass [String] the worker class name, forwarded to filter_arguments
# @return [Array<String>] the filtered, size-limited, stringified arguments
def self.loggable_arguments(args, klass)
  filtered = filter_arguments(args, klass)
  limited = Gitlab::Utils::LogLimitedArray.log_limited_array(filtered)

  limited.map(&:to_s).to_a
end
def process(value, key = nil)
sidekiq = value.dig(:extra, :sidekiq)
......
......@@ -14,18 +14,11 @@ module Gitlab
payload.delete('extra.server')
# The raven extra context is populated by Raven::SidekiqCleanupMiddleware.
#
# It contains the full sidekiq job which consists of mixed types and nested
# objects. That causes a bunch of issues when trying to ingest logs into
# Elasticsearch.
#
# We apply a stricter schema here that forces the args to be an array of
# strings. This same logic exists in Gitlab::SidekiqLogging::JSONFormatter.
payload['extra.sidekiq'].tap do |value|
if value.is_a?(Hash) && value.key?('args')
value = value.dup
payload['extra.sidekiq']['args'] = Gitlab::Utils::LogLimitedArray.log_limited_array(value['args'].try(:map, &:to_s))
payload['extra.sidekiq']['args'] = Gitlab::ErrorTracking::Processor::SidekiqProcessor
.loggable_arguments(value['args'], value['class'])
end
end
......
......@@ -18,13 +18,15 @@ module Gitlab
when String
output[:message] = data
when Hash
output.merge!(data)
# jobstr is redundant and can include information we wanted to
# exclude (like arguments)
data.delete(:jobstr)
convert_to_iso8601!(data)
convert_retry_to_integer!(data)
stringify_args!(data)
output.merge!(data)
output.delete(:jobstr)
convert_to_iso8601!(output)
convert_retry_to_integer!(output)
process_args!(output)
end
output.to_json + "\n"
......@@ -59,8 +61,11 @@ module Gitlab
end
end
def stringify_args!(payload)
payload['args'] = Gitlab::Utils::LogLimitedArray.log_limited_array(payload['args'].map(&:to_s)) if payload['args']
# Replaces payload['args'] in place with the loggable form produced by
# SidekiqProcessor.loggable_arguments, passing payload['class'] as the
# worker class. Does nothing when the payload has no 'args' entry.
#
# @param payload [Hash] the log payload being formatted (mutated)
# @return [Object, nil] the new 'args' value, or nil when there were no args
def process_args!(payload)
  raw_args = payload['args']
  return unless raw_args

  processor = Gitlab::ErrorTracking::Processor::SidekiqProcessor
  payload['args'] = processor.loggable_arguments(raw_args, payload['class'])
end
end
end
......
......@@ -9,14 +9,14 @@ module Gitlab
# to around 10 kilobytes. Once we hit the limit, add the sentinel
# value as the last item in the returned array.
def self.log_limited_array(array, sentinel: '...')
return [] unless array.is_a?(Array)
return [] unless array.is_a?(Array) || array.is_a?(Enumerator::Lazy)
total_length = 0
limited_array = array.take_while do |arg|
total_length += JsonSizeEstimator.estimate(arg)
total_length <= MAXIMUM_ARRAY_LENGTH
end
end.to_a
limited_array.push(sentinel) if total_length > MAXIMUM_ARRAY_LENGTH
......
......@@ -43,6 +43,18 @@ RSpec.describe Gitlab::ErrorTracking::Processor::SidekiqProcessor do
end
end
describe '.loggable_arguments' do
  it 'filters and limits the arguments, then converts to strings' do
    half_limit = Gitlab::Utils::LogLimitedArray::MAXIMUM_ARRAY_LENGTH / 2
    job_args = [[1, 2], 'a' * half_limit, 'b' * half_limit, 'c' * half_limit, 'd']
    expected = ['[1, 2]', 'a' * half_limit, '[FILTERED]', '...']

    # Stand-in worker class whose .loggable_arguments returns [0, 1, 3, 4];
    # the expectation shows the argument at index 2 coming back as
    # '[FILTERED]' and the tail being cut off with the '...' sentinel.
    stub_const('LoggableArguments', double(loggable_arguments: [0, 1, 3, 4]))

    actual = described_class.loggable_arguments(job_args, 'LoggableArguments')

    expect(actual).to eq(expected)
  end
end
describe '#process' do
context 'when there is Sidekiq data' do
shared_examples 'Sidekiq arguments' do |args_in_job_hash: true|
......
......@@ -186,10 +186,10 @@ describe Gitlab::ErrorTracking do
context 'with sidekiq args' do
it 'ensures extra.sidekiq.args is a string' do
extra = { sidekiq: { 'args' => [1, { 'id' => 2, 'name' => 'hello' }, 'some-value', 'another-value'] } }
extra = { sidekiq: { 'class' => 'PostReceive', 'args' => [1, { 'id' => 2, 'name' => 'hello' }, 'some-value', 'another-value'] } }
expect(Gitlab::ErrorTracking::Logger).to receive(:error).with(
hash_including({ 'extra.sidekiq' => { 'args' => ['1', '{"id"=>2, "name"=>"hello"}', 'some-value', 'another-value'] } }))
hash_including({ 'extra.sidekiq' => { 'class' => 'PostReceive', 'args' => ['1', '{"id"=>2, "name"=>"hello"}', 'some-value', 'another-value'] } }))
described_class.track_exception(exception, extra)
end
......@@ -197,6 +197,9 @@ describe Gitlab::ErrorTracking do
it 'filters sensitive arguments before sending' do
extra = { sidekiq: { 'class' => 'UnknownWorker', 'args' => ['sensitive string', 1, 2] } }
expect(Gitlab::ErrorTracking::Logger).to receive(:error).with(
hash_including('extra.sidekiq' => { 'class' => 'UnknownWorker', 'args' => ['[FILTERED]', '1', '2'] }))
described_class.track_exception(exception, extra)
sentry_event = Gitlab::Json.parse(Raven.client.transport.events.last[1])
......
......@@ -14,6 +14,7 @@ describe Gitlab::SidekiqLogging::JSONFormatter do
let(:hash_input) do
{
foo: 1,
'class' => 'PostReceive',
'bar' => 'test',
'created_at' => timestamp,
'enqueued_at' => timestamp,
......@@ -48,7 +49,17 @@ describe Gitlab::SidekiqLogging::JSONFormatter do
expect(subject).not_to include('jobstr')
end
context 'when the job args are bigger than the maximum allowed' do
it 'does not modify the input hash' do
  job_hash = { 'args' => [1, 'string'] }

  formatted = described_class.new.call('INFO', now, 'my program', job_hash)
  parsed = Gitlab::Json.parse(formatted)

  # The hash handed to the formatter must still hold the raw arguments...
  expect(job_hash['args']).to eq([1, 'string'])
  # ...while the emitted JSON carries the stringified, filtered ones.
  expect(parsed['args']).to eq(['1', '[FILTERED]'])
end
context 'job arguments' do
context 'when the arguments are bigger than the maximum allowed' do
it 'keeps args from the front until they exceed the limit' do
half_limit = Gitlab::Utils::LogLimitedArray::MAXIMUM_ARRAY_LENGTH / 2
hash_input['args'] = [1, 2, 'a' * half_limit, 'b' * half_limit, 3]
......@@ -59,11 +70,21 @@ describe Gitlab::SidekiqLogging::JSONFormatter do
end
end
context 'when the job has non-integer arguments' do
  it 'only allows permitted non-integer arguments through' do
    # For this worker class the expectation lets the first string through
    # and replaces the second with the '[FILTERED]' placeholder.
    hash_input.merge!('args' => [1, 'foo', 'bar'], 'class' => 'SystemHookPushWorker')

    logged_args = subject['args']

    expect(logged_args).to eq(['1', 'foo', '[FILTERED]'])
  end
end
it 'properly flattens arguments to a String' do
  # Mixed integer, string and hash arguments; each is expected to come back
  # as a string, with the hash rendered in its inspect-style form.
  mixed_args = [1, "test", 2, { 'test' => 1 }]
  flattened = ["1", "test", "2", %({"test"=>1})]

  hash_input['args'] = mixed_args

  expect(subject['args']).to eq(flattened)
end
end
context 'when the job has a non-integer value for retry' do
using RSpec::Parameterized::TableSyntax
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment