Commit 87fa73c5 authored by Andrew Newdigate's avatar Andrew Newdigate Committed by Kamil Trzciński

Attribute Sidekiq workers according to their workloads

These workload attributes include:

1. Are jobs for this worker latency sensitve?
1. Does the worker rely on external dependencies?
1. What resource boundary is the worker limited by? This can be either
   cpu or memory.

These attributes are non-mandatory, but in future, SLOS for workers will
be determined by these attributes, and therefore, if a job has specific
latency requirements, they should be configured through these
attributes.
parent 629dda38
......@@ -3,6 +3,10 @@
module WorkerAttributes
extend ActiveSupport::Concern
# Resource boundaries that workers can declare through the
# `worker_resource_boundary` attribute
VALID_RESOURCE_BOUNDARIES = [:memory, :cpu, :unknown].freeze
class_methods do
def feature_category(value)
raise "Invalid category. Use `feature_category_not_owned!` to mark a worker as not owned" if value == :not_owned
......@@ -24,6 +28,48 @@ module WorkerAttributes
get_worker_attribute(:feature_category) == :not_owned
end
# This should be set for jobs that need to be run immediately, or, if
# they are delayed, risk creating inconsistencies in the application
# that could being perceived by the user as incorrect behavior
# (ie, a bug)
# See doc/development/sidekiq_style_guide.md#Latency-Sensitive-Jobs
# for details
def latency_sensitive_worker!
worker_attributes[:latency_sensitive] = true
end
# Returns a truthy value if the worker is latency sensitive.
# See doc/development/sidekiq_style_guide.md#Latency-Sensitive-Jobs
# for details
def latency_sensitive_worker?
worker_attributes[:latency_sensitive]
end
# Set this attribute on a job when it will call to services outside of the
# application, such as 3rd party applications, other k8s clusters etc See
# doc/development/sidekiq_style_guide.md#Jobs-with-External-Dependencies for
# details
def worker_has_external_dependencies!
worker_attributes[:external_dependencies] = true
end
# Returns a truthy value if the worker has external dependencies.
# See doc/development/sidekiq_style_guide.md#Jobs-with-External-Dependencies
# for details
def worker_has_external_dependencies?
worker_attributes[:external_dependencies]
end
def worker_resource_boundary(boundary)
raise "Invalid boundary" unless VALID_RESOURCE_BOUNDARIES.include? boundary
worker_attributes[:resource_boundary] = boundary
end
def get_worker_resource_boundary
worker_attributes[:resource_boundary] || :unknown
end
protected
# Returns a worker attribute declared on this class or its parent class.
......
......@@ -5,6 +5,7 @@ class AuthorizedProjectsWorker
prepend WaitableWorker
feature_category :authentication_and_authorization
latency_sensitive_worker!
# This is a workaround for a Ruby 2.3.7 bug. rspec-mocks cannot restore the
# visibility of prepended modules. See https://github.com/rspec/rspec-mocks/issues/1231
......
......@@ -5,6 +5,8 @@ class BuildFinishedWorker
include PipelineQueue
queue_namespace :pipeline_processing
latency_sensitive_worker!
worker_resource_boundary :cpu
# rubocop: disable CodeReuse/ActiveRecord
def perform(build_id)
......
......@@ -6,6 +6,7 @@ class BuildHooksWorker
queue_namespace :pipeline_hooks
feature_category :continuous_integration
latency_sensitive_worker!
# rubocop: disable CodeReuse/ActiveRecord
def perform(build_id)
......
......@@ -6,6 +6,8 @@ class BuildQueueWorker
queue_namespace :pipeline_processing
feature_category :continuous_integration
latency_sensitive_worker!
worker_resource_boundary :cpu
# rubocop: disable CodeReuse/ActiveRecord
def perform(build_id)
......
......@@ -5,6 +5,7 @@ class BuildSuccessWorker
include PipelineQueue
queue_namespace :pipeline_processing
latency_sensitive_worker!
# rubocop: disable CodeReuse/ActiveRecord
def perform(build_id)
......
......@@ -4,6 +4,11 @@ class ChatNotificationWorker
include ApplicationWorker
feature_category :chatops
latency_sensitive_worker!
# TODO: break this into multiple jobs
# as the `responder` uses external dependencies
# See https://gitlab.com/gitlab-com/gl-infra/scalability/issues/34
# worker_has_external_dependencies!
RESCHEDULE_INTERVAL = 2.seconds
......
......@@ -7,6 +7,7 @@ module Ci
queue_namespace :pipeline_processing
feature_category :continuous_integration
worker_resource_boundary :cpu
def perform(build_id)
::Ci::Build.find_by_id(build_id).try do |build|
......
......@@ -5,6 +5,8 @@ class ClusterInstallAppWorker
include ClusterQueue
include ClusterApplications
worker_has_external_dependencies!
def perform(app_name, app_id)
find_application(app_name, app_id) do |app|
Clusters::Applications::InstallService.new(app).execute
......
......@@ -5,6 +5,8 @@ class ClusterPatchAppWorker
include ClusterQueue
include ClusterApplications
worker_has_external_dependencies!
def perform(app_name, app_id)
find_application(app_name, app_id) do |app|
Clusters::Applications::PatchService.new(app).execute
......
......@@ -4,6 +4,8 @@ class ClusterProjectConfigureWorker
include ApplicationWorker
include ClusterQueue
worker_has_external_dependencies!
def perform(project_id)
# Scheduled for removal in https://gitlab.com/gitlab-org/gitlab-foss/issues/59319
end
......
......@@ -4,6 +4,8 @@ class ClusterProvisionWorker
include ApplicationWorker
include ClusterQueue
worker_has_external_dependencies!
def perform(cluster_id)
Clusters::Cluster.find_by_id(cluster_id).try do |cluster|
cluster.provider.try do |provider|
......
......@@ -5,6 +5,8 @@ class ClusterUpgradeAppWorker
include ClusterQueue
include ClusterApplications
worker_has_external_dependencies!
def perform(app_name, app_id)
find_application(app_name, app_id) do |app|
Clusters::Applications::UpgradeService.new(app).execute
......
......@@ -8,6 +8,9 @@ class ClusterWaitForAppInstallationWorker
INTERVAL = 10.seconds
TIMEOUT = 20.minutes
worker_has_external_dependencies!
worker_resource_boundary :cpu
def perform(app_name, app_id)
find_application(app_name, app_id) do |app|
Clusters::Applications::CheckInstallationProgressService.new(app).execute
......
......@@ -5,6 +5,8 @@ class ClusterWaitForIngressIpAddressWorker
include ClusterQueue
include ClusterApplications
worker_has_external_dependencies!
def perform(app_name, app_id)
find_application(app_name, app_id) do |app|
Clusters::Applications::CheckIngressIpAddressService.new(app).execute
......
......@@ -7,6 +7,8 @@ module Clusters
include ClusterQueue
include ClusterApplications
worker_has_external_dependencies!
def perform(app_name, app_id)
find_application(app_name, app_id) do |app|
Clusters::Applications::UninstallService.new(app).execute
......
......@@ -10,6 +10,9 @@ module Clusters
INTERVAL = 10.seconds
TIMEOUT = 20.minutes
worker_has_external_dependencies!
worker_resource_boundary :cpu
def perform(app_name, app_id)
find_application(app_name, app_id) do |app|
Clusters::Applications::CheckUninstallProgressService.new(app).execute
......
......@@ -14,6 +14,7 @@ module Gitlab
include NotifyUponDeath
feature_category :importers
worker_has_external_dependencies!
end
# project - An instance of `Project` to import the data into.
......
......@@ -6,6 +6,8 @@ class CreatePipelineWorker
queue_namespace :pipeline_creation
feature_category :continuous_integration
latency_sensitive_worker!
worker_resource_boundary :cpu
def perform(project_id, user_id, ref, source, params = {})
project = Project.find(project_id)
......
......@@ -6,6 +6,7 @@ module Deployments
queue_namespace :deployment
feature_category :continuous_delivery
worker_resource_boundary :cpu
def perform(deployment_id)
Deployment.find_by_id(deployment_id).try(:execute_hooks)
......
......@@ -6,6 +6,7 @@ module Deployments
queue_namespace :deployment
feature_category :continuous_delivery
worker_resource_boundary :cpu
def perform(deployment_id)
Deployment.find_by_id(deployment_id).try do |deployment|
......
......@@ -4,6 +4,7 @@ class EmailReceiverWorker
include ApplicationWorker
feature_category :issue_tracking
latency_sensitive_worker!
def perform(raw)
return unless Gitlab::IncomingEmail.enabled?
......
......@@ -6,6 +6,8 @@ class EmailsOnPushWorker
attr_reader :email, :skip_premailer
feature_category :source_code_management
latency_sensitive_worker!
worker_resource_boundary :cpu
def perform(project_id, recipients, push_data, options = {})
options.symbolize_keys!
......
......@@ -5,6 +5,7 @@ class ExpireJobCacheWorker
include PipelineQueue
queue_namespace :pipeline_cache
latency_sensitive_worker!
# rubocop: disable CodeReuse/ActiveRecord
def perform(job_id)
......
......@@ -5,6 +5,8 @@ class ExpirePipelineCacheWorker
include PipelineQueue
queue_namespace :pipeline_cache
latency_sensitive_worker!
worker_resource_boundary :cpu
# rubocop: disable CodeReuse/ActiveRecord
def perform(pipeline_id)
......
......@@ -5,6 +5,7 @@ class GitlabShellWorker
include Gitlab::ShellAdapter
feature_category :source_code_management
latency_sensitive_worker!
def perform(action, *arg)
gitlab_shell.__send__(action, *arg) # rubocop:disable GitlabSecurity/PublicSend
......
......@@ -4,6 +4,7 @@ class ImportIssuesCsvWorker
include ApplicationWorker
feature_category :issue_tracking
worker_resource_boundary :cpu
sidekiq_retries_exhausted do |job|
Upload.find(job['args'][2]).destroy
......
......@@ -8,6 +8,7 @@ module MailScheduler
include MailSchedulerQueue
feature_category :issue_tracking
worker_resource_boundary :cpu
def perform(meth, *args)
check_arguments!(args)
......
......@@ -4,6 +4,7 @@ class MergeWorker
include ApplicationWorker
feature_category :source_code_management
latency_sensitive_worker!
def perform(merge_request_id, current_user_id, params)
params = params.with_indifferent_access
......
......@@ -6,6 +6,7 @@ module Namespaces
include CronjobQueue
feature_category :source_code_management
worker_resource_boundary :cpu
# Worker to prune pending rows on Namespace::AggregationSchedule
# It's scheduled to run once a day at 1:05am.
......
......@@ -5,6 +5,8 @@ class NewIssueWorker
include NewIssuable
feature_category :issue_tracking
latency_sensitive_worker!
worker_resource_boundary :cpu
def perform(issue_id, user_id)
return unless objects_found?(issue_id, user_id)
......
......@@ -5,6 +5,8 @@ class NewMergeRequestWorker
include NewIssuable
feature_category :source_code_management
latency_sensitive_worker!
worker_resource_boundary :cpu
def perform(merge_request_id, user_id)
return unless objects_found?(merge_request_id, user_id)
......
......@@ -4,6 +4,8 @@ class NewNoteWorker
include ApplicationWorker
feature_category :issue_tracking
latency_sensitive_worker!
worker_resource_boundary :cpu
# Keep extra parameter to preserve backwards compatibility with
# old `NewNoteWorker` jobs (can remove later)
......
......@@ -5,6 +5,8 @@ module ObjectPool
include ApplicationWorker
include ObjectPoolQueue
worker_resource_boundary :cpu
# The use of pool id is deprecated. Keeping the argument allows old jobs to
# still be performed.
def perform(_pool_id, project_id)
......
......@@ -5,6 +5,7 @@ class PagesDomainRemovalCronWorker
include CronjobQueue
feature_category :pages
worker_resource_boundary :cpu
def perform
PagesDomain.for_removal.find_each do |domain|
......
......@@ -5,6 +5,8 @@ class PipelineHooksWorker
include PipelineQueue
queue_namespace :pipeline_hooks
latency_sensitive_worker!
worker_resource_boundary :cpu
# rubocop: disable CodeReuse/ActiveRecord
def perform(pipeline_id)
......
......@@ -4,6 +4,8 @@ class PipelineMetricsWorker
include ApplicationWorker
include PipelineQueue
latency_sensitive_worker!
# rubocop: disable CodeReuse/ActiveRecord
def perform(pipeline_id)
Ci::Pipeline.find_by(id: pipeline_id).try do |pipeline|
......
......@@ -4,6 +4,9 @@ class PipelineNotificationWorker
include ApplicationWorker
include PipelineQueue
latency_sensitive_worker!
worker_resource_boundary :cpu
# rubocop: disable CodeReuse/ActiveRecord
def perform(pipeline_id, recipients = nil)
pipeline = Ci::Pipeline.find_by(id: pipeline_id)
......
......@@ -6,6 +6,7 @@ class PipelineProcessWorker
queue_namespace :pipeline_processing
feature_category :continuous_integration
latency_sensitive_worker!
# rubocop: disable CodeReuse/ActiveRecord
def perform(pipeline_id, build_ids = nil)
......
......@@ -5,6 +5,7 @@ class PipelineScheduleWorker
include CronjobQueue
feature_category :continuous_integration
worker_resource_boundary :cpu
def perform
Ci::PipelineSchedule.runnable_schedules.preloaded.find_in_batches do |schedules|
......
......@@ -5,6 +5,7 @@ class PipelineSuccessWorker
include PipelineQueue
queue_namespace :pipeline_processing
latency_sensitive_worker!
def perform(pipeline_id)
# no-op
......
......@@ -5,6 +5,7 @@ class PipelineUpdateWorker
include PipelineQueue
queue_namespace :pipeline_processing
latency_sensitive_worker!
# rubocop: disable CodeReuse/ActiveRecord
def perform(pipeline_id)
......
......@@ -4,6 +4,8 @@ class PostReceive
include ApplicationWorker
feature_category :source_code_management
latency_sensitive_worker!
worker_resource_boundary :cpu
def perform(gl_repository, identifier, changes, push_options = {})
project, repo_type = Gitlab::GlRepository.parse(gl_repository)
......
......@@ -11,6 +11,7 @@ class ProcessCommitWorker
include ApplicationWorker
feature_category :source_code_management
latency_sensitive_worker!
# project_id - The ID of the project this commit belongs to.
# user_id - The ID of the user that pushed the commit.
......
......@@ -3,6 +3,9 @@
# Worker for updating any project specific caches.
class ProjectCacheWorker
include ApplicationWorker
latency_sensitive_worker!
LEASE_TIMEOUT = 15.minutes.to_i
feature_category :source_code_management
......
......@@ -6,6 +6,7 @@ class ProjectExportWorker
sidekiq_options retry: 3
feature_category :source_code_management
worker_resource_boundary :memory
def perform(current_user_id, project_id, after_export_strategy = {}, params = {})
current_user = User.find(current_user_id)
......
......@@ -5,6 +5,7 @@ class ProjectServiceWorker
sidekiq_options dead: false
feature_category :integrations
worker_has_external_dependencies!
def perform(hook_id, data)
data = data.with_indifferent_access
......
......@@ -5,6 +5,14 @@ class ReactiveCachingWorker
feature_category_not_owned!
# TODO: The reactive caching worker should be split into
# two different workers, one for latency_sensitive jobs without external dependencies
# and another worker without latency_sensitivity, but with external dependencies
# https://gitlab.com/gitlab-com/gl-infra/scalability/issues/34
# This worker should also have `worker_has_external_dependencies!` enabled
latency_sensitive_worker!
worker_resource_boundary :cpu
def perform(class_name, id, *args)
klass = begin
class_name.constantize
......
......@@ -5,6 +5,7 @@ class RemoveExpiredMembersWorker
include CronjobQueue
feature_category :authentication_and_authorization
worker_resource_boundary :cpu
def perform
Member.expired.find_each do |member|
......
......@@ -7,6 +7,7 @@ class RepositoryImportWorker
include ProjectImportOptions
feature_category :importers
worker_has_external_dependencies!
# technical debt: https://gitlab.com/gitlab-org/gitlab/issues/33991
sidekiq_options memory_killer_memory_growth_kb: ENV.fetch('MEMORY_KILLER_REPOSITORY_IMPORT_WORKER_MEMORY_GROWTH_KB', 50).to_i
......
......@@ -6,6 +6,8 @@ class RepositoryUpdateRemoteMirrorWorker
include ApplicationWorker
include Gitlab::ExclusiveLeaseHelpers
worker_has_external_dependencies!
sidekiq_options retry: 3, dead: false
feature_category :source_code_management
......
......@@ -5,6 +5,7 @@ class StageUpdateWorker
include PipelineQueue
queue_namespace :pipeline_processing
latency_sensitive_worker!
# rubocop: disable CodeReuse/ActiveRecord
def perform(stage_id)
......
......@@ -5,6 +5,7 @@ class StuckCiJobsWorker
include CronjobQueue
feature_category :continuous_integration
worker_resource_boundary :cpu
EXCLUSIVE_LEASE_KEY = 'stuck_ci_builds_worker_lease'
......
......@@ -5,6 +5,7 @@ class StuckImportJobsWorker
include CronjobQueue
feature_category :importers
worker_resource_boundary :cpu
IMPORT_JOBS_EXPIRATION = 15.hours.to_i
......
......@@ -6,6 +6,8 @@ class UpdateHeadPipelineForMergeRequestWorker
queue_namespace :pipeline_processing
feature_category :continuous_integration
latency_sensitive_worker!
worker_resource_boundary :cpu
def perform(merge_request_id)
MergeRequest.find_by_id(merge_request_id).try do |merge_request|
......
......@@ -4,6 +4,8 @@ class UpdateMergeRequestsWorker
include ApplicationWorker
feature_category :source_code_management
latency_sensitive_worker!
worker_resource_boundary :cpu
LOG_TIME_THRESHOLD = 90 # seconds
......
......@@ -4,6 +4,8 @@ class WaitForClusterCreationWorker
include ApplicationWorker
include ClusterQueue
worker_has_external_dependencies!
def perform(cluster_id)
Clusters::Cluster.find_by_id(cluster_id).try do |cluster|
cluster.provider.try do |provider|
......
......@@ -4,6 +4,8 @@ class WebHookWorker
include ApplicationWorker
feature_category :integrations
worker_has_external_dependencies!
sidekiq_options retry: 4, dead: false
def perform(hook_id, data, hook_name)
......
---
title: Attribute Sidekiq workers according to their workloads
merge_request: 18066
author:
type: other
......@@ -368,7 +368,7 @@ Enterprise Edition instance. This has some implications:
- [Background migrations](background_migrations.md) run in Sidekiq, and
should only be done for migrations that would take an extreme amount of
time at GitLab.com scale.
1. **Sidekiq workers** [cannot change in a backwards-incompatible way](sidekiq_style_guide.md#removing-or-renaming-queues):
1. **Sidekiq workers** [cannot change in a backwards-incompatible way](sidekiq_style_guide.md#sidekiq-compatibility-across-updates):
1. Sidekiq queues are not drained before a deploy happens, so there will be
workers in the queue from the previous version of GitLab.
1. If you need to change a method signature, try to do so across two releases,
......
......@@ -61,6 +61,168 @@ the extra jobs will take resources away from jobs from workers that were already
there, if the resources available to the Sidekiq process handling the namespace
are not adjusted appropriately.
## Latency Sensitive Jobs
If a large number of background jobs get scheduled at once, queueing of jobs may
occur while jobs wait for a worker node to be become available. This is normal
and gives the system resilience by allowing it to gracefully handle spikes in
traffic. Some jobs, however, are more sensitive to latency than others. Examples
of these jobs include:
1. A job which updates a merge request following a push to a branch.
1. A job which invalidates a cache of known branches for a project after a push
to the branch.
1. A job which recalculates the groups and projects a user can see after a
change in permissions.
1. A job which updates the status of a CI pipeline after a state change to a job
in the pipeline.
When these jobs are delayed, the user may perceive the delay as a bug: for
example, they may push a branch and then attempt to create a merge request for
that branch, but be told in the UI that the branch does not exist. We deem these
jobs to be `latency_sensitive`.
Extra effort is made to ensure that these jobs are started within a very short
period of time after being scheduled. However, in order to ensure throughput,
these jobs also have very strict execution duration requirements:
1. The median job execution time should be less than 1 second.
1. 99% of jobs should complete within 10 seconds.
If a worker cannot meet these expectations, then it cannot be treated as a
`latency_sensitive` worker: consider redesigning the worker, or splitting the
work between two different workers, one with `latency_sensitive` code that
executes quickly, and the other with non-`latency_sensitive`, which has no
execution latency requirements (but also has lower scheduling targets).
This can be summed up in the following table:
| **Latency Sensitivity** | **Queue Scheduling Target** | **Execution Latency Requirement** |
|-------------------------|-----------------------------|-------------------------------------|
| Not `latency_sensitive` | 1 minute | Maximum run time of 1 hour |
| `latency_sensitive` | 100 milliseconds | p50 of 1 second, p99 of 10 seconds |
To mark a worker as being `latency_sensitive`, use the
`latency_sensitive_worker!` attribute, as shown in this example:
```ruby
class LatencySensitiveWorker
include ApplicationWorker
latency_sensitive_worker!
# ...
end
```
## Jobs with External Dependencies
Most background jobs in the GitLab application communicate with other GitLab
services, eg Postgres, Redis, Gitaly and Object Storage. These are considered
to be "internal" dependencies for a job.
However, some jobs will be dependent on external services in order to complete
successfully. Some examples include:
1. Jobs which call web-hooks configured by a user.
1. Jobs which deploy an application to a k8s cluster configured by a user.
These jobs have "external dependencies". This is important for the operation of
the background processing cluster in several ways:
1. Most external dependencies (such as web-hooks) do not provide SLOs, and
therefore we cannot guarantee the execution latencies on these jobs. Since we
cannot guarantee execution latency, we cannot ensure throughput and
therefore, in high-traffic environments, we need to ensure that jobs with
external dependencies are separated from `latency_sensitive` jobs, to ensure
throughput on those queues.
1. Errors in jobs with external dependencies have higher alerting thresholds as
there is a likelihood that the cause of the error is external.
```ruby
class ExternalDependencyWorker
include ApplicationWorker
# Declares that this worker depends on
# third-party, external services in order
# to complete successfully
worker_has_external_dependencies!
# ...
end
```
NOTE: **Note:** Note that a job cannot be both latency sensitive and have
external dependencies.
## CPU-bound and Memory-bound Workers
Workers that are constrained by CPU or memory resource limitations should be
annotated with the `worker_resource_boundary` method.
Most workers tend to spend most of their time blocked, wait on network responses
from other services such as Redis, Postgres and Gitaly. Since Sidekiq is a
multithreaded environment, these jobs can be scheduled with high concurrency.
Some workers, however, spend large amounts of time _on-cpu_ running logic in
Ruby. Ruby MRI does not support true multithreading - it relies on the
[GIL](https://thoughtbot.com/blog/untangling-ruby-threads#the-global-interpreter-lock)
to greatly simplify application development by only allowing one section of Ruby
code in a process to run at a time, no matter how many cores the machine
hosting the process has. For IO bound workers, this is not a problem, since most
of the threads are blocked in underlying libraries (which are outside of the
GIL).
If many threads are attempting to run Ruby code simultaneously, this will lead
to contention on the GIL which will have the affect of slowing down all
processes.
In high-traffic environments, knowing that a worker is CPU-bound allows us to
run it on a different fleet with lower concurrency. This ensures optimal
performance.
Likewise, if a worker uses large amounts of memory, we can run these on a
bespoke low concurrency, high memory fleet.
Note that Memory-bound workers create heavy GC workloads, with pauses of
10-50ms. This will have an impact on the latency requirements for the
worker. For this reason, `memory` bound, `latency_sensitive` jobs are not
permitted and will fail CI. In general, `memory` bound workers are
discouraged, and alternative approaches to processing the work should be
considered.
## Declaring a Job as CPU-bound
This example shows how to declare a job as being CPU-bound.
```ruby
class CPUIntensiveWorker
include ApplicationWorker
# Declares that this worker will perform a lot of
# calculations on-CPU.
worker_resource_boundary :cpu
# ...
end
```
## Determining whether a worker is CPU-bound
We use the following approach to determine whether a worker is CPU-bound:
- In the sidekiq structured JSON logs, aggregate the worker `duration` and
`cpu_s` fields.
- `duration` refers to the total job execution duration, in seconds
- `cpu_s` is derived from the
[`Process::CLOCK_THREAD_CPUTIME_ID`](https://www.rubydoc.info/stdlib/core/Process:clock_gettime)
counter, and is a measure of time spent by the job on-CPU.
- Divide `cpu_s` by `duration` to get the percentage time spend on-CPU.
- If this ratio exceeds 33%, the worker is considered CPU-bound and should be
annotated as such.
- Note that these values should not be used over small sample sizes, but
rather over fairly large aggregates.
## Feature Categorization
Each Sidekiq worker, or one of its ancestor classes, must declare a
......@@ -74,7 +236,7 @@ The declaration uses the `feature_category` class method, as shown below.
class SomeScheduledTaskWorker
include ApplicationWorker
# Declares that this feature is part of the
# Declares that this worker is part of the
# `continuous_integration` feature category
feature_category :continuous_integration
......@@ -88,11 +250,11 @@ source](https://gitlab.com/gitlab-com/www-gitlab-com/blob/master/data/stages.yml
### Updating `config/feature_categories.yml`
Occassionally new features will be added to GitLab stages. When this occurs, you
Occasionally new features will be added to GitLab stages. When this occurs, you
can automatically update `config/feature_categories.yml` by running
`scripts/update-feature-categories`. This script will fetch and parse
[`stages.yml`](https://gitlab.com/gitlab-com/www-gitlab-com/blob/master/data/stages.yml)
and generare a new version of the file, which needs to be checked into source control.
and generate a new version of the file, which needs to be checked into source control.
### Excluding Sidekiq workers from feature categorization
......@@ -116,9 +278,63 @@ end
Each Sidekiq worker must be tested using RSpec, just like any other class. These
tests should be placed in `spec/workers`.
## Removing or renaming queues
## Sidekiq Compatibility across Updates
Keep in mind that the arguments for a Sidekiq job are stored in a queue while it
is scheduled for execution. During a online update, this could lead to several
possible situations:
1. An older version of the application publishes a job, which is executed by an
upgraded Sidekiq node.
1. A job is queued before an upgrade, but executed after an upgrade.
1. A job is queued by a node running the newer version of the application, but
executed on a node running an older version of the application.
### Changing the arguments for a worker
Jobs need to be backwards- and forwards-compatible between consecutive versions
of the application.
This can be done by following this process:
1. **Do not remove arguments from the `perform` function.**. Instead, use the
following approach
1. Provide a default value (usually `nil`) and use a comment to mark the
argument as deprecated
1. Stop using the argument in `perform_async`.
1. Ignore the value in the worker class, but do not remove it until the next
major release.
### Removing workers
Try to avoid removing workers and their queues in minor and patch
releases.
Try to avoid renaming or removing workers and their queues in minor and patch releases.
During online update instance can have pending jobs and removing the queue can
lead to those jobs being stuck forever. If you can't write migration for those
Sidekiq jobs, please consider doing rename or remove queue in major release only.
Sidekiq jobs, please consider removing the worker in a major release only.
### Renaming queues
For the same reasons that removing workers is dangerous, care should be taken
when renaming queues.
When renaming queues, use the `sidekiq_queue_migrate` helper migration method,
as show in this example:
```ruby
class MigrateTheRenamedSidekiqQueue < ActiveRecord::Migration[5.0]
include Gitlab::Database::MigrationHelpers
DOWNTIME = false
def up
sidekiq_queue_migrate 'old_queue_name', to: 'new_queue_name'
end
def down
sidekiq_queue_migrate 'new_queue_name', to: 'old_queue_name'
end
end
```
......@@ -5,6 +5,8 @@ module Ci
include ::ApplicationWorker
include ::PipelineQueue
worker_resource_boundary :cpu
def perform(bridge_id)
::Ci::Bridge.find_by_id(bridge_id).try do |bridge|
::Ci::CreateCrossProjectPipelineService
......
......@@ -5,6 +5,9 @@ module Ci
include ::ApplicationWorker
include ::PipelineQueue
latency_sensitive_worker!
worker_resource_boundary :cpu
def perform(pipeline_id)
::Ci::Pipeline.find_by_id(pipeline_id).try do |pipeline|
::Ci::PipelineBridgeStatusService
......
......@@ -5,6 +5,8 @@ class CreateGithubWebhookWorker
include GrapePathHelpers::NamedRouteMatcher
feature_category :integrations
worker_resource_boundary :cpu
worker_has_external_dependencies!
attr_reader :project
......
......@@ -4,6 +4,7 @@ class ExportCsvWorker
include ApplicationWorker
feature_category :issue_tracking
worker_resource_boundary :memory
def perform(current_user_id, project_id, params)
@current_user = User.find(current_user_id)
......
......@@ -6,6 +6,7 @@ module IncidentManagement
queue_namespace :incident_management
feature_category :incident_management
worker_resource_boundary :cpu
def perform(project_id, alert_hash)
project = find_project(project_id)
......
......@@ -5,6 +5,7 @@ class LdapAllGroupsSyncWorker
include CronjobQueue
feature_category :authentication_and_authorization
worker_has_external_dependencies!
def perform
return unless Gitlab::Auth::LDAP::Config.group_sync_enabled?
......
......@@ -4,6 +4,7 @@ class LdapGroupSyncWorker
include ApplicationWorker
feature_category :authentication_and_authorization
worker_has_external_dependencies!
# rubocop: disable CodeReuse/ActiveRecord
def perform(group_ids, provider = nil)
......
......@@ -5,6 +5,7 @@ class LdapSyncWorker
include CronjobQueue
feature_category :authentication_and_authorization
worker_has_external_dependencies!
# rubocop: disable CodeReuse/ActiveRecord
# rubocop: disable Gitlab/RailsLogger
......
......@@ -5,6 +5,7 @@ class NewEpicWorker
include NewIssuable
feature_category :agile_portfolio_management
worker_resource_boundary :cpu
def perform(epic_id, user_id)
return unless objects_found?(epic_id, user_id)
......
......@@ -7,6 +7,8 @@ class SyncSecurityReportsToReportApprovalRulesWorker
include PipelineQueue
feature_category :static_application_security_testing
latency_sensitive_worker!
worker_resource_boundary :cpu
def perform(pipeline_id)
pipeline = Ci::Pipeline.find_by_id(pipeline_id)
......
......@@ -5,6 +5,7 @@ class UpdateMaxSeatsUsedForGitlabComSubscriptionsWorker
include CronjobQueue
feature_category :license_compliance
worker_resource_boundary :cpu
# rubocop: disable CodeReuse/ActiveRecord
def perform
......
......@@ -21,8 +21,8 @@ describe 'Every Sidekiq worker' do
missing_from_file = worker_queues - file_worker_queues
expect(missing_from_file).to be_empty, "expected #{missing_from_file.to_a.inspect} to be in Gitlab::SidekiqConfig::QUEUE_CONFIG_PATHS"
unncessarily_in_file = file_worker_queues - worker_queues
expect(unncessarily_in_file).to be_empty, "expected #{unncessarily_in_file.to_a.inspect} not to be in Gitlab::SidekiqConfig::QUEUE_CONFIG_PATHS"
unnecessarily_in_file = file_worker_queues - worker_queues
expect(unnecessarily_in_file).to be_empty, "expected #{unnecessarily_in_file.to_a.inspect} not to be in Gitlab::SidekiqConfig::QUEUE_CONFIG_PATHS"
end
it 'has its queue or namespace in config/sidekiq_queues.yml', :aggregate_failures do
......@@ -42,7 +42,7 @@ describe 'Every Sidekiq worker' do
end
# All Sidekiq worker classes should declare a valid `feature_category`
# or explicitely be excluded with the `feature_category_not_owned!` annotation.
# or explicitly be excluded with the `feature_category_not_owned!` annotation.
# Please see doc/development/sidekiq_style_guide.md#Feature-Categorization for more details.
it 'has a feature_category or feature_category_not_owned! attribute', :aggregate_failures do
Gitlab::SidekiqConfig.workers.each do |worker|
......@@ -62,5 +62,36 @@ describe 'Every Sidekiq worker' do
expect(feature_categories).to include(worker.get_feature_category), "expected #{worker.inspect} to declare a valid feature_category, but got #{worker.get_feature_category}"
end
end
# Memory-bound workers are very expensive to run, since they need to run on nodes with very low
# concurrency, so that each job can consume a large amounts of memory. For this reason, on
# GitLab.com, when a large number of memory-bound jobs arrive at once, we let them queue up
# rather than scaling the hardware to meet the SLO. For this reason, memory-bound,
# latency-sensitive jobs are explicitly discouraged and disabled.
it 'is (exclusively) memory-bound or latency-sentitive, not both', :aggregate_failures do
latency_sensitive_workers = Gitlab::SidekiqConfig.workers
.select(&:latency_sensitive_worker?)
latency_sensitive_workers.each do |worker|
expect(worker.get_worker_resource_boundary).not_to eq(:memory), "#{worker.inspect} cannot be both memory-bound and latency sensitive"
end
end
# In high traffic installations, such as GitLab.com, `latency_sensitive` workers run in a
# dedicated fleet. In order to ensure short queue times, `latency_sensitive` jobs have strict
# SLOs in order to ensure throughput. However, when a worker depends on an external service,
# such as a user's k8s cluster or a third-party internet service, we cannot guarantee latency,
# and therefore throughput. An outage to an 3rd party service could therefore impact throughput
# on other latency_sensitive jobs, leading to degradation through the GitLab application.
# Please see doc/development/sidekiq_style_guide.md#Jobs-with-External-Dependencies for more
# details.
it 'has (exclusively) external dependencies or is latency-sentitive, not both', :aggregate_failures do
latency_sensitive_workers = Gitlab::SidekiqConfig.workers
.select(&:latency_sensitive_worker?)
latency_sensitive_workers.each do |worker|
expect(worker.worker_has_external_dependencies?).to be_falsey, "#{worker.inspect} cannot have both external dependencies and be latency sensitive"
end
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment