# frozen_string_literal: true

# Create a separate process, which does not load the Rails environment, to index
# each repository. This prevents memory leaks in the indexer from affecting the
# rest of the application.
module Gitlab
  module Elastic
    class Indexer
      include Gitlab::Utils::StrongMemoize

      # Raised when the external indexer process exits with a non-zero status;
      # the message carries the process output.
      Error = Class.new(StandardError)

      class << self
        # Version of the bundled `gitlab-elasticsearch-indexer` binary, read
        # from the version file at the Rails root.
        def indexer_version
          Rails.root.join('GITLAB_ELASTICSEARCH_INDEXER_VERSION').read.chomp
        end
      end

      attr_reader :project, :index_status, :wiki
      alias_method :index_wiki?, :wiki

      # project - the Project whose repository will be indexed
      # wiki    - when true, index the wiki repository instead of the code repository
      def initialize(project, wiki: false)
        @project = project
        @wiki = wiki

        # Use the eager-loaded association if available.
        @index_status = project.index_status
      end

      # Runs the indexation process, which is the following:
      # - Purge the index for any unreachable commits;
      # - Run the `gitlab-elasticsearch-indexer`;
      # - Update the `index_status` for the associated project;
      #
      # ref - Git ref up to which the indexation will run (default: HEAD)
      def run(ref = 'HEAD')
        commit = find_indexable_commit(ref)

        # Empty/unresolvable repository: record a blank SHA so we still know
        # the project has been looked at.
        return update_index_status(Gitlab::Git::BLANK_SHA) unless commit

        repository.__elasticsearch__.elastic_writing_targets.each do |target|
          Sidekiq.logger.debug(message: "Indexation running for #{project.id} #{from_sha}..#{commit.sha}",
                               project_id: project.id,
                               wiki: index_wiki?)

          run_indexer!(commit.sha, target)
        end

        # update the index status only if all writes were successful
        update_index_status(commit.sha)

        true
      end

      # Returns the Commit at +ref+ when the repository is non-empty and the
      # ref resolves; otherwise a falsy value.
      def find_indexable_commit(ref)
        !repository.empty? && repository.commit(ref)
      end

      private

      # The repository being indexed: the wiki repository in wiki mode,
      # otherwise the code repository.
      def repository
        index_wiki? ? project.wiki.repository : project.repository
      end

      # Spawns the external indexer for the range base_sha..to_sha against the
      # given write target. Raises Error with the process output on failure.
      def run_indexer!(to_sha, target)
        # This might happen when default branch has been reset or rebased.
        base_sha = if purge_unreachable_commits_from_index!(to_sha, target)
                     Gitlab::Git::EMPTY_TREE_ID
                   else
                     from_sha
                   end

        vars = build_envvars(base_sha, to_sha, target)
        path_to_indexer = Gitlab.config.elasticsearch.indexer_path

        command =
          if index_wiki?
            [path_to_indexer, "--blob-type=wiki_blob", "--skip-commits", project.id.to_s, repository_path]
          else
            [path_to_indexer, project.id.to_s, repository_path]
          end

        output, status = Gitlab::Popen.popen(command, nil, vars)

        raise Error, output unless status&.zero?
      end

      # Remove all indexed data for commits and blobs for a project.
      #
      # @return: whether the index has been purged
      def purge_unreachable_commits_from_index!(to_sha, target)
        return false if last_commit_ancestor_of?(to_sha)

        target.delete_index_for_commits_and_blobs(wiki: index_wiki?)
        true
      rescue ::Elasticsearch::Transport::Transport::Errors::BadRequest => e
        Gitlab::ErrorTracking.track_exception(e, project_id: project.id)

        # The purge did not complete, so the caller must not assume an empty
        # index. Return false explicitly instead of leaking whatever
        # `track_exception` happens to return.
        false
      end

      # Builds the environment passed to the external indexer process.
      def build_envvars(from_sha, to_sha, target)
        # We accept any form of settings, including string and array
        # This is why JSON is needed
        vars = {
          'RAILS_ENV' => Rails.env,
          'ELASTIC_CONNECTION_INFO' => elasticsearch_config(target),
          'GITALY_CONNECTION_INFO' => gitaly_config,
          'FROM_SHA' => from_sha,
          'TO_SHA' => to_sha,
          'CORRELATION_ID' => Labkit::Correlation::CorrelationId.current_id,
          'SSL_CERT_FILE' => OpenSSL::X509::DEFAULT_CERT_FILE,
          'SSL_CERT_DIR' => OpenSSL::X509::DEFAULT_CERT_DIR
        }

        # Set AWS environment variables for IAM role authentication if present
        vars = build_aws_credentials_env(vars)

        # Users can override default SSL certificate path via SSL_CERT_FILE SSL_CERT_DIR
        vars.merge(ENV.slice('SSL_CERT_FILE', 'SSL_CERT_DIR'))
      end

      # Merges AWS credentials into +vars+ when IAM role based authentication
      # is configured; returns +vars+ unchanged otherwise.
      def build_aws_credentials_env(vars)
        # AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN need to be set as
        # environment variable in case of using IAM role based authentication in AWS
        # The credentials are buffered to prevent from hitting rate limit. They will be
        # refreshed when expired
        credentials = Gitlab::Elastic::Client.aws_credential_provider&.credentials
        return vars unless credentials&.set?

        vars.merge(
          'AWS_ACCESS_KEY_ID' => credentials.access_key_id,
          'AWS_SECRET_ACCESS_KEY' => credentials.secret_access_key,
          'AWS_SESSION_TOKEN' => credentials.session_token
        )
      end

      # SHA of the last successfully indexed commit for the current mode
      # (wiki or code), or nil when nothing has been indexed yet.
      def last_commit
        index_wiki? ? index_status&.last_wiki_commit : index_status&.last_commit
      end

      # Base SHA for incremental indexing: the last indexed commit if it is
      # still reachable, otherwise the empty tree (full reindex).
      def from_sha
        strong_memoize(:from_sha) do
          repository_contains_last_indexed_commit? ? last_commit : Gitlab::Git::EMPTY_TREE_ID
        end
      end

      def repository_contains_last_indexed_commit?
        strong_memoize(:repository_contains_last_indexed_commit) do
          last_commit.present? && repository.commit(last_commit).present?
        end
      end

      def last_commit_ancestor_of?(to_sha)
        return true if from_sha == Gitlab::Git::BLANK_SHA
        return false unless repository_contains_last_indexed_commit?

        # we always treat the `EMPTY_TREE_ID` as an ancestor to make sure
        # we don't try to purge an empty index
        from_sha == Gitlab::Git::EMPTY_TREE_ID || repository.ancestor?(from_sha, to_sha)
      end

      def repository_path
        "#{repository.disk_path}.git"
      end

      # Elasticsearch connection settings for the given write target, as JSON.
      def elasticsearch_config(target)
        Gitlab::CurrentSettings.elasticsearch_config.merge(
          index_name: target.index_name
        ).to_json
      end

      # Gitaly connection settings for the project's storage, as JSON.
      def gitaly_config
        {
          storage: project.repository_storage,
          limit_file_size: Gitlab::CurrentSettings.elasticsearch_indexed_file_size_limit_kb.kilobytes
        }.merge(Gitlab::GitalyClient.connection_data(project.repository_storage)).to_json
      end

      # rubocop: disable CodeReuse/ActiveRecord
      def update_index_status(to_sha)
        raise "Invalid sha #{to_sha}" unless to_sha.present?

        # An index_status should always be created,
        # even if the repository is empty, so we know it's been looked at.
        @index_status ||= IndexStatus.safe_find_or_create_by!(project_id: project.id)

        attributes =
          if index_wiki?
            { last_wiki_commit: to_sha, wiki_indexed_at: Time.now }
          else
            { last_commit: to_sha, indexed_at: Time.now }
          end

        @index_status.update!(attributes)

        project.reload_index_status
      end
      # rubocop: enable CodeReuse/ActiveRecord
    end
  end
end