Commit cd3ae49b authored by Valery Sizov's avatar Valery Sizov

Merge branch '1618-async-elasticsearch-indexing' into 'master'

Elasticsearch: allow initial indexing to proceed within Sidekiq

Closes #1618

See merge request !1144
parents 49f7a449 0955e2b4
class ElasticBatchProjectIndexerWorker
  include Sidekiq::Worker
  include Gitlab::CurrentSettings

  # Batch indexing is generally a one-time operation, so a dedicated queue
  # gives finer control over queuing and concurrency.
  include DedicatedSidekiqQueue

  # This worker is long-running but idempotent, so retrying many times is
  # safe if necessary.
  sidekiq_options retry: 10

  # Indexes every project whose ID lies within [start, finish]. When
  # update_index is falsey, projects that already have an index status
  # are skipped.
  def perform(start, finish, update_index = false)
    build_relation(start, finish, update_index).find_each do |project|
      run_indexer(project)
    end
  end

  private

  # Runs the Elasticsearch indexer for a single project, resuming from the
  # last recorded commit when an index status exists. Errors are logged and
  # swallowed so one failing project does not abort the whole batch.
  def run_indexer(project)
    logger.info "Indexing #{project.name_with_namespace} (ID=#{project.id})..."

    last_commit = project.index_status.try(:last_commit)
    Gitlab::Elastic::Indexer.new(project).run(last_commit)

    logger.info "Indexing #{project.name_with_namespace} (ID=#{project.id}) is done!"
  rescue => err
    logger.warn("#{err.message} indexing #{project.name_with_namespace} (ID=#{project.id}), trace - #{err.backtrace}")
  end

  # Builds the relation of projects to index, eager-loading index statuses
  # and constraining to the requested ID range.
  def build_relation(start, finish, update_index)
    scope = Project.includes(:index_status)

    unless update_index
      scope = scope.where('index_statuses.id IS NULL').references(:index_statuses)
    end

    table = Project.arel_table
    scope = scope.where(table[:id].gteq(start)) if start
    scope = scope.where(table[:id].lteq(finish)) if finish

    scope
  end
end
......@@ -8,16 +8,7 @@ class ElasticCommitIndexerWorker
return true unless current_application_settings.elasticsearch_indexing?
project = Project.find(project_id)
repository = project.repository
return true unless repository.exists?
indexer = Gitlab::Elastic::Indexer.new
indexer.run(
project_id,
repository.path_to_repo,
oldrev,
newrev
)
Gitlab::Elastic::Indexer.new(project).run(oldrev, newrev)
end
end
......@@ -58,3 +58,4 @@
- [project_update_repository_storage, 1]
- [admin_emails, 1]
- [geo_repository_update, 1]
- [elastic_batch_project_indexer, 1]
......@@ -22,11 +22,10 @@ searching in:
- Snippets
- Wiki
Once the data is added to the database or repository and [Elasticsearch is enabled in the admin area](#enable-elasticsearch) the search index will be updated
automatically.
Elasticsearch can be installed on the same machine that GitLab
is installed or on a separate server.
Once the data is added to the database or repository and [Elasticsearch is
enabled in the admin area](#enable-elasticsearch) the search index will be
updated automatically. Elasticsearch can be installed on the same machine as
GitLab, or on a separate server.
## Requirements
......@@ -77,6 +76,37 @@ bundle exec rake gitlab:elastic:create_empty_index RAILS_ENV=production
Then enable Elasticsearch indexing and run repository indexing tasks:
```
# Omnibus installations
sudo gitlab-rake gitlab:elastic:index_repositories_async
# Installations from source
bundle exec rake gitlab:elastic:index_repositories_async RAILS_ENV=production
```
This enqueues a number of Sidekiq jobs to index your existing repositories.
You can view the jobs in the admin panel (they are placed in the
`elastic_batch_project_indexer` queue), or you can query indexing status using a
rake task:
```
# Omnibus installations
sudo gitlab-rake gitlab:elastic:index_repositories_status
# Installations from source
bundle exec rake gitlab:elastic:index_repositories_status RAILS_ENV=production
Indexing is 65.55% complete (6555/10000 projects)
```
By default, one job is created for every 300 projects. For large numbers of
projects, you may wish to increase the batch size, by setting the `BATCH`
environment variable. You may also wish to consider [throttling](../administration/operations/sidekiq_job_throttling.md)
the `elastic_batch_project_indexer` queue, as this step can be I/O-intensive.
You can also run the initial indexing synchronously - this is most useful if
you have a small number of projects, or need finer-grained control over indexing
than Sidekiq permits:
```
# Omnibus installations
sudo gitlab-rake gitlab:elastic:index_repositories
......@@ -103,12 +133,24 @@ ID_FROM=1001 ID_TO=2000 sudo gitlab-rake gitlab:elastic:index_repositories
ID_FROM=2001 sudo gitlab-rake gitlab:elastic:index_repositories
```
Sometimes your repository index process `gitlab:elastic:index_repositories` gets interrupted for various reasons; in this case, you can safely run it again and it will skip those repositories that have already been indexed. As the indexer stores the last commit SHA of every indexed repository in the database, you can run the indexer with the special parameter `UPDATE_INDEX` and it will check every project repository again to make sure that every commit in that repository is indexed, which can be useful if your index is outdated:
Sometimes your repository index process `gitlab:elastic:index_repositories` or
`gitlab:elastic:index_repositories_async` can get interrupted. This may happen
for many reasons, but it's always safe to run the indexing job again - it will
skip those repositories that have already been indexed.
As the indexer stores the last commit SHA of every indexed repository in the
database, you can run the indexer with the special parameter `UPDATE_INDEX` and
it will check every project repository again to make sure that every commit in
that repository is indexed, which can be useful if your index is outdated:
```
UPDATE_INDEX=true ID_TO=1000 sudo gitlab-rake gitlab:elastic:index_repositories
```
You can also use the `gitlab:elastic:clear_index_status` Rake task to force the
indexer to "forget" all progress, restarting the indexing process from the
beginning.
To index all wikis:
```
......
......@@ -8,7 +8,11 @@ module Gitlab
Error = Class.new(StandardError)
def initialize
attr_reader :project
def initialize(project)
@project = project
connection_info = {
host: current_application_settings.elasticsearch_host,
port: current_application_settings.elasticsearch_port
......@@ -22,20 +26,62 @@ module Gitlab
}
end
def run(project_id, repo_path, from_sha = nil, to_sha = nil)
def run(from_sha = nil, to_sha = nil)
to_sha = nil if to_sha == Gitlab::Git::BLANK_SHA
vars = @vars.merge({ 'FROM_SHA' => from_sha, 'TO_SHA' => to_sha })
head_commit = repository.try(:commit)
if repository.nil? || !repository.exists? || repository.empty? || head_commit.nil?
update_index_status(Gitlab::Git::BLANK_SHA)
return
end
run_indexer!(from_sha, to_sha)
update_index_status(to_sha)
true
end
private
path_to_indexer = File.join(Rails.root, 'bin/elastic_repo_indexer')
def repository
project.repository
end
def path_to_indexer
File.join(Rails.root, 'bin/elastic_repo_indexer')
end
command = [path_to_indexer, project_id.to_s, repo_path]
def run_indexer!(from_sha, to_sha)
command = [path_to_indexer, project.id.to_s, repository.path_to_repo]
vars = @vars.merge('FROM_SHA' => from_sha, 'TO_SHA' => to_sha)
output, status = Gitlab::Popen.popen(command, nil, vars)
raise Error, output unless status.zero?
end
true
def update_index_status(to_sha)
head_commit = repository.try(:commit)
# Use the eager-loaded association if available. An index_status should
# always be created, even if the repository is empty, so we know it's
# been looked at.
index_status = project.index_status
index_status ||=
begin
IndexStatus.find_or_create_by(project_id: project.id)
rescue ActiveRecord::RecordNotUnique
retry
end
# Don't update the index status if we never reached HEAD
return if head_commit && to_sha && head_commit.sha != to_sha
sha = head_commit.try(:sha)
sha ||= Gitlab::Git::BLANK_SHA
index_status.update_attributes(last_commit: sha, indexed_at: Time.now)
project.index_status(true)
end
end
end
......
......@@ -9,50 +9,34 @@ namespace :gitlab do
Rake::Task["gitlab:elastic:index_database"].invoke
end
desc "GitLab | Elasticsearch | Index project repositories"
task index_repositories: :environment do
projects = if ENV['UPDATE_INDEX']
Project
else
Project.includes(:index_status).
where("index_statuses.id IS NULL").
references(:index_statuses)
end
desc "GitLab | Elasticsearch | Index project repositories in the background"
task index_repositories_async: :environment do
print "Enqueuing project repositories in batches of #{batch_size}"
projects = apply_project_filters(projects)
project_id_batches do |start, finish|
ElasticBatchProjectIndexerWorker.perform_async(start, finish, ENV['UPDATE_INDEX'])
print "."
end
indexer = Gitlab::Elastic::Indexer.new
puts "OK"
end
projects.find_each(batch_size: 300) do |project|
repository = project.repository
desc "GitLab | ElasticSearch | Check project repository indexing status"
task index_repositories_status: :environment do
indexed = IndexStatus.count
projects = Project.count
percent = (indexed / projects.to_f) * 100.0
if repository.exists? && !repository.empty?
puts "Indexing #{project.name_with_namespace} (ID=#{project.id})..."
puts "Indexing is %.2f%% complete (%d/%d projects)" % [percent, indexed, projects]
end
index_status = IndexStatus.find_or_create_by(project: project)
desc "GitLab | Elasticsearch | Index project repositories"
task index_repositories: :environment do
print "Indexing project repositories..."
begin
head_commit = repository.commit
if !head_commit || index_status.last_commit == head_commit.sha
puts "Skipped".color(:yellow)
next
end
indexer.run(
project.id,
repository.path_to_repo,
index_status.last_commit
)
# During indexing the new commits can be pushed,
# the last_commit parameter only indicates that at least this commit is in index
index_status.update(last_commit: head_commit.sha, indexed_at: DateTime.now)
puts "Done!".color(:green)
rescue StandardError => e
puts "#{e.message}, trace - #{e.backtrace}"
end
end
Sidekiq::Logging.logger = Logger.new(STDOUT)
project_id_batches do |start, finish|
ElasticBatchProjectIndexerWorker.new.perform(start, finish, ENV['UPDATE_INDEX'])
end
end
......@@ -116,6 +100,23 @@ namespace :gitlab do
puts "Index recreated".color(:green)
end
def batch_size
ENV.fetch('BATCH', 300).to_i
end
def project_id_batches(&blk)
relation = Project
unless ENV['UPDATE_INDEX']
relation = relation.includes(:index_status).where('index_statuses.id IS NULL').references(:index_statuses)
end
relation.all.in_batches(of: batch_size, start: ENV['ID_FROM'], finish: ENV['ID_TO']) do |relation|
ids = relation.reorder(:id).pluck(:id)
yield ids[0], ids[-1]
end
end
def apply_project_filters(projects)
if ENV['ID_FROM']
projects = projects.where("projects.id >= ?", ENV['ID_FROM'])
......
require 'spec_helper'
describe "Indexer" do
describe Gitlab::Elastic::Indexer do
include StubENV
before do
......@@ -8,21 +8,100 @@ describe "Indexer" do
stub_application_setting(es_host: ['elastic-host1', 'elastic-host2'])
end
it "runs commands" do
expect(Gitlab::Popen).to receive(:popen).with(
[File.join(Rails.root, 'bin/elastic_repo_indexer'), '1', 'full_repo_path'],
nil,
hash_including(
'ELASTIC_CONNECTION_INFO' => {
host: current_application_settings.elasticsearch_host,
port: current_application_settings.elasticsearch_port
}.to_json,
'RAILS_ENV' => Rails.env,
'FROM_SHA' => '000000',
'TO_SHA' => '1d1f2d'
)
).and_return([[''], 0])
Gitlab::Elastic::Indexer.new.run(1, 'full_repo_path', '000000', '1d1f2d')
let(:project) { create(:project) }
let(:from_sha) { Gitlab::Git::BLANK_SHA }
let(:to_sha) { project.commit.try(:sha) }
let(:indexer) { described_class.new(project) }
let(:popen_success) { [[''], 0] }
let(:popen_failure) { [['error'], 1] }
let(:elastic_connection_info) do
{
host: current_application_settings.elasticsearch_host,
port: current_application_settings.elasticsearch_port,
}
end
context 'empty project' do
let(:project) { create(:empty_project) }
it 'updates the index status without running the indexing command' do
expect_popen.never
indexer.run
expect_index_status(Gitlab::Git::BLANK_SHA)
end
end
context 'repository has unborn head' do
it 'updates the index status without running the indexing command' do
allow(project.repository).to receive(:exists?).and_return(false)
expect_popen.never
indexer.run
expect_index_status(Gitlab::Git::BLANK_SHA)
end
end
context 'test project' do
let(:project) { create(:project) }
it 'runs the indexing command' do
expect_popen.with(
[
File.join(Rails.root, 'bin/elastic_repo_indexer'),
project.id.to_s,
project.repository.path_to_repo
],
nil,
hash_including(
'ELASTIC_CONNECTION_INFO' => elastic_connection_info.to_json,
'RAILS_ENV' => Rails.env,
'FROM_SHA' => from_sha,
'TO_SHA' => to_sha
)
).and_return(popen_success)
indexer.run(from_sha, to_sha)
end
it 'updates the index status when the indexing is a success' do
expect_popen.and_return(popen_success)
indexer.run(from_sha, to_sha)
expect_index_status(to_sha)
end
it 'leaves the index status untouched when indexing a non-HEAD commit' do
expect_popen.and_return(popen_success)
indexer.run(from_sha, project.repository.commit('HEAD~1'))
expect(project.index_status).to be_nil
end
it 'leaves the index status untouched when the indexing fails' do
expect_popen.and_return(popen_failure)
expect { indexer.run }.to raise_error(Gitlab::Elastic::Indexer::Error)
expect(project.index_status).to be_nil
end
end
def expect_popen(*with)
expect(Gitlab::Popen).to receive(:popen)
end
def expect_index_status(sha)
status = project.index_status
expect(status).not_to be_nil
expect(status.indexed_at).not_to be_nil
expect(status.last_commit).to eq(sha)
end
end
require 'spec_helper'

# Specs for ElasticBatchProjectIndexerWorker: the Sidekiq worker that indexes
# all projects whose IDs fall within a [start, finish] range.
describe ElasticBatchProjectIndexerWorker do
  subject(:worker) { described_class.new }

  # Two persisted (empty) projects give a concrete, two-element ID range.
  let(:projects) { create_list(:empty_project, 2) }

  describe '#perform' do
    it 'runs the indexer for projects in the batch range' do
      projects.each {|project| expect_index(project) }

      worker.perform(projects.first.id, projects.last.id)
    end

    it 'skips projects not in the batch range' do
      expect_index(projects.first).never
      expect_index(projects.last)

      worker.perform(projects.last.id, projects.last.id)
    end

    context 'update_index = false' do
      it 'skips projects that were already indexed' do
        # An existing IndexStatus marks the project as already indexed.
        projects.first.create_index_status!

        expect_index(projects.first).never

        worker.perform(projects.first.id, projects.first.id)
      end
    end

    context 'with update_index' do
      it 'reindexes projects that were already indexed' do
        projects.first.create_index_status!

        expect_index(projects.first)
        expect_index(projects.last)

        worker.perform(projects.first.id, projects.last.id, true)
      end

      it 'starts indexing at the last indexed commit' do
        # The last_commit recorded in IndexStatus should be forwarded to the
        # indexer as the starting SHA.
        projects.first.create_index_status!(last_commit: 'foo')

        expect_index(projects.first).and_call_original
        expect_any_instance_of(Gitlab::Elastic::Indexer).to receive(:run).with('foo')

        worker.perform(projects.first.id, projects.first.id, true)
      end
    end
  end

  # Sets a message expectation that the worker indexes `project`; callers may
  # chain `.never` / `.and_call_original` onto the returned expectation.
  def expect_index(project)
    expect(worker).to receive(:run_indexer).with(project)
  end
end
......@@ -15,23 +15,6 @@ describe ElasticCommitIndexerWorker do
subject.perform(project.id, '0000', '0000')
end
it 'does not run indexer when project is empty' do
empty_project = create :empty_project
expect_any_instance_of(Gitlab::Elastic::Indexer).not_to receive(:run)
subject.perform(empty_project.id, '0000', '0000')
end
it 'returns true if repository has unborn head' do
project = create :project
repository = double('repository')
expect(repository).to receive(:exists?).and_return(false)
expect_any_instance_of(Project).to receive(:repository).and_return(repository)
expect(subject.perform(project.id)).to be_truthy
end
it 'returns true if ES disabled' do
stub_application_setting(elasticsearch_indexing: false)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment