Commit 0955e2b4 authored by Nick Thomas's avatar Nick Thomas

Keep the project (repository) index status up to date

Prior to this commit, the `index_statuses` table was allowed to go stale after
initial indexing. This made it impossible to provide status details, and also
made treating the initial index as a 'backfill' job harder.

Now, if a repository is indexed via `git push`, the initial indexing will
recognise that it has been indexed, and not re-index it.

We introduce an asynchronous initial indexer and an index status rake task at
the same time.
parent b485cd2b
...@@ -2,50 +2,37 @@ class ElasticBatchProjectIndexerWorker ...@@ -2,50 +2,37 @@ class ElasticBatchProjectIndexerWorker
include Sidekiq::Worker include Sidekiq::Worker
include Gitlab::CurrentSettings include Gitlab::CurrentSettings
sidekiq_options queue: :elasticsearch, retry: 2 # Batch indexing is generally a one-time option, so give finer control over
# queuing and concurrency
include DedicatedSidekiqQueue
# This worker is long-running, but idempotent, so retry many times if
# necessary
sidekiq_options retry: 10
def perform(start, finish, update_index = false) def perform(start, finish, update_index = false)
projects = build_relation(start, finish, update_index) projects = build_relation(start, finish, update_index)
indexer = Gitlab::Elastic::Indexer.new
projects.find_each do |project|
repository = project.repository
next unless repository.exists? && !repository.empty?
begin
logger.info "Indexing #{project.name_with_namespace} (ID=#{project.id})..."
index_status = project.index_status || project.build_index_status
head_commit = repository.commit
if !head_commit || index_status.last_commit == head_commit.sha projects.find_each { |project| run_indexer(project) }
logger.info("Skipped".color(:yellow))
next
end end
indexer.run( private
project.id,
repository.path_to_repo,
index_status.last_commit
)
# During indexing the new commits can be pushed, def run_indexer(project)
# the last_commit parameter only indicates that at least this commit is in index logger.info "Indexing #{project.name_with_namespace} (ID=#{project.id})..."
index_status.last_commit = head_commit.sha
index_status.indexed_at = DateTime.now last_commit = project.index_status.try(:last_commit)
index_status.save Gitlab::Elastic::Indexer.new(project).run(last_commit)
logger.info("Done!".color(:green)) logger.info "Indexing #{project.name_with_namespace} (ID=#{project.id}) is done!"
rescue => err rescue => err
logger.warn("#{err.message}, trace - #{err.backtrace}") logger.warn("#{err.message} indexing #{project.name_with_namespace} (ID=#{project.id}), trace - #{err.backtrace}")
end
end
end end
def build_relation(start, finish, update_index) def build_relation(start, finish, update_index)
relation = Project.includes(:index_status) relation = Project.includes(:index_status)
if update_index unless update_index
relation = relation.where('index_statuses.id IS NULL').references(:index_statuses) relation = relation.where('index_statuses.id IS NULL').references(:index_statuses)
end end
......
...@@ -8,16 +8,7 @@ class ElasticCommitIndexerWorker ...@@ -8,16 +8,7 @@ class ElasticCommitIndexerWorker
return true unless current_application_settings.elasticsearch_indexing? return true unless current_application_settings.elasticsearch_indexing?
project = Project.find(project_id) project = Project.find(project_id)
repository = project.repository
return true unless repository.exists? Gitlab::Elastic::Indexer.new(project).run(oldrev, newrev)
indexer = Gitlab::Elastic::Indexer.new
indexer.run(
project_id,
repository.path_to_repo,
oldrev,
newrev
)
end end
end end
...@@ -22,11 +22,10 @@ searching in: ...@@ -22,11 +22,10 @@ searching in:
- Snippets - Snippets
- Wiki - Wiki
Once the data is added to the database or repository and [Elasticsearch is enabled in the admin area](#enable-elasticsearch) the search index will be updated Once the data is added to the database or repository and [Elasticsearch is
automatically. enabled in the admin area](#enable-elasticsearch) the search index will be
Elasticsearch can be installed on the same machine that GitLab updated automatically. Elasticsearch can be installed on the same machine as
is installed or on a separate server. GitLab, or on a separate server.
## Requirements ## Requirements
...@@ -77,6 +76,37 @@ bundle exec rake gitlab:elastic:create_empty_index RAILS_ENV=production ...@@ -77,6 +76,37 @@ bundle exec rake gitlab:elastic:create_empty_index RAILS_ENV=production
Then enable Elasticsearch indexing and run repository indexing tasks: Then enable Elasticsearch indexing and run repository indexing tasks:
```
# Omnibus installations
sudo gitlab-rake gitlab:elastic:index_repositories_async
# Installations from source
bundle exec rake gitlab:elastic:index_repositories_async RAILS_ENV=production
```
This enqueues a number of Sidekiq jobs to index your existing repositories.
You can view the jobs in the admin panel (they are placed in the `elastic_batch_project_indexer`
queue), or you can query indexing status using a rake task:
```
# Omnibus installations
sudo gitlab-rake gitlab:elastic:index_repositories_status
# Installations from source
bundle exec rake gitlab:elastic:index_repositories_status RAILS_ENV=production
Indexing is 65.55% complete (6555/10000 projects)
```
By default, one job is created for every 300 projects. For large numbers of
projects, you may wish to increase the batch size, by setting the `BATCH`
environment variable. You may also wish to consider [throttling](../administration/operations/sidekiq_job_throttling.md)
the `elastic_batch_project_indexer` queue, as this step can be I/O-intensive.
You can also run the initial indexing synchronously - this is most useful if
you have a small number of projects, or need finer-grained control over indexing
than Sidekiq permits:
``` ```
# Omnibus installations # Omnibus installations
sudo gitlab-rake gitlab:elastic:index_repositories sudo gitlab-rake gitlab:elastic:index_repositories
...@@ -103,12 +133,24 @@ ID_FROM=1001 ID_TO=2000 sudo gitlab-rake gitlab:elastic:index_repositories ...@@ -103,12 +133,24 @@ ID_FROM=1001 ID_TO=2000 sudo gitlab-rake gitlab:elastic:index_repositories
ID_FROM=2001 sudo gitlab-rake gitlab:elastic:index_repositories ID_FROM=2001 sudo gitlab-rake gitlab:elastic:index_repositories
``` ```
Sometimes your repository index process `gitlab:elastic:index_repositories` get interupted due to various reasons, in this case you can safely run it again and it will skip those repositories that already have been indexed. As the indexer stores the last commit SHA of every indexed repository in the database you can run the indexer with the special parameter `UPDATE_INDEX` and it will check every project repository again to make sure that every commit in that repository is indexed, it can be useful in case if your index is outdated: Sometimes your repository index process `gitlab:elastic:index_repositories` or
`gitlab:elastic:index_repositories_async` can get interrupted. This may happen
for many reasons, but it's always safe to run the indexing job again - it will
skip those repositories that have already been indexed.
As the indexer stores the last commit SHA of every indexed repository in the
database, you can run the indexer with the special parameter `UPDATE_INDEX` and
it will check every project repository again to make sure that every commit in
that repository is indexed. This can be useful if your index is outdated:
``` ```
UPDATE_INDEX=true ID_TO=1000 sudo gitlab-rake gitlab:elastic:index_repositories UPDATE_INDEX=true ID_TO=1000 sudo gitlab-rake gitlab:elastic:index_repositories
``` ```
You can also use the `gitlab:elastic:clear_index_status` Rake task to force the
indexer to "forget" all progress, so that the indexing process is retried from
the start.
To index all wikis: To index all wikis:
``` ```
......
...@@ -8,7 +8,11 @@ module Gitlab ...@@ -8,7 +8,11 @@ module Gitlab
Error = Class.new(StandardError) Error = Class.new(StandardError)
def initialize attr_reader :project
def initialize(project)
@project = project
connection_info = { connection_info = {
host: current_application_settings.elasticsearch_host, host: current_application_settings.elasticsearch_host,
port: current_application_settings.elasticsearch_port port: current_application_settings.elasticsearch_port
...@@ -22,20 +26,62 @@ module Gitlab ...@@ -22,20 +26,62 @@ module Gitlab
} }
end end
def run(project_id, repo_path, from_sha = nil, to_sha = nil) def run(from_sha = nil, to_sha = nil)
to_sha = nil if to_sha == Gitlab::Git::BLANK_SHA to_sha = nil if to_sha == Gitlab::Git::BLANK_SHA
vars = @vars.merge({ 'FROM_SHA' => from_sha, 'TO_SHA' => to_sha }) head_commit = repository.try(:commit)
if repository.nil? || !repository.exists? || repository.empty? || head_commit.nil?
update_index_status(Gitlab::Git::BLANK_SHA)
return
end
run_indexer!(from_sha, to_sha)
update_index_status(to_sha)
true
end
private
path_to_indexer = File.join(Rails.root, 'bin/elastic_repo_indexer') def repository
project.repository
end
# Path of the `bin/elastic_repo_indexer` script, resolved under `Rails.root`.
def path_to_indexer
  File.join(Rails.root, 'bin', 'elastic_repo_indexer')
end
command = [path_to_indexer, project_id.to_s, repo_path] def run_indexer!(from_sha, to_sha)
command = [path_to_indexer, project.id.to_s, repository.path_to_repo]
vars = @vars.merge('FROM_SHA' => from_sha, 'TO_SHA' => to_sha)
output, status = Gitlab::Popen.popen(command, nil, vars) output, status = Gitlab::Popen.popen(command, nil, vars)
raise Error, output unless status.zero? raise Error, output unless status.zero?
end
true def update_index_status(to_sha)
head_commit = repository.try(:commit)
# Use the eager-loaded association if available. An index_status should
# always be created, even if the repository is empty, so we know it's
# been looked at.
index_status = project.index_status
index_status ||=
begin
IndexStatus.find_or_create_by(project_id: project.id)
rescue ActiveRecord::RecordNotUnique
retry
end
# Don't update the index status if we never reached HEAD
return if head_commit && to_sha && head_commit.sha != to_sha
sha = head_commit.try(:sha)
sha ||= Gitlab::Git::BLANK_SHA
index_status.update_attributes(last_commit: sha, indexed_at: Time.now)
project.index_status(true)
end end
end end
end end
......
...@@ -21,13 +21,21 @@ namespace :gitlab do ...@@ -21,13 +21,21 @@ namespace :gitlab do
puts "OK" puts "OK"
end end
desc "GitLab | ElasticSearch | Check project repository indexing status"
task index_repositories_status: :environment do
  # Compare the number of IndexStatus records against the number of Project
  # records to report how far the initial indexing has progressed.
  indexed = IndexStatus.count
  projects = Project.count

  # With no projects at all there is nothing left to index, so report the run
  # as complete instead of dividing by zero (which would print "NaN%").
  percent = projects.zero? ? 100.0 : (indexed / projects.to_f) * 100.0

  puts format("Indexing is %.2f%% complete (%d/%d projects)", percent, indexed, projects)
end
desc "GitLab | Elasticsearch | Index project repositories" desc "GitLab | Elasticsearch | Index project repositories"
task index_repositories: :environment do task index_repositories: :environment do
print "Indexing project repositories..." print "Indexing project repositories..."
Sidekiq::Logging.logger = Logger.new(STDOUT) Sidekiq::Logging.logger = Logger.new(STDOUT)
project_id_batches do |start, finish| project_id_batches do |start, finish|
puts [start, finish].inspect
ElasticBatchProjectIndexerWorker.new.perform(start, finish, ENV['UPDATE_INDEX']) ElasticBatchProjectIndexerWorker.new.perform(start, finish, ENV['UPDATE_INDEX'])
end end
end end
...@@ -99,7 +107,7 @@ namespace :gitlab do ...@@ -99,7 +107,7 @@ namespace :gitlab do
def project_id_batches(&blk) def project_id_batches(&blk)
relation = Project relation = Project
if ENV['UPDATE_INDEX'] unless ENV['UPDATE_INDEX']
relation = relation.includes(:index_status).where('index_statuses.id IS NULL').references(:index_statuses) relation = relation.includes(:index_status).where('index_statuses.id IS NULL').references(:index_statuses)
end end
......
require 'spec_helper' require 'spec_helper'
describe "Indexer" do describe Gitlab::Elastic::Indexer do
include StubENV include StubENV
before do before do
...@@ -8,21 +8,100 @@ describe "Indexer" do ...@@ -8,21 +8,100 @@ describe "Indexer" do
stub_application_setting(es_host: ['elastic-host1', 'elastic-host2']) stub_application_setting(es_host: ['elastic-host1', 'elastic-host2'])
end end
it "runs commands" do let(:project) { create(:project) }
expect(Gitlab::Popen).to receive(:popen).with( let(:from_sha) { Gitlab::Git::BLANK_SHA }
[File.join(Rails.root, 'bin/elastic_repo_indexer'), '1', 'full_repo_path'], let(:to_sha) { project.commit.try(:sha) }
let(:indexer) { described_class.new(project) }
let(:popen_success) { [[''], 0] }
let(:popen_failure) { [['error'], 1] }
let(:elastic_connection_info) do
{
host: current_application_settings.elasticsearch_host,
port: current_application_settings.elasticsearch_port,
}
end
context 'empty project' do
let(:project) { create(:empty_project) }
it 'updates the index status without running the indexing command' do
expect_popen.never
indexer.run
expect_index_status(Gitlab::Git::BLANK_SHA)
end
end
context 'repository has unborn head' do
it 'updates the index status without running the indexing command' do
allow(project.repository).to receive(:exists?).and_return(false)
expect_popen.never
indexer.run
expect_index_status(Gitlab::Git::BLANK_SHA)
end
end
context 'test project' do
let(:project) { create(:project) }
it 'runs the indexing command' do
expect_popen.with(
[
File.join(Rails.root, 'bin/elastic_repo_indexer'),
project.id.to_s,
project.repository.path_to_repo
],
nil, nil,
hash_including( hash_including(
'ELASTIC_CONNECTION_INFO' => { 'ELASTIC_CONNECTION_INFO' => elastic_connection_info.to_json,
host: current_application_settings.elasticsearch_host,
port: current_application_settings.elasticsearch_port
}.to_json,
'RAILS_ENV' => Rails.env, 'RAILS_ENV' => Rails.env,
'FROM_SHA' => '000000', 'FROM_SHA' => from_sha,
'TO_SHA' => '1d1f2d' 'TO_SHA' => to_sha
) )
).and_return([[''], 0]) ).and_return(popen_success)
indexer.run(from_sha, to_sha)
end
it 'updates the index status when the indexing is a success' do
expect_popen.and_return(popen_success)
indexer.run(from_sha, to_sha)
expect_index_status(to_sha)
end
it 'leaves the index status untouched when indexing a non-HEAD commit' do
  expect_popen.and_return(popen_success)

  # Pass the SHA string rather than the commit object: the indexer compares
  # `to_sha` against `head_commit.sha` and exports it as the TO_SHA variable,
  # so a commit object would never equal HEAD and the check would be vacuous.
  indexer.run(from_sha, project.repository.commit('HEAD~1').sha)

  expect(project.index_status).to be_nil
end
it 'leaves the index status untouched when the indexing fails' do
expect_popen.and_return(popen_failure)
expect { indexer.run }.to raise_error(Gitlab::Elastic::Indexer::Error)
expect(project.index_status).to be_nil
end
end
def expect_popen(*with)
expect(Gitlab::Popen).to receive(:popen)
end
def expect_index_status(sha)
status = project.index_status
Gitlab::Elastic::Indexer.new.run(1, 'full_repo_path', '000000', '1d1f2d') expect(status).not_to be_nil
expect(status.indexed_at).not_to be_nil
expect(status.last_commit).to eq(sha)
end end
end end
require 'spec_helper'
describe ElasticBatchProjectIndexerWorker do
  subject(:worker) { described_class.new }

  # Two repository-less projects; their IDs delimit the batch ranges below.
  let(:projects) { create_list(:empty_project, 2) }
  let(:first_project) { projects.first }
  let(:last_project) { projects.last }

  describe '#perform' do
    it 'runs the indexer for projects in the batch range' do
      projects.each do |project|
        expect_index(project)
      end

      worker.perform(first_project.id, last_project.id)
    end

    it 'skips projects not in the batch range' do
      expect_index(first_project).never
      expect_index(last_project)

      # The range only covers the last project.
      worker.perform(last_project.id, last_project.id)
    end

    context 'update_index = false' do
      it 'skips projects that were already indexed' do
        # An existing index status marks the project as already indexed.
        first_project.create_index_status!

        expect_index(first_project).never

        worker.perform(first_project.id, first_project.id)
      end
    end

    context 'with update_index' do
      it 'reindexes projects that were already indexed' do
        first_project.create_index_status!

        expect_index(first_project)
        expect_index(last_project)

        worker.perform(first_project.id, last_project.id, true)
      end

      it 'starts indexing at the last indexed commit' do
        first_project.create_index_status!(last_commit: 'foo')

        # Let the worker's own run_indexer execute so the underlying indexer
        # call can be checked for the stored commit.
        expect_index(first_project).and_call_original
        expect_any_instance_of(Gitlab::Elastic::Indexer).to receive(:run).with('foo')

        worker.perform(first_project.id, first_project.id, true)
      end
    end
  end

  # Sets a message expectation that the worker runs its indexer for +project+.
  # Returns the expectation so callers can chain `.never` or `.and_call_original`.
  def expect_index(project)
    expect(worker).to receive(:run_indexer).with(project)
  end
end
...@@ -15,23 +15,6 @@ describe ElasticCommitIndexerWorker do ...@@ -15,23 +15,6 @@ describe ElasticCommitIndexerWorker do
subject.perform(project.id, '0000', '0000') subject.perform(project.id, '0000', '0000')
end end
it 'does not run indexer when project is empty' do
empty_project = create :empty_project
expect_any_instance_of(Gitlab::Elastic::Indexer).not_to receive(:run)
subject.perform(empty_project.id, '0000', '0000')
end
it 'returns true if repository has unborn head' do
project = create :project
repository = double('repository')
expect(repository).to receive(:exists?).and_return(false)
expect_any_instance_of(Project).to receive(:repository).and_return(repository)
expect(subject.perform(project.id)).to be_truthy
end
it 'returns true if ES disabled' do it 'returns true if ES disabled' do
stub_application_setting(elasticsearch_indexing: false) stub_application_setting(elasticsearch_indexing: false)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment