Commit cd3ae49b authored by Valery Sizov's avatar Valery Sizov

Merge branch '1618-async-elasticsearch-indexing' into 'master'

Elasticsearch: allow initial indexing to proceed within Sidekiq

Closes #1618

See merge request !1144
parents 49f7a449 0955e2b4
class ElasticBatchProjectIndexerWorker
  include Sidekiq::Worker
  include Gitlab::CurrentSettings

  # Batch indexing is generally a one-time operation, so a dedicated queue
  # gives finer control over queuing and concurrency.
  include DedicatedSidekiqQueue

  # This worker is long-running but idempotent, so retrying many times is
  # safe if necessary.
  sidekiq_options retry: 10

  # Indexes every project whose ID lies within [start, finish]. When
  # update_index is falsey, projects that already have an index status
  # are skipped.
  def perform(start, finish, update_index = false)
    build_relation(start, finish, update_index).find_each do |project|
      run_indexer(project)
    end
  end

  private

  # Runs the Elasticsearch indexer for a single project, resuming from the
  # last recorded commit when an index status exists. Errors are logged and
  # swallowed so one failing project does not abort the whole batch.
  def run_indexer(project)
    logger.info "Indexing #{project.name_with_namespace} (ID=#{project.id})..."

    last_commit = project.index_status.try(:last_commit)
    Gitlab::Elastic::Indexer.new(project).run(last_commit)

    logger.info "Indexing #{project.name_with_namespace} (ID=#{project.id}) is done!"
  rescue => err
    logger.warn("#{err.message} indexing #{project.name_with_namespace} (ID=#{project.id}), trace - #{err.backtrace}")
  end

  # Builds the relation of projects to index, eager-loading index statuses
  # and constraining to the requested ID range.
  def build_relation(start, finish, update_index)
    scope = Project.includes(:index_status)

    unless update_index
      scope = scope.where('index_statuses.id IS NULL').references(:index_statuses)
    end

    table = Project.arel_table
    scope = scope.where(table[:id].gteq(start)) if start
    scope = scope.where(table[:id].lteq(finish)) if finish

    scope
  end
end
......@@ -8,16 +8,7 @@ class ElasticCommitIndexerWorker
return true unless current_application_settings.elasticsearch_indexing?
project = Project.find(project_id)
repository = project.repository
return true unless repository.exists?
indexer = Gitlab::Elastic::Indexer.new
indexer.run(
project_id,
repository.path_to_repo,
oldrev,
newrev
)
Gitlab::Elastic::Indexer.new(project).run(oldrev, newrev)
end
end
......@@ -58,3 +58,4 @@
- [project_update_repository_storage, 1]
- [admin_emails, 1]
- [geo_repository_update, 1]
- [elastic_batch_project_indexer, 1]
......@@ -22,11 +22,10 @@ searching in:
- Snippets
- Wiki
Once the data is added to the database or repository and [Elasticsearch is enabled in the admin area](#enable-elasticsearch) the search index will be updated
automatically.
Elasticsearch can be installed on the same machine that GitLab
is installed or on a separate server.
Once the data is added to the database or repository and [Elasticsearch is
enabled in the admin area](#enable-elasticsearch) the search index will be
updated automatically. Elasticsearch can be installed on the same machine as
GitLab, or on a separate server.
## Requirements
......@@ -77,6 +76,37 @@ bundle exec rake gitlab:elastic:create_empty_index RAILS_ENV=production
Then enable Elasticsearch indexing and run repository indexing tasks:
```
# Omnibus installations
sudo gitlab-rake gitlab:elastic:index_repositories_async
# Installations from source
bundle exec rake gitlab:elastic:index_repositories_async RAILS_ENV=production
```
This enqueues a number of Sidekiq jobs to index your existing repositories.
You can view the jobs in the admin panel (they are placed in the
`elastic_batch_project_indexer` queue), or you can query indexing status using a
rake task:
```
# Omnibus installations
sudo gitlab-rake gitlab:elastic:index_repositories_status
# Installations from source
bundle exec rake gitlab:elastic:index_repositories_status RAILS_ENV=production
Indexing is 65.55% complete (6555/10000 projects)
```
By default, one job is created for every 300 projects. For large numbers of
projects, you may wish to increase the batch size, by setting the `BATCH`
environment variable. You may also wish to consider [throttling](../administration/operations/sidekiq_job_throttling.md)
the `elastic_batch_project_indexer` queue, as this step can be I/O-intensive.
You can also run the initial indexing synchronously - this is most useful if
you have a small number of projects, or need finer-grained control over indexing
than Sidekiq permits:
```
# Omnibus installations
sudo gitlab-rake gitlab:elastic:index_repositories
......@@ -103,12 +133,24 @@ ID_FROM=1001 ID_TO=2000 sudo gitlab-rake gitlab:elastic:index_repositories
ID_FROM=2001 sudo gitlab-rake gitlab:elastic:index_repositories
```
Sometimes your repository index process `gitlab:elastic:index_repositories` gets interrupted for various reasons; in this case, you can safely run it again and it will skip those repositories that have already been indexed. As the indexer stores the last commit SHA of every indexed repository in the database, you can run the indexer with the special parameter `UPDATE_INDEX` and it will check every project repository again to make sure that every commit in that repository is indexed, which can be useful if your index is outdated:
Sometimes your repository index process `gitlab:elastic:index_repositories` or
`gitlab:elastic:index_repositories_async` can get interrupted. This may happen
for many reasons, but it's always safe to run the indexing job again - it will
skip those repositories that have already been indexed.
As the indexer stores the last commit SHA of every indexed repository in the
database, you can run the indexer with the special parameter `UPDATE_INDEX` and
it will check every project repository again to make sure that every commit in
that repository is indexed, which can be useful if your index is outdated:
```
UPDATE_INDEX=true ID_TO=1000 sudo gitlab-rake gitlab:elastic:index_repositories
```
You can also use the `gitlab:elastic:clear_index_status` Rake task to force the
indexer to "forget" all progress, restarting the indexing process from the
beginning.
To index all wikis:
```
......
......@@ -8,7 +8,11 @@ module Gitlab
Error = Class.new(StandardError)
def initialize
attr_reader :project
def initialize(project)
@project = project
connection_info = {
host: current_application_settings.elasticsearch_host,
port: current_application_settings.elasticsearch_port
......@@ -22,20 +26,62 @@ module Gitlab
}
end
def run(project_id, repo_path, from_sha = nil, to_sha = nil)
def run(from_sha = nil, to_sha = nil)
to_sha = nil if to_sha == Gitlab::Git::BLANK_SHA
vars = @vars.merge({ 'FROM_SHA' => from_sha, 'TO_SHA' => to_sha })
head_commit = repository.try(:commit)
if repository.nil? || !repository.exists? || repository.empty? || head_commit.nil?
update_index_status(Gitlab::Git::BLANK_SHA)
return
end
run_indexer!(from_sha, to_sha)
update_index_status(to_sha)
true
end
private
path_to_indexer = File.join(Rails.root, 'bin/elastic_repo_indexer')
def repository
project.repository
end
def path_to_indexer
File.join(Rails.root, 'bin/elastic_repo_indexer')
end
command = [path_to_indexer, project_id.to_s, repo_path]
def run_indexer!(from_sha, to_sha)
command = [path_to_indexer, project.id.to_s, repository.path_to_repo]
vars = @vars.merge('FROM_SHA' => from_sha, 'TO_SHA' => to_sha)
output, status = Gitlab::Popen.popen(command, nil, vars)
raise Error, output unless status.zero?
end
true
def update_index_status(to_sha)
head_commit = repository.try(:commit)
# Use the eager-loaded association if available. An index_status should
# always be created, even if the repository is empty, so we know it's
# been looked at.
index_status = project.index_status
index_status ||=
begin
IndexStatus.find_or_create_by(project_id: project.id)
rescue ActiveRecord::RecordNotUnique
retry
end
# Don't update the index status if we never reached HEAD
return if head_commit && to_sha && head_commit.sha != to_sha
sha = head_commit.try(:sha)
sha ||= Gitlab::Git::BLANK_SHA
index_status.update_attributes(last_commit: sha, indexed_at: Time.now)
project.index_status(true)
end
end
end
......
......@@ -9,50 +9,34 @@ namespace :gitlab do
Rake::Task["gitlab:elastic:index_database"].invoke
end
desc "GitLab | Elasticsearch | Index project repositories"
task index_repositories: :environment do
projects = if ENV['UPDATE_INDEX']
Project
else
Project.includes(:index_status).
where("index_statuses.id IS NULL").
references(:index_statuses)
end
desc "GitLab | Elasticsearch | Index project repositories in the background"
task index_repositories_async: :environment do
print "Enqueuing project repositories in batches of #{batch_size}"
projects = apply_project_filters(projects)
project_id_batches do |start, finish|
ElasticBatchProjectIndexerWorker.perform_async(start, finish, ENV['UPDATE_INDEX'])
print "."
end
indexer = Gitlab::Elastic::Indexer.new
puts "OK"
end
projects.find_each(batch_size: 300) do |project|
repository = project.repository
desc "GitLab | ElasticSearch | Check project repository indexing status"
task index_repositories_status: :environment do
indexed = IndexStatus.count
projects = Project.count
percent = (indexed / projects.to_f) * 100.0
if repository.exists? && !repository.empty?
puts "Indexing #{project.name_with_namespace} (ID=#{project.id})..."
puts "Indexing is %.2f%% complete (%d/%d projects)" % [percent, indexed, projects]
end
index_status = IndexStatus.find_or_create_by(project: project)
desc "GitLab | Elasticsearch | Index project repositories"
task index_repositories: :environment do
print "Indexing project repositories..."
begin
head_commit = repository.commit
if !head_commit || index_status.last_commit == head_commit.sha
puts "Skipped".color(:yellow)
next
end
indexer.run(
project.id,
repository.path_to_repo,
index_status.last_commit
)
# During indexing the new commits can be pushed,
# the last_commit parameter only indicates that at least this commit is in index
index_status.update(last_commit: head_commit.sha, indexed_at: DateTime.now)
puts "Done!".color(:green)
rescue StandardError => e
puts "#{e.message}, trace - #{e.backtrace}"
end
end
Sidekiq::Logging.logger = Logger.new(STDOUT)
project_id_batches do |start, finish|
ElasticBatchProjectIndexerWorker.new.perform(start, finish, ENV['UPDATE_INDEX'])
end
end
......@@ -116,6 +100,23 @@ namespace :gitlab do
puts "Index recreated".color(:green)
end
def batch_size
ENV.fetch('BATCH', 300).to_i
end
def project_id_batches(&blk)
relation = Project
unless ENV['UPDATE_INDEX']
relation = relation.includes(:index_status).where('index_statuses.id IS NULL').references(:index_statuses)
end
relation.all.in_batches(of: batch_size, start: ENV['ID_FROM'], finish: ENV['ID_TO']) do |relation|
ids = relation.reorder(:id).pluck(:id)
yield ids[0], ids[-1]
end
end
def apply_project_filters(projects)
if ENV['ID_FROM']
projects = projects.where("projects.id >= ?", ENV['ID_FROM'])
......
require 'spec_helper'
describe "Indexer" do
describe Gitlab::Elastic::Indexer do
include StubENV
before do
......@@ -8,21 +8,100 @@ describe "Indexer" do
stub_application_setting(es_host: ['elastic-host1', 'elastic-host2'])
end
it "runs commands" do
expect(Gitlab::Popen).to receive(:popen).with(
[File.join(Rails.root, 'bin/elastic_repo_indexer'), '1', 'full_repo_path'],
nil,
hash_including(
'ELASTIC_CONNECTION_INFO' => {
host: current_application_settings.elasticsearch_host,
port: current_application_settings.elasticsearch_port
}.to_json,
'RAILS_ENV' => Rails.env,
'FROM_SHA' => '000000',
'TO_SHA' => '1d1f2d'
)
).and_return([[''], 0])
Gitlab::Elastic::Indexer.new.run(1, 'full_repo_path', '000000', '1d1f2d')
let(:project) { create(:project) }
let(:from_sha) { Gitlab::Git::BLANK_SHA }
let(:to_sha) { project.commit.try(:sha) }
let(:indexer) { described_class.new(project) }
let(:popen_success) { [[''], 0] }
let(:popen_failure) { [['error'], 1] }
let(:elastic_connection_info) do
{
host: current_application_settings.elasticsearch_host,
port: current_application_settings.elasticsearch_port,
}
end
context 'empty project' do
let(:project) { create(:empty_project) }
it 'updates the index status without running the indexing command' do
expect_popen.never
indexer.run
expect_index_status(Gitlab::Git::BLANK_SHA)
end
end
context 'repository has unborn head' do
it 'updates the index status without running the indexing command' do
allow(project.repository).to receive(:exists?).and_return(false)
expect_popen.never
indexer.run
expect_index_status(Gitlab::Git::BLANK_SHA)
end
end
context 'test project' do
let(:project) { create(:project) }
it 'runs the indexing command' do
expect_popen.with(
[
File.join(Rails.root, 'bin/elastic_repo_indexer'),
project.id.to_s,
project.repository.path_to_repo
],
nil,
hash_including(
'ELASTIC_CONNECTION_INFO' => elastic_connection_info.to_json,
'RAILS_ENV' => Rails.env,
'FROM_SHA' => from_sha,
'TO_SHA' => to_sha
)
).and_return(popen_success)
indexer.run(from_sha, to_sha)
end
it 'updates the index status when the indexing is a success' do
expect_popen.and_return(popen_success)
indexer.run(from_sha, to_sha)
expect_index_status(to_sha)
end
it 'leaves the index status untouched when indexing a non-HEAD commit' do
expect_popen.and_return(popen_success)
indexer.run(from_sha, project.repository.commit('HEAD~1'))
expect(project.index_status).to be_nil
end
it 'leaves the index status untouched when the indexing fails' do
expect_popen.and_return(popen_failure)
expect { indexer.run }.to raise_error(Gitlab::Elastic::Indexer::Error)
expect(project.index_status).to be_nil
end
end
def expect_popen(*with)
expect(Gitlab::Popen).to receive(:popen)
end
def expect_index_status(sha)
status = project.index_status
expect(status).not_to be_nil
expect(status.indexed_at).not_to be_nil
expect(status.last_commit).to eq(sha)
end
end
require 'spec_helper'

# Specs for ElasticBatchProjectIndexerWorker: the Sidekiq worker that indexes
# all projects whose IDs fall within a [start, finish] range.
describe ElasticBatchProjectIndexerWorker do
  subject(:worker) { described_class.new }

  # Two persisted (empty) projects give a concrete, two-element ID range.
  let(:projects) { create_list(:empty_project, 2) }

  describe '#perform' do
    it 'runs the indexer for projects in the batch range' do
      projects.each {|project| expect_index(project) }

      worker.perform(projects.first.id, projects.last.id)
    end

    it 'skips projects not in the batch range' do
      expect_index(projects.first).never
      expect_index(projects.last)

      worker.perform(projects.last.id, projects.last.id)
    end

    context 'update_index = false' do
      it 'skips projects that were already indexed' do
        # An existing IndexStatus marks the project as already indexed.
        projects.first.create_index_status!

        expect_index(projects.first).never

        worker.perform(projects.first.id, projects.first.id)
      end
    end

    context 'with update_index' do
      it 'reindexes projects that were already indexed' do
        projects.first.create_index_status!

        expect_index(projects.first)
        expect_index(projects.last)

        worker.perform(projects.first.id, projects.last.id, true)
      end

      it 'starts indexing at the last indexed commit' do
        # The last_commit recorded in IndexStatus should be forwarded to the
        # indexer as the starting SHA.
        projects.first.create_index_status!(last_commit: 'foo')

        expect_index(projects.first).and_call_original
        expect_any_instance_of(Gitlab::Elastic::Indexer).to receive(:run).with('foo')

        worker.perform(projects.first.id, projects.first.id, true)
      end
    end
  end

  # Sets a message expectation that the worker indexes `project`; callers may
  # chain `.never` / `.and_call_original` onto the returned expectation.
  def expect_index(project)
    expect(worker).to receive(:run_indexer).with(project)
  end
end
......@@ -15,23 +15,6 @@ describe ElasticCommitIndexerWorker do
subject.perform(project.id, '0000', '0000')
end
it 'does not run indexer when project is empty' do
empty_project = create :empty_project
expect_any_instance_of(Gitlab::Elastic::Indexer).not_to receive(:run)
subject.perform(empty_project.id, '0000', '0000')
end
it 'returns true if repository has unborn head' do
project = create :project
repository = double('repository')
expect(repository).to receive(:exists?).and_return(false)
expect_any_instance_of(Project).to receive(:repository).and_return(repository)
expect(subject.perform(project.id)).to be_truthy
end
it 'returns true if ES disabled' do
stub_application_setting(elasticsearch_indexing: false)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment