Commit b18b4bc1 authored by Valery Sizov

Merge branch 'elastic_tools' into 'master'

More advanced elastic indexer

https://gitlab.com/gitlab-com/operations/issues/56

See merge request !152
parents 798e6ccf a23edfed
class IndexStatus < ActiveRecord::Base class IndexStatus < ActiveRecord::Base
belongs_to :project belongs_to :project
validates :project_id, uniqueness: true validates :project_id, uniqueness: true, presence: true
end end
class CreateIndexStatuses < ActiveRecord::Migration class CreateIndexStatuses < ActiveRecord::Migration
def change def change
create_table :index_statuses do |t| create_table :index_statuses do |t|
t.integer :project_id t.integer :project_id, null: false
t.datetime :indexed_at t.datetime :indexed_at
t.text :note t.text :note
t.string :last_commit t.string :last_commit
...@@ -9,6 +9,6 @@ class CreateIndexStatuses < ActiveRecord::Migration ...@@ -9,6 +9,6 @@ class CreateIndexStatuses < ActiveRecord::Migration
t.timestamps null: false t.timestamps null: false
end end
add_index :index_statuses, :project_id add_index :index_statuses, :project_id, unique: true
end end
end end
...@@ -447,7 +447,7 @@ ActiveRecord::Schema.define(version: 20160129075828) do ...@@ -447,7 +447,7 @@ ActiveRecord::Schema.define(version: 20160129075828) do
add_index "identities", ["user_id"], name: "index_identities_on_user_id", using: :btree add_index "identities", ["user_id"], name: "index_identities_on_user_id", using: :btree
create_table "index_statuses", force: :cascade do |t| create_table "index_statuses", force: :cascade do |t|
t.integer "project_id" t.integer "project_id", null: false
t.datetime "indexed_at" t.datetime "indexed_at"
t.text "note" t.text "note"
t.string "last_commit" t.string "last_commit"
...@@ -455,7 +455,7 @@ ActiveRecord::Schema.define(version: 20160129075828) do ...@@ -455,7 +455,7 @@ ActiveRecord::Schema.define(version: 20160129075828) do
t.datetime "updated_at", null: false t.datetime "updated_at", null: false
end end
add_index "index_statuses", ["project_id"], name: "index_index_statuses_on_project_id", using: :btree add_index "index_statuses", ["project_id"], name: "index_index_statuses_on_project_id", unique: true, using: :btree
create_table "issues", force: :cascade do |t| create_table "issues", force: :cascade do |t|
t.string "title" t.string "title"
......
...@@ -114,15 +114,19 @@ sudo gitlab-rake gitlab:elastic:index_repositories ...@@ -114,15 +114,19 @@ sudo gitlab-rake gitlab:elastic:index_repositories
bundle exec rake gitlab:elastic:index_repositories RAILS_ENV=production bundle exec rake gitlab:elastic:index_repositories RAILS_ENV=production
``` ```
If you want to run several tasks in parallel (probably in separate terminal windows) you can pass parameters `ID_FROM` and `ID_TO` like this: If you want to run several tasks in parallel (probably in separate terminal
windows) you can provide the `ID_FROM` and `ID_TO` parameters:
``` ```
ID_FROM=1001 ID_TO=2000 sudo gitlab-rake gitlab:elastic:index_repositories ID_FROM=1001 ID_TO=2000 sudo gitlab-rake gitlab:elastic:index_repositories
``` ```
Both parameters are optional. Keep in mind also that this task will skip repositories (and certain commits) you have already indexed. It stores the last commit SHA of every indexed repository in the database. Both parameters are optional. Keep in mind that this task will skip repositories
As an example, if you have 3000 of repositories and you want to run tree separate indexer your commands would be like: (and certain commits) that have already been indexed. It stores the last commit
SHA of every indexed repository in the database. As an example, if you have
3,000 repositories and you want to run three separate indexing tasks, you might
run:
``` ```
ID_TO=1000 sudo gitlab-rake gitlab:elastic:index_repositories ID_TO=1000 sudo gitlab-rake gitlab:elastic:index_repositories
...@@ -139,7 +143,9 @@ sudo gitlab-rake gitlab:elastic:index_wikis ...@@ -139,7 +143,9 @@ sudo gitlab-rake gitlab:elastic:index_wikis
# Installations from source # Installations from source
bundle exec rake gitlab:elastic:index_wikis RAILS_ENV=production bundle exec rake gitlab:elastic:index_wikis RAILS_ENV=production
``` ```
Wiki indexer also supports `ID_FROM` and `ID_TO` parameters if you want to limit a project set.
The wiki indexer also supports the `ID_FROM` and `ID_TO` parameters if you want
to limit a project set.
To index all database entities: To index all database entities:
...@@ -213,27 +219,31 @@ time drop. ...@@ -213,27 +219,31 @@ time drop.
curl -XPOST 'http://localhost:9200/_forcemerge?max_num_segments=5' curl -XPOST 'http://localhost:9200/_forcemerge?max_num_segments=5'
``` ```
To minimize a downtime of search feature we recommend next sequence of actions: To minimize downtime of the search feature we recommend the following:
1. Configure elastic search in gitlab.yml or gitlab.rb for omnibus installations but
do not enable it, just set a host and port.
1. Create empty indexes. Run 1. Configure Elasticsearch in `gitlab.yml`, or `gitlab.rb` for Omnibus
``` installations, but do not enable it, just set a host and port.
# Omnibus installations
sudo gitlab-rake gitlab:elastic:create_empty_indexes
# Installations from source 1. Create empty indexes:
bundle exec rake gitlab:elastic:create_empty_indexes
```
1. Index all repositories by `gitlab:elastic:index_repositories` rake task(see above). Probably you will do it in parallel ```
# Omnibus installations
sudo gitlab-rake gitlab:elastic:create_empty_indexes
1. Enable elasticsearch and restart GitLab application. # Installations from source
bundle exec rake gitlab:elastic:create_empty_indexes
```
1. Run indexers for database, wikis and repositories. By running reposotory indexer twise you will be sure that eveything is indexed because some commits could be pushed while you performed initial indexing. And don't worry, repository indexer will skip repositories and commits that are already indexed, in other words, it will be much faster this time. 1. Index all repositories using the `gitlab:elastic:index_repositories` Rake
task (see above). You'll probably want to do this in parallel.
1. Enable Elasticsearch and restart GitLab.
1. Run indexers for database, wikis, and repositories. By running the repository
indexer twice you will be sure that everything is indexed because some
commits could be pushed while you performed initial indexing. The repository
indexer will skip repositories and commits that are already indexed, so it
will be much shorter than the first run.
[ee-109]: https://gitlab.com/gitlab-org/gitlab-ee/merge_requests/109 "Elasticsearch Merge Request" [ee-109]: https://gitlab.com/gitlab-org/gitlab-ee/merge_requests/109 "Elasticsearch Merge Request"
[elasticsearch]: https://www.elastic.co/products/elasticsearch "Elasticsearch website" [elasticsearch]: https://www.elastic.co/products/elasticsearch "Elasticsearch website"
......
...@@ -11,9 +11,9 @@ namespace :gitlab do ...@@ -11,9 +11,9 @@ namespace :gitlab do
puts "Indexing #{project.name_with_namespace} (ID=#{project.id})..." puts "Indexing #{project.name_with_namespace} (ID=#{project.id})..."
index_status = IndexStatus.find_or_create_by(project: project) index_status = IndexStatus.find_or_create_by(project: project)
heads_sha = project.repository.commit.sha head_sha = project.repository.commit.sha
if index_status.last_commit == heads_sha if index_status.last_commit == head_sha
puts "Skipped".yellow puts "Skipped".yellow
next next
end end
...@@ -24,7 +24,7 @@ namespace :gitlab do ...@@ -24,7 +24,7 @@ namespace :gitlab do
# During indexing the new commits can be pushed, # During indexing the new commits can be pushed,
# the last_commit parameter only indicates that at least this commit is in index # the last_commit parameter only indicates that at least this commit is in index
index_status.update(last_commit: heads_sha, indexed_at: DateTime.now) index_status.update(last_commit: head_sha, indexed_at: DateTime.now)
puts "Done!".green puts "Done!".green
rescue StandardError => e rescue StandardError => e
puts "#{e.message}, trace - #{e.backtrace}" puts "#{e.message}, trace - #{e.backtrace}"
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment