Commit dfe2c6df authored by Dmitry Gruzd's avatar Dmitry Gruzd Committed by Nick Thomas

Use prefix search instead of ngrams for sha fields

Currently, SHAs are indexed using ngrams from 5 to 40 characters. This
means that each SHA is split into 35 separate terms taking up a lot of
storage. SHAs are quite unique from 4-5 characters on, so a simple
prefix search will be sufficiently fast and as effective as ngrams with
term matching. This change replaces the current SHA field analyzers with
a prefix search.
parent e84ca201
---
title: Use prefix search instead of ngrams for sha fields
merge_request: 27597
author:
type: other
......@@ -32,11 +32,6 @@ module Elastic
tokenizer: 'path_tokenizer',
filter: %w(lowercase asciifolding)
},
sha_analyzer: {
type: 'custom',
tokenizer: 'sha_tokenizer',
filter: %w(lowercase asciifolding)
},
code_analyzer: {
type: 'custom',
tokenizer: 'whitespace',
......@@ -79,16 +74,16 @@ module Elastic
max_gram: 3,
token_chars: %w(letter digit)
},
sha_tokenizer: {
type: "edgeNGram",
min_gram: 5,
max_gram: 40,
token_chars: %w(letter digit)
},
path_tokenizer: {
type: 'path_hierarchy',
reverse: true
}
},
normalizer: {
sha_normalizer: {
type: "custom",
filter: ["lowercase"]
}
}
}
}
......@@ -198,16 +193,16 @@ module Elastic
indexes :blob do
indexes :type, type: :keyword
indexes :id, type: :text,
indexes :id, type: :keyword,
index_options: 'docs',
analyzer: :sha_analyzer
normalizer: :sha_normalizer
indexes :rid, type: :keyword
indexes :oid, type: :text,
indexes :oid, type: :keyword,
index_options: 'docs',
analyzer: :sha_analyzer
indexes :commit_sha, type: :text,
normalizer: :sha_normalizer
indexes :commit_sha, type: :keyword,
index_options: 'docs',
analyzer: :sha_analyzer
normalizer: :sha_normalizer
indexes :path, type: :text,
analyzer: :path_analyzer
indexes :file_name, type: :text,
......@@ -223,13 +218,13 @@ module Elastic
indexes :commit do
indexes :type, type: :keyword
indexes :id, type: :text,
indexes :id, type: :keyword,
index_options: 'docs',
analyzer: :sha_analyzer
normalizer: :sha_normalizer
indexes :rid, type: :keyword
indexes :sha, type: :text,
indexes :sha, type: :keyword,
index_options: 'docs',
analyzer: :sha_analyzer
normalizer: :sha_normalizer
indexes :author do
indexes :name, type: :text, index_options: 'docs'
......
......@@ -3,6 +3,8 @@
module Elastic
module Latest
module GitClassProxy
SHA_REGEX = /\A[0-9a-f]{5,40}\z/i.freeze
def elastic_search(query, type: :all, page: 1, per: 20, options: {})
results = { blobs: [], commits: [] }
......@@ -41,13 +43,15 @@ module Elastic
fields = %w(message^10 sha^5 author.name^2 author.email^2 committer.name committer.email).map {|i| "commit.#{i}"}
query_with_prefix = query.split(/\s+/).map { |s| s.gsub(SHA_REGEX) { |sha| "#{sha}*" } }.join(' ')
query_hash = {
query: {
bool: {
must: {
simple_query_string: {
fields: fields,
query: query,
query: query_with_prefix,
default_operator: :and
}
},
......
......@@ -22,6 +22,13 @@ describe Repository, :elastic do
expect(project.repository.elastic_search('def popen')[:blobs][:total_count]).to eq(1)
expect(project.repository.elastic_search('def | popen')[:blobs][:total_count] > 1).to be_truthy
expect(project.repository.elastic_search('initial')[:commits][:total_count]).to eq(1)
root_ref = project.repository.root_ref_sha.upcase
expect(project.repository.elastic_search(root_ref)[:commits][:total_count]).to eq(1)
partial_ref = root_ref[0...5]
expect(project.repository.elastic_search(partial_ref)[:commits][:total_count]).to eq(1)
expect(project.repository.elastic_search(partial_ref + '*')[:commits][:total_count]).to eq(1)
end
it 'can filter blobs' do
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment