Commit dfe2c6df authored by Dmitry Gruzd's avatar Dmitry Gruzd Committed by Nick Thomas

Use prefix search instead of ngrams for sha fields

Currently, SHAs are indexed using ngrams from 5 to 40 characters. This
means that each SHA is split into 35 separate terms taking up a lot of
storage. SHAs are quite unique from 4-5 characters on, so a simple
prefix search will be sufficiently fast and as effective as ngrams with
term matching. This change replaces the current SHA field analyzers with
a prefix search.
parent e84ca201
---
title: Use prefix search instead of ngrams for sha fields
merge_request: 27597
author:
type: other
......@@ -32,11 +32,6 @@ module Elastic
tokenizer: 'path_tokenizer',
filter: %w(lowercase asciifolding)
},
sha_analyzer: {
type: 'custom',
tokenizer: 'sha_tokenizer',
filter: %w(lowercase asciifolding)
},
code_analyzer: {
type: 'custom',
tokenizer: 'whitespace',
......@@ -79,16 +74,16 @@ module Elastic
max_gram: 3,
token_chars: %w(letter digit)
},
sha_tokenizer: {
type: "edgeNGram",
min_gram: 5,
max_gram: 40,
token_chars: %w(letter digit)
},
path_tokenizer: {
type: 'path_hierarchy',
reverse: true
}
},
normalizer: {
sha_normalizer: {
type: "custom",
filter: ["lowercase"]
}
}
}
}
......@@ -198,16 +193,16 @@ module Elastic
indexes :blob do
indexes :type, type: :keyword
indexes :id, type: :text,
indexes :id, type: :keyword,
index_options: 'docs',
analyzer: :sha_analyzer
normalizer: :sha_normalizer
indexes :rid, type: :keyword
indexes :oid, type: :text,
indexes :oid, type: :keyword,
index_options: 'docs',
analyzer: :sha_analyzer
indexes :commit_sha, type: :text,
normalizer: :sha_normalizer
indexes :commit_sha, type: :keyword,
index_options: 'docs',
analyzer: :sha_analyzer
normalizer: :sha_normalizer
indexes :path, type: :text,
analyzer: :path_analyzer
indexes :file_name, type: :text,
......@@ -223,13 +218,13 @@ module Elastic
indexes :commit do
indexes :type, type: :keyword
indexes :id, type: :text,
indexes :id, type: :keyword,
index_options: 'docs',
analyzer: :sha_analyzer
normalizer: :sha_normalizer
indexes :rid, type: :keyword
indexes :sha, type: :text,
indexes :sha, type: :keyword,
index_options: 'docs',
analyzer: :sha_analyzer
normalizer: :sha_normalizer
indexes :author do
indexes :name, type: :text, index_options: 'docs'
......
......@@ -3,6 +3,8 @@
module Elastic
module Latest
module GitClassProxy
SHA_REGEX = /\A[0-9a-f]{5,40}\z/i.freeze
def elastic_search(query, type: :all, page: 1, per: 20, options: {})
results = { blobs: [], commits: [] }
......@@ -41,13 +43,15 @@ module Elastic
fields = %w(message^10 sha^5 author.name^2 author.email^2 committer.name committer.email).map {|i| "commit.#{i}"}
query_with_prefix = query.split(/\s+/).map { |s| s.gsub(SHA_REGEX) { |sha| "#{sha}*" } }.join(' ')
query_hash = {
query: {
bool: {
must: {
simple_query_string: {
fields: fields,
query: query,
query: query_with_prefix,
default_operator: :and
}
},
......
......@@ -22,6 +22,13 @@ describe Repository, :elastic do
expect(project.repository.elastic_search('def popen')[:blobs][:total_count]).to eq(1)
expect(project.repository.elastic_search('def | popen')[:blobs][:total_count] > 1).to be_truthy
expect(project.repository.elastic_search('initial')[:commits][:total_count]).to eq(1)
root_ref = project.repository.root_ref_sha.upcase
expect(project.repository.elastic_search(root_ref)[:commits][:total_count]).to eq(1)
partial_ref = root_ref[0...5]
expect(project.repository.elastic_search(partial_ref)[:commits][:total_count]).to eq(1)
expect(project.repository.elastic_search(partial_ref + '*')[:commits][:total_count]).to eq(1)
end
it 'can filter blobs' do
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment