Commit 9c124899 authored by Dmitry Gruzd, committed by Dylan Griffith

Switch to prefix code search

Currently the code search uses ngrams to allow searching for prefixes as
well as full matches. This takes up a lot of storage and can be replaced
with a prefix search.
This change removes the usage of edgeNGram_filter from our index
mappings.
parent 0f896bd4
...@@ -54,7 +54,7 @@ Please see the `sha_tokenizer` explanation later below for an example. ...@@ -54,7 +54,7 @@ Please see the `sha_tokenizer` explanation later below for an example.
#### `code_analyzer` #### `code_analyzer`
Used when indexing a blob's filename and content. Uses the `whitespace` tokenizer and the filters: [`code`](#code), [`edgeNGram_filter`](#edgengram_filter), `lowercase`, and `asciifolding` Used when indexing a blob's filename and content. Uses the `whitespace` tokenizer and the filters: [`code`](#code), `lowercase`, and `asciifolding`
The `whitespace` tokenizer was selected in order to have more control over how tokens are split. For example the string `Foo::bar(4)` needs to generate tokens like `Foo` and `bar(4)` in order to be properly searched. The `whitespace` tokenizer was selected in order to have more control over how tokens are split. For example the string `Foo::bar(4)` needs to generate tokens like `Foo` and `bar(4)` in order to be properly searched.
......
---
title: Remove partial word matching from code search
merge_request: 32771
author:
type: changed
...@@ -35,7 +35,7 @@ module Elastic ...@@ -35,7 +35,7 @@ module Elastic
code_analyzer: { code_analyzer: {
type: 'custom', type: 'custom',
tokenizer: 'whitespace', tokenizer: 'whitespace',
filter: %w(code edgeNGram_filter lowercase asciifolding) filter: %w(code lowercase asciifolding)
}, },
code_search_analyzer: { code_search_analyzer: {
type: 'custom', type: 'custom',
...@@ -60,11 +60,6 @@ module Elastic ...@@ -60,11 +60,6 @@ module Elastic
'\.([^.]+)(?=\.|\s|\Z)', # separate terms on periods '\.([^.]+)(?=\.|\s|\Z)', # separate terms on periods
'\/?([^\/]+)(?=\/|\b)' # separate path terms (like/this/one) '\/?([^\/]+)(?=\/|\b)' # separate path terms (like/this/one)
] ]
},
edgeNGram_filter: {
type: 'edgeNGram',
min_gram: 2,
max_gram: 40
} }
}, },
tokenizer: { tokenizer: {
......
...@@ -537,7 +537,15 @@ describe Gitlab::Elastic::SearchResults, :elastic, :sidekiq_might_not_need_inlin ...@@ -537,7 +537,15 @@ describe Gitlab::Elastic::SearchResults, :elastic, :sidekiq_might_not_need_inlin
blobs = results.objects('blobs') blobs = results.objects('blobs')
expect(blobs.first.data).to include('def') expect(blobs.first.data).to include('def')
expect(results.blobs_count).to eq 7 expect(results.blobs_count).to eq 5
end
it 'finds blobs by prefix search' do
results = described_class.new(user, 'defau*', limit_project_ids)
blobs = results.objects('blobs')
expect(blobs.first.data).to include('default')
expect(results.blobs_count).to eq 3
end end
it 'finds blobs from public projects only' do it 'finds blobs from public projects only' do
...@@ -547,13 +555,13 @@ describe Gitlab::Elastic::SearchResults, :elastic, :sidekiq_might_not_need_inlin ...@@ -547,13 +555,13 @@ describe Gitlab::Elastic::SearchResults, :elastic, :sidekiq_might_not_need_inlin
ensure_elasticsearch_index! ensure_elasticsearch_index!
results = described_class.new(user, 'def', [project_1.id]) results = described_class.new(user, 'def', [project_1.id])
expect(results.blobs_count).to eq 7 expect(results.blobs_count).to eq 5
result_project_ids = results.objects('blobs').map(&:project_id) result_project_ids = results.objects('blobs').map(&:project_id)
expect(result_project_ids.uniq).to eq([project_1.id]) expect(result_project_ids.uniq).to eq([project_1.id])
results = described_class.new(user, 'def', [project_1.id, project_2.id]) results = described_class.new(user, 'def', [project_1.id, project_2.id])
expect(results.blobs_count).to eq 14 expect(results.blobs_count).to eq 10
end end
it 'returns zero when blobs are not found' do it 'returns zero when blobs are not found' do
...@@ -580,7 +588,8 @@ describe Gitlab::Elastic::SearchResults, :elastic, :sidekiq_might_not_need_inlin ...@@ -580,7 +588,8 @@ describe Gitlab::Elastic::SearchResults, :elastic, :sidekiq_might_not_need_inlin
expect(search_for('write')).to include('test.txt') expect(search_for('write')).to include('test.txt')
end end
it 'find by first two words' do # Re-enable after fixing https://gitlab.com/gitlab-org/gitlab/-/issues/10693#note_349683299
xit 'find by first two words' do
expect(search_for('writeString')).to include('test.txt') expect(search_for('writeString')).to include('test.txt')
end end
...@@ -591,6 +600,10 @@ describe Gitlab::Elastic::SearchResults, :elastic, :sidekiq_might_not_need_inlin ...@@ -591,6 +600,10 @@ describe Gitlab::Elastic::SearchResults, :elastic, :sidekiq_might_not_need_inlin
it 'find by exact match' do it 'find by exact match' do
expect(search_for('writeStringToFile')).to include('test.txt') expect(search_for('writeStringToFile')).to include('test.txt')
end end
it 'find by prefix search' do
expect(search_for('writeStr*')).to include('test.txt')
end
end end
context 'Searches special characters' do context 'Searches special characters' do
......
...@@ -129,7 +129,7 @@ describe API::Search do ...@@ -129,7 +129,7 @@ describe API::Search do
context 'filters' do context 'filters' do
it 'by filename' do it 'by filename' do
get api("/projects/#{project.id}/search", user), params: { scope: 'blobs', search: 'mon filename:PROCESS.md' } get api("/projects/#{project.id}/search", user), params: { scope: 'blobs', search: 'mon* filename:PROCESS.md' }
expect(response).to have_gitlab_http_status(:ok) expect(response).to have_gitlab_http_status(:ok)
expect(json_response.size).to eq(1) expect(json_response.size).to eq(1)
...@@ -137,7 +137,7 @@ describe API::Search do ...@@ -137,7 +137,7 @@ describe API::Search do
end end
it 'by path' do it 'by path' do
get api("/projects/#{project.id}/search", user), params: { scope: 'blobs', search: 'mon path:markdown' } get api("/projects/#{project.id}/search", user), params: { scope: 'blobs', search: 'mon* path:markdown' }
expect(response).to have_gitlab_http_status(:ok) expect(response).to have_gitlab_http_status(:ok)
expect(json_response.size).to eq(1) expect(json_response.size).to eq(1)
...@@ -147,7 +147,7 @@ describe API::Search do ...@@ -147,7 +147,7 @@ describe API::Search do
end end
it 'by extension' do it 'by extension' do
get api("/projects/#{project.id}/search", user), params: { scope: 'blobs', search: 'mon extension:md' } get api("/projects/#{project.id}/search", user), params: { scope: 'blobs', search: 'mon* extension:md' }
expect(response).to have_gitlab_http_status(:ok) expect(response).to have_gitlab_http_status(:ok)
expect(json_response.size).to eq(3) expect(json_response.size).to eq(3)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment