Commit befe8808 authored by Nick Thomas

Merge branch '224459-more-performant-elasticsearch-file-path-regex' into 'master'

Speed up Advanced global search regex for file path segments

See merge request gitlab-org/gitlab!35292
parents ca1a8e59 ecfee576
...@@ -111,11 +111,7 @@ Patterns: ...@@ -111,11 +111,7 @@ Patterns:
- `'"((?:\\"|[^"]|\\")*)"'`: captures terms inside quotes, removing the quotes - `'"((?:\\"|[^"]|\\")*)"'`: captures terms inside quotes, removing the quotes
- `"'((?:\\'|[^']|\\')*)'"`: same as above, for single-quotes - `"'((?:\\'|[^']|\\')*)'"`: same as above, for single-quotes
- `'\.([^.]+)(?=\.|\s|\Z)'`: separate terms with periods in-between - `'\.([^.]+)(?=\.|\s|\Z)'`: separate terms with periods in-between
- `'\/?([^\/]+)(?=\/|\b)'`: separate path terms `like/this/one` - `'([\p{L}_.-]+)'` : some common chars in file names to keep the whole filename intact (eg. `my_file-ñame.txt`)
#### `edgeNGram_filter`
Uses an [Edge NGram token filter](https://www.elastic.co/guide/en/elasticsearch/reference/5.5/analysis-edgengram-tokenfilter.html) to allow inputs with only parts of a token to find the token. For example, it would turn `glasses` into a series of prefixes starting with `gl` and ending with `glasses`, which would allow a search for "`glass`" to find the original token `glasses`.
## Gotchas ## Gotchas
......
---
title: Speed up Advanced global search regex for file path segments
merge_request: 35292
author:
type: performance
...@@ -38,7 +38,7 @@ module Elastic ...@@ -38,7 +38,7 @@ module Elastic
code_analyzer: { code_analyzer: {
type: 'custom', type: 'custom',
tokenizer: 'whitespace', tokenizer: 'whitespace',
filter: %w(code lowercase asciifolding) filter: %w(code lowercase asciifolding remove_duplicates)
}, },
code_search_analyzer: { code_search_analyzer: {
type: 'custom', type: 'custom',
...@@ -61,7 +61,7 @@ module Elastic ...@@ -61,7 +61,7 @@ module Elastic
'"((?:\\"|[^"]|\\")*)"', # capture terms inside quotes, removing the quotes '"((?:\\"|[^"]|\\")*)"', # capture terms inside quotes, removing the quotes
"'((?:\\'|[^']|\\')*)'", # same as above, for single quotes "'((?:\\'|[^']|\\')*)'", # same as above, for single quotes
'\.([^.]+)(?=\.|\s|\Z)', # separate terms on periods '\.([^.]+)(?=\.|\s|\Z)', # separate terms on periods
'\/?([^\/]+)(?=\/|\b)' # separate path terms (like/this/one) '([\p{L}_.-]+)' # some common chars in file names to keep the whole filename intact (eg. my_file-name.txt)
] ]
} }
}, },
......
...@@ -622,9 +622,21 @@ RSpec.describe Gitlab::Elastic::SearchResults, :elastic, :sidekiq_might_not_need ...@@ -622,9 +622,21 @@ RSpec.describe Gitlab::Elastic::SearchResults, :elastic, :sidekiq_might_not_need
Foo.bar(x) Foo.bar(x)
include "bikes-3.4" include "bikes-3.4"
/a/longer/file-path/absolute_with_specials.txt
another/file-path/relative-with-specials.txt
/file-path/components-within-slashes/
another/file-path/differeñt-lønguage.txt
us-east-2 us-east-2
bye bye
MyJavaClass::javaLangStaticMethodCall
$my_perl_object->perlMethodCall
LanguageWithSingleColon:someSingleColonMethodCall
WouldHappenInManyLanguages,tokenAfterCommaWithNoSpace
ParenthesesBetweenTokens)tokenAfterParentheses
a.b.c=missing_token_around_equals
FILE FILE
end end
let(:file_name) { 'elastic_specialchars_test.md' } let(:file_name) { 'elastic_specialchars_test.md' }
...@@ -651,6 +663,46 @@ RSpec.describe Gitlab::Elastic::SearchResults, :elastic, :sidekiq_might_not_need ...@@ -651,6 +663,46 @@ RSpec.describe Gitlab::Elastic::SearchResults, :elastic, :sidekiq_might_not_need
expect(search_for('"and;colons:too$"')).to include(file_name) expect(search_for('"and;colons:too$"')).to include(file_name)
expect(search_for('bar\(x\)')).to include(file_name) expect(search_for('bar\(x\)')).to include(file_name)
end end
it 'finds absolute file paths with slashes and other special chars' do
  # Targets the fixture line `/a/longer/file-path/absolute_with_specials.txt`;
  # the search term is the final path segment, wrapped in double quotes.
  results = search_for('"absolute_with_specials.txt"')

  expect(results).to include(file_name)
end
it 'finds relative file paths with slashes and other special chars' do
  # Targets the fixture line `another/file-path/relative-with-specials.txt`;
  # the search term is the final path segment, wrapped in double quotes.
  results = search_for('"relative-with-specials.txt"')

  expect(results).to include(file_name)
end
it 'finds file path components within slashes for directories' do
  # Targets the fixture line `/file-path/components-within-slashes/`;
  # the search term is the directory name between the last two slashes.
  results = search_for('"components-within-slashes"')

  expect(results).to include(file_name)
end
it 'finds file paths for various languages' do
  # Targets the fixture line `another/file-path/differeñt-lønguage.txt`,
  # whose file name contains non-ASCII letters.
  results = search_for('"differeñt-lønguage.txt"')

  expect(results).to include(file_name)
end
it 'finds java style static method call after ::' do
  # Targets the fixture line `MyJavaClass::javaLangStaticMethodCall`;
  # only the part after `::` is searched for.
  results = search_for('javaLangStaticMethodCall')

  expect(results).to include(file_name)
end
it 'finds perl object method call' do
  # Targets the fixture line `$my_perl_object->perlMethodCall`;
  # only the part after `->` is searched for.
  results = search_for('perlMethodCall')

  expect(results).to include(file_name)
end
it 'finds tokens after a colon' do
  # Targets the fixture line `LanguageWithSingleColon:someSingleColonMethodCall`;
  # only the part after the single `:` is searched for.
  results = search_for('someSingleColonMethodCall')

  expect(results).to include(file_name)
end
it 'finds tokens after a comma with no space' do
  # Targets the fixture line `WouldHappenInManyLanguages,tokenAfterCommaWithNoSpace`;
  # only the part after the comma is searched for.
  results = search_for('tokenAfterCommaWithNoSpace')

  expect(results).to include(file_name)
end
it 'finds a token directly after parentheses' do
  # Targets the fixture line `ParenthesesBetweenTokens)tokenAfterParentheses`;
  # only the part after `)` is searched for.
  results = search_for('tokenAfterParentheses')

  expect(results).to include(file_name)
end
it 'finds a token after = without a space' do
  # Targets the fixture line `a.b.c=missing_token_around_equals`;
  # only the part after `=` is searched for.
  results = search_for('missing_token_around_equals')

  expect(results).to include(file_name)
end
end end
end end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment