Commit 3c419ef1 authored by Alex Ives

Merge branch '273784-issues-full-text-search' into 'master'

Use PG full-text search for searching issues

See merge request gitlab-org/gitlab!71913
parents fa583db1 33253988
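
For context before the diff, a minimal console sketch of the intended flow, assuming the flag is enabled for a project; the project path, user lookup, and search term are illustrative, not part of this change:

# Illustrative only: turn the flag on for one project, then search its issues.
project = Project.find_by_full_path('gitlab-org/gitlab') # example path
user    = User.first                                      # any user with access
Feature.enable(:issues_full_text_search, project)

# IssuableCollections sets attempt_full_text_search when the flag is enabled,
# and IssuesFinder then takes the PG full-text path for ASCII/Latin-only terms.
IssuesFinder.new(user, project_id: project.id, search: 'background migration',
                 attempt_full_text_search: true).execute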
......@@ -117,6 +117,10 @@ module IssuableCollections
options[:attempt_group_search_optimizations] = true
end
if collection_type == 'Issue' && Feature.enabled?(:issues_full_text_search, @project || @group, default_enabled: :yaml)
options[:attempt_full_text_search] = true
end
params.permit(finder_type.valid_params).merge(options)
end
end
......
......@@ -37,6 +37,7 @@
# attempt_project_search_optimizations: boolean
# crm_contact_id: integer
# crm_organization_id: integer
# attempt_full_text_search: boolean
#
class IssuableFinder
prepend FinderWithCrossProjectAccess
......@@ -46,6 +47,7 @@ class IssuableFinder
requires_cross_project_access unless: -> { params.project? }
FULL_TEXT_SEARCH_TERM_REGEX = /\A[\p{ASCII}|\p{Latin}]+\z/.freeze
NEGATABLE_PARAMS_HELPER_KEYS = %i[project_id scope status include_subgroups].freeze
attr_accessor :current_user, :params
......@@ -331,6 +333,8 @@ class IssuableFinder
return items if items.is_a?(ActiveRecord::NullRelation)
return items if Feature.enabled?(:disable_anonymous_search, type: :ops) && current_user.nil?
return items.pg_full_text_search(search) if use_full_text_search?
if use_cte_for_search?
cte = Gitlab::SQL::CTE.new(klass.table_name, items)
......@@ -341,6 +345,10 @@ class IssuableFinder
end
# rubocop: enable CodeReuse/ActiveRecord
def use_full_text_search?
params[:attempt_full_text_search] && params[:search] =~ FULL_TEXT_SEARCH_TERM_REGEX
end
# rubocop: disable CodeReuse/ActiveRecord
def by_iids(items)
params[:iids].present? ? items.where(iid: params[:iids]) : items
......
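
To make the use_full_text_search? guard above concrete, a quick sketch of how the regex classifies terms (the results follow from the regex itself; the sample strings are illustrative):

regex = /\A[\p{ASCII}|\p{Latin}]+\z/ # FULL_TEXT_SEARCH_TERM_REGEX

'fix login bug'.match?(regex)   # => true  (plain ASCII)
'Jürgen'.match?(regex)          # => true  (Latin script, accented)
'日本語 title'.match?(regex)     # => false (non-Latin, falls back to the existing search path)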
# frozen_string_literal: true
# This module adds PG full-text search capabilities to a model.
# A `search_data` association with a `search_vector` column is required.
#
# Declare the fields that will be part of the search vector with their
# corresponding weights. Possible values for weight are A, B, C, or D.
# For example:
#
# include PgFullTextSearchable
# pg_full_text_searchable columns: [{ name: 'title', weight: 'A' }, { name: 'description', weight: 'B' }]
#
# This module sets up an after_commit hook that updates the search data
# when the searchable columns are changed. You will need to implement the
# `#persist_pg_full_text_search_vector` method that does the actual insert or update.
#
# This also adds a `pg_full_text_search` scope so you can do:
#
# Model.pg_full_text_search("some search term")
module PgFullTextSearchable
extend ActiveSupport::Concern
LONG_WORDS_REGEX = %r([A-Za-z0-9+/]{50,}).freeze
TSVECTOR_MAX_LENGTH = 1.megabyte.freeze
TEXT_SEARCH_DICTIONARY = 'english'
def update_search_data!
tsvector_sql_nodes = self.class.pg_full_text_searchable_columns.map do |column, weight|
tsvector_arel_node(column, weight)&.to_sql
end
persist_pg_full_text_search_vector(Arel.sql(tsvector_sql_nodes.compact.join(' || ')))
rescue ActiveRecord::StatementInvalid => e
raise unless e.cause.is_a?(PG::ProgramLimitExceeded) && e.message.include?('string is too long for tsvector')
Gitlab::AppJsonLogger.error(
message: 'Error updating search data: string is too long for tsvector',
class: self.class.name,
model_id: self.id
)
end
private
def persist_pg_full_text_search_vector(search_vector)
raise NotImplementedError
end
def tsvector_arel_node(column, weight)
return if self[column].blank?
column_text = self[column].gsub(LONG_WORDS_REGEX, ' ')
column_text = column_text[0..(TSVECTOR_MAX_LENGTH - 1)]
column_text = ActiveSupport::Inflector.transliterate(column_text)
Arel::Nodes::NamedFunction.new(
'setweight',
[
Arel::Nodes::NamedFunction.new(
'to_tsvector',
[Arel::Nodes.build_quoted(TEXT_SEARCH_DICTIONARY), Arel::Nodes.build_quoted(column_text)]
),
Arel::Nodes.build_quoted(weight)
]
)
end
included do
cattr_reader :pg_full_text_searchable_columns do
{}
end
end
class_methods do
def pg_full_text_searchable(columns:)
raise 'Full text search columns already defined!' if pg_full_text_searchable_columns.present?
columns.each do |column|
pg_full_text_searchable_columns[column[:name]] = column[:weight]
end
# We update this outside the transaction because this could raise an error if the resulting tsvector
# is too long. When that happens, we still persist the create / update but the model will not have a
# search data record. This is fine in most cases because this is a very rare occurrence and only happens
# with strings that are most likely unsearchable anyway.
#
# We also do not want to use a subtransaction here due to: https://gitlab.com/groups/gitlab-org/-/epics/6540
after_save_commit do
next unless pg_full_text_searchable_columns.keys.any? { |f| saved_changes.has_key?(f) }
update_search_data!
end
end
def pg_full_text_search(search_term)
search_data_table = reflect_on_association(:search_data).klass.arel_table
joins(:search_data).where(
Arel::Nodes::InfixOperation.new(
'@@',
search_data_table[:search_vector],
Arel::Nodes::NamedFunction.new(
'websearch_to_tsquery',
[Arel::Nodes.build_quoted(TEXT_SEARCH_DICTIONARY), Arel::Nodes.build_quoted(search_term)]
)
)
)
end
end
end
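
To make the concern's contract concrete, a minimal sketch of an including class; Widget, WidgetSearchData, and their tables are hypothetical, while the real wiring for Issue appears later in this diff:

class Widget < ApplicationRecord
  include PgFullTextSearchable

  belongs_to :project
  has_one :search_data, class_name: 'WidgetSearchData' # required association (hypothetical model)

  pg_full_text_searchable columns: [{ name: 'title', weight: 'A' }, { name: 'body', weight: 'B' }]

  private

  # Required by the concern: persist the computed tsvector wherever the model stores it.
  def persist_pg_full_text_search_vector(search_vector)
    WidgetSearchData.upsert({ widget_id: id, search_vector: search_vector }, unique_by: :widget_id)
  end
end

# Widget.pg_full_text_search('some term') then joins widget_search_data and matches
# search_vector against websearch_to_tsquery('english', 'some term').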
......@@ -24,6 +24,7 @@ class Issue < ApplicationRecord
include Todoable
include FromUnion
include EachBatch
include PgFullTextSearchable
extend ::Gitlab::Utils::Override
......@@ -77,6 +78,7 @@ class Issue < ApplicationRecord
end
end
has_one :search_data, class_name: 'Issues::SearchData'
has_one :issuable_severity
has_one :sentry_issue
has_one :alert_management_alert, class_name: 'AlertManagement::Alert'
......@@ -102,6 +104,8 @@ class Issue < ApplicationRecord
alias_attribute :external_author, :service_desk_reply_to
pg_full_text_searchable columns: [{ name: 'title', weight: 'A' }, { name: 'description', weight: 'B' }]
scope :in_projects, ->(project_ids) { where(project_id: project_ids) }
scope :not_in_projects, ->(project_ids) { where.not(project_id: project_ids) }
......@@ -233,6 +237,11 @@ class Issue < ApplicationRecord
def order_upvotes_asc
reorder(upvotes_count: :asc)
end
override :pg_full_text_search
def pg_full_text_search(search_term)
super.where('issue_search_data.project_id = issues.project_id')
end
end
def next_object_by_relative_position(ignoring: nil, order: :asc)
......@@ -611,6 +620,11 @@ class Issue < ApplicationRecord
private
override :persist_pg_full_text_search_vector
def persist_pg_full_text_search_vector(search_vector)
Issues::SearchData.upsert({ project_id: project_id, issue_id: id, search_vector: search_vector }, unique_by: %i(project_id issue_id))
end
def spammable_attribute_changed?
title_changed? ||
description_changed? ||
......
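
For reference, a hedged sketch of how the Issue wiring above is exercised from a console; the extra project_id predicate in the override mirrors the hash partitioning of issue_search_data (keyed on project_id), which should let PostgreSQL prune partitions when issues are already scoped to a project:

# Illustrative usage; the ID and term are examples.
project = Project.find(1)
project.issues.pg_full_text_search('flaky spec').limit(20)

# Roughly the SQL produced (simplified):
#   SELECT issues.* FROM issues
#   INNER JOIN issue_search_data ON issue_search_data.issue_id = issues.id
#   WHERE issue_search_data.project_id = issues.project_id
#     AND issue_search_data.search_vector @@ websearch_to_tsquery('english', 'flaky spec')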
# frozen_string_literal: true
module Issues
class SearchData < ApplicationRecord
extend SuppressCompositePrimaryKeyWarning
self.table_name = 'issue_search_data'
belongs_to :issue
end
end
---
name: issues_full_text_search
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/71913
rollout_issue_url:
milestone: '14.5'
type: development
group: group::project management
default_enabled: false
# frozen_string_literal: true
class CreateIssueSearchTable < Gitlab::Database::Migration[1.0]
include Gitlab::Database::PartitioningMigrationHelpers::TableManagementHelpers
def up
execute <<~SQL
CREATE TABLE issue_search_data (
project_id bigint NOT NULL REFERENCES projects(id) ON DELETE CASCADE,
issue_id bigint NOT NULL REFERENCES issues(id) ON DELETE CASCADE,
created_at timestamp with time zone DEFAULT NOW() NOT NULL,
updated_at timestamp with time zone DEFAULT NOW() NOT NULL,
search_vector tsvector,
PRIMARY KEY (project_id, issue_id)
) PARTITION BY HASH (project_id)
SQL
# rubocop: disable Migration/AddIndex
add_index :issue_search_data, :issue_id
add_index :issue_search_data, :search_vector, using: :gin, name: 'index_issue_search_data_on_search_vector'
# rubocop: enable Migration/AddIndex
create_hash_partitions :issue_search_data, 64
end
def down
drop_table :issue_search_data
end
end
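
As a hedged aside on the migration above: create_hash_partitions is expected to create 64 hash partitions of issue_search_data. One way to confirm the layout from a console; the partition naming and schema in the expected output are assumptions, not shown in this diff:

# List the child partitions of issue_search_data from the PostgreSQL catalog.
ActiveRecord::Base.connection.select_values(<<~SQL)
  SELECT inhrelid::regclass::text
  FROM pg_inherits
  WHERE inhparent = 'issue_search_data'::regclass
  ORDER BY 1
SQL
# Expected shape: 64 entries such as "gitlab_partitions_static.issue_search_data_00" .. "_63"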
# frozen_string_literal: true
class BackfillIssueSearchData < Gitlab::Database::Migration[1.0]
MIGRATION = 'BackfillIssueSearchData'
def up
queue_batched_background_migration(
MIGRATION,
:issues,
:id,
batch_size: 100_000,
sub_batch_size: 1_000,
job_interval: 5.minutes
)
end
def down
Gitlab::Database::BackgroundMigration::BatchedMigration
.for_configuration(MIGRATION, :issues, :id, [])
.delete_all
end
end
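
A hedged sketch for checking on the queued batched background migration from a console; for_configuration is the same finder the down method uses, while the attributes read here are assumptions about the BatchedMigration model:

migration = Gitlab::Database::BackgroundMigration::BatchedMigration
  .for_configuration('BackfillIssueSearchData', :issues, :id, [])
  .last

migration&.status      # e.g. active / finished, depending on progress (assumed attribute)
migration&.batch_size  # => 100000, as queued above (assumed attribute)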
# frozen_string_literal: true
# rubocop:disable Style/Documentation
module Gitlab
module BackgroundMigration
# Backfills the new `issue_search_data` table, which contains
# the tsvector from the issue title and description.
class BackfillIssueSearchData
include Gitlab::Database::DynamicModelHelpers
def perform(start_id, stop_id, batch_table, batch_column, sub_batch_size, pause_ms)
define_batchable_model(batch_table, connection: ActiveRecord::Base.connection).where(batch_column => start_id..stop_id).each_batch(of: sub_batch_size) do |sub_batch|
update_search_data(sub_batch)
sleep(pause_ms * 0.001)
rescue ActiveRecord::StatementInvalid => e
raise unless e.cause.is_a?(PG::ProgramLimitExceeded) && e.message.include?('string is too long for tsvector')
update_search_data_individually(sub_batch, pause_ms)
end
end
private
def update_search_data(relation)
relation.klass.connection.execute(
<<~SQL
INSERT INTO issue_search_data (project_id, issue_id, search_vector, created_at, updated_at)
SELECT
project_id,
id,
setweight(to_tsvector('english', LEFT(title, 255)), 'A') || setweight(to_tsvector('english', LEFT(REGEXP_REPLACE(description, '[A-Za-z0-9+/]{50,}', ' ', 'g'), 1048576)), 'B'),
NOW(),
NOW()
FROM issues
WHERE issues.id IN (#{relation.select(:id).to_sql})
ON CONFLICT DO NOTHING
SQL
)
end
def update_search_data_individually(relation, pause_ms)
relation.pluck(:id).each do |issue_id|
update_search_data(relation.klass.where(id: issue_id))
sleep(pause_ms * 0.001)
rescue ActiveRecord::StatementInvalid => e
raise unless e.cause.is_a?(PG::ProgramLimitExceeded) && e.message.include?('string is too long for tsvector')
logger.error(
message: 'Error updating search data: string is too long for tsvector',
class: relation.klass.name,
model_id: issue_id
)
end
end
def logger
@logger ||= Gitlab::BackgroundMigration::Logger.build
end
end
end
end
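
For reference, a minimal sketch of running one batch of the background migration by hand; the argument order matches perform above, and the ID range and pause are illustrative:

# perform(start_id, stop_id, batch_table, batch_column, sub_batch_size, pause_ms)
Gitlab::BackgroundMigration::BackfillIssueSearchData.new.perform(1, 10_000, :issues, :id, 1_000, 100)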
......@@ -274,6 +274,7 @@ issue_emails: :gitlab_main
issue_email_participants: :gitlab_main
issue_links: :gitlab_main
issue_metrics: :gitlab_main
issue_search_data: :gitlab_main
issues: :gitlab_main
issues_prometheus_alert_events: :gitlab_main
issues_self_managed_prometheus_alert_events: :gitlab_main
......
......@@ -144,7 +144,7 @@ module Gitlab
def params_include_filters?
non_filtering_params = %i[
scope state sort group_id include_subgroups
attempt_group_search_optimizations non_archived issue_types
attempt_group_search_optimizations attempt_full_text_search non_archived issue_types
]
finder.params.except(*non_filtering_params).values.any?
......
......@@ -13,7 +13,22 @@ RSpec.describe DashboardController do
end
describe 'GET issues' do
it_behaves_like 'issuables list meta-data', :issue, :issues
context 'when issues_full_text_search is disabled' do
before do
stub_feature_flags(issues_full_text_search: false)
end
it_behaves_like 'issuables list meta-data', :issue, :issues
end
context 'when issues_full_text_search is enabled' do
before do
stub_feature_flags(issues_full_text_search: true)
end
it_behaves_like 'issuables list meta-data', :issue, :issues
end
it_behaves_like 'issuables requiring filter', :issues
end
......
......@@ -72,7 +72,21 @@ RSpec.describe Projects::IssuesController do
project.add_developer(user)
end
it_behaves_like "issuables list meta-data", :issue
context 'when issues_full_text_search is disabled' do
before do
stub_feature_flags(issues_full_text_search: false)
end
it_behaves_like 'issuables list meta-data', :issue
end
context 'when issues_full_text_search is enabled' do
before do
stub_feature_flags(issues_full_text_search: true)
end
it_behaves_like 'issuables list meta-data', :issue
end
it_behaves_like 'set sort order from user preference' do
let(:sorting_param) { 'updated_asc' }
......
......@@ -497,6 +497,8 @@ RSpec.describe 'Filter issues', :js do
end
it 'filters issues by searched text containing special characters' do
stub_feature_flags(issues_full_text_search: false)
issue = create(:issue, project: project, author: user, title: "issue with !@\#{$%^&*()-+")
search = '!@#{$%^&*()-+'
......
......@@ -632,6 +632,29 @@ RSpec.describe IssuesFinder do
end
end
context 'filtering by issue term using full-text search' do
let(:params) { { search: search_term, attempt_full_text_search: true } }
let_it_be(:english) { create(:issue, project: project1, title: 'title', description: 'something english') }
let_it_be(:japanese) { create(:issue, project: project1, title: '日本語 title', description: 'another english description') }
context 'with latin search term' do
let(:search_term) { 'title english' }
it 'returns matching issues' do
expect(issues).to contain_exactly(english, japanese)
end
end
context 'with non-latin search term' do
let(:search_term) { '日本語' }
it 'returns matching issues' do
expect(issues).to contain_exactly(japanese)
end
end
end
context 'filtering by issues iids' do
let(:params) { { iids: [issue3.iid] } }
......
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Gitlab::BackgroundMigration::BackfillIssueSearchData do
let(:namespaces_table) { table(:namespaces) }
let(:projects_table) { table(:projects) }
let(:issue_search_data_table) { table(:issue_search_data) }
let!(:namespace) { namespaces_table.create!(name: 'gitlab-org', path: 'gitlab-org') }
let!(:project) { projects_table.create!(name: 'gitlab', path: 'gitlab-org/gitlab-ce', namespace_id: namespace.id) }
let!(:issues) { Array.new(10) { table(:issues).create!(project_id: project.id, title: 'test title', description: 'test description') } }
let(:migration) { described_class.new }
before do
allow(migration).to receive(:sleep)
end
it 'backfills search data for the specified records' do
# sleeps for every sub-batch
expect(migration).to receive(:sleep).with(0.05).exactly(3).times
migration.perform(issues[0].id, issues[5].id, :issues, :id, 2, 50)
expect(issue_search_data_table.count).to eq(6)
end
it 'skips issues that already have search data' do
old_time = Time.new(2019, 1, 1).in_time_zone
issue_search_data_table.create!(project_id: project.id, issue_id: issues[0].id, updated_at: old_time)
migration.perform(issues[0].id, issues[5].id, :issues, :id, 2, 50)
expect(issue_search_data_table.count).to eq(6)
expect(issue_search_data_table.find_by_issue_id(issues[0].id).updated_at).to be_like_time(old_time)
end
it 'rescues batch with bad data and inserts other rows' do
issues[1].update!(description: Array.new(30_000) { SecureRandom.hex }.join(' '))
expect_next_instance_of(Gitlab::BackgroundMigration::Logger) do |logger|
expect(logger).to receive(:error).with(a_hash_including(message: /string is too long for tsvector/, model_id: issues[1].id))
end
expect { migration.perform(issues[0].id, issues[5].id, :issues, :id, 2, 50) }.not_to raise_error
expect(issue_search_data_table.count).to eq(5)
expect(issue_search_data_table.find_by_issue_id(issues[1].id)).to eq(nil)
end
it 're-raises other errors' do
allow(migration).to receive(:update_search_data).and_raise(ActiveRecord::StatementTimeout)
expect { migration.perform(issues[0].id, issues[5].id, :issues, :id, 2, 50) }.to raise_error(ActiveRecord::StatementTimeout)
end
end
......@@ -34,6 +34,7 @@ issues:
- issuable_severity
- issuable_sla
- issue_assignees
- search_data
- closed_by
- epic_issue
- epic
......@@ -627,6 +628,8 @@ issuable_severity:
issue_assignees:
- issue
- assignee
search_data:
- issue
merge_request_assignees:
- merge_request
- assignee
......
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe PgFullTextSearchable do
let(:project) { create(:project) }
let(:model_class) do
Class.new(ActiveRecord::Base) do
include PgFullTextSearchable
self.table_name = 'issues'
belongs_to :project
has_one :search_data, class_name: 'Issues::SearchData'
def persist_pg_full_text_search_vector(search_vector)
Issues::SearchData.upsert({ project_id: project_id, issue_id: id, search_vector: search_vector }, unique_by: %i(project_id issue_id))
end
def self.name
'Issue'
end
end
end
describe '.pg_full_text_searchable' do
it 'sets pg_full_text_searchable_columns' do
model_class.pg_full_text_searchable columns: [{ name: 'title', weight: 'A' }]
expect(model_class.pg_full_text_searchable_columns).to eq({ 'title' => 'A' })
end
it 'raises an error when called twice' do
model_class.pg_full_text_searchable columns: [{ name: 'title', weight: 'A' }]
expect { model_class.pg_full_text_searchable columns: [{ name: 'title', weight: 'A' }] }.to raise_error('Full text search columns already defined!')
end
end
describe 'after commit hook' do
let(:model) { model_class.create!(project: project) }
before do
model_class.pg_full_text_searchable columns: [{ name: 'title', weight: 'A' }]
end
context 'when specified columns are changed' do
it 'calls update_search_data!' do
expect(model).to receive(:update_search_data!)
model.update!(title: 'A new title')
end
end
context 'when specified columns are not changed' do
it 'does not enqueue worker' do
expect(model).not_to receive(:update_search_data!)
model.update!(description: 'A new description')
end
end
end
describe '.pg_full_text_search' do
let(:english) { model_class.create!(project: project, title: 'title', description: 'something english') }
let(:with_accent) { model_class.create!(project: project, title: 'Jürgen', description: 'Ærøskøbing') }
let(:japanese) { model_class.create!(project: project, title: '日本語 title', description: 'another english description') }
before do
model_class.pg_full_text_searchable columns: [{ name: 'title', weight: 'A' }, { name: 'description', weight: 'B' }]
[english, with_accent, japanese].each(&:update_search_data!)
end
it 'searches across all fields' do
expect(model_class.pg_full_text_search('title english')).to contain_exactly(english, japanese)
end
it 'searches for exact term with quotes' do
expect(model_class.pg_full_text_search('"something english"')).to contain_exactly(english)
end
it 'ignores accents' do
expect(model_class.pg_full_text_search('jurgen')).to contain_exactly(with_accent)
end
it 'does not support searching by non-Latin characters' do
expect(model_class.pg_full_text_search('日本')).to be_empty
end
end
describe '#update_search_data!' do
let(:model) { model_class.create!(project: project, title: 'title', description: 'description') }
before do
model_class.pg_full_text_searchable columns: [{ name: 'title', weight: 'A' }, { name: 'description', weight: 'B' }]
end
it 'sets the correct weights' do
model.update_search_data!
expect(model.search_data.search_vector).to match(/'titl':1A/)
expect(model.search_data.search_vector).to match(/'descript':2B/)
end
context 'with accented and non-Latin characters' do
let(:model) { model_class.create!(project: project, title: '日本語', description: 'Jürgen') }
it 'transliterates accented characters and removes non-Latin ones' do
model.update_search_data!
expect(model.search_data.search_vector).not_to match(/日本語/)
expect(model.search_data.search_vector).to match(/jurgen/)
end
end
context 'when upsert times out' do
it 're-raises the exception' do
expect(Issues::SearchData).to receive(:upsert).once.and_raise(ActiveRecord::StatementTimeout)
expect { model.update_search_data! }.to raise_error(ActiveRecord::StatementTimeout)
end
end
context 'with strings that go over tsvector limit', :delete do
let(:long_string) { Array.new(30_000) { SecureRandom.hex }.join(' ') }
let(:model) { model_class.create!(project: project, title: 'title', description: long_string) }
it 'does not raise an exception' do
expect(Gitlab::AppJsonLogger).to receive(:error).with(
a_hash_including(class: model_class.name, model_id: model.id)
)
expect { model.update_search_data! }.not_to raise_error
expect(model.search_data).to eq(nil)
end
end
context 'when model class does not implement persist_pg_full_text_search_vector' do
let(:model_class) do
Class.new(ActiveRecord::Base) do
include PgFullTextSearchable
self.table_name = 'issues'
belongs_to :project
has_one :search_data, class_name: 'Issues::SearchData'
def self.name
'Issue'
end
end
end
it 'raises an error' do
expect { model.update_search_data! }.to raise_error(NotImplementedError)
end
end
end
end