Commit 33253988 authored by Heinrich Lee Yu

Partition issue search data table

parent 5dc6368c
......@@ -11,7 +11,8 @@
# pg_full_text_searchable columns: [{ name: 'title', weight: 'A' }, { name: 'description', weight: 'B' }]
#
# This module sets up an after_commit hook that updates the search data
# when the searchable columns are changed.
# when the searchable columns are changed. You will need to implement the
# `#persist_pg_full_text_search_vector` method that does the actual insert or update.
#
# This also adds a `pg_full_text_search` scope so you can do:
#
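# A usage sketch (illustrative, not part of this diff; `Model` stands in for any
# class that includes this concern):
#
#   Model.pg_full_text_search('some search term')
#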
......@@ -29,8 +30,7 @@ module PgFullTextSearchable
tsvector_arel_node(column, weight)&.to_sql
end
association = self.class.reflect_on_association(:search_data)
association.klass.upsert({ association.foreign_key => id, search_vector: Arel.sql(tsvector_sql_nodes.compact.join(' || ')) })
persist_pg_full_text_search_vector(Arel.sql(tsvector_sql_nodes.compact.join(' || ')))
rescue ActiveRecord::StatementInvalid => e
raise unless e.cause.is_a?(PG::ProgramLimitExceeded) && e.message.include?('string is too long for tsvector')
......@@ -43,6 +43,10 @@ module PgFullTextSearchable
private
def persist_pg_full_text_search_vector(search_vector)
raise NotImplementedError
end
def tsvector_arel_node(column, weight)
return if self[column].blank?
......@@ -76,6 +80,12 @@ module PgFullTextSearchable
pg_full_text_searchable_columns[column[:name]] = column[:weight]
end
# We update this outside the transaction because this could raise an error if the resulting tsvector
# is too long. When that happens, we still persist the create / update but the model will not have a
# search data record. This is acceptable in most cases because it is a very rare occurrence and only happens
# with strings that are most likely unsearchable anyway.
#
# We also do not want to use a subtransaction here due to: https://gitlab.com/groups/gitlab-org/-/epics/6540
after_save_commit do
next unless pg_full_text_searchable_columns.keys.any? { |f| saved_changes.has_key?(f) }
......
......@@ -237,6 +237,11 @@ class Issue < ApplicationRecord
def order_upvotes_asc
reorder(upvotes_count: :asc)
end
override :pg_full_text_search
def pg_full_text_search(search_term)
super.where('issue_search_data.project_id = issues.project_id')
end
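# Illustrative usage sketch, not part of this diff (`project` below is a
# hypothetical Project record): filtering issues by project first ties the
# join condition above to a constant project_id, which should let PostgreSQL
# prune the hash-partitioned issue_search_data table to a single partition.
#
#   Issue.where(project_id: project.id).pg_full_text_search('flaky spec')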
end
def next_object_by_relative_position(ignoring: nil, order: :asc)
......@@ -615,6 +620,11 @@ class Issue < ApplicationRecord
private
override :persist_pg_full_text_search_vector
def persist_pg_full_text_search_vector(search_vector)
Issues::SearchData.upsert({ project_id: project_id, issue_id: id, search_vector: search_vector }, unique_by: %i(project_id issue_id))
end
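# Editorial note, not part of this diff: because `unique_by: %i(project_id issue_id)`
# matches the composite primary key of the partitioned table, Rails' `upsert`
# here compiles to roughly
#   INSERT INTO issue_search_data (...) VALUES (...)
#     ON CONFLICT (project_id, issue_id) DO UPDATE SET search_vector = EXCLUDED.search_vector
# so an existing search data row is updated in place rather than raising.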
def spammable_attribute_changed?
title_changed? ||
description_changed? ||
......
......@@ -2,6 +2,8 @@
module Issues
class SearchData < ApplicationRecord
extend SuppressCompositePrimaryKeyWarning
self.table_name = 'issue_search_data'
belongs_to :issue
......
# frozen_string_literal: true
class CreateIssueSearchTable < Gitlab::Database::Migration[1.0]
def change
create_table :issue_search_data, id: false do |t|
t.references :issue, index: false, default: nil, primary_key: true, foreign_key: { on_delete: :cascade }, type: :bigint
t.timestamps_with_timezone default: -> { 'CURRENT_TIMESTAMP' }
t.tsvector :search_vector
t.index :search_vector, using: :gin, name: 'index_issue_search_data_on_search_vector'
end
include Gitlab::Database::PartitioningMigrationHelpers::TableManagementHelpers
def up
execute <<~SQL
CREATE TABLE issue_search_data (
project_id bigint NOT NULL REFERENCES projects(id) ON DELETE CASCADE,
issue_id bigint NOT NULL REFERENCES issues(id) ON DELETE CASCADE,
created_at timestamp with time zone DEFAULT NOW() NOT NULL,
updated_at timestamp with time zone DEFAULT NOW() NOT NULL,
search_vector tsvector,
PRIMARY KEY (project_id, issue_id)
) PARTITION BY HASH (project_id)
SQL
# rubocop: disable Migration/AddIndex
add_index :issue_search_data, :issue_id
add_index :issue_search_data, :search_vector, using: :gin, name: 'index_issue_search_data_on_search_vector'
# rubocop: enable Migration/AddIndex
create_hash_partitions :issue_search_data, 64
end
def down
drop_table :issue_search_data
end
end
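For context, `create_hash_partitions :issue_search_data, 64` is what actually creates the 64 child tables behind the partitioned parent declared above. A rough sketch of the DDL for one partition, assuming the usual GitLab convention of a `gitlab_partitions_static` schema and zero-padded suffixes (the helper controls the exact naming):

execute <<~SQL
  CREATE TABLE gitlab_partitions_static.issue_search_data_00
    PARTITION OF issue_search_data
    FOR VALUES WITH (MODULUS 64, REMAINDER 0)
SQL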
(diff omitted: too large to display)
......@@ -25,8 +25,9 @@ module Gitlab
def update_search_data(relation)
relation.klass.connection.execute(
<<~SQL
INSERT INTO issue_search_data (issue_id, search_vector, created_at, updated_at)
INSERT INTO issue_search_data (project_id, issue_id, search_vector, created_at, updated_at)
SELECT
project_id,
id,
setweight(to_tsvector('english', LEFT(title, 255)), 'A') || setweight(to_tsvector('english', LEFT(REGEXP_REPLACE(description, '[A-Za-z0-9+/]{50,}', ' ', 'g'), 1048576)), 'B'),
NOW(),
......
......@@ -3,12 +3,20 @@
require 'spec_helper'
RSpec.describe Gitlab::BackgroundMigration::BackfillIssueSearchData do
let(:issues_table) { table(:issues) }
let(:namespaces_table) { table(:namespaces) }
let(:projects_table) { table(:projects) }
let(:issue_search_data_table) { table(:issue_search_data) }
let!(:issues) { Array.new(10) { issues_table.create!(title: 'test title', description: 'test description') } }
let!(:namespace) { namespaces_table.create!(name: 'gitlab-org', path: 'gitlab-org') }
let!(:project) { projects_table.create!(name: 'gitlab', path: 'gitlab-org/gitlab-ce', namespace_id: namespace.id) }
let!(:issues) { Array.new(10) { table(:issues).create!(project_id: project.id, title: 'test title', description: 'test description') } }
let(:migration) { described_class.new }
before do
allow(migration).to receive(:sleep)
end
it 'backfills search data for the specified records' do
# sleeps for every sub-batch
expect(migration).to receive(:sleep).with(0.05).exactly(3).times
......@@ -20,12 +28,12 @@ RSpec.describe Gitlab::BackgroundMigration::BackfillIssueSearchData do
it 'skips issues that already have search data' do
old_time = Time.new(2019, 1, 1).in_time_zone
issue_search_data_table.create!(issue_id: issues[0].id, updated_at: old_time)
issue_search_data_table.create!(project_id: project.id, issue_id: issues[0].id, updated_at: old_time)
migration.perform(issues[0].id, issues[5].id, :issues, :id, 2, 50)
expect(issue_search_data_table.count).to eq(6)
expect(issue_search_data_table.find(issues[0].id).updated_at).to be_like_time(old_time)
expect(issue_search_data_table.find_by_issue_id(issues[0].id).updated_at).to be_like_time(old_time)
end
it 'rescues batch with bad data and inserts other rows' do
......
......@@ -3,14 +3,21 @@
require 'spec_helper'
RSpec.describe PgFullTextSearchable do
let(:project) { create(:project) }
let(:model_class) do
Class.new(ActiveRecord::Base) do
include PgFullTextSearchable
self.table_name = 'issues'
belongs_to :project
has_one :search_data, class_name: 'Issues::SearchData'
def persist_pg_full_text_search_vector(search_vector)
Issues::SearchData.upsert({ project_id: project_id, issue_id: id, search_vector: search_vector }, unique_by: %i(project_id issue_id))
end
def self.name
'Issue'
end
......@@ -32,7 +39,7 @@ RSpec.describe PgFullTextSearchable do
end
describe 'after commit hook' do
let(:model) { model_class.create! }
let(:model) { model_class.create!(project: project) }
before do
model_class.pg_full_text_searchable columns: [{ name: 'title', weight: 'A' }]
......@@ -56,9 +63,9 @@ RSpec.describe PgFullTextSearchable do
end
describe '.pg_full_text_search' do
let(:english) { model_class.create!(title: 'title', description: 'something english') }
let(:with_accent) { model_class.create!(title: 'Jürgen', description: 'Ærøskøbing') }
let(:japanese) { model_class.create!(title: '日本語 title', description: 'another english description') }
let(:english) { model_class.create!(project: project, title: 'title', description: 'something english') }
let(:with_accent) { model_class.create!(project: project, title: 'Jürgen', description: 'Ærøskøbing') }
let(:japanese) { model_class.create!(project: project, title: '日本語 title', description: 'another english description') }
before do
model_class.pg_full_text_searchable columns: [{ name: 'title', weight: 'A' }, { name: 'description', weight: 'B' }]
......@@ -84,7 +91,7 @@ RSpec.describe PgFullTextSearchable do
end
describe '#update_search_data!' do
let(:model) { model_class.create!(title: 'title', description: 'description') }
let(:model) { model_class.create!(project: project, title: 'title', description: 'description') }
before do
model_class.pg_full_text_searchable columns: [{ name: 'title', weight: 'A' }, { name: 'description', weight: 'B' }]
......@@ -98,7 +105,7 @@ RSpec.describe PgFullTextSearchable do
end
context 'with accented and non-Latin characters' do
let(:model) { model_class.create!(title: '日本語', description: 'Jürgen') }
let(:model) { model_class.create!(project: project, title: '日本語', description: 'Jürgen') }
it 'transliterates accented characters and removes non-Latin ones' do
model.update_search_data!
......@@ -118,7 +125,7 @@ RSpec.describe PgFullTextSearchable do
context 'with strings that go over tsvector limit', :delete do
let(:long_string) { Array.new(30_000) { SecureRandom.hex }.join(' ') }
let(:model) { model_class.create!(title: 'title', description: long_string) }
let(:model) { model_class.create!(project: project, title: 'title', description: long_string) }
it 'does not raise an exception' do
expect(Gitlab::AppJsonLogger).to receive(:error).with(
......@@ -130,5 +137,26 @@ RSpec.describe PgFullTextSearchable do
expect(model.search_data).to eq(nil)
end
end
context 'when model class does not implement persist_pg_full_text_search_vector' do
let(:model_class) do
Class.new(ActiveRecord::Base) do
include PgFullTextSearchable
self.table_name = 'issues'
belongs_to :project
has_one :search_data, class_name: 'Issues::SearchData'
def self.name
'Issue'
end
end
end
it 'raises an error' do
expect { model.update_search_data! }.to raise_error(NotImplementedError)
end
end
end
end