Commit 8873b61f authored by Adam Hegyi's avatar Adam Hegyi

Merge branch 'rudimentary-similarity-sorting' into 'master'

Similarity sorting in group projects API

See merge request gitlab-org/gitlab!37300
parents 9054609e a609fdea
......@@ -451,6 +451,16 @@ class Project < ApplicationRecord
# Sometimes queries (e.g. using CTEs) require explicit disambiguation with table name
scope :projects_order_id_desc, -> { reorder(self.arel_table['id'].desc) }
scope :sorted_by_similarity_desc, -> (search) do
order_expression = Gitlab::Database::SimilarityScore.build_expression(search: search, rules: [
{ column: arel_table["path"], multiplier: 1 },
{ column: arel_table["name"], multiplier: 0.7 },
{ column: arel_table["description"], multiplier: 0.2 }
])
reorder(order_expression.desc, arel_table['id'].desc)
end
scope :with_packages, -> { joins(:packages) }
scope :in_namespace, ->(namespace_ids) { where(namespace_id: namespace_ids) }
scope :personal, ->(user) { where(namespace_id: user.namespace_id) }
......
......@@ -184,7 +184,7 @@ Parameters:
| `id` | integer/string | yes | The ID or [URL-encoded path of the group](README.md#namespaced-path-encoding) owned by the authenticated user |
| `archived` | boolean | no | Limit by archived status |
| `visibility` | string | no | Limit by visibility `public`, `internal`, or `private` |
| `order_by` | string | no | Return projects ordered by `id`, `name`, `path`, `created_at`, `updated_at`, or `last_activity_at` fields. Default is `created_at` |
| `order_by` | string | no | Return projects ordered by `id`, `name`, `path`, `created_at`, `updated_at`, `similarity` (1), or `last_activity_at` fields. Default is `created_at` |
| `sort` | string | no | Return projects sorted in `asc` or `desc` order. Default is `desc` |
| `search` | string | no | Return list of authorized projects matching the search criteria |
| `simple` | boolean | no | Return only the ID, URL, name, and path of each project |
......@@ -198,6 +198,13 @@ Parameters:
| `with_custom_attributes` | boolean | no | Include [custom attributes](custom_attributes.md) in response (admins only) |
| `with_security_reports` | boolean | no | **(ULTIMATE)** Return only projects that have security reports artifacts present in any of their builds. This means "projects with security reports enabled". Default is `false` |
1. Order by similarity: Orders the results by a similarity score calculated from the provided `search`
URL parameter. This is an [alpha](https://about.gitlab.com/handbook/product/gitlab-the-product/#alpha) feature [introduced](https://gitlab.com/gitlab-org/gitlab/-/issues/221043) in GitLab 13.3.
The feature is behind a feature flag, you can [enable it](../administration/feature_flags.md#enable-or-disable-the-feature)
with the `similarity_search` flag. When using `order_by=similarity` the `sort` parameter is
ignored. When the `search` parameter is not provided, the API returns the projects ordered by `name`.
Example response:
```json
......
......@@ -76,7 +76,7 @@ module API
params: project_finder_params,
options: finder_options
).execute
projects = reorder_projects(projects)
projects = reorder_projects_with_similarity_order_support(group, projects)
paginate(projects)
end
......@@ -112,6 +112,24 @@ module API
accepted!
end
def reorder_projects_with_similarity_order_support(group, projects)
return handle_similarity_order(group, projects) if params[:order_by] == 'similarity'
reorder_projects(projects)
end
# rubocop: disable CodeReuse/ActiveRecord
def handle_similarity_order(group, projects)
if params[:search].present? && Feature.enabled?(:similarity_search, group)
projects.sorted_by_similarity_desc(params[:search])
else
order_options = { name: :asc }
order_options['id'] ||= params[:sort] || 'asc'
projects.reorder(order_options)
end
end
# rubocop: enable CodeReuse/ActiveRecord
end
resource :groups do
......@@ -222,7 +240,7 @@ module API
optional :visibility, type: String, values: Gitlab::VisibilityLevel.string_values,
desc: 'Limit by visibility'
optional :search, type: String, desc: 'Return list of authorized projects matching the search criteria'
optional :order_by, type: String, values: %w[id name path created_at updated_at last_activity_at],
optional :order_by, type: String, values: %w[id name path created_at updated_at last_activity_at similarity],
default: 'created_at', desc: 'Return projects ordered by field'
optional :sort, type: String, values: %w[asc desc], default: 'desc',
desc: 'Return projects sorted in ascending and descending order'
......
# frozen_string_literal: true
module Gitlab
module Database
class SimilarityScore
EMPTY_STRING = Arel.sql("''").freeze
EXPRESSION_ON_INVALID_INPUT = Arel::Nodes::NamedFunction.new('CAST', [Arel.sql('0').as('integer')]).freeze
DEFAULT_MULTIPLIER = 1
# This method returns an Arel expression that can be used in an ActiveRecord query to order the resultset by similarity.
#
# Note: Calculating similarity score for large volume of records is inefficient. use SimilarityScore only for smaller
# resultset which is already filtered by other conditions (< 10_000 records).
#
# ==== Parameters
# * +search+ - [String] the user provided search string
# * +rules+ - [{ column: COLUMN, multiplier: 1 }, { column: COLUMN_2, multiplier: 0.5 }] rules for the scoring.
# * +column+ - Arel column expression, example: Project.arel_table["name"]
# * +multiplier+ - Integer or Float to increase or decrease the score (optional, defaults to 1)
#
# ==== Use case
#
# We'd like to search for projects by path, name and description. We want to rank higher the path and name matches, since
# it's more likely that the user was remembering the path or the name of the project.
#
# Rules:
# [
# { column: Project.arel_table['path'], multiplier: 1 },
# { column: Project.arel_table['name'], multiplier: 1 },
# { column: Project.arel_table['description'], multiplier: 0.5 }
# ]
#
# ==== Examples
#
# Similarity calculation based on one column:
#
# Gitlab::Database::SimilarityScore.build_expession(search: 'my input', rules: [{ column: Project.arel_table['name'] }])
#
# Similarity calculation based on two column, where the second column has lower priority:
#
# Gitlab::Database::SimilarityScore.build_expession(search: 'my input', rules: [
# { column: Project.arel_table['name'], multiplier: 1 },
# { column: Project.arel_table['description'], multiplier: 0.5 }
# ])
#
# Integration with an ActiveRecord query:
#
# table = Project.arel_table
#
# order_expression = Gitlab::Database::SimilarityScore.build_expession(search: 'input', rules: [
# { column: table['name'], multiplier: 1 },
# { column: table['description'], multiplier: 0.5 }
# ])
#
# Project.where("name LIKE ?", '%' + 'input' + '%').order(order_expression.desc)
#
# The expression can be also used in SELECT:
#
# results = Project.select(order_expression.as('similarity')).where("name LIKE ?", '%' + 'input' + '%').order(similarity: :desc)
# puts results.map(&:similarity)
#
def self.build_expression(search:, rules:)
return EXPRESSION_ON_INVALID_INPUT if search.blank? || rules.empty?
quoted_search = ActiveRecord::Base.connection.quote(search.to_s)
first_expression, *expressions = rules.map do |rule|
rule_to_arel(quoted_search, rule)
end
# (SIMILARITY ...) + (SIMILARITY ...)
expressions.inject(first_expression) do |expression1, expression2|
Arel::Nodes::Addition.new(expression1, expression2)
end
end
# (SIMILARITY(COALESCE(column, ''), 'search_string') * CAST(multiplier AS numeric))
def self.rule_to_arel(search, rule)
Arel::Nodes::Grouping.new(
Arel::Nodes::Multiplication.new(
similarity_function_call(search, column_expression(rule)),
multiplier_expression(rule)
)
)
end
# COALESCE(column, '')
def self.column_expression(rule)
Arel::Nodes::NamedFunction.new('COALESCE', [rule.fetch(:column), EMPTY_STRING])
end
# SIMILARITY(COALESCE(column, ''), 'search_string')
def self.similarity_function_call(search, column)
Arel::Nodes::NamedFunction.new('SIMILARITY', [column, Arel.sql(search)])
end
# CAST(multiplier AS numeric)
def self.multiplier_expression(rule)
quoted_multiplier = ActiveRecord::Base.connection.quote(rule.fetch(:multiplier, DEFAULT_MULTIPLIER).to_s)
Arel::Nodes::NamedFunction.new('CAST', [Arel.sql(quoted_multiplier).as('numeric')])
end
private_class_method :rule_to_arel
private_class_method :column_expression
private_class_method :similarity_function_call
private_class_method :multiplier_expression
end
end
end
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Gitlab::Database::SimilarityScore do
let(:search) { '' }
let(:query_result) { ActiveRecord::Base.connection.execute(query).to_a }
let(:query) do
# In memory query, with the id as the tie breaker.
<<-SQL
SELECT *, #{order_expression} AS similarity
FROM (
VALUES (1, 'Git', 'git', 'git source code mirror. this is a publish-only repository.'),
(2, 'GitLab Runner', 'gitlab-runner', 'official helm chart for the gitlab runner'),
(3, 'gitaly', 'gitaly', 'gitaly is a git rpc service for handling all the git calls made by gitlab'),
(4, 'GitLab', 'gitlab', 'gitlab is an open source end-to-end software development platform with built-in version control'),
(5, 'Gitlab Danger', 'gitlab-danger', 'this gem provides common dangerfile and plugins for gitlab projects'),
(6, 'different', 'same', 'same'),
(7, 'same', 'different', 'same'),
(8, 'gitlab-styles', 'gitlab-styles', 'gitlab style guides and shared style configs.'),
(9, '🔒 gitaly', 'gitaly-sec', 'security mirror for gitaly')
) tbl (id, name, path, descrption) ORDER BY #{order_expression} DESC, id DESC;
SQL
end
let(:order_expression) do
Gitlab::Database::SimilarityScore.build_expression(search: search, rules: [{ column: Arel.sql('path') }]).to_sql
end
subject { query_result.take(3).map { |row| row['path'] } }
context 'when passing empty values' do
context 'when search is nil' do
let(:search) { nil }
it 'orders by a constant 0 value' do
expect(query).to include('ORDER BY CAST(0 AS integer) DESC')
end
end
context 'when rules are empty' do
let(:search) { 'text' }
let(:order_expression) do
Gitlab::Database::SimilarityScore.build_expression(search: search, rules: []).to_sql
end
it 'orders by a constant 0 value' do
expect(query).to include('ORDER BY CAST(0 AS integer) DESC')
end
end
end
context 'when similarity scoring based on the path' do
let(:search) { 'git' }
context 'when searching for `git`' do
let(:search) { 'git' }
it { expect(subject).to eq(%w[git gitlab gitaly]) }
end
context 'when searching for `gitlab`' do
let(:search) { 'gitlab' }
it { expect(subject).to eq(%w[gitlab gitlab-styles gitlab-danger]) }
end
context 'when searching for something unrelated' do
let(:search) { 'xyz' }
it 'results have 0 similarity score' do
expect(query_result.map { |row| row['similarity'] }).to all(eq(0))
end
end
end
describe 'score multiplier' do
let(:order_expression) do
Gitlab::Database::SimilarityScore.build_expression(search: search, rules: [
{ column: Arel.sql('path'), multiplier: 1 },
{ column: Arel.sql('name'), multiplier: 0.8 }
]).to_sql
end
let(:search) { 'different' }
it 'ranks `path` matches higher' do
expect(subject).to eq(%w[different same gitlab-danger])
end
end
end
......@@ -860,6 +860,66 @@ RSpec.describe API::Groups do
end
end
context 'with similarity ordering' do
let_it_be(:group_with_projects) { create(:group) }
let_it_be(:project_1) { create(:project, name: 'Project', path: 'project', group: group_with_projects) }
let_it_be(:project_2) { create(:project, name: 'Test Project', path: 'test-project', group: group_with_projects) }
let_it_be(:project_3) { create(:project, name: 'Test', path: 'test', group: group_with_projects) }
let(:params) { { order_by: 'similarity', search: 'test' } }
subject { get api("/groups/#{group_with_projects.id}/projects", user1), params: params }
before do
group_with_projects.add_owner(user1)
end
it 'returns items based ordered by similarity' do
subject
expect(response).to have_gitlab_http_status(:ok)
expect(response).to include_pagination_headers
expect(json_response.length).to eq(2)
project_names = json_response.map { |proj| proj['name'] }
expect(project_names).to eq(['Test', 'Test Project'])
end
context 'when `search` parameter is not given' do
before do
params.delete(:search)
end
it 'returns items ordered by name' do
subject
expect(response).to have_gitlab_http_status(:ok)
expect(response).to include_pagination_headers
expect(json_response.length).to eq(3)
project_names = json_response.map { |proj| proj['name'] }
expect(project_names).to eq(['Project', 'Test', 'Test Project'])
end
end
context 'when `similarity_search` feature flag is off' do
before do
stub_feature_flags(similarity_search: false)
end
it 'returns items ordered by name' do
subject
expect(response).to have_gitlab_http_status(:ok)
expect(response).to include_pagination_headers
expect(json_response.length).to eq(2)
project_names = json_response.map { |proj| proj['name'] }
expect(project_names).to eq(['Test', 'Test Project'])
end
end
end
it "returns the group's projects with simple representation" do
get api("/groups/#{group1.id}/projects", user1), params: { simple: true }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment