Commit 237c8f66 authored by Yorick Peterse's avatar Yorick Peterse

Precalculate trending projects

This commit introduces a Sidekiq worker that precalculates the list of
trending projects on a daily basis. The resulting set is stored in a
database table that is then queried by Project.trending.

This setup means that Unicorn workers no longer _may_ have to calculate
the list of trending projects. Furthermore it supports filtering without
any complex caching mechanisms.

The data in the "trending_projects" table is inserted in the same order
as the project ranking. This means that getting the projects in the
correct order is simply a matter of:

    SELECT projects.*
    FROM projects
    INNER JOIN trending_projects ON trending_projects.project_id = projects.id
    ORDER BY trending_projects.id ASC;

Such a query will only take a few milliseconds at most (as measured on
GitLab.com), opposed to a few seconds for the query used for calculating
the project ranks.

The migration in this commit does not require downtime and takes care of
populating an initial list of trending projects.
parent 434d98b2
...@@ -21,8 +21,7 @@ class Explore::ProjectsController < Explore::ApplicationController ...@@ -21,8 +21,7 @@ class Explore::ProjectsController < Explore::ApplicationController
end end
def trending def trending
@projects = TrendingProjectsFinder.new.execute @projects = filter_projects(Project.trending)
@projects = filter_projects(@projects)
@projects = @projects.page(params[:page]) @projects = @projects.page(params[:page])
respond_to do |format| respond_to do |format|
......
# Finder for retrieving public trending projects in a given time range.
class TrendingProjectsFinder
# current_user - The currently logged in User, if any.
# last_months - The number of months to limit the trending data to.
def execute(months_limit = 1)
Rails.cache.fetch(cache_key_for(months_limit), expires_in: 1.day) do
Project.public_only.trending(months_limit.months.ago)
end
end
private
def cache_key_for(months)
"trending_projects/#{months}"
end
end
...@@ -375,19 +375,9 @@ class Project < ActiveRecord::Base ...@@ -375,19 +375,9 @@ class Project < ActiveRecord::Base
%r{(?<project>#{name_pattern}/#{name_pattern})} %r{(?<project>#{name_pattern}/#{name_pattern})}
end end
def trending(since = 1.month.ago) def trending
# By counting in the JOIN we don't expose the GROUP BY to the outer query. joins('INNER JOIN trending_projects ON projects.id = trending_projects.project_id').
# This means that calls such as "any?" and "count" just return a number of reorder('trending_projects.id ASC')
# the total count, instead of the counts grouped per project as a Hash.
join_body = "INNER JOIN (
SELECT project_id, COUNT(*) AS amount
FROM notes
WHERE created_at >= #{sanitize(since)}
AND system IS FALSE
GROUP BY project_id
) join_note_counts ON projects.id = join_note_counts.project_id"
joins(join_body).reorder('join_note_counts.amount DESC')
end end
def cached_count def cached_count
......
class TrendingProject < ActiveRecord::Base
belongs_to :project
# The number of months to include in the trending calculation.
MONTHS_TO_INCLUDE = 1
# The maximum number of projects to include in the trending set.
PROJECTS_LIMIT = 100
# Populates the trending projects table with the current list of trending
# projects.
def self.refresh!
# The calculation **must** run in a transaction. If the removal of data and
# insertion of new data were to run separately a user might end up with an
# empty list of trending projects for a short period of time.
transaction do
delete_all
timestamp = connection.quote(MONTHS_TO_INCLUDE.months.ago)
connection.execute <<-EOF.strip_heredoc
INSERT INTO #{table_name} (project_id)
SELECT project_id
FROM notes
INNER JOIN projects ON projects.id = notes.project_id
WHERE notes.created_at >= #{timestamp}
AND notes.system IS FALSE
AND projects.visibility_level = #{Gitlab::VisibilityLevel::PUBLIC}
GROUP BY project_id
ORDER BY count(*) DESC
LIMIT #{PROJECTS_LIMIT};
EOF
end
end
end
class TrendingProjectsWorker
include Sidekiq::Worker
sidekiq_options queue: :trending_projects
def perform
Rails.logger.info('Refreshing trending projects')
TrendingProject.refresh!
end
end
...@@ -304,6 +304,10 @@ Settings.cron_jobs['prune_old_events_worker'] ||= Settingslogic.new({}) ...@@ -304,6 +304,10 @@ Settings.cron_jobs['prune_old_events_worker'] ||= Settingslogic.new({})
Settings.cron_jobs['prune_old_events_worker']['cron'] ||= '* */6 * * *' Settings.cron_jobs['prune_old_events_worker']['cron'] ||= '* */6 * * *'
Settings.cron_jobs['prune_old_events_worker']['job_class'] = 'PruneOldEventsWorker' Settings.cron_jobs['prune_old_events_worker']['job_class'] = 'PruneOldEventsWorker'
Settings.cron_jobs['trending_projects_worker'] ||= Settingslogic.new({})
Settings.cron_jobs['trending_projects_worker']['cron'] = '0 1 * * *'
Settings.cron_jobs['trending_projects_worker']['job_class'] = 'TrendingProjectsWorker'
# #
# GitLab Shell # GitLab Shell
# #
......
# See http://doc.gitlab.com/ce/development/migration_style_guide.html
# for more information on how to write migrations for GitLab.
class PrecalculateTrendingProjects < ActiveRecord::Migration
include Gitlab::Database::MigrationHelpers
DOWNTIME = false
def up
create_table :trending_projects do |t|
t.references :project, index: true, foreign_key: { on_delete: :cascade }, null: false
end
timestamp = connection.quote(1.month.ago)
# We're hardcoding the visibility level (public) here so that if it ever
# changes this query doesn't suddenly use the new value (which may break
# later migrations).
visibility = 20
execute <<-EOF.strip_heredoc
INSERT INTO trending_projects (project_id)
SELECT project_id
FROM notes
INNER JOIN projects ON projects.id = notes.project_id
WHERE notes.created_at >= #{timestamp}
AND notes.system IS FALSE
AND projects.visibility_level = #{visibility}
GROUP BY project_id
ORDER BY count(*) DESC
LIMIT 100;
EOF
end
def down
drop_table :trending_projects
end
end
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
# #
# It's strongly recommended that you check this file into your version control system. # It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 20160926145521) do ActiveRecord::Schema.define(version: 20161007133303) do
# These are extensions that must be enabled in order to support this database # These are extensions that must be enabled in order to support this database
enable_extension "plpgsql" enable_extension "plpgsql"
...@@ -1070,6 +1070,12 @@ ActiveRecord::Schema.define(version: 20160926145521) do ...@@ -1070,6 +1070,12 @@ ActiveRecord::Schema.define(version: 20160926145521) do
add_index "todos", ["target_type", "target_id"], name: "index_todos_on_target_type_and_target_id", using: :btree add_index "todos", ["target_type", "target_id"], name: "index_todos_on_target_type_and_target_id", using: :btree
add_index "todos", ["user_id"], name: "index_todos_on_user_id", using: :btree add_index "todos", ["user_id"], name: "index_todos_on_user_id", using: :btree
create_table "trending_projects", force: :cascade do |t|
t.integer "project_id", null: false
end
add_index "trending_projects", ["project_id"], name: "index_trending_projects_on_project_id", using: :btree
create_table "u2f_registrations", force: :cascade do |t| create_table "u2f_registrations", force: :cascade do |t|
t.text "certificate" t.text "certificate"
t.string "key_handle" t.string "key_handle"
...@@ -1212,5 +1218,6 @@ ActiveRecord::Schema.define(version: 20160926145521) do ...@@ -1212,5 +1218,6 @@ ActiveRecord::Schema.define(version: 20160926145521) do
add_foreign_key "personal_access_tokens", "users" add_foreign_key "personal_access_tokens", "users"
add_foreign_key "protected_branch_merge_access_levels", "protected_branches" add_foreign_key "protected_branch_merge_access_levels", "protected_branches"
add_foreign_key "protected_branch_push_access_levels", "protected_branches" add_foreign_key "protected_branch_push_access_levels", "protected_branches"
add_foreign_key "trending_projects", "projects", on_delete: :cascade
add_foreign_key "u2f_registrations", "users" add_foreign_key "u2f_registrations", "users"
end end
...@@ -800,32 +800,14 @@ describe Project, models: true do ...@@ -800,32 +800,14 @@ describe Project, models: true do
end end
create(:note_on_commit, project: project2) create(:note_on_commit, project: project2)
end
describe 'without an explicit start date' do
subject { described_class.trending.to_a }
it 'sorts Projects by the amount of notes in descending order' do TrendingProject.refresh!
expect(subject).to eq([project1, project2])
end
end end
describe 'with an explicit start date' do subject { described_class.trending.to_a }
let(:date) { 2.months.ago }
subject { described_class.trending(date).to_a } it 'sorts projects by the amount of notes in descending order' do
expect(subject).to eq([project1, project2])
before do
2.times do
# Little fix for special issue related to Fractional Seconds support for MySQL.
# See: https://github.com/rails/rails/pull/14359/files
create(:note_on_commit, project: project2, created_at: date + 1)
end
end
it 'sorts Projects by the amount of notes in descending order' do
expect(subject).to eq([project2, project1])
end
end end
it 'does not take system notes into account' do it 'does not take system notes into account' do
......
require 'spec_helper' require 'spec_helper'
describe TrendingProjectsFinder do describe TrendingProject do
let(:user) { create(:user) } let(:user) { create(:user) }
let(:public_project1) { create(:empty_project, :public) } let(:public_project1) { create(:empty_project, :public) }
let(:public_project2) { create(:empty_project, :public) } let(:public_project2) { create(:empty_project, :public) }
let(:public_project3) { create(:empty_project, :public) }
let(:private_project) { create(:empty_project, :private) } let(:private_project) { create(:empty_project, :private) }
let(:internal_project) { create(:empty_project, :internal) } let(:internal_project) { create(:empty_project, :internal) }
...@@ -13,36 +14,43 @@ describe TrendingProjectsFinder do ...@@ -13,36 +14,43 @@ describe TrendingProjectsFinder do
end end
2.times do 2.times do
create(:note_on_commit, project: public_project2, created_at: 5.weeks.ago) create(:note_on_commit, project: public_project2)
end end
create(:note_on_commit, project: public_project3, created_at: 5.weeks.ago)
create(:note_on_commit, project: private_project) create(:note_on_commit, project: private_project)
create(:note_on_commit, project: internal_project) create(:note_on_commit, project: internal_project)
end end
describe '#execute', caching: true do describe '.refresh!' do
context 'without an explicit time range' do before do
it 'returns public trending projects' do described_class.refresh!
projects = described_class.new.execute end
expect(projects).to eq([public_project1]) it 'populates the trending projects table' do
end expect(described_class.count).to eq(2)
end end
context 'with an explicit time range' do it 'removes existing rows before populating the table' do
it 'returns public trending projects' do described_class.refresh!
projects = described_class.new.execute(2)
expect(projects).to eq([public_project1, public_project2]) expect(described_class.count).to eq(2)
end
end end
it 'caches the list of projects' do it 'stores the project IDs for every trending project' do
projects = described_class.new rows = described_class.order(id: :asc).all
expect(rows[0].project_id).to eq(public_project1.id)
expect(rows[1].project_id).to eq(public_project2.id)
end
expect(Project).to receive(:trending).once it 'does not store projects that fall out of the trending time range' do
expect(described_class.where(project_id: public_project3).any?).to eq(false)
end
2.times { projects.execute } it 'stores only public projects' do
expect(described_class.where(project_id: [public_project1.id, public_project2.id]).count).to eq(2)
expect(described_class.where(project_id: [private_project.id, internal_project.id]).count).to eq(0)
end end
end end
end end
require 'spec_helper'
describe TrendingProjectsWorker do
describe '#perform' do
it 'refreshes the trending projects' do
expect(TrendingProject).to receive(:refresh!)
described_class.new.perform
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment