Commit 2aadbc09 authored by Stan Hu's avatar Stan Hu

Geo: Limit the huge cross-database pluck for LFS objects and attachments

This repeats the approach used in #3629 to limit the huge `NOT IN`
fragment.

Closes #3645
parent 423f153d
......@@ -18,8 +18,10 @@ module Geo
def find_object_ids
downloaded_ids = find_downloaded_ids([:attachment, :avatar, :file])
current_node.uploads
.where.not(id: downloaded_ids)
unsynched_downloads = filter_downloaded_ids(
current_node.uploads, downloaded_ids, Upload.table_name)
unsynched_downloads
.order(created_at: :desc)
.limit(db_retrieve_batch_size)
.pluck(:id, :uploader)
......@@ -29,14 +31,33 @@ module Geo
def find_lfs_object_ids
downloaded_ids = find_downloaded_ids([:lfs])
current_node.lfs_objects
.where.not(id: downloaded_ids)
unsynched_downloads = filter_downloaded_ids(
current_node.lfs_objects, downloaded_ids, LfsObject.table_name)
unsynched_downloads
.order(created_at: :desc)
.limit(db_retrieve_batch_size)
.pluck(:id)
.map { |id| [id, :lfs] }
end
# This query requires data from two different databases, and unavoidably
# plucks a list of file IDs from one into the other. This will not scale
# well with the number of synchronized files--the query will increase
# linearly in size--so this should be replaced with postgres_fdw ASAP.
def filter_downloaded_ids(objects, downloaded_ids, table_name)
return objects if downloaded_ids.empty?
joined_relation = objects.joins(<<~SQL)
LEFT OUTER JOIN
(VALUES #{downloaded_ids.map { |id| "(#{id}, 't')" }.join(',')})
file_registry(file_id, registry_present)
ON #{table_name}.id = file_registry.file_id
SQL
joined_relation.where(file_registry: { registry_present: [nil, false] })
end
def find_downloaded_ids(file_types)
downloaded_ids = Geo::FileRegistry.where(file_type: file_types).pluck(:file_id)
(downloaded_ids + scheduled_file_ids(file_types)).uniq
......
---
title: 'Geo: Limit the huge cross-database pluck for LFS objects and attachments'
merge_request:
author:
type: fixed
require 'spec_helper'
describe Geo::FileDownloadDispatchWorker do
describe Geo::FileDownloadDispatchWorker, :postgresql do
include ::EE::GeoHelpers
set(:primary) { create(:geo_node, :primary, host: 'primary-geo-node') }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment