Add support on GitLab Geo download scheduler for recorded files

7a0da750 · Douglas Barbosa Alexandre · f6249945 · 7a0da750 · 7a0da750
Commit 7a0da750 authored Apr 05, 2017 by Douglas Barbosa Alexandre
2 changed files
--- a/app/workers/geo_file_download_dispatch_worker.rb
+++ b/app/workers/geo_file_download_dispatch_worker.rb
@@ -9,8 +9,8 @@ class GeoFileDownloadDispatchWorker
  MAX_CONCURRENT_DOWNLOADS = 10.freeze

  def initialize
-    @pending_lfs_downloads = []
-    @scheduled_lfs_jobs = []
+    @pending_downloads = []
+    @scheduled_jobs = []
  end

  # The scheduling works as the following:
@@ -34,6 +34,7 @@ class GeoFileDownloadDispatchWorker

        update_jobs_in_progress
        load_pending_downloads if reload_queue?
+
        # If we are still under the limit after refreshing our DB, we can end
        # after scheduling the remaining transfers.
        last_batch = reload_queue?
@@ -41,7 +42,7 @@ class GeoFileDownloadDispatchWorker
        break if over_time?
        break unless downloads_remain?

-        schedule_lfs_downloads
+        schedule_downloads

        break if last_batch

@@ -53,7 +54,7 @@ class GeoFileDownloadDispatchWorker
  private

  def reload_queue?
-    @pending_lfs_downloads.size < MAX_CONCURRENT_DOWNLOADS
+    @pending_downloads.size < MAX_CONCURRENT_DOWNLOADS
  end

  def over_time?
@@ -61,32 +62,54 @@ class GeoFileDownloadDispatchWorker
  end

  def load_pending_downloads
-    @pending_lfs_downloads = find_lfs_object_ids(DB_RETRIEVE_BATCH)
+    lfs_object_ids = find_lfs_object_ids(DB_RETRIEVE_BATCH)
+    objects_ids    = find_object_ids(DB_RETRIEVE_BATCH - lfs_object_ids.size)
+
+    @pending_downloads = lfs_object_ids + objects_ids
  end

  def downloads_remain?
-    @pending_lfs_downloads.size
+    @pending_downloads.size
  end

-  def schedule_lfs_downloads
-    num_to_schedule = [MAX_CONCURRENT_DOWNLOADS - job_ids.size, @pending_lfs_downloads.size].min
+  def schedule_downloads
+    num_to_schedule = [MAX_CONCURRENT_DOWNLOADS - job_ids.size, @pending_downloads.size].min

    return unless downloads_remain?

    num_to_schedule.times do
-      lfs_id = @pending_lfs_downloads.shift
-      job_id = GeoFileDownloadWorker.perform_async(:lfs, lfs_id)
+      object_id, object_type = @pending_downloads.shift
+      job_id = GeoFileDownloadWorker.perform_async(object_type, object_id)

      if job_id
-        @scheduled_lfs_jobs << { job_id: job_id, id: lfs_id }
+        @scheduled_jobs << { id: object_id, type: object_type, job_id: job_id }
      end
    end
  end

+  def find_object_ids(limit)
+    downloaded_ids = find_downloaded_ids([:attachment, :avatar, :file])
+
+    Upload.where.not(id: downloaded_ids)
+          .order(created_at: :desc)
+          .limit(limit)
+          .pluck(:id, :uploader)
+          .map { |id, uploader| [id, uploader.gsub('Uploader', '').downcase] }
+  end
+
  def find_lfs_object_ids(limit)
-    downloaded_ids = Geo::FileRegistry.where(file_type: 'lfs').pluck(:file_id)
-    downloaded_ids = (downloaded_ids + scheduled_lfs_ids).uniq
-    LfsObject.where.not(id: downloaded_ids).order(created_at: :desc).limit(limit).pluck(:id)
+    downloaded_ids = find_downloaded_ids([:lfs])
+
+    LfsObject.where.not(id: downloaded_ids)
+             .order(created_at: :desc)
+             .limit(limit)
+             .pluck(:id)
+             .map { |id| [id, :lfs] }
+  end
+
+  def find_downloaded_ids(file_types)
+    downloaded_lfs_ids = Geo::FileRegistry.where(file_type: file_types).pluck(:file_id)
+    downloaded_lfs_ids = (downloaded_lfs_ids + scheduled_ids(file_types)).uniq
  end

  def update_jobs_in_progress
@@ -95,15 +118,15 @@ class GeoFileDownloadDispatchWorker
    # SidekiqStatus returns an array of booleans: true if the job has completed, false otherwise.
    # For each entry, first use `zip` to make { job_id: 123, id: 10 } -> [ { job_id: 123, id: 10 }, bool ]
    # Next, filter out the jobs that have completed.
-    @scheduled_lfs_jobs = @scheduled_lfs_jobs.zip(status).map { |(job, completed)| job if completed }.compact
+    @scheduled_jobs = @scheduled_jobs.zip(status).map { |(job, completed)| job if completed }.compact
  end

  def job_ids
-    @scheduled_lfs_jobs.map { |data| data[:job_id] }
+    @scheduled_jobs.map { |data| data[:job_id] }
  end

-  def scheduled_lfs_ids
-    @scheduled_lfs_jobs.map { |data| data[:id] }
+  def scheduled_ids(types)
+    @scheduled_jobs.select { |data| types.include?(data[:type]) }.map { |data| data[:id] }
  end

  def try_obtain_lease

--- a/spec/workers/geo_file_download_dispatch_worker_spec.rb
+++ b/spec/workers/geo_file_download_dispatch_worker_spec.rb
@@ -45,21 +45,25 @@ describe GeoFileDownloadDispatchWorker do

    # Test the case where we have:
    #
-    # 1. A total of 6 files in the queue, and we can load a maxmimum of 5 and send 2 at a time.
+    # 1. A total of 8 files in the queue, and we can load a maximimum of 5 and send 2 at a time.
    # 2. We send 2, wait for 1 to finish, and then send again.
    it 'attempts to load a new batch without pending downloads' do
      stub_const('GeoFileDownloadDispatchWorker::DB_RETRIEVE_BATCH', 5)
      stub_const('GeoFileDownloadDispatchWorker::MAX_CONCURRENT_DOWNLOADS', 2)

-      create_list(:lfs_object, 6, :with_file)
+      avatar = fixture_file_upload(Rails.root.join('spec/fixtures/dk.png'))
+      create_list(:lfs_object, 2, :with_file)
+      create_list(:user, 2, avatar: avatar)
+      create_list(:note, 2, :with_attachment)
+      create(:appearance, logo: avatar, header_logo: avatar)

      allow_any_instance_of(described_class).to receive(:over_time?).and_return(false)

-      expect(GeoFileDownloadWorker).to receive(:perform_async).exactly(6).times.and_call_original
-      # For 6 downloads, we expect three database reloads:
+      expect(GeoFileDownloadWorker).to receive(:perform_async).exactly(8).times.and_call_original
+      # For 8 downloads, we expect three database reloads:
      # 1. Load the first batch of 5.
-      # 2. 4 get sent out, 1 remains. This triggers another reload, which loads in the remaining 2.
-      # 3. Since the second reload filled the pipe with 2, we need to do a final reload to ensure
+      # 2. 4 get sent out, 1 remains. This triggers another reload, which loads in the remaining 4.
+      # 3. Since the second reload filled the pipe with 4, we need to do a final reload to ensure
      #    zero are left.
      expect(subject).to receive(:load_pending_downloads).exactly(3).times.and_call_original