ebulk fix resume initial ingestion

a5366088 · roqueporchetto@gmail.com · c6e035d3 · c6e035d3 · c6e035d3 · c6e035d3
Commit a5366088 authored Sep 21, 2018 by roqueporchetto@gmail.com
17 changed files
--- a/.gitignore
+++ b/.gitignore
-*~
-ebulk-data/config/*config.yml
-
--- a/ebulk-data/config/csv-parser-wendelin.yml~
+++ b/ebulk-data/config/csv-parser-wendelin.yml~
-exec: 
-  max_threads: 1
-  min_output_tasks: 1
-in:
-  type: file
-  path_prefix: ./csv/
-  parser:
-    charset: UTF-8
-    type: csv
-    delimiter: ';'
-    columns:
-    - {name: id, type: string}
-    - {name: id2, type: string}
-    - {name: id3, type: string}
-    - {name: id4, type: string}
-out: 
-  type: wendelin
-  erp5_url: "https://softinst102878.host.vifib.net/erp5/portal_ingestion_policies/wendelin_embulk"
-  user: "zope"
-  password: "asd"
--- a/ebulk-data/config/csv-wendelin.yml~
+++ b/ebulk-data/config/csv-wendelin.yml~
-exec: 
-  max_threads: 1
-  min_output_tasks: 1
-in:
-  type: file
-  path_prefix: ./csv/
-  parser:
-    charset: UTF-8
-#    newline: CRLF
-    type: csv
-    delimiter: ';'
-#    quote: '"'
-#    escape: ''
-#    null_string: 'NULL'
-    columns:
-    - {name: id, type: string}
-    - {name: id2, type: string}
-    - {name: id3, type: string}
-    - {name: id4, type: string}
-out: 
-  type: wendelin
-  erp5_url: "https://softinst102878.host.vifib.net/erp5/portal_ingestion_policies/wendelin_embulk"
-  user: "zope"
-  password: "asd"
--- a/ebulk-data/config/download-config.yml~
+++ b/ebulk-data/config/download-config.yml~
-exec: 
-  max_threads: 1
-  min_output_tasks: 1
-in:
-  type: wendelin
-  erp5_url: "https://softinst102878.host.vifib.net/erp5/"
-  user: "asd"
-  password: "asd"
-  data_set: "sample"
-  chunk_size: "50"
-  output_path: "sample"
-  tool_dir: "."
-
-out:
-  type: fif
-  output_path: "sample"
-  tool_dir: "."
--- a/ebulk-data/config/download-config_template.yml~
+++ b/ebulk-data/config/download-config_template.yml~
-exec: 
-  max_threads: 1
-  min_output_tasks: 1
-in:
-  type: wendelin
-  erp5_url: $DOWN_URL
-  user: $USER
-  password: $pwd
-  data_set: $DATA_SET
-  chunk_size: $CHUNK
-  output_path: $DATASET_DIR
-  tool_dir: $TOOL_DIR
-
-out:
-  type: fif
-  output_path: $DATASET_DIR
-  tool_dir: $TOOL_DIR
--- a/ebulk-data/config/ingestion-config.yml~
+++ b/ebulk-data/config/ingestion-config.yml~
-exec: 
-  max_threads: 1
-  min_output_tasks: 1
-in:
-  type: fif
-  path_prefix: ["input/"]
-  supplier: [SUPPLIER]
-  data_set: [DATA_SET]
-  chunk_size: 0
-
-out: 
-  type: wendelin
-  erp5_url: 'https://softinst79462.host.vifib.net/erp5/portal_ingestion_policies/wendelin_embulk'
-  user: [USER]
-  password: [PASSWORD]
-  tag: supplier.dataset.filename.extension.end
-
--- a/ebulk-data/config/ingestion-config_template.yml~
+++ b/ebulk-data/config/ingestion-config_template.yml~
-exec: 
-  max_threads: 1
-  min_output_tasks: 1
-in:
-  type: fif
-  path_prefix: [$DATASET_DIR]
-  supplier: $USER
-  data_set: $DATA_SET
-  chunk_size: $CHUNK
-  erp5_url: $DOWN_URL
-  user: $USER
-  password: $pwd
-  tool_dir: $TOOL_DIR
-
-out: 
-  type: wendelin
-  erp5_url: $ING_URL
-  user: $USER
-  password: $pwd
-  tool_dir: $TOOL_DIR
--- a/ebulk-data/config/ingestion-custom-config_template.yml~
+++ b/ebulk-data/config/ingestion-custom-config_template.yml~
-# CUSTOM CONFIGURATION FILE
-# PLEASE FILL THE FILE WITH THE CONFIGURATION OF YOUR CUSTOM EMBULK PLUGIN
-# ONLY THE 'IN' SECTION, OTHERS MUST REMAIN AS THEY ARE
-
-# PLEASE FILL THE 'IN' SECTION ACCORDING TO YOUR PLUGIN
-in:
-
-# FOR EXAMPLE CSV FILES
-#  type: file
-#  path_prefix: MY_CSV_DIRECTORY
-
-# FOR EXAMPLE AWS-S3 storage:
-#  type: s3
-#  bucket: MY_BUCKET
-#  path_prefix: ""
-#  access_key_id: MY_KEY_ID
-#  secret_access_key: MY_SECRET_KEY
-
-# PLEASE LEAVE THE SECTIONS BELOW AS THEY ARE (unless you know what you are doing)
-  parser:
-    type: binary
-    supplier: $USER
-    data_set: $DATA_SET
-    tool_dir: $TOOL_DIR
-    chunk_size: $CHUNK
-    input_plugin: $STORAGE
-
-out: 
-  type: wendelin
-  erp5_url: $ING_URL
-  user: $USER
-  password: $pwd
-
-exec: 
-  max_threads: 1
-  min_output_tasks: 1
--- a/ebulk-data/config/ingestion-ftp-config_template.yml~
+++ b/ebulk-data/config/ingestion-ftp-config_template.yml~
-# FTP CONFIGURATION FILE
-# PLEASE FILL THE FILE WITH THE CONFIGURATION OF YOUR FTP STORAGE
-# ONLY THE 'IN' SECTION, OTHERS MUST REMAIN AS THEY ARE
-
-in:
-  type: ftp
-  host: $FTP_HOST
-  user: $FTP_USER
-  password: $FTP_PASSWORD
-  path_prefix: $FTP_PATH
-  #ssl_verify: false
-  #port: 21
-
-# PLEASE LEAVE THE SECTIONS BELOW AS THEY ARE (unless you know what you are doing)
-  parser:
-    type: binary
-    supplier: $USER
-    data_set: $DATA_SET
-    tool_dir: $TOOL_DIR
-    chunk_size: $CHUNK
-    storage: $STORAGE
-
-out: 
-  type: wendelin
-  erp5_url: $ING_URL
-  user: $USER
-  password: $pwd
-
-exec: 
-  max_threads: 1
-  min_output_tasks: 1
-
--- a/ebulk-data/config/ingestion-http-config.yml~
+++ b/ebulk-data/config/ingestion-http-config.yml~
-# HTTP CONFIGURATION FILE
-# PLEASE FILL THE FILE WITH THE CONFIGURATION OF YOUR HTTP URL
-# ONLY THE 'IN' SECTION, OTHERS MUST REMAIN AS THEY ARE
-
-in:
-  type: http
-  url: "http://archive.ics.uci.edu/ml/machine-learning-databases/00000/Donnees%20conso%20autos.txt"
-  method: "get"
-#  basic_auth:
-#    user: MyUser
-#    password: MyPassword
-#  params:
-#    - {name: paramA, value: valueA}
-#    - {name: paramB, value: valueB}
-
-# PLEASE LEAVE THE SECTIONS BELOW AS THEY ARE (unless you know what you are doing)
-  parser:
-    type: binary
-    supplier: "zope"
-    data_set: "http"
-    tool_dir: "."
-    chunk_size: "50"
-    storage: "http"
-    path_prefix: 
-
-out: 
-  type: wendelin
-  erp5_url: "https://softinst102878.host.vifib.net/erp5/portal_ingestion_policies/wendelin_embulk"
-  user: "zope"
-  password: "telecom"
-
-exec: 
-  max_threads: 1
-  min_output_tasks: 1
--- a/ebulk-data/config/ingestion-http-config_template.yml~
+++ b/ebulk-data/config/ingestion-http-config_template.yml~
-# HTTP CONFIGURATION FILE
-# PLEASE FILL THE FILE WITH THE CONFIGURATION OF YOUR HTTP URL
-# ONLY THE 'IN' SECTION, OTHERS MUST REMAIN AS THEY ARE
-
-in:
-  type: http
-  url: $HTTP_URL
-  method: $HTTP_METHOD
-#  basic_auth:
-#    user: MyUser
-#    password: MyPassword
-#  params:
-#    - {name: paramA, value: valueA}
-#    - {name: paramB, value: valueB}
-
-# PLEASE LEAVE THE SECTIONS BELOW AS THEY ARE (unless you know what you are doing)
-  parser:
-    type: binary
-    supplier: $USER
-    data_set: $DATA_SET
-    tool_dir: $TOOL_DIR
-    chunk_size: $CHUNK
-    storage: $STORAGE
-    path_prefix: $HTTP_PREFIX
-
-out: 
-  type: wendelin
-  erp5_url: $ING_URL
-  user: $USER
-  password: $pwd
-
-exec: 
-  max_threads: 1
-  min_output_tasks: 1
-
--- a/ebulk-data/config/ingestion-s3-config.yml~
+++ b/ebulk-data/config/ingestion-s3-config.yml~
-exec: 
-  max_threads: 1
-  min_output_tasks: 1
-in:
-  type: s3
-  bucket: "roque5"
-  path_prefix: ""
-  access_key_id: "AKIAJLY3N4YBNAJMBLGQ"
-  secret_access_key: "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"
-  parser:
-    type: binary
-    supplier: "zope"
-    data_set: "encoding"
-    tool_dir: "."
-    chunk_size: "5"
-    input_plugin "s3"
-
-out: 
-  type: wendelin
-  erp5_url: "https://softinst102878.host.vifib.net/erp5/portal_ingestion_policies/wendelin_embulk"
-  user: "zope"
-  password: "telecom"
--- a/ebulk-data/config/ingestion-s3-config_template.yml~
+++ b/ebulk-data/config/ingestion-s3-config_template.yml~
-# S3 CONFIGURATION FILE
-# PLEASE FILL THE FILE WITH THE CONFIGURATION OF YOUR S3 BUCKET
-# ONLY THE 'IN' SECTION, OTHERS MUST REMAIN AS THEY ARE
-
-in:
-  type: s3
-  bucket: $S3_BUCKET
-  path_prefix: $S3_PREFIX
-  access_key_id: $S3_ACCESS_KEY
-  secret_access_key: $S3_SECRET_KEY
-  auth_method: $S3_AUTH_METHOD
-#  endpoint: 
-#  region: 
-#  path_match_pattern: 
-#  http_proxy:
-#    host: 
-#    port: 
-
-
-# PLEASE LEAVE THE SECTIONS BELOW AS THEY ARE (unless you know what you are doing)
-  parser:
-    type: binary
-    supplier: $USER
-    data_set: $DATA_SET
-    tool_dir: $TOOL_DIR
-    chunk_size: $CHUNK
-    storage: $STORAGE
-    path_prefix: $S3_PREFIX
-
-out: 
-  type: wendelin
-  erp5_url: $ING_URL
-  user: $USER
-  password: $pwd
-
-exec: 
-  max_threads: 1
-  min_output_tasks: 1
-
--- a/ebulk-data/embulk-wendelin-dataset-tool/lib/embulk/dataset_utils.rb
+++ b/ebulk-data/embulk-wendelin-dataset-tool/lib/embulk/dataset_utils.rb
@@ -7,6 +7,7 @@ class DatasetUtils
  DATASET_REPORT_FILE = ".dataset-task-report"
  DATASET_COMPLETED_FILE = ".dataset-completed"
  RESUME_OPERATION_FILE = ".resume-operation"
+  INITIAL_INGESTION_FILE = ".initial-ingestion"

  RUN_DONE = "done"
  RUN_ERROR = "error"
@@ -22,6 +23,7 @@ class DatasetUtils
    @task_report_file = @data_set_directory + DATASET_REPORT_FILE
    @completed_file = @data_set_directory + DATASET_COMPLETED_FILE
    @resume_operation_file = @data_set_directory + RESUME_OPERATION_FILE
+    @initial_ingestion_file = @data_set_directory + INITIAL_INGESTION_FILE
  end

  def getLocalFiles(remove=nil)
@@ -130,6 +132,18 @@ class DatasetUtils
    return File.exist?(@task_report_file)
  end

+  def deleteInitialIngestionFile()
+    File.delete(@initial_ingestion_file) if File.exist?(@initial_ingestion_file)
+  end
+
+  def createInitialIngestionFile()
+    File.open(@initial_ingestion_file, 'w') {}
+  end
+
+  def initialIngestionFileExist()
+    return File.exist?(@initial_ingestion_file)
+  end
+
  def addToReport(reference, status, size, hash, data_set)
    local_files = {}
    begin
@@ -183,7 +197,7 @@ class DatasetUtils
  end

  def getLocalChanges(files, data_set)
-    new_files = []
+    all_files, new_files, modified_files, deleted_files = [], [], [], []
    begin
      if reportFileExist()
        File.readlines(@task_report_file).each do |line|
@@ -199,27 +213,31 @@ class DatasetUtils
                hash = getHash(file_path).to_s
 	        if size == record[2].to_s
 	          if hash != record[3].chomp
-	            new_files.push({"path" => file_path, "size" => size, "hash" => hash })
+	            all_files.push({"path" => file_path, "size" => size, "hash" => hash })
+		    modified_files.push(file_path)
 	          end
 	        else
-	          new_files.push({"path" => file_path, "size" => size, "hash" => hash })
+	          all_files.push({"path" => file_path, "size" => size, "hash" => hash })
+		  modified_files.push(file_path)
 	        end
 	      end
 	      files.delete(file_path)
            else
-	      new_files.push({"path" => file_path, "size" => "", "hash" => DELETE })
+	      all_files.push({"path" => file_path, "size" => "", "hash" => DELETE })
+	      deleted_files.push(file_path)
            end
 	  end
        end
      end
      files.each do |path|
-        new_files.push({"path" => path, "size" => "", "hash" => "" })
+        all_files.push({"path" => path, "size" => "", "hash" => "" })
+	new_files.push(path)
      end
    rescue Exception => e
      @logger.error("An error occurred in DatasetUtils method 'getLocalChanges':" + e.to_s)
      @logger.error(e.backtrace)
    end
-    return new_files
+    return all_files, new_files, modified_files, deleted_files
  end

  def getRemoteChangedDataStreams(data_streams)

--- a/ebulk-data/embulk-wendelin-dataset-tool/lib/embulk/input/fif.rb
+++ b/ebulk-data/embulk-wendelin-dataset-tool/lib/embulk/input/fif.rb
@@ -10,6 +10,10 @@ module Embulk

      Plugin.register_input("fif", self)

+      NEW = "New"
+      MODIFIED = "Modified"
+      DELETED = "Deleted"
+
      EOF = "EOF"
      CHUNK_SIZE = 50000000 #50mb
      MEGA = 1000000
@@ -24,6 +28,21 @@ module Embulk
 		 {"name"=>"hash", "type"=>"string"}
               ]

+      def self.showChangesList(changes, type, print_short)
+	if not changes.empty?
+	  puts
+	  @logger.info("#{type} file(s):", print=TRUE)
+	  if print_short and changes.length > 50
+	    limit = changes.length > 130 ? 130/3 : changes.length/3
+	    @logger.info(changes[0, limit], print=TRUE)
+	    @logger.info("....", print=TRUE)
+	    @logger.info(changes[changes.length-limit, changes.length-1], print=TRUE)
+	  else
+	    @logger.info(changes, print=TRUE)
+	  end
+	end
+      end
+
      def self.transaction(config, &control)
 	begin
 	  tool_dir = config.param('tool_dir', :string)
@@ -36,6 +55,7 @@ module Embulk
 	  if task['chunk_size'] == 0
 	    task['chunk_size'] = CHUNK_SIZE
 	  end
+	  @data_set = task['data_set']
 	  paths = config.param('path_prefix', :array)
 	  paths[0] = paths[0].end_with?("/") ? paths[0] : paths[0] + "/"
 	  @data_set_directory = paths[0]
@@ -50,13 +70,17 @@ module Embulk
 	  @logger.info("Checking remote dataset...", print=TRUE)
 	  data_stream_dict = @wendelin.getDataStreams(task['data_set'])
 	  @dataset_utils = DatasetUtils.new(@data_set_directory)
-	  if @dataset_utils.reportFileExist()
-	    @logger.info("Checking local dataset...", print=TRUE)
-	    if not @dataset_utils.reportUpToDate(data_stream_dict)
-	      puts
-	      @logger.error("Your current dataset is outdated. Please, run a download to update it before ingest your changes.", print=TRUE)
-	      puts
-	      @logger.abortExecution(error=FALSE)
+	  if not @dataset_utils.reportFileExist()
+	    @dataset_utils.createInitialIngestionFile()
+	  else
+	    if not @dataset_utils.initialIngestionFileExist()
+	      @logger.info("Checking local dataset...", print=TRUE)
+	      if not @dataset_utils.reportUpToDate(data_stream_dict)
+	        puts
+	        @logger.error("Your current dataset is outdated. Please, run a download to update it before ingest your changes.", print=TRUE)
+	        puts
+	        @logger.abortExecution(error=FALSE)
+	      end
 	    end
 	  end
 	  if data_stream_dict["status_code"] != 0
@@ -79,20 +103,18 @@ module Embulk
 	    @logger.abortExecution()
          end

-	  task['paths'] = @dataset_utils.getLocalChanges(task['paths'], task['data_set'])
+	  task['paths'], new_files, modified_files, deleted_files = @dataset_utils.getLocalChanges(task['paths'], task['data_set'])
 	  if task['paths'].empty?
 	    puts
 	    @logger.info("No changes in '#{@data_set_directory}'. Everything up-to-date.", print=TRUE)
 	    @logger.abortExecution(error=FALSE)
 	  end
-	  @logger.info("#{task['paths'].length} change(s) detected for ingestion: ", print=TRUE)
-	  if task['paths'].length > 15
-	    @logger.info(task['paths'][0, 5], print=TRUE)
-	    @logger.info(".....", print=TRUE)
-	    @logger.info(task['paths'][task['paths'].length-5, task['paths'].length-1], print=TRUE)
-	  else
-	    @logger.info(task['paths'], print=TRUE)
-	  end
+	  changes = @dataset_utils.reportFileExist() ? "change" : "new file"
+	  @logger.info("#{task['paths'].length} #{changes}(s) detected for ingestion: ", print=TRUE)
+	  print_short = task['paths'].length > 500
+	  self.showChangesList(new_files, NEW, print_short)
+	  self.showChangesList(modified_files, MODIFIED, print_short)
+	  self.showChangesList(deleted_files, DELETED, print_short)
 	  puts
 	  @logger.info("Continue with ingestion? (y/n)", print=TRUE)
          option = gets
@@ -101,6 +123,9 @@ module Embulk
 	    @logger.info("Ingestion cancelled by user.", print=TRUE)
 	    @logger.abortExecution()
 	  end
+	  if not @dataset_utils.reportFileExist()
+	    @dataset_utils.createReportFile()
+	  end

 	  columns = [
            Column.new(0, "supplier", :string),
@@ -139,9 +164,11 @@ module Embulk
 	  @logger.info(task_reports, print=TRUE)
 	end
        next_config_diff = task_reports.map{|hash| hash["done"]}.flatten.compact
-	@logger.info("#{next_config_diff.length} file(s) ingested.", print=TRUE)
+	changes = @dataset_utils.initialIngestionFileExist() ? "new file" : "change"
+	@logger.info("#{next_config_diff.length} #{changes}(s) ingested.", print=TRUE)
 	if(next_config_diff.length == count)
 	  @logger.info("Dataset successfully ingested.", print=TRUE)
+	  @wendelin.increaseDatasetVersion(@data_set)
 	else
 	  next_config_diff = task_reports.map{|hash| hash["error"]}.flatten.compact
 	  puts

--- a/ebulk-data/embulk-wendelin-dataset-tool/lib/embulk/input/wendelin.rb
+++ b/ebulk-data/embulk-wendelin-dataset-tool/lib/embulk/input/wendelin.rb
@@ -24,7 +24,7 @@ module Embulk
 				      next [] unless Dir.exist?(path)
 				      Dir[(path + '/**/*').gsub! '//', '/']
 				    }.flatten.select{ |file| File.file?(file) }
-	  local_changes = @dataset_utils.getLocalChanges(local_files, data_set)
+	  local_changes, a, b, c = @dataset_utils.getLocalChanges(local_files, data_set)
 	  data_set = @data_set.end_with?("/") ? @data_set : @data_set + "/"
 	  remote_changes = remote_streams.map { |remote|
 		remote = @data_set_directory + remote["reference"].reverse.sub("/".reverse, ".".reverse).reverse.sub(data_set, "")
@@ -178,12 +178,15 @@ module Embulk
 	      @logger.info("This dataset was already downloaded. What do you want to do?", print=TRUE)
 	      puts
 	      self.askUserForAction(task, action=UPDATE)
-	    else
+	    elsif not @dataset_utils.initialIngestionFileExist()
 	      puts
 	      @logger.info("There was a previous attempt to download this dataset but it did not finish successfully.", print=TRUE)
 	      @logger.info("What do you want to do?", print=TRUE)
 	      puts
 	      self.askUserForAction(task, action=RESUME)
+	    else
+	      puts
+	      self.askUserForAction(task, action=UPDATE)
 	    end
 	  else
 	    dir_entries = Dir.entries(@data_set_directory).length
@@ -203,6 +206,7 @@ module Embulk
 	    end
 	    @dataset_utils.createReportFile()
 	  end
+	  @dataset_utils.deleteInitialIngestionFile()
 	  columns = [
 	    Column.new(0, "reference", :string),
 	    Column.new(1, "data_chunk", :string),

--- a/ebulk-data/embulk-wendelin-dataset-tool/lib/embulk/wendelin_client.rb
+++ b/ebulk-data/embulk-wendelin-dataset-tool/lib/embulk/wendelin_client.rb
@@ -38,6 +38,21 @@ class WendelinClient
    end
  end

+  def increaseDatasetVersion(reference)
+    if reference == ""
+      @logger.warn("Could not increase data set version because dataset reference is empty.")
+    else
+      @logger.info("Increasing dataset version")
+      uri = URI("#{@erp5_url}/ERP5Site_increaseDatasetVersion?reference=#{reference}")
+      begin
+        res = open(uri, http_basic_authentication: [@user, @password]).read
+      rescue Exception => e
+        @logger.error("An error occurred while increasing dataset version: " + e.to_s)
+        @logger.error(e.backtrace)
+      end
+    end
+  end
+
  def ingest(reference, data_chunk)
      @logger.info("Ingestion reference: #{reference}", print=TRUE)
      if Time.new - @last_ingestion < 2