new commands for staging and reset

parent 963e3e0d
...@@ -22,6 +22,9 @@ GREEN='\033[0;32m' ...@@ -22,6 +22,9 @@ GREEN='\033[0;32m'
ORANGE='\033[0;33m' ORANGE='\033[0;33m'
NC='\033[0m' NC='\033[0m'
DEFAULT_CHUNK_SIZE="50" DEFAULT_CHUNK_SIZE="50"
STAGE_ADD="add"
STAGE_REMOVE="remove"
STAGE_RESET="reset"
function helpReadme { function helpReadme {
echo -e "[INFO] For help, please run '${GREEN}ebulk --help${NC}'" echo -e "[INFO] For help, please run '${GREEN}ebulk --help${NC}'"
...@@ -42,9 +45,9 @@ function checkParameters { ...@@ -42,9 +45,9 @@ function checkParameters {
helpReadme >&2; return 1 helpReadme >&2; return 1
fi fi
if [ "$STORAGE" = "" ] ; then if [ "$STORAGE" = "" ] ; then
if [ ! -d $DATASET_DIR ]; then if [ ! -d "$DATASET_DIR" ]; then
echo echo
mkdir $DATASET_DIR 2>/dev/null mkdir "$DATASET_DIR" 2>/dev/null
if [ ! $? -eq 0 ]; then if [ ! $? -eq 0 ]; then
echo echo
echo -e "${ORANGE}[ERROR] Dataset path not found." echo -e "${ORANGE}[ERROR] Dataset path not found."
...@@ -55,14 +58,19 @@ function checkParameters { ...@@ -55,14 +58,19 @@ function checkParameters {
fi fi
EBULK_DATASET_FILE="$DATASET_DIR/.ebulk_dataset" EBULK_DATASET_FILE="$DATASET_DIR/.ebulk_dataset"
if [[ $DATASET_DIR != $REFERENCE ]]; then if [[ $DATASET_DIR != $REFERENCE ]]; then
if [ "$REFERENCE" = "." ] ; then
REFERENCE=$(basename "$DATASET_DIR")
fi
DATA_SET=$REFERENCE DATA_SET=$REFERENCE
echo $REFERENCE > $EBULK_DATASET_FILE 2>/dev/null echo $REFERENCE > "$EBULK_DATASET_FILE" 2>/dev/null
else else
if [ -f $EBULK_DATASET_FILE ]; then if [ -f "$EBULK_DATASET_FILE" ]; then
DATA_SET=$(cat "$DATASET_DIR/.ebulk_dataset") DATA_SET=$(cat "$DATASET_DIR/.ebulk_dataset" 2>/dev/null)
else else
DATA_SET=$(basename "$DATASET_DIR") DATA_SET=$(basename "$DATASET_DIR")
echo $DATA_SET > $EBULK_DATASET_FILE 2>/dev/null if [ "$DATA_SET" != "." ] ; then
SAVE_DATASET_NAME="TRUE"
fi
fi fi
fi fi
else else
...@@ -70,14 +78,19 @@ function checkParameters { ...@@ -70,14 +78,19 @@ function checkParameters {
fi fi
re='^[A-Za-z][_A-Za-z.0-9-]*$' re='^[A-Za-z][_A-Za-z.0-9-]*$'
if ! [[ $DATA_SET =~ $re ]] ; then if ! [[ $DATA_SET =~ $re ]] ; then
if [ "$DATA_SET" = "." ] && [[ -z "$STORAGE" ]] ; then
echo
echo -e "${ORANGE}[ERROR] You are not in a dataset directory ${GREEN}'$DATA_SET'${ORANGE}.${NC}"
echo
else
echo echo
echo -e "${ORANGE}[ERROR] Error in argument: invalid dataset name ${GREEN}'$DATA_SET'${ORANGE}.${NC}" echo -e "${ORANGE}[ERROR] Error in argument: invalid dataset name ${GREEN}'$DATA_SET'${ORANGE}.${NC}"
echo -e "${ORANGE}[ERROR] Dataset name must start with a letter, and only alphanumerics, dots ( . ), underscores ( _ ) and hyphens ( - ) are allowed.${NC}" echo -e "${ORANGE}[ERROR] Dataset name must start with a letter, and only alphanumerics, dots ( . ), underscores ( _ ) and hyphens ( - ) are allowed.${NC}"
echo echo
if [ -f $EBULK_DATASET_FILE ]; then
rm -f ${EBULK_DATASET_FILE}
fi fi
helpReadme >&2; return 1 helpReadme >&2; return 1
elif [ ! -z "$SAVE_DATASET_NAME" ]; then
echo $DATA_SET > "$EBULK_DATASET_FILE" 2>/dev/null
fi fi
if [ ! -z "$CHUNK" ]; then if [ ! -z "$CHUNK" ]; then
re='^[0-9]+$' re='^[0-9]+$'
...@@ -143,6 +156,7 @@ function updateConfigFile { ...@@ -143,6 +156,7 @@ function updateConfigFile {
DOWN_URL=\"$DOWN_URL\" DOWN_URL=\"$DOWN_URL\"
ING_URL=\"$ING_URL\" ING_URL=\"$ING_URL\"
STORAGE=\"$STORAGE\" STORAGE=\"$STORAGE\"
STATUS=\"$STATUS\"
S3_BUCKET=\"$S3_BUCKET\" S3_BUCKET=\"$S3_BUCKET\"
S3_PREFIX=\"$S3_PREFIX\" S3_PREFIX=\"$S3_PREFIX\"
...@@ -174,6 +188,7 @@ function runProcess { ...@@ -174,6 +188,7 @@ function runProcess {
return 1 return 1
fi fi
echo -e "[INFO] Dataset: ${GREEN}$DATA_SET${NC}" echo -e "[INFO] Dataset: ${GREEN}$DATA_SET${NC}"
if [ -z "$STATUS" ]; then
if [ ! -z "$CHUNK" ]; then if [ ! -z "$CHUNK" ]; then
if [ "$CHUNK" -eq "0" ]; then if [ "$CHUNK" -eq "0" ]; then
echo "[INFO] Default chunk size: $DEFAULT_CHUNK_SIZE Mb." echo "[INFO] Default chunk size: $DEFAULT_CHUNK_SIZE Mb."
...@@ -181,17 +196,20 @@ function runProcess { ...@@ -181,17 +196,20 @@ function runProcess {
echo "[INFO] Chunk size set in $CHUNK Mb." echo "[INFO] Chunk size set in $CHUNK Mb."
fi fi
fi fi
fi
if [ -z "$STATUS" ]; then
if ! askCredentials; then if ! askCredentials; then
return 1 return 1
fi fi
fi
echo echo
echo "[INFO] Supplier: $USER"
updateConfigFile updateConfigFile
echo "[INFO] Starting operation..." echo "[INFO] Starting operation..."
if [ ! -d $LOG_DIR ]; then if [ ! -d $LOG_DIR ]; then
mkdir $LOG_DIR 2>/dev/null mkdir $LOG_DIR 2>/dev/null
fi fi
$embulk run -L $TOOL_PATH/embulk-wendelin-dataset-tool $FILE $DIFF 2> "$LOG_DIR/error.log" || { $embulk run -L $TOOL_PATH/embulk-wendelin-dataset-tool $FILE $DIFF 2> "$LOG_DIR/error.log" || {
if [ -z "$STATUS" ]; then
echo echo
echo -e "${ORANGE}[ERROR] Embulk tool stopped its execution.${NC}" echo -e "${ORANGE}[ERROR] Embulk tool stopped its execution.${NC}"
if [ "$STORAGE" != \"\" ] ; then if [ "$STORAGE" != \"\" ] ; then
...@@ -200,6 +218,7 @@ function runProcess { ...@@ -200,6 +218,7 @@ function runProcess {
fi fi
echo "[INFO] Please check the logs in '$LOG_DIR' directory for more details." echo "[INFO] Please check the logs in '$LOG_DIR' directory for more details."
echo echo
fi
} }
} }
...@@ -383,17 +402,37 @@ function askS3parameters { ...@@ -383,17 +402,37 @@ function askS3parameters {
fi fi
} }
# WELCOME function stage {
echo EBULK_DATASET_FILE="./.ebulk_dataset"
echo " #########################################################################" if [ ! -f "$EBULK_DATASET_FILE" ]; then
echo " ############## WELCOME TO EBULK INGESTION-DOWNLOAD TOOL #################" echo
echo " ########### This tool relies on Embulk software and Java 8 ##############" echo -e "${ORANGE}[ERROR] You are not in a dataset directory."
echo " ######## Do not forget to check the README before use this tool #########" echo -e "[INFO] $OP operation can only be run within a root dataset directory.${NC}"
echo " ############## In case of any problem, please contact us ###############" echo
echo " ####################### roqueporchetto@gmail.com ########################" helpReadme >&2; exit
echo " ###################### Happy ingestion-download ! #######################" fi
echo " #########################################################################" if [[ $PATH_TO_ELEMENT = "" ]]; then
echo echo
echo -e "${ORANGE}[ERROR] Nothing specified, nothing to $OP."
echo -e "[INFO] Please specify a valid path.${NC}"
echo
helpReadme >&2; exit
fi
STAGE_FILE="./.staged"
}
function welcome {
echo
echo " #########################################################################"
echo " ############## WELCOME TO EBULK INGESTION-DOWNLOAD TOOL #################"
echo " ########### This tool relies on Embulk software and Java 8 ##############"
echo " ######## Do not forget to check the README before use this tool #########"
echo " ############## In case of any problem, please contact us ###############"
echo " ####################### roqueporchetto@gmail.com ########################"
echo " ###################### Happy ingestion-download ! #######################"
echo " #########################################################################"
echo
}
if [ ! -d $EBULK_DATA_PATH ]; then if [ ! -d $EBULK_DATA_PATH ]; then
mkdir $EBULK_DATA_PATH 2>/dev/null mkdir $EBULK_DATA_PATH 2>/dev/null
...@@ -428,14 +467,21 @@ while [ "$1" != "" ]; do ...@@ -428,14 +467,21 @@ while [ "$1" != "" ]; do
-h | --help ) cat $TOOL_PATH/help.md -h | --help ) cat $TOOL_PATH/help.md
exit exit
;; ;;
-e | --examples ) cat $TOOL_PATH/example.md
exit
;;
-r | --readme ) less $TOOL_PATH/README.md -r | --readme ) less $TOOL_PATH/README.md
exit exit
;; ;;
pull ) OPERATION=$1 status | push | pull ) OPERATION=$1
;; ;;
push ) OPERATION=$1 add | remove | reset ) OPERATION=$1
shift
PATH_TO_ELEMENT=$1
REFERENCE="."
;; ;;
*) if [[ $REFERENCE != $1 ]]; then *) if [[ $REFERENCE != $1 ]]; then
echo
echo -e "${ORANGE}[ERROR] Invalid parameter '$1'.${NC}" echo -e "${ORANGE}[ERROR] Invalid parameter '$1'.${NC}"
echo echo
helpReadme >&2; exit helpReadme >&2; exit
...@@ -444,14 +490,15 @@ while [ "$1" != "" ]; do ...@@ -444,14 +490,15 @@ while [ "$1" != "" ]; do
shift shift
done done
for ELEMENT in '' '-d' '--directory' '-s' '--storage' '-cs' '--custom-storage' '-a' '--advanced' '-c' '--chunk'; do
if [ "$ELEMENT" = "$REFERENCE" ]; then
REFERENCE="."
fi
done
if [[ $OPERATION = "" ]]; then if [[ $OPERATION = "" ]]; then
echo -e "${ORANGE}[ERROR] Please specify a valid operation.${NC}"
echo echo
helpReadme >&2; exit echo -e "${ORANGE}[ERROR] Please specify a valid operation.${NC}"
fi
if [[ $REFERENCE = "" ]]; then
echo -e "${ORANGE}[ERROR] Dataset not specified."
echo -e "[INFO] Please specify a valid dataset.${NC}"
echo echo
helpReadme >&2; exit helpReadme >&2; exit
fi fi
...@@ -463,7 +510,41 @@ if [[ $CHUNK = "" ]]; then ...@@ -463,7 +510,41 @@ if [[ $CHUNK = "" ]]; then
fi fi
case $OPERATION in case $OPERATION in
add)
OP=$STAGE_ADD
stage
ELEMENT="./$PATH_TO_ELEMENT"
if [ -d "$ELEMENT" ] || [ -f "$ELEMENT" ]; then
echo "$OP;$PATH_TO_ELEMENT" >> $STAGE_FILE
else
echo
echo -e "${ORANGE}[ERROR] '$PATH_TO_ELEMENT' did not match any files or directories."
echo -e "[INFO] Please specify a valid path.${NC}"
echo
helpReadme >&2; exit
fi
;;
remove)
OP=$STAGE_REMOVE
stage
echo "$OP;$PATH_TO_ELEMENT" >> $STAGE_FILE
;;
reset)
OP=$STAGE_RESET
stage
echo "$OP;$PATH_TO_ELEMENT" >> $STAGE_FILE
;;
status)
welcome
STATUS=$OPERATION
FILE=$ING_FILE
TEMPLATE_FILE=$ING_TEMPLATE_FILE
echo "### DATASET STATUS ###"
echo
runProcess
;;
pull) pull)
welcome
FILE=$DOWN_FILE FILE=$DOWN_FILE
TEMPLATE_FILE=$DOWN_TEMPLATE_FILE TEMPLATE_FILE=$DOWN_TEMPLATE_FILE
if [ "$STORAGE" != "" ] ; then if [ "$STORAGE" != "" ] ; then
...@@ -480,6 +561,7 @@ case $OPERATION in ...@@ -480,6 +561,7 @@ case $OPERATION in
runProcess runProcess
;; ;;
push) push)
welcome
MESSAGE="storage: $STORAGE" MESSAGE="storage: $STORAGE"
if [ "$CUSTOM" = true ] ; then if [ "$CUSTOM" = true ] ; then
FILE=$CUSTOM_ING_FILE FILE=$CUSTOM_ING_FILE
......
...@@ -7,9 +7,6 @@ module Embulk ...@@ -7,9 +7,6 @@ module Embulk
class Wendelininput < InputPlugin class Wendelininput < InputPlugin
CHUNK_SIZE = 50000000 #50mb
MEGA = 1000000
UPDATE = "U" UPDATE = "U"
RESUME = "R" RESUME = "R"
DOWNLOAD = "D" DOWNLOAD = "D"
...@@ -17,34 +14,17 @@ module Embulk ...@@ -17,34 +14,17 @@ module Embulk
Plugin.register_input("wendelin", self) Plugin.register_input("wendelin", self)
def self.warnConflicts(remote_streams, data_set, action) def self.warnConflicts(remote_streams, data_set)
if not remote_streams.empty? if not remote_streams.empty?
paths = [@data_set_directory.end_with?("/") ? @data_set_directory : @data_set_directory + "/"] conflicts = @dataset_utils.getLocalConflicts(remote_streams, data_set)
local_files = paths.map {|path|
next [] unless Dir.exist?(path)
Dir[(path + '/**/*').gsub! '//', '/']
}.flatten.select{ |file| File.file?(file) }
local_changes, a, b, c = @dataset_utils.getLocalChanges(local_files, data_set)
data_set = @data_set.end_with?("/") ? @data_set : @data_set + "/"
remote_changes = remote_streams.map { |remote|
remote = @data_set_directory + remote["reference"].reverse.sub("/".reverse, ".".reverse).reverse.sub(data_set, "")
remote.end_with?(".none") ? remote[0...-5] : remote
}
conflicts = local_changes.select{ |conflict| remote_changes.include? conflict["path"] }.map{ |conflict| conflict["path"] }
# check scenario where the last version file exists but not in report
# (due download interrumped right after save the file but before add it to report)
if action == RESUME and conflicts.length == 1 and File.exist?(conflicts[0])
@logger.warn("The file #{conflicts[0]} was detected as false positive conflict and it was not informed to user.")
conflicts = []
end
if not conflicts.empty? if not conflicts.empty?
@logger.warn("CONFLICT: there are conflicts with some of your local changes.", print=TRUE) @logger.warn("CONFLICT: there are conflicts with some of your local changes.", print=TRUE)
puts "** press key **" puts "** press key **"
option = gets option = gets
@logger.warn("Conflicted files:", print=TRUE) @logger.warn("Conflicted changes:", print=TRUE)
@logger.warn(conflicts, print=TRUE) @logger.warn(conflicts, print=TRUE)
puts puts
@logger.warn("Your local conflicted files will be overwritten by download.", print=TRUE) @logger.warn("Your local conflicted changes will be overwritten by current download.", print=TRUE)
@logger.warn("Do you want to continue? (y/n)", print=TRUE) @logger.warn("Do you want to continue? (y/n)", print=TRUE)
option = gets option = gets
option = option.chomp option = option.chomp
...@@ -52,6 +32,7 @@ module Embulk ...@@ -52,6 +32,7 @@ module Embulk
@logger.info("Download cancelled by user.", print=TRUE) @logger.info("Download cancelled by user.", print=TRUE)
@logger.abortExecution(error=FALSE) @logger.abortExecution(error=FALSE)
end end
@dataset_utils.deleteStagedFile()
end end
end end
end end
...@@ -79,26 +60,16 @@ module Embulk ...@@ -79,26 +60,16 @@ module Embulk
end end
case option case option
when action when action
@logger.info("Checking remote changes and posible local conflicts...", print=TRUE) if action != RESUME
task['data_streams'] = @dataset_utils.getRemoteChangedDataStreams(task['data_streams']) task['data_streams'] = @dataset_utils.getRemoteChangedDataStreams(task['data_streams'])
self.warnConflicts(task['data_streams'], task['data_set'], action) self.warnConflicts(task['data_streams'], task['data_set']) if action != RESUME
@dataset_utils.deleteCompletedFile() @dataset_utils.deleteCompletedFile()
if task['data_streams'].empty? if task['data_streams'].empty?
@logger.info("No new files in dataset.", print=TRUE)
@logger.info("Your downloaded dataset is already up to date.", print=TRUE) @logger.info("Your downloaded dataset is already up to date.", print=TRUE)
end end
when DOWNLOAD when DOWNLOAD
ebulk_file = @data_set_directory + "/.ebulk_dataset" @logger.info("Checking remote files and posible local conflicts...", print=TRUE)
ebulk_file_content = "" self.warnConflicts(task['data_streams'], task['data_set'])
if File.file?(ebulk_file)
ebulk_file_content = File.read(ebulk_file)
end
FileUtils.rm_rf(@data_set_directory)
unless File.directory?(@data_set_directory)
FileUtils.mkdir_p(@data_set_directory)
end
if ebulk_file_content != ""
File.open(ebulk_file, 'w') { |file| file.write(ebulk_file_content) }
end
@dataset_utils.deleteCompletedFile() @dataset_utils.deleteCompletedFile()
@dataset_utils.createReportFile() @dataset_utils.createReportFile()
when ABORT when ABORT
...@@ -114,18 +85,12 @@ module Embulk ...@@ -114,18 +85,12 @@ module Embulk
@erp5_url = config.param('erp5_url', :string) @erp5_url = config.param('erp5_url', :string)
@data_set = config.param('data_set', :string) @data_set = config.param('data_set', :string)
@logger.info("Dataset name: #{@data_set}") @logger.info("Dataset name: #{@data_set}")
if @data_set == '$DATA_SET'
@logger.error("There was an error setting the configuration file", print=TRUE)
@logger.info("Please try manual download or update manually the download configuration file.", print=TRUE)
@logger.abortExecution()
end
@user = config.param("user", :string, defualt: nil) @user = config.param("user", :string, defualt: nil)
@logger.info("User: #{@user}") @logger.info("User: #{@user}")
@password = config.param("password", :string, default: nil) @password = config.param("password", :string, default: nil)
@chunk_size = config.param('chunk_size', :float, default: 0) * MEGA @chunk_size = config.param('chunk_size', :float, default: 0) * DatasetUtils::MEGA
@output_path = config.param("output_path", :string, :default => nil) @output_path = config.param("output_path", :string, :default => nil)
if File.directory?(@output_path) if not File.directory?(@output_path)
else
@logger.error("Output directory not found.", print=TRUE) @logger.error("Output directory not found.", print=TRUE)
@logger.abortExecution() @logger.abortExecution()
end end
...@@ -141,31 +106,21 @@ module Embulk ...@@ -141,31 +106,21 @@ module Embulk
'tool_dir' => @tool_dir 'tool_dir' => @tool_dir
} }
if task['chunk_size'] == 0 if task['chunk_size'] == 0
task['chunk_size'] = CHUNK_SIZE task['chunk_size'] = DatasetUtils::CHUNK_SIZE
end end
@logger.info("Chunk size set in #{task['chunk_size']/MEGA}MB") @logger.info("Chunk size set in #{task['chunk_size']/DatasetUtils::MEGA}MB")
@data_set_directory = @output_path.end_with?("/") ? @output_path : @output_path + "/" @dataset_utils = DatasetUtils.new("")
task['data_set_directory'] = @data_set_directory task['data_set_directory'] = @dataset_utils.appendSlashTo(@output_path)
@data_set_directory = task['data_set_directory']
@dataset_utils = DatasetUtils.new(@data_set_directory) @dataset_utils = DatasetUtils.new(@data_set_directory)
@logger.info("Getting remote file list from dataset '#{@data_set}'...", print=TRUE) @logger.info("Getting remote file list from dataset '#{@data_set}'...", print=TRUE)
data_stream_list = @wendelin.getDataStreams(@data_set) data_stream_list = @wendelin.getDataStreams(@data_set)
n_retry = 0
while data_stream_list["status_code"] == 2 and n_retry < 6
sleep 10
data_stream_list = @wendelin.getDataStreams(@data_set)
n_retry += 1
end
if data_stream_list["status_code"] == 0 if data_stream_list["status_code"] == 0
if data_stream_list["result"].empty? if data_stream_list["result"].empty?
@logger.error("No valid data found for data set " + @data_set, print=TRUE) @logger.error("No valid data found for data set " + @data_set, print=TRUE)
@logger.abortExecution(error=FALSE) @logger.abortExecution(error=FALSE)
end end
task['data_streams'] = data_stream_list["result"] task['data_streams'] = data_stream_list["result"]
elsif data_stream_list["status_code"] == 2
@logger.error("Dataset '#{@data_set}' has files recently ingested waiting for processing.", print=TRUE)
@logger.error("Please retry in some minutes.", print=TRUE)
@logger.abortExecution(error=FALSE)
else else
@logger.error(data_stream_list["error_message"], print=TRUE) @logger.error(data_stream_list["error_message"], print=TRUE)
@logger.abortExecution() @logger.abortExecution()
...@@ -189,13 +144,9 @@ module Embulk ...@@ -189,13 +144,9 @@ module Embulk
self.askUserForAction(task, action=UPDATE) self.askUserForAction(task, action=UPDATE)
end end
else else
dir_entries = Dir.entries(@data_set_directory).length if not @dataset_utils.dirEmpty(@data_set_directory)
if File.file?(@data_set_directory+"/.ebulk_dataset")
dir_entries -= 1
end
if dir_entries > 2
puts puts
@logger.info("Dataset download directory is not empty! It will be overwritten: " + @data_set_directory, print=TRUE) @logger.info("Dataset download directory is not empty! Its files could be overwritten: " + @data_set_directory, print=TRUE)
@logger.info("Continue with download? (y/n)", print=TRUE) @logger.info("Continue with download? (y/n)", print=TRUE)
option = gets option = gets
option = option.chomp option = option.chomp
...@@ -203,6 +154,8 @@ module Embulk ...@@ -203,6 +154,8 @@ module Embulk
@logger.info("Download cancelled by user.", print=TRUE) @logger.info("Download cancelled by user.", print=TRUE)
@logger.abortExecution(error=FALSE) @logger.abortExecution(error=FALSE)
end end
@logger.info("Checking remote files and posible local conflicts...", print=TRUE)
self.warnConflicts(task['data_streams'], task['data_set'])
end end
@dataset_utils.createReportFile() @dataset_utils.createReportFile()
end end
...@@ -225,18 +178,7 @@ module Embulk ...@@ -225,18 +178,7 @@ module Embulk
def self.resume(task, columns, count, &control) def self.resume(task, columns, count, &control)
@logger = LogManager.instance() @logger = LogManager.instance()
task_reports = yield(task, columns, count) task_reports = yield(task, columns, count)
if task_reports.any? @dataset_utils.showTaskReport(task_reports)
@logger.info("Reports:", print=TRUE)
if task_reports.length > 15
@logger.info(task_reports[0, 5], print=TRUE)
@logger.info(".....", print=TRUE)
@logger.info(task_reports[task_reports.length-5, task_reports.length-1], print=TRUE)
else
@logger.info(task_reports, print=TRUE)
end
@logger.info("Full task report:")
@logger.info(task_reports)
end
next_config_diff = task_reports.map{|hash| hash[DatasetUtils::RUN_DONE]}.flatten.compact next_config_diff = task_reports.map{|hash| hash[DatasetUtils::RUN_DONE]}.flatten.compact
if(next_config_diff.length == count) if(next_config_diff.length == count)
if(count > 0) if(count > 0)
...@@ -245,10 +187,15 @@ module Embulk ...@@ -245,10 +187,15 @@ module Embulk
@logger.info("Dataset files are in dataset directory: " + @data_set_directory, print=TRUE) @logger.info("Dataset files are in dataset directory: " + @data_set_directory, print=TRUE)
end end
@dataset_utils.createCompletedFile() @dataset_utils.createCompletedFile()
if count > 10 else
next_config_diff = {} if(count > 0)
puts
@logger.error("Some remote files could not be downloaded. Please check the details in the log file: " + @logger.getLogPath(), print=TRUE)
@logger.info("Please retry the operation for download those files.", print=TRUE)
puts
end end
end end
next_config_diff = {}
return {DatasetUtils::RUN_DONE => next_config_diff} return {DatasetUtils::RUN_DONE => next_config_diff}
end end
...@@ -296,7 +243,6 @@ module Embulk ...@@ -296,7 +243,6 @@ module Embulk
else else
return_value = DatasetUtils::RUN_DONE return_value = DatasetUtils::RUN_DONE
end end
# update reports if operation successfully ended
if return_value == DatasetUtils::RUN_DONE if return_value == DatasetUtils::RUN_DONE
if hash.to_s == DatasetUtils::DELETE if hash.to_s == DatasetUtils::DELETE
@dataset_utils.deleteFromReport(ref, return_value) @dataset_utils.deleteFromReport(ref, return_value)
......
...@@ -35,12 +35,9 @@ module Embulk ...@@ -35,12 +35,9 @@ module Embulk
page.each do |record| page.each do |record|
reference = record[0] reference = record[0]
data_chunk = Base64.decode64(record[1]) data_chunk = Base64.decode64(record[1])
data_set_directory = @output_path.end_with?("/") ? @output_path : @output_path + "/" @dataset_utils = DatasetUtils.new("")
ref = reference.reverse.sub("/".reverse, ".".reverse).reverse.sub(record[2]+"/", "") data_set_directory = @dataset_utils.appendSlashTo(@output_path)
if ref.end_with?(".none") file_path = @dataset_utils.referenceToPath(reference, data_set_directory, record[2])
ref = ref[0...-5]
end
file_path = data_set_directory + ref
write_mode = 'ab' write_mode = 'ab'
if record[3] == DatasetUtils::DELETE if record[3] == DatasetUtils::DELETE
File.delete(file_path) if File.exist?(file_path) File.delete(file_path) if File.exist?(file_path)
...@@ -48,7 +45,7 @@ module Embulk ...@@ -48,7 +45,7 @@ module Embulk
if record[3] == TRUE.to_s if record[3] == TRUE.to_s
write_mode = 'w' write_mode = 'w'
end end
dirname = File.dirname(data_set_directory + ref) dirname = File.dirname(file_path)
unless File.directory?(dirname) unless File.directory?(dirname)
FileUtils.mkdir_p(dirname) FileUtils.mkdir_p(dirname)
end end
......
...@@ -46,11 +46,12 @@ module Embulk ...@@ -46,11 +46,12 @@ module Embulk
hash = record[7] hash = record[7]
begin begin
if eof == DatasetUtils::DELETE if eof == DatasetUtils::DELETE
reference = [dataset, filename, extension].join("/") reference = [dataset, filename, extension].join(DatasetUtils::REFERENCE_SEPARATOR)
@wendelin.delete(reference) @wendelin.delete(reference)
else else
reference = [supplier, dataset, filename, extension, eof, size, hash].join("/") reference = [supplier, dataset, filename, extension, eof, size, hash].join(DatasetUtils::REFERENCE_SEPARATOR)
if not @wendelin.ingest(reference, data_chunk) split = eof != ""
if not @wendelin.ingest(reference, data_chunk, split)
raise "could not ingest" raise "could not ingest"
end end
end end
......
require_relative '../filelogger' require_relative '../filelogger'
require_relative '../dataset_utils'
class Index class Index
include Singleton include Singleton
...@@ -19,21 +20,20 @@ module Embulk ...@@ -19,21 +20,20 @@ module Embulk
class BinaryParserPlugin < ParserPlugin class BinaryParserPlugin < ParserPlugin
Plugin.register_parser("binary", self) Plugin.register_parser("binary", self)
CHUNK_SIZE = 50
MEGA = 1000000
EOF = "EOF"
def self.transaction(config, &control) def self.transaction(config, &control)
tool_dir = config.param('tool_dir', :string, default: ".") tool_dir = config.param('tool_dir', :string, default: ".")
@logger = LogManager.instance() @logger = LogManager.instance()
@logger.setFilename(tool_dir, "parser") @logger.setFilename(tool_dir, "parser")
task = { task = {
chunk_size: config.param('chunk_size', :float, default: CHUNK_SIZE) * MEGA, chunk_size: config.param('chunk_size', :float, default: 0) * DatasetUtils::MEGA,
supplier: config.param("supplier", :string, default: "parser"), supplier: config.param("supplier", :string, default: "parser"),
data_set: config.param("data_set", :string), data_set: config.param("data_set", :string),
input_plugin: config.param("storage", :string, default: "parser"), input_plugin: config.param("storage", :string, default: "parser"),
date: Time.now.strftime("%Y-%m-%d_%H-%M-%S") date: Time.now.strftime("%Y-%m-%d_%H-%M-%S")
} }
if task['chunk_size'] == 0
task['chunk_size'] = DatasetUtils::CHUNK_SIZE
end
columns = [ columns = [
Column.new(0, "supplier", :string), Column.new(0, "supplier", :string),
Column.new(1, "data_set", :string), Column.new(1, "data_set", :string),
...@@ -71,7 +71,7 @@ module Embulk ...@@ -71,7 +71,7 @@ module Embulk
end end
private private
def each_chunk(file, filename, chunk_size=CHUNK_SIZE) def each_chunk(file, filename, chunk_size=DatasetUtils::CHUNK_SIZE)
extension = @index.to_s.rjust(3, "0") extension = @index.to_s.rjust(3, "0")
npart = 0 npart = 0
next_byte = file.read(1) next_byte = file.read(1)
...@@ -89,7 +89,7 @@ module Embulk ...@@ -89,7 +89,7 @@ module Embulk
data += file.read(chunk_size) data += file.read(chunk_size)
next_byte = file.read(1) next_byte = file.read(1)
if not next_byte if not next_byte
eof = EOF eof = DatasetUtils::EOF
if first if first
# this means that the whole file will be ingested at once (not split) # this means that the whole file will be ingested at once (not split)
eof = "" eof = ""
......
...@@ -23,6 +23,9 @@ class WendelinClient ...@@ -23,6 +23,9 @@ class WendelinClient
rescue Exception => e rescue Exception => e
@logger.error("An error occurred while checking if reference exists: " + e.to_s) @logger.error("An error occurred while checking if reference exists: " + e.to_s)
@logger.error(e.backtrace) @logger.error(e.backtrace)
if e.to_s.include? "Unauthorized" or e.to_s.include? "401"
raise e
end
return FALSE return FALSE
else else
return res.to_s == 'TRUE' return res.to_s == 'TRUE'
...@@ -53,27 +56,27 @@ class WendelinClient ...@@ -53,27 +56,27 @@ class WendelinClient
end end
end end
def ingest(reference, data_chunk) def ingest(reference, data_chunk, split)
@logger.info("Ingestion reference: #{reference}", print=TRUE) @logger.info("Ingestion reference: #{reference}", print=TRUE)
if Time.new - @last_ingestion < 2 if split and Time.new - @last_ingestion < 3
# avoid send ingestions to close (specially for split ones) # avoid to send split ingestions to close
sleep 2 sleep 3
end end
if exists(reference) if exists(reference)
@logger.info("There is another ingestion already done for the pair data_set-filename. Reference "\ @logger.info("There is another ingestion already done for the pair dataset-filename. Reference "\
+ reference, print=TRUE) + reference, print=TRUE)
@logger.info("Rename your reference or delete the older ingestion.", print=TRUE) @logger.info("Rename your file or download the full dataset to make local changes.", print=TRUE)
return FALSE return FALSE
end end
if reference.include? "#" or reference.include? "+" if reference.include? "#" or reference.include? "+"
raise "Invalid chars in file name. Please rename it." raise "invalid chars in file name. Please rename it."
end end
begin begin
uri = URI("#{@erp5_url}/ingest?reference=#{reference}") uri = URI("#{@erp5_url}/ingest?reference=#{reference}")
rescue Exception => e rescue Exception => e
@logger.error("An error occurred while generating url: " + e.to_s) @logger.error("An error occurred while generating url: " + e.to_s)
@logger.error(e.backtrace) @logger.error(e.backtrace)
raise "Invalid chars in file name. Please rename it." raise "invalid chars in file name. Please rename it."
end end
response = handleRequest(uri, reference, data_chunk) response = handleRequest(uri, reference, data_chunk)
if response == FALSE if response == FALSE
...@@ -138,7 +141,7 @@ class WendelinClient ...@@ -138,7 +141,7 @@ class WendelinClient
res = Net::HTTP.start(uri.hostname, uri.port, res = Net::HTTP.start(uri.hostname, uri.port,
:use_ssl => (uri.scheme == 'https'), :use_ssl => (uri.scheme == 'https'),
:verify_mode => OpenSSL::SSL::VERIFY_NONE, :verify_mode => OpenSSL::SSL::VERIFY_NONE,
:ssl_timeout => 20, :open_timeout => 20, :read_timeout => 20, :ssl_timeout => 300, :open_timeout => 300, :read_timeout => 300,
) do |http| ) do |http|
http.request(req) http.request(req)
end end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment