new commands for staging and reset

parent 963e3e0d
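
A minimal usage sketch of the new staging commands, inferred from the diff below (dataset layout and file names are hypothetical; add/remove/reset must run inside a root dataset directory, i.e. one containing a .ebulk_dataset file):

    cd my_dataset                   # hypothetical dataset root with a .ebulk_dataset file
    ebulk add data/sample.csv       # appends "add;data/sample.csv" to ./.staged (path must exist)
    ebulk remove old/file.bin       # appends "remove;old/file.bin" to ./.staged
    ebulk reset data/sample.csv     # appends "reset;data/sample.csv" to ./.staged
    ebulk status                    # prints the dataset status via the regular Embulk process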
@@ -22,6 +22,9 @@ GREEN='\033[0;32m'
ORANGE='\033[0;33m'
NC='\033[0m'
DEFAULT_CHUNK_SIZE="50"
STAGE_ADD="add"
STAGE_REMOVE="remove"
STAGE_RESET="reset"
function helpReadme {
echo -e "[INFO] For help, please run '${GREEN}ebulk --help${NC}'"
@@ -42,9 +45,9 @@ function checkParameters {
helpReadme >&2; return 1
fi
if [ "$STORAGE" = "" ] ; then
if [ ! -d $DATASET_DIR ]; then
if [ ! -d "$DATASET_DIR" ]; then
echo
mkdir $DATASET_DIR 2>/dev/null
mkdir "$DATASET_DIR" 2>/dev/null
if [ ! $? -eq 0 ]; then
echo
echo -e "${ORANGE}[ERROR] Dataset path not found."
@@ -55,14 +58,19 @@ function checkParameters {
fi
EBULK_DATASET_FILE="$DATASET_DIR/.ebulk_dataset"
if [[ $DATASET_DIR != $REFERENCE ]]; then
if [ "$REFERENCE" = "." ] ; then
REFERENCE=$(basename "$DATASET_DIR")
fi
DATA_SET=$REFERENCE
echo $REFERENCE > $EBULK_DATASET_FILE 2>/dev/null
echo $REFERENCE > "$EBULK_DATASET_FILE" 2>/dev/null
else
if [ -f $EBULK_DATASET_FILE ]; then
DATA_SET=$(cat "$DATASET_DIR/.ebulk_dataset")
if [ -f "$EBULK_DATASET_FILE" ]; then
DATA_SET=$(cat "$DATASET_DIR/.ebulk_dataset" 2>/dev/null)
else
DATA_SET=$(basename "$DATASET_DIR")
echo $DATA_SET > $EBULK_DATASET_FILE 2>/dev/null
if [ "$DATA_SET" != "." ] ; then
SAVE_DATASET_NAME="TRUE"
fi
fi
fi
else
@@ -70,14 +78,19 @@ function checkParameters {
fi
re='^[A-Za-z][_A-Za-z.0-9-]*$'
if ! [[ $DATA_SET =~ $re ]] ; then
if [ "$DATA_SET" = "." ] && [[ -z "$STORAGE" ]] ; then
echo
echo -e "${ORANGE}[ERROR] You are not in a dataset directory ${GREEN}'$DATA_SET'${ORANGE}.${NC}"
echo
else
echo
echo -e "${ORANGE}[ERROR] Error in argument: invalid dataset name ${GREEN}'$DATA_SET'${ORANGE}.${NC}"
echo -e "${ORANGE}[ERROR] Dataset name must start with a letter, and only alphanumerics, dots ( . ), underscores ( _ ) and hyphens ( - ) are allowed.${NC}"
echo
if [ -f $EBULK_DATASET_FILE ]; then
rm -f ${EBULK_DATASET_FILE}
fi
helpReadme >&2; return 1
elif [ ! -z "$SAVE_DATASET_NAME" ]; then
echo $DATA_SET > "$EBULK_DATASET_FILE" 2>/dev/null
fi
if [ ! -z "$CHUNK" ]; then
re='^[0-9]+$'
@@ -143,6 +156,7 @@ function updateConfigFile {
DOWN_URL=\"$DOWN_URL\"
ING_URL=\"$ING_URL\"
STORAGE=\"$STORAGE\"
STATUS=\"$STATUS\"
S3_BUCKET=\"$S3_BUCKET\"
S3_PREFIX=\"$S3_PREFIX\"
@@ -174,6 +188,7 @@ function runProcess {
return 1
fi
echo -e "[INFO] Dataset: ${GREEN}$DATA_SET${NC}"
if [ -z "$STATUS" ]; then
if [ ! -z "$CHUNK" ]; then
if [ "$CHUNK" -eq "0" ]; then
echo "[INFO] Default chunk size: $DEFAULT_CHUNK_SIZE Mb."
@@ -181,17 +196,20 @@ function runProcess {
echo "[INFO] Chunk size set in $CHUNK Mb."
fi
fi
fi
if [ -z "$STATUS" ]; then
if ! askCredentials; then
return 1
fi
fi
echo
echo "[INFO] Supplier: $USER"
updateConfigFile
echo "[INFO] Starting operation..."
if [ ! -d $LOG_DIR ]; then
mkdir $LOG_DIR 2>/dev/null
fi
$embulk run -L $TOOL_PATH/embulk-wendelin-dataset-tool $FILE $DIFF 2> "$LOG_DIR/error.log" || {
if [ -z "$STATUS" ]; then
echo
echo -e "${ORANGE}[ERROR] Embulk tool stopped its execution.${NC}"
if [ "$STORAGE" != \"\" ] ; then
@@ -200,6 +218,7 @@ function runProcess {
fi
echo "[INFO] Please check the logs in '$LOG_DIR' directory for more details."
echo
fi
}
}
@@ -383,17 +402,37 @@ function askS3parameters {
fi
}
# WELCOME
echo
echo " #########################################################################"
echo " ############## WELCOME TO EBULK INGESTION-DOWNLOAD TOOL #################"
echo " ########### This tool relies on Embulk software and Java 8 ##############"
echo " ######## Do not forget to check the README before use this tool #########"
echo " ############## In case of any problem, please contact us ###############"
echo " ####################### roqueporchetto@gmail.com ########################"
echo " ###################### Happy ingestion-download ! #######################"
echo " #########################################################################"
echo
function stage {
EBULK_DATASET_FILE="./.ebulk_dataset"
if [ ! -f "$EBULK_DATASET_FILE" ]; then
echo
echo -e "${ORANGE}[ERROR] You are not in a dataset directory."
echo -e "[INFO] $OP operation can only be run within a root dataset directory.${NC}"
echo
helpReadme >&2; exit
fi
if [[ $PATH_TO_ELEMENT = "" ]]; then
echo
echo -e "${ORANGE}[ERROR] Nothing specified, nothing to $OP."
echo -e "[INFO] Please specify a valid path.${NC}"
echo
helpReadme >&2; exit
fi
STAGE_FILE="./.staged"
}
function welcome {
echo
echo " #########################################################################"
echo " ############## WELCOME TO EBULK INGESTION-DOWNLOAD TOOL #################"
echo " ########### This tool relies on Embulk software and Java 8 ##############"
echo " ######## Do not forget to check the README before use this tool #########"
echo " ############## In case of any problem, please contact us ###############"
echo " ####################### roqueporchetto@gmail.com ########################"
echo " ###################### Happy ingestion-download ! #######################"
echo " #########################################################################"
echo
}
if [ ! -d $EBULK_DATA_PATH ]; then
mkdir $EBULK_DATA_PATH 2>/dev/null
@@ -428,14 +467,21 @@ while [ "$1" != "" ]; do
-h | --help ) cat $TOOL_PATH/help.md
exit
;;
-e | --examples ) cat $TOOL_PATH/example.md
exit
;;
-r | --readme ) less $TOOL_PATH/README.md
exit
;;
pull ) OPERATION=$1
status | push | pull ) OPERATION=$1
;;
push ) OPERATION=$1
add | remove | reset ) OPERATION=$1
shift
PATH_TO_ELEMENT=$1
REFERENCE="."
;;
*) if [[ $REFERENCE != $1 ]]; then
echo
echo -e "${ORANGE}[ERROR] Invalid parameter '$1'.${NC}"
echo
helpReadme >&2; exit
@@ -444,14 +490,15 @@ while [ "$1" != "" ]; do
shift
done
for ELEMENT in '' '-d' '--directory' '-s' '--storage' '-cs' '--custom-storage' '-a' '--advanced' '-c' '--chunk'; do
if [ "$ELEMENT" = "$REFERENCE" ]; then
REFERENCE="."
fi
done
if [[ $OPERATION = "" ]]; then
echo -e "${ORANGE}[ERROR] Please specify a valid operation.${NC}"
echo
helpReadme >&2; exit
fi
if [[ $REFERENCE = "" ]]; then
echo -e "${ORANGE}[ERROR] Dataset not specified."
echo -e "[INFO] Please specify a valid dataset.${NC}"
echo -e "${ORANGE}[ERROR] Please specify a valid operation.${NC}"
echo
helpReadme >&2; exit
fi
@@ -463,7 +510,41 @@ if [[ $CHUNK = "" ]]; then
fi
case $OPERATION in
add)
OP=$STAGE_ADD
stage
ELEMENT="./$PATH_TO_ELEMENT"
if [ -d "$ELEMENT" ] || [ -f "$ELEMENT" ]; then
echo "$OP;$PATH_TO_ELEMENT" >> $STAGE_FILE
else
echo
echo -e "${ORANGE}[ERROR] '$PATH_TO_ELEMENT' did not match any files or directories."
echo -e "[INFO] Please specify a valid path.${NC}"
echo
helpReadme >&2; exit
fi
;;
remove)
OP=$STAGE_REMOVE
stage
echo "$OP;$PATH_TO_ELEMENT" >> $STAGE_FILE
;;
reset)
OP=$STAGE_RESET
stage
echo "$OP;$PATH_TO_ELEMENT" >> $STAGE_FILE
;;
status)
welcome
STATUS=$OPERATION
FILE=$ING_FILE
TEMPLATE_FILE=$ING_TEMPLATE_FILE
echo "### DATASET STATUS ###"
echo
runProcess
;;
pull)
welcome
FILE=$DOWN_FILE
TEMPLATE_FILE=$DOWN_TEMPLATE_FILE
if [ "$STORAGE" != "" ] ; then
@@ -480,6 +561,7 @@ case $OPERATION in
runProcess
;;
push)
welcome
MESSAGE="storage: $STORAGE"
if [ "$CUSTOM" = true ] ; then
FILE=$CUSTOM_ING_FILE
......
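
The staging file written above uses one "<operation>;<path>" entry per line. A sketch of how a consumer could walk those entries in shell (not part of the diff; assumes paths contain no semicolons):

    # read each staged entry from ./.staged written by add/remove/reset
    while IFS=';' read -r op path; do
      echo "staged: $op -> $path"
    done < ./.staged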
@@ -7,9 +7,6 @@ module Embulk
class Wendelininput < InputPlugin
CHUNK_SIZE = 50000000 #50mb
MEGA = 1000000
UPDATE = "U"
RESUME = "R"
DOWNLOAD = "D"
@@ -17,34 +14,17 @@ module Embulk
Plugin.register_input("wendelin", self)
def self.warnConflicts(remote_streams, data_set, action)
def self.warnConflicts(remote_streams, data_set)
if not remote_streams.empty?
paths = [@data_set_directory.end_with?("/") ? @data_set_directory : @data_set_directory + "/"]
local_files = paths.map {|path|
next [] unless Dir.exist?(path)
Dir[(path + '/**/*').gsub! '//', '/']
}.flatten.select{ |file| File.file?(file) }
local_changes, a, b, c = @dataset_utils.getLocalChanges(local_files, data_set)
data_set = @data_set.end_with?("/") ? @data_set : @data_set + "/"
remote_changes = remote_streams.map { |remote|
remote = @data_set_directory + remote["reference"].reverse.sub("/".reverse, ".".reverse).reverse.sub(data_set, "")
remote.end_with?(".none") ? remote[0...-5] : remote
}
conflicts = local_changes.select{ |conflict| remote_changes.include? conflict["path"] }.map{ |conflict| conflict["path"] }
# check scenario where the last version file exists but not in report
# (due to the download being interrupted right after saving the file but before adding it to the report)
if action == RESUME and conflicts.length == 1 and File.exist?(conflicts[0])
@logger.warn("The file #{conflicts[0]} was detected as false positive conflict and it was not informed to user.")
conflicts = []
end
conflicts = @dataset_utils.getLocalConflicts(remote_streams, data_set)
if not conflicts.empty?
@logger.warn("CONFLICT: there are conflicts with some of your local changes.", print=TRUE)
puts "** press key **"
option = gets
@logger.warn("Conflicted files:", print=TRUE)
@logger.warn("Conflicted changes:", print=TRUE)
@logger.warn(conflicts, print=TRUE)
puts
@logger.warn("Your local conflicted files will be overwritten by download.", print=TRUE)
@logger.warn("Your local conflicted changes will be overwritten by current download.", print=TRUE)
@logger.warn("Do you want to continue? (y/n)", print=TRUE)
option = gets
option = option.chomp
@@ -52,6 +32,7 @@ module Embulk
@logger.info("Download cancelled by user.", print=TRUE)
@logger.abortExecution(error=FALSE)
end
@dataset_utils.deleteStagedFile()
end
end
end
@@ -79,26 +60,16 @@ module Embulk
end
case option
when action
@logger.info("Checking remote changes and posible local conflicts...", print=TRUE) if action != RESUME
task['data_streams'] = @dataset_utils.getRemoteChangedDataStreams(task['data_streams'])
self.warnConflicts(task['data_streams'], task['data_set'], action)
self.warnConflicts(task['data_streams'], task['data_set']) if action != RESUME
@dataset_utils.deleteCompletedFile()
if task['data_streams'].empty?
@logger.info("No new files in dataset.", print=TRUE)
@logger.info("Your downloaded dataset is already up to date.", print=TRUE)
end
when DOWNLOAD
ebulk_file = @data_set_directory + "/.ebulk_dataset"
ebulk_file_content = ""
if File.file?(ebulk_file)
ebulk_file_content = File.read(ebulk_file)
end
FileUtils.rm_rf(@data_set_directory)
unless File.directory?(@data_set_directory)
FileUtils.mkdir_p(@data_set_directory)
end
if ebulk_file_content != ""
File.open(ebulk_file, 'w') { |file| file.write(ebulk_file_content) }
end
@logger.info("Checking remote files and posible local conflicts...", print=TRUE)
self.warnConflicts(task['data_streams'], task['data_set'])
@dataset_utils.deleteCompletedFile()
@dataset_utils.createReportFile()
when ABORT
@@ -114,18 +85,12 @@ module Embulk
@erp5_url = config.param('erp5_url', :string)
@data_set = config.param('data_set', :string)
@logger.info("Dataset name: #{@data_set}")
if @data_set == '$DATA_SET'
@logger.error("There was an error setting the configuration file", print=TRUE)
@logger.info("Please try manual download or update manually the download configuration file.", print=TRUE)
@logger.abortExecution()
end
@user = config.param("user", :string, default: nil)
@logger.info("User: #{@user}")
@password = config.param("password", :string, default: nil)
@chunk_size = config.param('chunk_size', :float, default: 0) * MEGA
@chunk_size = config.param('chunk_size', :float, default: 0) * DatasetUtils::MEGA
@output_path = config.param("output_path", :string, :default => nil)
if File.directory?(@output_path)
else
if not File.directory?(@output_path)
@logger.error("Output directory not found.", print=TRUE)
@logger.abortExecution()
end
@@ -141,31 +106,21 @@ module Embulk
'tool_dir' => @tool_dir
}
if task['chunk_size'] == 0
task['chunk_size'] = CHUNK_SIZE
task['chunk_size'] = DatasetUtils::CHUNK_SIZE
end
@logger.info("Chunk size set in #{task['chunk_size']/MEGA}MB")
@data_set_directory = @output_path.end_with?("/") ? @output_path : @output_path + "/"
task['data_set_directory'] = @data_set_directory
@logger.info("Chunk size set in #{task['chunk_size']/DatasetUtils::MEGA}MB")
@dataset_utils = DatasetUtils.new("")
task['data_set_directory'] = @dataset_utils.appendSlashTo(@output_path)
@data_set_directory = task['data_set_directory']
@dataset_utils = DatasetUtils.new(@data_set_directory)
@logger.info("Getting remote file list from dataset '#{@data_set}'...", print=TRUE)
data_stream_list = @wendelin.getDataStreams(@data_set)
n_retry = 0
while data_stream_list["status_code"] == 2 and n_retry < 6
sleep 10
data_stream_list = @wendelin.getDataStreams(@data_set)
n_retry += 1
end
if data_stream_list["status_code"] == 0
if data_stream_list["result"].empty?
@logger.error("No valid data found for data set " + @data_set, print=TRUE)
@logger.abortExecution(error=FALSE)
end
task['data_streams'] = data_stream_list["result"]
elsif data_stream_list["status_code"] == 2
@logger.error("Dataset '#{@data_set}' has files recently ingested waiting for processing.", print=TRUE)
@logger.error("Please retry in some minutes.", print=TRUE)
@logger.abortExecution(error=FALSE)
else
@logger.error(data_stream_list["error_message"], print=TRUE)
@logger.abortExecution()
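
The polling added above can be read as the following shell sketch (status_code 2 means the dataset still has recently ingested files waiting for processing; the 10-second sleep and six-attempt limit are the values from the diff):

    # retry while the server reports the dataset as still processing
    n_retry=0
    while [ "$status_code" = "2" ] && [ "$n_retry" -lt 6 ]; do
      sleep 10
      # ...re-request the data stream list and refresh $status_code here...
      n_retry=$((n_retry + 1))
    done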
@@ -189,13 +144,9 @@ module Embulk
self.askUserForAction(task, action=UPDATE)
end
else
dir_entries = Dir.entries(@data_set_directory).length
if File.file?(@data_set_directory+"/.ebulk_dataset")
dir_entries -= 1
end
if dir_entries > 2
if not @dataset_utils.dirEmpty(@data_set_directory)
puts
@logger.info("Dataset download directory is not empty! It will be overwritten: " + @data_set_directory, print=TRUE)
@logger.info("Dataset download directory is not empty! Its files could be overwritten: " + @data_set_directory, print=TRUE)
@logger.info("Continue with download? (y/n)", print=TRUE)
option = gets
option = option.chomp
@@ -203,6 +154,8 @@ module Embulk
@logger.info("Download cancelled by user.", print=TRUE)
@logger.abortExecution(error=FALSE)
end
@logger.info("Checking remote files and posible local conflicts...", print=TRUE)
self.warnConflicts(task['data_streams'], task['data_set'])
end
@dataset_utils.createReportFile()
end
@@ -225,18 +178,7 @@ module Embulk
def self.resume(task, columns, count, &control)
@logger = LogManager.instance()
task_reports = yield(task, columns, count)
if task_reports.any?
@logger.info("Reports:", print=TRUE)
if task_reports.length > 15
@logger.info(task_reports[0, 5], print=TRUE)
@logger.info(".....", print=TRUE)
@logger.info(task_reports[task_reports.length-5, task_reports.length-1], print=TRUE)
else
@logger.info(task_reports, print=TRUE)
end
@logger.info("Full task report:")
@logger.info(task_reports)
end
@dataset_utils.showTaskReport(task_reports)
next_config_diff = task_reports.map{|hash| hash[DatasetUtils::RUN_DONE]}.flatten.compact
if(next_config_diff.length == count)
if(count > 0)
@@ -245,10 +187,15 @@ module Embulk
@logger.info("Dataset files are in dataset directory: " + @data_set_directory, print=TRUE)
end
@dataset_utils.createCompletedFile()
if count > 10
next_config_diff = {}
else
if(count > 0)
puts
@logger.error("Some remote files could not be downloaded. Please check the details in the log file: " + @logger.getLogPath(), print=TRUE)
@logger.info("Please retry the operation for download those files.", print=TRUE)
puts
end
end
next_config_diff = {}
return {DatasetUtils::RUN_DONE => next_config_diff}
end
@@ -296,7 +243,6 @@ module Embulk
else
return_value = DatasetUtils::RUN_DONE
end
# update reports if operation successfully ended
if return_value == DatasetUtils::RUN_DONE
if hash.to_s == DatasetUtils::DELETE
@dataset_utils.deleteFromReport(ref, return_value)
......
@@ -35,12 +35,9 @@ module Embulk
page.each do |record|
reference = record[0]
data_chunk = Base64.decode64(record[1])
data_set_directory = @output_path.end_with?("/") ? @output_path : @output_path + "/"
ref = reference.reverse.sub("/".reverse, ".".reverse).reverse.sub(record[2]+"/", "")
if ref.end_with?(".none")
ref = ref[0...-5]
end
file_path = data_set_directory + ref
@dataset_utils = DatasetUtils.new("")
data_set_directory = @dataset_utils.appendSlashTo(@output_path)
file_path = @dataset_utils.referenceToPath(reference, data_set_directory, record[2])
write_mode = 'ab'
if record[3] == DatasetUtils::DELETE
File.delete(file_path) if File.exist?(file_path)
@@ -48,7 +45,7 @@ module Embulk
if record[3] == TRUE.to_s
write_mode = 'w'
end
dirname = File.dirname(data_set_directory + ref)
dirname = File.dirname(file_path)
unless File.directory?(dirname)
FileUtils.mkdir_p(dirname)
end
......
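
For context, the mapping that referenceToPath now centralizes, sketched in shell from the inline code it replaces (the last "/" in a reference becomes ".", the dataset prefix is stripped, and a trailing ".none" marks a file without extension; sample values are hypothetical):

    ref="my_dataset/docs/report/txt"
    path="${ref#my_dataset/}"        # strip the dataset prefix -> docs/report/txt
    path="${path%/*}.${path##*/}"    # last "/" becomes "." -> docs/report.txt
    case "$path" in *.none) path="${path%.none}";; esac  # drop the ".none" marker
    echo "$path"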
@@ -46,11 +46,12 @@ module Embulk
hash = record[7]
begin
if eof == DatasetUtils::DELETE
reference = [dataset, filename, extension].join("/")
reference = [dataset, filename, extension].join(DatasetUtils::REFERENCE_SEPARATOR)
@wendelin.delete(reference)
else
reference = [supplier, dataset, filename, extension, eof, size, hash].join("/")
if not @wendelin.ingest(reference, data_chunk)
reference = [supplier, dataset, filename, extension, eof, size, hash].join(DatasetUtils::REFERENCE_SEPARATOR)
split = eof != ""
if not @wendelin.ingest(reference, data_chunk, split)
raise "could not ingest"
end
end
......
require_relative '../filelogger'
require_relative '../dataset_utils'
class Index
include Singleton
@@ -19,21 +20,20 @@ module Embulk
class BinaryParserPlugin < ParserPlugin
Plugin.register_parser("binary", self)
CHUNK_SIZE = 50
MEGA = 1000000
EOF = "EOF"
def self.transaction(config, &control)
tool_dir = config.param('tool_dir', :string, default: ".")
@logger = LogManager.instance()
@logger.setFilename(tool_dir, "parser")
task = {
chunk_size: config.param('chunk_size', :float, default: CHUNK_SIZE) * MEGA,
chunk_size: config.param('chunk_size', :float, default: 0) * DatasetUtils::MEGA,
supplier: config.param("supplier", :string, default: "parser"),
data_set: config.param("data_set", :string),
input_plugin: config.param("storage", :string, default: "parser"),
date: Time.now.strftime("%Y-%m-%d_%H-%M-%S")
}
if task['chunk_size'] == 0
task['chunk_size'] = DatasetUtils::CHUNK_SIZE
end
columns = [
Column.new(0, "supplier", :string),
Column.new(1, "data_set", :string),
@@ -71,7 +71,7 @@ module Embulk
end
private
def each_chunk(file, filename, chunk_size=CHUNK_SIZE)
def each_chunk(file, filename, chunk_size=DatasetUtils::CHUNK_SIZE)
extension = @index.to_s.rjust(3, "0")
npart = 0
next_byte = file.read(1)
@@ -89,7 +89,7 @@ module Embulk
data += file.read(chunk_size)
next_byte = file.read(1)
if not next_byte
eof = EOF
eof = DatasetUtils::EOF
if first
# this means that the whole file will be ingested at once (not split)
eof = ""
......
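
The parser's chunking convention above (fixed-size parts named with a three-digit, zero-padded index) is roughly what coreutils split produces; a rough shell analogy with a hypothetical input file and the 50 MB default:

    # split -b reads 50m as 50 MiB, close to the tool's 50000000-byte default
    split -b 50m -d -a 3 big_file.bin big_file.bin.
    # -> big_file.bin.000, big_file.bin.001, ... like the parser's part extensions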
@@ -23,6 +23,9 @@ class WendelinClient
rescue Exception => e
@logger.error("An error occurred while checking if reference exists: " + e.to_s)
@logger.error(e.backtrace)
if e.to_s.include? "Unauthorized" or e.to_s.include? "401"
raise e
end
return FALSE
else
return res.to_s == 'TRUE'
@@ -53,27 +56,27 @@ class WendelinClient
end
end
def ingest(reference, data_chunk)
def ingest(reference, data_chunk, split)
@logger.info("Ingestion reference: #{reference}", print=TRUE)
if Time.new - @last_ingestion < 2
# avoid sending ingestions too close together (especially for split ones)
sleep 2
if split and Time.new - @last_ingestion < 3
# avoid sending split ingestions too close together
sleep 3
end
if exists(reference)
@logger.info("There is another ingestion already done for the pair data_set-filename. Reference "\
@logger.info("There is another ingestion already done for the pair dataset-filename. Reference "\
+ reference, print=TRUE)
@logger.info("Rename your reference or delete the older ingestion.", print=TRUE)
@logger.info("Rename your file or download the full dataset to make local changes.", print=TRUE)
return FALSE
end
if reference.include? "#" or reference.include? "+"
raise "Invalid chars in file name. Please rename it."
raise "invalid chars in file name. Please rename it."
end
begin
uri = URI("#{@erp5_url}/ingest?reference=#{reference}")
rescue Exception => e
@logger.error("An error occurred while generating url: " + e.to_s)
@logger.error(e.backtrace)
raise "Invalid chars in file name. Please rename it."
raise "invalid chars in file name. Please rename it."
end
response = handleRequest(uri, reference, data_chunk)
if response == FALSE
@@ -138,7 +141,7 @@ class WendelinClient
res = Net::HTTP.start(uri.hostname, uri.port,
:use_ssl => (uri.scheme == 'https'),
:verify_mode => OpenSSL::SSL::VERIFY_NONE,
:ssl_timeout => 20, :open_timeout => 20, :read_timeout => 20,
:ssl_timeout => 300, :open_timeout => 300, :read_timeout => 300,
) do |http|
http.request(req)
end
......
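
A hypothetical curl equivalent of the ingestion request assembled above (host, credentials, and the reference fields are placeholders; the reference layout assumes the former "/" separator, and the 300-second limit mirrors the new ssl/open/read timeouts):

    curl --user "$USER:$PASSWORD" --max-time 300 \
         --data-binary @chunk.bin \
         "https://erp5.example.com/ingest?reference=supplier/dataset/file/ext/EOF/1024/abc123"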