Commit 405d1bca authored by Xiaowu Zhang

parameter

parent 9238195c
...@@ -19,12 +19,12 @@ md5sum = 5b5c740ed6f30e8a058b8e767840bf6f ...@@ -19,12 +19,12 @@ md5sum = 5b5c740ed6f30e8a058b8e767840bf6f
[template-jscrawler] [template-jscrawler]
filename = instance-jscrawler.cfg.jinja2.in filename = instance-jscrawler.cfg.jinja2.in
md5sum = c955e6be5d5902bbaa0daedf09920ce7 md5sum = 4c6f373b055bc64bc3256a1791b4b4bd
[template-jscrawler] [template-jscrawler]
filename = instance-jscrawler.cfg.jinja2.in filename = instance-jscrawler.cfg.jinja2.in
md5sum = c955e6be5d5902bbaa0daedf09920ce7 md5sum = 4c6f373b055bc64bc3256a1791b4b4bd
[template-jscrawler-builder] [template-jscrawler-builder]
filename = template-jscrawler.builder.sh.in filename = template-jscrawler.builder.sh.in
md5sum = f8100800cda3532ee4cf987b34bd2c13 md5sum = 6bc7ff2b35d94c093bcf8f66138d93b7
...@@ -82,6 +82,10 @@ extensions = jinja2.ext.do ...@@ -82,6 +82,10 @@ extensions = jinja2.ext.do
list = {{ slapparameter_dict.get('urls', "").split("\n") | join('\n ') }} list = {{ slapparameter_dict.get('urls', "").split("\n") | join('\n ') }}
period = {{ slapparameter_dict.get('crawl-periodicity', 0) }} period = {{ slapparameter_dict.get('crawl-periodicity', 0) }}
depth = {{ slapparameter_dict.get('crawl-depth', 3) }} depth = {{ slapparameter_dict.get('crawl-depth', 3) }}
include_html = {{ slapparameter_dict.get('include_html', True) }}
include_js = {{ slapparameter_dict.get('include_js', False) }}
include_css = {{ slapparameter_dict.get('include_css', False) }}
include_header = {{ slapparameter_dict.get('include_header', False) }}
context = context =
key public_folder directory:www key public_folder directory:www
...@@ -90,6 +94,10 @@ context = ...@@ -90,6 +94,10 @@ context =
key period :period key period :period
key url_list :list key url_list :list
key depth :depth key depth :depth
key include_html :include_html
key include_js :include_js
key include_css :include_css
key include_header :include_header
raw shell_binary {{ bash_executable_location }} raw shell_binary {{ bash_executable_location }}
raw pid_file ${directory:run}/jscrawler.pid raw pid_file ${directory:run}/jscrawler.pid
......
...@@ -6,6 +6,10 @@ OUTPUT_DIR="{{ public_folder }}" ...@@ -6,6 +6,10 @@ OUTPUT_DIR="{{ public_folder }}"
TMP_DIR="{{ tmp_folder }}" TMP_DIR="{{ tmp_folder }}"
PERIOD="{{ period }}" PERIOD="{{ period }}"
DEPTH="{{ depth }}" DEPTH="{{ depth }}"
INCLUDE_HTML={{ include_html }}
INCLUDE_JS={{ include_js }}
INCLUDE_CSS={{ include_css }}
INCLUDE_HEADER={{ include_header }}
if [ -s "{{ pid_file}}" ]; then if [ -s "{{ pid_file}}" ]; then
echo "Crawler process already running with pid `cat {{ pid_file}}`" echo "Crawler process already running with pid `cat {{ pid_file}}`"
...@@ -17,7 +21,7 @@ trap "rm -f -- '{{ pid_file}}'" EXIT ...@@ -17,7 +21,7 @@ trap "rm -f -- '{{ pid_file}}'" EXIT
echo $$ > "{{ pid_file}}" echo $$ > "{{ pid_file}}"
crawl() { crawl() {
{{ jscrawler_wrapper }} -f $TMP_OUTPUT --link $1 -d $4 {{ jscrawler_wrapper }} -f $TMP_OUTPUT --link $1 -d $4 --include_html $5 --include_js $6 --include_css $7 --include_header $8
if [ -s "$2" ]; then if [ -s "$2" ]; then
mv $2 $3 mv $2 $3
fi fi
...@@ -36,7 +40,7 @@ check_crawl() { ...@@ -36,7 +40,7 @@ check_crawl() {
I=$((T+86400*PERIOD)) I=$((T+86400*PERIOD))
diff=$((NOW-I)) diff=$((NOW-I))
if [ "$diff" -gt 0 ]; then if [ "$diff" -gt 0 ]; then
crawl $url $tmp $sitemap $4 crawl $url $tmp $sitemap $4 $5 $6 $7 $8
else else
echo "Already crawled $url... SKIPPED" echo "Already crawled $url... SKIPPED"
fi fi
...@@ -53,8 +57,8 @@ do ...@@ -53,8 +57,8 @@ do
TMP_OUTPUT="$TMP_DIR/$NAME.xml" TMP_OUTPUT="$TMP_DIR/$NAME.xml"
if [ -s "$OUTPUT" ]; then if [ -s "$OUTPUT" ]; then
check_crawl $url $TMP_OUTPUT $OUTPUT $DEPTH check_crawl $url $TMP_OUTPUT $OUTPUT $DEPTH $INCLUDE_HTML $INCLUDE_JS $INCLUDE_CSS $INCLUDE_HEADER
else else
crawl $url $TMP_OUTPUT $OUTPUT $DEPTH crawl $url $TMP_OUTPUT $OUTPUT $DEPTH $INCLUDE_HTML $INCLUDE_JS $INCLUDE_CSS $INCLUDE_HEADER
fi fi
done done
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment