Commit 405d1bca authored by Xiaowu Zhang

parameter

parent 9238195c
......@@ -19,12 +19,12 @@ md5sum = 5b5c740ed6f30e8a058b8e767840bf6f
[template-jscrawler]
filename = instance-jscrawler.cfg.jinja2.in
md5sum = c955e6be5d5902bbaa0daedf09920ce7
md5sum = 4c6f373b055bc64bc3256a1791b4b4bd
[template-jscrawler]
filename = instance-jscrawler.cfg.jinja2.in
md5sum = c955e6be5d5902bbaa0daedf09920ce7
md5sum = 4c6f373b055bc64bc3256a1791b4b4bd
[template-jscrawler-builder]
filename = template-jscrawler.builder.sh.in
md5sum = f8100800cda3532ee4cf987b34bd2c13
md5sum = 6bc7ff2b35d94c093bcf8f66138d93b7
......@@ -82,6 +82,10 @@ extensions = jinja2.ext.do
list = {{ slapparameter_dict.get('urls', "").split("\n") | join('\n ') }}
period = {{ slapparameter_dict.get('crawl-periodicity', 0) }}
depth = {{ slapparameter_dict.get('crawl-depth', 3) }}
include_html = {{ slapparameter_dict.get('include_html', True) }}
include_js = {{ slapparameter_dict.get('include_js', False) }}
include_css = {{ slapparameter_dict.get('include_css', False) }}
include_header = {{ slapparameter_dict.get('include_header', False) }}
context =
key public_folder directory:www
......@@ -90,6 +94,10 @@ context =
key period :period
key url_list :list
key depth :depth
key include_html :include_html
key include_js :include_js
key include_css :include_css
key include_header :include_header
raw shell_binary {{ bash_executable_location }}
raw pid_file ${directory:run}/jscrawler.pid
......
......@@ -6,6 +6,10 @@ OUTPUT_DIR="{{ public_folder }}"
TMP_DIR="{{ tmp_folder }}"
PERIOD="{{ period }}"
DEPTH="{{ depth }}"
INCLUDE_HTML={{ include_html }}
INCLUDE_JS={{ include_js }}
INCLUDE_CSS={{ include_css }}
INCLUDE_HEADER={{ include_header }}
if [ -s "{{ pid_file}}" ]; then
echo "Crawler process already running with pid `cat {{ pid_file}}`"
......@@ -17,7 +21,7 @@ trap "rm -f -- '{{ pid_file}}'" EXIT
echo $$ > "{{ pid_file}}"
crawl() {
{{ jscrawler_wrapper }} -f $TMP_OUTPUT --link $1 -d $4
{{ jscrawler_wrapper }} -f $TMP_OUTPUT --link $1 -d $4 --include_html $5 --include_js $6 --include_css $7 --include_header $8
if [ -s "$2" ]; then
mv $2 $3
fi
......@@ -36,7 +40,7 @@ check_crawl() {
I=$((T+86400*PERIOD))
diff=$((NOW-I))
if [ "$diff" -gt 0 ]; then
crawl $url $tmp $sitemap $4
crawl $url $tmp $sitemap $4 $5 $6 $7 $8
else
echo "Already crawled $url... SKIPPED"
fi
......@@ -53,8 +57,8 @@ do
TMP_OUTPUT="$TMP_DIR/$NAME.xml"
if [ -s "$OUTPUT" ]; then
check_crawl $url $TMP_OUTPUT $OUTPUT $DEPTH
check_crawl $url $TMP_OUTPUT $OUTPUT $DEPTH $INCLUDE_HTML $INCLUDE_JS $INCLUDE_CSS $INCLUDE_HEADER
else
crawl $url $TMP_OUTPUT $OUTPUT $DEPTH
crawl $url $TMP_OUTPUT $OUTPUT $DEPTH $INCLUDE_HTML $INCLUDE_JS $INCLUDE_CSS $INCLUDE_HEADER
fi
done
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment