Commit 9238195c authored by Xiaowu Zhang

jscrawler: extend crawler to check site

parent 70db124b
@@ -15,12 +15,16 @@
[instance]
filename = instance.cfg.in
md5sum = 7333d1dfd4e8e4c375f7f1748292f554
md5sum = 5b5c740ed6f30e8a058b8e767840bf6f
[template-jscrawler]
filename = instance-jscrawler.cfg.jinja2.in
md5sum = ae9269ab5b1cce77016f822024d2d996
md5sum = c955e6be5d5902bbaa0daedf09920ce7
[template-jscrawler]
filename = instance-jscrawler.cfg.jinja2.in
md5sum = c955e6be5d5902bbaa0daedf09920ce7
[template-jscrawler-builder]
filename = template-jscrawler.builder.sh.in
md5sum = c5e8f8b3983d5f572a564b34fa0f7499
md5sum = f8100800cda3532ee4cf987b34bd2c13
{% set part_list = [] -%}
{% macro section(name) %}{% do part_list.append(name) %}{{ name }}{% endmacro -%}
{% set site_list = slapparameter_dict.get('urls', "").split("\n") -%}
[directory]
recipe = slapos.cookbook:mkdirectory
etc = ${buildout:directory}/etc
@@ -77,12 +81,15 @@ output = ${directory:bin}/jscrawler-build
extensions = jinja2.ext.do
list = {{ slapparameter_dict.get('urls', "").split("\n") | join('\n ') }}
period = {{ slapparameter_dict.get('crawl-periodicity', 0) }}
depth = {{ slapparameter_dict.get('crawl-depth', 3) }}
context =
key public_folder directory:www
key tmp_folder directory:tmp
key jscrawler_wrapper jscrawler-wrapper:wrapper-path
key period :period
key url_list :list
key depth :depth
raw shell_binary {{ bash_executable_location }}
raw pid_file ${directory:run}/jscrawler.pid
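With neither crawl-periodicity nor crawl-depth supplied by the requester, the options above fall back to their defaults, so the builder script always receives a depth. An illustrative rendering, assuming a single hypothetical URL in the urls parameter:

    list = https://www.example.com
    period = 0
    depth = 3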
@@ -94,6 +101,16 @@ frequency = * * * * *
command = ${jscrawler-build-wrapper:output}
{% for site in site_list %}
{% set new_site = site.split('//')[1].strip() -%}
[{{ section('check-http-header-for-' + new_site) }}]
<= monitor-promise-base
promise = check_site_state
name = {{ 'check-site-state-for-' + new_site.replace('.', '-') + '.py'}}
config-site-state-file = ${directory:www}/{{ new_site + '.xml'}}
{% endfor %}
[publish-connection-information]
<= monitor-publish
recipe = slapos.cookbook:publish
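To make the new loop concrete: assuming the urls parameter contains a single entry https://www.example.com (an illustrative value, not taken from the commit), the template would render roughly this promise section, pointing the check_site_state promise at that site's generated sitemap:

    [check-http-header-for-www.example.com]
    <= monitor-promise-base
    promise = check_site_state
    name = check-site-state-for-www-example-com.py
    config-site-state-file = ${directory:www}/www.example.com.xml

Because each section name goes through the section() macro, it is also appended to part_list, which is what the {{ part_list | join(...) }} addition to parts in the next hunk picks up.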
@@ -110,6 +127,7 @@ parts =
httpd-wrapper
httpd-listen-promise
jscrawler-frontend-promise
{{ part_list | join('\n ') }}
eggs-directory = {{ eggs_directory }}
develop-eggs-directory = {{ develop_eggs_directory }}
......
[buildout]
develop = /srv/slapgrid/slappart78/srv/project/slapos.toolbox
parts = switch-softwaretype
# std stuff for slapos instance
......
@@ -19,12 +19,15 @@ parts =
[nodejs]
<= nodejs-12.18.3
[jscrawler]
[jscrawlerxxxx]
recipe = slapos.recipe.build:gitclone
repository = https://lab.nexedi.com/Mynij/mynij-crawler.git
revision = ccbdfdc4712c008034b891f081be92b9342c48ac
git-executable = ${git:location}/bin/git
[jscrawler]
location = /srv/slapgrid/slappart78/srv/project/mynij-crawler
[jscrawler-build]
recipe = plone.recipe.command
......
@@ -5,6 +5,7 @@ URLS="{{ url_list }}"
OUTPUT_DIR="{{ public_folder }}"
TMP_DIR="{{ tmp_folder }}"
PERIOD="{{ period }}"
DEPTH="{{ depth }}"
if [ -s "{{ pid_file}}" ]; then
echo "Crawler process already running with pid `cat {{ pid_file}}`"
@@ -16,7 +17,7 @@ trap "rm -f -- '{{ pid_file}}'" EXIT
echo $$ > "{{ pid_file}}"
crawl() {
{{ jscrawler_wrapper }} -f $TMP_OUTPUT --link $1
{{ jscrawler_wrapper }} -f $TMP_OUTPUT --link $1 -d $4
if [ -s "$2" ]; then
mv $2 $3
fi
@@ -35,7 +36,7 @@ check_crawl() {
I=$((T+86400*PERIOD))
diff=$((NOW-I))
if [ "$diff" -gt 0 ]; then
crawl $url $tmp $sitemap
crawl $url $tmp $sitemap $4
else
echo "Already crawled $url... SKIPPED"
fi
@@ -52,8 +53,8 @@ do
TMP_OUTPUT="$TMP_DIR/$NAME.xml"
if [ -s "$OUTPUT" ]; then
check_crawl $url $TMP_OUTPUT $OUTPUT
check_crawl $url $TMP_OUTPUT $OUTPUT $DEPTH
else
crawl $url $TMP_OUTPUT $OUTPUT
crawl $url $TMP_OUTPUT $OUTPUT $DEPTH
fi
done
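Read together, the script changes thread the configured depth through as a fourth positional argument. A condensed, illustrative sketch of the rendered flow; the wrapper path, URL and directories below are assumptions, not values from the commit:

    #!/bin/bash
    # Illustrative rendering of the builder script after this change (values assumed).
    JSCRAWLER_WRAPPER="/opt/jscrawler/bin/jscrawler-wrapper"  # {{ jscrawler_wrapper }}
    DEPTH="3"                                                 # {{ depth }}, i.e. crawl-depth
    url="https://www.example.com"
    TMP_OUTPUT="/srv/tmp/www.example.com.xml"                 # {{ tmp_folder }}/<site>.xml
    OUTPUT="/srv/public/www.example.com.xml"                  # {{ public_folder }}/<site>.xml

    crawl() {
      # $1 = site URL, $2 = temporary sitemap, $3 = published sitemap, $4 = crawl depth
      "$JSCRAWLER_WRAPPER" -f "$TMP_OUTPUT" --link "$1" -d "$4"
      if [ -s "$2" ]; then
        mv "$2" "$3"   # publish the sitemap once the crawl produced output
      fi
    }

    # Both call sites now pass $DEPTH as the 4th argument; check_crawl simply forwards
    # it ($4) to crawl when the existing sitemap is older than PERIOD days.
    crawl "$url" "$TMP_OUTPUT" "$OUTPUT" "$DEPTH"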