mirror of https://github.com/CIRCL/AIL-framework
fix: [Crawler] Restart Splash on failure, limit unbounded in-memory cache (maxrss)
parent
0358b0cf58
commit
b3b75ccbea
|
@ -10,10 +10,12 @@ import datetime
|
|||
import base64
|
||||
import redis
|
||||
import json
|
||||
import time
|
||||
|
||||
from scrapy.spidermiddlewares.httperror import HttpError
|
||||
from twisted.internet.error import DNSLookupError
|
||||
from twisted.internet.error import TimeoutError
|
||||
from twisted.web._newclient import ResponseNeverReceived
|
||||
|
||||
from scrapy import Spider
|
||||
from scrapy.linkextractors import LinkExtractor
|
||||
|
@ -39,6 +41,7 @@ class TorSplashCrawler():
|
|||
'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,},
|
||||
'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
|
||||
'HTTPERROR_ALLOW_ALL': True,
|
||||
'RETRY_TIMES': 2,
|
||||
'DEPTH_LIMIT': crawler_depth_limit
|
||||
})
|
||||
|
||||
|
@ -97,7 +100,7 @@ class TorSplashCrawler():
|
|||
yield SplashRequest(
|
||||
self.start_urls,
|
||||
self.parse,
|
||||
#errback=self.errback_catcher,
|
||||
errback=self.errback_catcher,
|
||||
endpoint='render.json',
|
||||
meta={'father': self.original_paste},
|
||||
args={ 'html': 1,
|
||||
|
@ -174,7 +177,7 @@ class TorSplashCrawler():
|
|||
yield SplashRequest(
|
||||
link.url,
|
||||
self.parse,
|
||||
#errback=self.errback_catcher,
|
||||
errback=self.errback_catcher,
|
||||
endpoint='render.json',
|
||||
meta={'father': relative_filename_paste},
|
||||
args={ 'html': 1,
|
||||
|
@ -184,17 +187,39 @@ class TorSplashCrawler():
|
|||
'wait': 10}
|
||||
)
|
||||
|
||||
'''
|
||||
def errback_catcher(self, failure):
|
||||
# catch all errback failures,
|
||||
self.logger.error(repr(failure))
|
||||
print('failure')
|
||||
#print(failure)
|
||||
print(failure.type)
|
||||
#print(failure.request.meta['item'])
|
||||
|
||||
if failure.check(ResponseNeverReceived):
|
||||
request = failure.request
|
||||
url = request.meta['splash']['args']['url']
|
||||
father = request.meta['father']
|
||||
|
||||
self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
|
||||
time.sleep(10)
|
||||
yield SplashRequest(
|
||||
url,
|
||||
self.parse,
|
||||
errback=self.errback_catcher,
|
||||
endpoint='render.json',
|
||||
meta={'father': father},
|
||||
args={ 'html': 1,
|
||||
'png': 1,
|
||||
'render_all': 1,
|
||||
'har': 1,
|
||||
'wait': 10}
|
||||
)
|
||||
|
||||
else:
|
||||
print('failure')
|
||||
#print(failure)
|
||||
print(failure.type)
|
||||
#print(failure.request.meta['item'])
|
||||
|
||||
'''
|
||||
#if isinstance(failure.value, HttpError):
|
||||
if failure.check(HttpError):
|
||||
elif failure.check(HttpError):
|
||||
# you can get the response
|
||||
response = failure.value.response
|
||||
print('HttpError')
|
||||
|
@ -214,7 +239,7 @@ class TorSplashCrawler():
|
|||
print('TimeoutError')
|
||||
print(TimeoutError)
|
||||
self.logger.error('TimeoutError on %s', request.url)
|
||||
'''
|
||||
'''
|
||||
|
||||
def save_crawled_paste(self, filename, content):
|
||||
|
||||
|
|
|
@ -5,12 +5,15 @@ usage() { echo "Usage: sudo $0 [-f <config_absolute_path>] [-p <port_start>] [-n
|
|||
echo " -p: number of the first splash server port number. This number is incremented for the others splash server";
|
||||
echo " -n: number of splash servers to start";
|
||||
echo "";
|
||||
echo " -options:";
|
||||
echo " -u: max unbound in-memory cache (Mb, Restart Splash when full, default=3000 Mb)";
|
||||
echo "";
|
||||
echo "example:";
|
||||
echo "sudo ./launch_splash_crawler.sh -f /home/my_user/AIL-framework/configs/docker/splash_onion/etc/splash/proxy-profiles/ -p 8050 -n 3";
|
||||
exit 1;
|
||||
}
|
||||
|
||||
while getopts ":p:f:n:" o; do
|
||||
while getopts ":p:f:n:u:" o; do
|
||||
case "${o}" in
|
||||
p)
|
||||
p=${OPTARG}
|
||||
|
@ -21,6 +24,9 @@ while getopts ":p:f:n:" o; do
|
|||
n)
|
||||
n=${OPTARG}
|
||||
;;
|
||||
u)
|
||||
u=${OPTARG}
|
||||
;;
|
||||
*)
|
||||
usage
|
||||
;;
|
||||
|
@ -28,6 +34,10 @@ while getopts ":p:f:n:" o; do
|
|||
done
|
||||
shift $((OPTIND-1))
|
||||
|
||||
if [ -z "${u}" ]; then
|
||||
u=3000;
|
||||
fi
|
||||
|
||||
if [ -z "${p}" ] || [ -z "${f}" ] || [ -z "${n}" ]; then
|
||||
usage;
|
||||
fi
|
||||
|
@ -52,7 +62,7 @@ sleep 0.1
|
|||
|
||||
for ((i=0;i<=$((${n} - 1));i++)); do
|
||||
port_number=$((${p} + $i))
|
||||
screen -S "Docker_Splash" -X screen -t "docker_splash:$port_number" bash -c 'sudo docker run -p '$port_number':8050 --cpus=1 --memory=4.5G -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash; read x'
|
||||
screen -S "Docker_Splash" -X screen -t "docker_splash:$port_number" bash -c 'sudo docker run -d -p '$port_number':8050 --restart=always --cpus=1 --memory=4.5G -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash --maxrss '$u'; read x'
|
||||
sleep 0.1
|
||||
printf "$GREEN Splash server launched on port $port_number$DEFAULT\n"
|
||||
done
|
||||
|
|
Loading…
Reference in New Issue