mirror of https://github.com/CIRCL/AIL-framework
chg: [crawler] add infos
parent
e4a5f66a10
commit
c49e871ba8
|
@ -36,7 +36,8 @@ def crawl_onion(url, domain, date, date_month, message):
|
||||||
r_onion.sadd('{}_crawler_queue'.format(type_hidden_service), message)
|
r_onion.sadd('{}_crawler_queue'.format(type_hidden_service), message)
|
||||||
|
|
||||||
print('--------------------------------------')
|
print('--------------------------------------')
|
||||||
print(' DOCKER SPLASH DOWN')
|
print(' \033[91m DOCKER SPLASH DOWN\033[0m')
|
||||||
|
print(' {} DOWN'.format(splash_url))
|
||||||
exit(0)
|
exit(0)
|
||||||
|
|
||||||
if r.status_code == 200:
|
if r.status_code == 200:
|
||||||
|
@ -94,7 +95,6 @@ if __name__ == '__main__':
|
||||||
print('incorrect crawler type: {}'.format(type_hidden_service))
|
print('incorrect crawler type: {}'.format(type_hidden_service))
|
||||||
exit(0)
|
exit(0)
|
||||||
|
|
||||||
print('crawler type: {}'.format(type_hidden_service))
|
|
||||||
print('splash url: {}'.format(splash_url))
|
print('splash url: {}'.format(splash_url))
|
||||||
|
|
||||||
crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")
|
crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")
|
||||||
|
@ -132,13 +132,11 @@ if __name__ == '__main__':
|
||||||
message = r_onion.spop('mess_onion')
|
message = r_onion.spop('mess_onion')
|
||||||
|
|
||||||
if message is not None:
|
if message is not None:
|
||||||
print(message)
|
|
||||||
|
|
||||||
splitted = message.split(';')
|
splitted = message.split(';')
|
||||||
if len(splitted) == 2:
|
if len(splitted) == 2:
|
||||||
url, paste = splitted
|
url, paste = splitted
|
||||||
paste = paste.replace(PASTES_FOLDER+'/', '')
|
paste = paste.replace(PASTES_FOLDER+'/', '')
|
||||||
print(paste)
|
|
||||||
'''
|
'''
|
||||||
if not '.onion' in url:
|
if not '.onion' in url:
|
||||||
print('not onion')
|
print('not onion')
|
||||||
|
@ -157,9 +155,9 @@ if __name__ == '__main__':
|
||||||
|
|
||||||
domain_url = 'http://{}'.format(domain)
|
domain_url = 'http://{}'.format(domain)
|
||||||
|
|
||||||
print('------------------START CRAWLER------------------')
|
print('\033[92m------------------START CRAWLER------------------\033[0m')
|
||||||
print(type_hidden_service)
|
print('crawler type: {}'.format(type_hidden_service))
|
||||||
print('-------------------------------------------------')
|
print('\033[92m-------------------------------------------------\033[0m')
|
||||||
print('url: {}'.format(url))
|
print('url: {}'.format(url))
|
||||||
print('domain: {}'.format(domain))
|
print('domain: {}'.format(domain))
|
||||||
print('domain_url: {}'.format(domain_url))
|
print('domain_url: {}'.format(domain_url))
|
||||||
|
|
|
@ -1,6 +1,14 @@
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
#usage() { echo "Usage: sudo $0 [-f <config absolute_path>] [-p <port_start>] [-n <number_of_splash>]" 1>&2; exit 1; }
|
usage() { echo "Usage: sudo $0 [-f <config_absolute_path>] [-p <port_start>] [-n <number_of_splash_servers>]" 1>&2;
|
||||||
|
echo " -f: absolute path to splash docker proxy-profiles directory (used for proxy configuration)"
|
||||||
|
echo " -p: number of the first splash server port number. This number is incremented for the others splash server"
|
||||||
|
echo " -n: number of splash servers to start"
|
||||||
|
echo ""
|
||||||
|
echo "example:"
|
||||||
|
echo "sudo ./launch_splash_crawler.sh -f /home/my_user/AIL-framework/configs/docker/splash_onion/etc/splash/proxy-profiles/ -p 8050 -n 3"
|
||||||
|
exit 1;
|
||||||
|
}
|
||||||
|
|
||||||
while getopts ":p:f:n:" o; do
|
while getopts ":p:f:n:" o; do
|
||||||
case "${o}" in
|
case "${o}" in
|
||||||
|
@ -25,14 +33,11 @@ if [ -z "${p}" ] || [ -z "${f}" ] || [ -z "${n}" ]; then
|
||||||
echo "usage"
|
echo "usage"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
first_port=$p
|
|
||||||
echo "usage0"
|
|
||||||
screen -dmS "Docker_Splash"
|
screen -dmS "Docker_Splash"
|
||||||
echo "usage1"
|
|
||||||
sleep 0.1
|
sleep 0.1
|
||||||
|
|
||||||
for ((i=0;i<=$((${n} - 1));i++)); do
|
for ((i=0;i<=$((${n} - 1));i++)); do
|
||||||
port_number=$((${p} + $i))
|
port_number=$((${p} + $i))
|
||||||
screen -S "Docker_Splash" -X screen -t "docker_splash:$i" bash -c 'sudo docker run -p '$port_number':8050 --cpus=1 -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash; read x'
|
screen -S "Docker_Splash" -X screen -t "docker_splash:$port_number" bash -c 'sudo docker run -p '$port_number':8050 --cpus=1 -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash; read x'
|
||||||
sleep 0.1
|
sleep 0.1
|
||||||
done
|
done
|
||||||
|
|
|
@ -1,7 +1,12 @@
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
# install docker
|
read -p "Do you want to install docker? (use local splash server) [y/n] " -n 1 -r
|
||||||
sudo apt install docker.io
|
echo # (optional) move to a new line
|
||||||
|
if [[ $REPLY =~ ^[Yy]$ ]]
|
||||||
|
then
|
||||||
|
# install docker
|
||||||
|
sudo apt install docker.io
|
||||||
|
fi
|
||||||
|
|
||||||
# pull splash docker
|
# pull splash docker
|
||||||
sudo docker pull scrapinghub/splash
|
sudo docker pull scrapinghub/splash
|
||||||
|
|
Loading…
Reference in New Issue