chg: [crawler] add infos

pull/260/head
Terrtia 2018-09-26 16:34:27 +02:00
parent e4a5f66a10
commit c49e871ba8
No known key found for this signature in database
GPG Key ID: 1E1B1F50D84613D0
3 changed files with 22 additions and 14 deletions

View File

@ -36,7 +36,8 @@ def crawl_onion(url, domain, date, date_month, message):
r_onion.sadd('{}_crawler_queue'.format(type_hidden_service), message)
print('--------------------------------------')
print(' DOCKER SPLASH DOWN')
print(' \033[91m DOCKER SPLASH DOWN\033[0m')
print(' {} DOWN'.format(splash_url))
exit(0)
if r.status_code == 200:
@ -94,7 +95,6 @@ if __name__ == '__main__':
print('incorrect crawler type: {}'.format(type_hidden_service))
exit(0)
print('crawler type: {}'.format(type_hidden_service))
print('splash url: {}'.format(splash_url))
crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")
@ -132,13 +132,11 @@ if __name__ == '__main__':
message = r_onion.spop('mess_onion')
if message is not None:
print(message)
splitted = message.split(';')
if len(splitted) == 2:
url, paste = splitted
paste = paste.replace(PASTES_FOLDER+'/', '')
print(paste)
'''
if not '.onion' in url:
print('not onion')
@ -157,9 +155,9 @@ if __name__ == '__main__':
domain_url = 'http://{}'.format(domain)
print('------------------START CRAWLER------------------')
print(type_hidden_service)
print('-------------------------------------------------')
print('\033[92m------------------START CRAWLER------------------\033[0m')
print('crawler type: {}'.format(type_hidden_service))
print('\033[92m-------------------------------------------------\033[0m')
print('url: {}'.format(url))
print('domain: {}'.format(domain))
print('domain_url: {}'.format(domain_url))

View File

@ -1,6 +1,14 @@
#!/bin/bash
#usage() { echo "Usage: sudo $0 [-f <config absolute_path>] [-p <port_start>] [-n <number_of_splash>]" 1>&2; exit 1; }
# usage: print the command-line help and abort.
#
# Writes the synopsis, the description of the -f / -p / -n options and an
# example invocation, then terminates the script with exit status 1.
# The echo calls are wrapped in one command group so that the complete help
# text is redirected to stderr (previously only the first line was; the
# remaining lines leaked to stdout).
usage() {
    {
        echo "Usage: sudo $0 [-f <config_absolute_path>] [-p <port_start>] [-n <number_of_splash_servers>]"
        echo " -f: absolute path to splash docker proxy-profiles directory (used for proxy configuration)"
        echo " -p: number of the first splash server port number. This number is incremented for the others splash server"
        echo " -n: number of splash servers to start"
        echo ""
        echo "example:"
        echo "sudo ./launch_splash_crawler.sh -f /home/my_user/AIL-framework/configs/docker/splash_onion/etc/splash/proxy-profiles/ -p 8050 -n 3"
    } 1>&2
    exit 1
}
while getopts ":p:f:n:" o; do
case "${o}" in
@ -25,14 +33,11 @@ if [ -z "${p}" ] || [ -z "${f}" ] || [ -z "${n}" ]; then
echo "usage"
fi
first_port=$p
echo "usage0"
screen -dmS "Docker_Splash"
echo "usage1"
sleep 0.1
for ((i=0;i<=$((${n} - 1));i++)); do
port_number=$((${p} + $i))
screen -S "Docker_Splash" -X screen -t "docker_splash:$i" bash -c 'sudo docker run -p '$port_number':8050 --cpus=1 -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash; read x'
screen -S "Docker_Splash" -X screen -t "docker_splash:$port_number" bash -c 'sudo docker run -p '$port_number':8050 --cpus=1 -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash; read x'
sleep 0.1
done

5
crawler_hidden_services_install.sh Normal file → Executable file
View File

@ -1,7 +1,12 @@
#!/bin/bash
read -p "Do you want to install docker? (use local splash server) [y/n] " -n 1 -r
echo # (optional) move to a new line
if [[ $REPLY =~ ^[Yy]$ ]]
then
# install docker
sudo apt install docker.io
fi
# pull splash docker
sudo docker pull scrapinghub/splash