diff --git a/HOWTO.md b/HOWTO.md index 1a66402b..50fad074 100644 --- a/HOWTO.md +++ b/HOWTO.md @@ -96,3 +96,31 @@ In AIL, you can track terms, set of terms and even regexes without creating a de - You can track a term by simply putting it in the box. - You can track a set of terms by simply putting terms in an array surrounded by the '\' character. You can also set a custom threshold regarding the number of terms that must match to trigger the detection. For example, if you want to track the terms _term1_ and _term2_ at the same time, you can use the following rule: `\[term1, term2, [100]]\` - You can track regexes as easily as tracking a term. You just have to put your regex in the box surrounded by the '/' character. For example, if you want to track the regex matching all email address having the domain _domain.net_, you can use the following aggressive rule: `/*.domain.net/`. + + +Crawler +--------------------- +In AIL, you can crawl hidden services. + +Two types of configuration are supported [TODO: explain what is being configured]: + 1) use local Splash dockers (use the same host for Splash servers and AIL) + 2) use remote Splash servers + +- (Splash host) Launch ``crawler_hidden_services_install.sh`` to install all requirements (type ``y`` if a localhost splash server is used) +- (Splash host) Setup your tor proxy (tor is assumed to be already installed): + - Add the following line in ``/etc/tor/torrc: SOCKSPolicy accept 172.17.0.0/16`` + (for a linux docker, the localhost IP is 172.17.0.1; should be adapted for other platforms) + - Restart the tor proxy: ``sudo service tor restart`` + +- (Splash host) Launch all Splash servers with: ``sudo ./bin/torcrawler/launch_splash_crawler.sh [-f ] [-p ] [-n ]`` + All the Splash dockers are launched inside the ``Docker_Splash`` screen. You can use ``sudo screen -r Docker_Splash`` to connect to the screen session and check all Splash servers' status.
+ +- (AIL host) Edit the ``/bin/packages/config.cfg`` file: + - In the crawler section, set ``activate_crawler`` to ``True`` + - Change the IP address of Splash servers if needed (remote only) + - Set ``splash_onion_port`` according to your Splash servers' port numbers that use the tor proxy. Those port numbers should be described as a single port (ex: 8050) or a port range (ex: 8050-8052 for ports 8050, 8051 and 8052). + +- (AIL host) Launch all AIL crawler scripts using: ``./bin/LAUNCH.sh -c`` + + + diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh index a7c0631d..3b17a4a6 100755 --- a/bin/LAUNCH.sh +++ b/bin/LAUNCH.sh @@ -201,23 +201,28 @@ function launching_scripts { } function launching_crawler { - CONFIG=$AIL_BIN/packages/config.cfg - lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_onion_port/{print $3;exit}' "${CONFIG}") - echo $lport + CONFIG=$AIL_BIN/packages/config.cfg + lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_onion_port/{print $3;exit}' "${CONFIG}") - IFS='-' read -ra PORTS <<< "$lport" - first_port=${PORTS[0]} - last_port=${PORTS[1]} + IFS='-' read -ra PORTS <<< "$lport" + if [ ${#PORTS[@]} -eq 1 ] + then + first_port=${PORTS[0]} + last_port=${PORTS[0]} + else + first_port=${PORTS[0]} + last_port=${PORTS[1]} + fi - screen -dmS "Crawler_AIL" - sleep 0.1 + screen -dmS "Crawler_AIL" + sleep 0.1 - for ((i=first_port;i<=last_port;i++)); do - screen -S "Crawler_AIL" -X screen -t "onion_crawler:$i" bash -c 'cd '${AIL_BIN}'; ./Crawler.py onion '$i'; read x' - sleep 0.1 - done + for ((i=first_port;i<=last_port;i++)); do + screen -S "Crawler_AIL" -X screen -t "onion_crawler:$i" bash -c 'cd '${AIL_BIN}'; ./Crawler.py onion '$i'; read x' + sleep 0.1 + done - echo -e $GREEN"\t* Launching Crawler_AIL scripts"$DEFAULT + echo -e $GREEN"\t* Launching Crawler_AIL scripts"$DEFAULT } function shutting_down_redis { @@ -465,7 +470,7 @@ function launch_all { while [ "$1" != "" ]; do case $1 in - -l | --launchAuto ) launch_all "automatic"; launching_crawler + -l |
--launchAuto ) launch_all "automatic"; ;; -k | --killAll ) killall; ;; diff --git a/bin/packages/config.cfg.sample b/bin/packages/config.cfg.sample index 64b1f7f6..c30fa071 100644 --- a/bin/packages/config.cfg.sample +++ b/bin/packages/config.cfg.sample @@ -240,4 +240,4 @@ db = 0 activate_crawler = True crawler_depth_limit = 1 splash_url_onion = http://127.0.0.1 -splash_onion_port = 8050-8050 +splash_onion_port = 8050-8052 diff --git a/bin/torcrawler/launch_splash_crawler.sh b/bin/torcrawler/launch_splash_crawler.sh index 37963e93..e78656ab 100755 --- a/bin/torcrawler/launch_splash_crawler.sh +++ b/bin/torcrawler/launch_splash_crawler.sh @@ -1,12 +1,12 @@ #!/bin/bash usage() { echo "Usage: sudo $0 [-f ] [-p ] [-n ]" 1>&2; - echo " -f: absolute path to splash docker proxy-profiles directory (used for proxy configuration)" - echo " -p: number of the first splash server port number. This number is incremented for the others splash server" - echo " -n: number of splash servers to start" - echo "" - echo "example:" - echo "sudo ./launch_splash_crawler.sh -f /home/my_user/AIL-framework/configs/docker/splash_onion/etc/splash/proxy-profiles/ -p 8050 -n 3" + echo " -f: absolute path to splash docker proxy-profiles directory (used for proxy configuration)"; + echo " -p: number of the first splash server port number. 
This number is incremented for the other splash servers"; + echo " -n: number of splash servers to start"; + echo ""; + echo "example:"; + echo "sudo ./launch_splash_crawler.sh -f /home/my_user/AIL-framework/configs/docker/splash_onion/etc/splash/proxy-profiles/ -p 8050 -n 3"; exit 1; } @@ -29,8 +29,7 @@ done shift $((OPTIND-1)) if [ -z "${p}" ] || [ -z "${f}" ] || [ -z "${n}" ]; then - #usage - echo "usage" + usage; fi screen -dmS "Docker_Splash" diff --git a/crawler_hidden_services_install.sh b/crawler_hidden_services_install.sh index 3cdb96bd..3fbccb74 100755 --- a/crawler_hidden_services_install.sh +++ b/crawler_hidden_services_install.sh @@ -1,15 +1,56 @@ #!/bin/bash -read -p "Do you want to install docker? (use local splash server) [y/n] " -n 1 -r -echo # (optional) move to a new line -if [[ $REPLY =~ ^[Yy]$ ]] -then +install_docker() { # install docker - sudo apt install docker.io + sudo apt install docker.io; + + # pull splash docker + sudo docker pull scrapinghub/splash; +} + +install_python_requirement() { + . ./AILENV/bin/activate; + pip3 install -U -r crawler_requirements.txt; +} + +install_all() { + read -p "Do you want to install docker? (use local splash server) [y/n] " -n 1 -r + echo # (optional) move to a new line + if [[ $REPLY =~ ^[Yy]$ ]] + then + install_docker; + fi + install_python_requirement; +} + +usage() { + echo "Usage: crawler_hidden_services_install.sh [-y | -n]" 1>&2; + echo " -y: install docker" + echo " -n: don't install docker" + echo "" + echo "example:" + echo "crawler_hidden_services_install.sh -y" + exit 1; +} + +if [[ $1 == "" ]]; then + install_all; + exit; +else + key="$1" + case $key in + "") + install_all; + ;; + -y|--yes) + install_docker; + install_python_requirement; + ;; + -n|--no) + install_python_requirement; + ;; + *) # unknown option + usage; + ;; + esac fi - -# pull splah docker -sudo docker pull scrapinghub/splash - -. ./AILENV/bin/activate -pip3 install -U -r pip3_packages_requirement.txt