mirror of https://github.com/CIRCL/AIL-framework
				
				
				
			chg: [Crawler] add docs
							parent
							
								
									c49e871ba8
								
							
						
					
					
						commit
						04b9d9fc1d
					
				
							
								
								
									
										28
									
								
								HOWTO.md
								
								
								
								
							
							
						
						
									
										28
									
								
								HOWTO.md
								
								
								
								
							|  | @ -96,3 +96,31 @@ In AIL, you can track terms, set of terms and even regexes without creating a de | |||
| - You can track a term by simply putting it in the box. | ||||
| - You can track a set of terms by simply putting terms in an array surrounded by the '\' character. You can also set a custom threshold regarding the number of terms that must match to trigger the detection. For example, if you want to track the terms _term1_ and _term2_ at the same time, you can use the following rule: `\[term1, term2, [100]]\` | ||||
| - You can track regexes as easily as tracking a term. You just have to put your regex in the box surrounded by the '/' character. For example, if you want to track the regex matching all email addresses with the domain _domain.net_, you can use the following aggressive rule: `/*.domain.net/`. | ||||
| 
 | ||||
| 
 | ||||
| Crawler | ||||
| --------------------- | ||||
| In AIL, you can crawl hidden services. | ||||
| 
 | ||||
| Two types of configuration [explanation for what]: | ||||
| 	1) use local Splash dockers (use the same host for Splash servers and AIL) | ||||
| 	2) use remote Splash servers | ||||
| 
 | ||||
| - (Splash host) Launch ``crawler_hidden_services_install.sh`` to install all requirements (type ``y`` if a localhost Splash server is used) | ||||
| - (Splash host) Setup your tor proxy[is already installed]:  | ||||
| 	- Add the following line in ``/etc/tor/torrc: SOCKSPolicy accept 172.17.0.0/16`` | ||||
|   	  (for a linux docker, the localhost IP is 172.17.0.1; Should be adapted for other platform) | ||||
| 	- Restart the tor proxy: ``sudo service tor restart`` | ||||
| 
 | ||||
| - (Splash host) Launch all Splash servers with: ``sudo ./bin/torcrawler/launch_splash_crawler.sh [-f <config absolute_path>] [-p <port_start>] [-n <number_of_splash>]`` | ||||
|   All the Splash dockers are launched inside the ``Docker_Splash`` screen. You can use ``sudo screen -r Docker_Splash`` to connect to the screen session and check the status of all Splash servers. | ||||
| 
 | ||||
| - (AIL host) Edit the ``/bin/packages/config.cfg`` file: | ||||
| 	- In the crawler section, set ``activate_crawler`` to ``True`` | ||||
| 	- Change the IP address of Splash servers if needed (remote only) | ||||
| 	- Set ``splash_onion_port`` according to the port numbers of your Splash servers that use the tor proxy. These port numbers should be given as a single port (ex: 8050) or a port range (ex: 8050-8052 for ports 8050, 8051 and 8052). | ||||
| 
 | ||||
| - (AIL host) launch all AIL crawler scripts using: ``./bin/LAUNCH.sh -c`` | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -201,23 +201,28 @@ function launching_scripts { | |||
| } | ||||
| 
 | ||||
| function launching_crawler { | ||||
|   CONFIG=$AIL_BIN/packages/config.cfg | ||||
|   lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_onion_port/{print $3;exit}' "${CONFIG}") | ||||
|   echo $lport | ||||
|     CONFIG=$AIL_BIN/packages/config.cfg | ||||
|     lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_onion_port/{print $3;exit}' "${CONFIG}") | ||||
| 
 | ||||
|   IFS='-' read -ra PORTS <<< "$lport" | ||||
|   first_port=${PORTS[0]} | ||||
|   last_port=${PORTS[1]} | ||||
|     IFS='-' read -ra PORTS <<< "$lport" | ||||
|     if [ ${#PORTS[@]} -eq 1 ] | ||||
|     then | ||||
|         first_port=${PORTS[0]} | ||||
|         last_port=${PORTS[0]} | ||||
|     else | ||||
|         first_port=${PORTS[0]} | ||||
|         last_port=${PORTS[1]} | ||||
|     fi | ||||
| 
 | ||||
|   screen -dmS "Crawler_AIL" | ||||
|   sleep 0.1 | ||||
|     screen -dmS "Crawler_AIL" | ||||
|     sleep 0.1 | ||||
| 
 | ||||
|   for ((i=first_port;i<=last_port;i++)); do | ||||
|       screen -S "Crawler_AIL" -X screen -t "onion_crawler:$i" bash -c 'cd '${AIL_BIN}'; ./Crawler.py onion '$i'; read x' | ||||
|       sleep 0.1 | ||||
|   done | ||||
|     for ((i=first_port;i<=last_port;i++)); do | ||||
|         screen -S "Crawler_AIL" -X screen -t "onion_crawler:$i" bash -c 'cd '${AIL_BIN}'; ./Crawler.py onion '$i'; read x' | ||||
|         sleep 0.1 | ||||
|     done | ||||
| 
 | ||||
|   echo -e $GREEN"\t* Launching Crawler_AIL scripts"$DEFAULT | ||||
|     echo -e $GREEN"\t* Launching Crawler_AIL scripts"$DEFAULT | ||||
| } | ||||
| 
 | ||||
| function shutting_down_redis { | ||||
|  | @ -465,7 +470,7 @@ function launch_all { | |||
| 
 | ||||
| while [ "$1" != "" ]; do | ||||
|     case $1 in | ||||
|         -l | --launchAuto )         launch_all "automatic"; launching_crawler | ||||
|         -l | --launchAuto )         launch_all "automatic"; | ||||
|                                     ;; | ||||
|         -k | --killAll )            killall; | ||||
|                                     ;; | ||||
|  |  | |||
|  | @ -240,4 +240,4 @@ db = 0 | |||
| activate_crawler = True | ||||
| crawler_depth_limit = 1 | ||||
| splash_url_onion = http://127.0.0.1 | ||||
| splash_onion_port = 8050-8050 | ||||
| splash_onion_port = 8050-8052 | ||||
|  |  | |||
|  | @ -1,12 +1,12 @@ | |||
| #!/bin/bash | ||||
| 
 | ||||
| usage() { echo "Usage: sudo $0 [-f <config_absolute_path>] [-p <port_start>] [-n <number_of_splash_servers>]" 1>&2; | ||||
|           echo "          -f: absolute path to splash docker proxy-profiles directory (used for proxy configuration)" | ||||
|           echo "          -p: number of the first splash server port number. This number is incremented for the others splash server" | ||||
|           echo "          -n: number of splash servers to start" | ||||
|           echo "" | ||||
|           echo "example:" | ||||
|           echo "sudo ./launch_splash_crawler.sh -f /home/my_user/AIL-framework/configs/docker/splash_onion/etc/splash/proxy-profiles/ -p 8050 -n 3" | ||||
|           echo "          -f: absolute path to splash docker proxy-profiles directory (used for proxy configuration)"; | ||||
|           echo "          -p: number of the first splash server port number. This number is incremented for the others splash server"; | ||||
|           echo "          -n: number of splash servers to start"; | ||||
|           echo ""; | ||||
|           echo "example:"; | ||||
|           echo "sudo ./launch_splash_crawler.sh -f /home/my_user/AIL-framework/configs/docker/splash_onion/etc/splash/proxy-profiles/ -p 8050 -n 3"; | ||||
|           exit 1; | ||||
|         } | ||||
| 
 | ||||
|  | @ -29,8 +29,7 @@ done | |||
| shift $((OPTIND-1)) | ||||
| 
 | ||||
| if [ -z "${p}" ] || [ -z "${f}" ] || [ -z "${n}" ]; then | ||||
|     #usage | ||||
|     echo "usage" | ||||
|     usage; | ||||
| fi | ||||
| 
 | ||||
| screen -dmS "Docker_Splash" | ||||
|  |  | |||
|  | @ -1,15 +1,56 @@ | |||
| #!/bin/bash | ||||
| 
 | ||||
| read -p "Do you want to install docker? (use local splash server) [y/n] " -n 1 -r | ||||
| echo    # (optional) move to a new line | ||||
| if [[ $REPLY =~ ^[Yy]$ ]] | ||||
| then | ||||
| install_docker() { | ||||
|     # install docker | ||||
|     sudo apt install docker.io | ||||
|     sudo apt install docker.io; | ||||
| 
 | ||||
|     # pull splah docker | ||||
|     sudo docker pull scrapinghub/splash; | ||||
| } | ||||
| 
 | ||||
| install_python_requirement() { | ||||
|     . ./AILENV/bin/activate; | ||||
|     pip3 install -U -r crawler_requirements.txt; | ||||
| } | ||||
| 
 | ||||
| install_all() { | ||||
|     read -p "Do you want to install docker? (use local splash server) [y/n] " -n 1 -r | ||||
|     echo    # (optional) move to a new line | ||||
|     if [[ $REPLY =~ ^[Yy]$ ]] | ||||
|     then | ||||
|         install_docker; | ||||
|     fi | ||||
|     install_python_requirement; | ||||
| } | ||||
| 
 | ||||
| usage() { | ||||
|   echo "Usage: crawler_hidden_services_install.sh [-y | -n]" 1>&2; | ||||
|             echo "          -y: install docker" | ||||
|             echo "          -n: don't install docker" | ||||
|             echo "" | ||||
|             echo "example:" | ||||
|             echo "crawler_hidden_services_install.sh -y" | ||||
|             exit 1; | ||||
| } | ||||
| 
 | ||||
| if [[ $1 == "" ]]; then | ||||
|     install_all; | ||||
|     exit; | ||||
| else | ||||
|   key="$1" | ||||
|   case $key in | ||||
|       "") | ||||
|       install_all; | ||||
|       ;; | ||||
|       -y|--yes) | ||||
|       install_docker; | ||||
|       install_python_requirement; | ||||
|       ;; | ||||
|       -n|--no) | ||||
|       install_python_requirement; | ||||
|       ;; | ||||
|       *)    # unknown option | ||||
|       usage; | ||||
|       ;; | ||||
|   esac | ||||
| fi | ||||
| 
 | ||||
| # pull splah docker | ||||
| sudo docker pull scrapinghub/splash | ||||
| 
 | ||||
| . ./AILENV/bin/activate | ||||
| pip3 install -U -r pip3_packages_requirement.txt | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	 Terrtia
						Terrtia