chg: [Onion] change onion regex, fix crawler

pull/260/head
Terrtia 2018-08-13 09:23:14 +02:00
parent 8b1c10b38c
commit 7652089433
No known key found for this signature in database
GPG Key ID: 1E1B1F50D84613D0
4 changed files with 82 additions and 34 deletions

View File

@ -3,6 +3,7 @@
import os
import sys
import re
import redis
import datetime
import time
@ -16,6 +17,33 @@ from pubsublogger import publisher
def signal_handler(sig, frame):
sys.exit(0)
def crawl_onion(url, domain):
    """Crawl ``url`` with the Tor crawler unless ``domain`` is already up today.

    Launches ``./torcrawler/tor_crawler.py`` as a child process and records
    the outcome in the ``r_onion`` database under keys suffixed with today's
    date (``%Y%m%d``):

    * exit code 0: ``domain`` is added to ``onion_up:<date>`` and ``url`` to
      ``onion_up_link:<date>``; if ``paste_children:<paste>`` exists, the
      paste is also sent to the 'Tags' queue.
    * any other exit code: ``domain`` and ``url`` are added to
      ``onion_down:<date>`` and ``onion_down_link:<date>``.

    Relies on module-level names set up by the main loop: ``paste``,
    ``r_onion``, ``r_serv_metadata`` and ``p``.

    :param url: URL handed to the crawler.
    :param domain: onion domain the URL belongs to.
    """
    # NOTE(review): block nesting below was reconstructed from a flattened
    # diff view -- confirm against the repository version.
    date = datetime.datetime.now().strftime("%Y%m%d")

    # Domain already crawled successfully today: nothing to do.
    if r_onion.sismember('onion_up:'+date , domain):
        return

    super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father')
    if super_father is None:
        super_father = paste

    process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, domain, paste, super_father],
                               stdout=subprocess.PIPE)
    # communicate() drains stdout while waiting for the child. Polling with
    # poll()/sleep and reading only after exit deadlocks as soon as the
    # crawler writes more than the OS pipe buffer can hold.
    output, _ = process.communicate()

    if process.returncode == 0:
        if r_serv_metadata.exists('paste_children:'+paste):
            msg = 'infoleak:automatic-detection="onion";{}'.format(paste)
            p.populate_set_out(msg, 'Tags')
        print(output)

        r_onion.sadd('onion_up:'+date , domain)
        r_onion.sadd('onion_up_link:'+date , url)
    else:
        r_onion.sadd('onion_down:'+date , domain)
        r_onion.sadd('onion_down_link:'+date , url)
        print(output)
if __name__ == '__main__':
publisher.port = 6380
@ -52,6 +80,9 @@ if __name__ == '__main__':
db=p.config.getint("ARDB_Onion", "db"),
decode_responses=True)
url_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
re.compile(url_regex)
while True:
message = p.get_from_set()
@ -61,30 +92,24 @@ if __name__ == '__main__':
if len(splitted) == 2:
url, paste = splitted
print(url)
url_list = re.findall(url_regex, url)[0]
if url_list[1] == '':
url= 'http://{}'.format(url)
if not r_cache.exists(url):
super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father')
if super_father is None:
super_father=paste
link, s, credential, subdomain, domain, host, port, \
resource_path, query_string, f1, f2, f3, f4 = url_list
domain = url_list[4]
process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, paste, super_father],
stdout=subprocess.PIPE)
while process.poll() is None:
time.sleep(1)
domain_url = 'http://{}'.format(domain)
date = datetime.datetime.now().strftime("%Y%m%d")
print(date)
url_domain = url.replace('http://', '')
if process.returncode == 0:
if r_serv_metadata.exists('paste_children:'+paste):
msg = 'infoleak:automatic-detection="onion";{}'.format(paste)
p.populate_set_out(msg, 'Tags')
print('------------------START ONIOM CRAWLER------------------')
print('url: {}'.format(url))
print('domain: {}'.format(domain))
print('domain_url: {}'.format(domain_url))
r_onion.sadd('onion_up:'+date , url_domain)
else:
r_onion.sadd('onion_down:'+date , url_domain)
print(process.stdout.read())
crawl_onion(url, domain)
if url != domain_url:
crawl_onion(domain_url, domain)
else:
continue

View File

@ -108,7 +108,7 @@ if __name__ == "__main__":
# Thanks to Faup project for this regex
# https://github.com/stricaud/faup
url_regex = "((http|https|ftp)\://([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
url_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
while True:
if message is not None:

View File

@ -9,14 +9,11 @@ import uuid
import datetime
import base64
import redis
from urllib.parse import urlparse
from scrapy import Spider
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess, Crawler
from twisted.internet import reactor
from scrapy_splash import SplashRequest
sys.path.append(os.environ['AIL_BIN'])
@ -40,19 +37,20 @@ class TorSplashCrawler():
'DEPTH_LIMIT': crawler_depth_limit
})
def crawl(self, url, original_paste, super_father):
self.process.crawl(self.crawler, url=url, original_paste=original_paste, super_father=super_father)
def crawl(self, url, domain, original_paste, super_father):
self.process.crawl(self.crawler, url=url, domain=domain,original_paste=original_paste, super_father=super_father)
self.process.start()
class TorSplashSpider(Spider):
name = 'TorSplashSpider'
def __init__(self, url, original_paste, super_father, *args, **kwargs):
def __init__(self, url, domain,original_paste, super_father, *args, **kwargs):
self.original_paste = original_paste
self.super_father = super_father
self.start_urls = url
self.domains = [urlparse(url).netloc]
self.domains = [domain]
date = datetime.datetime.now().strftime("%Y/%m/%d")
self.full_date = datetime.datetime.now().strftime("%Y%m%d")
config_section = 'Crawler'
self.p = Process(config_section)
@ -75,6 +73,12 @@ class TorSplashCrawler():
db=self.p.config.getint("ARDB_Metadata", "db"),
decode_responses=True)
self.r_serv_onion = redis.StrictRedis(
host=self.p.config.get("ARDB_Onion", "host"),
port=self.p.config.getint("ARDB_Onion", "port"),
db=self.p.config.getint("ARDB_Onion", "db"),
decode_responses=True)
self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"),
self.p.config.get("Directories", "crawled"), date )
@ -96,6 +100,7 @@ class TorSplashCrawler():
print(response.headers)
print(response.status)
# # TODO: # FIXME:
self.r_cache.setbit(response.url, 0, 1)
self.r_cache.expire(response.url, 360000)
@ -105,8 +110,19 @@ class TorSplashCrawler():
# save new paste on disk
if self.save_crawled_paste(filename_paste, response.data['html']):
# create onion metadata
# The spider keeps its domain in the list `self.domains` (assigned in
# __init__); no `self.domain` attribute is assigned in the visible code, so
# index `self.domains` here, as the paste metadata below already does.
# NOTE(review): nesting reconstructed from a flattened diff -- 'first_seen'
# is assumed to be written only when the metadata key does not exist yet.
if not self.r_serv_onion.exists('onion_metadata:{}'.format(self.domains[0])):
    self.r_serv_onion.hset('onion_metadata:{}'.format(self.domains[0]), 'first_seen', self.full_date)
self.r_serv_onion.hset('onion_metadata:{}'.format(self.domains[0]), 'last_seen', self.full_date)
# add onion screenshot history
self.r_serv_onion.sadd('onion_history:{}'.format(self.domains[0]), self.full_date)
#create paste metadata
self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father)
self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['parent'])
self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'domain', self.domains[0])
self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url)
self.r_serv_metadata.sadd('paste_children:'+response.meta['parent'], filename_paste)
@ -114,6 +130,13 @@ class TorSplashCrawler():
dirname = os.path.dirname(filename_screenshot)
if not os.path.exists(dirname):
os.makedirs(dirname)
print(sys.getsizeof(response.data['png']))
print(sys.getsizeof(response.data['html']))
print(self.domains[0])
with open(filename_screenshot, 'wb') as f:
f.write(base64.standard_b64decode(response.data['png'].encode()))
@ -140,7 +163,6 @@ class TorSplashCrawler():
def save_crawled_paste(self, filename, content):
print(filename)
if os.path.isfile(filename):
print('File: {} already exist in submitted pastes'.format(filename))
return False

View File

@ -8,8 +8,8 @@ from TorSplashCrawler import TorSplashCrawler
if __name__ == '__main__':
if len(sys.argv) != 4:
print('usage:', 'tor_crawler.py', 'url', 'paste', 'super_father')
if len(sys.argv) != 5:
print('usage:', 'tor_crawler.py', 'url', 'domain', 'paste', 'super_father')
exit(1)
configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
@ -26,8 +26,9 @@ if __name__ == '__main__':
crawler_depth_limit = cfg.getint("Crawler", "crawler_depth_limit")
url = sys.argv[1]
paste = sys.argv[2]
super_father = sys.argv[3]
domain = sys.argv[2]
paste = sys.argv[3]
super_father = sys.argv[4]
crawler = TorSplashCrawler(splash_url, http_proxy, crawler_depth_limit)
crawler.crawl(url, paste, super_father)
crawler.crawl(url, domain, paste, super_father)