The onion module now fetches the URLs it finds.

pull/38/head
Raphaël Vinot 2014-08-31 22:42:12 +02:00
parent abfe13436b
commit f4b89669fc
4 changed files with 115 additions and 6 deletions

bin/Onion.py

@@ -25,14 +25,49 @@ import pprint
 import time
 from packages import Paste
 from pubsublogger import publisher
+import datetime
+import os
+import base64
+import subprocess
 
 from Helper import Process
 
+
+def fetch(p, urls, domains, path):
+    for url, domain in zip(urls, domains):
+        to_fetch = base64.standard_b64encode(url)
+        process = subprocess.Popen(["python", './tor_fetcher.py', to_fetch],
+                                   stdout=subprocess.PIPE)
+        while process.poll() is None:
+            time.sleep(1)
+        if process.returncode == 0:
+            tempfile = process.stdout.read().strip()
+            with open(tempfile, 'r') as f:
+                filename = path + domain
+                content = base64.standard_b64decode(f.read())
+                save_path = os.path.join(os.environ['AIL_HOME'],
+                                         p.config.get("Directories", "pastes"),
+                                         filename)
+                dirname = os.path.dirname(save_path)
+                if not os.path.exists(dirname):
+                    os.makedirs(dirname)
+                with open(save_path, 'w') as ff:
+                    ff.write(content)
+                p.populate_set_out(save_path)
+            os.unlink(tempfile)
+        else:
+            print 'Failed at downloading', url
+            print process.stdout.read()
+
 
 if __name__ == "__main__":
     publisher.port = 6380
     publisher.channel = "Script"
 
+    torclient_host = '127.0.0.1'
+    torclient_port = 9050
+
     config_section = 'Onion'
 
     p = Process(config_section)
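
The handoff between fetch() above and tor_fetcher.py (added below) is a small ad hoc protocol: the URL travels base64-encoded on argv, and the fetcher replies with the path of a temporary file containing base64(gzip(page)). Note that fetch() only strips the base64 layer before saving, so the file written under the pastes directory is still gzip-compressed. A minimal round-trip sketch of that encoding (the URL and page content are made up for illustration):

    import base64
    import gzip
    import StringIO

    url = 'http://foobar.onion/'                     # hypothetical URL
    arg = base64.standard_b64encode(url)             # what fetch() passes on argv
    assert base64.standard_b64decode(arg) == url     # what tor_fetcher.py recovers

    page = '<html>dummy</html>'                      # stand-in for a fetched page
    out = StringIO.StringIO()
    with gzip.GzipFile(fileobj=out, mode='w') as f:  # same as makegzip64() below
        f.write(page)
    reply = base64.standard_b64encode(out.getvalue())  # content of the temp file
    saved = base64.standard_b64decode(reply)           # what fetch() writes to disk
    assert gzip.GzipFile(fileobj=StringIO.StringIO(saved)).read() == page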
@@ -49,7 +84,7 @@ if __name__ == "__main__":
     # Thanks to Faup project for this regex
     # https://github.com/stricaud/faup
-    url_regex = "([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*"
+    url_regex = "((http|https|ftp)\://([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
 
     while True:
         if message is not None:
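
The new pattern requires an explicit scheme and wraps everything in one outer group, so each match now yields the full URL as its first capture; the host alternation (IP address, localhost, or .onion name) is the fifth. A hedged sketch, assuming PST.get_regex() returns re.findall()-style group tuples, which is what the unpacking in the next hunk suggests (the sample text is made up):

    import re

    url_regex = "((http|https|ftp)\://([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"

    for x in re.findall(url_regex, 'seen on http://foobar.onion/some/path today'):
        print x[0]   # http://foobar.onion/some/path (full URL, group 1)
        print x[4]   # foobar.onion (host, group 5)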
@@ -59,14 +94,16 @@ if __name__ == "__main__":
             # "For each new paste"
             if prec_filename is None or filename != prec_filename:
                 domains_list = []
+                urls = []
                 PST = Paste.Paste(filename)
 
                 for x in PST.get_regex(url_regex):
                     # Extracting url with regex
-                    credential, subdomain, domain, host, tld, port, \
+                    url, s, credential, subdomain, domain, host, port, \
                         resource_path, query_string, f1, f2, f3, f4 = x
 
                     domains_list.append(domain)
+                    urls.append(url)
 
                 # Saving the list of extracted onion domains.
                 PST.__setattr__(channel, domains_list)

@@ -76,16 +113,21 @@ if __name__ == "__main__":
                 to_print = 'Onion;{};{};{};'.format(PST.p_source, PST.p_date,
                                                     PST.p_name)
                 if len(domains_list) > 0:
                     publisher.warning('{}Detected {} .onion(s)'.format(
                         to_print, len(domains_list)))
+                    now = datetime.datetime.now()
+                    path = os.path.join('onions', str(now.year).zfill(4),
+                                        str(now.month).zfill(2),
+                                        str(now.day).zfill(2),
+                                        str(int(time.mktime(now.utctimetuple()))))
+                    fetch(p, urls, domains_list, path)
                 else:
                     publisher.info('{}Onion related'.format(to_print))
 
             prec_filename = filename
         else:
             publisher.debug("Script url is Idling 10s")
             print 'Sleeping'
             time.sleep(10)
 
         message = p.get_from_set()
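
Fetched pages are filed by detection date with an epoch suffix. A quick sketch of the resulting layout (the timestamp is an example; the exact epoch shifts with the local timezone, since time.mktime() interprets the UTC tuple as local time). Note also that fetch() builds filename = path + domain by plain concatenation, so the epoch segment and the domain fuse into one file name:

    import datetime
    import os
    import time

    now = datetime.datetime(2014, 8, 31, 22, 42, 12)  # example timestamp
    path = os.path.join('onions', str(now.year).zfill(4),
                        str(now.month).zfill(2),
                        str(now.day).zfill(2),
                        str(int(time.mktime(now.utctimetuple()))))
    print path                   # onions/2014/08/31/<epoch>
    print path + 'foobar.onion'  # onions/2014/08/31/<epoch>foobar.onion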


@@ -34,7 +34,7 @@ subscribe = Redis_Mail
 
 [Onion]
 subscribe = Redis_Onion
-#publish = Redis_Global
+publish = Redis_Global
 
 [Web]
 subscribe = Redis_Web
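
Uncommenting publish = Redis_Global is what closes the loop: the save_path that fetch() hands to p.populate_set_out() is published towards Redis_Global, so a fetched onion page re-enters the pipeline like any freshly received paste. A hedged sketch of that wiring, using only the Helper.Process calls visible in this commit:

    from Helper import Process

    p = Process('Onion')                     # reads the [Onion] section above
    p.populate_set_out('/some/saved/paste')  # hypothetical path; presumably lands on Redis_Global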

bin/tor_fetcher.py (new file, 64 lines)

@@ -0,0 +1,64 @@
+#!/usr/bin/env python2
+# -*-coding:UTF-8 -*
+
+import socks
+import socket
+import urllib2
+import StringIO
+import gzip
+import base64
+import sys
+import tempfile
+
+
+def create_connection(address, timeout=None, source_address=None):
+    sock = socks.socksocket()
+    sock.connect(address)
+    return sock
+
+
+def get_page(url, torclient_host='127.0.0.1', torclient_port=9050):
+    request = urllib2.Request(url)
+    # UA of the Tor browser bundle
+    request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0')
+    return urllib2.urlopen(request).read()
+
+
+def makegzip64(s):
+    out = StringIO.StringIO()
+    with gzip.GzipFile(fileobj=out, mode="w") as f:
+        f.write(s)
+    return base64.standard_b64encode(out.getvalue())
+
+
+if __name__ == "__main__":
+
+    if len(sys.argv) != 2:
+        print('usage:', 'tor_fetcher.py', 'URL (base64 encoded)')
+        exit(1)
+
+    try:
+        url = base64.standard_b64decode(sys.argv[1])
+    except:
+        print('unable to decode')
+        exit(1)
+
+    torclient_host = '127.0.0.1'
+    torclient_port = 9050
+
+    # Setup Proxy
+    socks.set_default_proxy(socks.SOCKS5, torclient_host, torclient_port, True)
+    socket.socket = socks.socksocket
+    socket.create_connection = create_connection
+
+    try:
+        page = get_page(url)
+    except:
+        print('unable to fetch')
+        exit(1)
+
+    to_write = makegzip64(page)
+    t, path = tempfile.mkstemp()
+    with open(path, 'w') as f:
+        f.write(to_write)
+    print path
+    exit(0)
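
A hedged sketch of driving tor_fetcher.py by hand, assuming a Tor SOCKS proxy on 127.0.0.1:9050 and a reachable hidden service (the URL is hypothetical). The script prints the temp file path on stdout, which is exactly what fetch() in Onion.py consumes:

    import base64
    import gzip
    import StringIO
    import subprocess

    encoded = base64.standard_b64encode('http://foobar.onion/')  # hypothetical URL
    tmpfile = subprocess.check_output(['python', 'tor_fetcher.py', encoded]).strip()
    with open(tmpfile) as f:
        raw = base64.standard_b64decode(f.read())
    print gzip.GzipFile(fileobj=StringIO.StringIO(raw)).read()[:200]  # start of the page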


@@ -32,6 +32,9 @@ whoosh
 ipaddress
 pycountry
 
+# To fetch Onion urls
+PySocks
+
 #ASN lookup requirements
 http://adns-python.googlecode.com/files/adns-python-1.2.1.tar.gz
 https://github.com/trolldbois/python-cymru-services/archive/master.zip