AIL-framework/bin/Web.py

153 lines
5.2 KiB
Python
Executable File

#!/usr/bin/env python
# -*-coding:UTF-8 -*
"""
The Web Module
============================
This module tries to parse URLs and warns if some defined contry code are present.
"""
import redis
import pprint
import time
import os
import dns.exception
from packages import Paste
from packages import lib_refine
from pubsublogger import publisher
from pyfaup.faup import Faup
import re
# Country and ASN lookup
from cymru.ip2asn.dns import DNSClient as ip2asn
import socket
import pycountry
import ipaddress
from Helper import Process
# Used to prevent concat with empty fields due to url parsing
def avoidNone(a_string):
if a_string is None:
return ""
else:
return a_string
if __name__ == "__main__":
publisher.port = 6380
publisher.channel = "Script"
config_section = 'Web'
p = Process(config_section)
# REDIS #
r_serv2 = redis.StrictRedis(
host=p.config.get("Redis_Cache", "host"),
port=p.config.getint("Redis_Cache", "port"),
db=p.config.getint("Redis_Cache", "db"))
# Protocol file path
protocolsfile_path = os.path.join(os.environ['AIL_HOME'],
p.config.get("Directories", "protocolsfile"))
# Country to log as critical
cc_critical = p.config.get("Url", "cc_critical")
# FUNCTIONS #
publisher.info("Script URL subscribed to channel web_categ")
# FIXME For retro compatibility
channel = 'web_categ'
message = p.get_from_set()
prec_filename = None
faup = Faup()
# Get all uri from protocolsfile (Used for Curve)
uri_scheme = ""
with open(protocolsfile_path, 'r') as scheme_file:
for scheme in scheme_file:
uri_scheme += scheme[:-1]+"|"
uri_scheme = uri_scheme[:-1]
url_regex = "("+uri_scheme+")\://([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.(com|edu|gov|int|mil|net|org|biz|arpa|info|name|pro|aero|coop|museum|[a-zA-Z]{2}))(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*"
while True:
if message is not None:
filename, score = message.split()
if prec_filename is None or filename != prec_filename:
domains_list = []
PST = Paste.Paste(filename)
client = ip2asn()
for x in PST.get_regex(url_regex):
matching_url = re.search(url_regex, PST.get_p_content())
url = matching_url.group(0)
to_send = "{} {} {}".format(url, PST._get_p_date(), filename)
p.populate_set_out(to_send, 'Url')
faup.decode(url)
domain = faup.get_domain()
subdomain = faup.get_subdomain()
f1 = None
domains_list.append(domain)
publisher.debug('{} Published'.format(url))
if f1 == "onion":
print domain
hostl = unicode(avoidNone(subdomain)+avoidNone(domain))
try:
socket.setdefaulttimeout(1)
ip = socket.gethostbyname(unicode(hostl))
except:
# If the resolver is not giving any IPv4 address,
# ASN/CC lookup is skip.
continue
try:
l = client.lookup(ip, qType='IP')
except ipaddress.AddressValueError:
continue
cc = getattr(l, 'cc')
asn = getattr(l, 'asn')
# EU is not an official ISO 3166 code (but used by RIPE
# IP allocation)
if cc is not None and cc != "EU":
print hostl, asn, cc, \
pycountry.countries.get(alpha_2=cc).name
if cc == cc_critical:
to_print = 'Url;{};{};{};Detected {} {}'.format(
PST.p_source, PST.p_date, PST.p_name,
hostl, cc)
#publisher.warning(to_print)
print to_print
else:
print hostl, asn, cc
A_values = lib_refine.checking_A_record(r_serv2,
domains_list)
if A_values[0] >= 1:
PST.__setattr__(channel, A_values)
PST.save_attribute_redis(channel, (A_values[0],
list(A_values[1])))
pprint.pprint(A_values)
publisher.info('Url;{};{};{};Checked {} URL;{}'.format(
PST.p_source, PST.p_date, PST.p_name, A_values[0], PST.p_path))
prec_filename = filename
else:
publisher.debug("Script url is Idling 10s")
print 'Sleeping'
time.sleep(10)
message = p.get_from_set()