No known key found for this signature in database
GPG 키 ID: 1E1B1F50D84613D0
23개의 변경된 파일과 176개의 추가 그리고 465개의 삭제
-
6bin/LAUNCH.sh
-
86bin/Urls.py
-
184bin/Web.py
-
207bin/WebStats.py
-
6bin/modules/ApiKey.py
-
6bin/modules/Categ.py
-
81bin/modules/Credential.py
-
4bin/modules/CreditCards.py
-
23bin/modules/Decoder.py
-
3bin/modules/DomClassifier.py
-
3bin/modules/Global.py
-
6bin/modules/Indexer.py
-
5bin/modules/Keys.py
-
7bin/modules/Onion.py
-
0bin/modules/Phone.py
-
0bin/modules/SentimentAnalysis.py
-
0bin/modules/Tags.py
-
0bin/modules/__init__.py
-
0bin/modules/abstract_module.py
-
0bin/modules/submit_paste.py
-
0bin/trackers/Tracker_Term.py
-
14tests/test_modules.py
-
0tools/import_dir.py
@ -0,0 +1,86 @@ |
|||
#!/usr/bin/env python3 |
|||
# -*-coding:UTF-8 -* |
|||
|
|||
""" |
|||
The Urls Module |
|||
============================ |
|||
|
|||
This module extract URLs from an item and send them to others modules. |
|||
|
|||
""" |
|||
|
|||
################################## |
|||
# Import External packages |
|||
################################## |
|||
import redis |
|||
import pprint |
|||
import time |
|||
import os |
|||
from pyfaup.faup import Faup |
|||
import re |
|||
|
|||
################################## |
|||
# Import Project packages |
|||
################################## |
|||
from module.abstract_module import AbstractModule |
|||
from packages.Item import Item |
|||
from packages import lib_refine |
|||
from Helper import Process |
|||
|
|||
|
|||
class Urls(AbstractModule):
    """
    Urls module for AIL framework.

    Extracts URLs from an item's content with a scheme-anchored regex and
    forwards every match to the 'Url' queue for downstream modules.
    """

    def __init__(self):
        """
        Init Urls: build the URL detection regex from the configured
        protocols file (one scheme name per line).
        """
        super(Urls, self).__init__()

        # FUNCTIONS #
        self.faup = Faup()

        # Protocol file path
        protocolsfile_path = os.path.join(os.environ['AIL_HOME'],
                        self.process.config.get("Directories", "protocolsfile"))
        # Build the scheme alternation ("http|https|...") used by the regex.
        # BUGFIX: the original used scheme[:-1] per line, which chops the last
        # character of the final scheme when the file has no trailing newline;
        # strip() + join is robust to that (and to CRLF line endings).
        with open(protocolsfile_path, 'r') as scheme_file:
            uri_scheme = "|".join(scheme.strip() for scheme in scheme_file if scheme.strip())

        self.url_regex = "((?i:"+uri_scheme + \
            ")\://(?:[a-zA-Z0-9\.\-]+(?:\:[a-zA-Z0-9\.&%\$\-]+)*@)*(?:(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|(?:[a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.(?:com|edu|gov|int|mil|net|org|biz|arpa|info|name|pro|aero|coop|museum|[a-zA-Z]{2}))(?:\:[0-9]+)*(?:/(?:$|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"

        # Send module state to logs
        self.redis_logger.info(f"Module {self.module_name} initialized")

    def compute(self, message):
        """
        Search for Web links from the given message and publish each match
        to the 'Url' queue.

        :param message: "<item id> <score>" pair produced by upstream modules
        """
        # Extract item (score is currently unused)
        id, score = message.split()

        item = Item(id)

        # NOTE(review): `regex_helper` is not imported in this file and
        # `self.redis_cache_key` is never initialized in __init__ — confirm
        # the missing `from packages import regex_helper` and cache-key
        # setup against the rest of the project.
        l_urls = regex_helper.regex_findall(self.module_name, self.redis_cache_key, self.url_regex, item.get_id(), item.get_content())

        # BUGFIX: the original referenced an undefined name `urls` below
        # (NameError as soon as a URL was detected); use `l_urls`.
        if len(l_urls) > 0:
            to_print = f'Urls;{item.get_source()};{item.get_date()};{item.get_basename()};'
            self.redis_logger.info(f'{to_print}Detected {len(l_urls)} URL;{item.get_id()}')

        for url in l_urls:
            # # TODO: FIXME handle .foundation .dev onion? i2p?

            to_send = f"{url} {item.get_id()}"
            self.send_message_to_queue(to_send, 'Url')
            self.redis_logger.debug(f"url_parsed: {to_send}")
|||
|
|||
if __name__ == '__main__':
    # Instantiate the module and enter its processing loop.
    Urls().run()
|||
@ -1,184 +0,0 @@ |
|||
#!/usr/bin/env python3 |
|||
# -*-coding:UTF-8 -* |
|||
|
|||
""" |
|||
The Web Module |
|||
============================ |
|||
|
|||
This module tries to parse URLs and warns if some defined contry code are present. |
|||
|
|||
""" |
|||
|
|||
################################## |
|||
# Import External packages |
|||
################################## |
|||
import redis |
|||
import pprint |
|||
import time |
|||
import os |
|||
import dns.exception |
|||
from pyfaup.faup import Faup |
|||
import re |
|||
# Country and ASN lookup |
|||
from cymru.ip2asn.dns import DNSClient as ip2asn |
|||
import socket |
|||
import pycountry |
|||
import ipaddress |
|||
|
|||
################################## |
|||
# Import Project packages |
|||
################################## |
|||
from module.abstract_module import AbstractModule |
|||
from packages import Paste |
|||
from packages import lib_refine |
|||
from Helper import Process |
|||
|
|||
|
|||
class Web(AbstractModule):
    """
    Web module for AIL framework.

    Detects URLs in pastes, publishes them to the 'Url' queue, resolves each
    unique host once per paste, and performs an ASN/country lookup — logging
    at INFO level when the country code matches the configured critical one.
    """

    # Used to prevent concat with empty fields due to url parsing.
    # NOTE(review): not called anywhere in the visible code — possibly dead.
    def avoidNone(self, a_string):
        if a_string is None:
            return ""
        else:
            return a_string

    def __init__(self):
        """
        Init Web: Redis cache connection, critical country-code setting,
        and the URL regex built from the configured protocols file.
        """
        super(Web, self).__init__(logger_channel='script:web')

        # REDIS Cache (used by lib_refine for A-record checking)
        self.r_serv2 = redis.StrictRedis(
            host=self.process.config.get("Redis_Cache", "host"),
            port=self.process.config.getint("Redis_Cache", "port"),
            db=self.process.config.getint("Redis_Cache", "db"),
            decode_responses=True)

        # Country code to log as critical when seen in resolved hosts
        self.cc_critical = self.process.config.get("Url", "cc_critical")

        # FUNCTIONS #

        self.faup = Faup()

        # Protocol file path
        protocolsfile_path = os.path.join(os.environ['AIL_HOME'],
                        self.process.config.get("Directories", "protocolsfile"))
        # Build the scheme alternation ("http|https|...") for the URL regex;
        # one scheme per line, scheme[:-1] strips the trailing newline.
        uri_scheme = ""
        with open(protocolsfile_path, 'r') as scheme_file:
            for scheme in scheme_file:
                uri_scheme += scheme[:-1]+"|"
        # Drop the trailing "|" added by the loop
        uri_scheme = uri_scheme[:-1]

        self.url_regex = "((?i:"+uri_scheme + \
            ")\://(?:[a-zA-Z0-9\.\-]+(?:\:[a-zA-Z0-9\.&%\$\-]+)*@)*(?:(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|(?:[a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.(?:com|edu|gov|int|mil|net|org|biz|arpa|info|name|pro|aero|coop|museum|[a-zA-Z]{2}))(?:\:[0-9]+)*(?:/(?:$|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"

        # Name of the previously processed paste; used to skip duplicates
        self.prec_filename = None

        # Send module state to logs
        self.redis_logger.info(f"Module {self.module_name} initialized")

    def compute(self, message):
        """
        Search for Web links in the paste referenced by the given message.

        :param message: "<paste filename> <score>" pair; the paste is only
            processed when its filename differs from the previous message.
        """
        # Extract item
        filename, score = message.split()

        # Per-paste accumulators of seen domains / hosts
        domains_list = set()
        hosts_list = set()

        if self.prec_filename is None or filename != self.prec_filename:
            domains_list.clear()
            hosts_list.clear()

            PST = Paste.Paste(filename)
            # Team Cymru DNS-based IP -> ASN/CC lookup client
            client = ip2asn()

            detected_urls = PST.get_regex(self.url_regex)
            if len(detected_urls) > 0:
                to_print = f'Web;{PST.p_source};{PST.p_date};{PST.p_name};'
                self.redis_logger.info(f'{to_print}Detected {len(detected_urls)} URL;{PST.p_rel_path}')

            for url in detected_urls:

                # NOTE(review): ".on" looks like a truncated onion suffix —
                # confirm whether ".onion" was intended here.
                if url.endswith(".on"):
                    # URL is an onion link: skip
                    # TODO send to TOR crawler ?
                    # self.redis_logger.debug("Skip onion link")
                    continue

                self.redis_logger.debug(f"match regex: {url}")

                # Publish "<url> <date> (unknown)" to the Url queue
                to_send = f"{url} {PST._get_p_date()} (unknown)"
                self.process.populate_set_out(to_send, 'Url')
                self.redis_logger.debug(f"url_parsed: {to_send}")

                self.faup.decode(url)
                domain = self.faup.get_domain()
                subdomain = self.faup.get_subdomain()

                self.redis_logger.debug(f'{url} Published')

                domains_list.add(domain)

                # Full host = subdomain.domain when a subdomain exists
                hostl = f'{subdomain}.{domain}' if subdomain else domain

                if hostl not in hosts_list:
                    # test host only once per paste
                    hosts_list.add(hostl)

                    try:
                        # Cap DNS resolution at 1s so a dead host does not
                        # stall the whole paste
                        socket.setdefaulttimeout(1)
                        ip = socket.gethostbyname(hostl)
                        # If the resolver is not giving any IPv4 address,
                        # the ASN/CC lookup is skipped.
                        l = client.lookup(ip, qType='IP')
                    except ipaddress.AddressValueError:
                        self.redis_logger.debug(
                            f'ASN/CC lookup failed for IP {ip}')
                        continue
                    except:
                        # NOTE(review): bare except deliberately swallows any
                        # resolver/lookup failure as best-effort; consider
                        # narrowing to (socket.error, dns.exception.DNSException).
                        self.redis_logger.debug(
                            f'Resolver IPv4 address failed for host {hostl}')
                        continue

                    cc = getattr(l, 'cc')
                    asn = ''
                    if getattr(l, 'asn') is not None:
                        # Remove the leading "b'" of the bytes repr
                        asn = getattr(l, 'asn')[2:]

                    # EU is not an official ISO 3166 code (but used by RIPE
                    # IP allocation)
                    if cc is not None and cc != "EU":
                        countryname = pycountry.countries.get(alpha_2=cc).name
                        self.redis_logger.debug(f'{hostl};{asn};{cc};{countryname}')
                        if cc == self.cc_critical:
                            to_print = f'Url;{PST.p_source};{PST.p_date};{PST.p_name};Detected {hostl} {cc}'
                            self.redis_logger.info(to_print)
                    else:
                        self.redis_logger.debug(f'{hostl};{asn};{cc}')

            # Count domains of this paste that have a valid A record
            A_values = lib_refine.checking_A_record(self.r_serv2,
                                                    domains_list)

            if A_values[0] >= 1:

                pprint.pprint(A_values)
                # self.redis_logger.info('Url;{};{};{};Checked {} URL;{}'.format(
                #     PST.p_source, PST.p_date, PST.p_name, A_values[0], PST.p_rel_path))

        # Remember the paste so an identical follow-up message is skipped
        self.prec_filename = filename
|||
|
|||
|
|||
if __name__ == '__main__':
    # Instantiate the module and enter its processing loop.
    Web().run()
|||
@ -1,207 +0,0 @@ |
|||
#!/usr/bin/env python3 |
|||
# -*-coding:UTF-8 -* |
|||
|
|||
""" |
|||
The WebStats Module |
|||
====================== |
|||
|
|||
This module makes stats on URL recolted from the web module. |
|||
It consider the TLD, Domain and protocol. |
|||
|
|||
""" |
|||
|
|||
################################## |
|||
# Import External packages |
|||
################################## |
|||
import time |
|||
import datetime |
|||
import redis |
|||
import os |
|||
from pubsublogger import publisher |
|||
from pyfaup.faup import Faup |
|||
|
|||
|
|||
################################## |
|||
# Import Project packages |
|||
################################## |
|||
from module.abstract_module import AbstractModule |
|||
from packages import lib_words |
|||
from packages.Date import Date |
|||
from Helper import Process |
|||
|
|||
|
|||
class WebStats(AbstractModule):
    """
    WebStats module for AIL framework.

    Consumes "<url> <date> <path>" messages from the Web module, keeps
    per-day counters for scheme/TLD/domain in a trending Redis store,
    maintains "top progression" sorted sets, and periodically regenerates
    the trending CSV curves.
    """

    # Config Var
    THRESHOLD_TOTAL_SUM = 200  # Above this value, a keyword is eligible for a progression
    THRESHOLD_INCREASE = 1.0  # The percentage representing the keyword occurence since num_day_to_look
    MAX_SET_CARDINALITY = 10  # The cardinality of the progression set
    NUM_DAY_TO_LOOK = 5  # the detection of the progression start num_day_to_look in the past

    def __init__(self):
        """
        Init WebStats: trending Redis connection, CSV/wordlist file paths,
        and the Faup URL parser.
        """
        super(WebStats, self).__init__()

        # Send module state to logs
        self.redis_logger.info("Module %s initialized"%(self.module_name))
        # Sent to the logging a description of the module
        self.redis_logger.info("Makes statistics about valid URL")

        # Wait up to 5 minutes between messages before computeNone() fires
        self.pending_seconds = 5*60

        # REDIS #
        self.r_serv_trend = redis.StrictRedis(
            host=self.process.config.get("ARDB_Trending", "host"),
            port=self.process.config.get("ARDB_Trending", "port"),
            db=self.process.config.get("ARDB_Trending", "db"),
            decode_responses=True)

        # FILE CURVE SECTION #
        self.csv_path_proto = os.path.join(os.environ['AIL_HOME'],
                        self.process.config.get("Directories", "protocolstrending_csv"))
        self.protocolsfile_path = os.path.join(os.environ['AIL_HOME'],
                        self.process.config.get("Directories", "protocolsfile"))

        self.csv_path_tld = os.path.join(os.environ['AIL_HOME'],
                        self.process.config.get("Directories", "tldstrending_csv"))
        self.tldsfile_path = os.path.join(os.environ['AIL_HOME'],
                        self.process.config.get("Directories", "tldsfile"))

        self.csv_path_domain = os.path.join(os.environ['AIL_HOME'],
                        self.process.config.get("Directories", "domainstrending_csv"))

        self.faup = Faup()
        # Set by compute(); when True, the next idle period rebuilds graphs
        self.generate_new_graph = False

    def computeNone(self):
        """
        Idle-time hook: rebuild the protocol/TLD/domain trend graphs once
        if new data arrived since the last rebuild.
        """
        if self.generate_new_graph:
            self.generate_new_graph = False

            today = datetime.date.today()
            year = today.year
            month = today.month

            self.redis_logger.debug('Building protocol graph')
            lib_words.create_curve_with_word_file(self.r_serv_trend, self.csv_path_proto,
                                                  self.protocolsfile_path, year,
                                                  month)

            self.redis_logger.debug('Building tld graph')
            lib_words.create_curve_with_word_file(self.r_serv_trend, self.csv_path_tld,
                                                  self.tldsfile_path, year,
                                                  month)

            self.redis_logger.debug('Building domain graph')
            lib_words.create_curve_from_redis_set(self.r_serv_trend, self.csv_path_domain,
                                                  "domain", year,
                                                  month)
            self.redis_logger.debug('end building')

    def compute(self, message):
        """
        Update counters and progression sets for one URL message.

        :param message: "<url> <date> <path>" as published by the Web module
        """
        self.generate_new_graph = True

        # Do something with the message from the queue
        url, date, path = message.split()
        self.faup.decode(url)
        url_parsed = self.faup.get()

        # Scheme analysis
        self.analyse('scheme', date, url_parsed)
        # Tld analysis
        self.analyse('tld', date, url_parsed)
        # Domain analysis
        self.analyse('domain', date, url_parsed)

        self.compute_progression('scheme', self.NUM_DAY_TO_LOOK, url_parsed)
        self.compute_progression('tld', self.NUM_DAY_TO_LOOK, url_parsed)
        self.compute_progression('domain', self.NUM_DAY_TO_LOOK, url_parsed)

    def analyse(self, field_name, date, url_parsed):
        """
        Increment the per-day counter for one parsed-URL field; domains are
        additionally collected in a monthly set for the domain plot.

        :param field_name: 'scheme', 'tld' or 'domain'
        :param date: day key, format YYYYMMDD (YYYYMM prefix used for the set)
        :param url_parsed: dict returned by faup.get()
        """
        field = url_parsed[field_name]

        if field is not None:
            # Depending on the faup version the value may be bytes; decode
            # to str when possible.
            try:
                field = field.decode()
            except:
                pass

            self.r_serv_trend.hincrby(field, date, 1)

            if field_name == "domain":  # save domain in a set for the monthly plot
                domain_set_name = "domain_set_" + date[0:6]
                self.r_serv_trend.sadd(domain_set_name, field)
                self.redis_logger.debug("added in " + domain_set_name +": "+ field)

    def get_date_range(self, num_day):
        """
        Return the list of day keys (YYYYMMDD) from today back num_day days,
        inclusive (num_day+1 entries, newest first).
        """
        curr_date = datetime.date.today()
        date = Date(str(curr_date.year)+str(curr_date.month).zfill(2)+str(curr_date.day).zfill(2))
        date_list = []

        for i in range(0, num_day+1):
            date_list.append(date.substract_day(i))
        return date_list

    def compute_progression_word(self, num_day, keyword):
        """
        Compute the progression for one keyword.

        :return: (keyword_increase, keyword_total_sum) where the increase is
            the sum of day-over-day ratios value[i] / value[i-1] and the
            total sum is the keyword's occurrence count over the window.
        """
        date_range = self.get_date_range(num_day)
        # check if this keyword is eligible for progression
        keyword_total_sum = 0
        value_list = []
        for date in date_range:  # get value up to date_range
            curr_value = self.r_serv_trend.hget(keyword, date)
            value_list.append(int(curr_value if curr_value is not None else 0))
            keyword_total_sum += int(curr_value) if curr_value is not None else 0
        # NOTE(review): oldest_value is computed but never used below.
        oldest_value = value_list[-1] if value_list[-1] != 0 else 1  # Avoid zero division

        # The progression is based on the ratio: value[i] / value[i-1]
        keyword_increase = 0
        value_list_reversed = value_list[:]
        value_list_reversed.reverse()
        for i in range(1, len(value_list_reversed)):
            divisor = value_list_reversed[i-1] if value_list_reversed[i-1] != 0 else 1
            keyword_increase += value_list_reversed[i] / divisor

        return (keyword_increase, keyword_total_sum)

    def compute_progression(self, field_name, num_day, url_parsed):
        """
        Recompute the top_progression zset for one field.

        - Compute the current field progression
        - Re-compute the current progression for each of the first
          2*self.MAX_SET_CARDINALITY fields in the top_progression_zset
        """
        redis_progression_name_set = "z_top_progression_"+field_name

        keyword = url_parsed[field_name]
        if keyword is not None:

            # compute the progression of the current word
            keyword_increase, keyword_total_sum = self.compute_progression_word(num_day, keyword)

            # re-compute the progression of 2*self.MAX_SET_CARDINALITY
            current_top = self.r_serv_trend.zrevrangebyscore(redis_progression_name_set, '+inf', '-inf', withscores=True, start=0, num=2*self.MAX_SET_CARDINALITY)
            for word, value in current_top:
                word_inc, word_tot_sum = self.compute_progression_word(num_day, word)
                self.r_serv_trend.zrem(redis_progression_name_set, word)
                # NOTE(review): zadd(name, score, member) is the legacy
                # redis-py 2.x positional signature; redis-py >= 3.0 requires
                # zadd(name, {member: score}) — confirm the pinned version.
                if (word_tot_sum > self.THRESHOLD_TOTAL_SUM) and (word_inc > self.THRESHOLD_INCREASE):
                    self.r_serv_trend.zadd(redis_progression_name_set, float(word_inc), word)

            # filter before adding
            if (keyword_total_sum > self.THRESHOLD_TOTAL_SUM) and (keyword_increase > self.THRESHOLD_INCREASE):
                self.r_serv_trend.zadd(redis_progression_name_set, float(keyword_increase), keyword)
|||
|
|||
|
|||
if __name__ == '__main__':
    # Instantiate the module and enter its processing loop.
    WebStats().run()
|||
쓰기
미리보기
불러오는 중...
취소
저장
Reference in new issue