mirror of https://github.com/CIRCL/AIL-framework
chg: [Urls (Web) module] fix regex + rename
parent
fc11c1769b
commit
73da7ae4c5
|
@ -12,21 +12,21 @@ This module extract URLs from an item and send them to others modules.
|
|||
##################################
|
||||
# Import External packages
|
||||
##################################
|
||||
import redis
|
||||
import pprint
|
||||
import time
|
||||
import os
|
||||
from pyfaup.faup import Faup
|
||||
import re
|
||||
import sys
|
||||
|
||||
from pyfaup.faup import Faup
|
||||
|
||||
sys.path.append(os.environ['AIL_BIN'])
|
||||
##################################
|
||||
# Import Project packages
|
||||
##################################
|
||||
from module.abstract_module import AbstractModule
|
||||
from modules.abstract_module import AbstractModule
|
||||
from packages.Item import Item
|
||||
from packages import lib_refine
|
||||
from Helper import Process
|
||||
from lib import regex_helper
|
||||
|
||||
# # TODO: Faup packages: Add new binding: Check TLD
|
||||
|
||||
class Urls(AbstractModule):
|
||||
"""
|
||||
|
@ -39,8 +39,8 @@ class Urls(AbstractModule):
|
|||
"""
|
||||
super(Urls, self).__init__()
|
||||
|
||||
# FUNCTIONS #
|
||||
self.faup = Faup()
|
||||
self.redis_cache_key = regex_helper.generate_redis_cache_key(self.module_name)
|
||||
|
||||
# Protocol file path
|
||||
protocolsfile_path = os.path.join(os.environ['AIL_HOME'],
|
||||
|
@ -53,7 +53,7 @@ class Urls(AbstractModule):
|
|||
uri_scheme = uri_scheme[:-1]
|
||||
|
||||
self.url_regex = "((?i:"+uri_scheme + \
|
||||
")\://(?:[a-zA-Z0-9\.\-]+(?:\:[a-zA-Z0-9\.&%\$\-]+)*@)*(?:(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|(?:[a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.(?:com|edu|gov|int|mil|net|org|biz|arpa|info|name|pro|aero|coop|museum|[a-zA-Z]{2}))(?:\:[0-9]+)*(?:/(?:$|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
|
||||
")\://(?:[a-zA-Z0-9\.\-]+(?:\:[a-zA-Z0-9\.&%\$\-]+)*@)*(?:(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|(?:[a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.(?:[a-zA-Z]{2,15}))(?:\:[0-9]+)*(?:/?(?:[a-zA-Z0-9\.\,\?'\\+&%\$#\=~_\-]+))*)"
|
||||
|
||||
# Send module state to logs
|
||||
self.redis_logger.info(f"Module {self.module_name} initialized")
|
||||
|
@ -67,19 +67,22 @@ class Urls(AbstractModule):
|
|||
id, score = message.split()
|
||||
|
||||
item = Item(id)
|
||||
item_content = item.get_content()
|
||||
|
||||
l_urls = regex_helper.regex_findall(self.module_name, self.redis_cache_key, self.url_regex, item.get_id(), item.get_content())
|
||||
if len(urls) > 0:
|
||||
to_print = f'Urls;{item.get_source()};{item.get_date()};{item.get_basename()};'
|
||||
self.redis_logger.info(f'{to_print}Detected {len(urls)} URL;{item.get_id()}')
|
||||
|
||||
l_urls = regex_helper.regex_findall(self.module_name, self.redis_cache_key, self.url_regex, item.get_id(), item_content)
|
||||
for url in l_urls:
|
||||
# # TODO: FIXME handle .foundation .dev onion? i2p?
|
||||
self.faup.decode(url)
|
||||
unpack_url = self.faup.get()
|
||||
|
||||
to_send = f"{url} {item.get_id()}"
|
||||
print(to_send)
|
||||
self.send_message_to_queue(to_send, 'Url')
|
||||
self.redis_logger.debug(f"url_parsed: {to_send}")
|
||||
|
||||
if len(l_urls) > 0:
|
||||
to_print = f'Urls;{item.get_source()};{item.get_date()};{item.get_basename()};'
|
||||
self.redis_logger.info(f'{to_print}Detected {len(l_urls)} URL;{item.get_id()}')
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
module = Urls()
|
|
@ -51,7 +51,7 @@ subscribe = Redis_Global
|
|||
|
||||
[Categ]
|
||||
subscribe = Redis_Global
|
||||
publish = Redis_CreditCards,Redis_Mail,Redis_Onion,Redis_Web,Redis_Credential,Redis_SourceCode,Redis_Cve,Redis_ApiKey
|
||||
publish = Redis_CreditCards,Redis_Mail,Redis_Onion,Redis_Urls,Redis_Credential,Redis_SourceCode,Redis_Cve,Redis_ApiKey
|
||||
|
||||
[CreditCards]
|
||||
subscribe = Redis_CreditCards
|
||||
|
@ -74,14 +74,11 @@ publish = Redis_ValidOnion,Redis_Tags,Redis_Crawler
|
|||
[DumpValidOnion]
|
||||
subscribe = Redis_ValidOnion
|
||||
|
||||
[Web]
|
||||
subscribe = Redis_Web
|
||||
[Urls]
|
||||
subscribe = Redis_Urls
|
||||
publish = Redis_Url
|
||||
#publish = Redis_Url,ZMQ_Url
|
||||
|
||||
[WebStats]
|
||||
subscribe = Redis_Url
|
||||
|
||||
[LibInjection]
|
||||
subscribe = Redis_Url
|
||||
publish = Redis_Duplicate,Redis_Tags
|
||||
|
|
Loading…
Reference in New Issue