2021-06-02 14:42:23 +02:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
# -*-coding:UTF-8 -*
|
|
|
|
|
|
|
|
"""
|
|
|
|
The Urls Module
|
|
|
|
============================
|
|
|
|
|
|
|
|
This module extracts URLs from an item and sends them to other modules.
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
##################################
|
|
|
|
# Import External packages
|
|
|
|
##################################
|
|
|
|
import os
|
2021-06-07 15:58:21 +02:00
|
|
|
import sys
|
|
|
|
|
|
|
|
from pyfaup.faup import Faup
|
2021-06-02 14:42:23 +02:00
|
|
|
|
2021-06-07 15:58:21 +02:00
|
|
|
sys.path.append(os.environ['AIL_BIN'])
|
2021-06-02 14:42:23 +02:00
|
|
|
##################################
|
|
|
|
# Import Project packages
|
|
|
|
##################################
|
2021-06-07 15:58:21 +02:00
|
|
|
from modules.abstract_module import AbstractModule
|
2023-04-13 14:25:02 +02:00
|
|
|
from lib.ConfigLoader import ConfigLoader
|
2022-10-25 16:25:19 +02:00
|
|
|
from lib.objects.Items import Item
|
2021-06-02 14:42:23 +02:00
|
|
|
|
2021-06-07 15:58:21 +02:00
|
|
|
# # TODO: Faup packages: Add new binding: Check TLD
|
2021-06-02 14:42:23 +02:00
|
|
|
|
|
|
|
class Urls(AbstractModule):
    """
    Urls module for AIL framework.

    Builds a URL regex from the URI schemes listed in the configured
    protocols file, applies it to an item's content, runs every match through
    Faup and hands the result to ``add_message_to_queue`` under the name 'Url'.
    """

    def __init__(self):
        """
        Init Urls.

        Reads the protocols file located under ``$AIL_HOME`` (relative path
        taken from the "Directories"/"protocolsfile" config entry) and builds
        ``self.url_regex`` from the schemes it lists.

        :raises KeyError: if the ``AIL_HOME`` environment variable is not set.
        :raises OSError: if the protocols file cannot be opened.
        """
        super(Urls, self).__init__()

        config_loader = ConfigLoader()

        self.faup = Faup()

        # Protocol file path
        protocolsfile_path = os.path.join(os.environ['AIL_HOME'],
                                          config_loader.get_config_str("Directories", "protocolsfile"))
        # Get all uri from protocolsfile (Used for Curve)
        uri_scheme = self._load_uri_schemes(protocolsfile_path)

        # Raw string: the pattern is full of regex escapes (\:, \., \$ ...)
        # that are invalid *Python* string escapes and trigger warnings in a
        # normal literal. The compiled pattern is unchanged.
        self.url_regex = (
            "((?i:" + uri_scheme +
            r")\://(?:[a-zA-Z0-9\.\-]+(?:\:[a-zA-Z0-9\.&%\$\-]+)*@)*(?:(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|(?:[a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.(?:[a-zA-Z]{2,15}))(?:\:[0-9]+)*(?:/?(?:[a-zA-Z0-9\.\,\?'\+&%\$#\=~_\-]+))*)"
        )

        # Send module state to logs
        self.logger.info(f"Module {self.module_name} initialized")

    @staticmethod
    def _load_uri_schemes(protocolsfile_path):
        """
        Return the schemes listed one per line in the file, joined with '|'.

        Lines are stripped of surrounding whitespace and blank lines are
        skipped, so a missing final newline does not truncate the last scheme
        and an empty line does not add an empty regex alternative.

        :param protocolsfile_path: path of the protocols file to read.
        :return: regex alternation such as ``"http|https|ftp"`` ('' if the
                 file lists no scheme).
        """
        with open(protocolsfile_path, 'r') as scheme_file:
            schemes = [line.strip() for line in scheme_file]
        return "|".join(scheme for scheme in schemes if scheme)

    def compute(self, message):
        """
        Search for Web links from given message.

        :param message: whitespace-separated string made of an item id and a
                        score; the score is not used by this module.
        :raises ValueError: if the message does not split into exactly two
                            fields.
        """
        # Extract item
        item_id, _score = message.split()

        item = Item(item_id)
        item_content = item.get_content()

        # TODO Handle invalid URL
        l_urls = self.regex_findall(self.url_regex, item.get_id(), item_content)
        for url in l_urls:
            self.faup.decode(url)
            url_decoded = self.faup.get()
            # decode URL: use .decode() when the 'url' field provides it
            # (bytes), otherwise keep the value as returned.
            try:
                url = url_decoded['url'].decode()
            except AttributeError:
                url = url_decoded['url']

            to_send = f"{url} {item.get_id()}"
            print(to_send)
            self.add_message_to_queue(to_send, 'Url')
            self.logger.debug(f"url_parsed: {to_send}")

        if l_urls:
            to_print = f'Urls;{item.get_source()};{item.get_date()};{item.get_basename()};'
            print(to_print)
            # .debug ???
            # self.redis_logger.info(f'{to_print}Detected {len(l_urls)} URL;{item.get_id()}')
|
2021-06-07 15:58:21 +02:00
|
|
|
|
2021-06-02 14:42:23 +02:00
|
|
|
|
2022-10-25 16:25:19 +02:00
|
|
|
if __name__ == '__main__':
    # Script entry point: build the Urls module and start its run loop.
    Urls().run()
|