2020-03-30 18:43:50 +02:00
API Helper
import base64
import gzip
2022-10-25 16:25:19 +02:00
import hashlib
2020-03-30 18:43:50 +02:00
import json
import os
2022-10-25 16:25:19 +02:00
import pickle
2020-03-30 18:43:50 +02:00
import re
import sys
2020-07-27 15:46:09 +02:00
import time
2020-03-30 18:43:50 +02:00
import uuid
2022-10-25 16:25:19 +02:00
from enum import IntEnum, unique
2020-03-30 18:43:50 +02:00
from datetime import datetime, timedelta
2023-03-14 17:36:42 +01:00
from dateutil.relativedelta import relativedelta
2021-05-14 14:42:16 +02:00
from urllib.parse import urlparse, urljoin
2023-03-14 17:36:42 +01:00
from bs4 import BeautifulSoup
2022-10-25 16:25:19 +02:00
from pylacus import PyLacus
2020-03-30 18:43:50 +02:00
from pyfaup.faup import Faup
2020-05-22 15:41:05 +02:00
# interact with splash_crawler API
import requests
2022-10-25 16:25:19 +02:00
# Import Project packages
from packages import git_status
from lib.ConfigLoader import ConfigLoader
from lib.objects.Domains import Domain
2022-11-30 15:50:10 +01:00
from lib.objects.Items import Item
2022-10-25 16:25:19 +02:00
# Resolve every connection handle and setting once at import time, then
# release the loader object.
config_loader = ConfigLoader()
r_db = config_loader.get_db_conn("Kvrocks_DB")
r_crawler = config_loader.get_db_conn("Kvrocks_Crawler")
r_cache = config_loader.get_redis_conn("Redis_Cache")

ITEMS_FOLDER = config_loader.get_config_str("Directories", "pastes")
HAR_DIR = config_loader.get_files_directory('har')
activate_crawler = config_loader.get_config_str("Crawler", "activate_crawler")
config_loader = None

# Module-wide Faup URL parser, handed out by get_faup().
faup = Faup()
2020-08-17 21:52:57 +02:00
# # # # # # # #
# #
# #
# # # # # # # #
2022-10-25 16:25:19 +02:00
def gen_uuid():
    """Return a freshly generated random UUID4 in dashed string form."""
    new_uuid = uuid.uuid4()
    return str(new_uuid)
2020-03-30 18:43:50 +02:00
def generate_uuid():
    """Return a random UUID4 as 32 lowercase hex characters (no dashes)."""
    # UUID.hex is the canonical string form with the dashes stripped.
    return uuid.uuid4().hex
2021-02-10 15:50:48 +01:00
# # TODO: remove me ?
2022-10-25 16:25:19 +02:00
def get_current_date(separator=False):
    """Return today's local date as ``YYYYMMDD``, or ``YYYY/MM/DD`` when
    *separator* is truthy."""
    date_format = "%Y/%m/%d" if separator else "%Y%m%d"
    return datetime.now().strftime(date_format)
def get_date_crawled_items_source(date):
    """Return the relative path ``crawled/<date>``."""
    crawled_root = 'crawled'
    return os.path.join(crawled_root, date)
def get_date_har_dir(date):
    """Return the path of the *date* sub-directory under ``HAR_DIR``."""
    har_root = HAR_DIR
    return os.path.join(har_root, date)
2020-08-17 21:52:57 +02:00
2021-02-05 17:42:33 +01:00
def is_valid_onion_domain(domain):
    """Return True when *domain* is a bare v2 or v3 onion address.

    The name must end with ``.onion`` and the remainder must be exactly
    16 (v2) or 56 (v3) characters from ``[a-z0-9]``.
    """
    suffix = '.onion'
    if not domain.endswith(suffix):
        return False

    label = domain.replace(suffix, '', 1)
    label_size = len(label)
    if label_size == 16:  # v2 address
        pattern = r'[a-z0-9]{16}'
    elif label_size == 56:  # v3 address
        pattern = r'[a-z0-9]{56}'
    else:
        return False
    # The label length equals the pattern length, so a full match is the
    # same test as a prefix match here.
    return re.fullmatch(pattern, label) is not None
2020-08-17 21:52:57 +02:00
2023-03-31 09:25:06 +02:00
def is_valid_domain(domain):
    """Return True when Faup parses *domain* back to exactly the same domain.

    :param domain: candidate domain name (callers pass it lower-cased)
    :return: bool
    """
    # Feed the candidate to the parser first; without this, get() would
    # report whatever URL was decoded last.
    faup.decode(domain)
    url_unpack = faup.get()
    unpack_domain = url_unpack['domain']
    if not unpack_domain:
        # Nothing recognisable as a domain was extracted.
        return False
    return domain == unpack_domain.lower()
2021-03-05 18:47:38 +01:00
def get_faup():
    """Return the module-wide Faup parser instance."""
    shared_parser = faup
    return shared_parser
2022-10-25 16:25:19 +02:00
def unpack_url(url):
    """Parse *url* with Faup and return its component dict, normalised.

    Normalisation applied to the returned dict:
      - ``port`` defaults to 443 for https and to 80 for http or any other
        scheme when the URL carries no explicit port;
      - ``domain`` is lower-cased;
      - ``url`` is a ``str`` with the host part lower-cased.

    :param url: URL to parse
    :return: the Faup result dict
    """
    f = get_faup()
    # Decode the URL first; get() only returns the last decoded result.
    f.decode(url)
    url_decoded = f.get()

    if not url_decoded['port']:
        url_decoded['port'] = 443 if url_decoded['scheme'] == 'https' else 80

    # decode URL: the value may be bytes or already a str
    try:
        url = url_decoded['url'].decode()
    except AttributeError:
        url = url_decoded['url']

    # if not url_decoded['scheme']:
    #    url = f'http://{url}'

    # Fix case
    url_decoded['domain'] = url_decoded['domain'].lower()
    url_decoded['url'] = url.replace(url_decoded['host'], url_decoded['host'].lower(), 1)
    return url_decoded
2021-05-14 14:42:16 +02:00
# # # # # # # #
# #
2022-10-25 16:25:19 +02:00
2021-05-14 14:42:16 +02:00
# #
2023-02-21 12:22:49 +01:00
# # # # # # # # TODO CREATE NEW OBJECT
2021-05-14 14:42:16 +02:00
def get_favicon_from_html(html, domain, url):
    """Return the set of favicon URLs for a page.

    Uses the icons declared in *html*; when the page declares none, falls
    back to the conventional ``/favicon.ico`` at the root of *domain*.

    :param html: page source
    :param domain: domain the page was fetched from
    :param url: URL of the page, used to resolve relative links
    :return: set of absolute favicon URLs (never None)
    """
    favicon_urls = extract_favicon_from_html(html, url)
    # add root favicon
    if not favicon_urls:
        scheme = urlparse(url).scheme or 'http'
        favicon_urls.add(f'{scheme}://{domain}/favicon.ico')
    return favicon_urls
def extract_favicon_from_html(html, url):
    """Collect the favicon URLs declared in the ``<head>`` of *html*.

    Only ``<link rel="icon">`` and ``<link rel="shortcut icon">`` tags with
    an ``href`` are considered. Relative and protocol-relative links are
    resolved against *url*. ``data:`` URIs are skipped.

    :param html: page source
    :param url: URL of the page
    :return: set of absolute favicon URLs (possibly empty)
    """
    favicon_urls = set()
    soup = BeautifulSoup(html, 'html.parser')
    head = soup.head
    if not head:
        return favicon_urls

    # If there are multiple <link rel="icon">s, the browser uses their media,
    # type, and sizes attributes to select the most appropriate icon.
    # If several icons are equally appropriate, the last one is used.
    # If the most appropriate icon is later found to be inappropriate,
    # for example because it uses an unsupported format,
    # the browser proceeds to the next-most appropriate, and so on.
    # # DEBUG: /!\ firefox load all favicon ???

    # iOS Safari 'apple-touch-icon'
    # Safari pinned tabs 'mask-icon'
    # Android Chrome 'manifest'
    # Edge and IE 12:
    #   - <meta name="msapplication-TileColor" content="#aaaaaa"> <meta name="theme-color" content="#ffffff">
    #   - <meta name="msapplication-config" content="/icons/browserconfig.xml">

    # desktop browser 'shortcut icon' (older browser), 'icon'
    set_icons = set()
    for favicon_tag in ('icon', 'shortcut icon'):
        rel_filter = {'rel': lambda x, wanted=favicon_tag: x and x.lower() == wanted, 'href': True}
        set_icons.update(head.find_all('link', attrs=rel_filter))

    page_scheme = urlparse(url).scheme
    for tag in set_icons:
        icon_url = tag.get('href')
        if not icon_url:
            continue
        if icon_url.startswith('data:'):
            # # TODO: handle base64 favicon
            continue
        # urljoin resolves relative paths and protocol-relative ('//host/x')
        # links; collapsing '//' to '/' would point the latter at the wrong host.
        icon_url = urljoin(url, icon_url)
        icon_url = urlparse(icon_url, scheme=page_scheme).geturl()
        favicon_urls.add(icon_url)
    return favicon_urls
# # # - - # # #
2020-03-30 18:43:50 +02:00
2023-03-14 17:36:42 +01:00
# # TODO:
2022-10-25 16:25:19 +02:00
def create_cookie_crawler(cookie_dict, domain, crawler_type='web'):
    """Prepare a cookie dict for the crawler; *cookie_dict* is updated in place.

    - a missing ``domain`` field is set to ``.<domain>``;
    - for the ``onion`` crawler type, ``secure`` is forced to False;
    - ``expires`` is set to ten days from now.

    :return: the same *cookie_dict* object
    """
    # check cookie domain field
    cookie_dict.setdefault('domain', f'.{domain}')

    # tor browser: disable secure cookie
    if crawler_type == 'onion':
        cookie_dict['secure'] = False

    # change expire date
    expiry = datetime.now() + timedelta(days=10)
    cookie_dict['expires'] = expiry.strftime('%Y-%m-%dT%H:%M:%S') + 'Z'
    return cookie_dict
2023-02-17 14:50:20 +01:00
2020-03-30 18:43:50 +02:00
2023-02-17 14:50:20 +01:00
def get_cookiejars():
    """Return the members of the ``cookiejars:all`` set."""
    all_jars = r_crawler.smembers('cookiejars:all')
    return all_jars
2020-03-30 18:43:50 +02:00
2023-02-17 14:50:20 +01:00
def get_cookiejars_global():
    """Return the members of ``cookiejars:global``, or ``[]`` when empty."""
    return r_crawler.smembers('cookiejars:global') or []
2020-03-30 18:43:50 +02:00
2023-02-17 14:50:20 +01:00
def get_cookiejars_user(user_id):
    """Return the members of ``cookiejars:user:<user_id>``, or ``[]`` when empty."""
    user_key = f'cookiejars:user:{user_id}'
    return r_crawler.smembers(user_key) or []
2020-03-30 18:43:50 +02:00
2023-02-17 14:50:20 +01:00
class Cookiejar:
    """A named collection of cookies stored in the crawler database.

    Keys used:
      - ``cookiejar:meta:<uuid>``    hash: date, description, user, level
      - ``cookiejar:cookies:<uuid>`` set of cookie uuids
      - ``cookiejars:all`` / ``cookiejars:global`` / ``cookiejars:user:<user>``
        index sets

    ``level`` is 0 for a jar registered under its owner only and 1 for a jar
    registered in the global set.
    """

    def __init__(self, cookiejar_uuid):
        self.uuid = cookiejar_uuid

    def exists(self):
        """Return a truthy value when the jar's meta hash exists."""
        return r_crawler.exists(f'cookiejar:meta:{self.uuid}')  # or cookiejar:uuid

    def get_date(self):
        return r_crawler.hget(f'cookiejar:meta:{self.uuid}', 'date')

    def _set_date(self, date):
        r_crawler.hset(f'cookiejar:meta:{self.uuid}', 'date', date)

    def get_description(self):
        return r_crawler.hget(f'cookiejar:meta:{self.uuid}', 'description')

    def set_description(self, description):
        r_crawler.hset(f'cookiejar:meta:{self.uuid}', 'description', description)

    def get_user(self):
        return r_crawler.hget(f'cookiejar:meta:{self.uuid}', 'user')

    def _set_user(self, user_id):
        return r_crawler.hset(f'cookiejar:meta:{self.uuid}', 'user', user_id)

    def get_level(self):
        """Return the jar level as an int: 0 (user) or 1 (global).

        The value comes back from the database as text, so it is converted
        before the truth test; a stored ``'0'`` would otherwise count as
        truthy and be reported as 1.
        """
        level = r_crawler.hget(f'cookiejar:meta:{self.uuid}', 'level')
        try:
            return 1 if int(level) else 0
        except (TypeError, ValueError):
            # unset or non-numeric value: fall back to plain truthiness
            return 1 if level else 0

    def _set_level(self, level):
        """Store *level* normalised to 0 or 1."""
        level = 1 if level else 0
        r_crawler.hset(f'cookiejar:meta:{self.uuid}', 'level', level)

    def is_cookie_in_jar(self, cookie_uuid):
        return r_crawler.sismember(f'cookiejar:cookies:{self.uuid}', cookie_uuid)

    def get_cookies_uuid(self):
        return r_crawler.smembers(f'cookiejar:cookies:{self.uuid}')

    def get_cookies(self, r_json=False):
        """Return the meta dict of every cookie in the jar."""
        return [Cookie(cookie_uuid).get_meta(r_json=r_json)
                for cookie_uuid in self.get_cookies_uuid()]

    def get_nb_cookies(self):
        return r_crawler.scard(f'cookiejar:cookies:{self.uuid}')

    def get_meta(self, level=False, nb_cookies=False, cookies=False, r_json=False):
        """Return the jar metadata; the optional sections are opt-in."""
        meta = {'uuid': self.uuid,
                'date': self.get_date(),
                'description': self.get_description(),
                'user': self.get_user()}
        if level:
            meta['level'] = self.get_level()
        if nb_cookies:
            meta['nb_cookies'] = self.get_nb_cookies()
        if cookies:
            meta['cookies'] = self.get_cookies(r_json=r_json)
        return meta

    def add_cookie(self, name, value, cookie_uuid=None, domain=None, httponly=None, path=None, secure=None, text=None):
        """Create a cookie in this jar and return its uuid.

        A caller-supplied *cookie_uuid* is kept unless a cookie with that
        uuid already exists; a new uuid is generated otherwise. Optional
        fields are stored only when truthy.
        """
        if not cookie_uuid or Cookie(cookie_uuid).exists():
            cookie_uuid = generate_uuid()
        r_crawler.sadd(f'cookiejar:cookies:{self.uuid}', cookie_uuid)

        cookie = Cookie(cookie_uuid)
        # Back-reference read by Cookie.get_cookiejar() for ACL checks and deletion.
        cookie.set_cookiejar(self.uuid)

        cookie.set_field('name', name)
        cookie.set_field('value', value)
        if domain:
            cookie.set_field('domain', domain)
        if httponly:
            cookie.set_field('httpOnly', str(httponly))
        if path:
            cookie.set_field('path', path)
        if secure:
            cookie.set_field('secure', str(secure))
        if text:
            # 'text' has its own field; writing it to 'path' clobbered the path.
            cookie.set_field('text', text)
        return cookie_uuid

    def delete_cookie(self, cookie_uuid):
        """Delete *cookie_uuid* if it belongs to this jar; otherwise do nothing."""
        if self.is_cookie_in_jar(cookie_uuid):
            cookie = Cookie(cookie_uuid)
            cookie.delete()

    def create(self, user_id, description=None, level=1):
        """Register the jar and write its metadata.

        :raises Exception: if the jar already exists
        """
        if self.exists():
            raise Exception('Cookiejar already exists')

        r_crawler.sadd('cookiejars:all', self.uuid)
        if level == 0:
            r_crawler.sadd(f'cookiejars:user:{user_id}', self.uuid)
        else:
            r_crawler.sadd('cookiejars:global', self.uuid)

        # exists() tests the meta hash, so the metadata must be written here.
        self._set_user(user_id)
        # NOTE(review): date format assumed to be YYYYMMDD - confirm against the UI
        self._set_date(datetime.now().strftime("%Y%m%d"))
        self._set_level(level)
        if description:
            self.set_description(description)

    def delete(self):
        """Delete the jar, its cookies, its index entries and its metadata."""
        for cookie_uuid in list(self.get_cookies_uuid()):
            self.delete_cookie(cookie_uuid)
        r_crawler.srem(f'cookiejars:user:{self.get_user()}', self.uuid)
        r_crawler.srem('cookiejars:global', self.uuid)
        r_crawler.srem('cookiejars:all', self.uuid)
        # Removed last: get_user() above still needs the meta hash.
        r_crawler.delete(f'cookiejar:meta:{self.uuid}')
2023-02-17 14:50:20 +01:00
def create_cookiejar(user_id, description=None, level=1, cookiejar_uuid=None):
    """Create a cookiejar and return its uuid.

    A caller-supplied *cookiejar_uuid* is used unless a jar with that uuid
    already exists; a new uuid is generated otherwise.
    """
    if not cookiejar_uuid or Cookiejar(cookiejar_uuid).exists():
        cookiejar_uuid = generate_uuid()
    cookiejar = Cookiejar(cookiejar_uuid)
    cookiejar.create(user_id, description=description, level=level)
    return cookiejar_uuid
2020-03-30 18:43:50 +02:00
2023-02-17 14:50:20 +01:00
def get_cookiejars_meta_by_iterator(iter_cookiejar_uuid):
    """Return the metadata dict of each cookiejar uuid in *iter_cookiejar_uuid*."""
    cookiejars_meta = []
    for cookiejar_uuid in iter_cookiejar_uuid:
        cookiejar = Cookiejar(cookiejar_uuid)
        # NOTE(review): includes the cookie count, presumably for list views - confirm
        cookiejars_meta.append(cookiejar.get_meta(nb_cookies=True))
    return cookiejars_meta
def get_cookiejars_by_user(user_id):
    """Return the jars of *user_id* followed by the global jars, as a list."""
    shared_jars = get_cookiejars_global()
    own_jars = get_cookiejars_user(user_id)
    visible_jars = list(own_jars)
    visible_jars.extend(shared_jars)
    return visible_jars
## API ##
def api_get_cookiejars_selector(user_id):
    """Return a sorted list of ``"<description> : <uuid>"`` labels for the
    jars visible to *user_id*."""
    labels = []
    for jar_uuid in get_cookiejars_by_user(user_id):
        jar = Cookiejar(jar_uuid)
        # a missing description is shown as an empty string
        jar_description = jar.get_description() or ''
        labels.append(f'{jar_description} : {jar.uuid}')
    labels.sort()
    return labels
def api_verify_cookiejar_acl(cookiejar_uuid, user_id):
    """Check that *user_id* may access the cookiejar.

    :return: ``None`` when access is allowed, otherwise an
             ``(error_dict, http_status)`` tuple (404 unknown jar, 403 forbidden)
    """
    jar = Cookiejar(cookiejar_uuid)
    if not jar.exists():
        return {'error': 'unknown cookiejar uuid', 'cookiejar_uuid': cookiejar_uuid}, 404

    # TODO: check if user is admin
    level_zero = jar.get_level() == 0
    if level_zero and jar.get_user() != user_id:
        return {'error': 'The access to this cookiejar is restricted'}, 403
    return None
2020-03-30 18:43:50 +02:00
2023-02-17 14:50:20 +01:00
def api_edit_cookiejar_description(user_id, cookiejar_uuid, description):
    """Replace the description of a cookiejar after the ACL check.

    :return: ``({'cookiejar_uuid': ...}, 200)`` on success, or the ACL error tuple
    """
    resp = api_verify_cookiejar_acl(cookiejar_uuid, user_id)
    if resp:
        return resp
    cookiejar = Cookiejar(cookiejar_uuid)
    cookiejar.set_description(description)
    return {'cookiejar_uuid': cookiejar_uuid}, 200
2020-03-30 18:43:50 +02:00
2023-02-17 14:50:20 +01:00
def api_delete_cookiejar(user_id, cookiejar_uuid):
    """Delete a cookiejar after the ACL check.

    :return: ``({'cookiejar_uuid': ...}, 200)`` on success, or the ACL error tuple
    """
    resp = api_verify_cookiejar_acl(cookiejar_uuid, user_id)
    if resp:
        return resp
    cookiejar = Cookiejar(cookiejar_uuid)
    cookiejar.delete()
    return {'cookiejar_uuid': cookiejar_uuid}, 200
2020-03-30 18:43:50 +02:00
2023-02-17 14:50:20 +01:00
def api_get_cookiejar(cookiejar_uuid, user_id):
    """Return ``(meta, 200)`` for a cookiejar, with its level and its cookies
    rendered as JSON strings, or the ACL error tuple."""
    acl_error = api_verify_cookiejar_acl(cookiejar_uuid, user_id)
    if acl_error:
        return acl_error
    jar_meta = Cookiejar(cookiejar_uuid).get_meta(level=True, cookies=True, r_json=True)
    return jar_meta, 200
2020-04-01 09:58:47 +02:00
2020-03-30 18:43:50 +02:00
# # # # # # # #
# #
# #
# # # # # # # #
# # # #
# Cookies Fields:
# - name
# - value
# - path (optional)
# - domain (optional)
# - secure (optional)
# - httpOnly (optional)
# - text (optional)
# # # #
2023-02-17 14:50:20 +01:00
# TODO MISP Import
2020-03-30 18:43:50 +02:00
2023-02-17 14:50:20 +01:00
class Cookie:
    """A single cookie stored in the ``cookie:meta:<uuid>`` hash.

    The hash holds the cookie fields (name, value and the optional ones)
    plus a ``cookiejar`` entry pointing at the owning jar.
    """

    def __init__(self, cookie_uuid):
        self.uuid = cookie_uuid

    def exists(self):
        return r_crawler.exists(f'cookie:meta:{self.uuid}')

    def get_cookiejar(self):
        return r_crawler.hget(f'cookie:meta:{self.uuid}', 'cookiejar')

    def set_cookiejar(self, cookiejar_uuid):
        r_crawler.hset(f'cookie:meta:{self.uuid}', 'cookiejar', cookiejar_uuid)

    def get_name(self):
        return r_crawler.hget(f'cookie:meta:{self.uuid}', 'name')

    def get_value(self):
        return r_crawler.hget(f'cookie:meta:{self.uuid}', 'value')

    def _get_field(self, field):
        return r_crawler.hget(f'cookie:meta:{self.uuid}', field)

    def set_field(self, field, value):
        return r_crawler.hset(f'cookie:meta:{self.uuid}', field, value)

    def remove_field(self, field):
        return r_crawler.hdel(f'cookie:meta:{self.uuid}', field)

    def get_fields(self):
        """Return the set of stored cookie field names, without the internal
        ``cookiejar`` back-reference."""
        fields = set(r_crawler.hkeys(f'cookie:meta:{self.uuid}'))
        fields.discard('cookiejar')
        return fields

    # def get_domain(self):
    #     return r_crawler.hget(f'cookie:meta:{self.uuid}', 'domain')
    #
    # def get_path(self):
    #     return r_crawler.hget(f'cookie:meta:{self.uuid}', 'path')
    #
    # def get_httpOnly(self):
    #     return r_crawler.hget(f'cookie:meta:{self.uuid}', 'httpOnly')
    #
    # def get_secure(self):
    #     return r_crawler.hget(f'cookie:meta:{self.uuid}', 'secure')

    # TODO expire ????
    def get_meta(self, r_json=False):
        """Return the cookie fields plus ``uuid``.

        With *r_json*, the fields are serialised into one JSON string under
        the ``data`` key instead of being returned as individual keys.
        """
        meta = {}
        # ['domain', 'path', 'httpOnly', 'secure'] + name + value
        for field in self.get_fields():
            value = self._get_field(field)
            if value:
                meta[field] = value
        if r_json:
            data = json.dumps(meta, indent=4, sort_keys=True)
            meta = {'data': data}
        meta['uuid'] = self.uuid
        return meta

    def edit(self, cookie_dict):
        """Replace the stored fields with the content of *cookie_dict*.

        Fields absent from *cookie_dict* are removed and falsy values are
        not written. The ``cookiejar`` entry is never touched, so an edit
        cannot move the cookie to another jar.
        """
        # remove old keys
        for field in self.get_fields():
            if field not in cookie_dict:
                self.remove_field(field)
        # add new keys
        for field, value in cookie_dict.items():
            if field == 'cookiejar' or not value:
                continue
            if field in ('secure', 'httpOnly'):
                value = str(value)
            self.set_field(field, value)

    def delete(self):
        """Delete the cookie hash and remove the cookie from its jar."""
        # Read the owner before the hash that stores it is deleted.
        cookiejar_uuid = self.get_cookiejar()
        r_crawler.delete(f'cookie:meta:{self.uuid}')
        r_crawler.srem(f'cookiejar:cookies:{cookiejar_uuid}', self.uuid)
## API ##
def api_get_cookie(user_id, cookie_uuid):
    """Return the meta dict of a cookie, or an ``(error_dict, status)`` tuple.

    NOTE: the success value is the bare meta dict, without a status code.
    """
    target = Cookie(cookie_uuid)
    if not target.exists():
        return {'error': 'unknown cookie uuid', 'cookie_uuid': cookie_uuid}, 404
    acl_error = api_verify_cookiejar_acl(target.get_cookiejar(), user_id)
    if acl_error:
        return acl_error
    return target.get_meta()
def api_edit_cookie(user_id, cookie_uuid, cookie_dict):
    """Replace the fields of a cookie with *cookie_dict*.

    :return: ``(meta, 200)`` with the updated cookie, or an
             ``(error_dict, status)`` tuple (404 unknown cookie, 403/404 ACL,
             400 missing name or value)
    """
    cookie = Cookie(cookie_uuid)
    if not cookie.exists():
        return {'error': 'unknown cookie uuid', 'cookie_uuid': cookie_uuid}, 404
    resp = api_verify_cookiejar_acl(cookie.get_cookiejar(), user_id)
    if resp:
        return resp
    if not cookie_dict.get('name') or not cookie_dict.get('value'):
        return {'error': 'cookie name or value not provided'}, 400
    cookie.edit(cookie_dict)
    return cookie.get_meta(), 200
def api_create_cookie(user_id, cookiejar_uuid, cookie_dict):
    """Add a cookie described by *cookie_dict* to a cookiejar.

    :return: ``(None, 200)`` on success, or an ``(error_dict, status)`` tuple
    """
    resp = api_verify_cookiejar_acl(cookiejar_uuid, user_id)
    if resp:
        return resp

    required_present = all(key in cookie_dict and cookie_dict[key] for key in ('name', 'value'))
    if not required_present:
        return {'error': 'cookie name or value not provided'}, 400

    jar = Cookiejar(cookiejar_uuid)
    jar.add_cookie(
        cookie_dict.get('name'),
        cookie_dict.get('value'),
        domain=cookie_dict.get('domain'),
        httponly=bool(cookie_dict.get('httponly')),
        path=cookie_dict.get('path'),
        secure=bool(cookie_dict.get('secure')),
        text=cookie_dict.get('text'),
    )
    # resp is the falsy ACL result at this point
    return resp, 200
2020-03-30 18:43:50 +02:00
2023-02-17 14:50:20 +01:00
def api_delete_cookie(user_id, cookie_uuid):
    """Delete a cookie from its jar.

    :return: ``({'cookiejar_uuid': ..., 'cookie_uuid': ...}, 200)`` on
             success, or an ``(error_dict, status)`` tuple
    """
    cookie = Cookie(cookie_uuid)
    if not cookie.exists():
        return {'error': 'unknown cookie uuid', 'cookie_uuid': cookie_uuid}, 404
    cookiejar_uuid = cookie.get_cookiejar()
    resp = api_verify_cookiejar_acl(cookiejar_uuid, user_id)
    if resp:
        return resp
    cookiejar = Cookiejar(cookiejar_uuid)
    if not cookiejar.is_cookie_in_jar(cookie_uuid):
        return {'error': 'Cookie isn\'t in the jar', 'cookiejar_uuid': cookiejar_uuid}, 404
    cookiejar.delete_cookie(cookie_uuid)
    return {'cookiejar_uuid': cookiejar_uuid, 'cookie_uuid': cookie_uuid}, 200
# def get_cookie_all_keys_name():
# return ['name', 'value', 'domain', 'path', 'httpOnly', 'secure']
2020-03-30 18:43:50 +02:00
## - - ##
## Cookies import ## # TODO: add browser type ?
def import_cookies_from_json(json_cookies, cookiejar_uuid):
    """Add every cookie of an exported JSON cookie list to a cookiejar.

    :param json_cookies: iterable of exported cookie dicts (``'Name raw'``,
                         ``'Content raw'``, ... keys)
    :param cookiejar_uuid: destination jar
    :return: ``None`` on success; ``(error_dict, 400)`` as soon as one entry
             is malformed (cookies imported before it are kept)
    """
    cookiejar = Cookiejar(cookiejar_uuid)
    for cookie in json_cookies:
        try:
            cookie_dict = unpack_imported_json_cookie(cookie)
        except (KeyError, TypeError):
            # missing mandatory key, or the entry is not a dict at all
            return {'error': 'Invalid cookie key, please submit a valid JSON', 'cookiejar_uuid': cookiejar_uuid}, 400
        cookiejar.add_cookie(
            cookie_dict.get('name'),
            cookie_dict.get('value'),
            domain=cookie_dict.get('domain'),
            httponly=cookie_dict.get('httponly'),
            path=cookie_dict.get('path'),
            secure=cookie_dict.get('secure'),
            text=cookie_dict.get('text'),
        )
    return None
2020-03-30 18:43:50 +02:00
# # TODO: add text field
def unpack_imported_json_cookie(json_cookie):
    """Translate one exported cookie entry into the internal cookie dict.

    ``'Name raw'`` and ``'Content raw'`` are mandatory (``KeyError`` when
    missing); path, httponly, secure and domain are filled only when the
    matching source key is present.
    """
    unpacked = {
        'name': json_cookie['Name raw'],
        'value': json_cookie['Content raw'],
    }
    if 'Path raw' in json_cookie:
        unpacked['path'] = json_cookie['Path raw']
    if 'HTTP only raw' in json_cookie:
        unpacked['httponly'] = json_cookie['HTTP only raw'] == 'true'
    if 'Send for' in json_cookie:
        unpacked['secure'] = json_cookie['Send for'] == 'Encrypted connections only'
    if 'Host raw' in json_cookie:
        # keep the host part only, without any port
        netloc = urlparse(json_cookie['Host raw']).netloc
        unpacked['domain'] = netloc.split(':', 1)[0]
    return unpacked
## - - ##
2023-02-17 14:50:20 +01:00
def api_import_cookies_from_json(user_id, cookiejar_uuid, json_cookies_str):
    """Import a JSON cookie export into a cookiejar after the ACL check.

    :param json_cookies_str: JSON text holding a list of exported cookies
    :return: ``None`` on success, otherwise an ``(error_dict, status)`` tuple
    """
    resp = api_verify_cookiejar_acl(cookiejar_uuid, user_id)
    if resp:
        return resp
    try:
        json_cookies = json.loads(json_cookies_str)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError subclass
        return {'error': 'Invalid JSON', 'cookiejar_uuid': cookiejar_uuid}, 400
    resp = import_cookies_from_json(json_cookies, cookiejar_uuid)
    if resp:
        # The importer already returns (error_dict, status); pass it through
        # instead of nesting it inside another tuple.
        if isinstance(resp, tuple):
            return resp
        return resp, 400
    return None
2020-03-30 18:43:50 +02:00
#### ####
2020-08-17 21:52:57 +02:00
# # # # # # # #
2023-02-21 12:22:49 +01:00
# #
2022-10-25 16:25:19 +02:00
# CRAWLER # ###################################################################################
2020-08-17 21:52:57 +02:00
# #
# # # # # # # #
2022-10-25 16:25:19 +02:00
def get_default_user_agent():
    """Return the built-in default User-Agent string."""
    default_user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0'
    return default_user_agent
2022-10-25 16:25:19 +02:00
def get_last_crawled_domains(domain_type):
    """Return the whole ``last_<domain_type>`` list."""
    list_key = f'last_{domain_type}'
    return r_crawler.lrange(list_key, 0, -1)
def update_last_crawled_domain(domain_type, domain, epoch):
    """Push ``<domain>:<epoch>`` on the ``last_<domain_type>`` list and keep
    only its 16 most recent entries."""
    list_key = f'last_{domain_type}'
    entry = f'{domain}:{epoch}'
    # update list, last crawled domains
    r_crawler.lpush(list_key, entry)
    r_crawler.ltrim(list_key, 0, 15)
2022-11-30 15:50:10 +01:00
def create_item_metadata(item_id, url, item_father):
    """Mark item *item_id* as crawled from *url*, with *item_father* as parent."""
    crawled_item = Item(item_id)
    crawled_item.set_crawled(url, item_father)
2022-10-25 16:25:19 +02:00
def get_gzipped_b64_item(item_id, content):
    """Return *content* gzip-compressed and base64-encoded, as a ``str``.

    :param item_id: item identifier, used only in the error message
    :param content: text to encode
    :return: the base64 string, or ``False`` when encoding fails
    """
    try:
        gzipencoded = gzip.compress(content.encode())
        return base64.standard_b64encode(gzipencoded).decode()
    except Exception:
        # Best effort: report the item and let the caller skip it.
        print(f'file error: {item_id}')
        return False
def get_crawlers_stats_by_day(date, domain_type):
    """Return the up/down counters of *domain_type* for one day.

    :param date: day as ``YYYYMMDD``
    :return: dict with ``date`` (``YYYY-MM-DD``), ``up`` and ``down``
    """
    return {
        'date': date[0:4] + '-' + date[4:6] + '-' + date[6:8],
        'up': r_crawler.scard(f'{domain_type}_up:{date}'),
        'down': r_crawler.scard(f'{domain_type}_down:{date}'),
    }
def get_crawlers_stats(domain_type=None):
    """Return today's crawler counters per domain type.

    :param domain_type: restrict the result to this type; every type
                        returned by ``get_crawler_all_types()`` when omitted
    :return: ``{type: {'queue', 'up', 'down', 'crawled'}}``
    """
    stats = {}
    date = datetime.now().strftime("%Y%m%d")
    if domain_type:
        domain_types = [domain_type]
    else:
        domain_types = get_crawler_all_types()
    for d_type in domain_types:
        queue = r_crawler.scard(f'crawler:queue:type:{d_type}')
        up = r_crawler.scard(f'{d_type}_up:{date}')
        down = r_crawler.scard(f'{d_type}_down:{date}')
        stats[d_type] = {'queue': queue, 'up': up, 'down': down, 'crawled': up + down}
    return stats
2023-02-21 12:22:49 +01:00
#### Blocklist ####
def get_blacklist():
    """Return the members of the ``blacklist:domain`` set."""
    blacklisted = r_crawler.smembers('blacklist:domain')
    return blacklisted
def is_blacklisted_domain(domain):
    """Return the membership result of *domain* in ``blacklist:domain``."""
    found = r_crawler.sismember('blacklist:domain', domain)
    return found
def blacklist_domain(domain):
    """Add *domain* to ``blacklist:domain`` and return the database reply."""
    reply = r_crawler.sadd('blacklist:domain', domain)
    return reply
2023-03-31 09:25:06 +02:00
def unblacklist_domain(domain):
    """Remove *domain* from ``blacklist:domain`` and return the database reply."""
    reply = r_crawler.srem('blacklist:domain', domain)
    return reply
2023-02-21 12:22:49 +01:00
def load_blacklist():
    """Load ``$AIL_BIN/crawlers/blacklist.txt`` into the domain blacklist.

    One domain per line; blank lines are ignored. Any error (missing
    ``AIL_BIN``, unreadable file, database failure) is printed, not raised.
    """
    try:
        with open(os.path.join(os.environ['AIL_BIN'], 'crawlers/blacklist.txt'), 'r') as f:
            lines = f.read().splitlines()
        for line in lines:
            line = line.strip()
            if line:
                blacklist_domain(line)
    except Exception as e:
        # Best effort: a missing or broken blacklist file must not stop the caller.
        print(e)
2023-03-31 09:25:06 +02:00
def api_blacklist_domain(data):
    """Blacklist ``data['domain']`` (lower-cased).

    :return: ``(database_reply, 200)``, or ``(error_dict, 400)`` when the
             domain is invalid or already blacklisted
    """
    requested = data.get('domain', '')
    domain = str(requested).lower()
    if is_valid_domain(domain):
        if is_blacklisted_domain(domain):
            return {'error': 'domain already blacklisted'}, 400
        return blacklist_domain(domain), 200
    return {'error': 'invalid domain'}, 400
def api_unblacklist_domain(data):
    """Remove ``data['domain']`` (lower-cased) from the blacklist.

    :return: ``(database_reply, 200)``, ``(error_dict, 400)`` for an invalid
             domain, or ``(error_dict, 404)`` when it is not blacklisted
    """
    requested = data.get('domain', '')
    domain = str(requested).lower()
    if is_valid_domain(domain):
        if is_blacklisted_domain(domain):
            return unblacklist_domain(domain), 200
        return {'error': 'domain not blacklisted'}, 404
    return {'error': 'invalid domain'}, 400
2023-03-14 17:36:42 +01:00
#### CRAWLER Scheduler ####
@unique
class ScheduleStatus(IntEnum):
    """The status of a crawler schedule"""
    # Members referenced by CrawlerSchedule.get_status().
    # NOTE(review): numeric values chosen to mirror CaptureStatus - confirm
    # against any stored or serialised values.
    UNKNOWN = -1
    QUEUED = 0
    SCHEDULED = 1
    ONGOING = 2
def get_schedulers_uuid():
    """Return the members of the ``scheduler:schedules`` set."""
    schedule_uuids = r_crawler.smembers('scheduler:schedules')
    return schedule_uuids
def get_schedulers_metas():
    """Return the status metadata of every registered schedule."""
    schedulers = []
    for schedule_uuid in get_schedulers_uuid():
        schedule = CrawlerSchedule(schedule_uuid)
        # NOTE(review): the status view (uuid, url, user, next_run, status) is
        # assumed to be what list consumers expect - confirm
        schedulers.append(schedule.get_meta_status())
    return schedulers
class CrawlerScheduler:
    """Moves schedules in and out of the ``scheduler:queue`` sorted set.

    ``update_queue`` computes the next run of every schedule that is neither
    queued nor running; ``process_queue`` turns due schedules into crawler tasks.
    """

    def __init__(self):
        # Minimal delay, in seconds, enforced between now and a custom-frequency run.
        self.min_frequency = 60  # TODO ADD IN CONFIG

    def _compute_next_run(self, frequency):
        """Return the timestamp of the next run for *frequency*.

        *frequency* is ``hourly``, ``daily``, ``weekly``, ``monthly`` or a
        custom ``months:weeks:days:hours:minutes`` string (empty parts
        count as 0). Raises ``ValueError`` on a malformed custom string.
        """
        now = datetime.now()
        if frequency == 'hourly':
            return (now + timedelta(hours=1)).timestamp()
        if frequency == 'daily':
            return (now + timedelta(days=1)).timestamp()
        if frequency == 'weekly':
            return (now + timedelta(weeks=1)).timestamp()
        if frequency == 'monthly':
            return (now + relativedelta(months=1)).timestamp()

        months, weeks, days, hours, minutes = frequency.split(':')
        delta = relativedelta(months=int(months or 0), weeks=int(weeks or 0),
                              days=int(days or 0), hours=int(hours or 0),
                              minutes=int(minutes or 0))
        time_next_run = (now + delta).timestamp()
        # Make sure the next capture is not scheduled for in a too short interval
        interval_next_capture = time_next_run - now.timestamp()
        if interval_next_capture < self.min_frequency:
            # self.logger.warning(f'The next capture is scheduled too soon: {interval_next_capture}s. Minimal interval: {self.min_frequency}s.')
            print(f'The next capture is scheduled too soon: {interval_next_capture}s. Minimal interval: {self.min_frequency}s.')
            time_next_run = (now + timedelta(seconds=self.min_frequency)).timestamp()
        return time_next_run

    def update_queue(self):
        """Queue the next run of every schedule that is idle."""
        for schedule_uuid in get_schedulers_uuid():
            schedule = CrawlerSchedule(schedule_uuid)
            # check if already in scheduler queue
            if schedule.is_scheduled():
                continue
            # a task created for this schedule still exists
            if schedule.is_tasked():
                continue

            # EXPIRE ????

            frequency = schedule.get_frequency()  # optional or later -> cron
            if not frequency:
                print(f'schedule without frequency: {schedule_uuid}')
                continue
            try:
                time_next_run = self._compute_next_run(frequency)
            except ValueError:
                print(f'invalid schedule frequency: {schedule_uuid} {frequency}')
                continue

            schedule.set_next_run(time_next_run)
            print('scheduled:', schedule_uuid)

    def process_queue(self):
        """Create a crawler task for every schedule whose run time has passed."""
        now = datetime.now().timestamp()
        for raw_schedule in r_crawler.zrangebyscore('scheduler:queue', '-inf', int(now), withscores=True):
            schedule_uuid, next_run = raw_schedule
            schedule = CrawlerSchedule(schedule_uuid)
            if not schedule.exists():
                # Stale entry: drop it and carry on, instead of aborting the
                # pass and leaving it to block the queue on every run.
                r_crawler.zrem('scheduler:queue', schedule_uuid)
                continue
            meta = schedule.get_meta()
            task_uuid = create_task(meta['url'], depth=meta['depth'], har=meta['har'], screenshot=meta['screenshot'],
                                    cookiejar=meta['cookiejar'], proxy=meta['proxy'],
                                    user_agent=meta['user_agent'], parent='scheduler', priority=40)
            if task_uuid:
                # Link the task so is_tasked() holds the schedule back until it is gone.
                schedule.set_task(task_uuid)
                r_crawler.zrem('scheduler:queue', schedule_uuid)

    # TODO Expire -> stuck in crawler queue or reached delta
class CrawlerSchedule:
    """A recurring crawl definition stored in the ``schedule:<uuid>`` hash.

    The schedule is listed in the ``scheduler:schedules`` set and, while
    waiting for its next run, in the ``scheduler:queue`` sorted set (score =
    next run timestamp).
    """

    def __init__(self, schedule_uuid):
        self.uuid = schedule_uuid

    def exists(self):
        return r_crawler.exists(f'schedule:{self.uuid}')

    def get_frequency(self):
        return r_crawler.hget(f'schedule:{self.uuid}', 'frequency')

    def get_user(self):
        return r_crawler.hget(f'schedule:{self.uuid}', 'user')

    def get_date(self):
        return r_crawler.hget(f'schedule:{self.uuid}', 'date')

    def get_captures(self):  # only scheduled capture ????? exclude manual/discovery
        """Not implemented yet; always returns None."""
        return None

    def get_status(self):
        """Return the ScheduleStatus derived from the queue and the linked task."""
        if self.is_scheduled():
            return ScheduleStatus.SCHEDULED
        if self.is_tasked():
            if self.is_ongoing():
                return ScheduleStatus.ONGOING
            return ScheduleStatus.QUEUED
        return ScheduleStatus.UNKNOWN

    def get_task_uuid(self):
        return r_crawler.hget(f'schedule:{self.uuid}', 'task')

    def is_tasked(self):
        """Return whether the linked task still exists; a dangling link is cleared."""
        task_uuid = self.get_task_uuid()
        if task_uuid:
            task = CrawlerTask(task_uuid)
            tasked = task.exists()
            if not tasked:
                r_crawler.hdel(f'schedule:{self.uuid}', 'task')
            return tasked
        return False

    def get_task(self):
        """Return the linked CrawlerTask, or None when no task is linked."""
        task_uuid = self.get_task_uuid()
        if task_uuid:
            return CrawlerTask(task_uuid)
        return None

    def set_task(self, task_uuid):
        return r_crawler.hset(f'schedule:{self.uuid}', 'task', task_uuid)

    def is_ongoing(self):
        task = self.get_task()
        if task:
            return task.is_ongoing()
        return False

    def get_next_run(self, r_str=False):
        """Return the next run timestamp, or a formatted local time when *r_str*."""
        next_run = r_crawler.zscore('scheduler:queue', self.uuid)
        if next_run and r_str:
            next_run = time.strftime('%Y-%m-%d - %H:%M:%S', time.localtime(int(next_run)))
        return next_run

    def set_next_run(self, time_next_run):
        r_crawler.zadd('scheduler:queue', mapping={self.uuid: time_next_run})

    def is_scheduled(self):
        # Test for presence, not truthiness: a score of 0 is still queued.
        return r_crawler.zscore('scheduler:queue', self.uuid) is not None

    def get_url(self):
        return r_crawler.hget(f'schedule:{self.uuid}', 'url')

    def get_depth(self):
        return r_crawler.hget(f'schedule:{self.uuid}', 'depth')

    def get_har(self):
        return r_crawler.hget(f'schedule:{self.uuid}', 'har') == 'True'

    def get_screenshot(self):
        return r_crawler.hget(f'schedule:{self.uuid}', 'screenshot') == 'True'

    def get_header(self):
        # The value must be returned, otherwise get_meta() always reports None.
        return r_crawler.hget(f'schedule:{self.uuid}', 'header')

    def get_cookiejar(self):
        return r_crawler.hget(f'schedule:{self.uuid}', 'cookiejar')

    def get_proxy(self):
        return r_crawler.hget(f'schedule:{self.uuid}', 'proxy')

    def get_user_agent(self):
        return r_crawler.hget(f'schedule:{self.uuid}', 'user_agent')

    def _set_field(self, field, value):
        return r_crawler.hset(f'schedule:{self.uuid}', field, value)

    def get_meta(self, ui=False):
        """Return the full schedule metadata.

        With *ui*, ``status`` is the status name and ``next_run`` a formatted
        string; otherwise they are the enum member and the raw timestamp.
        """
        meta = {
            'uuid': self.uuid,
            'date': self.get_date(),
            'frequency': self.get_frequency(),
            'user': self.get_user(),
            'url': self.get_url(),
            'depth': self.get_depth(),
            'har': self.get_har(),
            'screenshot': self.get_screenshot(),
            'user_agent': self.get_user_agent(),
            'cookiejar': self.get_cookiejar(),
            'header': self.get_header(),
            'proxy': self.get_proxy(),
        }
        status = self.get_status()
        if ui:
            status = status.name
            r_str = True
        else:
            r_str = False
        meta['status'] = status
        meta['next_run'] = self.get_next_run(r_str=r_str)
        return meta

    def get_meta_status(self):  # TODO: Description ? Frequency ???
        """Return the short status view: uuid, url, user, next_run, status name."""
        meta = {'uuid': self.uuid,
                'url': self.get_url(),
                'user': self.get_user(),
                'next_run': self.get_next_run(r_str=True)}
        status = self.get_status()
        if isinstance(status, ScheduleStatus):
            status = status.name
        meta['status'] = status
        return meta

    def create(self, frequency, user, url,
               depth=1, har=True, screenshot=True, header=None, cookiejar=None, proxy=None, user_agent=None):
        """Store the schedule definition and register it.

        :raises Exception: if the schedule already exists
        """
        if self.exists():
            raise Exception('Error: Monitor already exists')

        url_decoded = unpack_url(url)
        url = url_decoded['url']

        self._set_field('date', datetime.now().strftime("%Y-%m-%d"))
        self._set_field('frequency', frequency)
        self._set_field('user', user)
        self._set_field('url', url)
        self._set_field('depth', int(depth))
        self._set_field('har', str(har))
        self._set_field('screenshot', str(screenshot))

        if cookiejar:
            self._set_field('cookiejar', cookiejar)
        if header:
            self._set_field('header', header)
        if proxy:
            if proxy == 'web':
                proxy = None
            elif proxy == 'force_tor' or proxy == 'tor' or proxy == 'onion':
                proxy = 'force_tor'
            # 'web' means no proxy: leave the field unset rather than
            # handing None to the database client.
            if proxy:
                self._set_field('proxy', proxy)
        if user_agent:
            self._set_field('user_agent', user_agent)

        r_crawler.sadd('scheduler:schedules', self.uuid)

    def delete(self):
        """Remove the schedule from the queue, delete its task and its metadata."""
        # remove from schedule queue
        r_crawler.zrem('scheduler:queue', self.uuid)

        # delete task
        task = self.get_task()
        if task:
            task.delete()

        # delete meta
        r_crawler.delete(f'schedule:{self.uuid}')
        r_crawler.srem('scheduler:schedules', self.uuid)
def create_schedule(frequency, user, url, depth=1, har=True, screenshot=True, header=None, cookiejar=None, proxy=None, user_agent=None):
    """Create a schedule under a new uuid and return that uuid."""
    new_uuid = gen_uuid()
    options = {
        'depth': depth,
        'har': har,
        'screenshot': screenshot,
        'header': header,
        'cookiejar': cookiejar,
        'proxy': proxy,
        'user_agent': user_agent,
    }
    CrawlerSchedule(new_uuid).create(frequency, user, url, **options)
    return new_uuid
# TODO sanitize UUID
def api_delete_schedule(data):
    """Delete the schedule named by ``data['uuid']``.

    :return: ``(None, 200)`` on success, ``(error_dict, 404)`` for an unknown uuid
    """
    schedule_uuid = data.get('uuid')
    schedule = CrawlerSchedule(schedule_uuid)
    if not schedule.exists():
        # Report the uuid string: the schedule object is not JSON-serialisable.
        return {'error': 'unknown schedule uuid', 'uuid': schedule_uuid}, 404
    return schedule.delete(), 200
2022-10-25 16:25:19 +02:00
2023-02-21 12:22:49 +01:00
2022-10-25 16:25:19 +02:00
def get_nb_crawler_captures():
    """Return the size of the cached ``crawler:captures`` sorted set."""
    nb_captures = r_cache.zcard('crawler:captures')
    return nb_captures
def get_crawler_captures():
    """Return every member of the ``crawler:captures`` sorted set in the
    crawler database."""
    capture_uuids = r_crawler.zrange('crawler:captures', 0, -1)
    return capture_uuids
def reload_crawler_captures():
    """Put every known capture back in the cache queue.

    Each capture is updated with an empty status, which resets it to
    UNKNOWN with score 0 (the "Error or Reload" path of CrawlerCapture.update).
    """
    for capture_uuid in get_crawler_captures():
        capture = CrawlerCapture(capture_uuid)
        capture.update(None)
2023-03-14 17:36:42 +01:00
def _clear_captures():
    """Delete every capture together with its task (maintenance helper)."""
    for capture_uuid in get_crawler_captures():
        capture = CrawlerCapture(capture_uuid)
        task = capture.get_task()
        if task:
            # NOTE(review): relies on CrawlerTask.delete(), defined outside this view
            task.delete()
        capture.delete()
        print(capture_uuid, 'deleted')
2022-10-25 16:25:19 +02:00
2023-02-21 12:22:49 +01:00
@unique
class CaptureStatus(IntEnum):
    """The status of the capture"""
    # UNKNOWN and ONGOING are referenced by CrawlerCapture.
    # NOTE(review): values other than DONE follow the Lacus capture status
    # numbering - confirm against the pylacus version in use.
    UNKNOWN = -1
    QUEUED = 0
    DONE = 1
    ONGOING = 2
class CrawlerCapture:
    """A capture in progress, linked to the crawler task that requested it.

    Crawler database: ``crawler:captures`` (sorted set, score = launch time)
    and ``crawler:captures:tasks`` (hash capture uuid -> task uuid).
    Cache: ``crawler:captures`` (sorted set, score = last check time) and
    ``crawler:capture:<uuid>`` (hash: launch_time, status).
    """

    def __init__(self, task_uuid):
        # NOTE: despite the parameter name, this is the capture uuid
        # (create_capture() passes capture_uuid here).
        self.uuid = task_uuid

    def exists(self):
        return r_crawler.hexists('crawler:captures:tasks', self.uuid)

    def get_task_uuid(self):
        return r_crawler.hget('crawler:captures:tasks', self.uuid)

    def get_task(self):
        """Return the linked CrawlerTask, or None when the capture has no task."""
        task_uuid = self.get_task_uuid()
        if task_uuid:
            return CrawlerTask(task_uuid)
        return None

    def get_start_time(self):
        """Return the start time of the linked task, or None without a task."""
        task = self.get_task()
        if task is None:
            return None
        return task.get_start_time()

    def get_status(self):
        """Return the raw cached status value (may be None)."""
        return r_cache.hget(f'crawler:capture:{self.uuid}', 'status')

    def is_ongoing(self):
        # The cached status is read back as text; convert it before comparing
        # with the enum, otherwise the comparison is never true.
        try:
            return int(self.get_status()) == CaptureStatus.ONGOING
        except (TypeError, ValueError):
            return False

    def create(self, task_uuid):
        """Register the capture for *task_uuid* in the database and the cache.

        :raises Exception: if the capture already exists
        """
        if self.exists():
            raise Exception(f'Error: Capture {self.uuid} already exists')
        launch_time = int(time.time())
        r_crawler.hset(f'crawler:task:{task_uuid}', 'capture', self.uuid)
        r_crawler.hset('crawler:captures:tasks', self.uuid, task_uuid)
        r_crawler.zadd('crawler:captures', {self.uuid: launch_time})
        r_cache.hset(f'crawler:capture:{self.uuid}', 'launch_time', launch_time)
        r_cache.zadd('crawler:captures', {self.uuid: launch_time})

    def update(self, status):
        """Record *status* and re-queue the capture in the cache.

        A falsy *status* (error or reload) stores UNKNOWN with score 0;
        otherwise the status is stored with the current time as score.
        """
        # Error or Reload
        if not status:
            # Stored as a plain int so get_status() reads back a number.
            r_cache.hset(f'crawler:capture:{self.uuid}', 'status', int(CaptureStatus.UNKNOWN))
            r_cache.zadd('crawler:captures', {self.uuid: 0})
        else:
            last_check = int(time.time())
            if isinstance(status, int):
                status = int(status)
            r_cache.hset(f'crawler:capture:{self.uuid}', 'status', status)
            r_cache.zadd('crawler:captures', {self.uuid: last_check})

    # Crawler
    def remove(self):
        """Drop the capture from the crawler database and its cached state."""
        r_crawler.zrem('crawler:captures', self.uuid)
        r_cache.delete(f'crawler:capture:{self.uuid}')
        r_crawler.hdel('crawler:captures:tasks', self.uuid)

    # Manual
    def delete(self):
        """Remove the capture from the cache queue, then from everywhere else."""
        # remove Capture from crawler queue
        r_cache.zrem('crawler:captures', self.uuid)
        self.remove()
2022-10-25 16:25:19 +02:00
2023-02-21 12:22:49 +01:00
def create_capture(capture_uuid, task_uuid):
    """Register the capture `capture_uuid` for the task `task_uuid`.

    :raises Exception: if the capture already exists (raised by CrawlerCapture.create)
    """
    capture = CrawlerCapture(capture_uuid)
    # task_uuid was previously unused and nothing was registered
    capture.create(task_uuid)
def get_crawler_capture():
    """Pop the lowest-scored capture from the cache queue.

    :return: a CrawlerCapture, or None when the queue is empty
    """
    popped = r_cache.zpopmin('crawler:captures')
    if popped:
        # zpopmin yields (member, score) pairs; only the member is needed
        capture_uuid = popped[0][0]
        return CrawlerCapture(capture_uuid)
    return None
# TODO add capture times
def get_captures_status():
    """Return a list of metadata dicts, one per capture stored in the crawler DB.

    Each dict holds the task uuid, the domain id and type, the task start time
    and the capture status name (or the raw empty value when no status is set).
    """
    status = []
    for capture_uuid in get_crawler_captures():
        capture = CrawlerCapture(capture_uuid)
        task = capture.get_task()
        domain = task.get_domain()
        dom = Domain(domain)
        # Read the status once and translate the stored number to its enum name
        capture_status = capture.get_status()
        if capture_status:
            capture_status = CaptureStatus(int(capture_status)).name
        meta = {
            'uuid': task.uuid,
            'domain': dom.get_id(),
            'type': dom.get_domain_type(),
            'start_time': capture.get_start_time(),
            'status': capture_status,
        }
        status.append(meta)
    return status
2023-02-21 12:22:49 +01:00
2022-10-25 16:25:19 +02:00
#### CRAWLER TASK ####
2023-02-21 12:22:49 +01:00
class CrawlerTask:
    """A crawler task, stored as the hash 'crawler:task:<uuid>' in the crawler DB."""

    def __init__(self, task_uuid):
        self.uuid = task_uuid

    def exists(self):
        """Return a truthy value if the task hash exists in the crawler DB."""
        return r_crawler.exists(f'crawler:task:{self.uuid}')

    def get_url(self):
        return r_crawler.hget(f'crawler:task:{self.uuid}', 'url')

    def get_domain(self):
        return r_crawler.hget(f'crawler:task:{self.uuid}', 'domain')

    def get_depth(self):
        """Return the crawl depth as an int (1 when unset)."""
        depth = r_crawler.hget(f'crawler:task:{self.uuid}', 'depth')
        if not depth:
            depth = 1
        return int(depth)

    def get_har(self):
        """Return True if the 'har' field is stored as '1'."""
        return r_crawler.hget(f'crawler:task:{self.uuid}', 'har') == '1'

    def get_screenshot(self):
        """Return True if the 'screenshot' field is stored as '1'."""
        return r_crawler.hget(f'crawler:task:{self.uuid}', 'screenshot') == '1'

    def get_queue(self):
        return r_crawler.hget(f'crawler:task:{self.uuid}', 'queue')

    def get_user_agent(self):
        """Return the task user agent, falling back to the default user agent."""
        user_agent = r_crawler.hget(f'crawler:task:{self.uuid}', 'user_agent')
        if not user_agent:
            user_agent = get_default_user_agent()
        return user_agent

    def get_cookiejar(self):
        return r_crawler.hget(f'crawler:task:{self.uuid}', 'cookiejar')

    def get_cookies(self):
        """Return the cookies of the task cookiejar, or an empty list without cookiejar."""
        cookiejar = self.get_cookiejar()
        if cookiejar:
            cookiejar = Cookiejar(cookiejar)
            return cookiejar.get_cookies()
        return []

    def get_header(self):
        return r_crawler.hget(f'crawler:task:{self.uuid}', 'header')

    def get_proxy(self):
        return r_crawler.hget(f'crawler:task:{self.uuid}', 'proxy')

    def get_parent(self):
        return r_crawler.hget(f'crawler:task:{self.uuid}', 'parent')

    def get_hash(self):
        return r_crawler.hget(f'crawler:task:{self.uuid}', 'hash')

    def get_start_time(self):
        return r_crawler.hget(f'crawler:task:{self.uuid}', 'start_time')

    def get_status(self):
        return r_crawler.hget(f'crawler:task:{self.uuid}', 'status')

    def get_capture(self):
        """Return the UUID of the capture attached to this task, if any."""
        return r_crawler.hget(f'crawler:task:{self.uuid}', 'capture')

    def is_ongoing(self):
        """Return True if the task has a capture and that capture is ongoing."""
        capture_uuid = self.get_capture()
        if capture_uuid:
            return CrawlerCapture(capture_uuid).is_ongoing()
        return False

    def _set_field(self, field, value):
        return r_crawler.hset(f'crawler:task:{self.uuid}', field, value)

    def get_meta(self):
        """Return the task fields as a dict."""
        meta = {
            'uuid': self.uuid,
            'url': self.get_url(),
            'domain': self.get_domain(),
            'depth': self.get_depth(),
            'har': self.get_har(),
            'screenshot': self.get_screenshot(),
            'type': self.get_queue(),
            'user_agent': self.get_user_agent(),
            'cookiejar': self.get_cookiejar(),
            'header': self.get_header(),
            'proxy': self.get_proxy(),
            'parent': self.get_parent(),
        }
        return meta

    # PRIORITY:  discovery = 0/10, feeder = 10, manual = 50, auto = 40, test = 100
    def create(self, url, depth=1, har=True, screenshot=True, header=None, cookiejar=None, proxy=None,
               user_agent=None, parent='manual', priority=0):
        """Store the task and add it to the crawler queue.

        :return: the task UUID; the UUID of the already queued task when an
                 identical task is in the queue (self.uuid is then replaced);
                 None when a discovery task (priority 0) is refused
        :raises Exception: if a task with this UUID already exists
        """
        if self.exists():
            raise Exception('Error: Task already exists')

        url_decoded = unpack_url(url)
        url = url_decoded['url']
        domain = url_decoded['domain']

        dom = Domain(domain)

        # Discovery crawler
        if priority == 0:
            if is_blacklisted_domain(dom.get_id()):
                return None
            if not dom.exists():
                priority = 10
            # Domain Crawled today or UP this month
            if dom.is_down_today() or dom.is_up_this_month():
                return None

        har = int(har)
        screenshot = int(screenshot)

        if proxy == 'web':
            proxy = None
        elif proxy == 'force_tor' or proxy == 'tor' or proxy == 'onion':
            proxy = 'force_tor'

        # Check if already in queue
        hash_query = get_task_hash(url, domain, depth, har, screenshot, priority, proxy, cookiejar, user_agent, header)
        if r_crawler.hexists('crawler:queue:hash', hash_query):
            self.uuid = r_crawler.hget('crawler:queue:hash', hash_query)
            return self.uuid

        self._set_field('domain', domain)
        self._set_field('url', url)
        self._set_field('depth', int(depth))
        self._set_field('har', har)
        self._set_field('screenshot', screenshot)
        self._set_field('parent', parent)

        # Optional fields are only stored when set
        if cookiejar:
            self._set_field('cookiejar', cookiejar)
        if header:
            self._set_field('header', header)
        if proxy:
            self._set_field('proxy', proxy)
        if user_agent:
            self._set_field('user_agent', user_agent)

        r_crawler.hset('crawler:queue:hash', hash_query, self.uuid)
        self._set_field('hash', hash_query)
        r_crawler.zadd('crawler:queue', {self.uuid: priority})

        # UI
        domain_type = dom.get_domain_type()
        r_crawler.sadd(f'crawler:queue:type:{domain_type}', self.uuid)
        self._set_field('queue', domain_type)
        return self.uuid

    def add_to_db_crawler_queue(self, priority):
        """Add the task to the 'crawler:queue' sorted set with the given priority."""
        r_crawler.zadd('crawler:queue', {self.uuid: priority})

    def start(self):
        """Record the current local time as the task start time."""
        self._set_field('start_time', datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))

    # Crawler
    def remove(self):  # zrem cache + DB
        """Remove the task, its capture and its queue indexes from the crawler DB."""
        capture_uuid = self.get_capture()
        if capture_uuid:
            capture = CrawlerCapture(capture_uuid)
            # The capture object was previously built but never removed
            capture.remove()
        queue_type = self.get_queue()
        if queue_type:
            r_crawler.srem(f'crawler:queue:type:{queue_type}', self.uuid)
        task_hash = self.get_hash()
        if task_hash:
            r_crawler.hdel('crawler:queue:hash', task_hash)
        # meta
        # NOTE(review): the task hash is read above, so it must be deleted last
        r_crawler.delete(f'crawler:task:{self.uuid}')

    # Manual
    def delete(self):
        """Remove the task from the 'crawler:queue' sorted set."""
        # queue
        r_crawler.zrem('crawler:queue', self.uuid)
2023-02-21 12:22:49 +01:00
# TODO move to class ???
2022-10-25 16:25:19 +02:00
def get_task_hash(url, domain, depth, har, screenshot, priority, proxy, cookiejar, user_agent, header):
    """Return the SHA-512 hex digest identifying a task configuration.

    The url is only part of the hash when priority is not 0, so two priority-0
    tasks with the same settings on the same domain share one hash.
    """
    # NOTE(review): the digest is computed on pickle output, so it presumably
    # depends on the default pickle protocol of the running interpreter - confirm
    # before comparing hashes stored by another Python version.
    fingerprint = dict(domain=domain, depth=depth, har=har, screenshot=screenshot,
                       priority=priority, proxy=proxy, cookiejar=cookiejar, user_agent=user_agent,
                       header=header)
    if priority != 0:
        fingerprint['url'] = url
    serialized = pickle.dumps(fingerprint)
    return hashlib.sha512(serialized).hexdigest()
2023-02-21 12:22:49 +01:00
def add_task_to_lacus_queue():
    """Pop the highest-priority task from 'crawler:queue' and mark it as started.

    :return: (task_uuid, priority), or None when the queue is empty
    """
    popped = r_crawler.zpopmax('crawler:queue')
    if not popped or not popped[0]:
        return None
    task_uuid, priority = popped[0]
    task = CrawlerTask(task_uuid)
    # Record the start time, which is otherwise never set for the task
    task.start()
    return task.uuid, priority
# PRIORITY: discovery = 0/10, feeder = 10, manual = 50, auto = 40, test = 100
def create_task(url, depth=1, har=True, screenshot=True, header=None, cookiejar=None, proxy=None,
                user_agent=None, parent='manual', priority=0, task_uuid=None):
    """Create a crawler task and return its UUID.

    :param task_uuid: optional UUID to use; a new one is generated when it is
                      missing or already used by an existing task
    :return: the value returned by CrawlerTask.create()
    """
    if task_uuid:
        # Only replace a caller-supplied UUID when it collides with an existing task
        if CrawlerTask(task_uuid).exists():
            task_uuid = gen_uuid()
    else:
        task_uuid = gen_uuid()
    task = CrawlerTask(task_uuid)
    task_uuid = task.create(url, depth=depth, har=har, screenshot=screenshot, header=header, cookiejar=cookiejar,
                            proxy=proxy, user_agent=user_agent, parent=parent, priority=priority)
    return task_uuid
2023-03-14 17:36:42 +01:00
## -- CRAWLER TASK -- ##
2022-10-25 16:25:19 +02:00
# # TODO: ADD user agent
# # TODO: sanitize URL
def api_add_crawler_task(data, user_id=None):
    """Validate an API request and create a crawler task or a schedule.

    :param data: request dict; reads 'url', 'screenshot', 'har', 'depth',
                 'cookiejar', 'frequency' and 'proxy'
    :param user_id: id of the requesting user, checked against restricted cookiejars
    :return: (result, http_status_code); the result is an error dict on failure
    """
    url = data.get('url', None)
    if not url or url == '\n':
        return {'status': 'error', 'reason': 'No url supplied'}, 400

    # Any truthy value enables the option
    screenshot = bool(data.get('screenshot', False))
    har = bool(data.get('har', False))

    depth_limit = data.get('depth', 1)
    if depth_limit:
        try:
            depth_limit = int(depth_limit)
            if depth_limit < 0:
                depth_limit = 0
        except (TypeError, ValueError):
            return {'error': 'invalid depth limit'}, 400
    else:
        depth_limit = 0

    cookiejar_uuid = data.get('cookiejar', None)
    if cookiejar_uuid:
        cookiejar = Cookiejar(cookiejar_uuid)
        if not cookiejar.exists():
            return {'error': 'unknown cookiejar uuid', 'cookiejar_uuid': cookiejar_uuid}, 404
        level = cookiejar.get_level()
        if level == 0:  # # TODO: check if user is admin
            if cookiejar.get_user() != user_id:
                return {'error': 'The access to this cookiejar is restricted'}, 403
        cookiejar_uuid = cookiejar.uuid

    frequency = data.get('frequency', None)
    if frequency:
        # Either a named frequency or a dict of non-negative integer deltas
        if frequency not in ['monthly', 'weekly', 'daily', 'hourly']:
            if not isinstance(frequency, dict):
                return {'error': 'Invalid frequency'}, 400
            try:
                months = int(frequency.get('months', 0))
                weeks = int(frequency.get('weeks', 0))
                days = int(frequency.get('days', 0))
                hours = int(frequency.get('hours', 0))
                minutes = int(frequency.get('minutes', 0))
            except (TypeError, ValueError):
                return {'error': 'Invalid frequency'}, 400
            if min(months, weeks, days, hours, minutes) < 0:
                return {'error': 'Invalid frequency'}, 400
            if max(months, weeks, days, hours, minutes) <= 0:
                return {'error': 'Invalid frequency'}, 400
            frequency = f'{months}:{weeks}:{days}:{hours}:{minutes}'

    proxy = data.get('proxy', None)
    if proxy == 'onion' or proxy == 'tor' or proxy == 'force_tor':
        proxy = 'force_tor'
    elif proxy:
        verify = api_verify_proxy(proxy)
        if verify[1] != 200:
            return verify

    if frequency:
        # TODO verify user
        return create_schedule(frequency, user_id, url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
                               cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None), 200
    else:
        # TODO HEADERS
        # TODO USER AGENT
        return create_task(url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
                           cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None,
                           parent='manual', priority=90), 200
2022-10-25 16:25:19 +02:00
#### ####
2020-08-17 21:52:57 +02:00
2022-10-25 16:25:19 +02:00
# TODO: # FIXME: config db, dynamic load
2021-05-14 14:42:16 +02:00
def is_crawler_activated():
    """Return True when the 'activate_crawler' configuration value is the string 'True'."""
    activated = (activate_crawler == 'True')
    return activated
def get_crawler_all_types():
    """Return the list of crawler types."""
    crawler_types = ['onion', 'web']
    return crawler_types
2021-05-14 14:42:16 +02:00
2020-08-17 21:52:57 +02:00
2020-09-14 17:03:36 +02:00
#### ####
2020-03-30 18:43:50 +02:00
def is_redirection(domain, last_url):
    """Return True if the last two labels of last_url's netloc differ from `domain`.

    :param domain: expected domain, compared as '<label>.<label>'
    :param last_url: last url reached
    """
    netloc = urlparse(last_url).netloc
    # Keep at most the last two labels; a single-label host (e.g. 'localhost')
    # is compared as is instead of raising an IndexError.
    last_domain = '.'.join(netloc.split('.')[-2:])
    return domain != last_domain
def create_item_id(item_dir, domain):
    """Build a new item id: item_dir joined with the domain followed by a random UUID.

    :param item_dir: directory part of the item id
    :param domain: domain name; '/' is replaced by '_' and only the last 215
                   characters are kept
    """
    # remove /
    domain = domain.replace('/', '_')
    # Keep the domain part of the name at 215 characters at most
    if len(domain) > 215:
        domain = domain[-215:]
    return os.path.join(item_dir, domain + str(uuid.uuid4()))
def save_har(har_dir, item_id, har_content):
    """Write har_content as JSON to '<har_dir>/<last part of item_id>.json'.

    :param har_dir: destination directory, created when missing
    :param item_id: item id; only the part after the last '/' is used
    :param har_content: JSON-serializable HAR content
    """
    os.makedirs(har_dir, exist_ok=True)
    item_id = item_id.split('/')[-1]
    filename = os.path.join(har_dir, item_id + '.json')
    with open(filename, 'w') as f:
        f.write(json.dumps(har_content))
2020-05-22 15:41:05 +02:00
2020-08-17 21:52:57 +02:00
# # # # # # # # # # # #
# #
2023-02-21 12:22:49 +01:00
2020-08-17 21:52:57 +02:00
# #
# # # # # # # # # # # #
2020-05-22 15:41:05 +02:00
2023-03-14 17:36:42 +01:00
#### PROXY ####
2023-02-21 12:22:49 +01:00
def api_verify_proxy(proxy_url):
    """Check that a proxy url has a scheme, a hostname, a port and coherent credentials.

    :param proxy_url: proxy url, e.g. 'socks5://user:password@host:9050'
    :return: (proxy_url, 200) when valid, otherwise (error dict, 400)
    """
    invalid_proxy = {'error': 'Invalid proxy: Check that you entered a scheme, a hostname and a port.'}, 400
    try:
        parsed_proxy = urlparse(proxy_url)
        # Reading the port raises a ValueError for a non-numeric or out-of-range port
        port = parsed_proxy.port
    except ValueError:
        return invalid_proxy
    if not (parsed_proxy.scheme and parsed_proxy.hostname and port):
        return invalid_proxy
    if parsed_proxy.scheme not in ['http', 'https', 'socks5']:
        return {'error': 'Proxy scheme not supported: must be http(s) or socks5.'}, 400
    # Credentials are optional, but a username and a password must come together.
    # The previous comparison mixed None and booleans and could never fail.
    if bool(parsed_proxy.username) != bool(parsed_proxy.password):
        return {'error': 'You need to enter a username AND a password for your proxy.'}, 400
    return proxy_url, 200
2020-08-18 19:10:38 +02:00
2023-02-21 12:22:49 +01:00
def get_proxies():
    """Return the members of the 'crawler:proxies' set stored in the crawler DB."""
    proxies = r_crawler.smembers('crawler:proxies')
    return proxies
2020-08-18 19:10:38 +02:00
2023-02-21 12:22:49 +01:00
class CrawlerProxy:
    """A crawler proxy, stored as the hash 'crawler:proxy:<uuid>' in the crawler DB."""

    def __init__(self, proxy_uuid):
        self.uuid = proxy_uuid

    def get_description(self):
        """Return the 'description' field of the proxy."""
        # Fixed typos: 'hgrt' -> 'hget' and 'self.uuif' -> 'self.uuid'
        return r_crawler.hget(f'crawler:proxy:{self.uuid}', 'description')

    # Host
    # Port
    # Type -> need test
    def get_url(self):
        """Return the 'url' field of the proxy."""
        return r_crawler.hget(f'crawler:proxy:{self.uuid}', 'url')
2020-05-22 15:41:05 +02:00
2023-03-14 17:36:42 +01:00
2022-10-25 16:25:19 +02:00
def get_lacus_url():
    """Return the 'url' field of the 'crawler:lacus' hash stored in the DB."""
    lacus_url = r_db.hget('crawler:lacus', 'url')
    return lacus_url
2023-02-21 12:22:49 +01:00
def get_lacus_api_key():
    """Return the 'key' field of the 'crawler:lacus' hash stored in the DB."""
    api_key = r_db.hget('crawler:lacus', 'key')
    return api_key
# TODO Rewrite with new API key
2023-02-21 12:22:49 +01:00
def get_hidden_lacus_api_key():
    """Return the Lacus API key with everything but its first and last 4 characters masked.

    Returns None when no key is stored or when the key is not 41 characters long.
    """
    key = get_lacus_api_key()
    if not key or len(key) != 41:
        return None
    prefix, suffix = key[:4], key[-4:]
    return f'{prefix}*********************************{suffix}'
# TODO Rewrite with new API key
def is_valid_api_key(api_key, search=re.compile(r'[^a-zA-Z0-9_-]').search):
    """Return True if api_key is 41 characters long and only contains [a-zA-Z0-9_-].

    :param api_key: key to check
    :param search: search function of the compiled regex matching a forbidden character
    """
    has_expected_length = len(api_key) == 41
    if has_expected_length and search(api_key) is None:
        return True
    return False
def save_lacus_url_api(url, api_key):
    """Store the Lacus url in the 'crawler:lacus' hash; the API key is currently not saved."""
    lacus_key = 'crawler:lacus'
    r_db.hset(lacus_key, 'url', url)
    # r_db.hset('crawler:lacus', 'key', api_key)
2022-10-25 16:25:19 +02:00
def is_lacus_connected(delta_check=30):
    """Return True if the cached Lacus connection status is 'True'.

    The connection is checked again with ping_lacus() when it was never checked
    or when the last check is older than delta_check.

    :param delta_check: maximum age of the cached status, in seconds
    """
    last_check = r_cache.hget('crawler:lacus', 'last_check')
    # ping_lacus() refreshes both 'connected' and 'last_check' in the cache
    if not last_check or int(time.time()) - int(last_check) > delta_check:
        ping_lacus()
    is_connected = r_cache.hget('crawler:lacus', 'connected')
    return is_connected == 'True'
2020-05-22 15:41:05 +02:00
2022-10-25 16:25:19 +02:00
def get_lacus_connection_metadata(force_ping=False):
    """Return the Lacus connection status, with the cached error details when disconnected.

    :param force_ping: ping Lacus now instead of relying on the cached status
    :return: dict with 'status', plus 'status_code' and 'error' when status is falsy
    """
    dict_manager = {}
    if force_ping:
        dict_manager['status'] = ping_lacus()
    else:
        dict_manager['status'] = is_lacus_connected()
    if not dict_manager['status']:
        dict_manager['status_code'] = r_cache.hget('crawler:lacus', 'status_code')
        dict_manager['error'] = r_cache.hget('crawler:lacus', 'error')
    return dict_manager
def get_lacus():
    """Return a PyLacus client for the configured url, or None when no url is set."""
    url = get_lacus_url()
    if url:
        # Reuse the url that was just checked instead of reading it from the DB again
        return PyLacus(url)
    return None
def ping_lacus():
    """Check that Lacus is reachable and save the result in the cache.

    :return: the value of lacus.is_up, or False when no Lacus url is configured
    """
    req_error = None
    lacus = get_lacus()
    if not lacus:
        ping = False
        req_error = {'error': 'Lacus URL undefined', 'status_code': 400}
    else:
        ping = lacus.is_up
    update_lacus_connection_status(ping, req_error=req_error)
    return ping
def update_lacus_connection_status(is_connected, req_error=None):
    """Save the Lacus connection status and the time of the check in the cache.

    :param is_connected: connection status, stored as its string representation
    :param req_error: optional dict with the keys 'status_code' and 'error'
    """
    r_cache.hset('crawler:lacus', 'connected', str(is_connected))
    r_cache.hset('crawler:lacus', 'last_check', int(time.time()))
    if not req_error:
        r_cache.hdel('crawler:lacus', 'error')
    else:
        r_cache.hset('crawler:lacus', 'status_code', req_error['status_code'])
        r_cache.hset('crawler:lacus', 'error', req_error['error'])
def api_save_lacus_url_key(data):
    """Validate and save the Lacus url sent to the API.

    :param data: request dict; reads 'url' and 'api_key'
    :return: ({'url': ..., 'api_key': <masked key>}, 200) or (error dict, 400)
    """
    # unpack json
    manager_url = data.get('url', None)
    api_key = data.get('api_key', None)
    if not manager_url:  # or not api_key:
        return {'status': 'error', 'reason': 'No url or API key supplied'}, 400
    # check if is valid url
    try:
        result = urlparse(manager_url)
    except ValueError:
        # urlparse rejects some malformed urls, e.g. an unbalanced IPv6 bracket
        return {'status': 'error', 'reason': 'Invalid url'}, 400
    if not all([result.scheme, result.netloc]):
        return {'status': 'error', 'reason': 'Invalid url'}, 400

    # # check if is valid key CURRENTLY DISABLE
    # if not is_valid_api_key(api_key):
    #     return ({'status': 'error', 'reason': 'Invalid API key'}, 400)

    save_lacus_url_api(manager_url, api_key)
    return {'url': manager_url, 'api_key': get_hidden_lacus_api_key()}, 200
2023-02-21 12:22:49 +01:00
def get_crawler_max_captures():
    """Return the maximum number of captures as an int.

    The value is read from the cache, then from the DB, and defaults to 10; a
    value found outside the cache is written back to the cache.
    """
    cached = r_cache.hget('crawler:lacus', 'nb_captures')
    if cached:
        return int(cached)
    stored = r_db.hget('crawler:lacus', 'nb_captures')
    nb_captures = stored if stored else 10
    r_cache.hset('crawler:lacus', 'nb_captures', int(nb_captures))
    return int(nb_captures)
2022-10-25 16:25:19 +02:00
2023-02-21 12:22:49 +01:00
def save_nb_max_captures(nb_captures):
    """Save the maximum number of captures, as an int, in both the DB and the cache."""
    nb = int(nb_captures)
    r_db.hset('crawler:lacus', 'nb_captures', nb)
    r_cache.hset('crawler:lacus', 'nb_captures', nb)
2022-10-25 16:25:19 +02:00
2023-02-21 12:22:49 +01:00
def api_set_crawler_max_captures(data):
    """Validate and save the maximum number of captures sent to the API.

    :param data: request dict; reads 'nb' (default 10), values below 1 are raised to 1
    :return: (nb_captures, 200) or (error dict, 400)
    """
    nb_captures = data.get('nb', 10)
    try:
        nb_captures = int(nb_captures)
        if nb_captures < 1:
            nb_captures = 1
    except (TypeError, ValueError):
        return {'error': 'Invalid number of crawlers to launch'}, 400
    # Persist the validated value; it was previously returned without being saved
    save_nb_max_captures(nb_captures)
    return nb_captures, 200
2022-10-25 16:25:19 +02:00
2023-03-14 17:36:42 +01:00
## TEST ##
2020-05-22 15:41:05 +02:00
2021-03-29 20:27:20 +02:00
def is_test_ail_crawlers_successful():
    """Return True if the saved result of the last crawler test is 'True'."""
    success = r_db.hget('crawler:tor:test', 'success')
    return success == 'True'
2021-03-29 20:27:20 +02:00
def get_test_ail_crawlers_message():
    """Return the message saved by the last crawler test."""
    message = r_db.hget('crawler:tor:test', 'message')
    return message
2021-03-29 20:27:20 +02:00
def save_test_ail_crawlers_result(test_success, message):
    """Save the result of a crawler test: its success flag (as a string) and its message."""
    test_key = 'crawler:tor:test'
    r_db.hset(test_key, 'success', str(test_success))
    r_db.hset(test_key, 'message', message)
2021-03-29 20:27:20 +02:00
def test_ail_crawlers():
    """Crawl a test onion page through Lacus and save the result of the test.

    :return: True if the page was captured and contains the expected message,
             False otherwise
    """
    # # TODO: test web domain
    if not ping_lacus():
        lacus_url = get_lacus_url()
        error_message = f'Error: Can\'t connect to AIL Lacus, {lacus_url}'
        print(error_message)
        save_test_ail_crawlers_result(False, error_message)
        return False

    lacus = get_lacus()
    commit_id = git_status.get_last_commit_id_from_local()
    user_agent = f'{commit_id}-AIL LACUS CRAWLER'
    # domain = 'eswpccgr5xyovsahffkehgleqthrasfpfdblwbs4lstd345dwq5qumqd.onion'
    url = 'http://eswpccgr5xyovsahffkehgleqthrasfpfdblwbs4lstd345dwq5qumqd.onion'

    # set_current_crawler_status(splash_url, 'CRAWLER TEST', started_time=True,
    #                            crawled_domain='TEST DOMAIN', crawler_type='onion')
    capture_uuid = lacus.enqueue(url=url, depth=0, user_agent=user_agent, proxy='force_tor',
                                 force=True, general_timeout_in_sec=90)
    status = lacus.get_capture_status(capture_uuid)
    launch_time = int(time.time())  # capture timeout
    while int(time.time()) - launch_time < 90 and status != CaptureStatus.DONE:
        # DEBUG
        print(int(time.time()) - launch_time)
        # Wait between two status requests instead of polling in a busy loop
        time.sleep(1)
        status = lacus.get_capture_status(capture_uuid)

    if status == CaptureStatus.DONE:
        entries = lacus.get_capture(capture_uuid)
        if 'error' in entries:
            save_test_ail_crawlers_result(False, entries['error'])
            return False
        elif 'html' in entries and entries['html']:
            mess = 'It works!'
            if mess in entries['html']:
                save_test_ail_crawlers_result(True, mess)
                return True
            else:
                return False
    # NOTE(review): 2 is presumably the "ongoing" capture status, i.e. the
    # capture did not finish before the 90 seconds timeout - confirm
    elif status == 2:
        save_test_ail_crawlers_result(False, 'Timeout Error')
    else:
        save_test_ail_crawlers_result(False, 'Error')
    return False
2020-05-22 15:41:05 +02:00
#### ---- ####
2023-03-16 15:50:42 +01:00
2020-05-22 15:41:05 +02:00
2023-03-16 15:50:42 +01:00
# def api_create_crawler_task(user_id, url, screenshot=True, har=True, depth_limit=1, max_pages=100, auto_crawler=False, crawler_delta=3600, crawler_type=None, cookiejar_uuid=None, user_agent=None):
# # validate url
# if url is None or url=='' or url=='\n':
# return ({'error':'invalid depth limit'}, 400)
2022-10-25 16:25:19 +02:00
2023-03-14 17:36:42 +01:00
2022-10-25 16:25:19 +02:00
2023-02-21 12:22:49 +01:00
# if __name__ == '__main__':
2023-03-14 17:36:42 +01:00
# task = CrawlerTask('2dffcae9-8f66-4cfa-8e2c-de1df738a6cd')
# print(task.get_meta())
# _clear_captures()
2023-02-21 12:22:49 +01:00