diff --git a/bin/archiver.py b/bin/archiver.py index 034be3be..a07db0f5 100755 --- a/bin/archiver.py +++ b/bin/archiver.py @@ -8,9 +8,10 @@ import logging from typing import Dict, List, Tuple from pathlib import Path +from redis import Redis + from lookyloo.abstractmanager import AbstractManager -from lookyloo.lookyloo import Lookyloo -from lookyloo.helpers import get_config +from lookyloo.helpers import get_config, get_homedir, get_socket_path, get_captures_dir logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s', level=logging.INFO, datefmt='%I:%M:%S') @@ -21,17 +22,18 @@ class Archiver(AbstractManager): def __init__(self, loglevel: int=logging.INFO): super().__init__(loglevel) self.script_name = 'archiver' - self._load_indexes() + self.redis = Redis(unix_socket_path=get_socket_path('cache')) + + # make sure archived captures dir exists + self.archived_captures_dir = get_homedir() / 'archived_captures' + self.archived_captures_dir.mkdir(parents=True, exist_ok=True) + + self._load_archives() def _to_run_forever(self): self._archive() def _archive(self): - # Initialize the lookyloo class here, no need to keep it in memory all the time. 
- lookyloo = Lookyloo() - # make sure archived captures dir exists - archived_captures_dir = lookyloo.capture_dir.parent / 'archived_captures' - archived_captures_dir.mkdir(parents=True, exist_ok=True) archive_interval = timedelta(days=get_config('generic', 'archive')) cut_time = (datetime.now() - archive_interval).date() cut_time = cut_time.replace(day=1) @@ -39,7 +41,7 @@ class Archiver(AbstractManager): # Format: # { 2020: { 12: [(directory, uuid)] } } to_archive: Dict[int, Dict[int, List[Tuple[Path, str]]]] = defaultdict(lambda: defaultdict(list)) - for capture_path in lookyloo.capture_dir.glob('*'): + for capture_path in get_captures_dir().glob('*'): if not capture_path.is_dir(): continue timestamp = datetime.strptime(capture_path.name, '%Y-%m-%dT%H:%M:%S.%f') @@ -58,7 +60,7 @@ class Archiver(AbstractManager): archived_uuids = {} for year, month_captures in to_archive.items(): for month, captures in month_captures.items(): - dest_dir = archived_captures_dir / str(year) / f'{month:02}' + dest_dir = self.archived_captures_dir / str(year) / f'{month:02}' dest_dir.mkdir(parents=True, exist_ok=True) if (dest_dir / 'index').exists(): with (dest_dir / 'index').open('r') as _f: @@ -75,36 +77,22 @@ class Archiver(AbstractManager): index_writer.writerow([uuid, dirname]) if archived_uuids: - lookyloo.redis.hdel('lookup_dirs', *archived_uuids.keys()) - lookyloo.redis.hset('lookup_dirs_archived', mapping=archived_uuids) - lookyloo.clear_captures_index_cache(archived_uuids.keys()) + p = self.redis.pipeline() + p.hdel('lookup_dirs', *archived_uuids.keys()) + p.hset('lookup_dirs_archived', mapping=archived_uuids) + p.execute() self.logger.info('Archiving done.') - def _load_indexes(self): - # Initialize the lookyloo class here, no need to keep it in memory all the time. 
- lookyloo = Lookyloo() - - # NOTE: Initialize recent - recent_uuids = {} - for uuid_path in sorted(lookyloo.capture_dir.glob('*/uuid'), reverse=True): - with uuid_path.open() as f: - uuid = f.read() - recent_uuids[uuid] = str(uuid_path.parent) - lookyloo.redis.delete('lookup_dirs') - lookyloo.redis.hset('lookup_dirs', mapping=recent_uuids) - - # NOTE: Initialize archives - # make sure archived captures dir exists - archived_captures_dir = lookyloo.capture_dir.parent / 'archived_captures' - archived_captures_dir.mkdir(parents=True, exist_ok=True) - lookyloo.redis.delete('lookup_dirs_archived') - for year in archived_captures_dir.iterdir(): + def _load_archives(self): + # Initialize archives + self.redis.delete('lookup_dirs_archived') + for year in self.archived_captures_dir.iterdir(): for month in year.iterdir(): if not (month / 'index').exists(): continue with (month / 'index').open('r') as _f: archived_uuids = {uuid: str(month / dirname) for uuid, dirname in csv.reader(_f)} - lookyloo.redis.hset('lookup_dirs_archived', mapping=archived_uuids) + self.redis.hset('lookup_dirs_archived', mapping=archived_uuids) def main(): diff --git a/bin/start.py b/bin/start.py index b2b71eb1..585867ee 100755 --- a/bin/start.py +++ b/bin/start.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- from subprocess import run, Popen -from lookyloo.helpers import get_homedir, get_config +from lookyloo.helpers import get_homedir, get_config, reload_uuids_index def main(): @@ -12,8 +12,11 @@ def main(): p = run(['run_backend', '--start']) p.check_returncode() print('done.') + print('Reload UUIDs index...') + reload_uuids_index() + print('done.') print('Start asynchronous ingestor...') - for i in range(get_config('generic', 'async_capture_processes')): + for _ in range(get_config('generic', 'async_capture_processes')): Popen(['async_capture']) print('done.') print('Start background indexer...') diff --git a/lookyloo/helpers.py b/lookyloo/helpers.py index 6870251c..369d320a 100644 --- 
a/lookyloo/helpers.py +++ b/lookyloo/helpers.py @@ -102,6 +102,13 @@ Run the following command (assuming you run the code from the clonned repository return Path(os.environ['LOOKYLOO_HOME']) +@lru_cache(64) +def get_captures_dir() -> Path: + capture_dir = get_homedir() / 'scraped' + safe_create_dir(capture_dir) + return capture_dir + + @lru_cache(64) def get_email_template() -> str: with (get_homedir() / 'config' / 'email.tmpl').open() as f: @@ -355,3 +362,16 @@ def try_make_file(filename: Path): def get_useragent_for_requests(): version = pkg_resources.get_distribution('lookyloo').version return f'Lookyloo / {version}' + + +def reload_uuids_index() -> None: + recent_uuids = {} + for uuid_path in sorted(get_captures_dir().glob('*/uuid'), reverse=True): + with uuid_path.open() as f: + uuid = f.read() + recent_uuids[uuid] = str(uuid_path.parent) + r = Redis(unix_socket_path=get_socket_path('cache')) + p = r.pipeline() + p.delete('lookup_dirs') + p.hset('lookup_dirs', mapping=recent_uuids) + p.execute() diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py index 6ba013eb..bfac0f13 100644 --- a/lookyloo/lookyloo.py +++ b/lookyloo/lookyloo.py @@ -40,7 +40,7 @@ from .exceptions import NoValidHarFile, MissingUUID, LookylooException, MissingC from .helpers import (get_homedir, get_socket_path, load_cookies, get_config, safe_create_dir, get_email_template, load_pickle_tree, remove_pickle_tree, get_resources_hashes, get_taxonomies, uniq_domains, - CaptureStatus, try_make_file) + CaptureStatus, try_make_file, get_captures_dir) from .modules import VirusTotal, SaneJavaScript, PhishingInitiative, MISP, UniversalWhois, UrlScan from .capturecache import CaptureCache from .context import Context @@ -59,7 +59,7 @@ class Lookyloo(): self.redis_pool: ConnectionPool = ConnectionPool(connection_class=UnixDomainSocketConnection, path=get_socket_path('cache'), decode_responses=True) - self.capture_dir: Path = get_homedir() / 'scraped' + self.capture_dir: Path = get_captures_dir() if 
os.environ.get('SPLASH_URL_DOCKER'): # In order to have a working default for the docker image, it is easier to use an environment variable self.splash_url: str = os.environ['SPLASH_URL_DOCKER'] @@ -69,8 +69,6 @@ class Lookyloo(): self._priority = get_config('generic', 'priority') - safe_create_dir(self.capture_dir) - # Initialize 3rd party components self.pi = PhishingInitiative(get_config('modules', 'PhishingInitiative')) if not self.pi.available: @@ -103,19 +101,6 @@ class Lookyloo(): def redis(self): return Redis(connection_pool=self.redis_pool) - def _get_priority(self, source: str, user: str, authenticated: bool) -> int: - src_prio: int = self._priority['sources'][source] if source in self._priority['sources'] else -1 - if not authenticated: - usr_prio = self._priority['users']['_default_anon'] - # reduce priority for anonymous users making lots of captures - queue_size = self.redis.zscore('queues', f'{source}|{authenticated}|{user}') - if queue_size is None: - queue_size = 0 - usr_prio -= int(queue_size / 10) - else: - usr_prio = self._priority['users'][user] if self._priority['users'].get(user) else self._priority['users']['_default_auth'] - return src_prio + usr_prio - def cache_user_agents(self, user_agent: str, remote_ip: str) -> None: '''Cache the useragents of the visitors''' today = date.today().isoformat() @@ -592,9 +577,6 @@ class Lookyloo(): all_cache.sort(key=operator.attrgetter('timestamp'), reverse=True) return all_cache - def clear_captures_index_cache(self, uuids: Iterable[str]) -> None: - [self._captures_index.pop(uuid) for uuid in uuids if uuid in self._captures_index] - def capture_cache(self, capture_uuid: str, /) -> Optional[CaptureCache]: """Get the cache from redis.""" if capture_uuid in self._captures_index and not self._captures_index[capture_uuid].incomplete_redirects: @@ -654,6 +636,19 @@ class Lookyloo(): return CaptureStatus.ONGOING return CaptureStatus.UNKNOWN + def _get_priority(self, source: str, user: str, authenticated: bool) 
-> int: + src_prio: int = self._priority['sources'][source] if source in self._priority['sources'] else -1 + if not authenticated: + usr_prio = self._priority['users']['_default_anon'] + # reduce priority for anonymous users making lots of captures + queue_size = self.redis.zscore('queues', f'{source}|{authenticated}|{user}') + if queue_size is None: + queue_size = 0 + usr_prio -= int(queue_size / 10) + else: + usr_prio = self._priority['users'][user] if self._priority['users'].get(user) else self._priority['users']['_default_auth'] + return src_prio + usr_prio + def enqueue_capture(self, query: MutableMapping[str, Any], source: str, user: str, authenticated: bool) -> str: '''Enqueue a query in the capture queue (used by the UI and the API for asynchronous processing)''' perma_uuid = str(uuid4())