mirror of https://github.com/CIRCL/lookyloo
chg: Cleanup archiver, initialize index captures in start
parent ece30a33eb
commit 8433cbcc1b
--- a/bin/archiver.py
+++ b/bin/archiver.py
@@ -8,9 +8,10 @@ import logging
 from typing import Dict, List, Tuple
 from pathlib import Path

+from redis import Redis
+
 from lookyloo.abstractmanager import AbstractManager
-from lookyloo.lookyloo import Lookyloo
-from lookyloo.helpers import get_config
+from lookyloo.helpers import get_config, get_homedir, get_socket_path, get_captures_dir

 logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
                     level=logging.INFO, datefmt='%I:%M:%S')
@@ -21,17 +22,18 @@ class Archiver(AbstractManager):
     def __init__(self, loglevel: int=logging.INFO):
         super().__init__(loglevel)
         self.script_name = 'archiver'
-        self._load_indexes()
+        self.redis = Redis(unix_socket_path=get_socket_path('cache'))
+
+        # make sure archived captures dir exists
+        self.archived_captures_dir = get_homedir() / 'archived_captures'
+        self.archived_captures_dir.mkdir(parents=True, exist_ok=True)
+
+        self._load_archives()

     def _to_run_forever(self):
         self._archive()

     def _archive(self):
-        # Initialize the lookyloo class here, no need to keep it in memory all the time.
-        lookyloo = Lookyloo()
-        # make sure archived captures dir exists
-        archived_captures_dir = lookyloo.capture_dir.parent / 'archived_captures'
-        archived_captures_dir.mkdir(parents=True, exist_ok=True)
         archive_interval = timedelta(days=get_config('generic', 'archive'))
         cut_time = (datetime.now() - archive_interval).date()
         cut_time = cut_time.replace(day=1)
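The cutoff above is floored to the first of the month, so `_archive` always works in whole-month buckets. A minimal sketch of that computation, assuming `get_config('generic', 'archive')` returned 180 days:

```python
from datetime import datetime, timedelta

archive_interval = timedelta(days=180)  # assumed value of the 'archive' setting
cut_time = (datetime.now() - archive_interval).date()
cut_time = cut_time.replace(day=1)  # floor to the first day of the month
# e.g. run on 2021-08-30: the cutoff becomes 2021-03-01, so captures are
# grouped and archived month by month rather than on a rolling day boundary
```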
@@ -39,7 +41,7 @@ class Archiver(AbstractManager):
         # Format:
         # { 2020: { 12: [(directory, uuid)] } }
         to_archive: Dict[int, Dict[int, List[Tuple[Path, str]]]] = defaultdict(lambda: defaultdict(list))
-        for capture_path in lookyloo.capture_dir.glob('*'):
+        for capture_path in get_captures_dir().glob('*'):
             if not capture_path.is_dir():
                 continue
             timestamp = datetime.strptime(capture_path.name, '%Y-%m-%dT%H:%M:%S.%f')
@@ -58,7 +60,7 @@ class Archiver(AbstractManager):
         archived_uuids = {}
         for year, month_captures in to_archive.items():
             for month, captures in month_captures.items():
-                dest_dir = archived_captures_dir / str(year) / f'{month:02}'
+                dest_dir = self.archived_captures_dir / str(year) / f'{month:02}'
                 dest_dir.mkdir(parents=True, exist_ok=True)
                 if (dest_dir / 'index').exists():
                     with (dest_dir / 'index').open('r') as _f:
@@ -75,36 +77,22 @@ class Archiver(AbstractManager):
                     index_writer.writerow([uuid, dirname])

         if archived_uuids:
-            lookyloo.redis.hdel('lookup_dirs', *archived_uuids.keys())
-            lookyloo.redis.hset('lookup_dirs_archived', mapping=archived_uuids)
-            lookyloo.clear_captures_index_cache(archived_uuids.keys())
+            p = self.redis.pipeline()
+            p.hdel('lookup_dirs', *archived_uuids.keys())
+            p.hset('lookup_dirs_archived', mapping=archived_uuids)
+            p.execute()
         self.logger.info('Archiving done.')

-    def _load_indexes(self):
-        # Initialize the lookyloo class here, no need to keep it in memory all the time.
-        lookyloo = Lookyloo()
-
-        # NOTE: Initialize recent
-        recent_uuids = {}
-        for uuid_path in sorted(lookyloo.capture_dir.glob('*/uuid'), reverse=True):
-            with uuid_path.open() as f:
-                uuid = f.read()
-            recent_uuids[uuid] = str(uuid_path.parent)
-        lookyloo.redis.delete('lookup_dirs')
-        lookyloo.redis.hset('lookup_dirs', mapping=recent_uuids)
-
-        # NOTE: Initialize archives
-        # make sure archived captures dir exists
-        archived_captures_dir = lookyloo.capture_dir.parent / 'archived_captures'
-        archived_captures_dir.mkdir(parents=True, exist_ok=True)
-        lookyloo.redis.delete('lookup_dirs_archived')
-        for year in archived_captures_dir.iterdir():
+    def _load_archives(self):
+        # Initialize archives
+        self.redis.delete('lookup_dirs_archived')
+        for year in self.archived_captures_dir.iterdir():
             for month in year.iterdir():
                 if not (month / 'index').exists():
                     continue
                 with (month / 'index').open('r') as _f:
                     archived_uuids = {uuid: str(month / dirname) for uuid, dirname in csv.reader(_f)}
-                lookyloo.redis.hset('lookup_dirs_archived', mapping=archived_uuids)
+                self.redis.hset('lookup_dirs_archived', mapping=archived_uuids)


 def main():
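With this cleanup the archiver talks to the cache Redis directly instead of instantiating `Lookyloo`, and the two index updates are batched in a pipeline, so dropping a capture from `lookup_dirs` and adding it to `lookup_dirs_archived` happens in a single round trip. A sketch of that move, assuming a Redis instance on a local unix socket and a made-up mapping:

```python
from redis import Redis

r = Redis(unix_socket_path='cache.sock')  # assumed socket path

# hypothetical {uuid: archived capture directory} mapping, matching what
# _archive() writes to the per-month 'index' CSV
archived_uuids = {'uuid-1': 'archived_captures/2020/12/2020-12-01T10:00:00.000000'}

p = r.pipeline()
p.hdel('lookup_dirs', *archived_uuids.keys())           # drop from the live index
p.hset('lookup_dirs_archived', mapping=archived_uuids)  # add to the archive index
p.execute()  # both commands are sent together
```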
--- a/bin/start.py
+++ b/bin/start.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-

 from subprocess import run, Popen
-from lookyloo.helpers import get_homedir, get_config
+from lookyloo.helpers import get_homedir, get_config, reload_uuids_index


 def main():
@@ -12,8 +12,11 @@ def main():
     p = run(['run_backend', '--start'])
     p.check_returncode()
     print('done.')
+    print('Reload UUIDs index...')
+    reload_uuids_index()
+    print('done.')
     print('Start asynchronous ingestor...')
-    for i in range(get_config('generic', 'async_capture_processes')):
+    for _ in range(get_config('generic', 'async_capture_processes')):
         Popen(['async_capture'])
     print('done.')
     print('Start background indexer...')
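In `start.py`, `run` blocks until `run_backend --start` exits and `check_returncode()` raises if it failed, while `Popen` launches each `async_capture` worker without waiting; renaming the loop variable to `_` simply marks it as unused. A sketch of the same pattern with placeholder commands:

```python
from subprocess import run, Popen

p = run(['echo', 'backend started'])  # blocks until the child exits
p.check_returncode()                  # raises CalledProcessError on failure

workers = 2  # stands in for get_config('generic', 'async_capture_processes')
for _ in range(workers):
    Popen(['sleep', '1'])             # fire-and-forget, does not block
```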
--- a/lookyloo/helpers.py
+++ b/lookyloo/helpers.py
@@ -102,6 +102,13 @@ Run the following command (assuming you run the code from the clonned repository
     return Path(os.environ['LOOKYLOO_HOME'])


+@lru_cache(64)
+def get_captures_dir() -> Path:
+    capture_dir = get_homedir() / 'scraped'
+    safe_create_dir(capture_dir)
+    return capture_dir
+
+
 @lru_cache(64)
 def get_email_template() -> str:
     with (get_homedir() / 'config' / 'email.tmpl').open() as f:
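`@lru_cache(64)` on a zero-argument function effectively memoizes it: the path is resolved and the directory created on the first call, and every later call returns the same cached `Path`. A sketch of that behaviour, using a hypothetical base directory in place of `get_homedir()`:

```python
from functools import lru_cache
from pathlib import Path

@lru_cache(64)
def captures_dir() -> Path:
    d = Path('/tmp/lookyloo-demo') / 'scraped'
    d.mkdir(parents=True, exist_ok=True)  # only runs on the first call
    return d

assert captures_dir() is captures_dir()  # same cached object afterwards
```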
@@ -355,3 +362,16 @@ def try_make_file(filename: Path):
 def get_useragent_for_requests():
     version = pkg_resources.get_distribution('lookyloo').version
     return f'Lookyloo / {version}'
+
+
+def reload_uuids_index() -> None:
+    recent_uuids = {}
+    for uuid_path in sorted(get_captures_dir().glob('*/uuid'), reverse=True):
+        with uuid_path.open() as f:
+            uuid = f.read()
+        recent_uuids[uuid] = str(uuid_path.parent)
+    r = Redis(unix_socket_path=get_socket_path('cache'))
+    p = r.pipeline()
+    p.delete('lookup_dirs')
+    p.hset('lookup_dirs', mapping=recent_uuids)
+    p.execute()
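`reload_uuids_index` rebuilds the `lookup_dirs` hash from the `uuid` files on disk, and the delete/hset pair runs inside one pipeline to keep the window where the index is empty as small as possible. A sketch of the directory scan, run against a hypothetical captures directory instead of `get_captures_dir()`:

```python
from pathlib import Path

captures = Path('/tmp/lookyloo-demo/scraped')
capture = captures / '2021-08-30T10:00:00.000000'
capture.mkdir(parents=True, exist_ok=True)
(capture / 'uuid').write_text('uuid-1')  # each capture dir holds its UUID

recent_uuids = {}
for uuid_path in sorted(captures.glob('*/uuid'), reverse=True):
    recent_uuids[uuid_path.read_text()] = str(uuid_path.parent)

print(recent_uuids)  # {'uuid-1': '/tmp/lookyloo-demo/scraped/2021-08-30T10:00:00.000000'}
```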
--- a/lookyloo/lookyloo.py
+++ b/lookyloo/lookyloo.py
@@ -40,7 +40,7 @@ from .exceptions import NoValidHarFile, MissingUUID, LookylooException, MissingC
 from .helpers import (get_homedir, get_socket_path, load_cookies, get_config,
                       safe_create_dir, get_email_template, load_pickle_tree,
                       remove_pickle_tree, get_resources_hashes, get_taxonomies, uniq_domains,
-                      CaptureStatus, try_make_file)
+                      CaptureStatus, try_make_file, get_captures_dir)
 from .modules import VirusTotal, SaneJavaScript, PhishingInitiative, MISP, UniversalWhois, UrlScan
 from .capturecache import CaptureCache
 from .context import Context
@@ -59,7 +59,7 @@ class Lookyloo():

         self.redis_pool: ConnectionPool = ConnectionPool(connection_class=UnixDomainSocketConnection,
                                                          path=get_socket_path('cache'), decode_responses=True)
-        self.capture_dir: Path = get_homedir() / 'scraped'
+        self.capture_dir: Path = get_captures_dir()
         if os.environ.get('SPLASH_URL_DOCKER'):
             # In order to have a working default for the docker image, it is easier to use an environment variable
             self.splash_url: str = os.environ['SPLASH_URL_DOCKER']
@@ -69,8 +69,6 @@ class Lookyloo():

         self._priority = get_config('generic', 'priority')

-        safe_create_dir(self.capture_dir)
-
         # Initialize 3rd party components
         self.pi = PhishingInitiative(get_config('modules', 'PhishingInitiative'))
         if not self.pi.available:
@@ -103,19 +101,6 @@ class Lookyloo():
     def redis(self):
         return Redis(connection_pool=self.redis_pool)

-    def _get_priority(self, source: str, user: str, authenticated: bool) -> int:
-        src_prio: int = self._priority['sources'][source] if source in self._priority['sources'] else -1
-        if not authenticated:
-            usr_prio = self._priority['users']['_default_anon']
-            # reduce priority for anonymous users making lots of captures
-            queue_size = self.redis.zscore('queues', f'{source}|{authenticated}|{user}')
-            if queue_size is None:
-                queue_size = 0
-            usr_prio -= int(queue_size / 10)
-        else:
-            usr_prio = self._priority['users'][user] if self._priority['users'].get(user) else self._priority['users']['_default_auth']
-        return src_prio + usr_prio
-
     def cache_user_agents(self, user_agent: str, remote_ip: str) -> None:
         '''Cache the useragents of the visitors'''
         today = date.today().isoformat()
@@ -592,9 +577,6 @@ class Lookyloo():
         all_cache.sort(key=operator.attrgetter('timestamp'), reverse=True)
         return all_cache

-    def clear_captures_index_cache(self, uuids: Iterable[str]) -> None:
-        [self._captures_index.pop(uuid) for uuid in uuids if uuid in self._captures_index]
-
     def capture_cache(self, capture_uuid: str, /) -> Optional[CaptureCache]:
         """Get the cache from redis."""
         if capture_uuid in self._captures_index and not self._captures_index[capture_uuid].incomplete_redirects:
@@ -654,6 +636,19 @@ class Lookyloo():
             return CaptureStatus.ONGOING
         return CaptureStatus.UNKNOWN

+    def _get_priority(self, source: str, user: str, authenticated: bool) -> int:
+        src_prio: int = self._priority['sources'][source] if source in self._priority['sources'] else -1
+        if not authenticated:
+            usr_prio = self._priority['users']['_default_anon']
+            # reduce priority for anonymous users making lots of captures
+            queue_size = self.redis.zscore('queues', f'{source}|{authenticated}|{user}')
+            if queue_size is None:
+                queue_size = 0
+            usr_prio -= int(queue_size / 10)
+        else:
+            usr_prio = self._priority['users'][user] if self._priority['users'].get(user) else self._priority['users']['_default_auth']
+        return src_prio + usr_prio
+
     def enqueue_capture(self, query: MutableMapping[str, Any], source: str, user: str, authenticated: bool) -> str:
         '''Enqueue a query in the capture queue (used by the UI and the API for asynchronous processing)'''
         perma_uuid = str(uuid4())
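`_get_priority` itself is only moved further down the class in this diff: it adds a per-source priority to a per-user one, and anonymous users lose a point for every ten captures they already have queued. A worked sketch with assumed priority values (the real ones come from `get_config('generic', 'priority')`):

```python
# assumed configuration, not the project's actual defaults
priority = {'sources': {'web': 10}, 'users': {'_default_anon': -10, '_default_auth': 0}}

def get_priority(source: str, user: str, authenticated: bool, queue_size: float = 0) -> int:
    src_prio = priority['sources'].get(source, -1)
    if not authenticated:
        usr_prio = priority['users']['_default_anon'] - int(queue_size / 10)
    else:
        usr_prio = priority['users'].get(user) or priority['users']['_default_auth']
    return src_prio + usr_prio

# anonymous web user with 42 queued captures: 10 + (-10 - 4) == -4
print(get_priority('web', 'anon', False, queue_size=42))
```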