mirror of https://github.com/CIRCL/lookyloo
new: Archiver, refactoring.
parent 6be9b69d95
commit 58b837cb6c
bin/archiver.py (new file)
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+from collections import defaultdict
+import csv
+from datetime import datetime, timedelta
+import logging
+from typing import Dict, List, Tuple
+from pathlib import Path
+
+from lookyloo.abstractmanager import AbstractManager
+from lookyloo.lookyloo import Lookyloo
+from lookyloo.helpers import get_config
+
+logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
+                    level=logging.INFO, datefmt='%I:%M:%S')
+
+
+class Archiver(AbstractManager):
+
+    def __init__(self, loglevel: int=logging.INFO):
+        super().__init__(loglevel)
+        self.script_name = 'archiver'
+
+    def _to_run_forever(self):
+        self._archive()
+
+    def _archive(self):
+        # Initialize the lookyloo class here, no need to keep it in memory all the time.
+        lookyloo = Lookyloo()
+        # make sure archived captures dir exists
+        archived_captures_dir = lookyloo.capture_dir.parent / 'archived_captures'
+        archived_captures_dir.mkdir(parents=True, exist_ok=True)
+        archive_interval = timedelta(days=get_config('generic', 'archive'))
+        cut_time = datetime.now() - archive_interval
+
+        # Format:
+        # { 2020: { 12: [(directory, uuid)] } }
+        to_archive: Dict[int, Dict[int, List[Tuple[Path, str]]]] = defaultdict(lambda: defaultdict(list))
+        for capture_path in lookyloo.capture_dir.glob('*'):
+            timestamp = datetime.strptime(capture_path.name, '%Y-%m-%dT%H:%M:%S.%f')
+            if timestamp >= cut_time:
+                # do not archive.
+                continue
+            with (capture_path / 'uuid').open() as _f:
+                uuid = _f.read().strip()
+            to_archive[timestamp.year][timestamp.month].append((capture_path, uuid))
+
+        archived_uuids = {}
+        for year, month_captures in to_archive.items():
+            for month, captures in month_captures.items():
+                dest_dir = archived_captures_dir / str(year) / str(month)
+                dest_dir.mkdir(parents=True, exist_ok=True)
+                if (dest_dir / 'index').exists():
+                    with (dest_dir / 'index').open('r') as _f:
+                        current_index = {uuid: dirname for uuid, dirname in csv.reader(_f)}
+                else:
+                    current_index = {}
+                for capture_path, uuid in captures:
+                    current_index[uuid] = capture_path.name
+                    capture_path.rename(dest_dir / capture_path.name)
+                    archived_uuids[uuid] = str(dest_dir / capture_path.name)
+                with (dest_dir / 'index').open('w') as _f:
+                    index_writer = csv.writer(_f)
+                    for uuid, dirname in current_index.items():
+                        index_writer.writerow([uuid, dirname])
+
+        if archived_uuids:
+            lookyloo.redis.hdel('lookup_dirs', *archived_uuids.keys())
+            lookyloo.redis.hset('lookup_dirs_archived', mapping=archived_uuids)
+            lookyloo.clear_captures_index_cache(archived_uuids.keys())
+
+
+def main():
+    a = Archiver()
+    a.run(sleep_in_sec=3600 * 24)
+
+
+if __name__ == '__main__':
+    main()
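Note on the archive layout: each `archived_captures/<year>/<month>/` directory gets an `index` file, a two-column CSV mapping a capture UUID to its directory name. A minimal sketch of reading one back, assuming the layout built by `Archiver._archive()` above (the concrete year/month path is illustrative, not from this commit):

import csv
from pathlib import Path

# Illustrative path, following the layout created by Archiver._archive()
index_file = Path('archived_captures') / '2020' / '12' / 'index'

with index_file.open() as f:
    # Each row is: <capture UUID>, <capture directory name>
    archived = {uuid: dirname for uuid, dirname in csv.reader(f)}

for uuid, dirname in archived.items():
    print(f'{uuid} -> {dirname}')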
bin/background_indexer.py
@@ -27,7 +27,7 @@ class BackgroundIndexer(AbstractManager):
         self._check_indexes()
 
     def _build_missing_pickles(self):
-        for uuid_path in self.lookyloo.capture_dir.glob('*/uuid'):
+        for uuid_path in sorted(self.lookyloo.capture_dir.glob('*/uuid'), reverse=True):
             if (uuid_path.parent / 'tree.pickle').exists():
                 continue
             lock_file = uuid_path.parent / 'lock'
@@ -45,15 +45,17 @@ class BackgroundIndexer(AbstractManager):
 
             with uuid_path.open() as f:
                 uuid = f.read()
+            if not self.lookyloo.redis.hexists('lookup_dirs', uuid):
+                # The capture with this UUID exists, but it is for some reason missing in lookup_dirs
+                self.lookyloo.redis.hset('lookup_dirs', uuid, str(uuid_path.parent))
+
             try:
                 self.logger.info(f'Build pickle for {uuid}: {uuid_path.parent.name}')
                 self.lookyloo.get_crawled_tree(uuid)
                 self.lookyloo.trigger_modules(uuid, auto_trigger=True)
                 self.logger.info(f'Pickle for {uuid} build.')
             except MissingUUID:
-                # The cache is not up-to-date, but the UUID definitely exists in the captures.
-                self.logger.warning(f'Unable to find {uuid}, re-triggering the cache.')
-                self.lookyloo._set_capture_cache(uuid_path.parent, force=True)
+                self.logger.warning(f'Unable to find {uuid}. That should not happen.')
             except NoValidHarFile:
                 self.logger.warning(f'Unable to build pickle for {uuid}: {uuid_path.parent.name}')
                 # The capture is not working, moving it away.
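Note on the ordering above: capture directories are named with `%Y-%m-%dT%H:%M:%S.%f` timestamps (see `bin/archiver.py`), so the plain reverse string sort in `_build_missing_pickles` walks captures from newest to oldest. A quick illustration (not from the repo):

# Directory names are ISO-like timestamps, so string order == time order.
names = ['2020-12-01T10:00:00.000000', '2021-01-15T08:30:00.000000']
assert sorted(names, reverse=True)[0] == '2021-01-15T08:30:00.000000'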
lookyloo/lookyloo.py
@@ -99,9 +99,6 @@ class Lookyloo():
         self.context = Context(self.sanejs)
         self._captures_index: Dict[str, CaptureCache] = {}
 
-        if not self.redis.exists('cache_loaded'):
-            self._init_existing_dumps()
-
     @property
     def redis(self):
         return Redis(connection_pool=self.redis_pool)
@@ -186,8 +183,6 @@ class Lookyloo():
         self._ensure_meta(capture_dir, ct)
         self._resolve_dns(ct)
         self.context.contextualize_tree(ct)
-        # Force update cache of the capture (takes care of the incomplete redirect key)
-        self._set_capture_cache(capture_dir, force=True)
         cache = self.capture_cache(capture_uuid)
         if not cache:
             raise LookylooException(f'Broken cache for {capture_dir}')
@@ -310,13 +305,14 @@ class Lookyloo():
         remove_pickle_tree(capture_dir)
 
     def rebuild_cache(self) -> None:
-        '''Flush and rebuild the redis cache. Doesn't remove the pickles.'''
+        '''Flush and rebuild the redis cache. Doesn't remove the pickles.
+        The cached captures will be rebuilt when loading the index.'''
         self.redis.flushdb()
         self._init_existing_dumps()
 
     def rebuild_all(self) -> None:
-        '''Flush and rebuild the redis cache, and delede all the pickles.'''
-        [remove_pickle_tree(capture_dir) for capture_dir in self.capture_dirs]  # type: ignore
+        '''Flush and rebuild the redis cache, and delete all the pickles.
+        The captures will be rebuilt by the background indexer.'''
+        [remove_pickle_tree(capture_dir) for capture_dir in self.capture_dir.iterdir() if capture_dir.is_dir()]  # type: ignore
         self.rebuild_cache()
 
     def get_urlnode_from_tree(self, capture_uuid: str, /, node_uuid: str) -> URLNode:
@@ -468,15 +464,9 @@ class Lookyloo():
             to_return[event_id].update(values)
         return to_return
 
-    def _set_capture_cache(self, capture_dir: Path, force: bool=False, redis_pipeline: Optional[Redis]=None) -> None:
-        '''Populate the redis cache for a capture. Mostly used on the index page.'''
-        # NOTE: this method is called in the background indexer as a fallback.
-        if force or not self.redis.exists(str(capture_dir)):
-            # (re)build cache
-            pass
-        else:
-            return
-
+    def _set_capture_cache(self, capture_dir: Path) -> Dict[str, Any]:
+        '''Populate the redis cache for a capture. Mostly used on the index page.
+        NOTE: Doesn't require the pickle.'''
         with (capture_dir / 'uuid').open() as f:
             uuid = f.read().strip()
 
@@ -513,10 +503,7 @@ class Lookyloo():
         else:
             categories = []
 
-        if not redis_pipeline:
-            p = self.redis.pipeline()
-        else:
-            p = redis_pipeline
+        p = self.redis.pipeline()
         p.hset('lookup_dirs', uuid, str(capture_dir))
         if error_cache:
             if 'HTTP Error' not in error_cache['error']:
@@ -551,10 +538,10 @@ class Lookyloo():
             cache['parent'] = f.read().strip()
 
         p.hmset(str(capture_dir), cache)
-        if not redis_pipeline:
-            p.execute()
+        p.execute()
+        # If the cache is re-created for some reason, pop from the local cache.
+        self._captures_index.pop(uuid, None)
+        return cache
 
     def hide_capture(self, capture_uuid: str, /) -> None:
         """Add the capture in the hidden pool (not shown on the front page)
@@ -580,7 +567,9 @@ class Lookyloo():
             # No captures at all on the instance
             return []
 
-        all_cache: List[CaptureCache] = [self._captures_index[uuid] for uuid in capture_uuids if uuid in self._captures_index and not self._captures_index[uuid].incomplete_redirects]
+        all_cache: List[CaptureCache] = [self._captures_index[uuid] for uuid in capture_uuids
+                                         if (uuid in self._captures_index
+                                             and not self._captures_index[uuid].incomplete_redirects)]
 
         captures_to_get = set(capture_uuids) - set(self._captures_index.keys())
         if captures_to_get:
@@ -593,60 +582,47 @@ class Lookyloo():
             if not c:
                 continue
             c = CaptureCache(c)
             if c.incomplete_redirects:
-                self._set_capture_cache(c.capture_dir, force=True)
                 c = self.capture_cache(c.uuid)
             if hasattr(c, 'timestamp'):
                 all_cache.append(c)
                 self._captures_index[c.uuid] = c
         all_cache.sort(key=operator.attrgetter('timestamp'), reverse=True)
         return all_cache
 
+    def clear_captures_index_cache(self, uuids: Iterable[str]) -> None:
+        [self._captures_index.pop(uuid) for uuid in uuids if uuid in self._captures_index]
+
     def capture_cache(self, capture_uuid: str, /) -> Optional[CaptureCache]:
-        """Get the cache from redis.
-        NOTE: Doesn't try to build the pickle"""
+        """Get the cache from redis."""
+        if capture_uuid in self._captures_index:
+            if self._captures_index[capture_uuid].incomplete_redirects:
+                # Try to rebuild the cache
+                capture_dir = self._get_capture_dir(capture_uuid)
+                cached = self._set_capture_cache(capture_dir)
+                self._captures_index[capture_uuid] = CaptureCache(cached)
+            return self._captures_index[capture_uuid]
         capture_dir = self._get_capture_dir(capture_uuid)
-        cached: Dict[str, Any] = self.redis.hgetall(str(capture_dir))
-        if not cached:
-            self.logger.warning(f'No cache available for {capture_dir}.')
+        if not capture_dir:
+            self.logger.warning(f'No directory for {capture_uuid}.')
             return None
+
+        cached = self.redis.hgetall(str(capture_dir))
+        if not cached:
+            cached = self._set_capture_cache(capture_dir)
         try:
-            cc = CaptureCache(cached)
-            self._captures_index[cc.uuid] = cc
-            return cc
+            self._captures_index[capture_uuid] = CaptureCache(cached)
+            return self._captures_index[capture_uuid]
         except LookylooException as e:
             self.logger.warning(f'Cache ({capture_dir}) is invalid ({e}): {json.dumps(cached, indent=2)}')
             return None
 
-    def _init_existing_dumps(self) -> None:
-        '''Initialize the cache for all the captures'''
-        p = self.redis.pipeline()
-        for capture_dir in self.capture_dirs:
-            if capture_dir.exists():
-                self._set_capture_cache(capture_dir, redis_pipeline=p)
-        p.set('cache_loaded', 1)
-        p.execute()
-
-    @property
-    def capture_dirs(self) -> List[Path]:
-        '''Get all the capture directories, sorder from newest to oldest.'''
-        for capture_dir in self.capture_dir.iterdir():
-            if capture_dir.is_dir() and not capture_dir.iterdir():
-                # Cleanup self.capture_dir of failed runs.
-                capture_dir.rmdir()
-            if not (capture_dir / 'uuid').exists():
-                # Create uuid if missing
-                with (capture_dir / 'uuid').open('w') as f:
-                    f.write(str(uuid4()))
-        return sorted(self.capture_dir.iterdir(), reverse=True)
-
     def _get_capture_dir(self, capture_uuid: str, /) -> Path:
         '''Use the cache to get a capture directory from a capture UUID'''
         capture_dir: str = self.redis.hget('lookup_dirs', capture_uuid)
         if not capture_dir:
-            raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
+            # Try in the archive
+            capture_dir = self.redis.hget('lookup_dirs_archived', capture_uuid)
+            if not capture_dir:
+                raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
         to_return = Path(capture_dir)
         if not to_return.exists():
             # The capture was removed, remove the UUID
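Note on the lookup fallback: after this commit a capture UUID lives in one of two Redis hashes, `lookup_dirs` for live captures and `lookup_dirs_archived` for captures the archiver has moved away. A stand-alone sketch of the same fallback order (the connection parameters are assumptions, not from this commit):

from redis import Redis

r = Redis(decode_responses=True)  # assumes the Redis instance lookyloo uses

def find_capture_dir(capture_uuid: str):
    # Recent captures are registered in 'lookup_dirs'...
    capture_dir = r.hget('lookup_dirs', capture_uuid)
    if not capture_dir:
        # ...and the archiver moves old ones to 'lookup_dirs_archived'.
        capture_dir = r.hget('lookup_dirs_archived', capture_uuid)
    return capture_dir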
@@ -970,8 +946,7 @@ class Lookyloo():
             cookies = item['cookies']
             with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
                 json.dump(cookies, _cookies)
 
-        self._set_capture_cache(dirpath)
+        self.redis.hset('lookup_dirs', perma_uuid, str(dirpath))
         return perma_uuid
 
     def get_body_hash_investigator(self, body_hash: str, /) -> Tuple[List[Tuple[str, str]], List[Tuple[str, float]]]:
pyproject.toml
@@ -31,6 +31,7 @@ stop = "bin.stop:main"
 rebuild_caches = "bin.rebuild_caches:main"
 update = "bin.update:main"
 background_indexer = "bin.background_indexer:main"
+archiver = "bin.archiver:main"
 
 
 [tool.poetry.dependencies]
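Note on the script entry: with `archiver = "bin.archiver:main"` under `[tool.poetry.scripts]`, `poetry install` generates an `archiver` console script, roughly equivalent to this stub (a sketch of what the generated entry point does, not a file in the repo):

#!/usr/bin/env python3
# Roughly what the generated 'archiver' console script runs:
import sys

from bin.archiver import main

if __name__ == '__main__':
    sys.exit(main())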