mirror of https://github.com/CIRCL/lookyloo
new: Archiver, refactoring.
parent
6be9b69d95
commit
58b837cb6c
|
@ -0,0 +1,80 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from collections import defaultdict
|
||||||
|
import csv
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
import logging
|
||||||
|
from typing import Dict, List, Tuple
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from lookyloo.abstractmanager import AbstractManager
|
||||||
|
from lookyloo.lookyloo import Lookyloo
|
||||||
|
from lookyloo.helpers import get_config
|
||||||
|
|
||||||
|
# Shared log format for the standalone scripts in this package.
logging.basicConfig(
    format='%(asctime)s %(name)s %(levelname)s:%(message)s',
    level=logging.INFO,
    datefmt='%I:%M:%S')
|
||||||
|
|
||||||
|
|
||||||
|
class Archiver(AbstractManager):
    """Background manager that moves captures older than the configured
    archive interval out of the live capture directory and into
    ``archived_captures/<year>/<month>/``, maintaining a per-month CSV
    index file and the redis lookup hashes.
    """

    def __init__(self, loglevel: int=logging.INFO):
        super().__init__(loglevel)
        self.script_name = 'archiver'

    def _to_run_forever(self):
        # Called by AbstractManager.run on every wake-up.
        self._archive()

    def _archive(self):
        """Archive every capture older than the ``archive`` interval.

        The interval is read (in days) from the ``generic`` config. For
        each archived capture, the on-disk directory is moved under the
        archive tree and the UUID is migrated from the ``lookup_dirs``
        redis hash to ``lookup_dirs_archived``.
        """
        # Initialize the lookyloo class here, no need to keep it in memory all the time.
        lookyloo = Lookyloo()
        # make sure archived captures dir exists
        archived_captures_dir = lookyloo.capture_dir.parent / 'archived_captures'
        archived_captures_dir.mkdir(parents=True, exist_ok=True)
        archive_interval = timedelta(days=get_config('generic', 'archive'))
        cut_time = datetime.now() - archive_interval

        # Format:
        # { 2020: { 12: [(directory, uuid)] } }
        to_archive: Dict[int, Dict[int, List[Tuple[Path, str]]]] = defaultdict(lambda: defaultdict(list))
        for capture_path in lookyloo.capture_dir.glob('*'):
            if not capture_path.is_dir():
                # Stray files (locks, leftovers) are not captures.
                continue
            try:
                # Capture directories are named after their creation time.
                timestamp = datetime.strptime(capture_path.name, '%Y-%m-%dT%H:%M:%S.%f')
            except ValueError:
                # Not a capture directory; leave it alone instead of crashing.
                continue
            if timestamp >= cut_time:
                # do not archive.
                continue
            uuid_file = capture_path / 'uuid'
            if not uuid_file.exists():
                # Broken capture: without a UUID it cannot be indexed.
                continue
            with uuid_file.open() as _f:
                uuid = _f.read().strip()
            to_archive[timestamp.year][timestamp.month].append((capture_path, uuid))

        # Move the captures, keeping one 'index' CSV (uuid, dirname) per month.
        archived_uuids = {}
        for year, month_captures in to_archive.items():
            for month, captures in month_captures.items():
                dest_dir = archived_captures_dir / str(year) / str(month)
                dest_dir.mkdir(parents=True, exist_ok=True)
                if (dest_dir / 'index').exists():
                    # Merge into the existing index instead of clobbering it.
                    with (dest_dir / 'index').open('r') as _f:
                        current_index = {uuid: dirname for uuid, dirname in csv.reader(_f)}
                else:
                    current_index = {}
                for capture_path, uuid in captures:
                    current_index[uuid] = capture_path.name
                    capture_path.rename(dest_dir / capture_path.name)
                    archived_uuids[uuid] = str(dest_dir / capture_path.name)
                with (dest_dir / 'index').open('w') as _f:
                    index_writer = csv.writer(_f)
                    for uuid, dirname in current_index.items():
                        index_writer.writerow([uuid, dirname])

        if archived_uuids:
            # Swap the archived UUIDs from the live lookup hash to the
            # archived one, and drop them from the in-memory index cache.
            lookyloo.redis.hdel('lookup_dirs', *archived_uuids.keys())
            lookyloo.redis.hset('lookup_dirs_archived', mapping=archived_uuids)
            lookyloo.clear_captures_index_cache(archived_uuids.keys())
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Entry point: run the archiver daemon, waking up once a day."""
    archiver = Archiver()
    archiver.run(sleep_in_sec=3600 * 24)
|
||||||
|
|
||||||
|
|
||||||
|
# Allow running this module directly as a script.
if __name__ == '__main__':
    main()
|
|
@ -27,7 +27,7 @@ class BackgroundIndexer(AbstractManager):
|
||||||
self._check_indexes()
|
self._check_indexes()
|
||||||
|
|
||||||
def _build_missing_pickles(self):
|
def _build_missing_pickles(self):
|
||||||
for uuid_path in self.lookyloo.capture_dir.glob('*/uuid'):
|
for uuid_path in sorted(self.lookyloo.capture_dir.glob('*/uuid'), reverse=True):
|
||||||
if (uuid_path.parent / 'tree.pickle').exists():
|
if (uuid_path.parent / 'tree.pickle').exists():
|
||||||
continue
|
continue
|
||||||
lock_file = uuid_path.parent / 'lock'
|
lock_file = uuid_path.parent / 'lock'
|
||||||
|
@ -45,15 +45,17 @@ class BackgroundIndexer(AbstractManager):
|
||||||
|
|
||||||
with uuid_path.open() as f:
|
with uuid_path.open() as f:
|
||||||
uuid = f.read()
|
uuid = f.read()
|
||||||
|
if not self.lookyloo.redis.hexists('lookup_dirs', uuid):
|
||||||
|
# The capture with this UUID exists, but it is for some reason missing in lookup_dirs
|
||||||
|
self.lookyloo.redis.hset('lookup_dirs', uuid, str(uuid_path.parent))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.logger.info(f'Build pickle for {uuid}: {uuid_path.parent.name}')
|
self.logger.info(f'Build pickle for {uuid}: {uuid_path.parent.name}')
|
||||||
self.lookyloo.get_crawled_tree(uuid)
|
self.lookyloo.get_crawled_tree(uuid)
|
||||||
self.lookyloo.trigger_modules(uuid, auto_trigger=True)
|
self.lookyloo.trigger_modules(uuid, auto_trigger=True)
|
||||||
self.logger.info(f'Pickle for {uuid} build.')
|
self.logger.info(f'Pickle for {uuid} build.')
|
||||||
except MissingUUID:
|
except MissingUUID:
|
||||||
# The cache is not up-to-date, but the UUID definitely exists in the captures.
|
self.logger.warning(f'Unable to find {uuid}. That should not happen.')
|
||||||
self.logger.warning(f'Unable to find {uuid}, re-triggering the cache.')
|
|
||||||
self.lookyloo._set_capture_cache(uuid_path.parent, force=True)
|
|
||||||
except NoValidHarFile:
|
except NoValidHarFile:
|
||||||
self.logger.warning(f'Unable to build pickle for {uuid}: {uuid_path.parent.name}')
|
self.logger.warning(f'Unable to build pickle for {uuid}: {uuid_path.parent.name}')
|
||||||
# The capture is not working, moving it away.
|
# The capture is not working, moving it away.
|
||||||
|
|
|
@ -99,9 +99,6 @@ class Lookyloo():
|
||||||
self.context = Context(self.sanejs)
|
self.context = Context(self.sanejs)
|
||||||
self._captures_index: Dict[str, CaptureCache] = {}
|
self._captures_index: Dict[str, CaptureCache] = {}
|
||||||
|
|
||||||
if not self.redis.exists('cache_loaded'):
|
|
||||||
self._init_existing_dumps()
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def redis(self):
|
def redis(self):
|
||||||
return Redis(connection_pool=self.redis_pool)
|
return Redis(connection_pool=self.redis_pool)
|
||||||
|
@ -186,8 +183,6 @@ class Lookyloo():
|
||||||
self._ensure_meta(capture_dir, ct)
|
self._ensure_meta(capture_dir, ct)
|
||||||
self._resolve_dns(ct)
|
self._resolve_dns(ct)
|
||||||
self.context.contextualize_tree(ct)
|
self.context.contextualize_tree(ct)
|
||||||
# Force update cache of the capture (takes care of the incomplete redirect key)
|
|
||||||
self._set_capture_cache(capture_dir, force=True)
|
|
||||||
cache = self.capture_cache(capture_uuid)
|
cache = self.capture_cache(capture_uuid)
|
||||||
if not cache:
|
if not cache:
|
||||||
raise LookylooException(f'Broken cache for {capture_dir}')
|
raise LookylooException(f'Broken cache for {capture_dir}')
|
||||||
|
@ -310,13 +305,14 @@ class Lookyloo():
|
||||||
remove_pickle_tree(capture_dir)
|
remove_pickle_tree(capture_dir)
|
||||||
|
|
||||||
def rebuild_cache(self) -> None:
|
def rebuild_cache(self) -> None:
|
||||||
'''Flush and rebuild the redis cache. Doesn't remove the pickles.'''
|
'''Flush and rebuild the redis cache. Doesn't remove the pickles.
|
||||||
|
The cached captures will be rebuild when loading the index.'''
|
||||||
self.redis.flushdb()
|
self.redis.flushdb()
|
||||||
self._init_existing_dumps()
|
|
||||||
|
|
||||||
def rebuild_all(self) -> None:
|
def rebuild_all(self) -> None:
|
||||||
'''Flush and rebuild the redis cache, and delede all the pickles.'''
|
'''Flush and rebuild the redis cache, and delete all the pickles.
|
||||||
[remove_pickle_tree(capture_dir) for capture_dir in self.capture_dirs] # type: ignore
|
The captures will be rebuilt by the background indexer'''
|
||||||
|
[remove_pickle_tree(capture_dir) for capture_dir in self.capture_dir.iterdir() if capture_dir.is_dir()] # type: ignore
|
||||||
self.rebuild_cache()
|
self.rebuild_cache()
|
||||||
|
|
||||||
def get_urlnode_from_tree(self, capture_uuid: str, /, node_uuid: str) -> URLNode:
|
def get_urlnode_from_tree(self, capture_uuid: str, /, node_uuid: str) -> URLNode:
|
||||||
|
@ -468,15 +464,9 @@ class Lookyloo():
|
||||||
to_return[event_id].update(values)
|
to_return[event_id].update(values)
|
||||||
return to_return
|
return to_return
|
||||||
|
|
||||||
def _set_capture_cache(self, capture_dir: Path, force: bool=False, redis_pipeline: Optional[Redis]=None) -> None:
|
def _set_capture_cache(self, capture_dir: Path) -> Dict[str, Any]:
|
||||||
'''Populate the redis cache for a capture. Mostly used on the index page.'''
|
'''Populate the redis cache for a capture. Mostly used on the index page.
|
||||||
# NOTE: this method is called in the background indexer as a fallback.
|
NOTE: Doesn't require the pickle.'''
|
||||||
if force or not self.redis.exists(str(capture_dir)):
|
|
||||||
# (re)build cache
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
return
|
|
||||||
|
|
||||||
with (capture_dir / 'uuid').open() as f:
|
with (capture_dir / 'uuid').open() as f:
|
||||||
uuid = f.read().strip()
|
uuid = f.read().strip()
|
||||||
|
|
||||||
|
@ -513,10 +503,7 @@ class Lookyloo():
|
||||||
else:
|
else:
|
||||||
categories = []
|
categories = []
|
||||||
|
|
||||||
if not redis_pipeline:
|
p = self.redis.pipeline()
|
||||||
p = self.redis.pipeline()
|
|
||||||
else:
|
|
||||||
p = redis_pipeline
|
|
||||||
p.hset('lookup_dirs', uuid, str(capture_dir))
|
p.hset('lookup_dirs', uuid, str(capture_dir))
|
||||||
if error_cache:
|
if error_cache:
|
||||||
if 'HTTP Error' not in error_cache['error']:
|
if 'HTTP Error' not in error_cache['error']:
|
||||||
|
@ -551,10 +538,10 @@ class Lookyloo():
|
||||||
cache['parent'] = f.read().strip()
|
cache['parent'] = f.read().strip()
|
||||||
|
|
||||||
p.hmset(str(capture_dir), cache)
|
p.hmset(str(capture_dir), cache)
|
||||||
if not redis_pipeline:
|
p.execute()
|
||||||
p.execute()
|
|
||||||
# If the cache is re-created for some reason, pop from the local cache.
|
# If the cache is re-created for some reason, pop from the local cache.
|
||||||
self._captures_index.pop(uuid, None)
|
self._captures_index.pop(uuid, None)
|
||||||
|
return cache
|
||||||
|
|
||||||
def hide_capture(self, capture_uuid: str, /) -> None:
|
def hide_capture(self, capture_uuid: str, /) -> None:
|
||||||
"""Add the capture in the hidden pool (not shown on the front page)
|
"""Add the capture in the hidden pool (not shown on the front page)
|
||||||
|
@ -580,7 +567,9 @@ class Lookyloo():
|
||||||
# No captures at all on the instance
|
# No captures at all on the instance
|
||||||
return []
|
return []
|
||||||
|
|
||||||
all_cache: List[CaptureCache] = [self._captures_index[uuid] for uuid in capture_uuids if uuid in self._captures_index and not self._captures_index[uuid].incomplete_redirects]
|
all_cache: List[CaptureCache] = [self._captures_index[uuid] for uuid in capture_uuids
|
||||||
|
if (uuid in self._captures_index
|
||||||
|
and not self._captures_index[uuid].incomplete_redirects)]
|
||||||
|
|
||||||
captures_to_get = set(capture_uuids) - set(self._captures_index.keys())
|
captures_to_get = set(capture_uuids) - set(self._captures_index.keys())
|
||||||
if captures_to_get:
|
if captures_to_get:
|
||||||
|
@ -593,60 +582,47 @@ class Lookyloo():
|
||||||
if not c:
|
if not c:
|
||||||
continue
|
continue
|
||||||
c = CaptureCache(c)
|
c = CaptureCache(c)
|
||||||
if c.incomplete_redirects:
|
|
||||||
self._set_capture_cache(c.capture_dir, force=True)
|
|
||||||
c = self.capture_cache(c.uuid)
|
|
||||||
if hasattr(c, 'timestamp'):
|
if hasattr(c, 'timestamp'):
|
||||||
all_cache.append(c)
|
all_cache.append(c)
|
||||||
self._captures_index[c.uuid] = c
|
self._captures_index[c.uuid] = c
|
||||||
all_cache.sort(key=operator.attrgetter('timestamp'), reverse=True)
|
all_cache.sort(key=operator.attrgetter('timestamp'), reverse=True)
|
||||||
return all_cache
|
return all_cache
|
||||||
|
|
||||||
|
def clear_captures_index_cache(self, uuids: Iterable[str]) -> None:
|
||||||
|
[self._captures_index.pop(uuid) for uuid in uuids if uuid in self._captures_index]
|
||||||
|
|
||||||
def capture_cache(self, capture_uuid: str, /) -> Optional[CaptureCache]:
|
def capture_cache(self, capture_uuid: str, /) -> Optional[CaptureCache]:
|
||||||
"""Get the cache from redis.
|
"""Get the cache from redis."""
|
||||||
NOTE: Doesn't try to build the pickle"""
|
|
||||||
if capture_uuid in self._captures_index:
|
if capture_uuid in self._captures_index:
|
||||||
|
if self._captures_index[capture_uuid].incomplete_redirects:
|
||||||
|
# Try to rebuild the cache
|
||||||
|
capture_dir = self._get_capture_dir(capture_uuid)
|
||||||
|
cached = self._set_capture_cache(capture_dir)
|
||||||
|
self._captures_index[capture_uuid] = CaptureCache(cached)
|
||||||
return self._captures_index[capture_uuid]
|
return self._captures_index[capture_uuid]
|
||||||
capture_dir = self._get_capture_dir(capture_uuid)
|
capture_dir = self._get_capture_dir(capture_uuid)
|
||||||
cached: Dict[str, Any] = self.redis.hgetall(str(capture_dir))
|
if not capture_dir:
|
||||||
if not cached:
|
self.logger.warning(f'No directory for {capture_uuid}.')
|
||||||
self.logger.warning(f'No cache available for {capture_dir}.')
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
cached = self.redis.hgetall(str(capture_dir))
|
||||||
|
if not cached:
|
||||||
|
cached = self._set_capture_cache(capture_dir)
|
||||||
try:
|
try:
|
||||||
cc = CaptureCache(cached)
|
self._captures_index[capture_uuid] = CaptureCache(cached)
|
||||||
self._captures_index[cc.uuid] = cc
|
return self._captures_index[capture_uuid]
|
||||||
return cc
|
|
||||||
except LookylooException as e:
|
except LookylooException as e:
|
||||||
self.logger.warning(f'Cache ({capture_dir}) is invalid ({e}): {json.dumps(cached, indent=2)}')
|
self.logger.warning(f'Cache ({capture_dir}) is invalid ({e}): {json.dumps(cached, indent=2)}')
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _init_existing_dumps(self) -> None:
|
|
||||||
'''Initialize the cache for all the captures'''
|
|
||||||
p = self.redis.pipeline()
|
|
||||||
for capture_dir in self.capture_dirs:
|
|
||||||
if capture_dir.exists():
|
|
||||||
self._set_capture_cache(capture_dir, redis_pipeline=p)
|
|
||||||
p.set('cache_loaded', 1)
|
|
||||||
p.execute()
|
|
||||||
|
|
||||||
@property
|
|
||||||
def capture_dirs(self) -> List[Path]:
|
|
||||||
'''Get all the capture directories, sorder from newest to oldest.'''
|
|
||||||
for capture_dir in self.capture_dir.iterdir():
|
|
||||||
if capture_dir.is_dir() and not capture_dir.iterdir():
|
|
||||||
# Cleanup self.capture_dir of failed runs.
|
|
||||||
capture_dir.rmdir()
|
|
||||||
if not (capture_dir / 'uuid').exists():
|
|
||||||
# Create uuid if missing
|
|
||||||
with (capture_dir / 'uuid').open('w') as f:
|
|
||||||
f.write(str(uuid4()))
|
|
||||||
return sorted(self.capture_dir.iterdir(), reverse=True)
|
|
||||||
|
|
||||||
def _get_capture_dir(self, capture_uuid: str, /) -> Path:
|
def _get_capture_dir(self, capture_uuid: str, /) -> Path:
|
||||||
'''Use the cache to get a capture directory from a capture UUID'''
|
'''Use the cache to get a capture directory from a capture UUID'''
|
||||||
capture_dir: str = self.redis.hget('lookup_dirs', capture_uuid)
|
capture_dir: str = self.redis.hget('lookup_dirs', capture_uuid)
|
||||||
if not capture_dir:
|
if not capture_dir:
|
||||||
raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
|
# Try in the archive
|
||||||
|
capture_dir = self.redis.hget('lookup_dirs_archived', capture_uuid)
|
||||||
|
if not capture_dir:
|
||||||
|
raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
|
||||||
to_return = Path(capture_dir)
|
to_return = Path(capture_dir)
|
||||||
if not to_return.exists():
|
if not to_return.exists():
|
||||||
# The capture was removed, remove the UUID
|
# The capture was removed, remove the UUID
|
||||||
|
@ -970,8 +946,7 @@ class Lookyloo():
|
||||||
cookies = item['cookies']
|
cookies = item['cookies']
|
||||||
with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
|
with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
|
||||||
json.dump(cookies, _cookies)
|
json.dump(cookies, _cookies)
|
||||||
|
self.redis.hset('lookup_dirs', perma_uuid, str(dirpath))
|
||||||
self._set_capture_cache(dirpath)
|
|
||||||
return perma_uuid
|
return perma_uuid
|
||||||
|
|
||||||
def get_body_hash_investigator(self, body_hash: str, /) -> Tuple[List[Tuple[str, str]], List[Tuple[str, float]]]:
|
def get_body_hash_investigator(self, body_hash: str, /) -> Tuple[List[Tuple[str, str]], List[Tuple[str, float]]]:
|
||||||
|
|
|
@ -31,6 +31,7 @@ stop = "bin.stop:main"
|
||||||
rebuild_caches = "bin.rebuild_caches:main"
|
rebuild_caches = "bin.rebuild_caches:main"
|
||||||
update = "bin.update:main"
|
update = "bin.update:main"
|
||||||
background_indexer = "bin.background_indexer:main"
|
background_indexer = "bin.background_indexer:main"
|
||||||
|
archiver = "bin.archiver:main"
|
||||||
|
|
||||||
|
|
||||||
[tool.poetry.dependencies]
|
[tool.poetry.dependencies]
|
||||||
|
|
Loading…
Reference in New Issue