diff --git a/bin/archiver.py b/bin/archiver.py
index 3194d60a..40ca8a75 100755
--- a/bin/archiver.py
+++ b/bin/archiver.py
@@ -41,17 +41,15 @@ class Archiver(AbstractManager):
         # Format:
         # { 2020: { 12: [(directory, uuid)] } }
         to_archive: Dict[int, Dict[int, List[Tuple[Path, str]]]] = defaultdict(lambda: defaultdict(list))
-        for capture_path in get_captures_dir().glob('*'):
-            if not capture_path.is_dir():
-                continue
-            timestamp = datetime.strptime(capture_path.name, '%Y-%m-%dT%H:%M:%S.%f')
+        for capture_uuid in get_captures_dir().glob('**/uuid'):
+            timestamp = datetime.strptime(capture_uuid.parent.name, '%Y-%m-%dT%H:%M:%S.%f')
             if timestamp.date() >= cut_time:
                 # do not archive.
                 continue
-            with (capture_path / 'uuid').open() as _f:
+            with capture_uuid.open() as _f:
                 uuid = _f.read().strip()
-            to_archive[timestamp.year][timestamp.month].append((capture_path, uuid))
-            self.logger.info(f'Archiving {capture_path}.')
+            to_archive[timestamp.year][timestamp.month].append((capture_uuid.parent, uuid))
+            self.logger.info(f'Archiving {capture_uuid.parent}.')
 
         if not to_archive:
             self.logger.info('Nothing to archive.')
@@ -78,6 +76,10 @@ class Archiver(AbstractManager):
 
         if archived_uuids:
             p = self.redis.pipeline()
+            for dir_key in self.redis.hmget('lookup_dirs', *archived_uuids.keys()):
+                # Clear cache
+                if dir_key:
+                    p.delete(dir_key)
             p.hdel('lookup_dirs', *archived_uuids.keys())
             p.hmset('lookup_dirs_archived', archived_uuids)  # type: ignore
             p.execute()
@@ -86,13 +88,10 @@ class Archiver(AbstractManager):
     def _load_archives(self):
         # Initialize archives
         self.redis.delete('lookup_dirs_archived')
-        for year in self.archived_captures_dir.iterdir():
-            for month in year.iterdir():
-                if not (month / 'index').exists():
-                    continue
-                with (month / 'index').open('r') as _f:
-                    archived_uuids: Dict[str, str] = {uuid: str(month / dirname) for uuid, dirname in csv.reader(_f)}
-                self.redis.hmset('lookup_dirs_archived', archived_uuids)  # type: ignore
+        for index in self.archived_captures_dir.glob('**/index'):
+            with index.open('r') as _f:
+                archived_uuids: Dict[str, str] = {uuid: str(index.parent / dirname) for uuid, dirname in csv.reader(_f)}
+            self.redis.hmset('lookup_dirs_archived', archived_uuids)  # type: ignore
 
 
 def main():
diff --git a/bin/async_capture.py b/bin/async_capture.py
index d1cf7631..66ad77f2 100755
--- a/bin/async_capture.py
+++ b/bin/async_capture.py
@@ -115,7 +115,8 @@ class AsyncCapture(AbstractManager):
             self.logger.critical(f'Something went terribly wrong when capturing {url}.')
             return False
         width = len(str(len(items)))
-        dirpath = self.capture_dir / datetime.now().isoformat()
+        now = datetime.now()
+        dirpath = self.capture_dir / str(now.year) / f'{now.month:02}' / now.isoformat()
         safe_create_dir(dirpath)
 
         if os or browser:
diff --git a/bin/background_indexer.py b/bin/background_indexer.py
index 68eda51d..3f3179c8 100755
--- a/bin/background_indexer.py
+++ b/bin/background_indexer.py
@@ -27,7 +27,7 @@ class BackgroundIndexer(AbstractManager):
         self._check_indexes()
 
     def _build_missing_pickles(self):
-        for uuid_path in sorted(self.lookyloo.capture_dir.glob('*/uuid'), reverse=True):
+        for uuid_path in sorted(self.lookyloo.capture_dir.glob('**/uuid'), reverse=True):
             if (uuid_path.parent / 'tree.pickle').exists():
                 continue
             lock_file = uuid_path.parent / 'lock'
diff --git a/bin/start.py b/bin/start.py
index 16f85685..e10d5d32 100755
--- a/bin/start.py
+++ b/bin/start.py
@@ -13,6 +13,9 @@ def main():
     p.check_returncode()
     print('done.')
     print('Reload UUIDs index...')
+    print('If this is taking too long, it means you have a lot of captures.')
+    print('You should run tools/change_captures_dir.py to re-organize the capture directory by year and month.')
+    print('You may also want to archive more captures.')
     reload_uuids_index()
     print('done.')
     print('Start asynchronous ingestor...')
@@ -25,6 +28,9 @@ def main():
     print('Start background processing...')
     Popen(['processing'])
     print('done.')
+    print('Start archiving process...')
+    Popen(['archiver'])
+    print('done.')
     print('Start website...')
     Popen(['start_website'])
     print('done.')
diff --git a/lookyloo/helpers.py b/lookyloo/helpers.py
index 8382d537..55231812 100644
--- a/lookyloo/helpers.py
+++ b/lookyloo/helpers.py
@@ -371,7 +371,7 @@ def get_useragent_for_requests():
 
 def reload_uuids_index() -> None:
     recent_uuids: Dict[str, str] = {}
-    for uuid_path in sorted(get_captures_dir().glob('*/uuid'), reverse=True):
+    for uuid_path in get_captures_dir().glob('**/uuid'):
         with uuid_path.open() as f:
             uuid = f.read()
         recent_uuids[uuid] = str(uuid_path.parent)
@@ -380,7 +380,7 @@ def reload_uuids_index() -> None:
     r = Redis(unix_socket_path=get_socket_path('cache'))
     p = r.pipeline()
     p.delete('lookup_dirs')
-    p.hset('lookup_dirs', mapping=recent_uuids)  # type: ignore
+    p.hmset('lookup_dirs', recent_uuids)  # type: ignore
     p.execute()
 
 
diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py
index 83cf7a68..168eda3e 100644
--- a/lookyloo/lookyloo.py
+++ b/lookyloo/lookyloo.py
@@ -100,6 +100,7 @@ class Lookyloo():
         if capture_dir and not Path(capture_dir).exists():
             # The capture was either removed or archived, cleaning up
             self.redis.hdel('lookup_dirs', capture_uuid)
+            self.redis.delete(capture_dir)
             capture_dir = None
         if not capture_dir:
             # Try in the archive
@@ -141,7 +142,11 @@ class Lookyloo():
             with metafile.open('w') as f:
                 json.dump(to_dump, f)
 
-        capture_dir = self._get_capture_dir(capture_uuid)
+        try:
+            capture_dir = self._get_capture_dir(capture_uuid)
+        except MissingUUID:
+            raise MissingCaptureDirectory(f'Unable to find the directory for {capture_uuid}')
+
         har_files = sorted(capture_dir.glob('*.har'))
         lock_file = capture_dir / 'lock'
         pickle_file = capture_dir / 'tree.pickle'
@@ -536,11 +541,14 @@ class Lookyloo():
             if not directory:
                 continue
             p.hgetall(directory)
-        for c in p.execute():
-            if not c:
-                continue
+        for uuid, c in zip(captures_to_get, p.execute()):
             try:
-                c = CaptureCache(c)
+                if not c:
+                    c = self.capture_cache(uuid)
+                    if not c:
+                        continue
+                else:
+                    c = CaptureCache(c)
             except LookylooException as e:
                 self.logger.warning(e)
                 continue
@@ -554,8 +562,9 @@ class Lookyloo():
         """Get the cache from redis."""
         if capture_uuid in self._captures_index and not self._captures_index[capture_uuid].incomplete_redirects:
             return self._captures_index[capture_uuid]
-        capture_dir = self._get_capture_dir(capture_uuid)
-        if not capture_dir:
+        try:
+            capture_dir = self._get_capture_dir(capture_uuid)
+        except MissingUUID:
             self.logger.warning(f'No directory for {capture_uuid}.')
             return None
 
@@ -575,7 +584,10 @@ class Lookyloo():
     def get_crawled_tree(self, capture_uuid: str, /) -> CrawledTree:
         '''Get the generated tree in ETE Toolkit format.
        Loads the pickle if it exists, creates it otherwise.'''
-        capture_dir = self._get_capture_dir(capture_uuid)
+        try:
+            capture_dir = self._get_capture_dir(capture_uuid)
+        except MissingUUID:
+            raise MissingCaptureDirectory(f'Unable to find the directory for {capture_uuid}')
         ct = load_pickle_tree(capture_dir)
         if not ct:
             ct = self._cache_capture(capture_uuid)
diff --git a/tools/change_captures_dir.py b/tools/change_captures_dir.py
new file mode 100755
index 00000000..09b6731f
--- /dev/null
+++ b/tools/change_captures_dir.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+from datetime import datetime
+from pathlib import Path
+
+from redis import Redis
+
+from lookyloo.helpers import get_captures_dir, safe_create_dir, get_socket_path
+
+
+def rename_captures():
+    r = Redis(unix_socket_path=get_socket_path('cache'))
+    capture_dir: Path = get_captures_dir()
+    for uuid_path in capture_dir.glob('*/uuid'):
+        with uuid_path.open() as f:
+            uuid = f.read()
+        dir_key = r.hget('lookup_dirs', uuid)
+        r.hdel('lookup_dirs', uuid)
+        if dir_key:
+            r.delete(dir_key)
+        timestamp = datetime.strptime(uuid_path.parent.name, '%Y-%m-%dT%H:%M:%S.%f')
+        dest_dir = capture_dir / str(timestamp.year) / f'{timestamp.month:02}'
+        safe_create_dir(dest_dir)
+        uuid_path.parent.rename(dest_dir / uuid_path.parent.name)
+
+
+if __name__ == '__main__':
+    rename_captures()
diff --git a/website/web/__init__.py b/website/web/__init__.py
index 25fe062a..0d540087 100644
--- a/website/web/__init__.py
+++ b/website/web/__init__.py
@@ -554,9 +554,8 @@ def tree(tree_uuid: str, node_uuid: Optional[str]=None):
    if tree_uuid == 'False':
         flash("Unable to process your request. The domain may not exist, or splash isn't started", 'error')
         return redirect(url_for('index'))
-    try:
-        cache = lookyloo.capture_cache(tree_uuid)
-    except MissingUUID:
+    cache = lookyloo.capture_cache(tree_uuid)
+    if not cache:
         status = get_capture_status(tree_uuid)
         splash_up, splash_message = splash_status()
         if not splash_up:
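
The layout this patch introduces, in a nutshell: a capture now lives under <captures>/<year>/<month>/<ISO-8601 timestamp>/ instead of a flat top-level directory, and every consumer (archiver, background indexer, reload_uuids_index()) discovers captures with a recursive **/uuid glob. Below is a minimal standalone sketch of that scheme, not part of the patch; the 'captures' root (a stand-in for get_captures_dir()) and the dummy UUID are illustrative assumptions, and mkdir() stands in for safe_create_dir().

#!/usr/bin/env python3
# Standalone sketch of the new capture layout; not part of the patch.
from datetime import datetime
from pathlib import Path

captures = Path('captures')  # assumption: stand-in for get_captures_dir()

# Write a capture directory the way async_capture.py now does:
# <captures>/<year>/<zero-padded month>/<ISO-8601 timestamp>/uuid
now = datetime.now()
dirpath = captures / str(now.year) / f'{now.month:02}' / now.isoformat()
dirpath.mkdir(parents=True, exist_ok=True)  # stand-in for safe_create_dir()
(dirpath / 'uuid').write_text('00000000-0000-0000-0000-000000000000')  # dummy UUID

# Discover captures the way the archiver and reload_uuids_index() now do:
for uuid_path in captures.glob('**/uuid'):
    timestamp = datetime.strptime(uuid_path.parent.name, '%Y-%m-%dT%H:%M:%S.%f')
    print(uuid_path.read_text().strip(), '->', uuid_path.parent, timestamp.date())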