chg: Improve storage, support both modes.

pull/251/head
Raphaël Vinot 2021-08-26 15:49:19 +02:00
parent 407e78ae7f
commit d41b7735dd
8 changed files with 75 additions and 29 deletions
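
For context (not part of the diff itself): a capture can now live either directly under the captures directory (the old flat layout) or under a year/month subtree (the new layout). The recursive glob('**/uuid') pattern used throughout this commit matches the uuid marker file in either case, which is what "support both modes" refers to. A minimal sketch, with made-up directory names:

from pathlib import Path

# Hypothetical capture directories, both accepted after this change:
#   captures/2021-08-26T15:49:19.123456/uuid           <- old flat layout
#   captures/2021/08/2021-08-26T15:49:19.123456/uuid   <- new year/month layout
for uuid_file in Path('captures').glob('**/uuid'):
    print(uuid_file.parent)  # the capture directory, whatever the layout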

View File

@@ -41,17 +41,15 @@ class Archiver(AbstractManager):
         # Format:
         # { 2020: { 12: [(directory, uuid)] } }
         to_archive: Dict[int, Dict[int, List[Tuple[Path, str]]]] = defaultdict(lambda: defaultdict(list))
-        for capture_path in get_captures_dir().glob('*'):
-            if not capture_path.is_dir():
-                continue
-            timestamp = datetime.strptime(capture_path.name, '%Y-%m-%dT%H:%M:%S.%f')
+        for capture_uuid in get_captures_dir().glob('**/uuid'):
+            timestamp = datetime.strptime(capture_uuid.parent.name, '%Y-%m-%dT%H:%M:%S.%f')
             if timestamp.date() >= cut_time:
                 # do not archive.
                 continue
-            with (capture_path / 'uuid').open() as _f:
+            with capture_uuid.open() as _f:
                 uuid = _f.read().strip()
-            to_archive[timestamp.year][timestamp.month].append((capture_path, uuid))
-            self.logger.info(f'Archiving {capture_path}.')
+            to_archive[timestamp.year][timestamp.month].append((capture_uuid.parent, uuid))
+            self.logger.info(f'Archiving {capture_uuid.parent}.')
         if not to_archive:
             self.logger.info('Nothing to archive.')
@@ -78,6 +76,10 @@ class Archiver(AbstractManager):
         if archived_uuids:
             p = self.redis.pipeline()
+            for dir_key in self.redis.hmget('lookup_dirs', *archived_uuids.keys()):
+                # Clear cache
+                if dir_key:
+                    p.delete(dir_key)
             p.hdel('lookup_dirs', *archived_uuids.keys())
             p.hmset('lookup_dirs_archived', archived_uuids)  # type: ignore
             p.execute()
@@ -86,12 +88,9 @@ class Archiver(AbstractManager):
     def _load_archives(self):
         # Initialize archives
         self.redis.delete('lookup_dirs_archived')
-        for year in self.archived_captures_dir.iterdir():
-            for month in year.iterdir():
-                if not (month / 'index').exists():
-                    continue
-                with (month / 'index').open('r') as _f:
-                    archived_uuids: Dict[str, str] = {uuid: str(month / dirname) for uuid, dirname in csv.reader(_f)}
+        for index in self.archived_captures_dir.glob('**/index'):
+            with index.open('r') as _f:
+                archived_uuids: Dict[str, str] = {uuid: str(index.parent / dirname) for uuid, dirname in csv.reader(_f)}
             self.redis.hmset('lookup_dirs_archived', archived_uuids)  # type: ignore
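
A note on the hunk above: each archived month directory carries an index file that csv.reader parses row by row, each row mapping a capture UUID to the name of its capture directory, so index.parent / dirname resolves to the archived capture directory wherever the index sits. A hypothetical row (both values invented):

00000000-0000-0000-0000-000000000000,2020-12-01T10:21:33.123456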

View File

@@ -115,7 +115,8 @@ class AsyncCapture(AbstractManager):
             self.logger.critical(f'Something went terribly wrong when capturing {url}.')
             return False
         width = len(str(len(items)))
-        dirpath = self.capture_dir / datetime.now().isoformat()
+        now = datetime.now()
+        dirpath = self.capture_dir / str(now.year) / f'{now.month:02}' / now.isoformat()
         safe_create_dir(dirpath)
         if os or browser:
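
For illustration (timestamp invented): a capture started on 2021-08-26 is now written to <capture_dir>/2021/08/2021-08-26T15:49:19.123456/ instead of directly under <capture_dir>/, so the top-level captures directory no longer grows without bound.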

View File

@@ -27,7 +27,7 @@ class BackgroundIndexer(AbstractManager):
         self._check_indexes()

     def _build_missing_pickles(self):
-        for uuid_path in sorted(self.lookyloo.capture_dir.glob('*/uuid'), reverse=True):
+        for uuid_path in sorted(self.lookyloo.capture_dir.glob('**/uuid'), reverse=True):
             if (uuid_path.parent / 'tree.pickle').exists():
                 continue
             lock_file = uuid_path.parent / 'lock'

View File

@@ -13,6 +13,9 @@ def main():
     p.check_returncode()
     print('done.')
     print('Reload UUIDs index...')
+    print('If this is taking too long, it means you have a lot of captures.')
+    print('You should run tools/change_captures_dir.py to re-organize the capture directory by year and month.')
+    print('You may also want to archive more captures.')
     reload_uuids_index()
     print('done.')
     print('Start asynchronous ingestor...')
@@ -25,6 +28,9 @@ def main():
     print('Start background processing...')
     Popen(['processing'])
     print('done.')
+    print('Start archiving process...')
+    Popen(['archiver'])
+    print('done.')
     print('Start website...')
     Popen(['start_website'])
     print('done.')

View File

@@ -371,7 +371,7 @@ def get_useragent_for_requests():
 def reload_uuids_index() -> None:
     recent_uuids: Dict[str, str] = {}
-    for uuid_path in sorted(get_captures_dir().glob('*/uuid'), reverse=True):
+    for uuid_path in get_captures_dir().glob('**/uuid'):
         with uuid_path.open() as f:
             uuid = f.read()
         recent_uuids[uuid] = str(uuid_path.parent)
@@ -380,7 +380,7 @@ def reload_uuids_index() -> None:
     r = Redis(unix_socket_path=get_socket_path('cache'))
     p = r.pipeline()
     p.delete('lookup_dirs')
-    p.hset('lookup_dirs', mapping=recent_uuids)  # type: ignore
+    p.hmset('lookup_dirs', recent_uuids)  # type: ignore
     p.execute()
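
As a reading aid (not part of the diff): lookup_dirs is a Redis hash mapping each capture UUID to its on-disk directory, which is why the index has to be rebuilt once the directories move. A minimal lookup sketch, assuming a redis-py client on a local socket; the socket path and UUID are placeholders:

from redis import Redis

r = Redis(unix_socket_path='cache.sock', decode_responses=True)
capture_dir = r.hget('lookup_dirs', '00000000-0000-0000-0000-000000000000')
print(capture_dir)  # e.g. 'scraped/2021/08/2021-08-26T15:49:19.123456' in the new layout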

View File

@@ -100,6 +100,7 @@ class Lookyloo():
         if capture_dir and not Path(capture_dir).exists():
             # The capture was either removed or archived, cleaning up
             self.redis.hdel('lookup_dirs', capture_uuid)
+            self.redis.delete(capture_dir)
             capture_dir = None
         if not capture_dir:
             # Try in the archive
@@ -141,7 +142,11 @@ class Lookyloo():
         with metafile.open('w') as f:
             json.dump(to_dump, f)

+        try:
+            capture_dir = self._get_capture_dir(capture_uuid)
+        except MissingUUID:
+            raise MissingCaptureDirectory(f'Unable to find the directory for {capture_uuid}')
         har_files = sorted(capture_dir.glob('*.har'))
         lock_file = capture_dir / 'lock'
         pickle_file = capture_dir / 'tree.pickle'
@@ -536,10 +541,13 @@ class Lookyloo():
             if not directory:
                 continue
             p.hgetall(directory)
-        for c in p.execute():
-            if not c:
-                continue
-            try:
-                c = CaptureCache(c)
+        for uuid, c in zip(captures_to_get, p.execute()):
+            try:
+                if not c:
+                    c = self.capture_cache(uuid)
+                    if not c:
+                        continue
+                else:
+                    c = CaptureCache(c)
             except LookylooException as e:
                 self.logger.warning(e)
@@ -554,8 +562,9 @@ class Lookyloo():
         """Get the cache from redis."""
         if capture_uuid in self._captures_index and not self._captures_index[capture_uuid].incomplete_redirects:
             return self._captures_index[capture_uuid]
-        capture_dir = self._get_capture_dir(capture_uuid)
-        if not capture_dir:
+        try:
+            capture_dir = self._get_capture_dir(capture_uuid)
+        except MissingUUID:
             self.logger.warning(f'No directory for {capture_uuid}.')
             return None
@@ -575,7 +584,10 @@ class Lookyloo():
     def get_crawled_tree(self, capture_uuid: str, /) -> CrawledTree:
         '''Get the generated tree in ETE Toolkit format.
         Loads the pickle if it exists, creates it otherwise.'''
-        capture_dir = self._get_capture_dir(capture_uuid)
+        try:
+            capture_dir = self._get_capture_dir(capture_uuid)
+        except MissingUUID:
+            raise MissingCaptureDirectory(f'Unable to find the directory for {capture_uuid}')
         ct = load_pickle_tree(capture_dir)
         if not ct:
             ct = self._cache_capture(capture_uuid)
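
The hunks above share one new contract: _get_capture_dir raises MissingUUID instead of returning None, stale lookup_dirs entries (and their per-directory cache key) are dropped when the directory is gone, and the archive index is used as a fallback. A simplified sketch of that lookup flow, assuming a redis-py client r with decoded responses; this is an illustration, not the real method:

from pathlib import Path


class MissingUUID(Exception):
    """Stand-in for lookyloo's MissingUUID exception."""


def get_capture_dir(r, capture_uuid: str) -> str:
    # Live captures first.
    capture_dir = r.hget('lookup_dirs', capture_uuid)
    if capture_dir and not Path(capture_dir).exists():
        # The capture was removed or archived: drop the stale entry and its cache key.
        r.hdel('lookup_dirs', capture_uuid)
        r.delete(capture_dir)
        capture_dir = None
    if not capture_dir:
        # Fall back to the archive index maintained by the archiver.
        capture_dir = r.hget('lookup_dirs_archived', capture_uuid)
    if not capture_dir:
        raise MissingUUID(f'Unable to find the directory for {capture_uuid}')
    return capture_dir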

tools/change_captures_dir.py  (new executable file, 29 lines added)
View File

@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+from datetime import datetime
+from pathlib import Path
+
+from redis import Redis
+
+from lookyloo.helpers import get_captures_dir, safe_create_dir, get_socket_path
+
+
+def rename_captures():
+    r = Redis(unix_socket_path=get_socket_path('cache'))
+    capture_dir: Path = get_captures_dir()
+    for uuid_path in capture_dir.glob('*/uuid'):
+        with uuid_path.open() as f:
+            uuid = f.read()
+        dir_key = r.hget('lookup_dirs', uuid)
+        r.hdel('lookup_dirs', uuid)
+        if dir_key:
+            r.delete(dir_key)
+        timestamp = datetime.strptime(uuid_path.parent.name, '%Y-%m-%dT%H:%M:%S.%f')
+        dest_dir = capture_dir / str(timestamp.year) / f'{timestamp.month:02}'
+        safe_create_dir(dest_dir)
+        uuid_path.parent.rename(dest_dir / uuid_path.parent.name)
+
+
+if __name__ == '__main__':
+    rename_captures()
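
As the new start-up messages suggest, existing installations with many captures can run this script once, with the cache Redis up (it drops the matching lookup_dirs entries and cache keys as it moves each directory), to relocate every flat capture under the year/month layout; subsequent captures land there directly and the UUID index reload stays fast.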

View File

@@ -554,9 +554,8 @@ def tree(tree_uuid: str, node_uuid: Optional[str]=None):
     if tree_uuid == 'False':
         flash("Unable to process your request. The domain may not exist, or splash isn't started", 'error')
         return redirect(url_for('index'))
-    try:
-        cache = lookyloo.capture_cache(tree_uuid)
-    except MissingUUID:
+    cache = lookyloo.capture_cache(tree_uuid)
+    if not cache:
         status = get_capture_status(tree_uuid)
         splash_up, splash_message = splash_status()
         if not splash_up: