chg: Improve storage, support both modes.

pull/251/head
Raphaël Vinot 2021-08-26 15:49:19 +02:00
parent 407e78ae7f
commit d41b7735dd
8 changed files with 75 additions and 29 deletions


@@ -41,17 +41,15 @@ class Archiver(AbstractManager):
         # Format:
         # { 2020: { 12: [(directory, uuid)] } }
         to_archive: Dict[int, Dict[int, List[Tuple[Path, str]]]] = defaultdict(lambda: defaultdict(list))
-        for capture_path in get_captures_dir().glob('*'):
-            if not capture_path.is_dir():
-                continue
-            timestamp = datetime.strptime(capture_path.name, '%Y-%m-%dT%H:%M:%S.%f')
+        for capture_uuid in get_captures_dir().glob('**/uuid'):
+            timestamp = datetime.strptime(capture_uuid.parent.name, '%Y-%m-%dT%H:%M:%S.%f')
             if timestamp.date() >= cut_time:
                 # do not archive.
                 continue
-            with (capture_path / 'uuid').open() as _f:
+            with capture_uuid.open() as _f:
                 uuid = _f.read().strip()
-            to_archive[timestamp.year][timestamp.month].append((capture_path, uuid))
-            self.logger.info(f'Archiving {capture_path}.')
+            to_archive[timestamp.year][timestamp.month].append((capture_uuid.parent, uuid))
+            self.logger.info(f'Archiving {capture_uuid.parent}.')

         if not to_archive:
             self.logger.info('Nothing to archive.')

@@ -78,6 +76,10 @@ class Archiver(AbstractManager):

         if archived_uuids:
             p = self.redis.pipeline()
+            for dir_key in self.redis.hmget('lookup_dirs', *archived_uuids.keys()):
+                # Clear cache
+                if dir_key:
+                    p.delete(dir_key)
             p.hdel('lookup_dirs', *archived_uuids.keys())
             p.hmset('lookup_dirs_archived', archived_uuids)  # type: ignore
             p.execute()

@@ -86,13 +88,10 @@ class Archiver(AbstractManager):
     def _load_archives(self):
         # Initialize archives
         self.redis.delete('lookup_dirs_archived')
-        for year in self.archived_captures_dir.iterdir():
-            for month in year.iterdir():
-                if not (month / 'index').exists():
-                    continue
-                with (month / 'index').open('r') as _f:
-                    archived_uuids: Dict[str, str] = {uuid: str(month / dirname) for uuid, dirname in csv.reader(_f)}
-                self.redis.hmset('lookup_dirs_archived', archived_uuids)  # type: ignore
+        for index in self.archived_captures_dir.glob('**/index'):
+            with index.open('r') as _f:
+                archived_uuids: Dict[str, str] = {uuid: str(index.parent / dirname) for uuid, dirname in csv.reader(_f)}
+            self.redis.hmset('lookup_dirs_archived', archived_uuids)  # type: ignore


 def main():
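
Note (not part of the commit): pathlib's `**` matches zero or more directory segments, so `glob('**/uuid')` finds capture directories both in the old flat layout and in the new `<year>/<month>/` layout — that is the mechanism behind "support both modes". A minimal, self-contained sketch with made-up paths:

from pathlib import Path

base = Path('/tmp/captures_demo')
flat = base / '2021-08-26T15:49:19.000000'                    # old flat mode
nested = base / '2021' / '08' / '2021-08-26T15:50:00.000000'  # new year/month mode
for capture in (flat, nested):
    capture.mkdir(parents=True, exist_ok=True)
    (capture / 'uuid').write_text('00000000-0000-0000-0000-000000000000')

# Both capture directories are found, whatever their nesting depth;
# the previous glob('*/uuid') style only saw the flat one.
print(sorted(p.parent.name for p in base.glob('**/uuid')))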


@@ -115,7 +115,8 @@ class AsyncCapture(AbstractManager):
             self.logger.critical(f'Something went terribly wrong when capturing {url}.')
             return False

         width = len(str(len(items)))
-        dirpath = self.capture_dir / datetime.now().isoformat()
+        now = datetime.now()
+        dirpath = self.capture_dir / str(now.year) / f'{now.month:02}' / now.isoformat()
         safe_create_dir(dirpath)
         if os or browser:
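
New captures are now written under a year / zero-padded month hierarchy. A quick sketch of the resulting path, with a stand-in directory for self.capture_dir:

from datetime import datetime
from pathlib import Path

capture_dir = Path('/tmp/captures_demo')  # stand-in for self.capture_dir
now = datetime.now()
dirpath = capture_dir / str(now.year) / f'{now.month:02}' / now.isoformat()
print(dirpath)  # e.g. /tmp/captures_demo/2021/08/2021-08-26T15:49:19.123456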


@@ -27,7 +27,7 @@ class BackgroundIndexer(AbstractManager):
             self._check_indexes()

     def _build_missing_pickles(self):
-        for uuid_path in sorted(self.lookyloo.capture_dir.glob('*/uuid'), reverse=True):
+        for uuid_path in sorted(self.lookyloo.capture_dir.glob('**/uuid'), reverse=True):
             if (uuid_path.parent / 'tree.pickle').exists():
                 continue
             lock_file = uuid_path.parent / 'lock'


@@ -13,6 +13,9 @@ def main():
     p.check_returncode()
     print('done.')
     print('Reload UUIDs index...')
+    print('If this is taking too long, it means you have a lot of captures.')
+    print('You should run tools/change_captures_dir.py to re-organize the capture directory by year and month.')
+    print('You may also want to archive more captures.')
     reload_uuids_index()
     print('done.')
     print('Start asynchronous ingestor...')

@@ -25,6 +28,9 @@ def main():
     print('Start background processing...')
     Popen(['processing'])
     print('done.')
+    print('Start archiving process...')
+    Popen(['archiver'])
+    print('done.')
     print('Start website...')
     Popen(['start_website'])
     print('done.')
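
Two operational changes here: the start script now warns operators with many captures to migrate to the year/month layout using tools/change_captures_dir.py (the new tool added below), and the archiver now runs at startup as its own long-lived process, next to the ingestor, the background processing, and the website.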


@@ -371,7 +371,7 @@ def get_useragent_for_requests():

 def reload_uuids_index() -> None:
     recent_uuids: Dict[str, str] = {}
-    for uuid_path in sorted(get_captures_dir().glob('*/uuid'), reverse=True):
+    for uuid_path in get_captures_dir().glob('**/uuid'):
         with uuid_path.open() as f:
             uuid = f.read()
         recent_uuids[uuid] = str(uuid_path.parent)

@@ -380,7 +380,7 @@ def reload_uuids_index() -> None:
     r = Redis(unix_socket_path=get_socket_path('cache'))
     p = r.pipeline()
     p.delete('lookup_dirs')
-    p.hset('lookup_dirs', mapping=recent_uuids)  # type: ignore
+    p.hmset('lookup_dirs', recent_uuids)  # type: ignore
     p.execute()
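
The second hunk reverts hset(mapping=...) to the older hmset, presumably to keep compatibility with pre-3.5 redis-py releases (hmset is deprecated in newer ones, hence the # type: ignore). Dropping sorted(..., reverse=True) in the first hunk is harmless: UUIDs are unique, so insertion order has no effect on the resulting hash. Both calls populate the same hash; a sketch assuming a made-up local socket path:

from redis import Redis

r = Redis(unix_socket_path='/tmp/cache.sock')  # assumed socket path
mapping = {'some-uuid': '/path/to/capture'}    # made-up entry
r.hmset('lookup_dirs', mapping)                # legacy call, works on old redis-py
r.hset('lookup_dirs', mapping=mapping)         # modern equivalent (redis-py >= 3.5)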


@@ -100,6 +100,7 @@ class Lookyloo():
         if capture_dir and not Path(capture_dir).exists():
             # The capture was either removed or archived, cleaning up
             self.redis.hdel('lookup_dirs', capture_uuid)
+            self.redis.delete(capture_dir)
             capture_dir = None
         if not capture_dir:
             # Try in the archive

@@ -141,7 +142,11 @@ class Lookyloo():
         with metafile.open('w') as f:
             json.dump(to_dump, f)

-        capture_dir = self._get_capture_dir(capture_uuid)
+        try:
+            capture_dir = self._get_capture_dir(capture_uuid)
+        except MissingUUID:
+            raise MissingCaptureDirectory(f'Unable to find the directory for {capture_uuid}')
+
         har_files = sorted(capture_dir.glob('*.har'))
         lock_file = capture_dir / 'lock'
         pickle_file = capture_dir / 'tree.pickle'

@@ -536,11 +541,14 @@ class Lookyloo():
             if not directory:
                 continue
             p.hgetall(directory)
-        for c in p.execute():
-            if not c:
-                continue
+        for uuid, c in zip(captures_to_get, p.execute()):
             try:
-                c = CaptureCache(c)
+                if not c:
+                    c = self.capture_cache(uuid)
+                    if not c:
+                        continue
+                else:
+                    c = CaptureCache(c)
             except LookylooException as e:
                 self.logger.warning(e)
                 continue

@@ -554,8 +562,9 @@ class Lookyloo():
         """Get the cache from redis."""
         if capture_uuid in self._captures_index and not self._captures_index[capture_uuid].incomplete_redirects:
             return self._captures_index[capture_uuid]
-        capture_dir = self._get_capture_dir(capture_uuid)
-        if not capture_dir:
+        try:
+            capture_dir = self._get_capture_dir(capture_uuid)
+        except MissingUUID:
             self.logger.warning(f'No directory for {capture_uuid}.')
             return None

@@ -575,7 +584,10 @@ class Lookyloo():
     def get_crawled_tree(self, capture_uuid: str, /) -> CrawledTree:
         '''Get the generated tree in ETE Toolkit format.
         Loads the pickle if it exists, creates it otherwise.'''
-        capture_dir = self._get_capture_dir(capture_uuid)
+        try:
+            capture_dir = self._get_capture_dir(capture_uuid)
+        except MissingUUID:
+            raise MissingCaptureDirectory(f'Unable to find the directory for {capture_uuid}')
         ct = load_pickle_tree(capture_dir)
         if not ct:
             ct = self._cache_capture(capture_uuid)
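
Taken together, these hunks make the directory lookup two-tiered: recent captures via 'lookup_dirs', archived ones via 'lookup_dirs_archived', with stale redis entries cleaned up along the way. A simplified reconstruction of the flow implied by the diff (assumed, not the actual implementation; MissingUUID stands in for the project's exception class, and the client is assumed to use decode_responses=True):

from pathlib import Path

class MissingUUID(Exception):  # stand-in for lookyloo's own exception class
    pass

def get_capture_dir_sketch(redis, capture_uuid: str) -> Path:
    # First tier: recent captures, indexed in 'lookup_dirs'.
    capture_dir = redis.hget('lookup_dirs', capture_uuid)
    if capture_dir and not Path(capture_dir).exists():
        # Removed or archived since it was indexed: drop the index entry
        # and the per-directory cache key (the new redis.delete call above).
        redis.hdel('lookup_dirs', capture_uuid)
        redis.delete(capture_dir)
        capture_dir = None
    if not capture_dir:
        # Second tier: the archive index maintained by the Archiver.
        capture_dir = redis.hget('lookup_dirs_archived', capture_uuid)
    if not capture_dir:
        raise MissingUUID(f'Unable to find the capture {capture_uuid}')
    return Path(capture_dir)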

tools/change_captures_dir.py (new executable file, +29 lines)

@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+from datetime import datetime
+from pathlib import Path
+
+from redis import Redis
+
+from lookyloo.helpers import get_captures_dir, safe_create_dir, get_socket_path
+
+
+def rename_captures():
+    r = Redis(unix_socket_path=get_socket_path('cache'))
+    capture_dir: Path = get_captures_dir()
+    for uuid_path in capture_dir.glob('*/uuid'):
+        with uuid_path.open() as f:
+            uuid = f.read()
+        dir_key = r.hget('lookup_dirs', uuid)
+        r.hdel('lookup_dirs', uuid)
+        if dir_key:
+            r.delete(dir_key)
+        timestamp = datetime.strptime(uuid_path.parent.name, '%Y-%m-%dT%H:%M:%S.%f')
+        dest_dir = capture_dir / str(timestamp.year) / f'{timestamp.month:02}'
+        safe_create_dir(dest_dir)
+        uuid_path.parent.rename(dest_dir / uuid_path.parent.name)
+
+
+if __name__ == '__main__':
+    rename_captures()
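
This is the migration tool the start script now points at: it only globs top-level capture directories ('*/uuid', i.e. the old flat mode), moves each one under <year>/<zero-padded month>/, and drops the matching redis entries so the index is rebuilt from the new paths on the next reload_uuids_index() run.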


@@ -554,9 +554,8 @@ def tree(tree_uuid: str, node_uuid: Optional[str]=None):
     if tree_uuid == 'False':
         flash("Unable to process your request. The domain may not exist, or splash isn't started", 'error')
         return redirect(url_for('index'))
-    try:
-        cache = lookyloo.capture_cache(tree_uuid)
-    except MissingUUID:
+    cache = lookyloo.capture_cache(tree_uuid)
+    if not cache:
         status = get_capture_status(tree_uuid)
         splash_up, splash_message = splash_status()
         if not splash_up:
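
Since capture_cache() now returns None instead of raising MissingUUID (see the @@ -554,8 +562,9 @@ hunk above), the tree view treats an absent cache as a capture that is still queued or in progress and falls back to the capture status and splash checks.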