chg: Improve storage, support both modes.

pull/251/head
Raphaël Vinot 2021-08-26 15:49:19 +02:00
parent 407e78ae7f
commit d41b7735dd
8 changed files with 75 additions and 29 deletions
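
For context (not part of the diff itself): a capture can now live either directly under the captures directory (the old flat layout) or under a year/month subtree (the new layout). The recursive glob('**/uuid') pattern used throughout this commit matches the uuid marker file in either case, which is what "support both modes" refers to. A minimal sketch, with made-up directory names:

from pathlib import Path

# Hypothetical capture directories, both accepted after this change:
#   captures/2021-08-26T15:49:19.123456/uuid           <- old flat layout
#   captures/2021/08/2021-08-26T15:49:19.123456/uuid   <- new year/month layout
for uuid_file in Path('captures').glob('**/uuid'):
    print(uuid_file.parent)  # the capture directory, whatever the layout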

View File

@@ -41,17 +41,15 @@ class Archiver(AbstractManager):
         # Format:
         # { 2020: { 12: [(directory, uuid)] } }
         to_archive: Dict[int, Dict[int, List[Tuple[Path, str]]]] = defaultdict(lambda: defaultdict(list))
-        for capture_path in get_captures_dir().glob('*'):
-            if not capture_path.is_dir():
-                continue
-            timestamp = datetime.strptime(capture_path.name, '%Y-%m-%dT%H:%M:%S.%f')
+        for capture_uuid in get_captures_dir().glob('**/uuid'):
+            timestamp = datetime.strptime(capture_uuid.parent.name, '%Y-%m-%dT%H:%M:%S.%f')
             if timestamp.date() >= cut_time:
                 # do not archive.
                 continue
-            with (capture_path / 'uuid').open() as _f:
+            with capture_uuid.open() as _f:
                 uuid = _f.read().strip()
-            to_archive[timestamp.year][timestamp.month].append((capture_path, uuid))
-            self.logger.info(f'Archiving {capture_path}.')
+            to_archive[timestamp.year][timestamp.month].append((capture_uuid.parent, uuid))
+            self.logger.info(f'Archiving {capture_uuid.parent}.')
         if not to_archive:
             self.logger.info('Nothing to archive.')
@@ -78,6 +76,10 @@ class Archiver(AbstractManager):
         if archived_uuids:
             p = self.redis.pipeline()
+            for dir_key in self.redis.hmget('lookup_dirs', *archived_uuids.keys()):
+                # Clear cache
+                if dir_key:
+                    p.delete(dir_key)
             p.hdel('lookup_dirs', *archived_uuids.keys())
             p.hmset('lookup_dirs_archived', archived_uuids)  # type: ignore
             p.execute()
@@ -86,12 +88,9 @@ class Archiver(AbstractManager):
     def _load_archives(self):
         # Initialize archives
         self.redis.delete('lookup_dirs_archived')
-        for year in self.archived_captures_dir.iterdir():
-            for month in year.iterdir():
-                if not (month / 'index').exists():
-                    continue
-                with (month / 'index').open('r') as _f:
-                    archived_uuids: Dict[str, str] = {uuid: str(month / dirname) for uuid, dirname in csv.reader(_f)}
+        for index in self.archived_captures_dir.glob('**/index'):
+            with index.open('r') as _f:
+                archived_uuids: Dict[str, str] = {uuid: str(index.parent / dirname) for uuid, dirname in csv.reader(_f)}
             self.redis.hmset('lookup_dirs_archived', archived_uuids)  # type: ignore
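
A note on the hunk above: each archived month directory carries an index file that csv.reader parses row by row, each row mapping a capture UUID to the name of its capture directory, so index.parent / dirname resolves to the archived capture directory wherever the index sits. A hypothetical row (both values invented):

00000000-0000-0000-0000-000000000000,2020-12-01T10:21:33.123456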

View File

@@ -115,7 +115,8 @@ class AsyncCapture(AbstractManager):
             self.logger.critical(f'Something went terribly wrong when capturing {url}.')
             return False
         width = len(str(len(items)))
-        dirpath = self.capture_dir / datetime.now().isoformat()
+        now = datetime.now()
+        dirpath = self.capture_dir / str(now.year) / f'{now.month:02}' / now.isoformat()
         safe_create_dir(dirpath)
         if os or browser:
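
For illustration (timestamp invented): a capture started on 2021-08-26 is now written to <capture_dir>/2021/08/2021-08-26T15:49:19.123456/ instead of directly under <capture_dir>/, so the top-level captures directory no longer grows without bound.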

View File

@@ -27,7 +27,7 @@ class BackgroundIndexer(AbstractManager):
         self._check_indexes()

     def _build_missing_pickles(self):
-        for uuid_path in sorted(self.lookyloo.capture_dir.glob('*/uuid'), reverse=True):
+        for uuid_path in sorted(self.lookyloo.capture_dir.glob('**/uuid'), reverse=True):
             if (uuid_path.parent / 'tree.pickle').exists():
                 continue
             lock_file = uuid_path.parent / 'lock'

View File

@@ -13,6 +13,9 @@ def main():
     p.check_returncode()
     print('done.')
     print('Reload UUIDs index...')
+    print('If this is taking too long, it means you have a lot of captures.')
+    print('You should run tools/change_captures_dir.py to re-organize the capture directory by year and month.')
+    print('You may also want to archive more captures.')
     reload_uuids_index()
     print('done.')
     print('Start asynchronous ingestor...')
@@ -25,6 +28,9 @@ def main():
     print('Start background processing...')
     Popen(['processing'])
     print('done.')
+    print('Start archiving process...')
+    Popen(['archiver'])
+    print('done.')
     print('Start website...')
     Popen(['start_website'])
     print('done.')

View File

@@ -371,7 +371,7 @@ def get_useragent_for_requests():
 def reload_uuids_index() -> None:
     recent_uuids: Dict[str, str] = {}
-    for uuid_path in sorted(get_captures_dir().glob('*/uuid'), reverse=True):
+    for uuid_path in get_captures_dir().glob('**/uuid'):
         with uuid_path.open() as f:
             uuid = f.read()
         recent_uuids[uuid] = str(uuid_path.parent)
@@ -380,7 +380,7 @@ def reload_uuids_index() -> None:
     r = Redis(unix_socket_path=get_socket_path('cache'))
     p = r.pipeline()
     p.delete('lookup_dirs')
-    p.hset('lookup_dirs', mapping=recent_uuids)  # type: ignore
+    p.hmset('lookup_dirs', recent_uuids)  # type: ignore
     p.execute()
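
As a reading aid (not part of the diff): lookup_dirs is a Redis hash mapping each capture UUID to its on-disk directory, which is why the index has to be rebuilt once the directories move. A minimal lookup sketch, assuming a redis-py client on a local socket; the socket path and UUID are placeholders:

from redis import Redis

r = Redis(unix_socket_path='cache.sock', decode_responses=True)
capture_dir = r.hget('lookup_dirs', '00000000-0000-0000-0000-000000000000')
print(capture_dir)  # e.g. 'scraped/2021/08/2021-08-26T15:49:19.123456' in the new layout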

View File

@@ -100,6 +100,7 @@ class Lookyloo():
         if capture_dir and not Path(capture_dir).exists():
             # The capture was either removed or archived, cleaning up
             self.redis.hdel('lookup_dirs', capture_uuid)
+            self.redis.delete(capture_dir)
             capture_dir = None
         if not capture_dir:
             # Try in the archive
@@ -141,7 +142,11 @@ class Lookyloo():
         with metafile.open('w') as f:
             json.dump(to_dump, f)

+        try:
+            capture_dir = self._get_capture_dir(capture_uuid)
+        except MissingUUID:
+            raise MissingCaptureDirectory(f'Unable to find the directory for {capture_uuid}')
         har_files = sorted(capture_dir.glob('*.har'))
         lock_file = capture_dir / 'lock'
         pickle_file = capture_dir / 'tree.pickle'
@@ -536,10 +541,13 @@ class Lookyloo():
             if not directory:
                 continue
             p.hgetall(directory)
-        for c in p.execute():
-            if not c:
-                continue
-            try:
-                c = CaptureCache(c)
+        for uuid, c in zip(captures_to_get, p.execute()):
+            try:
+                if not c:
+                    c = self.capture_cache(uuid)
+                    if not c:
+                        continue
+                else:
+                    c = CaptureCache(c)
             except LookylooException as e:
                 self.logger.warning(e)
@@ -554,8 +562,9 @@ class Lookyloo():
         """Get the cache from redis."""
         if capture_uuid in self._captures_index and not self._captures_index[capture_uuid].incomplete_redirects:
             return self._captures_index[capture_uuid]
-        capture_dir = self._get_capture_dir(capture_uuid)
-        if not capture_dir:
+        try:
+            capture_dir = self._get_capture_dir(capture_uuid)
+        except MissingUUID:
             self.logger.warning(f'No directory for {capture_uuid}.')
             return None
@@ -575,7 +584,10 @@ class Lookyloo():
     def get_crawled_tree(self, capture_uuid: str, /) -> CrawledTree:
         '''Get the generated tree in ETE Toolkit format.
         Loads the pickle if it exists, creates it otherwise.'''
-        capture_dir = self._get_capture_dir(capture_uuid)
+        try:
+            capture_dir = self._get_capture_dir(capture_uuid)
+        except MissingUUID:
+            raise MissingCaptureDirectory(f'Unable to find the directory for {capture_uuid}')
         ct = load_pickle_tree(capture_dir)
         if not ct:
             ct = self._cache_capture(capture_uuid)
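
The hunks above share one new contract: _get_capture_dir raises MissingUUID instead of returning None, stale lookup_dirs entries (and their per-directory cache key) are dropped when the directory is gone, and the archive index is used as a fallback. A simplified sketch of that lookup flow, assuming a redis-py client r with decoded responses; this is an illustration, not the real method:

from pathlib import Path


class MissingUUID(Exception):
    """Stand-in for lookyloo's MissingUUID exception."""


def get_capture_dir(r, capture_uuid: str) -> str:
    # Live captures first.
    capture_dir = r.hget('lookup_dirs', capture_uuid)
    if capture_dir and not Path(capture_dir).exists():
        # The capture was removed or archived: drop the stale entry and its cache key.
        r.hdel('lookup_dirs', capture_uuid)
        r.delete(capture_dir)
        capture_dir = None
    if not capture_dir:
        # Fall back to the archive index maintained by the archiver.
        capture_dir = r.hget('lookup_dirs_archived', capture_uuid)
    if not capture_dir:
        raise MissingUUID(f'Unable to find the directory for {capture_uuid}')
    return capture_dir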

tools/change_captures_dir.py  (new executable file, 29 lines added)
View File

@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+from datetime import datetime
+from pathlib import Path
+
+from redis import Redis
+
+from lookyloo.helpers import get_captures_dir, safe_create_dir, get_socket_path
+
+
+def rename_captures():
+    r = Redis(unix_socket_path=get_socket_path('cache'))
+    capture_dir: Path = get_captures_dir()
+    for uuid_path in capture_dir.glob('*/uuid'):
+        with uuid_path.open() as f:
+            uuid = f.read()
+        dir_key = r.hget('lookup_dirs', uuid)
+        r.hdel('lookup_dirs', uuid)
+        if dir_key:
+            r.delete(dir_key)
+        timestamp = datetime.strptime(uuid_path.parent.name, '%Y-%m-%dT%H:%M:%S.%f')
+        dest_dir = capture_dir / str(timestamp.year) / f'{timestamp.month:02}'
+        safe_create_dir(dest_dir)
+        uuid_path.parent.rename(dest_dir / uuid_path.parent.name)
+
+
+if __name__ == '__main__':
+    rename_captures()
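
As the new start-up messages suggest, existing installations with many captures can run this script once, with the cache Redis up (it drops the matching lookup_dirs entries and cache keys as it moves each directory), to relocate every flat capture under the year/month layout; subsequent captures land there directly and the UUID index reload stays fast.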

View File

@@ -554,9 +554,8 @@ def tree(tree_uuid: str, node_uuid: Optional[str]=None):
     if tree_uuid == 'False':
         flash("Unable to process your request. The domain may not exist, or splash isn't started", 'error')
         return redirect(url_for('index'))
-    try:
-        cache = lookyloo.capture_cache(tree_uuid)
-    except MissingUUID:
+    cache = lookyloo.capture_cache(tree_uuid)
+    if not cache:
         status = get_capture_status(tree_uuid)
         splash_up, splash_message = splash_status()
         if not splash_up: