mirror of https://github.com/CIRCL/lookyloo
chg: Improve storage, support both modes.
parent 407e78ae7f
commit d41b7735dd
@@ -41,17 +41,15 @@ class Archiver(AbstractManager):
         # Format:
         # { 2020: { 12: [(directory, uuid)] } }
         to_archive: Dict[int, Dict[int, List[Tuple[Path, str]]]] = defaultdict(lambda: defaultdict(list))
-        for capture_path in get_captures_dir().glob('*'):
-            if not capture_path.is_dir():
-                continue
-            timestamp = datetime.strptime(capture_path.name, '%Y-%m-%dT%H:%M:%S.%f')
+        for capture_uuid in get_captures_dir().glob('**/uuid'):
+            timestamp = datetime.strptime(capture_uuid.parent.name, '%Y-%m-%dT%H:%M:%S.%f')
             if timestamp.date() >= cut_time:
                 # do not archive.
                 continue
-            with (capture_path / 'uuid').open() as _f:
+            with capture_uuid.open() as _f:
                 uuid = _f.read().strip()
-            to_archive[timestamp.year][timestamp.month].append((capture_path, uuid))
-            self.logger.info(f'Archiving {capture_path}.')
+            to_archive[timestamp.year][timestamp.month].append((capture_uuid.parent, uuid))
+            self.logger.info(f'Archiving {capture_uuid.parent}.')
 
         if not to_archive:
             self.logger.info('Nothing to archive.')
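Note on the hunk above: switching from glob('*') to glob('**/uuid') is what lets the archiver support both storage modes named in the commit message. A recursive glob anchored on each capture's uuid file matches the old flat layout and the new year/month layout alike, and makes the old is_dir() guard unnecessary. A minimal sketch, with a hypothetical root directory:

from pathlib import Path

# Hypothetical root; both layouts below are matched by '**/uuid':
#   captures/2020-12-01T10:20:30.123456/uuid           (old, flat)
#   captures/2020/12/2020-12-01T10:20:30.123456/uuid   (new, year/month)
for uuid_file in Path('captures').glob('**/uuid'):
    capture_dir = uuid_file.parent
    print(capture_dir.name)  # the ISO-formatted capture timestamp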
@@ -78,6 +76,10 @@ class Archiver(AbstractManager):
 
         if archived_uuids:
             p = self.redis.pipeline()
+            for dir_key in self.redis.hmget('lookup_dirs', *archived_uuids.keys()):
+                # Clear cache
+                if dir_key:
+                    p.delete(dir_key)
             p.hdel('lookup_dirs', *archived_uuids.keys())
             p.hmset('lookup_dirs_archived', archived_uuids)  # type: ignore
             p.execute()
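In the hunk above, each value stored in lookup_dirs is a capture directory path that is itself used as a Redis key holding that capture's cached metadata, so archiving has to drop both. hmget returns one value per requested field and None for missing ones, hence the if dir_key guard. A minimal sketch, assuming a reachable local Redis and illustrative keys:

from redis import Redis

r = Redis()  # assumes a reachable local Redis
r.hset('lookup_dirs', mapping={'uuid-1': '/captures/2020/12/a'})
r.hset('/captures/2020/12/a', mapping={'title': 'example'})

p = r.pipeline()
# hmget yields None for unknown fields, so guard before deleting.
for dir_key in r.hmget('lookup_dirs', 'uuid-1', 'uuid-unknown'):
    if dir_key:
        p.delete(dir_key)
p.hdel('lookup_dirs', 'uuid-1', 'uuid-unknown')
p.execute()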
@@ -86,13 +88,10 @@ class Archiver(AbstractManager):
     def _load_archives(self):
         # Initialize archives
         self.redis.delete('lookup_dirs_archived')
-        for year in self.archived_captures_dir.iterdir():
-            for month in year.iterdir():
-                if not (month / 'index').exists():
-                    continue
-                with (month / 'index').open('r') as _f:
-                    archived_uuids: Dict[str, str] = {uuid: str(month / dirname) for uuid, dirname in csv.reader(_f)}
-                self.redis.hmset('lookup_dirs_archived', archived_uuids)  # type: ignore
+        for index in self.archived_captures_dir.glob('**/index'):
+            with index.open('r') as _f:
+                archived_uuids: Dict[str, str] = {uuid: str(index.parent / dirname) for uuid, dirname in csv.reader(_f)}
+            self.redis.hmset('lookup_dirs_archived', archived_uuids)  # type: ignore
 
 
 def main():
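_load_archives now trusts a recursive glob to find every per-month index file instead of walking year and month directories by hand, which also drops the existence check. Each index is a headerless two-column CSV mapping a capture UUID to a directory name relative to the index file. A sketch of the read side, with an assumed on-disk layout:

import csv
from pathlib import Path

# Assumed layout: archived_captures/<year>/<month>/index, where each
# row is '<uuid>,<dirname>' and <dirname> sits next to the index file.
index = Path('archived_captures/2020/12/index')
with index.open('r') as _f:
    archived = {uuid: str(index.parent / dirname)
                for uuid, dirname in csv.reader(_f)}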
@@ -115,7 +115,8 @@ class AsyncCapture(AbstractManager):
             self.logger.critical(f'Something went terribly wrong when capturing {url}.')
             return False
         width = len(str(len(items)))
-        dirpath = self.capture_dir / datetime.now().isoformat()
+        now = datetime.now()
+        dirpath = self.capture_dir / str(now.year) / f'{now.month:02}' / now.isoformat()
         safe_create_dir(dirpath)
 
         if os or browser:
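New captures are now written straight into the year/month hierarchy. Zero-padding the month with f'{now.month:02}' keeps lexicographic directory order identical to chronological order. For example:

from datetime import datetime
from pathlib import Path

capture_dir = Path('captures')  # illustrative root
now = datetime(2020, 12, 1, 10, 20, 30, 123456)
dirpath = capture_dir / str(now.year) / f'{now.month:02}' / now.isoformat()
print(dirpath)  # captures/2020/12/2020-12-01T10:20:30.123456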
@@ -27,7 +27,7 @@ class BackgroundIndexer(AbstractManager):
         self._check_indexes()
 
     def _build_missing_pickles(self):
-        for uuid_path in sorted(self.lookyloo.capture_dir.glob('*/uuid'), reverse=True):
+        for uuid_path in sorted(self.lookyloo.capture_dir.glob('**/uuid'), reverse=True):
             if (uuid_path.parent / 'tree.pickle').exists():
                 continue
             lock_file = uuid_path.parent / 'lock'
@@ -13,6 +13,9 @@ def main():
     p.check_returncode()
     print('done.')
     print('Reload UUIDs index...')
+    print('If this is taking too long, it means you have a lot of captures.')
+    print('You should run tools/change_captures_dir.py to re-organize the capture directory by year and month.')
+    print('You may also want to archive more captures.')
     reload_uuids_index()
     print('done.')
     print('Start asynchronous ingestor...')
@@ -25,6 +28,9 @@ def main():
    print('Start background processing...')
    Popen(['processing'])
    print('done.')
+    print('Start archiving process...')
+    Popen(['archiver'])
+    print('done.')
    print('Start website...')
    Popen(['start_website'])
    print('done.')
@@ -371,7 +371,7 @@ def get_useragent_for_requests():
 
 def reload_uuids_index() -> None:
     recent_uuids: Dict[str, str] = {}
-    for uuid_path in sorted(get_captures_dir().glob('*/uuid'), reverse=True):
+    for uuid_path in get_captures_dir().glob('**/uuid'):
         with uuid_path.open() as f:
             uuid = f.read()
         recent_uuids[uuid] = str(uuid_path.parent)
@@ -380,7 +380,7 @@ def reload_uuids_index() -> None:
     r = Redis(unix_socket_path=get_socket_path('cache'))
     p = r.pipeline()
     p.delete('lookup_dirs')
-    p.hset('lookup_dirs', mapping=recent_uuids)  # type: ignore
+    p.hmset('lookup_dirs', recent_uuids)  # type: ignore
     p.execute()
 
 
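The last change above trades hset(..., mapping=...) for the older hmset(...). Both write the same hash fields: HMSET has been deprecated server-side since Redis 4.0, while the mapping= keyword on hset only exists in redis-py 3.5.0 and later, so the older spelling presumably keeps compatibility with older redis-py releases (hence the # type: ignore). The two calls are interchangeable:

from redis import Redis

r = Redis()  # assumes a reachable local Redis
recent_uuids = {'uuid-1': '/captures/2020/12/a'}  # illustrative

r.hmset('lookup_dirs', recent_uuids)         # deprecated, widely supported
r.hset('lookup_dirs', mapping=recent_uuids)  # requires redis-py >= 3.5.0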
@@ -100,6 +100,7 @@ class Lookyloo():
         if capture_dir and not Path(capture_dir).exists():
             # The capture was either removed or archived, cleaning up
             self.redis.hdel('lookup_dirs', capture_uuid)
+            self.redis.delete(capture_dir)
             capture_dir = None
         if not capture_dir:
             # Try in the archive
@@ -141,7 +142,11 @@ class Lookyloo():
             with metafile.open('w') as f:
                 json.dump(to_dump, f)
 
-        capture_dir = self._get_capture_dir(capture_uuid)
+        try:
+            capture_dir = self._get_capture_dir(capture_uuid)
+        except MissingUUID:
+            raise MissingCaptureDirectory(f'Unable to find the directory for {capture_uuid}')
+
         har_files = sorted(capture_dir.glob('*.har'))
         lock_file = capture_dir / 'lock'
         pickle_file = capture_dir / 'tree.pickle'
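This hunk and the two after it share one pattern: _get_capture_dir now signals a missing capture by raising MissingUUID instead of returning a falsy value, and each caller either degrades gracefully or re-raises the more descriptive MissingCaptureDirectory. A condensed, self-contained sketch of the pattern (exception names as in the diff, everything else illustrative):

class MissingUUID(Exception):
    pass

class MissingCaptureDirectory(Exception):
    pass

_index = {'uuid-1': '/captures/2020/12/a'}  # illustrative lookup table

def _get_capture_dir(uuid: str) -> str:
    # Low-level lookup: raises instead of returning a falsy sentinel.
    if uuid not in _index:
        raise MissingUUID(f'Unable to find {uuid}.')
    return _index[uuid]

def get_capture_dir(uuid: str) -> str:
    # Caller translates the low-level error into a descriptive one.
    try:
        return _get_capture_dir(uuid)
    except MissingUUID:
        raise MissingCaptureDirectory(f'Unable to find the directory for {uuid}')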
@@ -536,11 +541,14 @@ class Lookyloo():
             if not directory:
                 continue
             p.hgetall(directory)
-        for c in p.execute():
-            if not c:
-                continue
+        for uuid, c in zip(captures_to_get, p.execute()):
             try:
-                c = CaptureCache(c)
+                if not c:
+                    c = self.capture_cache(uuid)
+                    if not c:
+                        continue
+                else:
+                    c = CaptureCache(c)
             except LookylooException as e:
                 self.logger.warning(e)
                 continue
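The zip(captures_to_get, p.execute()) pairing works because a Redis pipeline returns one result per queued command, in queue order; an hgetall on a missing key yields an empty dict rather than an error, which is the case the new fallback to self.capture_cache(uuid) handles (the per-directory hash may have just been cleared by the archiver). A minimal illustration of the ordering guarantee:

from redis import Redis

r = Redis()  # assumes a reachable local Redis
r.hset('dir:a', mapping={'uuid': 'uuid-a'})
r.hset('dir:b', mapping={'uuid': 'uuid-b'})

p = r.pipeline()
for directory in ('dir:a', 'dir:b', 'dir:cleared'):
    p.hgetall(directory)

# execute() returns results in queue order, so they can be zipped back
# onto the keys that were requested; a missing hash is just {}.
for directory, result in zip(('dir:a', 'dir:b', 'dir:cleared'), p.execute()):
    print(directory, result if result else 'empty -> rebuild from disk')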
@@ -554,8 +562,9 @@ class Lookyloo():
         """Get the cache from redis."""
         if capture_uuid in self._captures_index and not self._captures_index[capture_uuid].incomplete_redirects:
             return self._captures_index[capture_uuid]
-        capture_dir = self._get_capture_dir(capture_uuid)
-        if not capture_dir:
+        try:
+            capture_dir = self._get_capture_dir(capture_uuid)
+        except MissingUUID:
             self.logger.warning(f'No directory for {capture_uuid}.')
             return None
 
@@ -575,7 +584,10 @@ class Lookyloo():
     def get_crawled_tree(self, capture_uuid: str, /) -> CrawledTree:
         '''Get the generated tree in ETE Toolkit format.
        Loads the pickle if it exists, creates it otherwise.'''
-        capture_dir = self._get_capture_dir(capture_uuid)
+        try:
+            capture_dir = self._get_capture_dir(capture_uuid)
+        except MissingUUID:
+            raise MissingCaptureDirectory(f'Unable to find the directory for {capture_uuid}')
         ct = load_pickle_tree(capture_dir)
         if not ct:
             ct = self._cache_capture(capture_uuid)
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+from datetime import datetime
+from pathlib import Path
+
+from redis import Redis
+
+from lookyloo.helpers import get_captures_dir, safe_create_dir, get_socket_path
+
+
+def rename_captures():
+    r = Redis(unix_socket_path=get_socket_path('cache'))
+    capture_dir: Path = get_captures_dir()
+    for uuid_path in capture_dir.glob('*/uuid'):
+        with uuid_path.open() as f:
+            uuid = f.read()
+        dir_key = r.hget('lookup_dirs', uuid)
+        r.hdel('lookup_dirs', uuid)
+        if dir_key:
+            r.delete(dir_key)
+        timestamp = datetime.strptime(uuid_path.parent.name, '%Y-%m-%dT%H:%M:%S.%f')
+        dest_dir = capture_dir / str(timestamp.year) / f'{timestamp.month:02}'
+        safe_create_dir(dest_dir)
+        uuid_path.parent.rename(dest_dir / uuid_path.parent.name)
+
+
+if __name__ == '__main__':
+    rename_captures()
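This new file is the migration script referenced by the startup message earlier in the diff (tools/change_captures_dir.py): it moves every flat capture directory under <year>/<month>/ and drops the stale Redis entries so the index is rebuilt against the new paths. The strptime call works because capture directories are named with datetime.isoformat(); one caveat worth noting is that isoformat() omits the .%f suffix when microseconds are exactly zero, which would not match the format string, though datetime.now() makes that case vanishingly rare:

from datetime import datetime

name = datetime(2020, 12, 1, 10, 20, 30, 123456).isoformat()
print(name)  # 2020-12-01T10:20:30.123456
print(datetime.strptime(name, '%Y-%m-%dT%H:%M:%S.%f'))

# Edge case: zero microseconds drop the '.%f' suffix entirely.
print(datetime(2020, 12, 1).isoformat())  # 2020-12-01T00:00:00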
@@ -554,9 +554,8 @@ def tree(tree_uuid: str, node_uuid: Optional[str]=None):
     if tree_uuid == 'False':
         flash("Unable to process your request. The domain may not exist, or splash isn't started", 'error')
         return redirect(url_for('index'))
-    try:
-        cache = lookyloo.capture_cache(tree_uuid)
-    except MissingUUID:
+    cache = lookyloo.capture_cache(tree_uuid)
+    if not cache:
         status = get_capture_status(tree_uuid)
         splash_up, splash_message = splash_status()
         if not splash_up:
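With capture_cache now returning None for unknown captures instead of raising MissingUUID, the website route tests the return value rather than catching an exception. A minimal sketch of the control-flow change (names illustrative):

from typing import Optional

def capture_cache(tree_uuid: str) -> Optional[dict]:
    # Returns None for an unknown or not-yet-cached capture.
    return None

cache = capture_cache('some-uuid')
if not cache:
    print('capture unknown or still processing: show the status page')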