mirror of https://github.com/CIRCL/lookyloo

chg: Improve storage, support both modes.

parent 407e78ae7f
commit d41b7735dd
@@ -41,17 +41,15 @@ class Archiver(AbstractManager):
         # Format:
         # { 2020: { 12: [(directory, uuid)] } }
         to_archive: Dict[int, Dict[int, List[Tuple[Path, str]]]] = defaultdict(lambda: defaultdict(list))
-        for capture_path in get_captures_dir().glob('*'):
-            if not capture_path.is_dir():
-                continue
-            timestamp = datetime.strptime(capture_path.name, '%Y-%m-%dT%H:%M:%S.%f')
+        for capture_uuid in get_captures_dir().glob('**/uuid'):
+            timestamp = datetime.strptime(capture_uuid.parent.name, '%Y-%m-%dT%H:%M:%S.%f')
             if timestamp.date() >= cut_time:
                 # do not archive.
                 continue
-            with (capture_path / 'uuid').open() as _f:
+            with capture_uuid.open() as _f:
                 uuid = _f.read().strip()
-            to_archive[timestamp.year][timestamp.month].append((capture_path, uuid))
-            self.logger.info(f'Archiving {capture_path}.')
+            to_archive[timestamp.year][timestamp.month].append((capture_uuid.parent, uuid))
+            self.logger.info(f'Archiving {capture_uuid.parent}.')

         if not to_archive:
             self.logger.info('Nothing to archive.')
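A minimal sketch, not part of the commit, of why the recursive glob supports
both storage modes: '**' matches zero or more directory levels, so a capture's
uuid file is found whether the capture sits directly under the captures
directory (old flat layout) or under <year>/<month>/ (new layout). Paths here
are assumptions for illustration:

    from pathlib import Path

    captures = Path('captures')  # hypothetical root
    # Matches captures/<ts>/uuid as well as captures/<year>/<month>/<ts>/uuid
    for uuid_file in captures.glob('**/uuid'):
        capture_dir = uuid_file.parent
        print(capture_dir.name, uuid_file.read_text().strip())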
@@ -78,6 +76,10 @@ class Archiver(AbstractManager):

         if archived_uuids:
             p = self.redis.pipeline()
+            for dir_key in self.redis.hmget('lookup_dirs', *archived_uuids.keys()):
+                # Clear cache
+                if dir_key:
+                    p.delete(dir_key)
             p.hdel('lookup_dirs', *archived_uuids.keys())
             p.hmset('lookup_dirs_archived', archived_uuids)  # type: ignore
             p.execute()
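A hedged sketch of the pipeline pattern above, assuming a local Redis and a
hypothetical archived_uuids mapping: the per-capture cache keys are read
first, then every deletion and the index move are queued and sent to Redis in
a single round trip.

    from redis import Redis

    r = Redis()
    archived_uuids = {'some-uuid': '/path/to/capture'}  # hypothetical
    p = r.pipeline()
    for dir_key in r.hmget('lookup_dirs', *archived_uuids.keys()):
        if dir_key:
            p.delete(dir_key)  # drop the cached capture hash
    p.hdel('lookup_dirs', *archived_uuids.keys())    # remove from live index
    p.hmset('lookup_dirs_archived', archived_uuids)  # record as archived
    p.execute()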
@@ -86,13 +88,10 @@ class Archiver(AbstractManager):
     def _load_archives(self):
         # Initialize archives
         self.redis.delete('lookup_dirs_archived')
-        for year in self.archived_captures_dir.iterdir():
-            for month in year.iterdir():
-                if not (month / 'index').exists():
-                    continue
-                with (month / 'index').open('r') as _f:
-                    archived_uuids: Dict[str, str] = {uuid: str(month / dirname) for uuid, dirname in csv.reader(_f)}
-                self.redis.hmset('lookup_dirs_archived', archived_uuids)  # type: ignore
+        for index in self.archived_captures_dir.glob('**/index'):
+            with index.open('r') as _f:
+                archived_uuids: Dict[str, str] = {uuid: str(index.parent / dirname) for uuid, dirname in csv.reader(_f)}
+            self.redis.hmset('lookup_dirs_archived', archived_uuids)  # type: ignore


 def main():
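Sketch of the index file this loop assumes (inferred from the code: each
archived month directory holds an 'index' with one "uuid,dirname" CSV row per
capture; the path below is hypothetical):

    import csv
    from pathlib import Path

    index = Path('archive/2020/12/index')
    with index.open('r') as _f:
        archived = {uuid: str(index.parent / dirname)
                    for uuid, dirname in csv.reader(_f)}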
@@ -115,7 +115,8 @@ class AsyncCapture(AbstractManager):
             self.logger.critical(f'Something went terribly wrong when capturing {url}.')
             return False
         width = len(str(len(items)))
-        dirpath = self.capture_dir / datetime.now().isoformat()
+        now = datetime.now()
+        dirpath = self.capture_dir / str(now.year) / f'{now.month:02}' / now.isoformat()
         safe_create_dir(dirpath)

         if os or browser:
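The new on-disk layout in one line, with a hypothetical timestamp: a capture
made on 2021-04-23 now lands in captures/2021/04/2021-04-23T10:21:59.331826/
instead of captures/2021-04-23T10:21:59.331826/.

    from datetime import datetime
    from pathlib import Path

    capture_dir = Path('captures')  # hypothetical root
    now = datetime.now()
    dirpath = capture_dir / str(now.year) / f'{now.month:02}' / now.isoformat()
    # presumably what safe_create_dir boils down to:
    dirpath.mkdir(parents=True, exist_ok=True)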
@@ -27,7 +27,7 @@ class BackgroundIndexer(AbstractManager):
             self._check_indexes()

     def _build_missing_pickles(self):
-        for uuid_path in sorted(self.lookyloo.capture_dir.glob('*/uuid'), reverse=True):
+        for uuid_path in sorted(self.lookyloo.capture_dir.glob('**/uuid'), reverse=True):
             if (uuid_path.parent / 'tree.pickle').exists():
                 continue
             lock_file = uuid_path.parent / 'lock'
@@ -13,6 +13,9 @@ def main():
     p.check_returncode()
     print('done.')
     print('Reload UUIDs index...')
+    print('If this is taking too long, it means you have a lot of captures.')
+    print('You should run tools/change_captures_dir.py to re-organize the capture directory by year and month.')
+    print('You may also want to archive more captures.')
     reload_uuids_index()
     print('done.')
     print('Start asynchronous ingestor...')
@@ -25,6 +28,9 @@ def main():
     print('Start background processing...')
     Popen(['processing'])
     print('done.')
+    print('Start archiving process...')
+    Popen(['archiver'])
+    print('done.')
     print('Start website...')
     Popen(['start_website'])
     print('done.')
@@ -371,7 +371,7 @@ def get_useragent_for_requests():

 def reload_uuids_index() -> None:
     recent_uuids: Dict[str, str] = {}
-    for uuid_path in sorted(get_captures_dir().glob('*/uuid'), reverse=True):
+    for uuid_path in get_captures_dir().glob('**/uuid'):
         with uuid_path.open() as f:
             uuid = f.read()
         recent_uuids[uuid] = str(uuid_path.parent)
@@ -380,7 +380,7 @@ def reload_uuids_index() -> None:
     r = Redis(unix_socket_path=get_socket_path('cache'))
     p = r.pipeline()
     p.delete('lookup_dirs')
-    p.hset('lookup_dirs', mapping=recent_uuids)  # type: ignore
+    p.hmset('lookup_dirs', recent_uuids)  # type: ignore
    p.execute()
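Why swap hset for the deprecated hmset? A plausible reading, not stated in
the commit: hset with a mapping argument needs redis-py >= 3.5, while hmset
also works on older clients. Both calls write the same hash:

    from redis import Redis

    r = Redis()
    recent_uuids = {'some-uuid': '/path/to/capture'}  # hypothetical
    r.hmset('lookup_dirs', recent_uuids)           # older-compatible form
    # r.hset('lookup_dirs', mapping=recent_uuids)  # redis-py >= 3.5 form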
@@ -100,6 +100,7 @@ class Lookyloo():
         if capture_dir and not Path(capture_dir).exists():
             # The capture was either removed or archived, cleaning up
             self.redis.hdel('lookup_dirs', capture_uuid)
+            self.redis.delete(capture_dir)
             capture_dir = None
         if not capture_dir:
             # Try in the archive
@@ -141,7 +142,11 @@ class Lookyloo():
         with metafile.open('w') as f:
             json.dump(to_dump, f)

-        capture_dir = self._get_capture_dir(capture_uuid)
+        try:
+            capture_dir = self._get_capture_dir(capture_uuid)
+        except MissingUUID:
+            raise MissingCaptureDirectory(f'Unable to find the directory for {capture_uuid}')
+
         har_files = sorted(capture_dir.glob('*.har'))
         lock_file = capture_dir / 'lock'
         pickle_file = capture_dir / 'tree.pickle'
@@ -536,11 +541,14 @@ class Lookyloo():
             if not directory:
                 continue
             p.hgetall(directory)
-        for c in p.execute():
-            if not c:
-                continue
+        for uuid, c in zip(captures_to_get, p.execute()):
             try:
-                c = CaptureCache(c)
+                if not c:
+                    c = self.capture_cache(uuid)
+                    if not c:
+                        continue
+                else:
+                    c = CaptureCache(c)
             except LookylooException as e:
                 self.logger.warning(e)
                 continue
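Sketch of the pairing trick above, with illustrative names: pipeline results
come back in the order the commands were queued, so zipping them with the
requested UUIDs keeps each result tied to its capture, and an empty hash
(what HGETALL returns for a missing key) can fall back to a per-capture
lookup instead of being silently skipped.

    uuids = ['uuid-a', 'uuid-b']     # hypothetical
    results = [{'title': 'x'}, {}]   # what p.execute() might return
    for uuid, c in zip(uuids, results):
        if not c:
            print(f'{uuid}: no cache entry, falling back')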
@@ -554,8 +562,9 @@ class Lookyloo():
         """Get the cache from redis."""
         if capture_uuid in self._captures_index and not self._captures_index[capture_uuid].incomplete_redirects:
             return self._captures_index[capture_uuid]
-        capture_dir = self._get_capture_dir(capture_uuid)
-        if not capture_dir:
+        try:
+            capture_dir = self._get_capture_dir(capture_uuid)
+        except MissingUUID:
             self.logger.warning(f'No directory for {capture_uuid}.')
             return None
@@ -575,7 +584,10 @@ class Lookyloo():
     def get_crawled_tree(self, capture_uuid: str, /) -> CrawledTree:
         '''Get the generated tree in ETE Toolkit format.
         Loads the pickle if it exists, creates it otherwise.'''
-        capture_dir = self._get_capture_dir(capture_uuid)
+        try:
+            capture_dir = self._get_capture_dir(capture_uuid)
+        except MissingUUID:
+            raise MissingCaptureDirectory(f'Unable to find the directory for {capture_uuid}')
         ct = load_pickle_tree(capture_dir)
         if not ct:
             ct = self._cache_capture(capture_uuid)
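A note on the recurring change in the Lookyloo hunks above: _get_capture_dir()
evidently now raises MissingUUID instead of returning None, so each caller
either translates it (MissingCaptureDirectory) or logs and returns None.
Raising also lets the helper's return type stay a plain Path rather than
Optional[Path]; a hedged sketch with assumed signatures:

    from pathlib import Path

    class MissingUUID(Exception): ...
    class MissingCaptureDirectory(Exception): ...

    def _get_capture_dir(uuid: str) -> Path:  # no Optional needed anymore
        raise MissingUUID(f'Unable to find UUID {uuid}.')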
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+from datetime import datetime
+from pathlib import Path
+
+from redis import Redis
+
+from lookyloo.helpers import get_captures_dir, safe_create_dir, get_socket_path
+
+
+def rename_captures():
+    r = Redis(unix_socket_path=get_socket_path('cache'))
+    capture_dir: Path = get_captures_dir()
+    for uuid_path in capture_dir.glob('*/uuid'):
+        with uuid_path.open() as f:
+            uuid = f.read()
+        dir_key = r.hget('lookup_dirs', uuid)
+        r.hdel('lookup_dirs', uuid)
+        if dir_key:
+            r.delete(dir_key)
+        timestamp = datetime.strptime(uuid_path.parent.name, '%Y-%m-%dT%H:%M:%S.%f')
+        dest_dir = capture_dir / str(timestamp.year) / f'{timestamp.month:02}'
+        safe_create_dir(dest_dir)
+        uuid_path.parent.rename(dest_dir / uuid_path.parent.name)
+
+
+if __name__ == '__main__':
+    rename_captures()
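This new file is presumably the tools/change_captures_dir.py migration script
that the start script above points operators at. What rename_captures() does
to one capture, with a hypothetical name: the directory moves from
captures/2021-04-23T10:21:59.331826/ to
captures/2021/04/2021-04-23T10:21:59.331826/, and its redis entries are
dropped so the index is rebuilt cleanly on the next start.

    from datetime import datetime
    from pathlib import Path

    name = '2021-04-23T10:21:59.331826'  # hypothetical capture directory
    timestamp = datetime.strptime(name, '%Y-%m-%dT%H:%M:%S.%f')
    dest_dir = Path('captures') / str(timestamp.year) / f'{timestamp.month:02}'
    print(dest_dir / name)  # captures/2021/04/2021-04-23T10:21:59.331826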
@@ -554,9 +554,8 @@ def tree(tree_uuid: str, node_uuid: Optional[str]=None):
     if tree_uuid == 'False':
         flash("Unable to process your request. The domain may not exist, or splash isn't started", 'error')
         return redirect(url_for('index'))
-    try:
-        cache = lookyloo.capture_cache(tree_uuid)
-    except MissingUUID:
+    cache = lookyloo.capture_cache(tree_uuid)
+    if not cache:
         status = get_capture_status(tree_uuid)
         splash_up, splash_message = splash_status()
         if not splash_up: