new: Store directories by day, refactor indexing

pull/833/head
Raphaël Vinot 2023-11-15 15:31:11 +01:00
parent 1b987c38b7
commit 7791eff842
4 changed files with 280 additions and 248 deletions


@@ -8,17 +8,15 @@ import os
 import random
 import shutil
-from collections import defaultdict
-from collections.abc import Mapping
-from datetime import datetime, timedelta, date
+from datetime import datetime, timedelta
 from pathlib import Path
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Set
 
 from redis import Redis
 
 import s3fs  # type: ignore
 
 from lookyloo.default import AbstractManager, get_config, get_homedir, get_socket_path, try_make_file
-from lookyloo.helpers import get_captures_dir, is_locked
+from lookyloo.helpers import get_captures_dir, is_locked, make_ts_from_dirname, make_dirs_list
 
 logging.config.dictConfig(get_config('logging'))
@@ -67,88 +65,105 @@ class Archiver(AbstractManager):
                 break
             archiving_done = self._archive()
             self._load_indexes()
-            # The HARs are supposedly all compressed so this call shouldn't be required
-            # unless you're processing old captures for the first time.
-            # self._compress_hars()
         if not self.shutdown_requested():
             # This call takes a very long time on MinIO
             self._update_all_capture_indexes()
 
-    def _update_index(self, root_dir: Path, *, s3fs: bool=False) -> None:
-        current_index: Dict[str, str] = {}
-        if s3fs:
-            self.logger.info(f'Updating index for {root_dir} (s3fs)')
-            self.s3fs_client.invalidate_cache(self.s3fs_bucket)
-            # On s3fs, the path is bucket_name/year/month
-            # root_dir is /full/local/path/to/archived_captures/year/month
-            s3fs_dir = '/'.join([self.s3fs_bucket, root_dir.parent.name, root_dir.name])
-            all_s3fs_captures = self.s3fs_client.ls(s3fs_dir, detail=False, refresh=True)
-            if not all_s3fs_captures:
-                self.logger.warning(f'{root_dir} is empty on s3fs ({s3fs_dir}).')
-                return
-        else:
-            self.logger.debug(f'Updating index for {root_dir}')
-            if not any(os.scandir(root_dir)):
-                # the directory is empty, we can safely remove it
-                root_dir.rmdir()
-                return
+    def _update_index(self, root_dir: Path, *, s3fs_parent_dir: Optional[str]=None) -> Optional[Path]:
+        # returns a path to the index for the given directory
+        logmsg = f'Updating index for {root_dir}'
+        if s3fs_parent_dir:
+            logmsg = f'{logmsg} (s3fs)'
+        self.logger.info(logmsg)
 
+        current_index: Dict[str, str] = {}
         index_file = root_dir / 'index'
         if index_file.exists():
-            # Skip index if the directory has been archived.
             try:
-                with index_file.open('r') as _f:
-                    current_index = {uuid: dirname for uuid, dirname in csv.reader(_f)
-                                     if uuid and dirname}
+                current_index = self.__load_index(index_file, ignore_sub=True)
             except Exception as e:
                 # the index file is broken, it will be recreated.
                 self.logger.warning(f'Index for {root_dir} broken, recreating it: {e}')
-                pass
             if not current_index:
+                # The file is either empty or only contains subs
+                # NOTE: should we remove if it has subs?
                 index_file.unlink()
 
-        curent_index_dirs = set(current_index.values())
-
-        if s3fs:
-            new_captures = {existing_capture.rsplit('/', 1)[-1] for existing_capture in all_s3fs_captures
-                            if existing_capture.rsplit('/', 1)[-1]
-                            and (existing_capture.rsplit('/', 1)[-1] not in curent_index_dirs)
-                            and self.s3fs_client.isdir(existing_capture)}
+        sub_indexes: List[Path] = []
+        current_index_dirs: Set[str] = set(current_index.values())
+        new_captures: Set[Path] = set()
+        if s3fs_parent_dir:
+            s3fs_dir = '/'.join([s3fs_parent_dir, root_dir.name])
+            # the call below will spit out a mix of directories:
+            # * <datetime>
+            # * <day> (which contains a <datetime> directory)
+            for entry in self.s3fs_client.ls(s3fs_dir, detail=False, refresh=False):
+                if not self.s3fs_client.isdir(entry):
+                    # index
+                    continue
+                dir_on_disk = root_dir / entry.rsplit('/', 1)[-1]
+                if dir_on_disk.name.isdigit():
+                    # got a day directory that contains captures
+                    sub_index = self._update_index(dir_on_disk, s3fs_parent_dir=s3fs_dir)
+                    if sub_index:
+                        sub_indexes.append(sub_index)
+                else:
+                    # got a capture
+                    if str(dir_on_disk) not in current_index_dirs:
+                        new_captures.add(dir_on_disk)
         else:
             with os.scandir(root_dir) as it:
-                new_captures = {existing_capture.name for existing_capture in it
-                                if (existing_capture.name not in curent_index_dirs)
-                                and existing_capture.is_dir()}
+                for entry in it:
+                    # can be index, sub directory (digit), or isoformat
+                    if not entry.is_dir():
+                        # index
+                        continue
+                    dir_on_disk = Path(entry)
+                    if dir_on_disk.name.isdigit():
+                        sub_index = self._update_index(dir_on_disk)
+                        if sub_index:
+                            sub_indexes.append(sub_index)
+                    else:
+                        # isoformat
+                        if str(dir_on_disk) not in current_index_dirs:
+                            new_captures.add(dir_on_disk)
 
-        if not new_captures:
-            if s3fs:
-                self.logger.info(f'No new captures in {root_dir} (s3fs directory)')
-            else:
-                self.logger.debug(f'No new captures in {root_dir}')
-            # No new captures, quitting
-            return
+        if not current_index and not new_captures and not sub_indexes:
+            # No captures at all in the directory and subdirectories, quitting
+            logmsg = f'No captures in {root_dir}'
+            if s3fs_parent_dir:
+                logmsg = f'{logmsg} (s3fs directory)'
+            self.logger.info(logmsg)
+            return None
 
-        self.logger.info(f'{len(new_captures)} new captures in {root_dir}.')
+        if new_captures:
+            self.logger.info(f'{len(new_captures)} new captures in {root_dir}.')
 
-        for capture_dir_name in new_captures:
-            capture_dir = root_dir / capture_dir_name
+        for capture_dir in new_captures:
+            # capture_dir_name is *only* the isoformat of the capture.
+            # This directory will either be directly in the month directory (old format)
+            # or in the day directory (new format)
             if not next(capture_dir.iterdir(), None):
                 self.logger.warning(f'{capture_dir} is empty, removing.')
                 capture_dir.rmdir()
                 continue
-            uuid_file = capture_dir / 'uuid'
-            if not uuid_file.exists():
-                self.logger.warning(f'No UUID file in {capture_dir}.')
-                shutil.move(str(capture_dir), str(get_homedir() / 'discarded_captures'))
-                continue
-            with uuid_file.open() as _f:
-                uuid = _f.read().strip()
             try:
+                uuid_file = capture_dir / 'uuid'
+                if not uuid_file.exists():
+                    self.logger.warning(f'No UUID file in {capture_dir}.')
+                    shutil.move(str(capture_dir), str(get_homedir() / 'discarded_captures'))
+                    continue
+                with uuid_file.open() as _f:
+                    uuid = _f.read().strip()
                 if not uuid:
                     self.logger.warning(f'{uuid_file} is empty')
                     shutil.move(str(capture_dir), str(get_homedir() / 'discarded_captures'))
                     continue
                 if uuid in current_index:
                     self.logger.warning(f'Duplicate UUID ({uuid}) in {current_index[uuid]} and {uuid_file.parent.name}')
                     shutil.move(str(capture_dir), str(get_homedir() / 'discarded_captures'))
@@ -156,34 +171,24 @@ class Archiver(AbstractManager):
             except OSError as e:
                 self.logger.warning(f'Error when discarding capture {capture_dir}: {e}')
                 continue
-            current_index[uuid] = uuid_file.parent.name
+            current_index[uuid] = capture_dir.name
 
-        if not current_index:
+        if not current_index and not sub_indexes:
             # The directory has been archived. It is probably safe to unlink, but
             # if it's not, we will lose a whole buch of captures. Moving instead for safety.
             shutil.move(str(root_dir), str(get_homedir() / 'discarded_captures' / root_dir.parent / root_dir.name))
-            return
+            self.logger.warning(f'Nothing to index in {root_dir}')
+            return None
 
         with index_file.open('w') as _f:
             index_writer = csv.writer(_f)
             for uuid, dirname in current_index.items():
                 index_writer.writerow([uuid, dirname])
+            for sub_path in sub_indexes:
+                # Only keep the dir name
+                index_writer.writerow(['sub_index', sub_path.parent.name])
+
+        return index_file
 
-    def _make_dirs_list(self, root_dir: Path) -> List[Path]:
-        directories = []
-        year_now = date.today().year
-        while True:
-            year_dir = root_dir / str(year_now)
-            if not year_dir.exists():
-                # if we do not have a directory with this year, quit the loop
-                break
-            for month in range(12, 0, -1):
-                month_dir = year_dir / f'{month:02}'
-                if month_dir.exists():
-                    directories.append(month_dir)
-            year_now -= 1
-        return directories
-
     def _update_all_capture_indexes(self):
         '''Run that after the captures are in the proper directories'''
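Purely as an illustration, not part of the diff: with this change a month-level index mixes regular uuid,dirname rows with sub_index rows that point at day directories, and the __load_index helper later in this commit merges the day-level indexes back into one flat mapping. A minimal sketch of what such an index could contain, with made-up UUIDs, dates and root directory:

import csv
from pathlib import Path

# Hypothetical month directory; the real ones live under the (archived) captures root.
month_dir = Path('archived_captures/2023/11')
month_dir.mkdir(parents=True, exist_ok=True)

rows = [
    # old-style capture stored directly under the month directory
    ['0f6dc57e-0000-0000-0000-000000000000', '2023-11-02T10:00:00.123456'],
    # day directories keep their own index; only the directory name is recorded
    ['sub_index', '15'],
    ['sub_index', '16'],
]
with (month_dir / 'index').open('w', newline='') as f:
    csv.writer(f).writerows(rows)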
@@ -194,7 +199,7 @@ class Archiver(AbstractManager):
         # and we only care about the root directory (ex: 2023/06)
         # directories_to_index = {capture_dir.parent.parent
         #                         for capture_dir in get_captures_dir().glob('*/*/*/uuid')}
-        for directory_to_index in self._make_dirs_list(get_captures_dir()):
+        for directory_to_index in make_dirs_list(get_captures_dir()):
             if self.shutdown_requested():
                 self.logger.warning('Shutdown requested, breaking.')
                 break
@@ -202,173 +207,157 @@ class Archiver(AbstractManager):
         self.logger.info('Recent indexes updated')
 
         # Archived captures
         self.logger.info('Update archives indexes')
-        for directory_to_index in self._make_dirs_list(self.archived_captures_dir):
+        for directory_to_index in make_dirs_list(self.archived_captures_dir):
             if self.shutdown_requested():
                 self.logger.warning('Shutdown requested, breaking.')
                 break
-            # Updating the indexes can take a while, just run this call once in N calls
-            if random.randrange(20) == 0:
-                self._update_index(directory_to_index, s3fs=self.archive_on_s3fs)
+            year = directory_to_index.parent.name
+            if self.archive_on_s3fs:
+                # Updating the indexes can take a while, just run this call once in N calls
+                if random.randrange(20) == 0:
+                    self._update_index(directory_to_index,
+                                       s3fs_parent_dir='/'.join([self.s3fs_bucket, year]))
+            else:
+                self._update_index(directory_to_index)
         self.logger.info('Archived indexes updated')
 
+    def __archive_single_capture(self, capture_path: Path) -> Path:
+        capture_timestamp = make_ts_from_dirname(capture_path.name)
+        dest_dir = self.archived_captures_dir / str(capture_timestamp.year) / f'{capture_timestamp.month:02}' / f'{capture_timestamp.day:02}'
+        dest_dir.mkdir(parents=True, exist_ok=True)
+        # If the HAR isn't archived yet, archive it before copy
+        for har in capture_path.glob('*.har'):
+            with har.open('rb') as f_in:
+                with gzip.open(f'{har}.gz', 'wb') as f_out:
+                    shutil.copyfileobj(f_in, f_out)
+            har.unlink()
+        # read uuid before copying over to (maybe) S3
+        with (capture_path / 'uuid').open() as _uuid:
+            uuid = _uuid.read().strip()
+        (capture_path / 'tree.pickle').unlink(missing_ok=True)
+        (capture_path / 'tree.pickle.gz').unlink(missing_ok=True)
+        shutil.move(str(capture_path), str(dest_dir))
+        # Update index in parent
+        with (dest_dir / 'index').open('a') as _index:
+            index_writer = csv.writer(_index)
+            index_writer.writerow([uuid, capture_path.name])
+
+        # Update redis cache all at once.
+        p = self.redis.pipeline()
+        p.delete(str(capture_path))
+        p.hset('lookup_dirs_archived', mapping={uuid: str(dest_dir / capture_path.name)})
+        p.hdel('lookup_dirs', uuid)
+        p.execute()
+
+        return dest_dir / capture_path.name
+
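A side note on the Redis bookkeeping at the end of __archive_single_capture: the capture's cache key is dropped and its UUID moves from the lookup_dirs hash to lookup_dirs_archived in a single pipeline. The sketch below is illustrative only; the socket path, UUID and paths are made up:

from redis import Redis

r = Redis(unix_socket_path='cache.sock', decode_responses=True)  # illustrative socket path

uuid = '0f6dc57e-0000-0000-0000-000000000000'  # made up
old_path = 'captures/2023/11/15/2023-11-15T15:31:11.123456'
new_path = 'archived_captures/2023/11/15/2023-11-15T15:31:11.123456'

p = r.pipeline()
p.delete(old_path)                                        # per-capture cache key
p.hset('lookup_dirs_archived', mapping={uuid: new_path})  # now served from the archive
p.hdel('lookup_dirs', uuid)                               # no longer a "recent" capture
p.execute()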
     def _archive(self):
         archive_interval = timedelta(days=get_config('generic', 'archive'))
-        cut_time = (datetime.now() - archive_interval).date()
-        cut_time = cut_time.replace(day=1)
+        cut_time = (datetime.now() - archive_interval)
         self.logger.info(f'Archiving all captures older than {cut_time.isoformat()}.')
         archiving_done = True
-        # Format:
-        # { 2020: { 12: [(directory, uuid)] } }
-        to_archive: Dict[int, Dict[int, List[Path]]] = defaultdict(lambda: defaultdict(list))
-        # In order to avoid scanning the complete directory on each run, we check if year and month are
-        # older than the cut time.
-        for index in get_captures_dir().glob('*/*/index'):
-            if self.shutdown_requested():
-                self.logger.warning('Shutdown requested, breaking.')
-                break
-            month = int(index.parent.name)
-            year = int(index.parent.parent.name)
-            if date(year, month, 1) >= cut_time:
-                continue
-            for capture_uuid in index.parent.glob('*/uuid'):
-                try:
-                    timestamp = datetime.strptime(capture_uuid.parent.name, '%Y-%m-%dT%H:%M:%S.%f')
-                except ValueError:
-                    timestamp = datetime.strptime(capture_uuid.parent.name, '%Y-%m-%dT%H:%M:%S')
-                if timestamp.date() >= cut_time:
-                    continue
-                to_archive[timestamp.year][timestamp.month].append(capture_uuid.parent)
-                self.logger.debug(f'Archiving {capture_uuid.parent}.')
-
-        if not to_archive:
-            self.logger.info('Nothing to archive.')
-            return archiving_done
-
-        for year, month_captures in to_archive.items():
-            for month, captures in month_captures.items():
-                dest_dir = self.archived_captures_dir / str(year) / f'{month:02}'
-                dest_dir.mkdir(parents=True, exist_ok=True)
-                capture_breakpoint = 300
-                self.logger.info(f'{len(captures)} captures to archive in {year}-{month}.')
-                for capture_path in captures:
-                    if capture_breakpoint <= 0:
-                        # Break and restart later
-                        self.logger.info(f'Archived many captures in {year}-{month}, will keep going later.')
-                        archiving_done = False
-                        break
-                    elif capture_breakpoint % 10:
-                        # Just check if we requested a shutdown.
-                        if self.shutdown_requested():
-                            self.logger.warning('Shutdown requested, breaking.')
-                            break
-                    lock_file = capture_path / 'lock'
-                    if try_make_file(lock_file):
-                        # Lock created, we can proceede
-                        with lock_file.open('w') as f:
-                            f.write(f"{datetime.now().isoformat()};{os.getpid()}")
-                    else:
-                        # The directory is locked because a pickle is being created, try again later
-                        if is_locked(capture_path):
-                            # call this method to remove dead locks
-                            continue
-                    capture_breakpoint -= 1
-                    # If the HAR isn't archived yet, archive it before copy
-                    for har in capture_path.glob('*.har'):
-                        with har.open('rb') as f_in:
-                            with gzip.open(f'{har}.gz', 'wb') as f_out:
-                                shutil.copyfileobj(f_in, f_out)
-                        har.unlink()
-                    try:
-                        (capture_path / 'tree.pickle').unlink(missing_ok=True)
-                        (capture_path / 'tree.pickle.gz').unlink(missing_ok=True)
-                        shutil.move(str(capture_path), str(dest_dir))
-                        self.redis.delete(str(capture_path))
-                    except OSError as e:
-                        self.logger.warning(f'Unable to archive capture: {e}')
-                    finally:
-                        (dest_dir / capture_path.name / 'lock').unlink(missing_ok=True)
-                # we archived some captures, update relevant index
-                self._update_index(dest_dir, s3fs=self.archive_on_s3fs)
-                if not archiving_done:
-                    break
-            else:
-                break
+        # Let's use the indexes instead of listing directories to find what we want to archive.
+        capture_breakpoint = 300
+        for u, p in self.redis.hscan_iter('lookup_dirs'):
+            uuid = u.decode()
+            path = p.decode()
+            if capture_breakpoint <= 0:
+                # Break and restart later
+                self.logger.info('Archived many captures will keep going later.')
+                archiving_done = False
+                break
+            elif capture_breakpoint % 10:
+                # Just check if we requested a shutdown.
+                if self.shutdown_requested():
+                    self.logger.warning('Shutdown requested, breaking.')
+                    break
+            capture_time_isoformat = os.path.basename(path)
+            if not capture_time_isoformat:
+                continue
+            capture_time = make_ts_from_dirname(capture_time_isoformat)
+            if capture_time >= cut_time:
+                continue
+            # archive the capture.
+            capture_path = Path(path)
+            if not capture_path.exists():
+                if not self.redis.hexists('lookup_dirs_archived', uuid):
+                    self.logger.warning(f'Missing capture directory for {uuid}, unable to archive {capture_path}')
+                continue
+            lock_file = capture_path / 'lock'
+            if try_make_file(lock_file):
+                # Lock created, we can proceede
+                with lock_file.open('w') as f:
+                    f.write(f"{datetime.now().isoformat()};{os.getpid()}")
+            else:
+                # The directory is locked because a pickle is being created, try again later
+                if is_locked(capture_path):
+                    # call this method to remove dead locks
+                    continue
+            try:
+                new_capture_path = self.__archive_single_capture(capture_path)
+                capture_breakpoint -= 1
+            except OSError as e:
+                self.logger.warning(f'Unable to archive capture: {e}')
+            finally:
+                (new_capture_path / 'lock').unlink(missing_ok=True)
 
         if archiving_done:
             self.logger.info('Archiving done.')
         return archiving_done
 
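To spell out the selection step of the new _archive (again illustrative, not taken from the commit): candidates now come from the lookup_dirs hash instead of a directory walk, and the capture directory name, an ISO timestamp, decides whether it is old enough. The socket path and cut-off below are placeholders:

import os
from datetime import datetime, timedelta

from redis import Redis

from lookyloo.helpers import make_ts_from_dirname

r = Redis(unix_socket_path='cache.sock')         # placeholder socket path
cut_time = datetime.now() - timedelta(days=180)  # stands in for the 'archive' setting

for uuid, path in r.hscan_iter('lookup_dirs'):
    dirname = os.path.basename(path.decode())    # e.g. 2023-11-15T15:31:11.123456
    if dirname and make_ts_from_dirname(dirname) < cut_time:
        print(f'{uuid.decode()} is old enough to archive: {path.decode()}')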
-    def _compress_hars(self):
-        """This method is very slow (it checks every single capture for non-compressed HARs)
-        The new approach is to compress the har of every capture by default so this shouldn't be
-        needed anymore. Keeping it here just for reference, or to process old archives that contain
-        non-gziped HARs.
-        """
-        self.logger.info('Compressing archived captures')
-        for index in self.archived_captures_dir.glob('*/*/index'):
-            if self.shutdown_requested():
-                self.logger.warning('Shutdown requested, breaking.')
-                break
-            with index.open('r') as _f:
-                for uuid, dirname in csv.reader(_f):
-                    for har in (index.parent / dirname).glob('*.har'):
-                        with har.open('rb') as f_in:
-                            with gzip.open(f'{har}.gz', 'wb') as f_out:
-                                shutil.copyfileobj(f_in, f_out)
-                        har.unlink()
-        self.logger.info('Archived captures compressed')
+    def __load_index(self, index_path: Path, ignore_sub: bool=False) -> Dict[str, str]:
+        '''Loads the given index file and all the subsequent ones if they exist'''
+        # NOTE: this method is used on recent and archived captures, it must never trigger a dir listing
+        indexed_captures = {}
+        with index_path.open() as _i:
+            for key, path_name in csv.reader(_i):
+                if key == 'sub_index' and not ignore_sub:
+                    sub_index_file = index_path.parent / path_name / 'index'
+                    if sub_index_file.exists():
+                        indexed_captures.update(self.__load_index(sub_index_file))
+                    else:
+                        self.logger.warning(f'Missing sub index file: {sub_index_file}')
+                else:
+                    # NOTE: we were initially checking if that path exists,
+                    # but that's something we can do when we update the indexes instead.
+                    # And a missing capture directory is already handled at rendering
+                    indexed_captures[key] = str(index_path.parent / path_name)
+        return indexed_captures
 
     def _load_indexes(self):
-        # Initialize archives
-        for index in get_captures_dir().glob('*/*/index'):
+        # capture_dir / Year / Month / index <- should always exists. If not, created by _update_index
+
+        # Initialize recent index
+        for index in sorted(get_captures_dir().glob('*/*/index'), reverse=True):
             if self.shutdown_requested():
                 self.logger.warning('Shutdown requested, breaking.')
                 break
             self.logger.info(f'Loading {index}')
-            with index.open('r') as _f:
-                recent_uuids: Mapping = {uuid: str(index.parent / dirname)
-                                         for uuid, dirname in csv.reader(_f)
-                                         if (index.parent / dirname).exists()}
-            if recent_uuids:
-                self.logger.info(f'{len(recent_uuids)} captures in directory.')
-                self.redis.hset('lookup_dirs', mapping=recent_uuids)
+            if recent_uuids := self.__load_index(index):
+                self.logger.debug(f'{len(recent_uuids)} captures in directory {index.parent}.')
+                self.redis.hset('lookup_dirs', mapping=recent_uuids)  # type: ignore
             else:
                 index.unlink()
-        self.logger.info('Recent indexes loaded')
+        total_recent_captures = self.redis.hlen('lookup_dirs')
+        self.logger.info(f'Recent indexes loaded: {total_recent_captures} entries.')
 
-        already_archived_uuids = {k.decode() for k in self.redis.hkeys('lookup_dirs_archived')}
-        self.logger.info(f'Already have {len(already_archived_uuids)} UUIDs archived')
-        # Initialize archives
+        # Initialize archives index
         for index in sorted(self.archived_captures_dir.glob('*/*/index'), reverse=True):
             if self.shutdown_requested():
                 self.logger.warning('Shutdown requested, breaking.')
                 break
             self.logger.debug(f'Loading {index}')
-            with index.open('r') as _f:
-                archived_uuids: Mapping = {uuid: index.parent / dirname
-                                           for uuid, dirname in csv.reader(_f)}
-            if archived_uuids:
-                self.logger.debug(f'{len(archived_uuids)} captures in directory.')
-                new_uuids = set(archived_uuids.keys()) - already_archived_uuids
-                if not new_uuids:
-                    self.logger.debug('No new archived UUID to check.')
-                    continue
-                self.logger.info(f'Loading {index}, {len(archived_uuids)} captures in directory, {len(new_uuids)} archived UUID to check.')
-                # NOTE: Only check if the directory exists if the UUID isn't in the cache.
-                self.redis.hset('lookup_dirs_archived',
-                                mapping={uuid: str(dirname)
-                                         for uuid, dirname in archived_uuids.items()
-                                         if uuid in new_uuids and dirname.exists()})
-                self.redis.hdel('lookup_dirs', *archived_uuids.keys())
+            if archived_uuids := self.__load_index(index):
+                self.logger.debug(f'{len(archived_uuids)} captures in directory {index.parent}.')
+                self.redis.hset('lookup_dirs_archived', mapping=archived_uuids)  # type: ignore
             else:
                 index.unlink()
-        self.logger.info('Archived indexes loaded')
+        total_archived_captures = self.redis.hlen('lookup_dirs_archived')
+        self.logger.info(f'Archived indexes loaded: {total_archived_captures} entries.')
 
 
 def main():


@@ -5,14 +5,14 @@ import logging.config
 import os
 import shutil
 
-from datetime import date
+from datetime import datetime, timedelta
 from pathlib import Path
-from typing import Optional, List
+from typing import Optional
 
 from lookyloo.default import AbstractManager, get_config
 from lookyloo.exceptions import MissingUUID, NoValidHarFile
 from lookyloo.lookyloo import Lookyloo
-from lookyloo.helpers import is_locked
+from lookyloo.helpers import is_locked, get_sorted_captures_from_disk, make_dirs_list
 
 logging.config.dictConfig(get_config('logging'))
@ -34,67 +34,56 @@ class BackgroundIndexer(AbstractManager):
self._check_indexes() self._check_indexes()
self.lookyloo.update_tree_cache_info(os.getpid(), self.script_name) self.lookyloo.update_tree_cache_info(os.getpid(), self.script_name)
def _make_dirs_list(self, root_dir: Path) -> List[Path]:
directories = []
year_now = date.today().year
while True:
year_dir = root_dir / str(year_now)
if not year_dir.exists():
# if we do not have a directory with this year, quit the loop
break
for month in range(12, 0, -1):
month_dir = year_dir / f'{month:02}'
if month_dir.exists():
directories.append(month_dir)
year_now -= 1
return directories
def _build_missing_pickles(self) -> bool: def _build_missing_pickles(self) -> bool:
self.logger.debug('Build missing pickles...') self.logger.debug('Build missing pickles...')
# Sometimes, we have a huge backlog and the process might get stuck on old captures for a very long time # Sometimes, we have a huge backlog and the process might get stuck on old captures for a very long time
# This value makes sure we break out of the loop and build pickles of the most recent captures # This value makes sure we break out of the loop and build pickles of the most recent captures
max_captures = 50 max_captures = 50
got_new_captures = False got_new_captures = False
for month_dir in self._make_dirs_list(self.lookyloo.capture_dir):
for uuid_path in sorted(month_dir.glob('*/uuid'), reverse=True): # Initialize time where we do not want to build the pickles anymore.
if ((uuid_path.parent / 'tree.pickle.gz').exists() or (uuid_path.parent / 'tree.pickle').exists()): archive_interval = timedelta(days=get_config('generic', 'archive'))
cut_time = (datetime.now() - archive_interval)
for month_dir in make_dirs_list(self.lookyloo.capture_dir):
for capture_time, path in get_sorted_captures_from_disk(month_dir, cut_time=cut_time, keep_more_recent=True):
if ((path / 'tree.pickle.gz').exists() or (path / 'tree.pickle').exists()):
# We already have a pickle file # We already have a pickle file
self.logger.debug(f'{uuid_path.parent} has a pickle.') self.logger.debug(f'{path} has a pickle.')
continue continue
if not list(uuid_path.parent.rglob('*.har.gz')) and not list(uuid_path.parent.rglob('*.har')): if not list(path.rglob('*.har.gz')) and not list(path.rglob('*.har')):
# No HAR file # No HAR file
self.logger.debug(f'{uuid_path.parent} has no HAR file.') self.logger.debug(f'{path} has no HAR file.')
continue continue
if is_locked(uuid_path.parent): if is_locked(path):
# it is really locked # it is really locked
self.logger.debug(f'{uuid_path.parent} is locked, pickle generated by another process.') self.logger.debug(f'{path} is locked, pickle generated by another process.')
continue continue
with uuid_path.open() as f: with (path / 'uuid').open() as f:
uuid = f.read() uuid = f.read()
if not self.lookyloo.redis.hexists('lookup_dirs', uuid): if not self.lookyloo.redis.hexists('lookup_dirs', uuid):
# The capture with this UUID exists, but it is for some reason missing in lookup_dirs # The capture with this UUID exists, but it is for some reason missing in lookup_dirs
self.lookyloo.redis.hset('lookup_dirs', uuid, str(uuid_path.parent)) self.lookyloo.redis.hset('lookup_dirs', uuid, str(path))
else: else:
cached_path = Path(self.lookyloo.redis.hget('lookup_dirs', uuid)) cached_path = Path(self.lookyloo.redis.hget('lookup_dirs', uuid))
if cached_path != uuid_path.parent: if cached_path != path:
# we have a duplicate UUID, it is proably related to some bad copy/paste # we have a duplicate UUID, it is proably related to some bad copy/paste
if cached_path.exists(): if cached_path.exists():
# Both paths exist, move the one that isn't in lookup_dirs # Both paths exist, move the one that isn't in lookup_dirs
self.logger.critical(f'Duplicate UUID for {uuid} in {cached_path} and {uuid_path.parent}, discarding the latest') self.logger.critical(f'Duplicate UUID for {uuid} in {cached_path} and {path}, discarding the latest')
try: try:
shutil.move(str(uuid_path.parent), str(self.discarded_captures_dir / uuid_path.parent.name)) shutil.move(str(path), str(self.discarded_captures_dir / path.name))
except FileNotFoundError as e: except FileNotFoundError as e:
self.logger.warning(f'Unable to move capture: {e}') self.logger.warning(f'Unable to move capture: {e}')
continue continue
else: else:
# The path in lookup_dirs for that UUID doesn't exists, just update it. # The path in lookup_dirs for that UUID doesn't exists, just update it.
self.lookyloo.redis.hset('lookup_dirs', uuid, str(uuid_path.parent)) self.lookyloo.redis.hset('lookup_dirs', uuid, str(path))
try: try:
self.logger.info(f'Build pickle for {uuid}: {uuid_path.parent.name}') self.logger.info(f'Build pickle for {uuid}: {path.name}')
self.lookyloo.get_crawled_tree(uuid) self.lookyloo.get_crawled_tree(uuid)
self.lookyloo.trigger_modules(uuid, auto_trigger=True) self.lookyloo.trigger_modules(uuid, auto_trigger=True)
self.logger.info(f'Pickle for {uuid} build.') self.logger.info(f'Pickle for {uuid} build.')
@ -103,14 +92,14 @@ class BackgroundIndexer(AbstractManager):
except MissingUUID: except MissingUUID:
self.logger.warning(f'Unable to find {uuid}. That should not happen.') self.logger.warning(f'Unable to find {uuid}. That should not happen.')
except NoValidHarFile as e: except NoValidHarFile as e:
self.logger.critical(f'There are no HAR files in the capture {uuid}: {uuid_path.parent.name} - {e}') self.logger.critical(f'There are no HAR files in the capture {uuid}: {path.name} - {e}')
except FileNotFoundError: except FileNotFoundError:
self.logger.warning(f'Capture {uuid} disappeared during processing, probably archived.') self.logger.warning(f'Capture {uuid} disappeared during processing, probably archived.')
except Exception: except Exception:
self.logger.exception(f'Unable to build pickle for {uuid}: {uuid_path.parent.name}') self.logger.exception(f'Unable to build pickle for {uuid}: {path.name}')
# The capture is not working, moving it away. # The capture is not working, moving it away.
try: try:
shutil.move(str(uuid_path.parent), str(self.discarded_captures_dir / uuid_path.parent.name)) shutil.move(str(path), str(self.discarded_captures_dir / path.name))
self.lookyloo.redis.hdel('lookup_dirs', uuid) self.lookyloo.redis.hdel('lookup_dirs', uuid)
except FileNotFoundError as e: except FileNotFoundError as e:
self.logger.warning(f'Unable to move capture: {e}') self.logger.warning(f'Unable to move capture: {e}')


@@ -5,12 +5,12 @@ import logging
 import os
 import time
 
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, date
 from functools import lru_cache
 from importlib.metadata import version
 from io import BufferedIOBase
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Set, Union
+from typing import Any, Dict, List, Optional, Set, Union, Tuple
 from urllib.parse import urlparse
@@ -76,6 +76,60 @@ def get_email_template() -> str:
         return f.read()
 
 
+def make_dirs_list(root_dir: Path) -> List[Path]:
+    directories = []
+    year_now = date.today().year
+    while True:
+        year_dir = root_dir / str(year_now)
+        if not year_dir.exists():
+            # if we do not have a directory with this year, quit the loop
+            break
+        for month in range(12, 0, -1):
+            month_dir = year_dir / f'{month:02}'
+            if month_dir.exists():
+                directories.append(month_dir)
+        year_now -= 1
+    return directories
+
+
+@lru_cache
+def make_ts_from_dirname(dirname: str) -> datetime:
+    try:
+        return datetime.strptime(dirname, '%Y-%m-%dT%H:%M:%S.%f')
+    except ValueError:
+        return datetime.strptime(dirname, '%Y-%m-%dT%H:%M:%S')
+
+
+def get_sorted_captures_from_disk(captures_dir: Path, /, *,
+                                  cut_time: Optional[Union[datetime, date]]=None,
+                                  keep_more_recent: bool=True) -> List[Tuple[datetime, Path]]:
+    '''Recursively gets all the captures present in a specific directory, doesn't use the indexes.
+
+    NOTE: this method should never be used on archived captures as it's going to take forever on S3
+    '''
+    all_paths: List[Tuple[datetime, Path]] = []
+    for entry in captures_dir.iterdir():
+        if not entry.is_dir():
+            # index file
+            continue
+        if entry.name.isdigit():
+            # sub directory
+            all_paths += get_sorted_captures_from_disk(entry, cut_time=cut_time, keep_more_recent=keep_more_recent)
+        else:
+            # capture directory
+            capture_time = make_ts_from_dirname(entry.name)
+            if cut_time:
+                if keep_more_recent and capture_time >= cut_time:
+                    all_paths.append((capture_time, entry))
+                elif capture_time < cut_time:
+                    # keep only older
+                    all_paths.append((capture_time, entry))
+            else:
+                all_paths.append((capture_time, entry))
+    return sorted(all_paths)
+
+
 class UserAgents:
 
     def __init__(self):
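A quick usage sketch for the three helpers added above; the daemons call them with get_captures_dir() or lookyloo.capture_dir, and the local root below is made up:

from datetime import datetime, timedelta
from pathlib import Path

from lookyloo.helpers import (get_sorted_captures_from_disk, make_dirs_list,
                              make_ts_from_dirname)

captures_root = Path('captures')  # made-up local root
cut_time = datetime.now() - timedelta(days=180)

# Most recent year/month directories first, e.g. [captures/2023/12, captures/2023/11, ...]
for month_dir in make_dirs_list(captures_root):
    # Captures newer than the cut-off, sorted oldest to newest; day subdirectories
    # are followed recursively and the month-level 'index' file is skipped.
    for capture_time, capture_path in get_sorted_captures_from_disk(
            month_dir, cut_time=cut_time, keep_more_recent=True):
        print(capture_time, capture_path)

# A capture directory is named after its start time in ISO format:
print(make_ts_from_dirname('2023-11-15T15:31:11.123456'))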


@@ -1497,7 +1497,7 @@ class Lookyloo():
                     ) -> None:
         now = datetime.now()
-        dirpath = self.capture_dir / str(now.year) / f'{now.month:02}' / now.isoformat()
+        dirpath = self.capture_dir / str(now.year) / f'{now.month:02}' / f'{now.day:02}' / now.isoformat()
         safe_create_dir(dirpath)
         if os or browser:
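The net effect of the path change in this last hunk, sketched for a capture started at a fixed time (the root directory name is illustrative):

from datetime import datetime
from pathlib import Path

capture_dir = Path('captures')  # stands in for Lookyloo's configured capture root
now = datetime(2023, 11, 15, 15, 31, 11)

old_layout = capture_dir / str(now.year) / f'{now.month:02}' / now.isoformat()
new_layout = capture_dir / str(now.year) / f'{now.month:02}' / f'{now.day:02}' / now.isoformat()

print(old_layout)  # captures/2023/11/2023-11-15T15:31:11
print(new_layout)  # captures/2023/11/15/2023-11-15T15:31:11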