chg: More cleanup

pull/526/head
Raphaël Vinot 2022-09-27 02:39:10 +02:00
parent f886b8676b
commit 33d30a3f4c
2 changed files with 16 additions and 23 deletions


@@ -247,13 +247,13 @@ class CapturesIndex(Mapping):
         try:
             tree = load_pickle_tree(capture_dir, capture_dir.stat().st_mtime)
         except NoValidHarFile:
-            self.logger.warning('Unable to rebuild the tree, the HAR files are broken.')
+            self.logger.debug('Unable to rebuild the tree, the HAR files are broken.')
         except TreeNeedsRebuild:
             try:
                 tree = self._create_pickle(capture_dir)
                 self.indexing.new_internal_uuids(tree)
             except NoValidHarFile:
-                self.logger.warning('Unable to rebuild the tree, the HAR files are broken.')
+                self.logger.info('Unable to rebuild the tree, the HAR files are broken.')

         cache: Dict[str, Union[str, int]] = {'uuid': uuid, 'capture_dir': capture_dir_str}
         if (capture_dir / 'error.txt').exists():
@@ -285,8 +285,9 @@ class CapturesIndex(Mapping):
         if (cache.get('error')
                 and isinstance(cache['error'], str)
-                and 'HTTP Error' not in cache['error']):
-            self.logger.warning(cache['error'])
+                and 'HTTP Error' not in cache['error']
+                and "No har files in" not in cache['error']):
+            self.logger.info(cache['error'])

         if (capture_dir / 'categories').exists():
             with (capture_dir / 'categories').open() as _categories:
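
For readers without the surrounding file open, here is a minimal, self-contained sketch of the cache-loading pattern the two hunks above adjust. Only load_pickle_tree, _create_pickle, NoValidHarFile, TreeNeedsRebuild, and the log messages come from the diff; the stub bodies and the standalone logger are illustrative assumptions, not the project's actual code.

#!/usr/bin/env python3
import logging
from pathlib import Path

logger = logging.getLogger('capture-cache-sketch')


class NoValidHarFile(Exception):
    """Stub for the exception raised when the HAR files of a capture are unusable."""


class TreeNeedsRebuild(Exception):
    """Stub for the exception raised when the pickled tree is stale or missing."""


def load_pickle_tree(capture_dir: Path, mtime: float):
    # Placeholder for the real loader referenced in the diff.
    raise TreeNeedsRebuild()


def _create_pickle(capture_dir: Path):
    # Placeholder for the rebuild step; it can itself fail on broken HAR files.
    raise NoValidHarFile()


def load_or_rebuild_tree(capture_dir: Path):
    tree = None
    try:
        tree = load_pickle_tree(capture_dir, capture_dir.stat().st_mtime)
    except NoValidHarFile:
        # Log level lowered from warning to debug by this commit.
        logger.debug('Unable to rebuild the tree, the HAR files are broken.')
    except TreeNeedsRebuild:
        try:
            tree = _create_pickle(capture_dir)
        except NoValidHarFile:
            # Log level lowered from warning to info by this commit.
            logger.info('Unable to rebuild the tree, the HAR files are broken.')
    return tree

The second hunk follows the same direction for cached capture errors: errors containing 'HTTP Error' or 'No har files in' are no longer logged at all, and the remaining ones are logged at info level rather than warning.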


@@ -1,11 +1,9 @@
 #!/usr/bin/env python3
 import base64
-import hashlib
 import json
 import logging
 import operator
-import pickle
 import smtplib
 from collections import defaultdict
@@ -462,15 +460,8 @@ class Lookyloo():
                 query[key] = json.dumps(value) if value else None
         query = self._prepare_lacus_query(query)
-        # dirty deduplicate
-        hash_query = hashlib.sha512(pickle.dumps(query)).hexdigest()
-        # FIXME The line below should work, but it doesn't
-        # if (existing_uuid := self.redis.set(f'query_hash:{hash_query}', temp_uuid, get=True, nx=True, ex=300)):
-        if (existing_uuid := self.redis.get(f'query_hash:{hash_query}')):
-            return existing_uuid
         priority = get_priority(source, user, authenticated)
         perma_uuid = self.lacus.enqueue(
             url=query.pop('url', None),
             document_name=query.pop('document_name', None),
@@ -492,6 +483,7 @@ class Lookyloo():
             priority=priority
         )
         if self.redis.zscore('to_capture', perma_uuid) is None:
+            if priority < -10:
                 # Someone is probably abusing the system with useless URLs, remove them from the index
                 query['listing'] = 0
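
The block removed from the second file was a best-effort deduplication of identical capture queries: a sha512 over the pickled query, looked up in Redis, with a FIXME noting that an atomic SET with get=True and nx=True "should work, but it doesn't". For reference, a minimal sketch of that atomic variant with redis-py follows; the function name, the Redis() connection, and the ttl default are assumptions for illustration. One plausible reason the commented-out line failed is that combining NX and GET in a single SET is only accepted by Redis 7.0 and later; the commit does not say, and it simply drops the deduplication instead, which also lines up with the hashlib and pickle imports removed at the top of the file.

import hashlib
import pickle

from redis import Redis  # assumes a redis-py version exposing the get= keyword on set()

redis_client = Redis(decode_responses=True)  # illustrative connection, not the project's


def deduplicate_capture(query: dict, temp_uuid: str, ttl: int = 300) -> str:
    """Return the UUID of an identical query seen within the last ttl seconds,
    or register temp_uuid as the UUID for this query fingerprint."""
    # Same fingerprint as the removed code: sha512 over the pickled query.
    hash_query = hashlib.sha512(pickle.dumps(query)).hexdigest()
    # SET ... NX GET: store temp_uuid only if the key is absent and return the
    # previous value in the same round trip (requires Redis >= 7.0).
    existing_uuid = redis_client.set(f'query_hash:{hash_query}', temp_uuid,
                                     nx=True, get=True, ex=ttl)
    return existing_uuid if existing_uuid is not None else temp_uuid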