chg: More cleanup

pull/526/head
Raphaël Vinot 2022-09-27 02:39:10 +02:00
parent f886b8676b
commit 33d30a3f4c
2 changed files with 16 additions and 23 deletions

View File

@@ -247,13 +247,13 @@ class CapturesIndex(Mapping):
try: try:
tree = load_pickle_tree(capture_dir, capture_dir.stat().st_mtime) tree = load_pickle_tree(capture_dir, capture_dir.stat().st_mtime)
except NoValidHarFile: except NoValidHarFile:
self.logger.warning('Unable to rebuild the tree, the HAR files are broken.') self.logger.debug('Unable to rebuild the tree, the HAR files are broken.')
except TreeNeedsRebuild: except TreeNeedsRebuild:
try: try:
tree = self._create_pickle(capture_dir) tree = self._create_pickle(capture_dir)
self.indexing.new_internal_uuids(tree) self.indexing.new_internal_uuids(tree)
except NoValidHarFile: except NoValidHarFile:
self.logger.warning('Unable to rebuild the tree, the HAR files are broken.') self.logger.info('Unable to rebuild the tree, the HAR files are broken.')
cache: Dict[str, Union[str, int]] = {'uuid': uuid, 'capture_dir': capture_dir_str} cache: Dict[str, Union[str, int]] = {'uuid': uuid, 'capture_dir': capture_dir_str}
if (capture_dir / 'error.txt').exists(): if (capture_dir / 'error.txt').exists():
@@ -285,8 +285,9 @@ class CapturesIndex(Mapping):
if (cache.get('error') if (cache.get('error')
and isinstance(cache['error'], str) and isinstance(cache['error'], str)
and 'HTTP Error' not in cache['error']): and 'HTTP Error' not in cache['error']
self.logger.warning(cache['error']) and "No har files in" not in cache['error']):
self.logger.info(cache['error'])
if (capture_dir / 'categories').exists(): if (capture_dir / 'categories').exists():
with (capture_dir / 'categories').open() as _categories: with (capture_dir / 'categories').open() as _categories:

View File

@@ -1,11 +1,9 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import base64 import base64
import hashlib
import json import json
import logging import logging
import operator import operator
import pickle
import smtplib import smtplib
from collections import defaultdict from collections import defaultdict
@@ -462,15 +460,8 @@ class Lookyloo():
query[key] = json.dumps(value) if value else None query[key] = json.dumps(value) if value else None
query = self._prepare_lacus_query(query) query = self._prepare_lacus_query(query)
# dirty deduplicate
hash_query = hashlib.sha512(pickle.dumps(query)).hexdigest()
# FIXME The line below should work, but it doesn't
# if (existing_uuid := self.redis.set(f'query_hash:{hash_query}', temp_uuid, get=True, nx=True, ex=300)):
if (existing_uuid := self.redis.get(f'query_hash:{hash_query}')):
return existing_uuid
priority = get_priority(source, user, authenticated) priority = get_priority(source, user, authenticated)
perma_uuid = self.lacus.enqueue( perma_uuid = self.lacus.enqueue(
url=query.pop('url', None), url=query.pop('url', None),
document_name=query.pop('document_name', None), document_name=query.pop('document_name', None),
@@ -492,17 +483,18 @@ class Lookyloo():
priority=priority priority=priority
) )
if priority < -10: if self.redis.zscore('to_capture', perma_uuid) is None:
# Someone is probably abusing the system with useless URLs, remove them from the index if priority < -10:
query['listing'] = 0 # Someone is probably abusing the system with useless URLs, remove them from the index
query['listing'] = 0
p = self.redis.pipeline() p = self.redis.pipeline()
p.zadd('to_capture', {perma_uuid: priority}) p.zadd('to_capture', {perma_uuid: priority})
if query: if query:
p.hset(perma_uuid, mapping=query) # This will add the remaining entries that are lookyloo specific p.hset(perma_uuid, mapping=query) # This will add the remaining entries that are lookyloo specific
p.zincrby('queues', 1, f'{source}|{authenticated}|{user}') p.zincrby('queues', 1, f'{source}|{authenticated}|{user}')
p.set(f'{perma_uuid}_mgmt', f'{source}|{authenticated}|{user}') p.set(f'{perma_uuid}_mgmt', f'{source}|{authenticated}|{user}')
p.execute() p.execute()
return perma_uuid return perma_uuid
def send_mail(self, capture_uuid: str, /, email: str='', comment: str='') -> None: def send_mail(self, capture_uuid: str, /, email: str='', comment: str='') -> None: