From 33d30a3f4cef63cee59326da85b2c6cbc097ee0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?= Date: Tue, 27 Sep 2022 02:39:10 +0200 Subject: [PATCH] chg: More cleanup --- lookyloo/capturecache.py | 9 +++++---- lookyloo/lookyloo.py | 30 +++++++++++------------------- 2 files changed, 16 insertions(+), 23 deletions(-) diff --git a/lookyloo/capturecache.py b/lookyloo/capturecache.py index ba76a80f..bcc6d032 100644 --- a/lookyloo/capturecache.py +++ b/lookyloo/capturecache.py @@ -247,13 +247,13 @@ class CapturesIndex(Mapping): try: tree = load_pickle_tree(capture_dir, capture_dir.stat().st_mtime) except NoValidHarFile: - self.logger.warning('Unable to rebuild the tree, the HAR files are broken.') + self.logger.debug('Unable to rebuild the tree, the HAR files are broken.') except TreeNeedsRebuild: try: tree = self._create_pickle(capture_dir) self.indexing.new_internal_uuids(tree) except NoValidHarFile: - self.logger.warning('Unable to rebuild the tree, the HAR files are broken.') + self.logger.info('Unable to rebuild the tree, the HAR files are broken.') cache: Dict[str, Union[str, int]] = {'uuid': uuid, 'capture_dir': capture_dir_str} if (capture_dir / 'error.txt').exists(): @@ -285,8 +285,9 @@ class CapturesIndex(Mapping): if (cache.get('error') and isinstance(cache['error'], str) - and 'HTTP Error' not in cache['error']): - self.logger.warning(cache['error']) + and 'HTTP Error' not in cache['error'] + and "No har files in" not in cache['error']): + self.logger.info(cache['error']) if (capture_dir / 'categories').exists(): with (capture_dir / 'categories').open() as _categories: diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py index 6bab83b1..8ed3739a 100644 --- a/lookyloo/lookyloo.py +++ b/lookyloo/lookyloo.py @@ -1,11 +1,9 @@ #!/usr/bin/env python3 import base64 -import hashlib import json import logging import operator -import pickle import smtplib from collections import defaultdict @@ -462,15 +460,8 @@ class Lookyloo(): query[key] = json.dumps(value) if value else None query = self._prepare_lacus_query(query) - # dirty deduplicate - hash_query = hashlib.sha512(pickle.dumps(query)).hexdigest() - # FIXME The line below should work, but it doesn't - # if (existing_uuid := self.redis.set(f'query_hash:{hash_query}', temp_uuid, get=True, nx=True, ex=300)): - if (existing_uuid := self.redis.get(f'query_hash:{hash_query}')): - return existing_uuid priority = get_priority(source, user, authenticated) - perma_uuid = self.lacus.enqueue( url=query.pop('url', None), document_name=query.pop('document_name', None), @@ -492,17 +483,18 @@ class Lookyloo(): priority=priority ) - if priority < -10: - # Someone is probably abusing the system with useless URLs, remove them from the index - query['listing'] = 0 + if self.redis.zscore('to_capture', perma_uuid) is None: + if priority < -10: + # Someone is probably abusing the system with useless URLs, remove them from the index + query['listing'] = 0 - p = self.redis.pipeline() - p.zadd('to_capture', {perma_uuid: priority}) - if query: - p.hset(perma_uuid, mapping=query) # This will add the remaining entries that are lookyloo specific - p.zincrby('queues', 1, f'{source}|{authenticated}|{user}') - p.set(f'{perma_uuid}_mgmt', f'{source}|{authenticated}|{user}') - p.execute() + p = self.redis.pipeline() + p.zadd('to_capture', {perma_uuid: priority}) + if query: + p.hset(perma_uuid, mapping=query) # This will add the remaining entries that are lookyloo specific + p.zincrby('queues', 1, f'{source}|{authenticated}|{user}') + p.set(f'{perma_uuid}_mgmt', f'{source}|{authenticated}|{user}') + p.execute() return perma_uuid def send_mail(self, capture_uuid: str, /, email: str='', comment: str='') -> None: