chg: remove incomplete_redirects from cache, was always 0/False

pull/746/head
Raphaël Vinot 2023-07-28 14:05:28 +02:00
parent ea2ded9beb
commit c0f601e5db
5 changed files with 13 additions and 27 deletions

View File

@ -44,7 +44,7 @@ class LookylooCacheLogAdapter(LoggerAdapter):
class CaptureCache(): class CaptureCache():
__slots__ = ('uuid', 'title', 'timestamp', 'url', 'redirects', 'capture_dir', __slots__ = ('uuid', 'title', 'timestamp', 'url', 'redirects', 'capture_dir',
'error', 'incomplete_redirects', 'no_index', 'categories', 'parent', 'error', 'no_index', 'categories', 'parent',
'user_agent', 'referer', 'logger') 'user_agent', 'referer', 'logger')
def __init__(self, cache_entry: Dict[str, Any]): def __init__(self, cache_entry: Dict[str, Any]):
@ -87,7 +87,6 @@ class CaptureCache():
# Error without all the keys in __default_cache_keys was fatal. # Error without all the keys in __default_cache_keys was fatal.
# if the keys in __default_cache_keys are present, it was an HTTP error and we still need to pass the error along # if the keys in __default_cache_keys are present, it was an HTTP error and we still need to pass the error along
self.error: Optional[str] = cache_entry.get('error') self.error: Optional[str] = cache_entry.get('error')
self.incomplete_redirects: bool = True if cache_entry.get('incomplete_redirects') in [1, '1'] else False
self.no_index: bool = True if cache_entry.get('no_index') in [1, '1'] else False self.no_index: bool = True if cache_entry.get('no_index') in [1, '1'] else False
self.categories: List[str] = json.loads(cache_entry['categories']) if cache_entry.get('categories') else [] self.categories: List[str] = json.loads(cache_entry['categories']) if cache_entry.get('categories') else []
self.parent: Optional[str] = cache_entry.get('parent') self.parent: Optional[str] = cache_entry.get('parent')
@ -181,8 +180,7 @@ class CapturesIndex(Mapping):
def __getitem__(self, uuid: str) -> CaptureCache: def __getitem__(self, uuid: str) -> CaptureCache:
if uuid in self.__cache: if uuid in self.__cache:
if (self.__cache[uuid].capture_dir.exists() if self.__cache[uuid].capture_dir.exists():
and not self.__cache[uuid].incomplete_redirects):
return self.__cache[uuid] return self.__cache[uuid]
del self.__cache[uuid] del self.__cache[uuid]
capture_dir = self._get_capture_dir(uuid) capture_dir = self._get_capture_dir(uuid)
@ -192,11 +190,9 @@ class CapturesIndex(Mapping):
# NOTE: checking for pickle to exist may be a bad idea here. # NOTE: checking for pickle to exist may be a bad idea here.
if (cc.capture_dir.exists() if (cc.capture_dir.exists()
and ((cc.capture_dir / 'tree.pickle.gz').exists() and ((cc.capture_dir / 'tree.pickle.gz').exists()
or (cc.capture_dir / 'tree.pickle').exists()) or (cc.capture_dir / 'tree.pickle').exists())):
and not cc.incomplete_redirects):
self.__cache[uuid] = cc self.__cache[uuid] = cc
return self.__cache[uuid] return self.__cache[uuid]
self.__cache[uuid] = self._set_capture_cache(capture_dir) self.__cache[uuid] = self._set_capture_cache(capture_dir)
return self.__cache[uuid] return self.__cache[uuid]
@ -398,7 +394,6 @@ class CapturesIndex(Mapping):
cache['title'] = har.initial_title cache['title'] = har.initial_title
cache['timestamp'] = har.initial_start_time cache['timestamp'] = har.initial_start_time
cache['redirects'] = json.dumps(tree.redirects) if tree else '' cache['redirects'] = json.dumps(tree.redirects) if tree else ''
cache['incomplete_redirects'] = 0
cache['user_agent'] = har.root_user_agent if har.root_user_agent else 'No User Agent.' cache['user_agent'] = har.root_user_agent if har.root_user_agent else 'No User Agent.'
if 'url' not in cache: if 'url' not in cache:
# if all went well, we already filled that one above. # if all went well, we already filled that one above.

View File

@ -484,7 +484,9 @@ class Lookyloo():
# Do not try to build pickles # Do not try to build pickles
capture_uuids = set(capture_uuids) & self._captures_index.cached_captures capture_uuids = set(capture_uuids) & self._captures_index.cached_captures
all_cache: List[CaptureCache] = [self._captures_index[uuid] for uuid in capture_uuids if self.capture_cache(uuid) and hasattr(self._captures_index[uuid], 'timestamp')] all_cache: List[CaptureCache] = [self._captures_index[uuid] for uuid in capture_uuids
if self.capture_cache(uuid)
and hasattr(self._captures_index[uuid], 'timestamp')]
all_cache.sort(key=operator.attrgetter('timestamp'), reverse=True) all_cache.sort(key=operator.attrgetter('timestamp'), reverse=True)
return all_cache return all_cache
@ -1010,7 +1012,8 @@ class Lookyloo():
If a URL is given, it splits the results if the hash is seen on the same URL or an other one. If a URL is given, it splits the results if the hash is seen on the same URL or an other one.
Capture UUID avoids duplicates on the same capture''' Capture UUID avoids duplicates on the same capture'''
captures_list: Dict[str, List[Tuple[str, str, str, str, str]]] = {'same_url': [], 'different_url': []} captures_list: Dict[str, List[Tuple[str, str, str, str, str]]] = {'same_url': [], 'different_url': []}
total_captures, details = self.indexing.get_body_hash_captures(blob_hash, url, filter_capture_uuid=capture_uuid, limit=-1, prefered_uuids=self._captures_index.cached_captures) total_captures, details = self.indexing.get_body_hash_captures(blob_hash, url, filter_capture_uuid=capture_uuid, limit=-1,
prefered_uuids=self._captures_index.cached_captures)
for h_capture_uuid, url_uuid, url_hostname, same_url in details: for h_capture_uuid, url_uuid, url_hostname, same_url in details:
cache = self.capture_cache(h_capture_uuid) cache = self.capture_cache(h_capture_uuid)
if cache and hasattr(cache, 'title'): if cache and hasattr(cache, 'title'):

View File

@ -866,7 +866,7 @@ def index_generic(show_hidden: bool=False, show_error: bool=True, category: Opti
continue continue
titles.append((cached.uuid, cached.title, cached.timestamp.isoformat(), cached.url, titles.append((cached.uuid, cached.title, cached.timestamp.isoformat(), cached.url,
cached.redirects, cached.incomplete_redirects)) cached.redirects))
titles = sorted(titles, key=lambda x: (x[2], x[3]), reverse=True) titles = sorted(titles, key=lambda x: (x[2], x[3]), reverse=True)
return render_template('index.html', titles=titles, public_domain=lookyloo.public_domain, return render_template('index.html', titles=titles, public_domain=lookyloo.public_domain,
show_project_page=get_config('generic', 'show_project_page'), show_project_page=get_config('generic', 'show_project_page'),

View File

@ -143,18 +143,10 @@ class CaptureRedirects(Resource):
to_return: Dict[str, Any] = {} to_return: Dict[str, Any] = {}
try: try:
to_return = {'response': {'url': cache.url, 'redirects': []}} to_return = {'response': {'url': cache.url,
'redirects': cache.redirects if cache.redirects else []}}
if not cache.redirects: if not cache.redirects:
to_return['response']['info'] = 'No redirects' to_return['response']['info'] = 'No redirects'
return to_return
if cache.incomplete_redirects:
# Trigger tree build, get all redirects
lookyloo.get_crawled_tree(capture_uuid)
cache = lookyloo.capture_cache(capture_uuid)
if cache:
to_return['response']['redirects'] = cache.redirects
else:
to_return['response']['redirects'] = cache.redirects
except Exception as e: except Exception as e:
if cache and hasattr(cache, 'error'): if cache and hasattr(cache, 'error'):
to_return['error'] = cache.error to_return['error'] = cache.error

View File

@ -97,7 +97,7 @@ $(document).ready(function () {
</tr> </tr>
</thead> </thead>
<tbody> <tbody>
{% for uuid, page_title, datetime, url, redirects, incomplete_redirects in titles %} {% for uuid, page_title, datetime, url, redirects in titles %}
<tr> <tr>
<td data-search="{{ page_title }} {{ url }}"> <td data-search="{{ page_title }} {{ url }}">
<p title="{{ page_title }}"><a href="{{ url_for('tree', tree_uuid=uuid) }}">{{ page_title }}</a></p> <p title="{{ page_title }}"><a href="{{ url_for('tree', tree_uuid=uuid) }}">{{ page_title }}</a></p>
@ -115,11 +115,7 @@ $(document).ready(function () {
{%endif%} {%endif%}
</p> </p>
{% endfor %} {% endfor %}
{% if incomplete_redirects %} <a style="float: right;" href="{{ url_for('redirects', tree_uuid=uuid) }}">Download redirects</a>
<a style="float: right;" href="{{ url_for('cache_tree', tree_uuid=uuid) }}">Unable to find the redirects, click here to build the tree</a>
{%else%}
<a style="float: right;" href="{{ url_for('redirects', tree_uuid=uuid) }}">Download redirects</a>
{%endif%}
{% else%} {% else%}
No redirect No redirect
{%endif%} {%endif%}