new: Integrate categorization in indexing

pull/124/head
Raphaël Vinot 2020-11-09 16:02:54 +01:00
parent 9243f84295
commit 354f269218
6 changed files with 115 additions and 16 deletions


@@ -9,7 +9,3 @@ updates:
     directory: "/"
     schedule:
       interval: "daily"
-  - package-ecosystem: "pip"
-    directory: "/client/"
-    schedule:
-      interval: "daily"


@@ -41,6 +41,8 @@ def main():
        indexing.index_cookies_capture(tree)
        indexing.index_body_hashes_capture(tree)
        indexing.index_url_capture(tree)
+       categories = list(lookyloo.categories_capture(capture_uuid).keys())
+       indexing.index_categories_capture(capture_uuid, categories)

 if __name__ == '__main__':


@@ -3,7 +3,7 @@
 import hashlib

 from urllib.parse import urlsplit
-from typing import List, Tuple, Set, Dict, Optional
+from typing import List, Tuple, Set, Dict, Optional, Iterable
 from collections import defaultdict

 from redis import Redis
@@ -192,3 +192,29 @@ class Indexing():
     def get_captures_hostname(self, hostname: str) -> Set[str]:
         return self.redis.smembers(f'hostnames|{hostname}|captures')  # type: ignore
+
+    # ###### Categories ######
+
+    @property
+    def categories(self) -> List[Tuple[str, int]]:
+        return [(c, int(score))
+                for c, score in self.redis.zrevrange('categories', 0, 200, withscores=True)]
+
+    def index_categories_capture(self, capture_uuid: str, categories: Iterable[str]):
+        if not categories:
+            return
+        print(capture_uuid, categories)
+        if self.redis.sismember('indexed_categories', capture_uuid):
+            # do not reindex
+            return
+        self.redis.sadd('indexed_categories', capture_uuid)
+        if not categories:
+            return
+        pipeline = self.redis.pipeline()
+        for category in categories:
+            pipeline.zincrby('categories', 1, category)
+            pipeline.sadd(category, capture_uuid)
+        pipeline.execute()
+
+    def get_captures_category(self, category: str) -> Set[str]:
+        return self.redis.smembers(category)  # type: ignore
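
For readers following the data model: the hunk above keeps everything in Redis as a sorted set named `categories` (score = number of captures carrying that category) plus one plain set per category holding capture UUIDs, guarded by an `indexed_categories` set to avoid re-indexing a capture. A minimal sketch of how that layout can be queried, mirroring the `categories` property and `get_captures_category` above (the Redis connection parameters and the category name are assumptions):

from redis import Redis

# Sketch only: connection parameters are assumptions, not Lookyloo's actual config.
r = Redis(host='localhost', port=6379, decode_responses=True)

# Most frequent categories: 'categories' is a sorted set whose score is bumped
# once per capture tagged with that category (see zincrby in the hunk above).
top_categories = [(category, int(score))
                  for category, score in r.zrevrange('categories', 0, 10, withscores=True)]

# All capture UUIDs tagged with one category: each category name is its own set.
capture_uuids = r.smembers('phishing')  # 'phishing' is a hypothetical category name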


@@ -133,6 +133,8 @@ class Lookyloo():
             self.indexing.index_cookies_capture(ct)
             self.indexing.index_body_hashes_capture(ct)
             self.indexing.index_url_capture(ct)
+            categories = list(self.categories_capture(capture_uuid).keys())
+            self.indexing.index_categories_capture(capture_uuid, categories)
         except Har2TreeError as e:
             raise NoValidHarFile(e.message)
@ -272,7 +274,7 @@ class Lookyloo():
return {} return {}
return ct.root_hartree.stats return ct.root_hartree.stats
def categories_capture(self, capture_uuid: str): def categories_capture(self, capture_uuid: str) -> Dict[str, Any]:
capture_dir = self.lookup_capture_dir(capture_uuid) capture_dir = self.lookup_capture_dir(capture_uuid)
if not capture_dir: if not capture_dir:
raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache') raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
@@ -280,11 +282,10 @@ class Lookyloo():
         if (capture_dir / 'categories').exists():
             with (capture_dir / 'categories').open() as f:
                 current_categories = [line.strip() for line in f.readlines()]
-        else:
-            current_categories = []
-        return {e: self.taxonomies.revert_machinetag(e) for e in current_categories}
+            return {e: self.taxonomies.revert_machinetag(e) for e in current_categories}
+        return {}

-    def categorize_capture(self, capture_uuid: str, category: str):
+    def categorize_capture(self, capture_uuid: str, category: str) -> None:
         if not get_config('generic', 'enable_categorization'):
             return
         # Make sure the category is mappable to a taxonomy.
@ -303,7 +304,7 @@ class Lookyloo():
with (capture_dir / 'categories').open('w') as f: with (capture_dir / 'categories').open('w') as f:
f.writelines(f'{t}\n' for t in current_categories) f.writelines(f'{t}\n' for t in current_categories)
def uncategorize_capture(self, capture_uuid: str, category: str): def uncategorize_capture(self, capture_uuid: str, category: str) -> None:
if not get_config('generic', 'enable_categorization'): if not get_config('generic', 'enable_categorization'):
return return
capture_dir = self.lookup_capture_dir(capture_uuid) capture_dir = self.lookup_capture_dir(capture_uuid)
@@ -382,7 +383,7 @@ class Lookyloo():
         error_cache: Dict[str, str] = {}
         if (capture_dir / 'error.txt').exists():
             # Something went wrong
-            with (Path(capture_dir) / 'error.txt').open() as _error:
+            with (capture_dir / 'error.txt').open() as _error:
                 content = _error.read()
             try:
                 error_to_cache = json.loads(content)
@@ -404,6 +405,12 @@ class Lookyloo():
             error_cache['error'] = f'No har files in {capture_dir.name}'
             fatal_error = True

+        if (capture_dir / 'categories').exists():
+            with (capture_dir / 'categories').open() as _categories:
+                categories = [c.strip() for c in _categories.readlines()]
+        else:
+            categories = []
+
         if not redis_pipeline:
             p = self.redis.pipeline()
         else:
@@ -430,6 +437,7 @@ class Lookyloo():
                   'timestamp': har.initial_start_time,
                   'url': har.root_url,
                   'redirects': json.dumps(redirects),
+                  'categories': json.dumps(categories),
                   'capture_dir': str(capture_dir),
                   'incomplete_redirects': 1 if incomplete_redirects else 0}
         if (capture_dir / 'no_index').exists():  # If the folders claims anonymity
@@ -473,6 +481,8 @@ class Lookyloo():
                 continue
             if 'timestamp' not in c:
                 continue
+            if 'categories' in c:
+                c['categories'] = json.loads(c['categories'])
             all_cache.append(c)
         return sorted(all_cache, key=operator.itemgetter('timestamp'), reverse=True)
@@ -487,6 +497,8 @@ class Lookyloo():
         if all(key in cached.keys() for key in ['uuid', 'title', 'timestamp', 'url', 'redirects', 'capture_dir']):
             cached['redirects'] = json.loads(cached['redirects'])  # type: ignore
             cached['capture_dir'] = Path(cached['capture_dir'])
+            if 'categories' in cached:
+                cached['categories'] = json.loads(cached['categories'])  # type: ignore
             return cached
         elif 'error' in cached:
             return cached
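
The cache handling above mirrors what is already done for redirects: the list read from the on-disk `categories` file is JSON-encoded into the flat per-capture hash, and decoded back into a Python list when a cache entry is loaded. A small standalone illustration of that round trip (the dictionary and the tag names are made up for the example):

import json

# Writing side (sketch): the categories list is serialised so it can live in a
# flat Redis hash next to 'redirects', 'title', 'url', ...
cache = {'title': 'Example capture',                           # illustrative value
         'redirects': json.dumps([]),
         'categories': json.dumps(['malicious', 'phishing'])}  # hypothetical tags

# Reading side (sketch): decode back to a list, exactly like the added
# json.loads(cached['categories']) lines above.
if 'categories' in cache:
    cache['categories'] = json.loads(cache['categories'])

assert cache['categories'] == ['malicious', 'phishing']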


@@ -368,16 +368,20 @@ def mark_as_legitimate(tree_uuid: str):

 # ##### helpers #####

-def index_generic(show_hidden: bool=False):
+def index_generic(show_hidden: bool=False, category: Optional[str]=None):
     titles = []
     if time_delta_on_index:
         # We want to filter the captures on the index
         cut_time = datetime.now() - timedelta(**time_delta_on_index)
     else:
         cut_time = None  # type: ignore
     for cached in lookyloo.sorted_cache:
         if not cached:
             continue
+        if category:
+            if 'categories' not in cached or category not in cached['categories']:
+                continue
         if show_hidden:
             if 'no_index' not in cached:
                 # Only display the hidden ones
@@ -386,6 +390,7 @@ def index_generic(show_hidden: bool=False):
             continue
         if cut_time and datetime.fromisoformat(cached['timestamp'][:-1]) < cut_time:
             continue
         titles.append((cached['uuid'], cached['title'], cached['timestamp'], cached['url'],
                        cached['redirects'], True if cached['incomplete_redirects'] == '1' else False))
     titles = sorted(titles, key=lambda x: (x[2], x[3]), reverse=True)
@@ -409,6 +414,11 @@ def index_hidden():
     return index_generic(show_hidden=True)


+@app.route('/category/<string:category>', methods=['GET'])
+def index_category(category: str):
+    return index_generic(category=category)
+
+
 @app.route('/cookies', methods=['GET'])
 def cookies_lookup():
     i = Indexing()
@@ -428,6 +438,13 @@ def ressources():
     return render_template('ressources.html', ressources=ressources)


+@app.route('/categories', methods=['GET'])
+def categories():
+    i = Indexing()
+    print(i.categories)
+    return render_template('categories.html', categories=i.categories)
+
+
 @app.route('/rebuild_all')
 @auth.login_required
 def rebuild_all():
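
Both new routes go through the same code path: `/categories` renders the frequency table built from `Indexing.categories`, while `/category/<category>` reuses `index_generic` with the new `category` filter, keeping only cached captures whose `categories` list contains the requested value. A rough smoke test with Flask's test client, assuming the application object is importable from the web package (the import path and the category name are assumptions):

# Sketch: exercise the new endpoints with Flask's built-in test client.
from website.web import app  # import path is an assumption

with app.test_client() as client:
    # Frequency table of all categories (renders categories.html).
    print(client.get('/categories').status_code)

    # Index filtered to captures tagged with a (hypothetical) category.
    print(client.get('/category/phishing').status_code)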


@@ -0,0 +1,46 @@
+{% extends "main.html" %}
+
+{% from 'bootstrap/utils.html' import render_messages %}
+
+{% block title %}Categories{% endblock %}
+
+{% block scripts %}
+  {{ super() }}
+  <script src='{{ url_for('static', filename='datatables.min.js') }}'></script>
+  <script type="text/javascript">
+    $('#table').DataTable( {
+      "order": [[ 1, "desc" ]],
+      "pageLength": 500
+    });
+  </script>
+{% endblock %}
+
+{% block styles %}
+  {{ super() }}
+  <link rel="stylesheet" href="{{ url_for('static', filename='datatables.min.css') }}">
+{% endblock %}
+
+{% block content %}
+  <div class="table-responsive">
+    <table id="table" class="table" style="width:96%">
+      <thead>
+        <tr>
+          <th>Category</th>
+          <th>Frequency</th>
+        </tr>
+      </thead>
+      <tbody>
+        {% for category, freq in categories %}
+        <tr>
+          <td>
+            <a href="{{ url_for('index_category', category=category) }}">{{ category }}</a>
+          </td>
+          <td>{{ freq }}</td>
+        </tr>
+        {% endfor %}
+      </tbody>
+    </table>
+  </div>
+{% endblock %}