mirror of https://github.com/CIRCL/lookyloo
new: Integrate categorization in indexing
parent
9243f84295
commit
354f269218
|
@ -9,7 +9,3 @@ updates:
|
||||||
directory: "/"
|
directory: "/"
|
||||||
schedule:
|
schedule:
|
||||||
interval: "daily"
|
interval: "daily"
|
||||||
- package-ecosystem: "pip"
|
|
||||||
directory: "/client/"
|
|
||||||
schedule:
|
|
||||||
interval: "daily"
|
|
||||||
|
|
|
@ -41,6 +41,8 @@ def main():
|
||||||
indexing.index_cookies_capture(tree)
|
indexing.index_cookies_capture(tree)
|
||||||
indexing.index_body_hashes_capture(tree)
|
indexing.index_body_hashes_capture(tree)
|
||||||
indexing.index_url_capture(tree)
|
indexing.index_url_capture(tree)
|
||||||
|
categories = list(lookyloo.categories_capture(capture_uuid).keys())
|
||||||
|
indexing.index_categories_capture(capture_uuid, categories)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
|
|
||||||
import hashlib
|
import hashlib
|
||||||
from urllib.parse import urlsplit
|
from urllib.parse import urlsplit
|
||||||
from typing import List, Tuple, Set, Dict, Optional
|
from typing import List, Tuple, Set, Dict, Optional, Iterable
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
from redis import Redis
|
from redis import Redis
|
||||||
|
@ -192,3 +192,29 @@ class Indexing():
|
||||||
|
|
||||||
def get_captures_hostname(self, hostname: str) -> Set[str]:
|
def get_captures_hostname(self, hostname: str) -> Set[str]:
|
||||||
return self.redis.smembers(f'hostnames|{hostname}|captures') # type: ignore
|
return self.redis.smembers(f'hostnames|{hostname}|captures') # type: ignore
|
||||||
|
|
||||||
|
# ###### Categories ######
|
||||||
|
|
||||||
|
@property
def categories(self) -> List[Tuple[str, int]]:
    """Return the ranked categories as ``(category, score)`` pairs.

    The pairs are read from the ``categories`` sorted set with
    ``zrevrange`` over ranks 0 through 200, and every score is converted
    to ``int`` before being returned.
    """
    ranked = self.redis.zrevrange('categories', 0, 200, withscores=True)
    pairs = []
    for name, raw_score in ranked:
        pairs.append((name, int(raw_score)))
    return pairs
|
||||||
|
|
||||||
|
def index_categories_capture(self, capture_uuid: str, categories: Iterable[str]):
    """Index the categories attached to a capture, at most once per capture.

    For every distinct category the score of that category in the
    ``categories`` sorted set is incremented by one, and the capture UUID
    is added to the set stored under the category name.

    :param capture_uuid: UUID of the capture being indexed.
    :param categories: Category labels attached to the capture. Any
        iterable is accepted; duplicated labels are counted once.

    Nothing is written when there are no categories, or when the capture
    is already a member of ``indexed_categories``.
    """
    # Materialize once: a generator is always truthy, so the emptiness test
    # below would otherwise let an empty iterable be flagged as indexed.
    # dict.fromkeys() drops duplicates while keeping the original order, so
    # one capture never bumps the same category score twice.
    unique_categories = list(dict.fromkeys(categories))
    if not unique_categories:
        return
    if self.redis.sismember('indexed_categories', capture_uuid):
        # do not reindex
        return
    self.redis.sadd('indexed_categories', capture_uuid)

    # Queue all writes and send them in a single execute() call.
    pipeline = self.redis.pipeline()
    for category in unique_categories:
        pipeline.zincrby('categories', 1, category)
        pipeline.sadd(category, capture_uuid)
    pipeline.execute()
|
||||||
|
|
||||||
|
def get_captures_category(self, category: str) -> Set[str]:
    """Return the members of the Redis set stored under *category*.

    ``index_categories_capture`` adds capture UUIDs to that set, so the
    result is the captures indexed for the category.
    """
    capture_uuids = self.redis.smembers(category)  # type: ignore
    return capture_uuids
|
||||||
|
|
|
@ -133,6 +133,8 @@ class Lookyloo():
|
||||||
self.indexing.index_cookies_capture(ct)
|
self.indexing.index_cookies_capture(ct)
|
||||||
self.indexing.index_body_hashes_capture(ct)
|
self.indexing.index_body_hashes_capture(ct)
|
||||||
self.indexing.index_url_capture(ct)
|
self.indexing.index_url_capture(ct)
|
||||||
|
categories = list(self.categories_capture(capture_uuid).keys())
|
||||||
|
self.indexing.index_categories_capture(capture_uuid, categories)
|
||||||
except Har2TreeError as e:
|
except Har2TreeError as e:
|
||||||
raise NoValidHarFile(e.message)
|
raise NoValidHarFile(e.message)
|
||||||
|
|
||||||
|
@ -272,7 +274,7 @@ class Lookyloo():
|
||||||
return {}
|
return {}
|
||||||
return ct.root_hartree.stats
|
return ct.root_hartree.stats
|
||||||
|
|
||||||
def categories_capture(self, capture_uuid: str):
|
def categories_capture(self, capture_uuid: str) -> Dict[str, Any]:
|
||||||
capture_dir = self.lookup_capture_dir(capture_uuid)
|
capture_dir = self.lookup_capture_dir(capture_uuid)
|
||||||
if not capture_dir:
|
if not capture_dir:
|
||||||
raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
|
raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
|
||||||
|
@ -280,11 +282,10 @@ class Lookyloo():
|
||||||
if (capture_dir / 'categories').exists():
|
if (capture_dir / 'categories').exists():
|
||||||
with (capture_dir / 'categories').open() as f:
|
with (capture_dir / 'categories').open() as f:
|
||||||
current_categories = [line.strip() for line in f.readlines()]
|
current_categories = [line.strip() for line in f.readlines()]
|
||||||
else:
|
|
||||||
current_categories = []
|
|
||||||
return {e: self.taxonomies.revert_machinetag(e) for e in current_categories}
|
return {e: self.taxonomies.revert_machinetag(e) for e in current_categories}
|
||||||
|
return {}
|
||||||
|
|
||||||
def categorize_capture(self, capture_uuid: str, category: str):
|
def categorize_capture(self, capture_uuid: str, category: str) -> None:
|
||||||
if not get_config('generic', 'enable_categorization'):
|
if not get_config('generic', 'enable_categorization'):
|
||||||
return
|
return
|
||||||
# Make sure the category is mappable to a taxonomy.
|
# Make sure the category is mappable to a taxonomy.
|
||||||
|
@ -303,7 +304,7 @@ class Lookyloo():
|
||||||
with (capture_dir / 'categories').open('w') as f:
|
with (capture_dir / 'categories').open('w') as f:
|
||||||
f.writelines(f'{t}\n' for t in current_categories)
|
f.writelines(f'{t}\n' for t in current_categories)
|
||||||
|
|
||||||
def uncategorize_capture(self, capture_uuid: str, category: str):
|
def uncategorize_capture(self, capture_uuid: str, category: str) -> None:
|
||||||
if not get_config('generic', 'enable_categorization'):
|
if not get_config('generic', 'enable_categorization'):
|
||||||
return
|
return
|
||||||
capture_dir = self.lookup_capture_dir(capture_uuid)
|
capture_dir = self.lookup_capture_dir(capture_uuid)
|
||||||
|
@ -382,7 +383,7 @@ class Lookyloo():
|
||||||
error_cache: Dict[str, str] = {}
|
error_cache: Dict[str, str] = {}
|
||||||
if (capture_dir / 'error.txt').exists():
|
if (capture_dir / 'error.txt').exists():
|
||||||
# Something went wrong
|
# Something went wrong
|
||||||
with (Path(capture_dir) / 'error.txt').open() as _error:
|
with (capture_dir / 'error.txt').open() as _error:
|
||||||
content = _error.read()
|
content = _error.read()
|
||||||
try:
|
try:
|
||||||
error_to_cache = json.loads(content)
|
error_to_cache = json.loads(content)
|
||||||
|
@ -404,6 +405,12 @@ class Lookyloo():
|
||||||
error_cache['error'] = f'No har files in {capture_dir.name}'
|
error_cache['error'] = f'No har files in {capture_dir.name}'
|
||||||
fatal_error = True
|
fatal_error = True
|
||||||
|
|
||||||
|
if (capture_dir / 'categories').exists():
|
||||||
|
with (capture_dir / 'categories').open() as _categories:
|
||||||
|
categories = [c.strip() for c in _categories.readlines()]
|
||||||
|
else:
|
||||||
|
categories = []
|
||||||
|
|
||||||
if not redis_pipeline:
|
if not redis_pipeline:
|
||||||
p = self.redis.pipeline()
|
p = self.redis.pipeline()
|
||||||
else:
|
else:
|
||||||
|
@ -430,6 +437,7 @@ class Lookyloo():
|
||||||
'timestamp': har.initial_start_time,
|
'timestamp': har.initial_start_time,
|
||||||
'url': har.root_url,
|
'url': har.root_url,
|
||||||
'redirects': json.dumps(redirects),
|
'redirects': json.dumps(redirects),
|
||||||
|
'categories': json.dumps(categories),
|
||||||
'capture_dir': str(capture_dir),
|
'capture_dir': str(capture_dir),
|
||||||
'incomplete_redirects': 1 if incomplete_redirects else 0}
|
'incomplete_redirects': 1 if incomplete_redirects else 0}
|
||||||
if (capture_dir / 'no_index').exists(): # If the folders claims anonymity
|
if (capture_dir / 'no_index').exists(): # If the folders claims anonymity
|
||||||
|
@ -473,6 +481,8 @@ class Lookyloo():
|
||||||
continue
|
continue
|
||||||
if 'timestamp' not in c:
|
if 'timestamp' not in c:
|
||||||
continue
|
continue
|
||||||
|
if 'categories' in c:
|
||||||
|
c['categories'] = json.loads(c['categories'])
|
||||||
all_cache.append(c)
|
all_cache.append(c)
|
||||||
return sorted(all_cache, key=operator.itemgetter('timestamp'), reverse=True)
|
return sorted(all_cache, key=operator.itemgetter('timestamp'), reverse=True)
|
||||||
|
|
||||||
|
@ -487,6 +497,8 @@ class Lookyloo():
|
||||||
if all(key in cached.keys() for key in ['uuid', 'title', 'timestamp', 'url', 'redirects', 'capture_dir']):
|
if all(key in cached.keys() for key in ['uuid', 'title', 'timestamp', 'url', 'redirects', 'capture_dir']):
|
||||||
cached['redirects'] = json.loads(cached['redirects']) # type: ignore
|
cached['redirects'] = json.loads(cached['redirects']) # type: ignore
|
||||||
cached['capture_dir'] = Path(cached['capture_dir'])
|
cached['capture_dir'] = Path(cached['capture_dir'])
|
||||||
|
if 'categories' in cached:
|
||||||
|
cached['categories'] = json.loads(cached['categories']) # type: ignore
|
||||||
return cached
|
return cached
|
||||||
elif 'error' in cached:
|
elif 'error' in cached:
|
||||||
return cached
|
return cached
|
||||||
|
|
|
@ -368,16 +368,20 @@ def mark_as_legitimate(tree_uuid: str):
|
||||||
|
|
||||||
# ##### helpers #####
|
# ##### helpers #####
|
||||||
|
|
||||||
def index_generic(show_hidden: bool=False):
|
def index_generic(show_hidden: bool=False, category: Optional[str]=None):
|
||||||
titles = []
|
titles = []
|
||||||
if time_delta_on_index:
|
if time_delta_on_index:
|
||||||
# We want to filter the captures on the index
|
# We want to filter the captures on the index
|
||||||
cut_time = datetime.now() - timedelta(**time_delta_on_index)
|
cut_time = datetime.now() - timedelta(**time_delta_on_index)
|
||||||
else:
|
else:
|
||||||
cut_time = None # type: ignore
|
cut_time = None # type: ignore
|
||||||
|
|
||||||
for cached in lookyloo.sorted_cache:
|
for cached in lookyloo.sorted_cache:
|
||||||
if not cached:
|
if not cached:
|
||||||
continue
|
continue
|
||||||
|
if category:
|
||||||
|
if 'categories' not in cached or category not in cached['categories']:
|
||||||
|
continue
|
||||||
if show_hidden:
|
if show_hidden:
|
||||||
if 'no_index' not in cached:
|
if 'no_index' not in cached:
|
||||||
# Only display the hidden ones
|
# Only display the hidden ones
|
||||||
|
@ -386,6 +390,7 @@ def index_generic(show_hidden: bool=False):
|
||||||
continue
|
continue
|
||||||
if cut_time and datetime.fromisoformat(cached['timestamp'][:-1]) < cut_time:
|
if cut_time and datetime.fromisoformat(cached['timestamp'][:-1]) < cut_time:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
titles.append((cached['uuid'], cached['title'], cached['timestamp'], cached['url'],
|
titles.append((cached['uuid'], cached['title'], cached['timestamp'], cached['url'],
|
||||||
cached['redirects'], True if cached['incomplete_redirects'] == '1' else False))
|
cached['redirects'], True if cached['incomplete_redirects'] == '1' else False))
|
||||||
titles = sorted(titles, key=lambda x: (x[2], x[3]), reverse=True)
|
titles = sorted(titles, key=lambda x: (x[2], x[3]), reverse=True)
|
||||||
|
@ -409,6 +414,11 @@ def index_hidden():
|
||||||
return index_generic(show_hidden=True)
|
return index_generic(show_hidden=True)
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/category/<string:category>', methods=['GET'])
def index_category(category: str):
    """Serve the index page restricted to a single category.

    The category taken from the URL is forwarded to ``index_generic``,
    which skips the cached captures that do not carry it.
    """
    response = index_generic(category=category)
    return response
|
||||||
|
|
||||||
|
|
||||||
@app.route('/cookies', methods=['GET'])
|
@app.route('/cookies', methods=['GET'])
|
||||||
def cookies_lookup():
|
def cookies_lookup():
|
||||||
i = Indexing()
|
i = Indexing()
|
||||||
|
@ -428,6 +438,13 @@ def ressources():
|
||||||
return render_template('ressources.html', ressources=ressources)
|
return render_template('ressources.html', ressources=ressources)
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/categories', methods=['GET'])
def categories():
    """Render the overview page listing the indexed categories.

    The ``categories.html`` template receives the ``(category, score)``
    pairs exposed by ``Indexing.categories``.
    """
    # Read the property once: every access issues a new Redis query, and
    # the former debug print() doubled that round trip on each request.
    ranked_categories = Indexing().categories
    return render_template('categories.html', categories=ranked_categories)
|
||||||
|
|
||||||
|
|
||||||
@app.route('/rebuild_all')
|
@app.route('/rebuild_all')
|
||||||
@auth.login_required
|
@auth.login_required
|
||||||
def rebuild_all():
|
def rebuild_all():
|
||||||
|
|
|
@ -0,0 +1,46 @@
|
||||||
|
{% extends "main.html" %}
{# Categories overview: one table row per (category, frequency) pair passed in as `categories`. #}

{% from 'bootstrap/utils.html' import render_messages %}

{% block title %}Categories{% endblock %}

{% block scripts %}
{{ super() }}
<script src="{{ url_for('static', filename='datatables.min.js') }}"></script>
<script type="text/javascript">
  // Sort on the Frequency column (index 1), highest value first.
  $('#table').DataTable( {
    "order": [[ 1, "desc" ]],
    "pageLength": 500
  });
</script>

{% endblock %}

{% block styles %}
{{ super() }}
<link rel="stylesheet" href="{{ url_for('static', filename='datatables.min.css') }}">
{% endblock %}


{% block content %}
<div class="table-responsive">
  <table id="table" class="table" style="width:96%">
    <thead>
      <tr>
        <th>Category</th>
        <th>Frequency</th>
      </tr>
    </thead>
    <tbody>
      {% for category, freq in categories %}
      <tr>
        <td>
          {# Links to the index page filtered on this category. #}
          <a href="{{ url_for('index_category', category=category) }}">{{ category }}</a>
        </td>
        <td>{{ freq }}</td>
      </tr>
      {% endfor %}
    </tbody>
  </table>
</div>
{% endblock %}
|
Loading…
Reference in New Issue