new: Integrate categorization in indexing

pull/124/head
Raphaël Vinot 2020-11-09 16:02:54 +01:00
parent 9243f84295
commit 354f269218
6 changed files with 115 additions and 16 deletions

View File

@ -9,7 +9,3 @@ updates:
directory: "/"
schedule:
interval: "daily"
- package-ecosystem: "pip"
directory: "/client/"
schedule:
interval: "daily"

View File

@ -41,6 +41,8 @@ def main():
indexing.index_cookies_capture(tree)
indexing.index_body_hashes_capture(tree)
indexing.index_url_capture(tree)
categories = list(lookyloo.categories_capture(capture_uuid).keys())
indexing.index_categories_capture(capture_uuid, categories)
if __name__ == '__main__':

View File

@ -3,7 +3,7 @@
import hashlib
from urllib.parse import urlsplit
from typing import List, Tuple, Set, Dict, Optional
from typing import List, Tuple, Set, Dict, Optional, Iterable
from collections import defaultdict
from redis import Redis
@ -192,3 +192,29 @@ class Indexing():
def get_captures_hostname(self, hostname: str) -> Set[str]:
    """Return the members of the Redis set ``hostnames|<hostname>|captures``.

    :param hostname: Hostname used to build the Redis key.
    :return: Whatever ``smembers`` yields for that key (presumably the
             UUIDs of the captures in which the hostname was seen).
    """
    redis_key = f'hostnames|{hostname}|captures'
    members = self.redis.smembers(redis_key)
    return members  # type: ignore
# ###### Categories ######
@property
def categories(self) -> List[Tuple[str, int]]:
    """Return ``(category, score)`` pairs from the ``categories`` sorted set.

    Ranks 0 through 200 are requested from Redis in reverse (highest score
    first) order, and every score is converted to ``int``.
    """
    ranked = self.redis.zrevrange('categories', 0, 200, withscores=True)
    pairs: List[Tuple[str, int]] = []
    for name, raw_score in ranked:
        pairs.append((name, int(raw_score)))
    return pairs
def index_categories_capture(self, capture_uuid: str, categories: Iterable[str]):
    """Index the categories attached to a capture, at most once per capture.

    For every category, its score in the ``categories`` sorted set is
    incremented by one and ``capture_uuid`` is added to the Redis set whose
    key is the category itself. The capture is recorded in the
    ``indexed_categories`` set so that later calls for the same UUID are
    no-ops.

    :param capture_uuid: UUID of the capture being indexed.
    :param categories: Categories of the capture. Any iterable is accepted,
                       including one-shot iterables such as generators.
    """
    # Materialise once: a generator is always truthy, so the emptiness check
    # below would otherwise be wrong for one-shot iterables.
    categories = list(categories)
    if not categories:
        # Nothing to index. The capture is deliberately NOT marked as
        # indexed, so it is picked up once it has categories.
        return
    if self.redis.sismember('indexed_categories', capture_uuid):
        # do not reindex
        return
    self.redis.sadd('indexed_categories', capture_uuid)
    # Queue all the updates and send them to Redis in one batch.
    pipeline = self.redis.pipeline()
    for category in categories:
        pipeline.zincrby('categories', 1, category)
        pipeline.sadd(category, capture_uuid)
    pipeline.execute()
def get_captures_category(self, category: str) -> Set[str]:
    """Return the members of the Redis set stored under ``category``.

    :param category: Category name; it is used verbatim as the Redis key.
    :return: Whatever ``smembers`` yields for that key.
    """
    # NOTE(review): the key is not namespaced, so a category literally named
    # 'categories' or 'indexed_categories' would address an unrelated key --
    # confirm whether category names can collide with those.
    redis_key = category
    members = self.redis.smembers(redis_key)
    return members  # type: ignore

View File

@ -133,6 +133,8 @@ class Lookyloo():
self.indexing.index_cookies_capture(ct)
self.indexing.index_body_hashes_capture(ct)
self.indexing.index_url_capture(ct)
categories = list(self.categories_capture(capture_uuid).keys())
self.indexing.index_categories_capture(capture_uuid, categories)
except Har2TreeError as e:
raise NoValidHarFile(e.message)
@ -272,7 +274,7 @@ class Lookyloo():
return {}
return ct.root_hartree.stats
def categories_capture(self, capture_uuid: str):
def categories_capture(self, capture_uuid: str) -> Dict[str, Any]:
capture_dir = self.lookup_capture_dir(capture_uuid)
if not capture_dir:
raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
@ -280,11 +282,10 @@ class Lookyloo():
if (capture_dir / 'categories').exists():
with (capture_dir / 'categories').open() as f:
current_categories = [line.strip() for line in f.readlines()]
else:
current_categories = []
return {e: self.taxonomies.revert_machinetag(e) for e in current_categories}
return {e: self.taxonomies.revert_machinetag(e) for e in current_categories}
return {}
def categorize_capture(self, capture_uuid: str, category: str):
def categorize_capture(self, capture_uuid: str, category: str) -> None:
if not get_config('generic', 'enable_categorization'):
return
# Make sure the category is mappable to a taxonomy.
@ -303,7 +304,7 @@ class Lookyloo():
with (capture_dir / 'categories').open('w') as f:
f.writelines(f'{t}\n' for t in current_categories)
def uncategorize_capture(self, capture_uuid: str, category: str):
def uncategorize_capture(self, capture_uuid: str, category: str) -> None:
if not get_config('generic', 'enable_categorization'):
return
capture_dir = self.lookup_capture_dir(capture_uuid)
@ -382,7 +383,7 @@ class Lookyloo():
error_cache: Dict[str, str] = {}
if (capture_dir / 'error.txt').exists():
# Something went wrong
with (Path(capture_dir) / 'error.txt').open() as _error:
with (capture_dir / 'error.txt').open() as _error:
content = _error.read()
try:
error_to_cache = json.loads(content)
@ -404,6 +405,12 @@ class Lookyloo():
error_cache['error'] = f'No har files in {capture_dir.name}'
fatal_error = True
if (capture_dir / 'categories').exists():
with (capture_dir / 'categories').open() as _categories:
categories = [c.strip() for c in _categories.readlines()]
else:
categories = []
if not redis_pipeline:
p = self.redis.pipeline()
else:
@ -430,6 +437,7 @@ class Lookyloo():
'timestamp': har.initial_start_time,
'url': har.root_url,
'redirects': json.dumps(redirects),
'categories': json.dumps(categories),
'capture_dir': str(capture_dir),
'incomplete_redirects': 1 if incomplete_redirects else 0}
if (capture_dir / 'no_index').exists(): # If the folders claims anonymity
@ -473,6 +481,8 @@ class Lookyloo():
continue
if 'timestamp' not in c:
continue
if 'categories' in c:
c['categories'] = json.loads(c['categories'])
all_cache.append(c)
return sorted(all_cache, key=operator.itemgetter('timestamp'), reverse=True)
@ -487,6 +497,8 @@ class Lookyloo():
if all(key in cached.keys() for key in ['uuid', 'title', 'timestamp', 'url', 'redirects', 'capture_dir']):
cached['redirects'] = json.loads(cached['redirects']) # type: ignore
cached['capture_dir'] = Path(cached['capture_dir'])
if 'categories' in cached:
cached['categories'] = json.loads(cached['categories']) # type: ignore
return cached
elif 'error' in cached:
return cached
@ -639,9 +651,9 @@ class Lookyloo():
return self._get_raw(capture_uuid)
def capture(self, url: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
depth: int=1, listing: bool=True, user_agent: Optional[str]=None,
referer: str='', perma_uuid: Optional[str]=None, os: Optional[str]=None,
browser: Optional[str]=None) -> Union[bool, str]:
depth: int=1, listing: bool=True, user_agent: Optional[str]=None,
referer: str='', perma_uuid: Optional[str]=None, os: Optional[str]=None,
browser: Optional[str]=None) -> Union[bool, str]:
url = url.strip()
url = refang(url)
if not url.startswith('http'):

View File

@ -368,16 +368,20 @@ def mark_as_legitimate(tree_uuid: str):
# ##### helpers #####
def index_generic(show_hidden: bool=False):
def index_generic(show_hidden: bool=False, category: Optional[str]=None):
titles = []
if time_delta_on_index:
# We want to filter the captures on the index
cut_time = datetime.now() - timedelta(**time_delta_on_index)
else:
cut_time = None # type: ignore
for cached in lookyloo.sorted_cache:
if not cached:
continue
if category:
if 'categories' not in cached or category not in cached['categories']:
continue
if show_hidden:
if 'no_index' not in cached:
# Only display the hidden ones
@ -386,6 +390,7 @@ def index_generic(show_hidden: bool=False):
continue
if cut_time and datetime.fromisoformat(cached['timestamp'][:-1]) < cut_time:
continue
titles.append((cached['uuid'], cached['title'], cached['timestamp'], cached['url'],
cached['redirects'], True if cached['incomplete_redirects'] == '1' else False))
titles = sorted(titles, key=lambda x: (x[2], x[3]), reverse=True)
@ -409,6 +414,11 @@ def index_hidden():
return index_generic(show_hidden=True)
@app.route('/category/<string:category>', methods=['GET'])
def index_category(category: str):
    """Serve ``/category/<category>`` by delegating to ``index_generic``.

    The category taken from the URL path is forwarded as the ``category``
    keyword argument.
    """
    selected_category = category
    return index_generic(category=selected_category)
@app.route('/cookies', methods=['GET'])
def cookies_lookup():
i = Indexing()
@ -428,6 +438,13 @@ def ressources():
return render_template('ressources.html', ressources=ressources)
@app.route('/categories', methods=['GET'])
def categories():
    """Serve ``/categories``: render ``categories.html`` with the indexed categories.

    The template receives ``Indexing().categories`` under the name
    ``categories``.
    """
    i = Indexing()
    # The property is evaluated exactly once; the former debug print() both
    # polluted stdout and evaluated it a second time.
    return render_template('categories.html', categories=i.categories)
@app.route('/rebuild_all')
@auth.login_required
def rebuild_all():

View File

@ -0,0 +1,46 @@
{% extends "main.html" %}
{# Categories page: one table row per category with its frequency; each category name links to the 'index_category' view. #}
{% from 'bootstrap/utils.html' import render_messages %}
{% block title %}Categories{% endblock %}
{% block scripts %}
{{ super() }}
<script src='{{ url_for('static', filename='datatables.min.js') }}'></script>
{# Turn the table into a DataTable: ordered on the second column (Frequency), descending, 500 rows per page. #}
<script type="text/javascript">
$('#table').DataTable( {
"order": [[ 1, "desc" ]],
"pageLength": 500
});
</script>
{% endblock %}
{% block styles %}
{{ super() }}
<link rel="stylesheet" href="{{ url_for('static', filename='datatables.min.css') }}">
{% endblock %}
{% block content %}
<div class="table-responsive">
<table id="table" class="table" style="width:96%">
<thead>
<tr>
<th>Category</th>
<th>Frequency</th>
</tr>
</thead>
<tbody>
{# 'categories' is iterated as (category, freq) pairs. #}
{% for category, freq in categories %}
<tr>
<td>
<a href="{{ url_for('index_category', category=category) }}">{{ category }}</a>
</td>
<td>{{ freq }}</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
{% endblock %}