mirror of https://github.com/CIRCL/lookyloo
new: Initial support for body hashes index
parent
e6aded6c12
commit
6bb26c4358
|
@ -38,6 +38,15 @@ class Indexing():
|
||||||
self.lookyloo = Lookyloo()
|
self.lookyloo = Lookyloo()
|
||||||
self.redis: Redis = Redis(unix_socket_path=get_socket_path('indexing'), decode_responses=True)
|
self.redis: Redis = Redis(unix_socket_path=get_socket_path('indexing'), decode_responses=True)
|
||||||
|
|
||||||
|
def clear_indexes(self):
    """Drop every index by flushing the Redis database behind ``self.redis``."""
    backend = self.redis
    backend.flushdb()
|
||||||
|
|
||||||
|
def index_all(self):
    """Run every indexer in turn: cookies first, then body hashes."""
    # Cookies index.
    self.index_cookies()
    # Body hashes index.
    self.index_body_hashes()
|
||||||
|
|
||||||
|
# ###### Cookies ######
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def cookies_names(self) -> List[Tuple[str, float]]:
|
def cookies_names(self) -> List[Tuple[str, float]]:
|
||||||
return self.redis.zrevrange('cookies_names', 0, -1, withscores=True)
|
return self.redis.zrevrange('cookies_names', 0, -1, withscores=True)
|
||||||
|
@ -60,14 +69,8 @@ class Indexing():
|
||||||
def get_cookies_names_captures(self, cookie_name: str) -> List[Tuple[str, str]]:
|
def get_cookies_names_captures(self, cookie_name: str) -> List[Tuple[str, str]]:
|
||||||
return [uuids.split('|')for uuids in self.redis.smembers(f'cn|{cookie_name}|captures')]
|
return [uuids.split('|')for uuids in self.redis.smembers(f'cn|{cookie_name}|captures')]
|
||||||
|
|
||||||
def clear_indexes(self):
|
|
||||||
self.redis.flushdb()
|
|
||||||
|
|
||||||
def index_all(self):
|
|
||||||
self.index_cookies()
|
|
||||||
|
|
||||||
def index_cookies_capture(self, capture_dir: Path) -> None:
|
def index_cookies_capture(self, capture_dir: Path) -> None:
|
||||||
print(f'Processing {capture_dir}')
|
print(f'Index cookies {capture_dir}')
|
||||||
try:
|
try:
|
||||||
crawled_tree = self.lookyloo.get_crawled_tree(capture_dir)
|
crawled_tree = self.lookyloo.get_crawled_tree(capture_dir)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
@ -118,6 +121,49 @@ class Indexing():
|
||||||
self.redis.delete('aggregate_cn_domains')
|
self.redis.delete('aggregate_cn_domains')
|
||||||
return {'domains': aggregate_domains_cn, 'cookies': aggregate_cn_domains}
|
return {'domains': aggregate_domains_cn, 'cookies': aggregate_cn_domains}
|
||||||
|
|
||||||
|
# ###### Body hashes ######
|
||||||
|
|
||||||
|
def body_hash_fequency(self, body_hash: str) -> dict:
    """Report how often a response body hash appears in the index.

    Returns a dict with two keys:
      * ``hash_freq``: score of *body_hash* in the ``body_hashes`` sorted set,
        or 0 when the hash has not been indexed;
      * ``hash_domains_freq``: number of members of the ``bh|<body_hash>``
        sorted set.
    """
    # ZSCORE yields None for an unknown member; normalise it to 0 so callers
    # can compare the frequency numerically without hitting a TypeError.
    seen_count = self.redis.zscore('body_hashes', body_hash)
    return {'hash_freq': seen_count if seen_count is not None else 0,
            'hash_domains_freq': self.redis.zcard(f'bh|{body_hash}')}
|
||||||
|
|
||||||
|
def index_body_hashes_capture(self, capture_dir: Path) -> None:
    """Index the body hash of every non-empty response found in one capture.

    For each distinct body hash in the capture this bumps its score in
    ``body_hashes``, bumps the node hostname's score in ``bh|<hash>`` and
    records ``<capture uuid>|<node uuid>`` in ``bh|<hash>|captures``. A capture
    whose UUID is already in ``indexed_body_hashes`` is left untouched.
    """
    print(f'Index body hashes {capture_dir}')
    try:
        crawled_tree = self.lookyloo.get_crawled_tree(capture_dir)
    except Exception as e:
        # Best effort: a capture whose tree cannot be loaded is reported and skipped.
        print(e)
        return

    if self.redis.sismember('indexed_body_hashes', crawled_tree.uuid):
        # Do not reindex
        return

    pipeline = self.redis.pipeline()
    # Queue the "already indexed" marker together with the index writes, so it
    # is only sent when execute() runs. Flagging the capture up front would
    # leave it permanently skipped if the traversal below raised.
    pipeline.sadd('indexed_body_hashes', crawled_tree.uuid)
    already_loaded: Set[str] = set()
    for urlnode in crawled_tree.root_hartree.url_tree.traverse():
        if urlnode.empty_response:
            continue
        if urlnode.body_hash in already_loaded:
            # Same hash multiple times in the same capture, skip
            continue
        already_loaded.add(urlnode.body_hash)
        pipeline.zincrby('body_hashes', 1, urlnode.body_hash)
        pipeline.zincrby(f'bh|{urlnode.body_hash}', 1, urlnode.hostname)
        pipeline.sadd(f'bh|{urlnode.body_hash}|captures', f'{crawled_tree.uuid}|{urlnode.uuid}')

    pipeline.execute()
|
||||||
|
|
||||||
|
def index_body_hashes(self) -> None:
    """Index body hashes for each capture directory exposed by ``self.lookyloo``."""
    all_dirs = self.lookyloo.capture_dirs
    for one_capture in all_dirs:
        self.index_body_hashes_capture(one_capture)
|
||||||
|
|
||||||
|
def get_body_hash_captures(self, body_hash: str) -> List[Tuple[str, str]]:
    """List the entries of ``bh|<body_hash>|captures``, each split on ``|``."""
    members = self.redis.smembers(f'bh|{body_hash}|captures')
    pairs = []
    for entry in members:
        # Entries are stored as '<capture uuid>|<node uuid>' by the indexer.
        pairs.append(entry.split('|'))
    return pairs
|
||||||
|
|
||||||
|
def get_body_hash_domains(self, body_hash: str) -> List[Tuple[str, float]]:
    """Return the whole ``bh|<body_hash>`` sorted set, in reverse score order, with scores."""
    domains_key = f'bh|{body_hash}'
    return self.redis.zrevrange(domains_key, 0, -1, withscores=True)
|
||||||
|
|
||||||
|
|
||||||
class Lookyloo():
|
class Lookyloo():
|
||||||
|
|
||||||
|
@ -691,6 +737,13 @@ class Lookyloo():
|
||||||
to_append['url_path_short'] = to_append['url_path']
|
to_append['url_path_short'] = to_append['url_path']
|
||||||
|
|
||||||
if not url.empty_response:
|
if not url.empty_response:
|
||||||
|
# Index lookup
|
||||||
|
# NOTE: We probably don't want to leave it there.
|
||||||
|
indexing = Indexing()
|
||||||
|
freq = indexing.body_hash_fequency(url.body_hash)
|
||||||
|
if freq['hash_freq'] > 1:
|
||||||
|
to_append['body_hash_fequency'] = freq
|
||||||
|
|
||||||
# Optional: SaneJS information
|
# Optional: SaneJS information
|
||||||
if url.body_hash in sanejs_lookups:
|
if url.body_hash in sanejs_lookups:
|
||||||
if sanejs_lookups[url.body_hash]:
|
if sanejs_lookups[url.body_hash]:
|
||||||
|
|
|
@ -436,6 +436,14 @@ def cookies_name_detail(cookie_name: str):
|
||||||
for domain, freq in i.get_cookie_domains(cookie_name)]
|
for domain, freq in i.get_cookie_domains(cookie_name)]
|
||||||
return render_template('cookie_name.html', cookie_name=cookie_name, domains=domains, captures=captures)
|
return render_template('cookie_name.html', cookie_name=cookie_name, domains=domains, captures=captures)
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/body_hashes/<string:body_hash>', methods=['GET'])
def body_hash_details(body_hash: str):
    """Render the detail page for one body hash: its domains and the captures containing it."""
    indexing = Indexing()
    captures = []
    # Only the capture UUID of each (capture, node) pair is needed for the cache lookup.
    for capture_uuid, _node_uuid in indexing.get_body_hash_captures(body_hash):
        captures.append(indexing.get_capture_cache(capture_uuid))
    domains = indexing.get_body_hash_domains(body_hash)
    return render_template('body_hash.html', body_hash=body_hash, domains=domains, captures=captures)
|
||||||
|
|
||||||
# Query API
|
# Query API
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,53 @@
|
||||||
|
{% extends "main.html" %}
|
||||||
|
|
||||||
|
{% from 'bootstrap/utils.html' import render_messages %}
|
||||||
|
|
||||||
|
{% block title %}{{ body_hash }}{% endblock %}
|
||||||
|
|
||||||
|
{% block scripts %}
|
||||||
|
{{ super() }}
|
||||||
|
<script src='{{ url_for('static', filename='datatables.min.js') }}'></script>
|
||||||
|
<script type="text/javascript">
|
||||||
|
$('#table').DataTable( {
|
||||||
|
"order": [[ 1, "desc" ]],
|
||||||
|
"pageLength": 500
|
||||||
|
});
|
||||||
|
</script>
|
||||||
|
|
||||||
|
{% endblock %}
|
||||||
|
|
||||||
|
{% block styles %}
|
||||||
|
{{ super() }}
|
||||||
|
<link rel="stylesheet" href="{{ url_for('static', filename='datatables.min.css') }}">
|
||||||
|
{% endblock %}
|
||||||
|
|
||||||
|
|
||||||
|
{% block content %}
|
||||||
|
<center><h2>{{ body_hash }}</h2></center>
|
||||||
|
<div class="table-responsive">
|
||||||
|
<table id="table" class="table" style="width:96%">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>Hostname</th>
|
||||||
|
<th>Frequency</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
{% for domain, freq in domains %}
|
||||||
|
<tr>
|
||||||
|
<td>
|
||||||
|
{{ domain }}
|
||||||
|
</td>
|
||||||
|
<td>{{ freq }}</td>
|
||||||
|
</tr>
|
||||||
|
{% endfor %}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
<p>The same file was seen in these captures:</p>
|
||||||
|
<ul>
|
||||||
|
{% for capture in captures %}
|
||||||
|
<li><a href="{{ url_for('tree', tree_uuid=capture['uuid']) }}">{{ capture['title'] }}</a></li>
|
||||||
|
{% endfor %}
|
||||||
|
</ul>
|
||||||
|
{% endblock %}
|
|
@ -103,6 +103,16 @@
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
{% if url['body_hash_fequency'] %}
|
||||||
|
<div>
|
||||||
|
This file can be found <b>{{ url['body_hash_fequency']['hash_freq'] }}</b> times
|
||||||
|
across all the captures on this lookyloo instance, in <b>{{ url['body_hash_fequency']['hash_domains_freq'] }}</b> unique domains.
|
||||||
|
<br>
|
||||||
|
<a href="{{ url_for('body_hash_details', body_hash=url['url_object'].body_hash) }}">
|
||||||
|
Show details
|
||||||
|
</a>
|
||||||
|
</div>
|
||||||
|
{% endif %}
|
||||||
{% if url['sane_js'] %}
|
{% if url['sane_js'] %}
|
||||||
<div>
|
<div>
|
||||||
{% if url['sane_js'] is string %}
|
{% if url['sane_js'] is string %}
|
||||||
|
|
Loading…
Reference in New Issue