new: Display body hash matches on hostnode page

pull/78/head
Raphaël Vinot 2020-06-19 16:10:35 +02:00
parent 6e13622d88
commit 7772706262
4 changed files with 37 additions and 20 deletions

View File

@ -45,6 +45,12 @@ class Indexing():
self.index_cookies()
self.index_body_hashes()
def get_capture_cache(self, capture_uuid: str) -> Optional[Dict[str, Any]]:
    """Return the cached data of the capture identified by *capture_uuid*.

    The UUID is first resolved to a capture directory through the Lookyloo
    instance. When no directory is found, an empty dict is returned.
    """
    capture_dir = self.lookyloo.lookup_capture_dir(capture_uuid)
    if not capture_dir:
        # Unknown capture: callers get an empty mapping
        return {}
    return self.lookyloo.capture_cache(capture_dir)
# ###### Cookies ######
@property
@ -60,12 +66,6 @@ class Indexing():
def get_cookie_domains(self, cookie_name: str) -> List[Tuple[str, float]]:
    """Return every member of the ``cn|<cookie_name>`` sorted set with its score.

    Members come back highest score first. They are presumably the domains
    that set the cookie - confirm against the cookie indexer.
    """
    key = f'cn|{cookie_name}'
    return self.redis.zrevrange(key, 0, -1, withscores=True)
def get_capture_cache(self, capture_uuid: str) -> Optional[Dict[str, Any]]:
    """Look up the capture directory for *capture_uuid* and return its cache.

    An empty dict is returned when the UUID does not resolve to a directory.
    """
    directory = self.lookyloo.lookup_capture_dir(capture_uuid)
    return self.lookyloo.capture_cache(directory) if directory else {}
def get_cookies_names_captures(self, cookie_name: str) -> List[Tuple[str, str]]:
    """Return the members of ``cn|<cookie_name>|captures``, each split on ``|``.

    Every element is the list produced by ``str.split``. The parts are
    presumably a capture UUID and a URL node UUID - confirm against the
    cookie indexer.
    """
    members = self.redis.smembers(f'cn|{cookie_name}|captures')
    pairs = []
    for member in members:
        pairs.append(member.split('|'))
    return pairs
@ -141,16 +141,14 @@ class Indexing():
self.redis.sadd('indexed_body_hashes', crawled_tree.uuid)
pipeline = self.redis.pipeline()
already_loaded: Set[str] = set()
for urlnode in crawled_tree.root_hartree.url_tree.traverse():
if not urlnode.empty_response:
if urlnode.body_hash in already_loaded:
# Same hash multiple times in the same capture, skip
continue
already_loaded.add(urlnode.body_hash)
pipeline.zincrby('body_hashes', 1, urlnode.body_hash)
pipeline.zincrby(f'bh|{urlnode.body_hash}', 1, urlnode.hostname)
pipeline.sadd(f'bh|{urlnode.body_hash}|captures', f'{crawled_tree.uuid}|{urlnode.uuid}')
# set of all captures with this hash
pipeline.sadd(f'bh|{urlnode.body_hash}|captures', crawled_tree.uuid)
# ZSet of all urlnode_UUIDs|full_url
pipeline.zincrby(f'bh|{urlnode.body_hash}|captures|{crawled_tree.uuid}', 1, f'{urlnode.uuid}|{urlnode.name}')
pipeline.execute()
@ -158,8 +156,17 @@ class Indexing():
for capture_dir in self.lookyloo.capture_dirs:
self.index_body_hashes_capture(capture_dir)
def get_body_hash_captures(self, body_hash: str) -> List[Tuple[str, str]]:
    """Return the members of ``bh|<body_hash>|captures``, each split on ``|``.

    Every element is the list produced by ``str.split``.
    """
    members = self.redis.smembers(f'bh|{body_hash}|captures')
    return [member.split('|') for member in members]
def get_body_hash_captures(self, body_hash: str, filter_url: Optional[str]=None) -> List[str]:
    """Return the UUIDs of the captures in which *body_hash* was seen.

    Without *filter_url*, every capture indexed under the hash is returned.
    With *filter_url*, a capture is kept only if the hash was seen there on
    at least one URL other than *filter_url*. Each such capture is listed
    exactly once, however many of its URLs differ.
    """
    capture_uuids = self.redis.smembers(f'bh|{body_hash}|captures')
    if not filter_url:
        # Materialise the Redis reply so the result matches the annotation
        return list(capture_uuids)
    # We only want the captures if the hash matches on a different URL
    to_return = []
    for capture_uuid in capture_uuids:
        for entry in self.redis.zrevrange(f'bh|{body_hash}|captures|{capture_uuid}', 0, -1):
            # Entries are '<urlnode uuid>|<full url>'; split once, the URL may contain '|'
            _, url = entry.split('|', 1)
            if url != filter_url:
                to_return.append(capture_uuid)
                # One differing URL is enough; stop so the capture is not listed twice
                break
    return to_return
def get_body_hash_domains(self, body_hash: str) -> List[Tuple[str, float]]:
    """Return the hostnames that served *body_hash*, with their scores.

    Reads the whole ``bh|<body_hash>`` sorted set, highest score first. The
    indexer increments a hostname's score by one each time it records the
    hash for that host.
    """
    key = f'bh|{body_hash}'
    return self.redis.zrevrange(key, 0, -1, withscores=True)
@ -742,7 +749,9 @@ class Lookyloo():
indexing = Indexing()
freq = indexing.body_hash_fequency(url.body_hash)
if freq['hash_freq'] > 1:
to_append['body_hash_fequency'] = freq
to_append['body_hash_details'] = freq
to_append['body_hash_details']['other_captures'] = [indexing.get_capture_cache(capture)
for capture in indexing.get_body_hash_captures(url.body_hash, url.name)]
# Optional: SaneJS information
if url.body_hash in sanejs_lookups:

View File

@ -440,7 +440,7 @@ def cookies_name_detail(cookie_name: str):
@app.route('/body_hashes/<string:body_hash>', methods=['GET'])
def body_hash_details(body_hash: str):
    """Render the details page of a body hash.

    The page receives the cache of every capture containing the hash and
    the domains the hash was served from.
    """
    i = Indexing()
    # get_body_hash_captures yields plain capture UUIDs, so there is nothing to unpack
    captures = [i.get_capture_cache(capture) for capture in i.get_body_hash_captures(body_hash)]
    domains = i.get_body_hash_domains(body_hash)
    return render_template('body_hash.html', body_hash=body_hash, domains=domains, captures=captures)

View File

@ -23,7 +23,7 @@
{% block content %}
<center><h2>{{ cookie_name }}</h4></center>
<center><h4>{{ body_hash }}</h4></center>
<div class="table-responsive">
<table id="table" class="table" style="width:96%">
<thead>

View File

@ -103,11 +103,19 @@
</div>
{% if url['body_hash_fequency'] %}
{% if url['body_hash_details'] %}
<div>
This file can be found <b>{{ url['body_hash_fequency']['hash_freq'] }}</b> times
across all the captures on this lookyloo instance, in <b>{{ url['body_hash_fequency']['hash_domains_freq'] }}</b> unique domains.
This file can be found <b>{{ url['body_hash_details']['hash_freq'] }}</b> times
across all the captures on this lookyloo instance, in <b>{{ url['body_hash_details']['hash_domains_freq'] }}</b> unique domains.
<br>
{% if url['body_hash_details']['other_captures'] %}
<p>The same file was seen in these captures:</p>
<ul>
{% for capture in url['body_hash_details']['other_captures'] %}
<li><a href="{{ url_for('tree', tree_uuid=capture['uuid']) }}">{{ capture['title'] }}</a></li>
{% endfor %}
</ul>
{% endif %}
<a href="{{ url_for('body_hash_details', body_hash=url['url_object'].body_hash) }}">
Show details
</a>