new: Basic support for CERT PL phishing truncated hash HTML structure

Fix #905
pull/909/head
Raphaël Vinot 2024-04-11 17:46:04 +02:00
parent bd956abbf9
commit 466a3c5614
6 changed files with 238 additions and 5 deletions

View File

@ -38,7 +38,7 @@ class BackgroundIndexer(AbstractManager):
# Don't need the cache in this class.
self.lookyloo.clear_tree_cache()
def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool, bool], str], None, None]:
def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool, bool, bool], str], None, None]:
# NOTE: only get the non-archived captures for now.
for uuid, directory in self.redis.hscan_iter('lookup_dirs'):
if not self.full_indexer:
@ -88,6 +88,9 @@ class BackgroundIndexer(AbstractManager):
if not indexed[5]:
self.logger.info(f'Indexing identifiers for {uuid_to_index}')
self.indexing.index_identifiers_capture(ct)
if not indexed[6]:
self.logger.info(f'Indexing hash types for {uuid_to_index}')
self.indexing.index_capture_hashes_types(ct)
# NOTE: categories aren't taken into account here, should be fixed(?)
# see indexing.index_categories_capture(capture_uuid, categories)
self.indexing.indexing_done()

View File

@ -14,11 +14,14 @@ from zipfile import ZipFile
import mmh3
from bs4 import BeautifulSoup
from hashlib import sha256
from har2tree import CrawledTree
from redis import ConnectionPool, Redis
from redis.connection import UnixDomainSocketConnection
from .default import get_socket_path, get_config
from .default import get_socket_path
# from .helpers import get_public_suffix_list
@ -66,9 +69,13 @@ class Indexing():
p.srem('indexed_hhhashes', capture_uuid)
p.srem('indexed_favicons', capture_uuid)
p.srem('indexed_identifiers', capture_uuid)
for identifier_type in self.identifiers_types():
p.srem(f'indexed_identifiers|{identifier_type}|captures', capture_uuid)
for hash_type in self.captures_hashes_types():
p.srem(f'indexed_hash_type|{hash_type}', capture_uuid)
p.execute()
def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool, bool]:
def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool, bool, bool]:
p = self.redis.pipeline()
p.sismember('indexed_urls', capture_uuid)
p.sismember('indexed_body_hashes', capture_uuid)
@ -76,8 +83,12 @@ class Indexing():
p.sismember('indexed_hhhashes', capture_uuid)
p.sismember('indexed_favicons', capture_uuid)
p.sismember('indexed_identifiers', capture_uuid)
# This call for sure returns a tuple of 6 booleans
return p.execute() # type: ignore[return-value]
# We also need to check if the hash_type are all indexed for this capture
hash_types_indexed = all(self.redis.sismember(f'indexed_hash_type|{hash_type}', capture_uuid) for hash_type in self.captures_hashes_types())
to_return: list[bool] = p.execute()
to_return.append(hash_types_indexed)
# This call for sure returns a tuple of 7 booleans
return tuple(to_return) # type: ignore[return-value]
# ###### Cookies ######
@ -367,6 +378,65 @@ class Indexing():
def get_favicon(self, favicon_sha512: str) -> bytes | None:
    """Fetch the favicon stored under its SHA-512 digest.

    Returns whatever ``self.redis_bytes.get`` yields for the
    ``favicons|<favicon_sha512>`` key.
    """
    favicon_key = f'favicons|{favicon_sha512}'
    return self.redis_bytes.get(favicon_key)
# ###### Capture hashes ######
# This is where we define the indexing for the hashes generated for a whole capture (at most one hash per capture)
# certpl_html_structure_hash: concatenated list of all the tag names on the page - done on the rendered page
def _compute_certpl_html_structure_hash(self, html: str) -> str:
    """Compute the truncated HTML structure hash of a page.

    The tag names of every element found after parsing *html* with the
    ``lxml`` parser are joined with ``|`` and hashed with SHA-256.

    :param html: HTML of the page to fingerprint.
    :return: the first 32 hexadecimal characters of the digest.
    """
    soup = BeautifulSoup(html, "lxml")
    # find_all() is the supported spelling, findAll() is a deprecated alias.
    to_hash = "|".join(tag.name for tag in soup.find_all()).encode()
    return sha256(to_hash).hexdigest()[:32]
def captures_hashes_types(self) -> set[str]:
    """Return the names of the capture-level hash types to index."""
    # A set literal is required here: set('certpl_html_structure_hash', )
    # iterates over the string and yields a set of single characters.
    # NOTE: hard-coded for now, the alternative considered was
    #       self.redis.smembers('capture_hash_types')
    return {'certpl_html_structure_hash'}
def captures_hashes(self, hash_type: str) -> list[tuple[str, float]]:
    """List the hashes of *hash_type* together with their scores.

    Reads ranks 0 to 200 of the ``capture_hash_types|<hash_type>`` sorted
    set in reverse order, scores included.
    """
    sorted_set_key = f'capture_hash_types|{hash_type}'
    return self.redis.zrevrange(sorted_set_key, 0, 200, withscores=True)
def hash_frequency(self, hash_type: str, h: str) -> float | None:
    """Return the score of hash *h* in the sorted set of *hash_type*.

    The value comes straight from ``zscore`` on
    ``capture_hash_types|<hash_type>``.
    """
    sorted_set_key = f'capture_hash_types|{hash_type}'
    return self.redis.zscore(sorted_set_key, h)
def hash_number_captures(self, hash_type: str, h: str) -> int:
    """Count the captures recorded for hash *h* of type *hash_type*.

    This is the cardinality of the
    ``capture_hash_types|<hash_type>|<h>|captures`` set.
    """
    captures_key = f'capture_hash_types|{hash_type}|{h}|captures'
    return self.redis.scard(captures_key)
def index_capture_hashes_types(self, crawled_tree: CrawledTree) -> None:
    """Index every capture-level hash type for one capture.

    For each type returned by ``captures_hashes_types()`` the capture is
    first recorded in ``indexed_hash_type|<type>``. When a hash can be
    computed, it is then stored in the per-capture Redis hash, the capture
    is added to the per-hash set, and the hash score is incremented in the
    per-type sorted set.

    :param crawled_tree: tree of the capture to index.
    """
    capture_uuid = crawled_tree.uuid
    # NOTE: We will have multiple hash types for each capture, we want to make sure
    #       to reindex all the captures if there is a new hash type but only index the new
    #       captures on the existing hash types
    for hash_type in self.captures_hashes_types():
        if self.redis.sismember(f'indexed_hash_type|{hash_type}', capture_uuid):
            # Do not reindex this type, but keep going: returning here would
            # prevent any other (newly added) hash type from being indexed.
            continue
        self.redis.sadd(f'indexed_hash_type|{hash_type}', capture_uuid)

        if hash_type == 'certpl_html_structure_hash':
            # we must have a rendered HTML for this hash to be relevant.
            rendered_html = getattr(crawled_tree.root_hartree.rendered_node, 'rendered_html', None)
            if not rendered_html:
                continue
            # we have a rendered HTML, compute the hash
            hash_to_index = self._compute_certpl_html_structure_hash(rendered_html)
        else:
            # Without this branch hash_to_index would be unbound, or left
            # over from the previous iteration.
            self.logger.warning(f'{hash_type} is not a supported hash type')
            continue

        if self.redis.sismember(f'capture_hash_types|{hash_type}|{hash_to_index}|captures', capture_uuid):
            # Already counted this specific hash for this capture
            continue
        self.logger.debug(f'Indexing hash {hash_type} for {capture_uuid} ... ')
        pipeline = self.redis.pipeline()
        pipeline.hset(f'capture_hash_types|{capture_uuid}', hash_type, hash_to_index)
        pipeline.sadd(f'capture_hash_types|{hash_type}|{hash_to_index}|captures', capture_uuid)
        pipeline.zincrby(f'capture_hash_types|{hash_type}', 1, hash_to_index)
        pipeline.execute()
def get_hashes_types_capture(self, capture_uuid: str) -> dict[str, str]:
    """Return the hash stored for each hash type of one capture.

    Reads the whole ``capture_hash_types|<capture_uuid>`` Redis hash.
    """
    per_capture_key = f'capture_hash_types|{capture_uuid}'
    return self.redis.hgetall(per_capture_key)
def get_captures_hash_type(self, hash_type: str, h: str) -> set[str]:
    """Return the UUIDs of the captures recorded for hash *h* of *hash_type*.

    These are the members of the
    ``capture_hash_types|<hash_type>|<h>|captures`` set.
    """
    captures_key = f'capture_hash_types|{hash_type}|{h}|captures'
    return self.redis.smembers(captures_key)
# ###### identifiers ######
def identifiers_types(self) -> set[str]:

View File

@ -421,6 +421,11 @@ def get_identifier_investigator(identifier_type: str, identifier: str) -> list[t
return [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp) for cache in cached_captures]
def get_capture_hash_investigator(hash_type: str, h: str) -> list[tuple[str, str, str, datetime]]:
    """Summarise the cached captures sharing hash *h* of type *hash_type*.

    :return: one ``(uuid, title, last redirect, timestamp)`` tuple per cache
             entry returned by ``lookyloo.sorted_capture_cache``.
    """
    indexing = get_indexing(flask_login.current_user)
    capture_uuids = list(indexing.get_captures_hash_type(hash_type=hash_type, h=h))
    summaries = []
    for cache in lookyloo.sorted_capture_cache(capture_uuids):
        summaries.append((cache.uuid, cache.title, cache.redirects[-1], cache.timestamp))
    return summaries
def get_favicon_investigator(favicon_sha512: str,
/,
get_probabilistic: bool=False) -> tuple[list[tuple[str, str, str, datetime]],
@ -1232,6 +1237,16 @@ def tree_favicons(tree_uuid: str) -> str:
return render_template('tree_favicons.html', tree_uuid=tree_uuid, favicons=favicons)
@app.route('/tree/<string:tree_uuid>/hashes_types', methods=['GET'])
def tree_capture_hashes_types(tree_uuid: str) -> str:
    """Render the table of capture-level hashes stored for one capture.

    Each row handed to the template is ``(number of captures, hash type, hash)``.
    """
    capture_hashes = get_indexing(flask_login.current_user).get_hashes_types_capture(tree_uuid)
    rows: list[tuple[int, str, str]] = [
        (get_indexing(flask_login.current_user).hash_number_captures(hash_type, h), hash_type, h)
        for hash_type, h in capture_hashes.items()
    ]
    return render_template('tree_hashes_types.html', tree_uuid=tree_uuid, hashes=rows)
@app.route('/tree/<string:tree_uuid>/body_hashes', methods=['GET'])
def tree_body_hashes(tree_uuid: str) -> str:
body_hashes = get_all_body_hashes(tree_uuid)
@ -1638,6 +1653,14 @@ def identifier_details(identifier_type: str, identifier: str) -> str:
captures=captures)
@app.route('/capture_hash_details/<string:hash_type>/<string:h>', methods=['GET'])
def capture_hash_details(hash_type: str, h: str) -> str:
    """Render the list of captures sharing hash *h* of type *hash_type*."""
    captures = get_capture_hash_investigator(hash_type, h)
    # The context variables (hash_type, h, captures) are the ones consumed by
    # the hash details template, not by identifier_details.html.
    return render_template('hash_type_details.html', hash_type=hash_type,
                           h=h,
                           captures=captures)
@app.route('/favicon_details/<string:favicon_sha512>', methods=['GET'])
@app.route('/favicon_details/<string:favicon_sha512>/<int:get_probabilistic>', methods=['GET'])
def favicon_detail(favicon_sha512: str, get_probabilistic: int=0) -> str:

View File

@ -0,0 +1,49 @@
{% from "macros.html" import shorten_string %}
<script type="text/javascript">
  // Table of the captures sharing the hash, sorted descending on the first
  // column (capture time), which is re-rendered as "YYYY-MM-DD <time string>".
  new DataTable('#hashTypeDetailsTable', {
    order: [[ 0, "desc" ]],
    columnDefs: [{ width: '30%',
                   targets: 0,
                   render: (data) => {
                      const date = new Date(data);
                      return date.getFullYear() + '-' + (date.getMonth() + 1).toString().padStart(2, "0") + '-' + date.getDate().toString().padStart(2, "0") + ' ' + date.toTimeString();
                   }
                 },
                 { width: '30%', targets: 1 },
                 { width: '50%', targets: 2 }],
  });
</script>

{# <center> is obsolete in HTML5, use the Bootstrap utility class instead. #}
<h5 class="text-center">{{hash_type}}: {{h}}</h5>

<table id="hashTypeDetailsTable" class="table table-striped" style="width:100%">
  <thead>
    <tr>
      <th>Capture Time</th>
      <th>Capture Title</th>
      <th>Landing page</th>
    </tr>
  </thead>
  <tbody>
    {% for capture_uuid, title, landing_page, capture_time in captures %}
    <tr>
      <td>
        {{capture_time}}
      </td>
      <td>
        <a href="{{ url_for('tree', tree_uuid=capture_uuid) }}">
          {{ title }}
        </a>
      </td>
      <td>
        <span class="d-inline-block text-break" style="max-width: 400px;">
          {{ landing_page }}
        </span>
      </td>
    </tr>
    {% endfor %}
  </tbody>
</table>

View File

@ -112,6 +112,20 @@
});
</script>
<script>
  // When the modal is about to be shown, load the URL carried by the
  // triggering element's data-remote attribute into the modal body.
  $('#captureHashesTypesModal').on('show.bs.modal', function(event) {
    const remoteUrl = $(event.relatedTarget).data("remote");
    $(this).find('.modal-body').load(remoteUrl);
  });
</script>
<script>
  // When the modal is about to be shown, load the URL carried by the
  // triggering element's data-remote attribute into the modal body.
  $('#captureHashesTypesDetailsModal').on('show.bs.modal', function(event) {
    const remoteUrl = $(event.relatedTarget).data("remote");
    $(this).find('.modal-body').load(remoteUrl);
  });
</script>
<script>
$('#faviconDetailsProbabilisticHashModal').on('show.bs.modal', function(e) {
var button = $(e.relatedTarget);
var modal = $(this);
@ -334,6 +348,10 @@
<a href="#faviconsModal" data-remote="{{ url_for('tree_favicons', tree_uuid=tree_uuid) }}"
data-bs-toggle="modal" data-bs-target="#faviconsModal" role="button">Favicons Capture</a>
</li>
{# Menu entry opening the capture hashes types modal; data-remote is the URL of the tree_capture_hashes_types view for this tree. #}
<li class="list-group-item">
<a href="#captureHashesTypesModal" data-remote="{{ url_for('tree_capture_hashes_types', tree_uuid=tree_uuid) }}"
data-bs-toggle="modal" data-bs-target="#captureHashesTypesModal" role="button">Capture hashes types</a>
</li>
<li class="list-group-item">
<a href="#identifiersModal" data-remote="{{ url_for('tree_identifiers', tree_uuid=tree_uuid) }}"
data-bs-toggle="modal" data-bs-target="#identifiersModal" role="button">Identifiers Capture</a>
@ -634,6 +652,43 @@
</div>
</div>
{# Modal listing the hashes of the capture; the placeholder in .modal-body is replaced by the content loaded from the trigger's data-remote URL. #}
<div class="modal fade" id="captureHashesTypesModal" tabindex="-1" role="dialog">
<div class="modal-dialog modal-xl" role="document">
<div class="modal-content">
<div class="modal-header">
<h5 class="modal-title" id="captureHashesTypesModalLabel">Hashes of the rendered page</h5>
<button type="button" class="btn btn-close" data-bs-dismiss="modal" aria-label="Close"></button>
</div>
<div class="modal-body">
... loading hash types ...
</div>
<div class="modal-footer">
<button type="button" class="btn btn-secondary" data-bs-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
{# Modal listing the other captures sharing a hash; the footer link reopens the capture hashes types modal. #}
<div class="modal fade" id="captureHashesTypesDetailsModal" tabindex="-1" role="dialog">
  <div class="modal-dialog modal-xl" role="document">
    <div class="modal-content">
      <div class="modal-header">
        <h5 class="modal-title" id="captureHashesTypesDetailsModalLabel">Other occurrences of the hash</h5>
        <button type="button" class="btn btn-close" data-bs-dismiss="modal" aria-label="Close"></button>
      </div>
      <div class="modal-body">
        ... loading hash details ...
      </div>
      <div class="modal-footer">
        <a class="btn btn-primary" href="#captureHashesTypesModal"
           data-remote="{{ url_for('tree_capture_hashes_types', tree_uuid=tree_uuid) }}"
           data-bs-toggle="modal" data-bs-target="#captureHashesTypesModal" role="button">Back to capture's hashes</a>
        <button type="button" class="btn btn-secondary" data-bs-dismiss="modal">Close</button>
      </div>
    </div>
  </div>
</div>
<div class="modal fade" id="faviconDetailsProbabilisticHashModal" tabindex="-1" role="dialog">
<div class="modal-dialog modal-xl" role="document">
<div class="modal-content">

View File

@ -0,0 +1,33 @@
<script type="text/javascript">
  // Use an id specific to this table: this fragment is loaded into a modal of
  // the tree page, so the id must be unique in that document and should name
  // what the table holds (capture hashes, not identifiers).
  new DataTable('#captureHashesTypesTable', {
    columnDefs: [{ width: '20%', targets: 0 },
                 { width: '40%', targets: 1 },
                 { width: '40%', targets: 2 }],
  });
</script>

<h5 class="text-center">Click on the hash to see the other captures it's been found in</h5>

<table id="captureHashesTypesTable" class="table table-striped" style="width:100%">
  <thead>
    <tr>
      <th>Number of captures</th>
      <th>Hash</th>
      <th>Hash type</th>
    </tr>
  </thead>
  <tbody>
    {% for number_captures, hash_type, hash in hashes %}
    <tr>
      <td>{{ number_captures }}</td>
      <td>
        <a href="#captureHashesTypesDetailsModal" data-remote="{{ url_for('capture_hash_details', hash_type=hash_type, h=hash) }}"
           data-bs-toggle="modal" data-bs-target="#captureHashesTypesDetailsModal" role="button">
          {{ hash }}
        </a>
      </td>
      <td>{{hash_type}}</td>
    </tr>
    {% endfor %}
  </tbody>
</table>