mirror of https://github.com/CIRCL/lookyloo
parent
bd956abbf9
commit
466a3c5614
|
@ -38,7 +38,7 @@ class BackgroundIndexer(AbstractManager):
|
|||
# Don't need the cache in this class.
|
||||
self.lookyloo.clear_tree_cache()
|
||||
|
||||
def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool, bool], str], None, None]:
|
||||
def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool, bool, bool], str], None, None]:
|
||||
# NOTE: only get the non-archived captures for now.
|
||||
for uuid, directory in self.redis.hscan_iter('lookup_dirs'):
|
||||
if not self.full_indexer:
|
||||
|
@ -88,6 +88,9 @@ class BackgroundIndexer(AbstractManager):
|
|||
if not indexed[5]:
|
||||
self.logger.info(f'Indexing identifiers for {uuid_to_index}')
|
||||
self.indexing.index_identifiers_capture(ct)
|
||||
if not indexed[6]:
|
||||
self.logger.info(f'Indexing hash types for {uuid_to_index}')
|
||||
self.indexing.index_capture_hashes_types(ct)
|
||||
# NOTE: categories aren't taken in account here, should be fixed(?)
|
||||
# see indexing.index_categories_capture(capture_uuid, categories)
|
||||
self.indexing.indexing_done()
|
||||
|
|
|
@ -14,11 +14,14 @@ from zipfile import ZipFile
|
|||
|
||||
import mmh3
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from hashlib import sha256
|
||||
|
||||
from har2tree import CrawledTree
|
||||
from redis import ConnectionPool, Redis
|
||||
from redis.connection import UnixDomainSocketConnection
|
||||
|
||||
from .default import get_socket_path, get_config
|
||||
from .default import get_socket_path
|
||||
# from .helpers import get_public_suffix_list
|
||||
|
||||
|
||||
|
@ -66,9 +69,13 @@ class Indexing():
|
|||
p.srem('indexed_hhhashes', capture_uuid)
|
||||
p.srem('indexed_favicons', capture_uuid)
|
||||
p.srem('indexed_identifiers', capture_uuid)
|
||||
for identifier_type in self.identifiers_types():
|
||||
p.srem(f'indexed_identifiers|{identifier_type}|captures', capture_uuid)
|
||||
for hash_type in self.captures_hashes_types():
|
||||
p.srem(f'indexed_hash_type|{hash_type}', capture_uuid)
|
||||
p.execute()
|
||||
|
||||
def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool, bool]:
|
||||
def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool, bool, bool]:
|
||||
p = self.redis.pipeline()
|
||||
p.sismember('indexed_urls', capture_uuid)
|
||||
p.sismember('indexed_body_hashes', capture_uuid)
|
||||
|
@ -76,8 +83,12 @@ class Indexing():
|
|||
p.sismember('indexed_hhhashes', capture_uuid)
|
||||
p.sismember('indexed_favicons', capture_uuid)
|
||||
p.sismember('indexed_identifiers', capture_uuid)
|
||||
# This call for sure returns a tuple of 6 booleans
|
||||
return p.execute() # type: ignore[return-value]
|
||||
# We also need to check if the hash_type are all indexed for this capture
|
||||
hash_types_indexed = all(self.redis.sismember(f'indexed_hash_type|{hash_type}', capture_uuid) for hash_type in self.captures_hashes_types())
|
||||
to_return: list[bool] = p.execute()
|
||||
to_return.append(hash_types_indexed)
|
||||
# This call for sure returns a tuple of 7 booleans
|
||||
return tuple(to_return) # type: ignore[return-value]
|
||||
|
||||
# ###### Cookies ######
|
||||
|
||||
|
@ -367,6 +378,65 @@ class Indexing():
|
|||
def get_favicon(self, favicon_sha512: str) -> bytes | None:
|
||||
return self.redis_bytes.get(f'favicons|{favicon_sha512}')
|
||||
|
||||
# ###### Capture hashes ######
|
||||
|
||||
# This is where we define the indexing for the hashes generated for a whole capture (at most one hash per capture)
|
||||
# certpl_html_structure_hash: concatenated list of all the tag names on the page - done on the rendered page
|
||||
|
||||
def _compute_certpl_html_structure_hash(self, html: str) -> str:
    """Compute the certpl HTML structure hash of a rendered page.

    ``html`` is parsed with BeautifulSoup using the lxml parser, the names of
    all the tags found are joined with ``|``, and the first 32 hexadecimal
    characters of the SHA-256 digest of that string are returned.
    """
    soup = BeautifulSoup(html, "lxml")
    # find_all() is the current spelling of the deprecated findAll() alias.
    tag_names = "|".join(tag.name for tag in soup.find_all())
    return sha256(tag_names.encode()).hexdigest()[:32]
|
||||
|
||||
def captures_hashes_types(self) -> set[str]:
    """Return the names of the capture-level hash types to index.

    The set is hard-coded and currently only contains
    ``certpl_html_structure_hash``.
    """
    # A set literal is required: set('certpl_html_structure_hash', ) iterates
    # over the string and yields its individual characters, not the name.
    # NOTE: a Redis-backed variant (members of 'capture_hash_types') was left
    # commented out in the original and is still not enabled.
    return {'certpl_html_structure_hash'}
|
||||
|
||||
def captures_hashes(self, hash_type: str) -> list[tuple[str, float]]:
    """List the hashes recorded for ``hash_type`` together with their scores.

    Queries the sorted set ``capture_hash_types|<hash_type>`` with
    ``zrevrange`` over ranks 0 to 200 and asks for the scores as well.
    """
    sorted_set_key = f'capture_hash_types|{hash_type}'
    return self.redis.zrevrange(sorted_set_key, 0, 200, withscores=True)
|
||||
|
||||
def hash_frequency(self, hash_type: str, h: str) -> float | None:
|
||||
return self.redis.zscore(f'capture_hash_types|{hash_type}', h)
|
||||
|
||||
def hash_number_captures(self, hash_type: str, h: str) -> int:
    """Count the captures recorded for hash ``h`` of type ``hash_type``.

    This is the cardinality (``scard``) of the set
    ``capture_hash_types|<hash_type>|<h>|captures``.
    """
    captures_key = f'capture_hash_types|{hash_type}|{h}|captures'
    return self.redis.scard(captures_key)
|
||||
|
||||
def index_capture_hashes_types(self, crawled_tree: CrawledTree) -> None:
    """Index every capture-level hash type for the given capture.

    For each hash type returned by ``self.captures_hashes_types()``:

    * skip it if the capture is already a member of
      ``indexed_hash_type|<hash_type>``;
    * otherwise mark the capture as indexed for that type, compute the hash
      and record it in ``capture_hash_types|<capture_uuid>`` (hash per type),
      ``capture_hash_types|<hash_type>|<hash>|captures`` (captures per hash)
      and ``capture_hash_types|<hash_type>`` (score incremented by 1).

    :param crawled_tree: the tree of the capture to index; its ``uuid`` and
        ``root_hartree.rendered_node`` are read.
    """
    capture_uuid = crawled_tree.uuid
    # NOTE: We will have multiple hash types for each captures, we want to make sure
    #       to reindex all the captures if there is a new hash type but only index the new
    #       captures on the existing hash types
    for hash_type in self.captures_hashes_types():
        indexed_marker = f'indexed_hash_type|{hash_type}'
        if self.redis.sismember(indexed_marker, capture_uuid):
            # Do not reindex this type, but keep going: the other hash types
            # may still need indexing (a `return` here would skip them all).
            continue

        if hash_type != 'certpl_html_structure_hash':
            # No way to compute this hash type here. Leave the capture
            # unmarked so it gets indexed once the type is supported, instead
            # of reaching the code below with an unbound/stale hash value.
            self.logger.warning(f'Unknown hash type: {hash_type}')
            continue

        self.redis.sadd(indexed_marker, capture_uuid)

        # we must have a rendered HTML for this hash to be relevant.
        rendered_html = getattr(crawled_tree.root_hartree.rendered_node, 'rendered_html', None)
        if not rendered_html:
            continue
        # we have a rendered HTML, compute the hash
        hash_to_index = self._compute_certpl_html_structure_hash(rendered_html)

        captures_key = f'capture_hash_types|{hash_type}|{hash_to_index}|captures'
        if self.redis.sismember(captures_key, capture_uuid):
            # Already counted this specific hash for this capture
            continue
        self.logger.debug(f'Indexing hash {hash_type} for {capture_uuid} ... ')
        pipeline = self.redis.pipeline()
        pipeline.hset(f'capture_hash_types|{capture_uuid}', hash_type, hash_to_index)
        pipeline.sadd(captures_key, capture_uuid)
        pipeline.zincrby(f'capture_hash_types|{hash_type}', 1, hash_to_index)
        pipeline.execute()
|
||||
|
||||
def get_hashes_types_capture(self, capture_uuid: str) -> dict[str, str]:
    """Return the hashes recorded for a capture, keyed by hash type.

    Reads the whole Redis hash stored at ``capture_hash_types|<capture_uuid>``.
    """
    capture_key = f'capture_hash_types|{capture_uuid}'
    return self.redis.hgetall(capture_key)
|
||||
|
||||
def get_captures_hash_type(self, hash_type: str, h: str) -> set[str]:
    """Return the UUIDs of the captures recorded for hash ``h`` of ``hash_type``.

    Reads the members of the set ``capture_hash_types|<hash_type>|<h>|captures``.
    """
    captures_key = f'capture_hash_types|{hash_type}|{h}|captures'
    return self.redis.smembers(captures_key)
|
||||
|
||||
# ###### identifiers ######
|
||||
|
||||
def identifiers_types(self) -> set[str]:
|
||||
|
|
|
@ -421,6 +421,11 @@ def get_identifier_investigator(identifier_type: str, identifier: str) -> list[t
|
|||
return [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp) for cache in cached_captures]
|
||||
|
||||
|
||||
def get_capture_hash_investigator(hash_type: str, h: str) -> list[tuple[str, str, str, datetime]]:
    """List the captures in which hash ``h`` of type ``hash_type`` was found.

    The capture UUIDs come from the indexing of the current user; they are
    passed to ``lookyloo.sorted_capture_cache`` and each resulting cache entry
    is turned into a ``(uuid, title, last redirect, timestamp)`` tuple.
    """
    indexing = get_indexing(flask_login.current_user)
    capture_uuids = list(indexing.get_captures_hash_type(hash_type=hash_type, h=h))
    return [(entry.uuid, entry.title, entry.redirects[-1], entry.timestamp)
            for entry in lookyloo.sorted_capture_cache(capture_uuids)]
|
||||
|
||||
|
||||
def get_favicon_investigator(favicon_sha512: str,
|
||||
/,
|
||||
get_probabilistic: bool=False) -> tuple[list[tuple[str, str, str, datetime]],
|
||||
|
@ -1232,6 +1237,16 @@ def tree_favicons(tree_uuid: str) -> str:
|
|||
return render_template('tree_favicons.html', tree_uuid=tree_uuid, favicons=favicons)
|
||||
|
||||
|
||||
@app.route('/tree/<string:tree_uuid>/hashes_types', methods=['GET'])
def tree_capture_hashes_types(tree_uuid: str) -> str:
    """Render the capture-level hashes of a tree.

    For every ``(hash_type, hash)`` pair recorded for ``tree_uuid``, the
    number of captures sharing that hash is looked up, and the resulting
    ``(number of captures, hash type, hash)`` tuples are passed to the
    ``tree_hashes_types.html`` template as ``hashes``.
    """
    # The indexing handle does not change within the request: resolve it once
    # instead of once per hash.
    indexing = get_indexing(flask_login.current_user)
    to_return: list[tuple[int, str, str]] = [
        (indexing.hash_number_captures(hash_type, h), hash_type, h)
        for hash_type, h in indexing.get_hashes_types_capture(tree_uuid).items()
    ]
    return render_template('tree_hashes_types.html', tree_uuid=tree_uuid, hashes=to_return)
|
||||
|
||||
|
||||
@app.route('/tree/<string:tree_uuid>/body_hashes', methods=['GET'])
|
||||
def tree_body_hashes(tree_uuid: str) -> str:
|
||||
body_hashes = get_all_body_hashes(tree_uuid)
|
||||
|
@ -1638,6 +1653,14 @@ def identifier_details(identifier_type: str, identifier: str) -> str:
|
|||
captures=captures)
|
||||
|
||||
|
||||
@app.route('/capture_hash_details/<string:hash_type>/<string:h>', methods=['GET'])
def capture_hash_details(hash_type: str, h: str) -> str:
    """Render the list of captures sharing hash ``h`` of type ``hash_type``."""
    matching_captures = get_capture_hash_investigator(hash_type, h)
    # NOTE(review): this renders 'identifier_details.html' while passing
    # hash_type/h variables; it looks like it should be the dedicated hash
    # details template instead -- confirm the template name.
    return render_template('identifier_details.html',
                           hash_type=hash_type,
                           h=h,
                           captures=matching_captures)
|
||||
|
||||
|
||||
@app.route('/favicon_details/<string:favicon_sha512>', methods=['GET'])
|
||||
@app.route('/favicon_details/<string:favicon_sha512>/<int:get_probabilistic>', methods=['GET'])
|
||||
def favicon_detail(favicon_sha512: str, get_probabilistic: int=0) -> str:
|
||||
|
|
|
@ -0,0 +1,49 @@
|
|||
{% from "macros.html" import shorten_string %}
|
||||
|
||||
<script type="text/javascript">
|
||||
new DataTable('#hashTypeDetailsTable', {
|
||||
order: [[ 0, "desc" ]],
|
||||
columnDefs: [{ width: '30%',
|
||||
targets: 0,
|
||||
render: (data) => {
|
||||
const date = new Date(data);
|
||||
return date.getFullYear() + '-' + (date.getMonth() + 1).toString().padStart(2, "0") + '-' + date.getDate().toString().padStart(2, "0") + ' ' + date.toTimeString();
|
||||
}
|
||||
},
|
||||
{ width: '30%', targets: 1 },
|
||||
{ width: '50%', targets: 2 }],
|
||||
|
||||
});
|
||||
</script>
|
||||
|
||||
<center>
|
||||
<h5>{{hash_type}}: {{h}}</h5>
|
||||
</center>
|
||||
<table id="hashTypeDetailsTable" class="table table-striped" style="width:100%">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Capture Time</th>
|
||||
<th>Capture Title</th>
|
||||
<th>Landing page</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for capture_uuid, title, landing_page, capture_time in captures %}
|
||||
<tr>
|
||||
<td>
|
||||
{{capture_time}}
|
||||
</td>
|
||||
<td>
|
||||
<a href="{{ url_for('tree', tree_uuid=capture_uuid) }}">
|
||||
{{ title }}
|
||||
</a>
|
||||
</td>
|
||||
<td>
|
||||
<span class="d-inline-block text-break" style="max-width: 400px;">
|
||||
{{ landing_page }}
|
||||
</span>
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
|
@ -112,6 +112,20 @@
|
|||
});
|
||||
</script>
|
||||
<script>
|
||||
$('#captureHashesTypesModal').on('show.bs.modal', function(e) {
|
||||
var button = $(e.relatedTarget);
|
||||
var modal = $(this);
|
||||
modal.find('.modal-body').load(button.data("remote"));
|
||||
});
|
||||
</script>
|
||||
<script>
|
||||
$('#captureHashesTypesDetailsModal').on('show.bs.modal', function(e) {
|
||||
var button = $(e.relatedTarget);
|
||||
var modal = $(this);
|
||||
modal.find('.modal-body').load(button.data("remote"));
|
||||
});
|
||||
</script>
|
||||
<script>
|
||||
$('#faviconDetailsProbabilisticHashModal').on('show.bs.modal', function(e) {
|
||||
var button = $(e.relatedTarget);
|
||||
var modal = $(this);
|
||||
|
@ -334,6 +348,10 @@
|
|||
<a href="#faviconsModal" data-remote="{{ url_for('tree_favicons', tree_uuid=tree_uuid) }}"
|
||||
data-bs-toggle="modal" data-bs-target="#faviconsModal" role="button">Favicons Capture</a>
|
||||
</li>
|
||||
<li class="list-group-item">
|
||||
<a href="#captureHashesTypesModal" data-remote="{{ url_for('tree_capture_hashes_types', tree_uuid=tree_uuid) }}"
|
||||
data-bs-toggle="modal" data-bs-target="#captureHashesTypesModal" role="button">Capture hashes types</a>
|
||||
</li>
|
||||
<li class="list-group-item">
|
||||
<a href="#identifiersModal" data-remote="{{ url_for('tree_identifiers', tree_uuid=tree_uuid) }}"
|
||||
data-bs-toggle="modal" data-bs-target="#identifiersModal" role="button">Identifiers Capture</a>
|
||||
|
@ -634,6 +652,43 @@
|
|||
</div>
|
||||
</div>
|
||||
|
||||
<div class="modal fade" id="captureHashesTypesModal" tabindex="-1" role="dialog">
|
||||
<div class="modal-dialog modal-xl" role="document">
|
||||
<div class="modal-content">
|
||||
<div class="modal-header">
|
||||
<h5 class="modal-title" id="captureHashesTypesModalLabel">Hashes of the rendered page</h5>
|
||||
<button type="button" class="btn btn-close" data-bs-dismiss="modal" aria-label="Close"></button>
|
||||
</div>
|
||||
<div class="modal-body">
|
||||
... loading hash types ...
|
||||
</div>
|
||||
<div class="modal-footer">
|
||||
<button type="button" class="btn btn-secondary" data-bs-dismiss="modal">Close</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="modal fade" id="captureHashesTypesDetailsModal" tabindex="-1" role="dialog">
|
||||
<div class="modal-dialog modal-xl" role="document">
|
||||
<div class="modal-content">
|
||||
<div class="modal-header">
|
||||
<h5 class="modal-title" id="captureHashesTypesDetailsModalLabel">Other occurrences of the hash</h5>
|
||||
<button type="button" class="btn btn-close" data-bs-dismiss="modal" aria-label="Close"></button>
|
||||
</div>
|
||||
<div class="modal-body">
|
||||
... loading hash details ...
|
||||
</div>
|
||||
<div class="modal-footer">
|
||||
<a class="btn btn-primary" href="#captureHashesTypesModal"
|
||||
data-remote="{{ url_for('tree_capture_hashes_types', tree_uuid=tree_uuid) }}"
|
||||
data-bs-toggle="modal" data-bs-target="#captureHashesTypesModal" role="button">Back to capture's hashes</a>
|
||||
<button type="button" class="btn btn-secondary" data-bs-dismiss="modal">Close</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="modal fade" id="faviconDetailsProbabilisticHashModal" tabindex="-1" role="dialog">
|
||||
<div class="modal-dialog modal-xl" role="document">
|
||||
<div class="modal-content">
|
||||
|
|
|
@ -0,0 +1,33 @@
|
|||
<script type="text/javascript">
|
||||
new DataTable('#identifiersTable', {
|
||||
columnDefs: [{ width: '20%', targets: 0 },
|
||||
{ width: '40%', targets: 1 },
|
||||
{ width: '40%', targets: 2 }],
|
||||
});
|
||||
</script>
|
||||
|
||||
|
||||
<h5 class="text-center">Click on the hash to see the other captures it's been found in</h5>
|
||||
<table id="identifiersTable" class="table table-striped" style="width:100%">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Number of captures</th>
|
||||
<th>Hash</th>
|
||||
<th>Hash type</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for number_captures, hash_type, hash in hashes %}
|
||||
<tr>
|
||||
<td>{{ number_captures }}</td>
|
||||
<td>
|
||||
<a href="#captureHashesTypesDetailsModal" data-remote="{{ url_for('capture_hash_details', hash_type=hash_type, h=hash) }}"
|
||||
data-bs-toggle="modal" data-bs-target="#captureHashesTypesDetailsModal" role="button">
|
||||
{{ hash }}
|
||||
</a>
|
||||
</td>
|
||||
<td>{{hash_type}}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
Loading…
Reference in New Issue