diff --git a/bin/crawlers/Crawler.py b/bin/crawlers/Crawler.py
index 9d1514e8..7ddcc5e6 100755
--- a/bin/crawlers/Crawler.py
+++ b/bin/crawlers/Crawler.py
@@ -192,6 +192,7 @@ class Crawler(AbstractModule):
# force=force,
# general_timeout_in_sec=120)
+ # with_favicon = True,
capture_uuid = self.lacus.enqueue(url=url,
depth=task.get_depth(),
user_agent=task.get_user_agent(),
@@ -274,8 +275,9 @@ class Crawler(AbstractModule):
for tag in task.get_tags():
self.domain.add_tag(tag)
self.original_domain.add_history(epoch, root_item=self.root_item)
- crawlers.update_last_crawled_domain(self.original_domain.get_domain_type(), self.original_domain.id, epoch)
+ # crawlers.update_last_crawled_domain(self.original_domain.get_domain_type(), self.original_domain.id, epoch)
+ self.domain.update_vanity_cluster()
crawlers.update_last_crawled_domain(self.domain.get_domain_type(), self.domain.id, epoch)
print('capture:', capture.uuid, 'completed')
print('task: ', task.uuid, 'completed')
diff --git a/bin/lib/objects/Domains.py b/bin/lib/objects/Domains.py
index 3895aa54..f4c800e4 100755
--- a/bin/lib/objects/Domains.py
+++ b/bin/lib/objects/Domains.py
@@ -411,6 +411,10 @@ class Domain(AbstractObject):
r_crawler.sadd(f'language:domains:{self.domain_type}:{language}', self.id)
r_crawler.sadd(f'domain:language:{self.id}', language)
+ def update_vanity_cluster(self):
+ if self.get_domain_type() == 'onion':
+ update_vanity_cluster(self.id)
+
############################################################################
############################################################################
@@ -644,10 +648,71 @@ def api_search_domains_by_name(name_to_search, domain_types, meta=False, page=1)
################################################################################
################################################################################
+#### Vanity Explorer ####
+
+# TODO ADD ME IN OBJ CLASS
+def get_domain_vanity(domain, len_vanity=4):
+ return domain[:len_vanity]
+
+def get_vanity_clusters(nb_min=4):
+ return r_crawler.zrange('vanity:onion:4', nb_min, '+inf', byscore=True, withscores=True)
+
+def get_vanity_domains(vanity, len_vanity=4, meta=False):
+ if len_vanity == 4:
+ domains = r_crawler.smembers(f'vanity:{int(len_vanity)}:{vanity}')
+ else:
+ domains = []
+ for domain in r_crawler.smembers(f'vanity:4:{vanity[:4]}'):
+ dom_vanity = get_domain_vanity(domain, len_vanity=len_vanity)
+ if vanity == dom_vanity:
+ domains.append(domain)
+ if meta:
+ metas = []
+ for domain in domains:
+ metas.append(Domain(domain).get_meta(options={'languages', 'screenshot', 'tags_safe'}))
+ return metas
+ else:
+ return domains
+
+def get_vanity_cluster(vanity, len_vanity=4, nb_min=4):
+ if len_vanity == 4:
+ return get_vanity_clusters(nb_min=nb_min)
+ else:
+ clusters = {}
+ for domain in get_vanity_domains(vanity[:4], len_vanity=4):
+ new_vanity = get_domain_vanity(domain, len_vanity=len_vanity)
+            if new_vanity not in clusters:
+ clusters[new_vanity] = 0
+ clusters[new_vanity] += 1
+ to_remove = []
+ for new_vanity in clusters:
+ if clusters[new_vanity] < nb_min:
+ to_remove.append(new_vanity)
+ for new_vanity in to_remove:
+ del clusters[new_vanity]
+ return clusters
+
+def get_vanity_nb_domains(vanity, len_vanity=4):
+ return r_crawler.scard(f'vanity:{int(len_vanity)}:{vanity}')
+
+# TODO BUILD DICTIONARY
+def update_vanity_cluster(domain):
+ vanity = get_domain_vanity(domain, len_vanity=4)
+ add = r_crawler.sadd(f'vanity:4:{vanity}', domain)
+ if add == 1:
+        r_crawler.zincrby('vanity:onion:4', 1, vanity)
+
+def _rebuild_vanity_clusters():
+ for vanity in r_crawler.zrange('vanity:onion:4', 0, -1):
+ r_crawler.delete(f'vanity:4:{vanity}')
+ r_crawler.delete('vanity:onion:4')
+ for domain in get_domains_up_by_type('onion'):
+ update_vanity_cluster(domain)
+
def cluster_onion_domain_vanity(len_vanity=4):
domains = {}
occurrences = {}
- for domain in get_domains_up_by_type('web'):
+ for domain in get_domains_up_by_type('onion'):
start = domain[:len_vanity]
if start not in domains:
domains[start] = []
@@ -659,8 +724,6 @@ def cluster_onion_domain_vanity(len_vanity=4):
res = dict(sorted(occurrences.items(), key=lambda item: item[1], reverse=True))
print(json.dumps(res))
-################################################################################
-################################################################################
if __name__ == '__main__':
- cluster_onion_domain_vanity(len_vanity=4)
+ _rebuild_vanity_clusters()
diff --git a/var/www/blueprints/crawler_splash.py b/var/www/blueprints/crawler_splash.py
index c8983ace..31a1e647 100644
--- a/var/www/blueprints/crawler_splash.py
+++ b/var/www/blueprints/crawler_splash.py
@@ -576,6 +576,37 @@ def domains_search_date_post():
type=domain_type, down=down, up=up))
+@crawler_splash.route('/domains/explorer/vanity', methods=['GET'])
+@login_required
+@login_analyst
+def domains_explorer_vanity_clusters():
+    nb_min = request.args.get('min', 0)
+    if not str(nb_min).isdigit():  # non-numeric or negative -> default
+        nb_min = 4
+ vanity_clusters = Domains.get_vanity_clusters(nb_min=nb_min)
+ return render_template("explorer_vanity_clusters.html", vanity_clusters=vanity_clusters,
+ length=4)
+
+@crawler_splash.route('/domains/explorer/vanity/explore', methods=['GET'])
+@login_required
+@login_analyst
+def domains_explorer_vanity_explore():
+    vanity = request.args.get('vanity', '')
+    nb_min = request.args.get('min', 0)  # TODO SHOW DOMAINS OPTIONS + HARD CODED DOMAINS LIMIT FOR RENDER
+    length = len(vanity)
+    if not str(nb_min).isdigit():  # non-numeric or negative -> default
+        nb_min = 4
+ vanity_clusters = Domains.get_vanity_cluster(vanity, len_vanity=length+1, nb_min=nb_min)
+ vanity_domains = Domains.get_vanity_domains(vanity, len_vanity=length, meta=True)
+ vanities_tree = []
+ for i in range(4, length):
+ vanities_tree.append(vanity[:i])
+ if length == len(vanity):
+ vanities_tree.append(vanity)
+ return render_template("explorer_vanity_domains.html", vanity_clusters=vanity_clusters,
+ bootstrap_label=bootstrap_label, vanity=vanity, vanities_tree=vanities_tree,
+ vanity_domains=vanity_domains, length=length)
+
##-- --##
diff --git a/var/www/blueprints/objects_item.py b/var/www/blueprints/objects_item.py
index e3ae8d18..29e6a2de 100644
--- a/var/www/blueprints/objects_item.py
+++ b/var/www/blueprints/objects_item.py
@@ -56,7 +56,7 @@ def screenshot(filename):
abort(404)
filename = filename.replace('/', '')
s = Screenshot(filename)
- return send_from_directory(SCREENSHOT_FOLDER, s.get_rel_path(add_extension=True), as_attachment=True)
+    return send_from_directory(SCREENSHOT_FOLDER, s.get_rel_path(add_extension=True), as_attachment=False, mimetype='image/png')
@objects_item.route("/object/item")
@login_required
diff --git a/var/www/templates/chats_explorer/block_message.html b/var/www/templates/chats_explorer/block_message.html
index 3343210a..811a642e 100644
--- a/var/www/templates/chats_explorer/block_message.html
+++ b/var/www/templates/chats_explorer/block_message.html
@@ -16,7 +16,7 @@
height: 2px;
background: #eee;
}
- .message_image {
+ .object_image {
max-width: 50%;
filter: blur(5px);
}
@@ -66,7 +66,7 @@
{% endif %}
{% if message['images'] %}
{% for message_image in message['images'] %}
-
{% endfor %}
{% endif %}
{% if message['files-names'] %}
diff --git a/var/www/templates/crawler/crawler_splash/domain_explorer.html b/var/www/templates/crawler/crawler_splash/domain_explorer.html
index b3ed158f..e0e59275 100644
--- a/var/www/templates/crawler/crawler_splash/domain_explorer.html
+++ b/var/www/templates/crawler/crawler_splash/domain_explorer.html
@@ -41,26 +41,8 @@