mirror of https://github.com/CIRCL/AIL-framework
chg: [crawler] tag domain by vanity
parent
72f4733242
commit
2ead8c21aa
|
@ -17,6 +17,7 @@ from modules.abstract_module import AbstractModule
|
|||
from lib import ail_logger
|
||||
from lib import crawlers
|
||||
from lib.ConfigLoader import ConfigLoader
|
||||
from lib.Tag import get_domain_vanity_tags
|
||||
from lib.objects import CookiesNames
|
||||
from lib.objects import Etags
|
||||
from lib.objects.Domains import Domain
|
||||
|
@ -40,6 +41,9 @@ class Crawler(AbstractModule):
|
|||
|
||||
self.tracker_yara = Tracker_Yara(queue=False)
|
||||
|
||||
self.vanity_tags = get_domain_vanity_tags()
|
||||
print('vanity tags:', self.vanity_tags)
|
||||
|
||||
config_loader = ConfigLoader()
|
||||
|
||||
self.default_har = config_loader.get_config_boolean('Crawler', 'default_har')
|
||||
|
@ -271,7 +275,12 @@ class Crawler(AbstractModule):
|
|||
# Origin + History + tags
|
||||
if self.root_item:
|
||||
self.domain.set_last_origin(parent_id)
|
||||
# Vanity
|
||||
self.domain.update_vanity_cluster()
|
||||
domain_vanity = self.domain.get_vanity()
|
||||
if domain_vanity in self.vanity_tags:
|
||||
for tag in self.vanity_tags[domain_vanity]:
|
||||
self.domain.add_tag(tag)
|
||||
# Tags
|
||||
for tag in task.get_tags():
|
||||
self.domain.add_tag(tag)
|
||||
|
|
|
@ -1521,6 +1521,24 @@ def refresh_auto_push():
|
|||
|
||||
# --- TAG AUTO PUSH --- #
|
||||
|
||||
def get_domain_vanity_tags():
    """Load the vanity -> tags mapping from ``$AIL_HOME/files/vanity_tags``.

    The file is expected to contain a JSON object mapping each tag to a list
    of vanity strings. Only tags accepted by ``is_taxonomie_tag`` or
    ``is_galaxy_tag`` are kept. A bare string value is treated as a single
    vanity rather than being iterated character by character.

    A missing file, invalid JSON, or JSON of an unexpected shape never raises:
    the offending file or entry is ignored, so a broken configuration cannot
    crash the caller at start-up.

    :return: dict mapping a vanity string to the list of tags to apply
    :raises KeyError: if the ``AIL_HOME`` environment variable is not set
    """
    vanity = {}
    # Keep the try body limited to reading and parsing the file.
    try:
        with open(os.path.join(os.environ['AIL_HOME'], 'files/vanity_tags')) as f:
            ltags = json.load(f)
    except FileNotFoundError:
        # The vanity tags file is optional.
        return vanity
    except json.decoder.JSONDecodeError:
        print('Error files/vanity_tags, Invalid JSON')
        return vanity

    # Valid JSON is not necessarily an object (could be a list, string, number...).
    if not isinstance(ltags, dict):
        if ltags:
            print('Error files/vanity_tags, Invalid format')
        return vanity

    for tag, tag_vanities in ltags.items():
        if isinstance(tag_vanities, str):
            tag_vanities = [tag_vanities]
        elif not isinstance(tag_vanities, list):
            # Not a list of vanities: nothing usable for this tag.
            continue
        if is_taxonomie_tag(tag) or is_galaxy_tag(tag):
            for s_vanity in tag_vanities:
                # Non-string entries may be unhashable, so skip them.
                if isinstance(s_vanity, str):
                    vanity.setdefault(s_vanity, []).append(tag)
    return vanity
|
||||
|
||||
###################################################################################
|
||||
###################################################################################
|
||||
###################################################################################
|
||||
|
|
|
@ -411,6 +411,9 @@ class Domain(AbstractObject):
|
|||
r_crawler.sadd(f'language:domains:{self.domain_type}:{language}', self.id)
|
||||
r_crawler.sadd(f'domain:language:{self.id}', language)
|
||||
|
||||
def get_vanity(self, len_vanity=4):
    """Return the vanity of this domain.

    Delegates to ``get_domain_vanity`` with this domain's id.

    :param len_vanity: forwarded unchanged as ``len_vanity`` (default 4);
                       presumably the vanity length -- confirm against
                       ``get_domain_vanity``
    :return: whatever ``get_domain_vanity`` returns for this domain id
    """
    vanity = get_domain_vanity(self.id, len_vanity=len_vanity)
    return vanity
|
||||
|
||||
def update_vanity_cluster(self):
    """Update the vanity cluster for this domain, onion domains only.

    For any other domain type this is a no-op. Always returns ``None``.
    """
    is_onion = self.get_domain_type() == 'onion'
    if not is_onion:
        return
    # Inside the method body this name resolves to the module-level function.
    update_vanity_cluster(self.id)
|
||||
|
|
Loading…
Reference in New Issue