chg: [crawler] add unsafe tag if domain contains an unsafe screenshot

pull/594/head
Terrtia 2023-05-10 16:28:19 +02:00
parent 37c71b8438
commit 6b60041db2
No known key found for this signature in database
GPG Key ID: 1E1B1F50D84613D0
1 changed file with 11 additions and 2 deletions

View File

@@ -15,6 +15,7 @@ from modules.abstract_module import AbstractModule
from lib import crawlers
from lib.ConfigLoader import ConfigLoader
from lib.objects.Domains import Domain
from lib.objects.Items import Item
from lib.objects import Screenshots
@@ -53,6 +54,9 @@ class Crawler(AbstractModule):
self.items_dir = None
self.domain = None
# TODO Replace with warning list ???
self.placeholder_screenshots = {'27e14ace10b0f96acd2bd919aaa98a964597532c35b6409dff6cc8eec8214748'}
# Send module state to logs
self.redis_logger.info('Crawler initialized')
@@ -248,8 +252,13 @@ class Crawler(AbstractModule):
if 'png' in entries and entries['png']:
screenshot = Screenshots.create_screenshot(entries['png'], b64=False)
if screenshot:
# Remove Errors pages # TODO Replace with warning list ???
if screenshot.id not in ['27e14ace10b0f96acd2bd919aaa98a964597532c35b6409dff6cc8eec8214748']:
if not screenshot.is_tags_safe():
unsafe_tag = 'dark-web:topic="pornography-child-exploitation"'
self.domain.add_tag(unsafe_tag)
item = Item(item_id)
item.add_tag(unsafe_tag)
# Remove Placeholder pages # TODO Replace with warning list ???
if screenshot.id not in self.placeholder_screenshots:
# Create Correlations
screenshot.add_correlation('item', '', item_id)
screenshot.add_correlation('domain', '', self.domain.id)