chg: [crawler] add unsafe tag if domain contains an unsafe screenshot

pull/594/head
Terrtia 2023-05-10 16:28:19 +02:00
parent 37c71b8438
commit 6b60041db2
No known key found for this signature in database
GPG Key ID: 1E1B1F50D84613D0
1 changed file with 11 additions and 2 deletions

View File

@@ -15,6 +15,7 @@ from modules.abstract_module import AbstractModule
from lib import crawlers from lib import crawlers
from lib.ConfigLoader import ConfigLoader from lib.ConfigLoader import ConfigLoader
from lib.objects.Domains import Domain from lib.objects.Domains import Domain
from lib.objects.Items import Item
from lib.objects import Screenshots from lib.objects import Screenshots
@@ -53,6 +54,9 @@ class Crawler(AbstractModule):
self.items_dir = None self.items_dir = None
self.domain = None self.domain = None
# TODO Replace with warning list ???
self.placeholder_screenshots = {'27e14ace10b0f96acd2bd919aaa98a964597532c35b6409dff6cc8eec8214748'}
# Send module state to logs # Send module state to logs
self.redis_logger.info('Crawler initialized') self.redis_logger.info('Crawler initialized')
@@ -248,8 +252,13 @@ class Crawler(AbstractModule):
if 'png' in entries and entries['png']: if 'png' in entries and entries['png']:
screenshot = Screenshots.create_screenshot(entries['png'], b64=False) screenshot = Screenshots.create_screenshot(entries['png'], b64=False)
if screenshot: if screenshot:
# Remove Errors pages # TODO Replace with warning list ??? if not screenshot.is_tags_safe():
if screenshot.id not in ['27e14ace10b0f96acd2bd919aaa98a964597532c35b6409dff6cc8eec8214748']: unsafe_tag = 'dark-web:topic="pornography-child-exploitation"'
self.domain.add_tag(unsafe_tag)
item = Item(item_id)
item.add_tag(unsafe_tag)
# Remove Placeholder pages # TODO Replace with warning list ???
if screenshot.id not in self.placeholder_screenshots:
# Create Correlations # Create Correlations
screenshot.add_correlation('item', '', item_id) screenshot.add_correlation('item', '', item_id)
screenshot.add_correlation('domain', '', self.domain.id) screenshot.add_correlation('domain', '', self.domain.id)