mirror of https://github.com/CIRCL/lookyloo
chg: refactorize contacts filtering
parent
9fa71c8cf8
commit
f4f234a331
|
@ -118,6 +118,7 @@ dump.rdb
|
||||||
# Local config files
|
# Local config files
|
||||||
config/*.json
|
config/*.json
|
||||||
config/*.json.bkp
|
config/*.json.bkp
|
||||||
|
config/takedown_filters.ini
|
||||||
|
|
||||||
# user defined known content
|
# user defined known content
|
||||||
known_content_user/
|
known_content_user/
|
||||||
|
|
|
@ -0,0 +1,28 @@
|
||||||
|
[abuse]
|
||||||
|
ignore=
|
||||||
|
ripe.net$
|
||||||
|
arin.net$
|
||||||
|
apnic.net$
|
||||||
|
idnic.net$
|
||||||
|
peering@
|
||||||
|
domreg@
|
||||||
|
registrar-email
|
||||||
|
akamai.com$
|
||||||
|
google.com$
|
||||||
|
arin-noc@tucows.com
|
||||||
|
dnstech@tucows.com
|
||||||
|
avermeer@tucows.com
|
||||||
|
arin-maint@tucows.com
|
||||||
|
amzn-noc-contact@amazon.com
|
||||||
|
aws-routing-poc@amazon.com
|
||||||
|
aws-rpki-routing-poc@amazon.com
|
||||||
|
|
||||||
|
[replacelist]
|
||||||
|
noc@as5577.net=abuse@as5577.net
|
||||||
|
abuse@godaddy.com=abuse@godaddy.com,phishing@godaddy.com,malware@godaddy.com
|
||||||
|
|
||||||
|
[domain]
|
||||||
|
ignore=
|
||||||
|
apple.com
|
||||||
|
paypal.com
|
||||||
|
google.com
|
|
@ -2,10 +2,12 @@
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import configparser
|
||||||
import hashlib
|
import hashlib
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import time
|
import time
|
||||||
|
|
||||||
from datetime import datetime, timedelta, date
|
from datetime import datetime, timedelta, date
|
||||||
|
@ -53,31 +55,55 @@ def get_resources_hashes(har2tree_container: CrawledTree | HostNode | URLNode) -
|
||||||
return all_ressources_hashes
|
return all_ressources_hashes
|
||||||
|
|
||||||
|
|
||||||
@lru_cache
def get_taxonomies() -> Taxonomies:
    """Return a Taxonomies object.

    The function is wrapped in lru_cache and takes no argument, so the
    object built on the first call is the one returned afterwards.
    """
    taxonomies = Taxonomies()
    return taxonomies
|
||||||
|
|
||||||
|
|
||||||
@lru_cache
def get_public_suffix_list() -> PublicSuffixList:
    """Initialize the Public Suffix List.

    The result is memoized by lru_cache: the PublicSuffixList object is
    only instantiated on the first call.
    """
    # TODO (?): fetch the list
    public_suffix_list = PublicSuffixList()
    return public_suffix_list
|
||||||
|
|
||||||
|
|
||||||
@lru_cache
def get_captures_dir() -> Path:
    """Return the path of the 'scraped' directory located in the home directory.

    safe_create_dir is called on the path before it is returned
    (presumably creating the directory when it is missing - confirm in
    safe_create_dir). The result is memoized by lru_cache.
    """
    scraped_dir = get_homedir() / 'scraped'
    safe_create_dir(scraped_dir)
    return scraped_dir
|
||||||
|
|
||||||
|
|
||||||
@lru_cache
def get_email_template() -> str:
    """Return the content of config/email.tmpl (relative to the home directory).

    The result is memoized by lru_cache, the file is read on the first call only.
    """
    template_path = get_homedir() / 'config' / 'email.tmpl'
    with template_path.open() as template_file:
        content = template_file.read()
    return content
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache
def load_takedown_filters() -> tuple[re.Pattern[str], re.Pattern[str], dict[str, list[str]]]:
    """Load the takedown contact filters from config/takedown_filters.ini.

    Returns a 3-tuple:
      * a compiled pattern matching the domains to ignore, and their subdomains,
      * a compiled pattern matching the email addresses to ignore,
      * a dict mapping an email address to the list of addresses replacing it.

    The entries of both ignore lists are used as regular expressions, they
    are not escaped. The result is memoized by lru_cache, so the file is
    only parsed on the first call.

    Raises LookylooException if the filters file does not exist.
    """
    filter_ini_file = get_homedir() / 'config' / 'takedown_filters.ini'
    if not filter_ini_file.exists():
        raise LookylooException(f'Unable to find the takedown filters file: {filter_ini_file}')
    config = configparser.ConfigParser()
    # Keep the option names as they are written in the file (the default lowercases them).
    config.optionxform = str  # type: ignore[method-assign,assignment]
    config.read(filter_ini_file)

    # An empty alternation would compile to '', which matches *every* string:
    # fall back on a pattern that can never match when a list is empty.
    never_match = r'(?!)'

    # Compile the domains and subdomains to ignore.
    # Each entry is anchored on both sides so 'apple.com' matches 'apple.com'
    # and 'www.apple.com', but not 'pineapple.com', even when the caller uses
    # re.search.
    domain_entries = [d.strip() for d in config['domain']['ignore'].split('\n') if d.strip()]
    ignore_domains_list = [rf'^(?:.*\.)?(?:{d})$' for d in domain_entries]
    ignore_domains = re.compile('|'.join(ignore_domains_list) if ignore_domains_list else never_match)

    # Compile the email addresses to ignore.
    email_entries = [i.strip() for i in config['abuse']['ignore'].split('\n') if i.strip()]
    ignore_emails = re.compile('|'.join(email_entries) if email_entries else never_match)

    # Make the replace list a dictionary, the replacements are comma separated.
    # Surrounding spaces and empty entries are dropped ('a@b.c, d@e.f').
    replace_list = {to_replace: [r.strip() for r in config['replacelist'][to_replace].split(',') if r.strip()]
                    for to_replace in config['replacelist']}

    return ignore_domains, ignore_emails, replace_list
|
||||||
|
|
||||||
|
|
||||||
def make_dirs_list(root_dir: Path) -> list[Path]:
|
def make_dirs_list(root_dir: Path) -> list[Path]:
|
||||||
directories = []
|
directories = []
|
||||||
year_now = date.today().year
|
year_now = date.today().year
|
||||||
|
|
|
@ -3,7 +3,6 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import base64
|
import base64
|
||||||
import configparser
|
|
||||||
import copy
|
import copy
|
||||||
import gzip
|
import gzip
|
||||||
import json
|
import json
|
||||||
|
@ -58,7 +57,8 @@ from .exceptions import (MissingCaptureDirectory,
|
||||||
from .helpers import (get_captures_dir, get_email_template,
|
from .helpers import (get_captures_dir, get_email_template,
|
||||||
get_resources_hashes, get_taxonomies,
|
get_resources_hashes, get_taxonomies,
|
||||||
uniq_domains, ParsedUserAgent, load_cookies, UserAgents,
|
uniq_domains, ParsedUserAgent, load_cookies, UserAgents,
|
||||||
get_useragent_for_requests, make_ts_from_dirname)
|
get_useragent_for_requests, make_ts_from_dirname, load_takedown_filters
|
||||||
|
)
|
||||||
from .modules import (MISPs, PhishingInitiative, UniversalWhois,
|
from .modules import (MISPs, PhishingInitiative, UniversalWhois,
|
||||||
UrlScan, VirusTotal, Phishtank, Hashlookup,
|
UrlScan, VirusTotal, Phishtank, Hashlookup,
|
||||||
RiskIQ, RiskIQError, Pandora, URLhaus, CIRCLPDNS)
|
RiskIQ, RiskIQError, Pandora, URLhaus, CIRCLPDNS)
|
||||||
|
@ -722,6 +722,9 @@ class Lookyloo():
|
||||||
'asns': {}, # ASN: [list of contacts from whois]
|
'asns': {}, # ASN: [list of contacts from whois]
|
||||||
'all_emails': set()
|
'all_emails': set()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if to_return['contacts']:
|
||||||
|
to_return['all_emails'] |= set(to_return['contacts'])
|
||||||
to_return['ips'] = {ip: self.uwhois.whois(ip, contact_email_only=True) for ip in set(hostnode.resolved_ips['v4']) | set(hostnode.resolved_ips['v6'])}
|
to_return['ips'] = {ip: self.uwhois.whois(ip, contact_email_only=True) for ip in set(hostnode.resolved_ips['v4']) | set(hostnode.resolved_ips['v6'])}
|
||||||
to_return['asns'] = {asn['asn']: self.uwhois.whois(f'AS{asn["asn"]}', contact_email_only=True) for asn in hostnode.ipasn.values()}
|
to_return['asns'] = {asn['asn']: self.uwhois.whois(f'AS{asn["asn"]}', contact_email_only=True) for asn in hostnode.ipasn.values()}
|
||||||
|
|
||||||
|
@ -763,57 +766,44 @@ class Lookyloo():
|
||||||
to_return['all_emails'] = list(to_return['all_emails'])
|
to_return['all_emails'] = list(to_return['all_emails'])
|
||||||
return to_return
|
return to_return
|
||||||
|
|
||||||
def takedown_filtered(self, hostnode: HostNode) -> set[str] | None:
    """Return the takedown contact emails of a hostnode, once filtered.

    The filters come from load_takedown_filters():
      * if the name of the node is not a domain, or if the domain is in the
        list of domains to ignore, there are no contacts and None is returned,
      * the emails matching the ignore list are dropped,
      * the emails present in the replace list are swapped for their replacements.

    Returns the set of remaining emails (it can be empty), or None.
    """
    ignore_domains, ignore_emails, replace_list = load_takedown_filters()

    # checking if domain should be ignored
    pattern = r"(https?://)?(www\d?\.)?(?P<domain>[\w\.-]+\.\w+)(/\S*)?"
    match = re.match(pattern, hostnode.name)
    if not match:
        # NOTE: the name may not be a hostname if the capture is not a URL.
        # The name is not a domain, we won't have any contacts.
        self.logger.debug(f'{hostnode.name} is not a domain, no contacts.')
        return None
    # The ignore patterns end with '$': they must be matched from the start
    # of the domain, otherwise 'apple.com' also ignores 'pineapple.com'.
    # An empty pattern matches everything, it means "nothing to ignore".
    if ignore_domains.pattern and ignore_domains.match(match.group("domain")):
        self.logger.debug(f'{hostnode.name} is ignored')
        return None

    result = self.takedown_details(hostnode)
    # process mails
    final_mails: set[str] = set()
    for mail in result['all_emails']:
        # Same guard as for the domains: an empty ignore list ignores nothing.
        if ignore_emails.pattern and ignore_emails.search(mail):
            self.logger.debug(f'{mail} is ignored')
            continue
        if mail in replace_list:
            final_mails |= set(replace_list[mail])
        else:
            final_mails.add(mail)
    return final_mails
|
||||||
|
|
||||||
|
def contacts_filtered(self, capture_uuid: str, /) -> set[str]:
    """Return the filtered takedown emails for the rendered hostnode of a capture and its ancestors.

    takedown_filtered is called on every ancestor of the rendered hostnode
    (in reversed order), then on the rendered hostnode itself, and all the
    emails are merged in a single set. Nodes without contacts are skipped.
    """
    tree = self.get_crawled_tree(capture_uuid)
    rendered_uuid = tree.root_hartree.rendered_node.hostnode_uuid
    rendered_hostnode = self.get_hostnode_from_tree(capture_uuid, rendered_uuid)
    # Ancestors first, the rendered node last.
    hostnodes = [*reversed(rendered_hostnode.get_ancestors()), rendered_hostnode]
    collected: set[str] = set()
    for hostnode in hostnodes:
        emails = self.takedown_filtered(hostnode)
        if emails:
            collected |= emails
    return collected
|
||||||
|
|
||||||
def contacts(self, capture_uuid: str, /) -> list[dict[str, Any]]:
|
def contacts(self, capture_uuid: str, /) -> list[dict[str, Any]]:
|
||||||
capture = self.get_crawled_tree(capture_uuid)
|
capture = self.get_crawled_tree(capture_uuid)
|
||||||
rendered_hostnode = self.get_hostnode_from_tree(capture_uuid, capture.root_hartree.rendered_node.hostnode_uuid)
|
rendered_hostnode = self.get_hostnode_from_tree(capture_uuid, capture.root_hartree.rendered_node.hostnode_uuid)
|
||||||
|
|
|
@ -76,11 +76,9 @@ class UniversalWhois(AbstractModule):
|
||||||
...
|
...
|
||||||
|
|
||||||
def whois(self, query: str, contact_email_only: bool=False) -> str | list[str]:
|
def whois(self, query: str, contact_email_only: bool=False) -> str | list[str]:
|
||||||
|
|
||||||
EMAIL_REGEX = rb'(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)'
|
|
||||||
|
|
||||||
if not self.available:
|
if not self.available:
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
bytes_whois = b''
|
bytes_whois = b''
|
||||||
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
|
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
|
||||||
sock.connect((self.server, self.port))
|
sock.connect((self.server, self.port))
|
||||||
|
@ -108,5 +106,5 @@ class UniversalWhois(AbstractModule):
|
||||||
# We either dont have an abuse-c object or it does not exist
|
# We either dont have an abuse-c object or it does not exist
|
||||||
if not contact_email_only:
|
if not contact_email_only:
|
||||||
return bytes_whois.decode()
|
return bytes_whois.decode()
|
||||||
emails = list(set(re.findall(EMAIL_REGEX, bytes_whois)))
|
emails = list(set(re.findall(rb'[\w\.-]+@[\w\.-]+', bytes_whois)))
|
||||||
return [e.decode() for e in sorted(emails)]
|
return [e.decode() for e in sorted(emails)]
|
||||||
|
|
|
@ -582,6 +582,7 @@ class Comparables(Resource): # type: ignore[misc]
|
||||||
|
|
||||||
# Body of the takedown request: the capture to look up and, optionally,
# whether the response should be the filtered list of emails.
takedown_fields = api.model('TakedownFields', dict(
    capture_uuid=fields.String(description="The UUID of the capture.", required=True),
    filter=fields.Boolean(description="If true, the response is a list of emails.", default=False),
))
|
||||||
|
|
||||||
|
|
||||||
|
@ -589,12 +590,17 @@ takedown_fields = api.model('TakedownFields', {
|
||||||
@api.doc(description='Get information for triggering a takedown request')
|
@api.doc(description='Get information for triggering a takedown request')
|
||||||
class Takedown(Resource): # type: ignore[misc]
|
class Takedown(Resource): # type: ignore[misc]
|
||||||
@api.doc(body=takedown_fields)  # type: ignore[misc]
def post(self) -> list[dict[str, Any]] | dict[str, str] | list[str]:
    """Return the takedown contacts of a capture.

    The JSON body must contain 'capture_uuid'. When 'filter' is set to a
    true value, the response is the list of filtered emails
    (lookyloo.contacts_filtered), otherwise the detailed contacts
    (lookyloo.contacts). A dict with an 'error' key is returned when
    UWhois is not available or when 'capture_uuid' is missing.
    """
    if not lookyloo.uwhois.available:
        return {'error': 'UWhois not available, cannot get contacts.'}
    parameters: dict[str, Any] = request.get_json(force=True)
    capture_uuid = parameters.get('capture_uuid')
    if not capture_uuid:
        return {'error': f'Invalid request: {parameters}'}
    wants_filtered = parameters.get('filter')
    if not wants_filtered:
        return lookyloo.contacts(capture_uuid)
    return list(lookyloo.contacts_filtered(capture_uuid))
|
||||||
|
|
||||||
|
|
||||||
# Admin stuff
|
# Admin stuff
|
||||||
|
|
Loading…
Reference in New Issue