# misp-modules/misp_modules/modules/expansion/urlscan.py
import json
import logging
import sys
import time

import requests

log = logging.getLogger('urlscan')
log.setLevel(logging.DEBUG)
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
log.addHandler(ch)

moduleinfo = {
    'version': '0.1',
    'author': 'Dave Johnson',
    'description': 'An expansion module to query urlscan.io.',
    'module-type': ['expansion'],
    'name': 'URLScan Lookup',
    'logo': 'urlscan.jpg',
    'requirements': ['Access to the urlscan.io API'],
    'features': 'This module takes a MISP attribute as input and queries urlscan.io with it.\n\nThe result of this query is then parsed and some data is mapped into MISP attributes in order to enrich the input attribute.',
    'references': ['https://urlscan.io/'],
    'input': 'A domain, hostname or url attribute.',
    'output': 'MISP attributes mapped from the result of the query on urlscan.io.',
}

moduleconfig = ['apikey']
misperrors = {'error': 'Error'}
mispattributes = {
    'input': ['hostname', 'domain', 'ip-src', 'ip-dst', 'url'],
    'output': ['hostname', 'domain', 'ip-src', 'ip-dst', 'url', 'text', 'link', 'hash']
}


def handler(q=False):
    if q is False:
        return False
    request = json.loads(q)
    if not request.get('config') or not request['config'].get('apikey'):
        misperrors['error'] = 'Urlscan apikey is missing'
        return misperrors

    client = urlscanAPI(request['config']['apikey'])
    r = {'results': []}
    if 'ip-src' in request:
        r['results'] += lookup_indicator(client, request['ip-src'])
    if 'ip-dst' in request:
        r['results'] += lookup_indicator(client, request['ip-dst'])
    if 'domain' in request:
        r['results'] += lookup_indicator(client, request['domain'])
    if 'hostname' in request:
        r['results'] += lookup_indicator(client, request['hostname'])
    if 'url' in request:
        r['results'] += lookup_indicator(client, request['url'])

    # Return any errors generated from lookup to the UI and remove duplicates
    uniq = []
    log.debug(r['results'])
    for item in r['results']:
        log.debug(item)
        if 'error' in item:
            misperrors['error'] = item['error']
            return misperrors
        if item not in uniq:
            uniq.append(item)
    r['results'] = uniq
    return r
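
# Illustrative example only: the MISP server hands handler() a serialized
# request, e.g.
#     handler(json.dumps({'url': 'https://example.com',
#                         'config': {'apikey': 'YOUR-URLSCAN-API-KEY'}}))
# (the key here is a placeholder) and gets back {'results': [...]} holding
# the attributes mapped below.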


def lookup_indicator(client, query):
    result = client.search_url(query)
    log.debug('RESULTS: ' + json.dumps(result))
    r = []
    misp_comment = "{}: Enriched via the urlscan module".format(query)

    # Determine if the page is reachable
    for request in result['data']['requests']:
        if request['response'].get('failed'):
            if request['response']['failed']['errorText']:
                log.debug('The page could not load')
                r.append(
                    {'error': 'Domain could not be resolved: {}'.format(request['response']['failed']['errorText'])})

    if result.get('page'):
        if result['page'].get('domain'):
            misp_val = result['page']['domain']
            r.append({'types': 'domain',
                      'categories': ['Network activity'],
                      'values': misp_val,
                      'comment': misp_comment})
        if result['page'].get('ip'):
            misp_val = result['page']['ip']
            r.append({'types': 'ip-dst',
                      'categories': ['Network activity'],
                      'values': misp_val,
                      'comment': misp_comment})
        if result['page'].get('country'):
            misp_val = 'country: ' + result['page']['country']
            if result['page'].get('city'):
                misp_val += ', city: ' + result['page']['city']
            r.append({'types': 'text',
                      'categories': ['External analysis'],
                      'values': misp_val,
                      'comment': misp_comment})
        if result['page'].get('asn'):
            misp_val = result['page']['asn']
            r.append({'types': 'AS',
                      'categories': ['External analysis'],
                      'values': misp_val,
                      'comment': misp_comment})
        if result['page'].get('asnname'):
            misp_val = result['page']['asnname']
            r.append({'types': 'text',
                      'categories': ['External analysis'],
                      'values': misp_val,
                      'comment': misp_comment})

    if result.get('stats'):
        if result['stats'].get('malicious'):
            log.debug('There is something in results > stats > malicious')
            threat_list = set()
            # Guard the nested lookup so a missing Google Safe Browsing
            # processor does not raise a KeyError
            gsb_data = result.get('meta', {}).get('processors', {}).get('gsb', {}).get('data', {})
            if 'matches' in gsb_data:
                for item in gsb_data['matches']:
                    if item['threatType']:
                        threat_list.add(item['threatType'])
            threat_list = ', '.join(threat_list)
            log.debug('threat_list values are: \'' + threat_list + '\'')
            if threat_list:
                misp_val = '{} threat(s) detected'.format(threat_list)
                r.append({'types': 'text',
                          'categories': ['External analysis'],
                          'values': misp_val,
                          'comment': misp_comment})

    if result.get('lists'):
        if result['lists'].get('urls'):
            for url in result['lists']['urls']:
                url = url.lower()
                misp_val = ''
                if 'office' in url:
                    misp_val = "Possible Office-themed phishing"
                elif 'o365' in url or '0365' in url:
                    misp_val = "Possible O365-themed phishing"
                elif 'microsoft' in url:
                    misp_val = "Possible Microsoft-themed phishing"
                elif 'paypal' in url:
                    misp_val = "Possible PayPal-themed phishing"
                elif 'onedrive' in url:
                    misp_val = "Possible OneDrive-themed phishing"
                elif 'docusign' in url:
                    misp_val = "Possible DocuSign-themed phishing"
                # Only append when a keyword matched, so a stale misp_val
                # from a previous iteration is never re-used
                if misp_val:
                    r.append({'types': 'text',
                              'categories': ['External analysis'],
                              'values': misp_val,
                              'comment': misp_comment})

    if result.get('task'):
        if result['task'].get('reportURL'):
            misp_val = result['task']['reportURL']
            r.append({'types': 'link',
                      'categories': ['External analysis'],
                      'values': misp_val,
                      'comment': misp_comment})
        if result['task'].get('screenshotURL'):
            image_url = result['task']['screenshotURL']
            r.append({'types': 'link',
                      'categories': ['External analysis'],
                      'values': image_url,
                      'comment': misp_comment})
            # ## TO DO ###
            # ## Add ability to add an in-line screenshot of the target website into an attribute
            # screenshot = requests.get(image_url).content
            # r.append({'types': ['attachment'],
            #           'categories': ['External analysis'],
            #           'values': image_url,
            #           'image': str(base64.b64encode(screenshot), 'utf-8'),
            #           'comment': 'Screenshot of website'})
    return r
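
# Sketch only: one possible way to finish the TO DO left in lookup_indicator,
# following the draft in its comments. The helper name is hypothetical and the
# base64-encoded 'attachment' mapping is an assumption, not the author's code.
def _screenshot_attribute(image_url):
    import base64
    screenshot = requests.get(image_url).content
    return {'types': ['attachment'],
            'categories': ['External analysis'],
            'values': image_url,
            'image': str(base64.b64encode(screenshot), 'utf-8'),
            'comment': 'Screenshot of website'}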


def introspection():
    return mispattributes


def version():
    moduleinfo['config'] = moduleconfig
    return moduleinfo


class urlscanAPI():
    def __init__(self, apikey=None, uuid=None):
        self.key = apikey
        self.uuid = uuid

    def request(self, query):
        log.debug('From request function with the parameter: ' + query)
        payload = {'url': query}
        headers = {'API-Key': self.key,
                   'Content-Type': "application/json",
                   'Cache-Control': "no-cache"}
        # Troubleshooting problems with the initial search request
        log.debug('PAYLOAD: ' + json.dumps(payload))
        log.debug('HEADERS: ' + json.dumps(headers))
        search_url_string = "https://urlscan.io/api/v1/scan/"
        response = requests.request("POST",
                                    search_url_string,
                                    data=json.dumps(payload),
                                    headers=headers)

        # HTTP 400 - Bad Request
        if response.status_code == 400:
            raise Exception('HTTP Error 400 - Bad Request')
        # HTTP 404 - Not found
        if response.status_code == 404:
            raise Exception('HTTP Error 404 - These are not the droids you\'re looking for')
        # Any other status code
        if response.status_code != 200:
            raise Exception('HTTP Error ' + str(response.status_code))

        if response.text:
            response = json.loads(response.content.decode("utf-8"))
            time.sleep(3)
            self.uuid = response['uuid']

        # Strings to check for errors on the results page
        # Null response string for any unavailable resources
        null_response_string = '"status": 404'
        # Redirect string accounting for 301/302/303/307/308 status codes
        redirect_string = '"status": 30'
        # Normal response string with 200 status code
        normal_response_string = '"status": 200'

        results_url_string = "https://urlscan.io/api/v1/result/" + self.uuid
        log.debug('Results URL: ' + results_url_string)

        # Need to wait for the results to process and check that they are valid
        tries = 10
        while tries >= 0:
            results = requests.request("GET", results_url_string)
            log.debug('Made a GET request')
            results = results.content.decode("utf-8")
            # Check whether there is a 404 status code and no available resources
            if null_response_string in results and \
                    redirect_string not in results and \
                    normal_response_string not in results:
                log.debug('Results not processed. Please check again later.')
                time.sleep(3)
                tries -= 1
            else:
                return json.loads(results)

        raise Exception('Results contained a 404 status error and could not be processed.')

    def search_url(self, query):
        log.debug('From search_url with parameter: ' + query)
        return self.request(query)
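

# Minimal manual smoke test: a sketch assuming direct execution of this file;
# the API key below is a placeholder and must be replaced before running.
if __name__ == '__main__':
    sample_request = {'url': 'https://example.com',
                      'config': {'apikey': 'YOUR-URLSCAN-API-KEY'}}
    print(json.dumps(handler(json.dumps(sample_request)), indent=2))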