From bdbf5388934478d79fee2c5c3ae21b0642d3cd78 Mon Sep 17 00:00:00 2001 From: David J Date: Fri, 10 Aug 2018 16:00:01 -0500 Subject: [PATCH 1/3] Create urlscan.py --- misp_modules/modules/expansion/urlscan.py | 269 ++++++++++++++++++++++ 1 file changed, 269 insertions(+) create mode 100644 misp_modules/modules/expansion/urlscan.py diff --git a/misp_modules/modules/expansion/urlscan.py b/misp_modules/modules/expansion/urlscan.py new file mode 100644 index 0000000..8f4067c --- /dev/null +++ b/misp_modules/modules/expansion/urlscan.py @@ -0,0 +1,269 @@ +import json +import requests +import logging +import sys +import time +# Need base64 if encoding data for attachments, but disabled for now +# import base64 + +log = logging.getLogger('urlscan') +log.setLevel(logging.DEBUG) +ch = logging.StreamHandler(sys.stdout) +ch.setLevel(logging.DEBUG) +formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') +ch.setFormatter(formatter) +log.addHandler(ch) + +moduleinfo = { + 'version': '0.1', + 'author': 'Dave Johnson', + 'description': 'Module to query urlscan.io', + 'module-type': ['expansion'] + } + +moduleconfig = ['apikey'] +misperrors = {'error': 'Error'} +mispattributes = { + 'input': ['hostname', 'domain', 'ip-src', 'ip-dst', 'url'], + 'output': ['hostname', 'domain', 'ip-src', 'ip-dst', 'url', 'text', 'link'] + } + +def handler(q=False): + if q is False: + return False + request = json.loads(q) + if (request.get('config')): + if (request['config'].get('apikey') is None): + misperrors['error'] = 'urlscan apikey is missing' + return misperrors + client = urlscanAPI(request['config']['apikey']) + + r = {'results': []} + + if 'ip-src' in request: + r['results'] += lookup_indicator(client, request['ip-src']) + if 'ip-dst' in request: + r['results'] += lookup_indicator(client, request['ip-dst']) + if 'domain' in request: + r['results'] += lookup_indicator(client, request['domain']) + if 'hostname' in request: + r['results'] += lookup_indicator(client, request['hostname']) + if 'url' in request: + r['results'] += lookup_indicator(client, request['url']) + + uniq = [] + for item in r['results']: + if item not in uniq: + uniq.append(item) + r['results'] = uniq + return r + + +def lookup_indicator(client, query): + result = client.search_url(query) + log.debug('RESULTS: ' + json.dumps(result)) + r = [] + if result.get('page'): + if result['page'].get('domain'): + misp_val = result['page']['domain'] + misp_comment = "Domain associated with {} (source: urlscan.io)".format(query) + r.append({'types': 'domain', + 'categories': ['Network activity'], + 'values': misp_val, + 'comment': misp_comment}) + + if result['page'].get('ip'): + misp_val = result['page']['ip'] + misp_comment = "IP associated with {} (source: urlscan.io)".format(query) + r.append({'types': 'ip-dst', + 'categories': ['Network activity'], + 'values': misp_val, + 'comment': misp_comment}) + + if result['page'].get('country'): + misp_val = 'Country: ' + result['page']['country'] + if result['page'].get('city'): + misp_val += ', City: ' + result['page']['city'] + misp_comment = "Location associated with {} (source: urlscan.io)".format(query) + r.append({'types': 'text', + 'categories': ['External analysis'], + 'values': misp_val, + 'comment': misp_comment}) + + if result['page'].get('asn'): + misp_val = result['page']['asn'] + misp_comment = "ASN associated with {} (source: urlscan.io)".format(query) + r.append({'types': 'AS', 'categories': ['Network activity'], 'values': misp_val, 'comment': misp_comment}) + + if 
result['page'].get('asnname'): + misp_val = result['page']['asnname'] + misp_comment = "ASN name associated with {} (source: urlscan.io)".format(query) + r.append({'types': 'text', + 'categories': ['External analysis'], + 'values': misp_val, + 'comment': misp_comment}) + + if result.get('stats'): + if result['stats'].get('malicious'): + log.debug('There is something in results > stats > malicious') + threat_list = set() + + if 'matches' in result['meta']['processors']['gsb']['data']: + for item in result['meta']['processors']['gsb']['data']['matches']: + if item['threatType']: + threat_list.add(item['threatType']) + + threat_list = ', '.join(threat_list) + log.debug('threat_list values are: \'' + threat_list + '\'') + + if threat_list: + misp_val = '{} threat(s) detected'.format(threat_list) + misp_comment = '{} malicious indicator(s) were present on ' \ + '{} (source: urlscan.io)'.format(result['stats']['malicious'], query, threat_list) + r.append({'types': 'text', + 'categories': ['External analysis'], + 'values': misp_val, + 'comment': misp_comment}) + + if result.get('lists'): + if result['lists'].get('urls'): + for url in result['lists']['urls']: + url = url.lower() + if 'office' in url: + misp_val = 'Possible Microsoft Office themed phishing page' + misp_comment = 'There was resource containing an \'Office\' string in the URL.' + elif 'o365' in url or '0365' in url: + misp_val = 'Possible Microsoft O365 themed phishing page' + misp_comment = 'There was resource containing an \'O365\' string in the URL.' + elif 'microsoft' in url: + misp_val = 'Possible Microsoft themed phishing page' + misp_comment = 'There was resource containing an \'Office\' string in the URL.' + elif 'paypal' in url: + misp_val = 'Possible PayPal themed phishing page' + misp_comment = 'There was resource containing a \'PayPal\' string in the URL.' + elif 'onedrive' in url: + misp_val = 'Possible OneDrive themed phishing page' + misp_comment = 'There was resource containing a \'OneDrive\' string in the URL.' 
+                elif 'docusign' in url:
+                    misp_val = 'Possible DocuSign themed phishing page'
+                    misp_comment = 'There was resource containing a \'DocuSign\' string in the URL'
+                r.append({'types': 'text',
+                          'categories': ['External analysis'],
+                          'values': misp_val,
+                          'comment': misp_comment})
+
+    if result.get('task'):
+        if result['task'].get('reportURL'):
+            misp_val = result['task']['reportURL']
+            misp_comment = 'Link to full report (source: urlscan.io)'
+            r.append({'types': 'link',
+                      'categories': ['External analysis'],
+                      'values': misp_val,
+                      'comment': misp_comment})
+
+        if result['task'].get('screenshotURL'):
+            image_url = result['task']['screenshotURL']
+            misp_comment = 'Link to screenshot (source: urlscan.io)'
+            r.append({'types': 'link',
+                      'categories': ['External analysis'],
+                      'values': image_url,
+                      'comment': misp_comment})
+            ### TO DO ###
+            ### Add ability to add an in-line screenshot of the target website into an attribute
+            # screenshot = requests.get(image_url).content
+            # r.append({'types': ['attachment'],
+            #           'categories': ['External analysis'],
+            #           'values': image_url,
+            #           'image': str(base64.b64encode(screenshot), 'utf-8'),
+            #           'comment': 'Screenshot of website'})
+
+        if result['task'].get('domURL'):
+            misp_val = result['task']['domURL']
+            misp_comment = 'Link to DOM (source: urlscan.io)'
+            r.append({'types': 'link',
+                      'categories': ['External analysis'],
+                      'values': misp_val,
+                      'comment': misp_comment})
+
+    return r
+
+
+def introspection():
+    return mispattributes
+
+
+def version():
+    moduleinfo['config'] = moduleconfig
+    return moduleinfo
+
+
+class urlscanAPI():
+    def __init__(self, apikey=None, uuid=None):
+        self.key = apikey
+        self.uuid = uuid
+
+    def request(self, query):
+        log.debug('From request function with the parameter: ' + query)
+        payload = {'url': query}
+        headers = {'API-Key': self.key,
+                   'Content-Type': "application/json",
+                   'Cache-Control': "no-cache"}
+
+        # Troubleshooting problems with initial search request
+        log.debug('PAYLOAD: ' + json.dumps(payload))
+        log.debug('HEADERS: ' + json.dumps(headers))
+
+        search_url_string = "https://urlscan.io/api/v1/scan/"
+        response = requests.request("POST",
+                                    search_url_string,
+                                    data=json.dumps(payload),
+                                    headers=headers)
+
+        # HTTP 400 - Bad Request
+        if response.status_code == 400:
+            raise Exception('HTTP Error 400 - Bad Request')
+
+        # HTTP 404 - Not found
+        if response.status_code == 404:
+            raise Exception('HTTP Error 404 - These are not the droids you\'re looking for')
+
+        # Any other status code
+        if response.status_code != 200:
+            raise Exception('HTTP Error ' + str(response.status_code))
+
+        if response.text:
+            response = json.loads(response.content.decode("utf-8"))
+            time.sleep(3)
+            self.uuid = response['uuid']
+
+        # Strings to check for errors on the results page
+        # Null response string for any unavailable resources
+        null_response_string = '"status": 404'
+        # Redirect string accounting for 301/302/303/307/308 status codes
+        redirect_string = '"status": 30'
+        # Normal response string with 200 status code
+        normal_response_string = '"status": 200'
+
+        results_url_string = "https://urlscan.io/api/v1/result/" + self.uuid
+        log.debug('Results URL: ' + results_url_string)
+
+        # Need to wait for results to process and check if they are valid
+        tries = 10
+        while tries >= 0:
+            results = requests.request("GET", results_url_string)
+            log.debug('Made a GET request')
+            results = results.content.decode("utf-8")
+            # Check if there is a 404 status code and no available resources
+            if null_response_string in results and \
+ redirect_string not in results and \ + normal_response_string not in results: + log.debug('Results not processed. Please check again later.') + time.sleep(3) + tries -= 1 + else: + return json.loads(results) + raise Exception('Results contained a 404 status error and could not be processed.') + + def search_url(self, query): + log.debug('From search_url with parameter: ' + query) + return self.request(query) From a697f653822b893d95a4142a4d88cbf66821f208 Mon Sep 17 00:00:00 2001 From: David J Date: Tue, 14 Aug 2018 10:51:15 -0500 Subject: [PATCH 2/3] Add error handling for DNS failures, reduce imports, and simplify misp_comments --- misp_modules/modules/expansion/urlscan.py | 78 +++++++++++------------ 1 file changed, 36 insertions(+), 42 deletions(-) diff --git a/misp_modules/modules/expansion/urlscan.py b/misp_modules/modules/expansion/urlscan.py index 8f4067c..a0adc25 100644 --- a/misp_modules/modules/expansion/urlscan.py +++ b/misp_modules/modules/expansion/urlscan.py @@ -3,8 +3,6 @@ import requests import logging import sys import time -# Need base64 if encoding data for attachments, but disabled for now -# import base64 log = logging.getLogger('urlscan') log.setLevel(logging.DEBUG) @@ -15,18 +13,19 @@ ch.setFormatter(formatter) log.addHandler(ch) moduleinfo = { - 'version': '0.1', - 'author': 'Dave Johnson', - 'description': 'Module to query urlscan.io', - 'module-type': ['expansion'] - } + 'version': '0.1', + 'author': 'Dave Johnson', + 'description': 'Module to query urlscan.io', + 'module-type': ['expansion'] +} moduleconfig = ['apikey'] misperrors = {'error': 'Error'} mispattributes = { - 'input': ['hostname', 'domain', 'ip-src', 'ip-dst', 'url'], - 'output': ['hostname', 'domain', 'ip-src', 'ip-dst', 'url', 'text', 'link'] - } + 'input': ['hostname', 'domain', 'url'], + 'output': ['hostname', 'domain', 'ip-src', 'ip-dst', 'url', 'text', 'link'] +} + def handler(q=False): if q is False: @@ -51,8 +50,15 @@ def handler(q=False): if 'url' in request: r['results'] += lookup_indicator(client, request['url']) + # Return any errors generated from lookup to the UI and remove duplicates + uniq = [] + log.debug(r['results']) for item in r['results']: + log.debug(item) + if 'error' in item: + misperrors['error'] = item['error'] + return misperrors if item not in uniq: uniq.append(item) r['results'] = uniq @@ -63,10 +69,19 @@ def lookup_indicator(client, query): result = client.search_url(query) log.debug('RESULTS: ' + json.dumps(result)) r = [] + misp_comment = "{}: Enriched via the urlscan module".format(query) + + # Determine if the page is reachable + for request in result['data']['requests']: + if request['response'].get('failed'): + if request['response']['failed']['errorText']: + log.debug('The page could not load') + r.append( + {'error': 'Domain could not be resolved: {}'.format(request['response']['failed']['errorText'])}) + if result.get('page'): if result['page'].get('domain'): misp_val = result['page']['domain'] - misp_comment = "Domain associated with {} (source: urlscan.io)".format(query) r.append({'types': 'domain', 'categories': ['Network activity'], 'values': misp_val, @@ -74,17 +89,15 @@ def lookup_indicator(client, query): if result['page'].get('ip'): misp_val = result['page']['ip'] - misp_comment = "IP associated with {} (source: urlscan.io)".format(query) r.append({'types': 'ip-dst', 'categories': ['Network activity'], 'values': misp_val, 'comment': misp_comment}) if result['page'].get('country'): - misp_val = 'Country: ' + result['page']['country'] + misp_val = 
'country: ' + result['page']['country'] if result['page'].get('city'): - misp_val += ', City: ' + result['page']['city'] - misp_comment = "Location associated with {} (source: urlscan.io)".format(query) + misp_val += ', city: ' + result['page']['city'] r.append({'types': 'text', 'categories': ['External analysis'], 'values': misp_val, @@ -92,12 +105,10 @@ def lookup_indicator(client, query): if result['page'].get('asn'): misp_val = result['page']['asn'] - misp_comment = "ASN associated with {} (source: urlscan.io)".format(query) - r.append({'types': 'AS', 'categories': ['Network activity'], 'values': misp_val, 'comment': misp_comment}) + r.append({'types': 'AS', 'categories': ['External analysis'], 'values': misp_val, 'comment': misp_comment}) if result['page'].get('asnname'): misp_val = result['page']['asnname'] - misp_comment = "ASN name associated with {} (source: urlscan.io)".format(query) r.append({'types': 'text', 'categories': ['External analysis'], 'values': misp_val, @@ -118,8 +129,6 @@ def lookup_indicator(client, query): if threat_list: misp_val = '{} threat(s) detected'.format(threat_list) - misp_comment = '{} malicious indicator(s) were present on ' \ - '{} (source: urlscan.io)'.format(result['stats']['malicious'], query, threat_list) r.append({'types': 'text', 'categories': ['External analysis'], 'values': misp_val, @@ -130,23 +139,17 @@ def lookup_indicator(client, query): for url in result['lists']['urls']: url = url.lower() if 'office' in url: - misp_val = 'Possible Microsoft Office themed phishing page' - misp_comment = 'There was resource containing an \'Office\' string in the URL.' + misp_val = "Possible Office-themed phishing" elif 'o365' in url or '0365' in url: - misp_val = 'Possible Microsoft O365 themed phishing page' - misp_comment = 'There was resource containing an \'O365\' string in the URL.' + misp_val = "Possible O365-themed phishing" elif 'microsoft' in url: - misp_val = 'Possible Microsoft themed phishing page' - misp_comment = 'There was resource containing an \'Office\' string in the URL.' + misp_val = "Possible Microsoft-themed phishing" elif 'paypal' in url: - misp_val = 'Possible PayPal themed phishing page' - misp_comment = 'There was resource containing a \'PayPal\' string in the URL.' + misp_val = "Possible PayPal-themed phishing" elif 'onedrive' in url: - misp_val = 'Possible OneDrive themed phishing page' - misp_comment = 'There was resource containing a \'OneDrive\' string in the URL.' 
+ misp_val = "Possible OneDrive-themed phishing" elif 'docusign' in url: - misp_val = 'Possible DocuSign themed phishing page' - misp_comment = 'There was resource containing a \'DocuSign\' string in the URL' + misp_val = "Possible DocuSign-themed phishing" r.append({'types': 'text', 'categories': ['External analysis'], 'values': misp_val, @@ -155,7 +158,6 @@ def lookup_indicator(client, query): if result.get('task'): if result['task'].get('reportURL'): misp_val = result['task']['reportURL'] - misp_comment = 'Link to full report (source: urlscan.io)' r.append({'types': 'link', 'categories': ['External analysis'], 'values': misp_val, @@ -163,7 +165,6 @@ def lookup_indicator(client, query): if result['task'].get('screenshotURL'): image_url = result['task']['screenshotURL'] - misp_comment = 'Link to screenshot (source: urlscan.io)' r.append({'types': 'link', 'categories': ['External analysis'], 'values': image_url, @@ -177,14 +178,6 @@ def lookup_indicator(client, query): # 'image': str(base64.b64encode(screenshot), 'utf-8'), # 'comment': 'Screenshot of website'}) - if result['task'].get('domURL'): - misp_val = result['task']['domURL'] - misp_comment = 'Link to DOM (source: urlscan.io)' - r.append({'types': 'link', - 'categories': ['External analysis'], - 'values': misp_val, - 'comment': misp_comment}) - return r @@ -262,6 +255,7 @@ class urlscanAPI(): tries -= 1 else: return json.loads(results) + raise Exception('Results contained a 404 status error and could not be processed.') def search_url(self, query): From 7deeb95820e1e4099208496025807e6ae6ef1164 Mon Sep 17 00:00:00 2001 From: Christophe Vandeplas Date: Tue, 21 Aug 2018 11:13:08 +0200 Subject: [PATCH 3/3] fix: ta_import - bugfixes --- .../import_mod/threatanalyzer_import.py | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/misp_modules/modules/import_mod/threatanalyzer_import.py b/misp_modules/modules/import_mod/threatanalyzer_import.py index 916628e..2e3a507 100755 --- a/misp_modules/modules/import_mod/threatanalyzer_import.py +++ b/misp_modules/modules/import_mod/threatanalyzer_import.py @@ -15,7 +15,7 @@ misperrors = {'error': 'Error'} userConfig = {} inputSource = ['file'] -moduleinfo = {'version': '0.8', 'author': 'Christophe Vandeplas', +moduleinfo = {'version': '0.9', 'author': 'Christophe Vandeplas', 'description': 'Import for ThreatAnalyzer archive.zip/analysis.json files', 'module-type': ['import']} @@ -45,7 +45,7 @@ def handler(q=False): if re.match(r"Analysis/proc_\d+/modified_files/mapping\.log", zip_file_name): with zf.open(zip_file_name, mode='r', pwd=None) as fp: file_data = fp.read() - for line in file_data.decode().split('\n'): + for line in file_data.decode("utf-8", 'ignore').split('\n'): if not line: continue if line.count('|') == 3: @@ -55,7 +55,8 @@ def handler(q=False): l_fname = cleanup_filepath(l_fname) if l_fname: if l_size == 0: - pass # FIXME create an attribute for the filename/path + results.append({'values': l_fname, 'type': 'filename', 'to_ids': True, + 'categories': ['Artifacts dropped', 'Payload delivery'], 'comment': ''}) else: # file is a non empty sample, upload the sample later modified_files_mapping[l_md5] = l_fname @@ -144,13 +145,14 @@ def process_analysis_json(analysis_json): # ) yield({'values': connection_section_connection['@remote_hostname'], 'type': 'hostname', 'to_ids': True, 'comment': ''}) if 'http_command' in connection_section_connection: - # print('connection_section_connection HTTP COMMAND: {}\t{}'.format( - # 
connection_section_connection['http_command']['@method'], # comment - # connection_section_connection['http_command']['@url']) # url - # ) - val = cleanup_url(connection_section_connection['http_command']['@url']) - if val: - yield({'values': val, 'type': 'url', 'categories': ['Network activity'], 'to_ids': True, 'comment': connection_section_connection['http_command']['@method']}) + for http_command in connection_section_connection['http_command']: + # print('connection_section_connection HTTP COMMAND: {}\t{}'.format( + # connection_section_connection['http_command']['@method'], # comment + # connection_section_connection['http_command']['@url']) # url + # ) + val = cleanup_url(http_command['@url']) + if val: + yield({'values': val, 'type': 'url', 'categories': ['Network activity'], 'to_ids': True, 'comment': http_command['@method']}) if 'http_header' in connection_section_connection: for http_header in connection_section_connection['http_header']: @@ -453,9 +455,9 @@ def cleanup_filepath(item): '\\AppData\\Roaming\\Adobe\\Acrobat\\9.0\\UserCache.bin', '\\AppData\\Roaming\\Macromedia\\Flash Player\\macromedia.com\\support\\flashplayer\\sys\\settings.sol', - '\\AppData\\Roaming\Adobe\\Flash Player\\NativeCache\\', + '\\AppData\\Roaming\\Adobe\\Flash Player\\NativeCache\\', 'C:\\Windows\\AppCompat\\Programs\\', - 'C:\~' # caused by temp file created by MS Office when opening malicious doc/xls/... + 'C:\\~' # caused by temp file created by MS Office when opening malicious doc/xls/... } if list_in_string(noise_substrings, item): return None @@ -531,4 +533,3 @@ def introspection(): def version(): moduleinfo['config'] = moduleconfig return moduleinfo -
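Reviewer note on PATCH 1/3 and 2/3: a minimal sketch of how the urlscan expansion module can be exercised outside MISP, for anyone testing the series locally. The payload shape (a 'config' dict carrying the API key plus one declared input attribute) follows handler() above; the key value and target URL below are placeholders.

    import json
    from misp_modules.modules.expansion import urlscan

    # handler() expects the same JSON document MISP sends to an
    # expansion module: a 'config' dict and one input attribute.
    query = json.dumps({
        'config': {'apikey': 'YOUR-URLSCAN-API-KEY'},  # placeholder
        'url': 'https://example.com',                  # placeholder
    })

    response = urlscan.handler(q=query)
    # On success this is {'results': [...]} with duplicates removed;
    # on failure it is {'error': '...'}.
    print(json.dumps(response, indent=2))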
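Reviewer note on urlscanAPI.request(): the polling loop matches raw substrings such as '"status": 404' against the response body. A sketch of a simpler alternative, assuming a pending scan is served with a non-200 HTTP status (which is what those substring checks imply), is to poll on response.status_code instead; the endpoint, retry count, and delay mirror the module's own values.

    import time
    import requests

    def poll_results(uuid, tries=10, delay=3):
        # Poll until urlscan.io serves the finished result with HTTP 200.
        url = 'https://urlscan.io/api/v1/result/' + uuid
        for _ in range(tries):
            response = requests.get(url)
            if response.status_code == 200:
                return response.json()
            time.sleep(delay)
        raise Exception('Results were not ready after {} tries.'.format(tries))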
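Reviewer note on the cleanup_filepath() changes in PATCH 3/3: doubling the backslashes in '\\AppData\\Roaming\\Adobe\\...' and 'C:\\~' is more than cosmetic. In Python, an unrecognized escape such as '\A' or '\~' happens to pass through literally but has been deprecated since 3.6, while a recognized escape silently corrupts the string. A quick illustration:

    assert 'C:\\~' == r'C:\~'    # the explicit forms are equivalent
    assert '\a' == '\x07'        # '\a' is BEL, not backslash + 'a'
    assert '\\a' != '\a'         # only the doubled form keeps the backslash
    # 'C:\~' still equals 'C:\\~' today, but it emits a DeprecationWarning
    # under `python -W error` and is slated to become a SyntaxError.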
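Reviewer note on the http_command loop in PATCH 3/3: the fix assumes connection_section_connection['http_command'] is always a list. If the analysis JSON is ever derived from XML (where parsers such as xmltodict return a dict for a single element and a list only for repeated ones), iterating over a lone dict would walk its keys instead of the entries. A defensive sketch under that assumption:

    def as_list(value):
        # Normalize the dict-or-list ambiguity so callers can always iterate.
        return value if isinstance(value, list) else [value]

    # Hypothetical use at the call site patched above:
    # for http_command in as_list(connection_section_connection['http_command']):
    #     val = cleanup_url(http_command['@url'])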