diff --git a/misp_modules/modules/import_mod/cuckooimport.py b/misp_modules/modules/import_mod/cuckooimport.py index fc0c30e..c89e88d 100755 --- a/misp_modules/modules/import_mod/cuckooimport.py +++ b/misp_modules/modules/import_mod/cuckooimport.py @@ -1,198 +1,712 @@ import json import base64 +import tarfile +import logging +import posixpath +from io import BytesIO, BufferedReader +from pymisp import MISPEvent, MISPObject, MISPAttribute +from pymisp.tools import make_binary_objects +from collections import OrderedDict + +log = logging.getLogger(__name__) misperrors = {'error': 'Error'} -userConfig = {} -inputSource = ['file'] -moduleinfo = {'version': '0.1', 'author': 'Victor van der Stoep', - 'description': 'Cuckoo JSON import', +moduleinfo = {'version': '1.0', + 'author': 'Pierre-Jean Grenier', + 'description': 'Cuckoo archive import', 'module-type': ['import']} moduleconfig = [] +mispattributes = { + 'inputSource': ['file'], + 'output': ['MISP objects', 'malware-sample'], + 'format': 'misp_standard', +} + +# Attributes for which we can set the "Artifacts dropped" +# category if we want to +ARTIFACTS_DROPPED = ( + "filename", + "md5", + "sha1", + "sha256", + "sha512", + "malware-sample", + "mimetype", + "ssdeep", +) + +# Same for the category "Payload delivery" +PAYLOAD_DELIVERY = ARTIFACTS_DROPPED + + +class PrettyDict(OrderedDict): + """ + This class is just intended for a pretty print + of its keys and values. + """ + MAX_SIZE = 30 + + def __str__(self): + tmp = [] + for k, v in self.items(): + v = str(v) + if len(v) > self.MAX_SIZE: + k += ',cut' + v = v[:self.MAX_SIZE] + v.replace('\n', ' ') + tmp.append((k, v)) + return "; ".join(f"({k}) {v}" for k, v in tmp) + + +def search_objects(event, name, attributes=[]): + """ + Search for objects in event, which name is `name` and + contain at least the attributes given. + Return a generator. + @ param attributes: a list of (object_relation, value) + """ + match = filter( + lambda obj: all( + obj.name == name + and (obj_relation, str(attr_value)) in map( + lambda attr: (attr.object_relation, str(attr.value)), + obj.attributes + ) + for obj_relation, attr_value in attributes + ), event.objects + ) + return match + + +def find_process_by_pid(event, pid): + """ + Find a 'process' MISPObject by its PID. If multiple objects are found, + only return the first one. + @ param pid: integer or str + """ + generator = search_objects( + event, + "process", + (('pid', pid),) + ) + return next(generator, None) + + +class CuckooParser(): + # This dict is used to generate the userConfig and link the different + # options to the corresponding method of the parser. This way, we avoid + # redundancy and make future changes easier (instead of for instance + # defining all the options in userConfig directly, and then making a + # switch when running the parser). + # Careful about the order here, as we create references between + # MISPObjects/MISPAttributes at the same time we generate them. + # Hence when we create object B, which we want to reference to + # object A, we should already have created object A. + # TODO create references only after all parsing is done + options = { + "Sandbox info": { + "method": lambda self: self.add_sandbox_info(), + "userConfig": { + 'type': 'Boolean', + 'message': "Add info related to the sandbox", + 'checked': 'true', + }, + }, + "Upload sample": { + "method": lambda self: self.add_sample(), + "userConfig": { + 'type': 'Boolean', + 'message': "Upload the sample", + 'checked': 'true', + }, + }, + "Processes": { + "method": lambda self: self.add_process_tree(), + "userConfig": { + 'type': 'Boolean', + 'message': "Add info related to the processes", + 'checked': 'true', + }, + }, + "DNS": { + "method": lambda self: self.add_dns(), + "userConfig": { + 'type': 'Boolean', + 'message': "Add DNS queries/answers", + 'checked': 'true', + }, + }, + "TCP": { + "method": lambda self: self.add_network("tcp"), + "userConfig": { + 'type': 'Boolean', + 'message': "Add TCP connections", + 'checked': 'true', + }, + }, + "UDP": { + "method": lambda self: self.add_network("udp"), + "userConfig": { + 'type': 'Boolean', + 'message': "Add UDP connections", + 'checked': 'true', + }, + }, + "HTTP": { + "method": lambda self: self.add_http(), + "userConfig": { + 'type': 'Boolean', + 'message': "Add HTTP requests", + 'checked': 'true', + }, + }, + "Signatures": { + "method": lambda self: self.add_signatures(), + "userConfig": { + 'type': 'Boolean', + 'message': "Add Cuckoo's triggered signatures", + 'checked': 'true', + }, + }, + "Screenshots": { + "method": lambda self: self.add_screenshots(), + "userConfig": { + 'type': 'Boolean', + 'message': "Upload the screenshots", + 'checked': 'true', + }, + }, + "Dropped files": { + "method": lambda self: self.add_dropped_files(), + "userConfig": { + 'type': 'Boolean', + 'message': "Upload the dropped files", + 'checked': 'true', + }, + }, + "Dropped buffers": { + "method": lambda self: self.add_dropped_buffers(), + "userConfig": { + 'type': 'Boolean', + 'message': "Upload the dropped buffers", + 'checked': 'true', + }, + }, + } + + def __init__(self, config): + self.event = MISPEvent() + self.files = None + self.malware_binary = None + self.report = None + self.config = {key: int(on) for key, on in config.items()} + + def get_file(self, relative_filepath): + """Return a BufferedReader for the corresponding relative_filepath + in the Cuckoo archive. If not found, return an empty BufferedReader + to avoid fatal errors.""" + blackhole = BufferedReader(open('/dev/null', 'rb')) + res = self.files.get(relative_filepath, blackhole) + if res == blackhole: + log.debug(f"Did not find file {relative_filepath}, " + f"returned an empty file instead") + return res + + def read_archive(self, archive_encoded): + """Read the archive exported from Cuckoo and initialize the class""" + # archive_encoded is base 64 encoded content + # we extract the info about each file but do not retrieve + # it automatically, as it may take too much space in memory + buf_io = BytesIO(base64.b64decode(archive_encoded)) + f = tarfile.open(fileobj=buf_io, mode='r:bz2') + self.files = { + info.name: f.extractfile(info) + for info in f.getmembers() + } + + # We want to keep the order of the keys of sub-dicts in the report, + # eg. the signatures have marks with unknown keys such as + # {'marks': [ + # {"suspicious_features": "Connection to IP address", + # "suspicious_request": "OPTIONS http://85.20.18.18/doc"} + # ]} + # To render those marks properly, we can only hope the developpers + # thought about the order in which they put the keys, and keep this + # order so that the signature makes sense to the reader. + # We use PrettyDict, a customization of OrderedDict to do so. + # It will be instanced iteratively when parsing the json (ie. subdicts + # will also be instanced as PrettyDict) + self.report = json.load( + self.get_file("reports/report.json"), + object_pairs_hook=PrettyDict, + ) + + def read_malware(self): + self.malware_binary = self.get_file("binary").read() + if not self.malware_binary: + log.warn("No malware binary found") + + def add_sandbox_info(self): + info = self.report.get("info", {}) + if not info: + log.warning("The 'info' field was not found " + "in the report, skipping") + return False + + o = MISPObject(name='sandbox-report') + o.add_attribute('score', info['score']) + o.add_attribute('sandbox-type', 'on-premise') + o.add_attribute('on-premise-sandbox', 'cuckoo') + o.add_attribute('raw-report', + f'started on:{info["machine"]["started_on"]} ' + f'duration:{info["duration"]}s ' + f'vm:{info["machine"]["name"]}/' + f'{info["machine"]["label"]}') + self.event.add_object(o) + + def add_sample(self): + """Add the sample/target of the analysis""" + target = self.report.get("target", {}) + category = target.get("category", "") + if not category: + log.warning("Could not find info about the sample " + "in the report, skipping") + return False + + if category == "file": + log.debug("Sample is a file, uploading it") + self.read_malware() + file_o, bin_type_o, bin_section_li = make_binary_objects( + pseudofile=BytesIO(self.malware_binary), + filename=target["file"]["name"], + ) + + file_o.comment = "Submitted sample" + # fix categories + for obj in filter(None, (file_o, bin_type_o, *bin_section_li,)): + for attr in obj.attributes: + if attr.type in PAYLOAD_DELIVERY: + attr.category = "Payload delivery" + self.event.add_object(obj) + + elif category == "url": + log.debug("Sample is a URL") + o = MISPObject(name='url') + o.add_attribute('url', target['url']) + o.add_attribute('text', "Submitted URL") + self.event.add_object(o) + + def add_http(self): + """Add the HTTP requests""" + network = self.report.get("network", []) + http = network.get("http", []) + if not http: + log.info("No HTTP connection found in the report, skipping") + return False + + for request in http: + o = MISPObject(name='http-request') + o.add_attribute('host', request['host']) + o.add_attribute('method', request['method']) + o.add_attribute('uri', request['uri']) + o.add_attribute('user-agent', request['user-agent']) + o.add_attribute('text', f"count:{request['count']} " + f"port:{request['port']}") + self.event.add_object(o) + + def add_network(self, proto=None): + """ + Add UDP/TCP traffic + proto must be one of "tcp", "udp" + """ + network = self.report.get("network", []) + li_conn = network.get(proto, []) + if not li_conn: + log.info(f"No {proto} connection found in the report, skipping") + return False + + from_to = [] + # sort by time to get the "first packet seen" right + li_conn.sort(key=lambda x: x["time"]) + for conn in li_conn: + src = conn['src'] + dst = conn['dst'] + sport = conn['sport'] + dport = conn['dport'] + if (src, sport, dst, dport) in from_to: + continue + + from_to.append((src, sport, dst, dport)) + + o = MISPObject(name='network-connection') + o.add_attribute('ip-src', src) + o.add_attribute('ip-dst', dst) + o.add_attribute('src-port', sport) + o.add_attribute('dst-port', dport) + o.add_attribute('layer3-protocol', "IP") + o.add_attribute('layer4-protocol', proto.upper()) + o.add_attribute('first-packet-seen', conn['time']) + self.event.add_object(o) + + def add_dns(self): + """Add DNS records""" + network = self.report.get("network", []) + dns = network.get("dns", []) + if not dns: + log.info("No DNS connection found in the report, skipping") + return False + + for record in dns: + o = MISPObject(name='dns-record') + o.add_attribute('text', f"request type:{record['type']}") + o.add_attribute('queried-domain', record['request']) + for answer in record.get("answers", []): + if answer["type"] in ("A", "AAAA"): + o.add_attribute('a-record', answer['data']) + # TODO implement MX/NS + + self.event.add_object(o) + + def _get_marks_str(self, marks): + marks_strings = [] + for m in marks: + m_type = m.pop("type") # temporarily remove the type + + if m_type == "generic": + marks_strings.append(str(m)) + + elif m_type == "ioc": + marks_strings.append(m['ioc']) + + elif m_type == "call": + call = m["call"] + arguments = call.get("arguments", {}) + flags = call.get("flags", {}) + info = "" + for details in (arguments, flags): + info += f" {details}" + marks_strings.append(f"Call API '{call['api']}'%s" % info) + + else: + logging.debug(f"Unknown mark type '{m_type}', skipping") + + m["type"] = m_type # restore key 'type' + # TODO implemented marks 'config' and 'volatility' + return marks_strings + + def _add_ttp(self, attribute, ttp_short, ttp_num): + """ + Internal wrapper to add the TTP tag from the MITRE galaxy. + @ params + - attribute: MISPAttribute + - ttp_short: short description of the TTP + (eg. "Credential Dumping") + - ttp_num: formatted as "T"+int + (eg. T1003) + """ + attribute.add_tag(f'misp-galaxy:mitre-attack-pattern=' + f'"{ttp_short} - {ttp_num}"') + + def add_signatures(self): + """Add the Cuckoo signatures, with as many details as possible + regarding the marks""" + signatures = self.report.get("signatures", []) + if not signatures: + log.info("No signature found in the report") + return False + + o = MISPObject(name='sb-signature') + o.add_attribute('software', "Cuckoo") + + for sign in signatures: + marks = sign["marks"] + marks_strings = self._get_marks_str(marks) + summary = sign['description'] + if marks_strings: + summary += "\n---\n" + + marks_strings = set(marks_strings) + description = summary + "\n".join(marks_strings) + + a = MISPAttribute() + a.from_dict(type='text', value=description) + for ttp_num, desc in sign.get("ttp", {}).items(): + ttp_short = desc["short"] + self._add_ttp(a, ttp_short, ttp_num) + + # this signature was triggered by the processes with the following + # PIDs, we can create references + triggered_by_pids = filter( + None, + (m.get("pid", None) for m in marks) + ) + # remove redundancy + triggered_by_pids = set(triggered_by_pids) + for pid in triggered_by_pids: + process_o = find_process_by_pid(self.event, pid) + if process_o: + process_o.add_reference(a, "triggers") + + o.add_attribute('signature', **a) + + self.event.add_object(o) + + def _handle_process(self, proc, accu): + """ + This is an internal recursive function to handle one process + from a process tree and then iterate on its children. + List the objects to be added, based on the tree, into the `accu` list. + The `accu` list uses a DFS-like order. + """ + o = MISPObject(name='process') + accu.append(o) + o.add_attribute('pid', proc['pid']) + o.add_attribute('command-line', proc['command_line']) + o.add_attribute('name', proc['process_name']) + o.add_attribute('parent-pid', proc['ppid']) + for child in proc.get('children', []): + pos_child = len(accu) + o.add_attribute('child-pid', child['pid']) + self._handle_process(child, accu) + child_obj = accu[pos_child] + child_obj.add_reference(o, 'child-of') + + return o + + def add_process_tree(self): + """Add process tree from the report, as separated process objects""" + behavior = self.report.get("behavior", {}) + tree = behavior.get("processtree", []) + if not tree: + log.warning("No process tree found in the report, skipping") + return False + + for proc in tree: + objs = [] + self._handle_process(proc, objs) + for o in objs: + self.event.add_object(o) + + def get_relpath(self, path): + """ + Transform an absolute or relative path into a path relative to the + correct cuckoo analysis directory, without knowing the cuckoo + working directory. + Return an empty string if the path given does not refer to a + file from the analysis directory. + """ + head, tail = posixpath.split(path) + if not tail: + return "" + prev = self.get_relpath(head) + longer = posixpath.join(prev, tail) + if longer in self.files: + return longer + elif tail in self.files: + return tail + else: + return "" + + def add_screenshots(self): + """Add the screenshots taken by Cuckoo in a sandbox-report object""" + screenshots = self.report.get('screenshots', []) + if not screenshots: + log.info("No screenshot found in the report, skipping") + return False + + o = MISPObject(name='sandbox-report') + o.add_attribute('sandbox-type', 'on-premise') + o.add_attribute('on-premise-sandbox', "cuckoo") + for shot in screenshots: + # The path given by Cuckoo is an absolute path, but we need a path + # relative to the analysis folder. + path = self.get_relpath(shot['path']) + img = self.get_file(path) + # .decode('utf-8') in order to avoid the b'' format + img_data = base64.b64encode(img.read()).decode('utf-8') + filename = posixpath.basename(path) + + o.add_attribute( + "sandbox-file", value=filename, + data=img_data, type='attachment', + category="External analysis", + ) + + self.event.add_object(o) + + def _get_dropped_objs(self, path, filename=None, comment=None): + """ + Internal wrapper to get dropped files/buffers as file objects + @ params + - path: relative to the cuckoo analysis directory + - filename: if not specified, deduced from the path + """ + if not filename: + filename = posixpath.basename(path) + + dropped_file = self.get_file(path) + dropped_binary = BytesIO(dropped_file.read()) + # create ad hoc objects + file_o, bin_type_o, bin_section_li = make_binary_objects( + pseudofile=dropped_binary, filename=filename, + ) + + if comment: + file_o.comment = comment + # fix categories + for obj in filter(None, (file_o, bin_type_o, *bin_section_li,)): + for attr in obj.attributes: + if attr.type in ARTIFACTS_DROPPED: + attr.category = "Artifacts dropped" + + return file_o, bin_type_o, bin_section_li + + def _add_yara(self, obj, yara_dict): + """Internal wrapper to add Yara matches to an MISPObject""" + for yara in yara_dict: + description = yara.get("meta", {}).get("description", "") + name = yara.get("name", "") + obj.add_attribute( + "text", + f"Yara match\n(name) {name}\n(description) {description}", + comment="Yara match" + ) + + def add_dropped_files(self): + """Upload the dropped files as file objects""" + dropped = self.report.get("dropped", []) + if not dropped: + log.info("No dropped file found, skipping") + return False + + for d in dropped: + # Cuckoo logs three things that are of interest for us: + # - 'filename' which is not the original name of the file + # but is formatted as follow: + # 8 first bytes of SHA265 + _ + original name in lower case + # - 'filepath' which is the original filepath on the VM, + # where the file was dropped + # - 'path' which is the local path of the stored file, + # in the cuckoo archive + filename = d.get("name", "") + original_path = d.get("filepath", "") + sha256 = d.get("sha256", "") + if original_path and sha256: + log.debug(f"Will now try to restore original filename from " + f"path {original_path}") + try: + s = filename.split("_") + if not s: + raise Exception("unexpected filename read " + "in the report") + sha256_first_8_bytes = s[0] + original_name = s[1] + # check our assumptions are valid, if so we can safely + # restore the filename, if not the format may have changed + # so we'll keep the filename of the report + if sha256.startswith(sha256_first_8_bytes) and \ + original_path.lower().endswith(original_name) and \ + filename not in original_path.lower(): + # we can restore the original case of the filename + position = original_path.lower().rindex(original_name) + filename = original_path[position:] + log.debug(f"Successfully restored original filename: " + f"{filename}") + else: + raise Exception("our assumptions were wrong, " + "filename format may have changed") + except Exception as e: + log.debug(f"Cannot restore filename: {e}") + + if not filename: + filename = "NO NAME FOUND IN THE REPORT" + log.warning(f'No filename found for dropped file! ' + f'Will use "{filename}"') + + file_o, bin_type_o, bin_section_o = self._get_dropped_objs( + self.get_relpath(d['path']), + filename=filename, + comment="Dropped file" + ) + + self._add_yara(file_o, d.get("yara", [])) + + file_o.add_attribute("fullpath", original_path, + category="Artifacts dropped") + + # why is this a list? for when various programs drop the same file? + for pid in d.get("pids", []): + # if we have an object for the process that dropped the file, + # we can link the two (we just take the first result from + # the search) + process_o = find_process_by_pid(self.event, pid) + if process_o: + file_o.add_reference(process_o, "dropped-by") + + self.event.add_object(file_o) + + def add_dropped_buffers(self): + """"Upload the dropped buffers as file objects""" + buffer = self.report.get("buffer", []) + if not buffer: + log.info("No dropped buffer found, skipping") + return False + + for i, buf in enumerate(buffer): + file_o, bin_type_o, bin_section_o = self._get_dropped_objs( + self.get_relpath(buf['path']), + filename=f"buffer {i}", + comment="Dropped buffer" + ) + self._add_yara(file_o, buf.get("yara", [])) + self.event.add_object(file_o) + + def parse(self): + """Run the parsing""" + for name, active in self.config.items(): + if active: + self.options[name]["method"](self) + + def get_misp_event(self): + log.debug("Running MISP expansions") + self.event.run_expansions() + return self.event + def handler(q=False): - # Just in case we have no data + # In case there's no data if q is False: return False - # The return value - r = {'results': []} - - # Load up that JSON q = json.loads(q) - data = base64.b64decode(q.get("data")).decode('utf-8') + data = q['data'] - # If something really weird happened - if not data: - return json.dumps({"success": 0}) + parser = CuckooParser(q['config']) + parser.read_archive(data) + parser.parse() + event = parser.get_misp_event() - data = json.loads(data) - - # Get characteristics of file - targetFile = data['target']['file'] - - # Process the inital binary - processBinary(r, targetFile, initial=True) - - # Get binary information for dropped files - if(data.get('dropped')): - for droppedFile in data['dropped']: - processBinary(r, droppedFile, dropped=True) - - # Add malscore to results - r["results"].append({ - "values": "Malscore: {} ".format(data['malscore']), - "types": "comment", - "categories": "Payload delivery", - "comment": "Cuckoo analysis: MalScore" - }) - - # Add virustotal data, if exists - if(data.get('virustotal')): - processVT(r, data['virustotal']) - - # Add network information, should be improved - processNetwork(r, data['network']) - - # Add behavioral information - processSummary(r, data['behavior']['summary']) - - # Return - return r - - -def processSummary(r, summary): - r["results"].append({ - "values": summary['mutexes'], - "types": "mutex", - "categories": "Artifacts dropped", - "comment": "Cuckoo analysis: Observed mutexes" - }) - - -def processVT(r, virustotal): - category = "Antivirus detection" - comment = "VirusTotal analysis" - - if(virustotal.get('permalink')): - r["results"].append({ - "values": virustotal['permalink'], - "types": "link", - "categories": category, - "comments": comment + " - Permalink" - }) - - if(virustotal.get('total')): - r["results"].append({ - "values": "VirusTotal detection rate {}/{}".format( - virustotal['positives'], - virustotal['total'] - ), - "types": "comment", - "categories": category, - "comment": comment - }) - else: - r["results"].append({ - "values": "Sample not detected on VirusTotal", - "types": "comment", - "categories": category, - "comment": comment - }) - - -def processNetwork(r, network): - category = "Network activity" - - for host in network['hosts']: - r["results"].append({ - "values": host['ip'], - "types": "ip-dst", - "categories": category, - "comment": "Cuckoo analysis: Observed network traffic" - }) - - -def processBinary(r, target, initial=False, dropped=False): - if(initial): - comment = "Cuckoo analysis: Initial file" - category = "Payload delivery" - elif(dropped): - category = "Artifacts dropped" - comment = "Cuckoo analysis: Dropped file" - - r["results"].append({ - "values": target['name'], - "types": "filename", - "categories": category, - "comment": comment - }) - - r["results"].append({ - "values": target['md5'], - "types": "md5", - "categories": category, - "comment": comment - }) - - r["results"].append({ - "values": target['sha1'], - "types": "sha1", - "categories": category, - "comment": comment - }) - - r["results"].append({ - "values": target['sha256'], - "types": "sha256", - "categories": category, - "comment": comment - }) - - r["results"].append({ - "values": target['sha512'], - "types": "sha512", - "categories": category, - "comment": comment - }) - - # todo : add file size? - - if(target.get('guest_paths')): - r["results"].append({ - "values": target['guest_paths'], - "types": "filename", - "categories": "Payload installation", - "comment": comment + " - Path" - }) + event = json.loads(event.to_json()) + results = { + key: event[key] + for key in ('Attribute', 'Object') + if (key in event and event[key]) + } + return {'results': results} def introspection(): - modulesetup = {} - try: - userConfig - modulesetup['userConfig'] = userConfig - except NameError: - pass - try: - inputSource - modulesetup['inputSource'] = inputSource - except NameError: - pass - return modulesetup + userConfig = { + key: o["userConfig"] + for key, o in CuckooParser.options.items() + } + mispattributes['userConfig'] = userConfig + return mispattributes def version(): moduleinfo['config'] = moduleconfig return moduleinfo - - -if __name__ == '__main__': - x = open('test.json', 'r') - q = [] - q['data'] = x.read() - q = base64.base64encode(q) - - handler(q)