misp-modules/misp_modules/modules/import_mod/cuckooimport.py

import json
import base64
import io
import logging
import posixpath
import stat
import tarfile
import zipfile
from pymisp import MISPEvent, MISPObject, MISPAttribute
from pymisp.tools import make_binary_objects
from collections import OrderedDict

log = logging.getLogger(__name__)

misperrors = {'error': 'Error'}

moduleinfo = {
    'version': '1.1',
    'author': 'Pierre-Jean Grenier',
    'description': "Import a Cuckoo archive (zipfile or bzip2 tarball), "
                   "either downloaded manually or exported from the "
                   "API (/tasks/report/{task_id}/all).",
    'module-type': ['import'],
}

moduleconfig = []

mispattributes = {
    'inputSource': ['file'],
    'output': ['MISP objects', 'malware-sample'],
    'format': 'misp_standard',
}

# Attributes for which we can set the "Artifacts dropped"
# category if we want to
ARTIFACTS_DROPPED = (
    "filename",
    "md5",
    "sha1",
    "sha256",
    "sha512",
    "malware-sample",
    "mimetype",
    "ssdeep",
)

# Same for the category "Payload delivery"
PAYLOAD_DELIVERY = ARTIFACTS_DROPPED


class PrettyDict(OrderedDict):
    """
    This class is just intended for a pretty print
    of its keys and values.
    """
    MAX_SIZE = 30

    def __str__(self):
        tmp = []
        for k, v in self.items():
            v = str(v)
            if len(v) > self.MAX_SIZE:
                k += ',cut'
                v = v[:self.MAX_SIZE]
            v.replace('\n', ' ')
            tmp.append((k, v))
        return "; ".join(f"({k}) {v}" for k, v in tmp)


def search_objects(event, name, attributes=[]):
    """
    Search for objects in event, which name is `name` and
    contain at least the attributes given.
    Return a generator.
    @ param attributes: a list of (object_relation, value)
    """
    match = filter(
        lambda obj: all(
            obj.name == name
            and (obj_relation, str(attr_value)) in map(
                lambda attr: (attr.object_relation, str(attr.value)),
                obj.attributes
            )
            for obj_relation, attr_value in attributes
        ), event.objects
    )
    return match


def find_process_by_pid(event, pid):
    """
    Find a 'process' MISPObject by its PID. If multiple objects are found,
    only return the first one.
    @ param pid: integer or str
    """
    generator = search_objects(
        event,
        "process",
        (('pid', pid),)
    )
    return next(generator, None)


class CuckooParser():
    # This dict is used to generate the userConfig and link the different
    # options to the corresponding method of the parser. This way, we avoid
    # redundancy and make future changes easier (instead of for instance
    # defining all the options in userConfig directly, and then making a
    # switch when running the parser).
    # Careful about the order here, as we create references between
    # MISPObjects/MISPAttributes at the same time we generate them.
    # Hence when we create object B, which we want to reference to
    # object A, we should already have created object A.
    # TODO create references only after all parsing is done
    options = {
        "Sandbox info": {
            "method": lambda self: self.add_sandbox_info(),
            "userConfig": {
                'type': 'Boolean',
                'message': "Add info related to the sandbox",
                'checked': 'true',
            },
        },
        "Upload sample": {
            "method": lambda self: self.add_sample(),
            "userConfig": {
                'type': 'Boolean',
                'message': "Upload the sample",
                'checked': 'true',
            },
        },
        "Processes": {
            "method": lambda self: self.add_process_tree(),
            "userConfig": {
                'type': 'Boolean',
                'message': "Add info related to the processes",
                'checked': 'true',
            },
        },
        "DNS": {
            "method": lambda self: self.add_dns(),
            "userConfig": {
                'type': 'Boolean',
                'message': "Add DNS queries/answers",
                'checked': 'true',
            },
        },
        "TCP": {
            "method": lambda self: self.add_network("tcp"),
            "userConfig": {
                'type': 'Boolean',
                'message': "Add TCP connections",
                'checked': 'true',
            },
        },
        "UDP": {
            "method": lambda self: self.add_network("udp"),
            "userConfig": {
                'type': 'Boolean',
                'message': "Add UDP connections",
                'checked': 'true',
            },
        },
        "HTTP": {
            "method": lambda self: self.add_http(),
            "userConfig": {
                'type': 'Boolean',
                'message': "Add HTTP requests",
                'checked': 'true',
            },
        },
        "Signatures": {
            "method": lambda self: self.add_signatures(),
            "userConfig": {
                'type': 'Boolean',
                'message': "Add Cuckoo's triggered signatures",
                'checked': 'true',
            },
        },
        "Screenshots": {
            "method": lambda self: self.add_screenshots(),
            "userConfig": {
                'type': 'Boolean',
                'message': "Upload the screenshots",
                'checked': 'true',
            },
        },
        "Dropped files": {
            "method": lambda self: self.add_dropped_files(),
            "userConfig": {
                'type': 'Boolean',
                'message': "Upload the dropped files",
                'checked': 'true',
            },
        },
        "Dropped buffers": {
            "method": lambda self: self.add_dropped_buffers(),
            "userConfig": {
                'type': 'Boolean',
                'message': "Upload the dropped buffers",
                'checked': 'true',
            },
        },
    }

    def __init__(self, config):
        self.event = MISPEvent()
        self.files = None
        self.malware_binary = None
        self.report = None
        self.config = {
            # if an option is missing (we receive None as a value),
            # fall back to the default specified in the options
            key: int(
                on if on is not None
                else self.options[key]["userConfig"]["checked"] == 'true'
            )
            for key, on in config.items()
        }

    def get_file(self, relative_filepath):
        """Return an io.BufferedIOBase for the corresponding relative_filepath
        in the Cuckoo archive. If not found, return an empty io.BufferedReader
        to avoid fatal errors."""
        blackhole = io.BufferedReader(open('/dev/null', 'rb'))
        res = self.files.get(relative_filepath, blackhole)
        if res == blackhole:
            log.debug(f"Did not find file {relative_filepath}, "
                      f"returned an empty file instead")
        return res

    def read_archive(self, archive_encoded):
        """Read the archive exported from Cuckoo and initialize the class"""
        # archive_encoded is base 64 encoded content
        # we extract the info about each file but do not retrieve
        # it automatically, as it may take too much space in memory
        buf_io = io.BytesIO(base64.b64decode(archive_encoded))
        if zipfile.is_zipfile(buf_io):
            # the archive was probably downloaded from the WebUI
            buf_io.seek(0)  # don't forget this not to read an empty buffer
            z = zipfile.ZipFile(buf_io, 'r')
            self.files = {
                info.filename: z.open(info)
                for info in z.filelist
                # only extract the regular files and dirs, we don't
                # want any symbolic link
                if stat.S_ISREG(info.external_attr >> 16)
                or stat.S_ISDIR(info.external_attr >> 16)
            }
        else:
            # the archive was probably downloaded from the API
            buf_io.seek(0)  # don't forget this not to read an empty buffer
            f = tarfile.open(fileobj=buf_io, mode='r:bz2')
            self.files = {
                info.name: f.extractfile(info)
                for info in f.getmembers()
                # only extract the regular files and dirs, we don't
                # want any symbolic link
                if info.isreg() or info.isdir()
            }

        # We want to keep the order of the keys of sub-dicts in the report,
        # eg. the signatures have marks with unknown keys such as
        #     {'marks': [
        #        {"suspicious_features": "Connection to IP address",
        #         "suspicious_request": "OPTIONS http://85.20.18.18/doc"}
        #     ]}
        # To render those marks properly, we can only hope the developpers
        # thought about the order in which they put the keys, and keep this
        # order so that the signature makes sense to the reader.
        # We use PrettyDict, a customization of OrderedDict to do so.
        # It will be instanced iteratively when parsing the json (ie. subdicts
        # will also be instanced as PrettyDict)
        self.report = json.load(
            self.get_file("reports/report.json"),
            object_pairs_hook=PrettyDict,
        )

    def read_malware(self):
        self.malware_binary = self.get_file("binary").read()
        if not self.malware_binary:
            log.warn("No malware binary found")

    def add_sandbox_info(self):
        info = self.report.get("info", {})
        if not info:
            log.warning("The 'info' field was not found "
                        "in the report, skipping")
            return False

        o = MISPObject(name='sandbox-report')
        o.add_attribute('score', info['score'])
        o.add_attribute('sandbox-type', 'on-premise')
        o.add_attribute('on-premise-sandbox', 'cuckoo')
        o.add_attribute('raw-report',
                        f'started on:{info["machine"]["started_on"]} '
                        f'duration:{info["duration"]}s '
                        f'vm:{info["machine"]["name"]}/'
                        f'{info["machine"]["label"]}')
        self.event.add_object(o)

    def add_sample(self):
        """Add the sample/target of the analysis"""
        target = self.report.get("target", {})
        category = target.get("category", "")
        if not category:
            log.warning("Could not find info about the sample "
                        "in the report, skipping")
            return False

        if category == "file":
            log.debug("Sample is a file, uploading it")
            self.read_malware()
            file_o, bin_type_o, bin_section_li = make_binary_objects(
                pseudofile=io.BytesIO(self.malware_binary),
                filename=target["file"]["name"],
            )

            file_o.comment = "Submitted sample"
            # fix categories
            for obj in filter(None, (file_o, bin_type_o, *bin_section_li,)):
                for attr in obj.attributes:
                    if attr.type in PAYLOAD_DELIVERY:
                        attr.category = "Payload delivery"
                self.event.add_object(obj)

        elif category == "url":
            log.debug("Sample is a URL")
            o = MISPObject(name='url')
            o.add_attribute('url', target['url'])
            o.add_attribute('text', "Submitted URL")
            self.event.add_object(o)

    def add_http(self):
        """Add the HTTP requests"""
        network = self.report.get("network", [])
        http = network.get("http", [])
        if not http:
            log.info("No HTTP connection found in the report, skipping")
            return False

        for request in http:
            o = MISPObject(name='http-request')
            o.add_attribute('host', request['host'])
            o.add_attribute('method', request['method'])
            o.add_attribute('uri', request['uri'])
            o.add_attribute('user-agent', request['user-agent'])
            o.add_attribute('text', f"count:{request['count']} "
                                    f"port:{request['port']}")
            self.event.add_object(o)

    def add_network(self, proto=None):
        """
        Add UDP/TCP traffic
        proto must be one of "tcp", "udp"
        """
        network = self.report.get("network", [])
        li_conn = network.get(proto, [])
        if not li_conn:
            log.info(f"No {proto} connection found in the report, skipping")
            return False

        from_to = []
        # sort by time to get the "first packet seen" right
        li_conn.sort(key=lambda x: x["time"])
        for conn in li_conn:
            src = conn['src']
            dst = conn['dst']
            sport = conn['sport']
            dport = conn['dport']
            if (src, sport, dst, dport) in from_to:
                continue

            from_to.append((src, sport, dst, dport))

            o = MISPObject(name='network-connection')
            o.add_attribute('ip-src', src)
            o.add_attribute('ip-dst', dst)
            o.add_attribute('src-port', sport)
            o.add_attribute('dst-port', dport)
            o.add_attribute('layer3-protocol', "IP")
            o.add_attribute('layer4-protocol', proto.upper())
            o.add_attribute('first-packet-seen', conn['time'])
            self.event.add_object(o)

    def add_dns(self):
        """Add DNS records"""
        network = self.report.get("network", [])
        dns = network.get("dns", [])
        if not dns:
            log.info("No DNS connection found in the report, skipping")
            return False

        for record in dns:
            o = MISPObject(name='dns-record')
            o.add_attribute('text', f"request type:{record['type']}")
            o.add_attribute('queried-domain', record['request'])
            for answer in record.get("answers", []):
                if answer["type"] in ("A", "AAAA"):
                    o.add_attribute('a-record', answer['data'])
                # TODO implement MX/NS

            self.event.add_object(o)

    def _get_marks_str(self, marks):
        marks_strings = []
        for m in marks:
            m_type = m.pop("type")  # temporarily remove the type

            if m_type == "generic":
                marks_strings.append(str(m))

            elif m_type == "ioc":
                marks_strings.append(m['ioc'])

            elif m_type == "call":
                call = m["call"]
                arguments = call.get("arguments", {})
                flags = call.get("flags", {})
                info = ""
                for details in (arguments, flags):
                    info += f" {details}"
                marks_strings.append(f"Call API '{call['api']}'%s" % info)

            else:
                logging.debug(f"Unknown mark type '{m_type}', skipping")

            m["type"] = m_type  # restore key 'type'
            # TODO implemented marks 'config' and 'volatility'
        return marks_strings

    def _add_ttp(self, attribute, ttp_short, ttp_num):
        """
        Internal wrapper to add the TTP tag from the MITRE galaxy.
        @ params
            - attribute: MISPAttribute
            - ttp_short: short description of the TTP
              (eg. "Credential Dumping")
            - ttp_num: formatted as "T"+int
              (eg. T1003)
        """
        attribute.add_tag(f'misp-galaxy:mitre-attack-pattern='
                          f'"{ttp_short} - {ttp_num}"')

    def add_signatures(self):
        """Add the Cuckoo signatures, with as many details as possible
        regarding the marks"""
        signatures = self.report.get("signatures", [])
        if not signatures:
            log.info("No signature found in the report")
            return False

        o = MISPObject(name='sb-signature')
        o.add_attribute('software', "Cuckoo")

        for sign in signatures:
            marks = sign["marks"]
            marks_strings = self._get_marks_str(marks)
            summary = sign['description']
            if marks_strings:
                summary += "\n---\n"

            marks_strings = set(marks_strings)
            description = summary + "\n".join(marks_strings)

            a = MISPAttribute()
            a.from_dict(type='text', value=description)
            for ttp_num, desc in sign.get("ttp", {}).items():
                ttp_short = desc["short"]
                self._add_ttp(a, ttp_short, ttp_num)

            # this signature was triggered by the processes with the following
            # PIDs, we can create references
            triggered_by_pids = filter(
                None,
                (m.get("pid", None) for m in marks)
            )
            # remove redundancy
            triggered_by_pids = set(triggered_by_pids)
            for pid in triggered_by_pids:
                process_o = find_process_by_pid(self.event, pid)
                if process_o:
                    process_o.add_reference(a, "triggers")

            o.add_attribute('signature', **a)

        self.event.add_object(o)

    def _handle_process(self, proc, accu):
        """
        This is an internal recursive function to handle one process
        from a process tree and then iterate on its children.
        List the objects to be added, based on the tree, into the `accu` list.
        The `accu` list uses a DFS-like order.
        """
        o = MISPObject(name='process')
        accu.append(o)
        o.add_attribute('pid', proc['pid'])
        o.add_attribute('command-line', proc['command_line'])
        o.add_attribute('name', proc['process_name'])
        o.add_attribute('parent-pid', proc['ppid'])
        for child in proc.get('children', []):
            pos_child = len(accu)
            o.add_attribute('child-pid', child['pid'])
            self._handle_process(child, accu)
            child_obj = accu[pos_child]
            child_obj.add_reference(o, 'child-of')

        return o

    def add_process_tree(self):
        """Add process tree from the report, as separated process objects"""
        behavior = self.report.get("behavior", {})
        tree = behavior.get("processtree", [])
        if not tree:
            log.warning("No process tree found in the report, skipping")
            return False

        for proc in tree:
            objs = []
            self._handle_process(proc, objs)
            for o in objs:
                self.event.add_object(o)

    def get_relpath(self, path):
        """
        Transform an absolute or relative path into a path relative to the
        correct cuckoo analysis directory, without knowing the cuckoo
        working directory.
        Return an empty string if the path given does not refer to a
        file from the analysis directory.
        """
        head, tail = posixpath.split(path)
        if not tail:
            return ""
        prev = self.get_relpath(head)
        longer = posixpath.join(prev, tail)
        if longer in self.files:
            return longer
        elif tail in self.files:
            return tail
        else:
            return ""

    def add_screenshots(self):
        """Add the screenshots taken by Cuckoo in a sandbox-report object"""
        screenshots = self.report.get('screenshots', [])
        if not screenshots:
            log.info("No screenshot found in the report, skipping")
            return False

        o = MISPObject(name='sandbox-report')
        o.add_attribute('sandbox-type', 'on-premise')
        o.add_attribute('on-premise-sandbox', "cuckoo")
        for shot in screenshots:
            # The path given by Cuckoo is an absolute path, but we need a path
            # relative to the analysis folder.
            path = self.get_relpath(shot['path'])
            img = self.get_file(path)
            # .decode('utf-8') in order to avoid the b'' format
            img_data = base64.b64encode(img.read()).decode('utf-8')
            filename = posixpath.basename(path)

            o.add_attribute(
                "sandbox-file", value=filename,
                data=img_data, type='attachment',
                category="External analysis",
            )

        self.event.add_object(o)

    def _get_dropped_objs(self, path, filename=None, comment=None):
        """
        Internal wrapper to get dropped files/buffers as file objects
        @ params
            - path: relative to the cuckoo analysis directory
            - filename: if not specified, deduced from the path
        """
        if not filename:
            filename = posixpath.basename(path)

        dropped_file = self.get_file(path)
        dropped_binary = io.BytesIO(dropped_file.read())
        # create ad hoc objects
        file_o, bin_type_o, bin_section_li = make_binary_objects(
            pseudofile=dropped_binary, filename=filename,
        )

        if comment:
            file_o.comment = comment
        # fix categories
        for obj in filter(None, (file_o, bin_type_o, *bin_section_li,)):
            for attr in obj.attributes:
                if attr.type in ARTIFACTS_DROPPED:
                    attr.category = "Artifacts dropped"

        return file_o, bin_type_o, bin_section_li

    def _add_yara(self, obj, yara_dict):
        """Internal wrapper to add Yara matches to an MISPObject"""
        for yara in yara_dict:
            description = yara.get("meta", {}).get("description", "")
            name = yara.get("name", "")
            obj.add_attribute(
                "text",
                f"Yara match\n(name) {name}\n(description) {description}",
                comment="Yara match"
            )

    def add_dropped_files(self):
        """Upload the dropped files as file objects"""
        dropped = self.report.get("dropped", [])
        if not dropped:
            log.info("No dropped file found, skipping")
            return False

        for d in dropped:
            # Cuckoo logs three things that are of interest for us:
            #   - 'filename' which is not the original name of the file
            #     but is formatted as follow:
            #        8 first bytes of SHA265 + _ + original name in lower case
            #   - 'filepath' which is the original filepath on the VM,
            #     where the file was dropped
            #   - 'path' which is the local path of the stored file,
            #     in the cuckoo archive
            filename = d.get("name", "")
            original_path = d.get("filepath", "")
            sha256 = d.get("sha256", "")
            if original_path and sha256:
                log.debug(f"Will now try to restore original filename from "
                          f"path {original_path}")
                try:
                    s = filename.split("_")
                    if not s:
                        raise Exception("unexpected filename read "
                                        "in the report")
                    sha256_first_8_bytes = s[0]
                    original_name = s[1]
                    # check our assumptions are valid, if so we can safely
                    # restore the filename, if not the format may have changed
                    # so we'll keep the filename of the report
                    if sha256.startswith(sha256_first_8_bytes) and \
                            original_path.lower().endswith(original_name) and \
                            filename not in original_path.lower():
                        # we can restore the original case of the filename
                        position = original_path.lower().rindex(original_name)
                        filename = original_path[position:]
                        log.debug(f"Successfully restored original filename: "
                                  f"{filename}")
                    else:
                        raise Exception("our assumptions were wrong, "
                                        "filename format may have changed")
                except Exception as e:
                    log.debug(f"Cannot restore filename: {e}")

            if not filename:
                filename = "NO NAME FOUND IN THE REPORT"
                log.warning(f'No filename found for dropped file! '
                            f'Will use "{filename}"')

            file_o, bin_type_o, bin_section_o = self._get_dropped_objs(
                self.get_relpath(d['path']),
                filename=filename,
                comment="Dropped file"
            )

            self._add_yara(file_o, d.get("yara", []))

            file_o.add_attribute("fullpath", original_path,
                                 category="Artifacts dropped")

            # why is this a list? for when various programs drop the same file?
            for pid in d.get("pids", []):
                # if we have an object for the process that dropped the file,
                # we can link the two (we just take the first result from
                # the search)
                process_o = find_process_by_pid(self.event, pid)
                if process_o:
                    file_o.add_reference(process_o, "dropped-by")

            self.event.add_object(file_o)

    def add_dropped_buffers(self):
        """"Upload the dropped buffers as file objects"""
        buffer = self.report.get("buffer", [])
        if not buffer:
            log.info("No dropped buffer found, skipping")
            return False

        for i, buf in enumerate(buffer):
            file_o, bin_type_o, bin_section_o = self._get_dropped_objs(
                self.get_relpath(buf['path']),
                filename=f"buffer {i}",
                comment="Dropped buffer"
            )
            self._add_yara(file_o, buf.get("yara", []))
            self.event.add_object(file_o)

    def parse(self):
        """Run the parsing"""
        for name, active in self.config.items():
            if active:
                self.options[name]["method"](self)

    def get_misp_event(self):
        log.debug("Running MISP expansions")
        self.event.run_expansions()
        return self.event


def handler(q=False):
    # In case there's no data
    if q is False:
        return False

    q = json.loads(q)
    data = q['data']

    parser = CuckooParser(q['config'])
    parser.read_archive(data)
    parser.parse()
    event = parser.get_misp_event()

    event = json.loads(event.to_json())
    results = {
        key: event[key]
        for key in ('Attribute', 'Object')
        if (key in event and event[key])
    }
    return {'results': results}


def introspection():
    userConfig = {
        key: o["userConfig"]
        for key, o in CuckooParser.options.items()
    }
    mispattributes['userConfig'] = userConfig
    return mispattributes


def version():
    moduleinfo['config'] = moduleconfig
    return moduleinfo