Browse Source

new: Rewrite cuckooimport

Pierre-Jean Grenier 2 years ago
  1. 864


@ -1,198 +1,712 @@
import json
import base64
import tarfile
import logging
import posixpath
from io import BytesIO, BufferedReader
from pymisp import MISPEvent, MISPObject, MISPAttribute
from import make_binary_objects
from collections import OrderedDict
log = logging.getLogger(__name__)
misperrors = {'error': 'Error'}
userConfig = {}
inputSource = ['file']
moduleinfo = {'version': '0.1', 'author': 'Victor van der Stoep',
'description': 'Cuckoo JSON import',
moduleinfo = {'version': '1.0',
'author': 'Pierre-Jean Grenier',
'description': 'Cuckoo archive import',
'module-type': ['import']}
moduleconfig = []
mispattributes = {
'inputSource': ['file'],
'output': ['MISP objects', 'malware-sample'],
'format': 'misp_standard',
# Attributes for which we can set the "Artifacts dropped"
# category if we want to
# Same for the category "Payload delivery"
class PrettyDict(OrderedDict):
    """Ordered dict whose ``str()`` is a compact, single-line rendering.

    This class is just intended for a pretty print of its keys and values:
    each entry is rendered as ``(key) value`` and entries are joined with
    ``"; "``. Values whose string form is longer than ``MAX_SIZE``
    characters are truncated and their key is suffixed with ``,cut``.
    """

    # NOTE(review): __str__ relies on MAX_SIZE but its definition is not
    # visible in this excerpt; 30 is an assumed value -- confirm against
    # the upstream module.
    MAX_SIZE = 30

    def __str__(self):
        rendered = []
        for key, value in self.items():
            text = str(value)
            if len(text) > self.MAX_SIZE:
                # f-string rather than += so non-string keys do not raise
                key = f"{key},cut"
                text = text[:self.MAX_SIZE]
            # str.replace returns a new string: the result has to be kept,
            # otherwise embedded newlines break the one-line rendering.
            text = text.replace('\n', ' ')
            rendered.append(f"({key}) {text}")
        return "; ".join(rendered)
def search_objects(event, name, attributes=[]):
Search for objects in event, which name is `name` and
contain at least the attributes given.
Return a generator.
@ param attributes: a list of (object_relation, value)
match = filter(
lambda obj: all( == name
and (obj_relation, str(attr_value)) in map(
lambda attr: (attr.object_relation, str(attr.value)),
for obj_relation, attr_value in attributes
), event.objects
return match
def find_process_by_pid(event, pid):
Find a 'process' MISPObject by its PID. If multiple objects are found,
only return the first one.
@ param pid: integer or str
generator = search_objects(
(('pid', pid),)
return next(generator, None)
class CuckooParser():
# This dict is used to generate the userConfig and link the different
# options to the corresponding method of the parser. This way, we avoid
# redundancy and make future changes easier (instead of for instance
# defining all the options in userConfig directly, and then making a
# switch when running the parser).
# Careful about the order here, as we create references between
# MISPObjects/MISPAttributes at the same time we generate them.
# Hence when we create object B, which we want to reference to
# object A, we should already have created object A.
# TODO create references only after all parsing is done
options = {
"Sandbox info": {
"method": lambda self: self.add_sandbox_info(),
"userConfig": {
'type': 'Boolean',
'message': "Add info related to the sandbox",
'checked': 'true',
"Upload sample": {
"method": lambda self: self.add_sample(),
"userConfig": {
'type': 'Boolean',
'message': "Upload the sample",
'checked': 'true',
"Processes": {
"method": lambda self: self.add_process_tree(),
"userConfig": {
'type': 'Boolean',
'message': "Add info related to the processes",
'checked': 'true',
"DNS": {
"method": lambda self: self.add_dns(),
"userConfig": {
'type': 'Boolean',
'message': "Add DNS queries/answers",
'checked': 'true',
"TCP": {
"method": lambda self: self.add_network("tcp"),
"userConfig": {
'type': 'Boolean',
'message': "Add TCP connections",
'checked': 'true',
"UDP": {
"method": lambda self: self.add_network("udp"),
"userConfig": {
'type': 'Boolean',
'message': "Add UDP connections",
'checked': 'true',
"HTTP": {
"method": lambda self: self.add_http(),
"userConfig": {
'type': 'Boolean',
'message': "Add HTTP requests",
'checked': 'true',
"Signatures": {
"method": lambda self: self.add_signatures(),
"userConfig": {
'type': 'Boolean',
'message': "Add Cuckoo's triggered signatures",
'checked': 'true',
"Screenshots": {
"method": lambda self: self.add_screenshots(),
"userConfig": {
'type': 'Boolean',
'message': "Upload the screenshots",
'checked': 'true',
"Dropped files": {
"method": lambda self: self.add_dropped_files(),
"userConfig": {
'type': 'Boolean',
'message': "Upload the dropped files",
'checked': 'true',
"Dropped buffers": {
"method": lambda self: self.add_dropped_buffers(),
"userConfig": {
'type': 'Boolean',
'message': "Upload the dropped buffers",
'checked': 'true',
def __init__(self, config):
self.event = MISPEvent()
self.files = None
self.malware_binary = None = None
self.config = {key: int(on) for key, on in config.items()}
def get_file(self, relative_filepath):
    """Return a readable binary stream for a file of the Cuckoo archive.

    @ param relative_filepath: path of the member relative to the root of
        the archive, i.e. a key of `self.files`

    If the member is unknown, or if the stored value is None (which is
    what tarfile's extractfile() yields for members that are not regular
    files), return an empty BufferedReader to avoid fatal errors in
    callers that immediately `.read()` the result.
    """
    res = self.files.get(relative_filepath)
    if res is None:
        log.debug(f"Did not find file {relative_filepath}, "
                  f"returned an empty file instead")
        # In-memory placeholder, created only when needed: unlike opening
        # '/dev/null' on every call, it leaks no file descriptor and does
        # not depend on the platform.
        return BufferedReader(BytesIO())
    return res
def read_archive(self, archive_encoded):
"""Read the archive exported from Cuckoo and initialize the class"""
# archive_encoded is base 64 encoded content
# we extract the info about each file but do not retrieve
# it automatically, as it may take too much space in memory
buf_io = BytesIO(base64.b64decode(archive_encoded))
f =, mode='r:bz2')
self.files = { f.extractfile(info)
for info in f.getmembers()
# We want to keep the order of the keys of sub-dicts in the report,
# eg. the signatures have marks with unknown keys such as
# {'marks': [
# {"suspicious_features": "Connection to IP address",
# "suspicious_request": "OPTIONS"}
# ]}
# To render those marks properly, we can only hope the developers
# thought about the order in which they put the keys, and keep this
# order so that the signature makes sense to the reader.
# We use PrettyDict, a customization of OrderedDict to do so.
# It will be instanced iteratively when parsing the json (ie. subdicts
# will also be instanced as PrettyDict) = json.load(
def read_malware(self):
    """Load the analysed sample (archive member "binary") into memory.

    The raw content is stored in `self.malware_binary`; a warning is
    logged when that content is empty.
    """
    self.malware_binary = self.get_file("binary").read()
    if not self.malware_binary:
        # Logger.warn is a deprecated alias of Logger.warning, which the
        # rest of this module already uses.
        log.warning("No malware binary found")
def add_sandbox_info(self):
info ="info", {})
if not info:
log.warning("The 'info' field was not found "
"in the report, skipping")
return False
o = MISPObject(name='sandbox-report')
o.add_attribute('score', info['score'])
o.add_attribute('sandbox-type', 'on-premise')
o.add_attribute('on-premise-sandbox', 'cuckoo')
f'started on:{info["machine"]["started_on"]} '
f'duration:{info["duration"]}s '
def add_sample(self):
"""Add the sample/target of the analysis"""
target ="target", {})
category = target.get("category", "")
if not category:
log.warning("Could not find info about the sample "
"in the report, skipping")
return False
if category == "file":
log.debug("Sample is a file, uploading it")
file_o, bin_type_o, bin_section_li = make_binary_objects(
file_o.comment = "Submitted sample"
# fix categories
for obj in filter(None, (file_o, bin_type_o, *bin_section_li,)):
for attr in obj.attributes:
if attr.type in PAYLOAD_DELIVERY:
attr.category = "Payload delivery"
elif category == "url":
log.debug("Sample is a URL")
o = MISPObject(name='url')
o.add_attribute('url', target['url'])
o.add_attribute('text', "Submitted URL")
def add_http(self):
"""Add the HTTP requests"""
network ="network", [])
http = network.get("http", [])
if not http:"No HTTP connection found in the report, skipping")
return False
for request in http:
o = MISPObject(name='http-request')
o.add_attribute('host', request['host'])
o.add_attribute('method', request['method'])
o.add_attribute('uri', request['uri'])
o.add_attribute('user-agent', request['user-agent'])
o.add_attribute('text', f"count:{request['count']} "
def add_network(self, proto=None):
Add UDP/TCP traffic
proto must be one of "tcp", "udp"
network ="network", [])
li_conn = network.get(proto, [])
if not li_conn:"No {proto} connection found in the report, skipping")
return False
from_to = []
# sort by time to get the "first packet seen" right
li_conn.sort(key=lambda x: x["time"])
for conn in li_conn:
src = conn['src']
dst = conn['dst']
sport = conn['sport']
dport = conn['dport']
if (src, sport, dst, dport) in from_to:
from_to.append((src, sport, dst, dport))
o = MISPObject(name='network-connection')
o.add_attribute('ip-src', src)
o.add_attribute('ip-dst', dst)
o.add_attribute('src-port', sport)
o.add_attribute('dst-port', dport)
o.add_attribute('layer3-protocol', "IP")
o.add_attribute('layer4-protocol', proto.upper())
o.add_attribute('first-packet-seen', conn['time'])
def add_dns(self):
"""Add DNS records"""
network ="network", [])
dns = network.get("dns", [])
if not dns:"No DNS connection found in the report, skipping")
return False
for record in dns:
o = MISPObject(name='dns-record')
o.add_attribute('text', f"request type:{record['type']}")
o.add_attribute('queried-domain', record['request'])
for answer in record.get("answers", []):
if answer["type"] in ("A", "AAAA"):
o.add_attribute('a-record', answer['data'])
# TODO implement MX/NS
def _get_marks_str(self, marks):
marks_strings = []
for m in marks:
m_type = m.pop("type") # temporarily remove the type
if m_type == "generic":
elif m_type == "ioc":
elif m_type == "call":
call = m["call"]
arguments = call.get("arguments", {})
flags = call.get("flags", {})
info = ""
for details in (arguments, flags):
info += f" {details}"
marks_strings.append(f"Call API '{call['api']}'%s" % info)
logging.debug(f"Unknown mark type '{m_type}', skipping")
m["type"] = m_type # restore key 'type'
# TODO implement marks 'config' and 'volatility'
return marks_strings
def _add_ttp(self, attribute, ttp_short, ttp_num):
Internal wrapper to add the TTP tag from the MITRE galaxy.
@ params
- attribute: MISPAttribute
- ttp_short: short description of the TTP
(eg. "Credential Dumping")
- ttp_num: formatted as "T"+int
(eg. T1003)
f'"{ttp_short} - {ttp_num}"')
def add_signatures(self):
"""Add the Cuckoo signatures, with as many details as possible
regarding the marks"""
signatures ="signatures", [])
if not signatures:"No signature found in the report")
return False
o = MISPObject(name='sb-signature')
o.add_attribute('software', "Cuckoo")
for sign in signatures:
marks = sign["marks"]
marks_strings = self._get_marks_str(marks)
summary = sign['description']
if marks_strings:
summary += "\n---\n"
marks_strings = set(marks_strings)
description = summary + "\n".join(marks_strings)
a = MISPAttribute()
a.from_dict(type='text', value=description)
for ttp_num, desc in sign.get("ttp", {}).items():
ttp_short = desc["short"]
self._add_ttp(a, ttp_short, ttp_num)
# this signature was triggered by the processes with the following
# PIDs, we can create references
triggered_by_pids = filter(
(m.get("pid", None) for m in marks)
# remove redundancy
triggered_by_pids = set(triggered_by_pids)
for pid in triggered_by_pids:
process_o = find_process_by_pid(self.event, pid)
if process_o:
process_o.add_reference(a, "triggers")
o.add_attribute('signature', **a)
def _handle_process(self, proc, accu):
    """Handle one process of a process tree, then recurse on its children.

    This is an internal recursive function. The objects to be added are
    listed, based on the tree, into the `accu` list, which ends up in a
    DFS-like (parent before children) order.

    @ params
        - proc: dict describing the process; the keys 'pid',
          'command_line', 'process_name' and 'ppid' are read, and
          'children' (optional) is a list of dicts of the same kind
        - accu: list in which the created objects are accumulated

    Return the 'process' MISPObject created for `proc`.
    """
    o = MISPObject(name='process')
    # Register the object before visiting the children: callers read the
    # results from `accu`, and without this append nothing would ever be
    # stored in it.
    accu.append(o)
    o.add_attribute('pid', proc['pid'])
    o.add_attribute('command-line', proc['command_line'])
    o.add_attribute('name', proc['process_name'])
    o.add_attribute('parent-pid', proc['ppid'])
    for child in proc.get('children', []):
        o.add_attribute('child-pid', child['pid'])
        # The recursive call returns the object it created for the child,
        # so there is no need to look it up by position in `accu`.
        child_obj = self._handle_process(child, accu)
        child_obj.add_reference(o, 'child-of')
    return o
def add_process_tree(self):
"""Add process tree from the report, as separated process objects"""
behavior ="behavior", {})
tree = behavior.get("processtree", [])
if not tree:
log.warning("No process tree found in the report, skipping")
return False
for proc in tree:
objs = []
self._handle_process(proc, objs)
for o in objs:
def get_relpath(self, path):
    """Make `path` relative to the cuckoo analysis directory.

    Transform an absolute or relative path into a path relative to the
    correct cuckoo analysis directory, without knowing the cuckoo working
    directory. This is done by walking the components of `path` from left
    to right and keeping the longest trailing part that is a key of
    `self.files`.

    @ param path: POSIX-style path, as found in the report

    Return an empty string if the path given does not refer to a file
    from the analysis directory (this includes paths ending with '/').
    """
    # Peel the components off from the right. An iterative walk is used
    # instead of one recursive call per component, so that a report
    # holding a path with a huge number of components cannot trigger a
    # RecursionError.
    components = []
    head = path
    while True:
        head, tail = posixpath.split(head)
        if not tail:
            break
        components.append(tail)

    relpath = ""
    for tail in reversed(components):
        longer = posixpath.join(relpath, tail)
        if longer in self.files:
            relpath = longer
        elif tail in self.files:
            # restart from this component
            relpath = tail
        else:
            relpath = ""
    return relpath
def add_screenshots(self):
"""Add the screenshots taken by Cuckoo in a sandbox-report object"""
screenshots ='screenshots', [])
if not screenshots:"No screenshot found in the report, skipping")
return False
o = MISPObject(name='sandbox-report')
o.add_attribute('sandbox-type', 'on-premise')
o.add_attribute('on-premise-sandbox', "cuckoo")
for shot in screenshots:
# The path given by Cuckoo is an absolute path, but we need a path
# relative to the analysis folder.
path = self.get_relpath(shot['path'])
img = self.get_file(path)
# .decode('utf-8') in order to avoid the b'' format
img_data = base64.b64encode('utf-8')
filename = posixpath.basename(path)
"sandbox-file", value=filename,
data=img_data, type='attachment',
category="External analysis",
def _get_dropped_objs(self, path, filename=None, comment=None):
Internal wrapper to get dropped files/buffers as file objects
@ params
- path: relative to the cuckoo analysis directory
- filename: if not specified, deduced from the path
if not filename:
filename = posixpath.basename(path)
dropped_file = self.get_file(path)
dropped_binary = BytesIO(
# create ad hoc objects
file_o, bin_type_o, bin_section_li = make_binary_objects(
pseudofile=dropped_binary, filename=filename,
if comment:
file_o.comment = comment
# fix categories
for obj in filter(None, (file_o, bin_type_o, *bin_section_li,)):
for attr in obj.attributes:
if attr.type in ARTIFACTS_DROPPED:
attr.category = "Artifacts dropped"
return file_o, bin_type_o, bin_section_li
def _add_yara(self, obj, yara_dict):
"""Internal wrapper to add Yara matches to an MISPObject"""
for yara in yara_dict:
description = yara.get("meta", {}).get("description", "")
name = yara.get("name", "")
f"Yara match\n(name) {name}\n(description) {description}",
comment="Yara match"
def add_dropped_files(self):
"""Upload the dropped files as file objects"""
dropped ="dropped", [])
if not dropped:"No dropped file found, skipping")
return False
for d in dropped:
# Cuckoo logs three things that are of interest for us:
# - 'filename' which is not the original name of the file
# but is formatted as follows:
# first 8 bytes of SHA256 + _ + original name in lower case
# - 'filepath' which is the original filepath on the VM,
# where the file was dropped
# - 'path' which is the local path of the stored file,
# in the cuckoo archive
filename = d.get("name", "")
original_path = d.get("filepath", "")
sha256 = d.get("sha256", "")
if original_path and sha256:
log.debug(f"Will now try to restore original filename from "
f"path {original_path}")
s = filename.split("_")
if not s:
raise Exception("unexpected filename read "
"in the report")
sha256_first_8_bytes = s[0]
original_name = s[1]
# check our assumptions are valid, if so we can safely
# restore the filename, if not the format may have changed
# so we'll keep the filename of the report
if sha256.startswith(sha256_first_8_bytes) and \
original_path.lower().endswith(original_name) and \
filename not in original_path.lower():
# we can restore the original case of the filename
position = original_path.lower().rindex(original_name)
filename = original_path[position:]
log.debug(f"Successfully restored original filename: "
raise Exception("our assumptions were wrong, "
"filename format may have changed")
except Exception as e:
log.debug(f"Cannot restore filename: {e}")
if not filename:
log.warning(f'No filename found for dropped file! '
f'Will use "{filename}"')
file_o, bin_type_o, bin_section_o = self._get_dropped_objs(
comment="Dropped file"
self._add_yara(file_o, d.get("yara", []))
file_o.add_attribute("fullpath", original_path,
category="Artifacts dropped")
# why is this a list? for when various programs drop the same file?
for pid in d.get("pids", []):
# if we have an object for the process that dropped the file,
# we can link the two (we just take the first result from
# the search)
process_o = find_process_by_pid(self.event, pid)
if process_o:
file_o.add_reference(process_o, "dropped-by")
def add_dropped_buffers(self):
""""Upload the dropped buffers as file objects"""
buffer ="buffer", [])
if not buffer:"No dropped buffer found, skipping")
return False
for i, buf in enumerate(buffer):
file_o, bin_type_o, bin_section_o = self._get_dropped_objs(
filename=f"buffer {i}",
comment="Dropped buffer"
self._add_yara(file_o, buf.get("yara", []))
def parse(self):
"""Run the parsing"""
for name, active in self.config.items():
if active:
def get_misp_event(self):
    """Return the MISPEvent held by this parser (`self.event`)."""
    # NOTE(review): the message below announces MISP expansions, but no
    # expansion call is visible in this excerpt -- a statement may be
    # missing between the log call and the return; confirm upstream.
    log.debug("Running MISP expansions")
    return self.event
def handler(q=False):
# Just in case we have no data
# In case there's no data
if q is False:
return False
# The return value
r = {'results': []}
# Load up that JSON
q = json.loads(q)
data = base64.b64decode(q.get("data")).decode('utf-8')
# If something really weird happened
if not data:
return json.dumps({"success": 0})
data = json.loads(data)
# Get characteristics of file
targetFile = data['target']['file']
# Process the inital binary
processBinary(r, targetFile, initial=True)
# Get binary information for dropped files
for droppedFile in data['dropped']:
processBinary(r, droppedFile, dropped=True)
# Add malscore to results
"values": "Malscore: {} ".format(data['malscore']),
"types": "comment",
"categories": "Payload delivery",
"comment": "Cuckoo analysis: MalScore"
# Add virustotal data, if exists
processVT(r, data['virustotal'])
# Add network information, should be improved
processNetwork(r, data['network'])
# Add behavioral information
processSummary(r, data['behavior']['summary'])
# Return
return r
def processSummary(r, summary):
"values": summary['mutexes'],
"types": "mutex",
"categories": "Artifacts dropped",
"comment": "Cuckoo analysis: Observed mutexes"
def processVT(r, virustotal):
category = "Antivirus detection"
comment = "VirusTotal analysis"
"values": virustotal['permalink'],
"types": "link",
"categories": category,
"comments": comment + " - Permalink"
"values": "VirusTotal detection rate {}/{}".format(
"types": "comment",
"categories": category,
"comment": comment
"values": "Sample not detected on VirusTotal",
"types": "comment",
"categories": category,
"comment": comment
def processNetwork(r, network):
category = "Network activity"
for host in network['hosts']:
"values": host['ip'],
"types": "ip-dst",
"categories": category,
"comment": "Cuckoo analysis: Observed network traffic"
def processBinary(r, target, initial=False, dropped=False):
comment = "Cuckoo analysis: Initial file"
category = "Payload delivery"
category = "Artifacts dropped"
comment = "Cuckoo analysis: Dropped file"
"values": target['name'],
"types": "filename",
"categories": category,
"comment": comment
"values": target['md5'],
"types": "md5",
"categories": category,
"comment": comment
"values": target['sha1'],
"types": "sha1",
"categories": category,
"comment": comment
"values": target['sha256'],
"types": "sha256",
"categories": category,
"comment": comment
"values": target['sha512'],
"types": "sha512",
"categories": category,
"comment": comment
# todo : add file size?
"values": target['guest_paths'],
"types": "filename",
"categories": "Payload installation",
"comment": comment + " - Path"
data = q['data']
parser = CuckooParser(q['config'])
event = parser.get_misp_event()
event = json.loads(event.to_json())
results = {
key: event[key]
for key in ('Attribute', 'Object')
if (key in event and event[key])
return {'results': results}
def introspection():
modulesetup = {}
modulesetup['userConfig'] = userConfig
except NameError:
modulesetup['inputSource'] = inputSource
except NameError:
return modulesetup
userConfig = {
key: o["userConfig"]
for key, o in CuckooParser.options.items()
mispattributes['userConfig'] = userConfig
return mispattributes
def version():
    """Return the module metadata with the module configuration attached.

    Stores `moduleconfig` under the 'config' key of the module-level
    `moduleinfo` dict (which is modified in place) and returns that dict.
    """
    moduleinfo['config'] = moduleconfig
    return moduleinfo
if __name__ == '__main__':
x = open('test.json', 'r')
q = []
q['data'] =
q = base64.base64encode(q)