PyMISP/pymisp/tools/fileobject.py

90 lines
3.3 KiB
Python
Raw Normal View History

2017-07-21 18:47:10 +02:00
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from ..exceptions import InvalidMISPObject
2017-08-31 10:40:18 +02:00
from .abstractgenerator import AbstractMISPObjectGenerator
2017-07-21 18:47:10 +02:00
import os
from io import BytesIO
from hashlib import md5, sha1, sha256, sha512
import math
from collections import Counter
import logging
2020-01-23 10:27:40 +01:00
from pathlib import Path
from typing import Union
logger = logging.getLogger('pymisp')
2017-07-21 18:47:10 +02:00
try:
2020-01-23 10:27:40 +01:00
import pydeep # type: ignore
2017-07-21 18:47:10 +02:00
HAS_PYDEEP = True
except ImportError:
HAS_PYDEEP = False
try:
2020-01-23 10:27:40 +01:00
import magic # type: ignore
2017-07-21 18:47:10 +02:00
HAS_MAGIC = True
except ImportError:
HAS_MAGIC = False
2017-08-28 19:01:53 +02:00
class FileObject(AbstractMISPObjectGenerator):
2017-07-21 18:47:10 +02:00
2020-07-28 20:05:42 +02:00
def __init__(self, filepath: Union[Path, str] = None, pseudofile: BytesIO = None, filename: str = None, **kwargs):
# PY3 way:
# super().__init__('file')
super(FileObject, self).__init__('file', **kwargs)
2017-07-21 18:47:10 +02:00
if not HAS_PYDEEP:
logger.warning("Please install pydeep: pip install git+https://github.com/kbandla/pydeep.git")
2017-07-21 18:47:10 +02:00
if not HAS_MAGIC:
logger.warning("Please install python-magic: pip install python-magic.")
if filename:
# Useful in case the file is copied with a pre-defined name by a script but we want to keep the original name
self.__filename = filename
elif filepath:
self.__filename = os.path.basename(filepath)
else:
raise InvalidMISPObject('A file name is required (either in the path, or as a parameter).')
2017-07-21 18:47:10 +02:00
if filepath:
with open(filepath, 'rb') as f:
self.__pseudofile = BytesIO(f.read())
2017-07-21 18:47:10 +02:00
elif pseudofile and isinstance(pseudofile, BytesIO):
# WARNING: lief.parse requires a path
self.__pseudofile = pseudofile
2017-07-21 18:47:10 +02:00
else:
raise InvalidMISPObject('File buffer (BytesIO) or a path is required.')
self.__data = self.__pseudofile.getvalue()
2017-07-21 18:47:10 +02:00
self.generate_attributes()
def generate_attributes(self):
self.add_attribute('filename', value=self.__filename)
size = self.add_attribute('size-in-bytes', value=len(self.__data))
if int(size.value) > 0:
self.add_attribute('entropy', value=self.__entropy_H(self.__data))
self.add_attribute('md5', value=md5(self.__data).hexdigest())
self.add_attribute('sha1', value=sha1(self.__data).hexdigest())
self.add_attribute('sha256', value=sha256(self.__data).hexdigest())
self.add_attribute('sha512', value=sha512(self.__data).hexdigest())
self.add_attribute('malware-sample', value=self.__filename, data=self.__pseudofile, disable_correlation=True)
if HAS_MAGIC:
self.add_attribute('mimetype', value=magic.from_buffer(self.__data, mime=True))
if HAS_PYDEEP:
self.add_attribute('ssdeep', value=pydeep.hash_buf(self.__data).decode())
2017-07-21 18:47:10 +02:00
2020-01-23 10:27:40 +01:00
def __entropy_H(self, data: bytes) -> float:
2017-07-21 18:47:10 +02:00
"""Calculate the entropy of a chunk of data."""
# NOTE: copy of the entropy function from pefile
if len(data) == 0:
return 0.0
2020-10-01 13:45:29 +02:00
occurrences = Counter(bytearray(data))
2017-07-21 18:47:10 +02:00
2020-01-23 10:27:40 +01:00
entropy = 0.0
2020-10-01 13:45:29 +02:00
for x in occurrences.values():
2017-07-21 18:47:10 +02:00
p_x = float(x) / len(data)
entropy -= p_x * math.log(p_x, 2)
return entropy