2017-07-21 18:47:10 +02:00
|
|
|
#!/usr/bin/env python3
|
2024-01-17 13:13:14 +01:00
|
|
|
|
|
|
|
from __future__ import annotations
|
2017-07-21 18:47:10 +02:00
|
|
|
|
2017-09-20 12:44:55 +02:00
|
|
|
from ..exceptions import InvalidMISPObject
|
2017-08-31 10:40:18 +02:00
|
|
|
from .abstractgenerator import AbstractMISPObjectGenerator
|
2017-07-21 18:47:10 +02:00
|
|
|
import os
|
|
|
|
from io import BytesIO
|
|
|
|
from hashlib import md5, sha1, sha256, sha512
|
|
|
|
import math
|
|
|
|
from collections import Counter
|
2017-11-08 03:10:04 +01:00
|
|
|
import logging
|
2020-01-23 10:27:40 +01:00
|
|
|
from pathlib import Path
|
2017-11-08 03:10:04 +01:00
|
|
|
|
|
|
|
# Logger registered under the 'pymisp' name.
logger = logging.getLogger('pymisp')

# pydeep is an optional dependency: record whether it could be imported so
# that code using it can be skipped when it is not installed.
try:
    import pydeep  # type: ignore
except ImportError:
    HAS_PYDEEP = False
else:
    HAS_PYDEEP = True

# python-magic is optional as well; same availability-flag pattern.
try:
    import magic
except ImportError:
    HAS_MAGIC = False
else:
    HAS_MAGIC = True
|
|
|
|
|
|
|
|
|
2017-08-28 19:01:53 +02:00
|
|
|
class FileObject(AbstractMISPObjectGenerator):
    """Generator for a MISP ``file`` object.

    The content is taken either from a file on disk (``filepath``) or from an
    in-memory buffer (``pseudofile``). On construction the attributes derived
    from that content (name, size, hashes, ...) are added to the object.
    """

    def __init__(self, filepath: Path | str | None = None,  # type: ignore[no-untyped-def]
                 pseudofile: BytesIO | bytes | None = None,
                 filename: str | None = None, **kwargs) -> None:
        """Load the file content and generate the object's attributes.

        :param filepath: Path of the file to read. When given, it takes
            precedence over ``pseudofile`` as the source of the content.
        :param pseudofile: In-memory content, either a ``BytesIO`` or raw
            ``bytes`` (which are wrapped in a ``BytesIO``).
        :param filename: Name to record for the file. Falls back to the
            basename of ``filepath`` when not given.
        :param kwargs: Passed through to the parent constructor.
        :raises InvalidMISPObject: If no file name can be determined, or if
            neither a path nor a usable buffer is provided.
        """
        super().__init__('file', **kwargs)
        if not HAS_PYDEEP:
            logger.warning("pydeep is missing, please install pymisp this way: pip install pymisp[fileobjects]")
        if not HAS_MAGIC:
            logger.warning("python-magic is missing, please install pymisp this way: pip install pymisp[fileobjects]")

        if filename:
            # Useful in case the file is copied with a pre-defined name by a script but we want to keep the original name
            self.__filename = filename
        elif filepath:
            self.__filename = os.path.basename(filepath)
        else:
            raise InvalidMISPObject('A file name is required (either in the path, or as a parameter).')

        if filepath:
            with open(filepath, 'rb') as f:
                self.__pseudofile = BytesIO(f.read())
        elif isinstance(pseudofile, BytesIO):
            # WARNING: lief.parse requires a path
            self.__pseudofile = pseudofile
        elif isinstance(pseudofile, bytes):
            # The signature advertises raw bytes: wrap them so the rest of the
            # class can keep working with a BytesIO.
            self.__pseudofile = BytesIO(pseudofile)
        else:
            raise InvalidMISPObject('File buffer (BytesIO) or a path is required.')

        self.__data = self.__pseudofile.getvalue()
        self.generate_attributes()

    def generate_attributes(self) -> None:
        """Add the attributes describing the loaded content.

        ``filename`` and ``size-in-bytes`` are always added. The entropy, the
        hashes, the ``malware-sample`` and the optional ``mimetype`` /
        ``ssdeep`` attributes are only added when the content is not empty.
        """
        self.add_attribute('filename', value=self.__filename)
        self.add_attribute('size-in-bytes', value=len(self.__data))
        if not self.__data:
            # Nothing to hash or analyse in an empty file.
            return

        self.add_attribute('entropy', value=self.__entropy_H(self.__data))
        for relation, hasher in (('md5', md5), ('sha1', sha1), ('sha256', sha256), ('sha512', sha512)):
            self.add_attribute(relation, value=hasher(self.__data).hexdigest())
        self.add_attribute('malware-sample', value=self.__filename, data=self.__pseudofile, disable_correlation=True)
        # The next two attributes depend on optional third-party modules.
        if HAS_MAGIC:
            self.add_attribute('mimetype', value=magic.from_buffer(self.__data, mime=True))
        if HAS_PYDEEP:
            self.add_attribute('ssdeep', value=pydeep.hash_buf(self.__data).decode())

    def __entropy_H(self, data: bytes) -> float:
        """Calculate the entropy of a chunk of data.

        :param data: The bytes to analyse.
        :return: Shannon entropy in bits per byte; ``0.0`` for empty data.
        """
        # NOTE: copy of the entropy function from pefile

        if not data:
            return 0.0

        total = len(data)
        occurrences = Counter(bytearray(data))

        # Sequential subtraction is kept on purpose so the floating point
        # result is identical to the historical implementation.
        entropy = 0.0
        for count in occurrences.values():
            p_x = float(count) / total
            entropy -= p_x * math.log(p_x, 2)

        return entropy
|