cti-python-stix2/stix2/datastore/filesystem.py

613 lines
22 KiB
Python

"""
Python STIX 2.0 FileSystem Source/Sink
"""
import errno
import json
import os
import stat
import pytz
import six
from stix2.base import _STIXBase
from stix2.core import Bundle, parse
from stix2.datastore import DataSink, DataSource, DataStoreMixin
from stix2.datastore.filters import Filter, FilterSet, apply_common_filters
from stix2.utils import get_type_from_id, is_marking
def _timestamp2filename(timestamp):
"""
Encapsulates a way to create unique filenames based on an object's
"modified" property value. This should not include an extension.
:param timestamp: A timestamp, as a datetime.datetime object.
"""
# Different times will only produce different file names if all timestamps
# are in the same time zone! So if timestamp is timezone-aware convert
# to UTC just to be safe. If naive, just use as-is.
if timestamp.tzinfo is not None:
timestamp = timestamp.astimezone(pytz.utc)
return timestamp.strftime("%Y%m%d%H%M%S%f")
class AuthSet(object):
    """
    A whitelist or a blacklist of values, describing where/what we must (or
    must not) search to find objects which match a query. (The name comes
    from authorization being a typical context for black/white lists.)

    An empty set is meaningful. An empty whitelist means nothing may be
    searched, i.e. the query cannot match and searching can be skipped
    entirely. An empty blacklist means nothing is excluded, so everything
    must be searched.
    """

    BLACK = 0
    WHITE = 1

    def __init__(self, allowed, prohibited):
        """
        Build an AuthSet from allowed and/or prohibited values. Whether the
        result is a whitelist or a blacklist is decided by whether any
        allowed values were supplied.

        :param allowed: A set of allowed values (or None if no allow filters
            were found in the query)
        :param prohibited: A set of prohibited values (not None)
        """
        if allowed is not None:
            # At least one allow filter exists, so this is a whitelist. A
            # value which is both allowed and prohibited can never match,
            # so such values are dropped.
            kind, members = AuthSet.WHITE, allowed - prohibited
        else:
            # No allow filter at all: only the prohibitions apply. The
            # given set object is kept as-is (not copied).
            kind, members = AuthSet.BLACK, prohibited

        self.__values = members
        self.__type = kind

    @property
    def values(self):
        """
        The values in this white/blacklist, as a set.
        """
        return self.__values

    @property
    def auth_type(self):
        """
        The kind of list this is: AuthSet.WHITE or AuthSet.BLACK.
        """
        return self.__type

    def __repr__(self):
        label = "black"
        if self.auth_type == AuthSet.WHITE:
            label = "white"
        return "{}list: {}".format(label, self.values)
# A fixed, reusable AuthSet which accepts anything: no "allow" values are
# given (None) and nothing is prohibited (empty set), which makes it an empty
# blacklist. Used where a directory must be searched without name filtering.
_AUTHSET_ANY = AuthSet(None, set())
def _update_allow(allow_set, value):
"""
Updates the given set of "allow" values. The first time an update to the
set occurs, the value(s) are added. Thereafter, since all filters are
implicitly AND'd, the given values are intersected with the existing allow
set, which may remove values. At the end, it may even wind up empty.
:param allow_set: The allow set, or None
:param value: The value(s) to add (single value, or iterable of values)
:return: The updated allow set (not None)
"""
adding_seq = hasattr(value, "__iter__") and \
not isinstance(value, six.string_types)
if allow_set is None:
allow_set = set()
if adding_seq:
allow_set.update(value)
else:
allow_set.add(value)
else:
# strangely, the "&=" operator requires a set on the RHS
# whereas the method allows any iterable.
if adding_seq:
allow_set.intersection_update(value)
else:
allow_set.intersection_update({value})
return allow_set
def _find_search_optimizations(filters):
    """
    Scan the filters of a query and derive white/blacklists of object types
    and object IDs, which can be used to narrow the filesystem search.

    Only filters on the "type" and "id" properties with the "=", "!=" and
    "in" operators contribute; all other filters are ignored here.

    :param filters: An iterable of filter objects representing a query
    :return: A 2-tuple of AuthSet objects: the first is for object types, and
        the second is for object IDs.
    """
    # Allowed and prohibited values are collected independently and only
    # combined into AuthSets once every filter has been seen. None means
    # "no allow filter was encountered", which differs from an empty set.
    allowed_types = None
    allowed_ids = None
    prohibited_types = set()
    prohibited_ids = set()

    for flt in filters:
        prop = flt.property

        if prop == "type":
            if flt.op == "!=":
                prohibited_types.add(flt.value)
            elif flt.op in ("=", "in"):
                allowed_types = _update_allow(allowed_types, flt.value)

        elif prop == "id":
            if flt.op == "!=":
                prohibited_ids.add(flt.value)
            elif flt.op == "=":
                # IDs embed their type, so allowing an ID also allows
                # (only) that ID's type.
                allowed_ids = _update_allow(allowed_ids, flt.value)
                allowed_types = _update_allow(
                    allowed_types, get_type_from_id(flt.value)
                )
            elif flt.op == "in":
                allowed_ids = _update_allow(allowed_ids, flt.value)
                id_types = (get_type_from_id(id_) for id_ in flt.value)
                allowed_types = _update_allow(allowed_types, id_types)

    opt_types = AuthSet(allowed_types, prohibited_types)
    opt_ids = AuthSet(allowed_ids, prohibited_ids)

    # Second pass, on the final lists: when both are whitelists, restrict
    # each by the other using the type embedded in each ID. Unlike the
    # cross-property handling in the loop above, this also takes the
    # prohibitions into account, since they are already applied to the
    # whitelists at this point.
    both_whitelists = (
        opt_types.auth_type == AuthSet.WHITE
        and opt_ids.auth_type == AuthSet.WHITE
    )
    if both_whitelists:
        type_values = opt_types.values
        id_values = opt_ids.values

        type_values.intersection_update(
            {get_type_from_id(id_) for id_ in id_values}
        )
        id_values.intersection_update(
            {id_ for id_ in id_values
             if get_type_from_id(id_) in type_values}
        )

    return opt_types, opt_ids
def _entry_matches_mode(path, st_mode_test):
    """
    Decide whether the directory entry at the given path exists and passes
    the entry-type test.

    :param path: Full path of the directory entry to stat()
    :param st_mode_test: A callable taking an st_mode value and returning
        True/False, or None to accept any type of entry
    :return: A true value if the entry exists and passes the test; a false
        value if the test rejects it or the entry does not exist
    :raises OSError: If stat() fails for any reason other than the entry not
        existing
    """
    try:
        mode = os.stat(path).st_mode
    except OSError as e:
        if e.errno != errno.ENOENT:
            # Bare raise re-raises the active exception with its original
            # traceback.
            raise
        # file-not-found is ok, the entry is simply not a match
        return False

    return not st_mode_test or st_mode_test(mode)


def _get_matching_dir_entries(parent_dir, auth_set, st_mode_test=None, ext=""):
    """
    Search a directory (non-recursively), and find entries which match the
    given criteria.

    For a whitelist, each whitelisted name is probed directly with stat()
    and the directory is never listed. For a blacklist, the directory is
    listed and blacklisted names are skipped.

    :param parent_dir: The directory to search
    :param auth_set: an AuthSet instance, which represents a black/whitelist
        filter on filenames
    :param st_mode_test: A callable allowing filtering based on the type of
        directory entry. E.g. just get directories, or just get files. It
        will be passed the st_mode field of a stat() structure and should
        return True to include the file, or False to exclude it. Easy thing
        to do is pass one of the stat module functions, e.g. stat.S_ISREG.
        If None, don't filter based on entry type.
    :param ext: Determines how names from auth_set match up to directory
        entries, and allows filtering by extension. The extension is added
        to auth_set values to obtain directory entries; it is removed from
        directory entries to obtain auth_set values. In this way, auth_set
        may be treated as having only "basenames" of the entries. Only
        entries having the given extension will be included in the results.
        If not empty, the extension MUST include a leading ".". The default
        is the empty string, which will result in direct comparisons, and no
        extension-based filtering.
    :return: A list of directory entries matching the criteria. These will
        not have any path info included; they will just be bare names.
    :raises OSError: If there are errors accessing directory contents or
        stat()'ing files
    """
    results = []

    if auth_set.auth_type == AuthSet.WHITE:
        for value in auth_set.values:
            filename = value + ext
            full_path = os.path.join(parent_dir, filename)
            if _entry_matches_mode(full_path, st_mode_test):
                results.append(filename)

    else:  # auth_set is a blacklist
        for entry in os.listdir(parent_dir):
            if ext:
                auth_name, this_ext = os.path.splitext(entry)
                if this_ext != ext:
                    continue
            else:
                auth_name = entry

            if auth_name in auth_set.values:
                continue

            # The entry may vanish between listdir() and stat(); the helper
            # treats that as "no match" rather than an error.
            full_path = os.path.join(parent_dir, entry)
            if _entry_matches_mode(full_path, st_mode_test):
                results.append(entry)

    return results
def _check_object_from_file(query, filepath):
"""
Read a STIX object from the given file, and check it against the given
filters.
:param query: Iterable of filters
:param filepath: Path to file to read
:return: The STIX object, as a dict, if the object passes the filters. If
not, None is returned.
:raises TypeError: If the file had invalid content
:raises IOError: If there are problems opening/reading the file
"""
try:
with open(filepath, "r") as f:
stix_obj = json.load(f)
if stix_obj["type"] == "bundle":
stix_obj = stix_obj["objects"][0]
# naive STIX type checking
stix_obj["type"]
stix_obj["id"]
except (ValueError, KeyError): # likely not a JSON file
raise TypeError(
"STIX JSON object at '{0}' could either not be parsed "
"to JSON or was not valid STIX JSON".format(
filepath))
# check against other filters, add if match
result = next(apply_common_filters([stix_obj], query), None)
return result
def _search_versioned(query, type_path, auth_ids):
    """
    Searches the given directory, which contains data for STIX objects of a
    particular versioned type (i.e. not markings), and return any which match
    the query.

    The directory is expected to hold one sub-directory per object ID, each
    of which holds one ".json" file per version of that object.

    :param query: The query to match against
    :param type_path: The directory with type-specific STIX object files
    :param auth_ids: Search optimization based on object ID
    :return: A list of all matching objects
    :raises TypeError: If any objects had invalid content
    :raises IOError, OSError: If there were any problems opening/reading files
    """
    results = []
    id_dirs = _get_matching_dir_entries(type_path, auth_ids,
                                        stat.S_ISDIR)
    for id_dir in id_dirs:
        id_path = os.path.join(type_path, id_dir)

        # This leverages a more sophisticated function to do a simple thing:
        # get all the JSON files from a directory. It also gives us file
        # type checking, ensuring we only get regular files.
        version_files = _get_matching_dir_entries(id_path, _AUTHSET_ANY,
                                                  stat.S_ISREG, ".json")
        for version_file in version_files:
            version_path = os.path.join(id_path, version_file)

            try:
                stix_obj = _check_object_from_file(query, version_path)
            except IOError as e:
                if e.errno != errno.ENOENT:
                    # Bare raise keeps the original traceback intact.
                    raise
                # else, file-not-found is ok, just skip
                continue

            if stix_obj:
                results.append(stix_obj)

    return results
def _search_markings(query, markings_path, auth_ids):
    """
    Searches the given directory, which contains markings data, and return any
    which match the query.

    Markings are stored flat: the directory is expected to hold one ".json"
    file per marking, named after the marking's ID.

    :param query: The query to match against
    :param markings_path: The directory with STIX markings files
    :param auth_ids: Search optimization based on object ID
    :return: A list of all matching objects
    :raises TypeError: If any objects had invalid content
    :raises IOError: If there were any problems opening/reading files
    """
    results = []
    id_files = _get_matching_dir_entries(markings_path, auth_ids,
                                         stat.S_ISREG, ".json")
    for id_file in id_files:
        id_path = os.path.join(markings_path, id_file)

        try:
            stix_obj = _check_object_from_file(query, id_path)
        except IOError as e:
            if e.errno != errno.ENOENT:
                # Bare raise keeps the original traceback intact.
                raise
            # else, file-not-found is ok, just skip
            continue

        if stix_obj:
            results.append(stix_obj)

    return results
class FileSystemStore(DataStoreMixin):
    """Interface to a file directory of STIX objects.

    FileSystemStore pairs a FileSystemSource and a FileSystemSink which
    operate on the same directory.

    Args:
        stix_dir (str): path to directory of STIX objects
        allow_custom (bool): whether to allow custom STIX content to be
            pushed/retrieved. When left as None, the FileSystemSource side
            (retrieving data) gets True and the FileSystemSink side
            (pushing data) gets False. When supplied, the given value is
            applied to both sides.
        bundlify (bool): whether to wrap objects in bundles when saving them.
            Default: False.

    Attributes:
        source (FileSystemSource): FileSystemSource
        sink (FileSystemSink): FileSystemSink

    """
    def __init__(self, stix_dir, allow_custom=None, bundlify=False):
        unspecified = allow_custom is None
        source_custom = True if unspecified else allow_custom
        sink_custom = False if unspecified else allow_custom

        source = FileSystemSource(
            stix_dir=stix_dir, allow_custom=source_custom
        )
        sink = FileSystemSink(
            stix_dir=stix_dir, allow_custom=sink_custom, bundlify=bundlify
        )

        super(FileSystemStore, self).__init__(source=source, sink=sink)
class FileSystemSink(DataSink):
    """Interface for adding/pushing STIX objects to file directory of STIX
    objects.

    Can be paired with a FileSystemSource, together as the two
    components of a FileSystemStore.

    Args:
        stix_dir (str): path to directory of STIX objects.
        allow_custom (bool): Whether to allow custom STIX content to be
            added to the FileSystemSink. Default: False
        bundlify (bool): Whether to wrap objects in bundles when saving them.
            Default: False.

    Raises:
        ValueError: if ``stix_dir`` does not exist.

    """
    def __init__(self, stix_dir, allow_custom=False, bundlify=False):
        super(FileSystemSink, self).__init__()
        self._stix_dir = os.path.abspath(stix_dir)
        self.allow_custom = allow_custom
        self.bundlify = bundlify

        if not os.path.exists(self._stix_dir):
            # Include the resolved path, as FileSystemSource does.
            raise ValueError("directory path for STIX data does not exist: %s" % self._stix_dir)

    @property
    def stix_dir(self):
        """The absolute path of the directory objects are written to."""
        return self._stix_dir

    def _check_path_and_write(self, stix_obj):
        """Write the given STIX object to a file in the STIX file directory.

        Markings are written to ``<stix_dir>/<type>/<id>.json``. All other
        objects are written to ``<stix_dir>/<type>/<id>/<name>.json``, where
        ``<name>`` is derived from the object's "modified" property. Missing
        directories are created.

        Args:
            stix_obj: the STIX object to write; its "type", "id" and (unless
                it is a marking) "modified" properties are read.

        Raises:
            OSError: if the target directory can't be created.

        """
        type_dir = os.path.join(self._stix_dir, stix_obj["type"])
        if is_marking(stix_obj):
            filename = stix_obj["id"]
            obj_dir = type_dir
        else:
            filename = _timestamp2filename(stix_obj["modified"])
            obj_dir = os.path.join(type_dir, stix_obj["id"])

        file_path = os.path.join(obj_dir, filename + ".json")

        # Create first and tolerate "already exists", rather than checking
        # beforehand: a check-then-create sequence fails if the directory
        # appears in between.
        try:
            os.makedirs(obj_dir)
        except OSError as e:
            if e.errno != errno.EEXIST or not os.path.isdir(obj_dir):
                raise

        if self.bundlify:
            stix_obj = Bundle(stix_obj, allow_custom=self.allow_custom)

        with open(file_path, "w") as f:
            f.write(str(stix_obj))

    def add(self, stix_data=None, version=None):
        """Add STIX objects to file directory.

        Args:
            stix_data (STIX object OR dict OR str OR list): valid STIX 2.0 content
                in a STIX object (or list of), dict (or list of), or a STIX 2.0
                json encoded string.
            version (str): Which STIX2 version to use. (e.g. "2.0", "2.1"). If
                None, use latest version.

        Raises:
            TypeError: if ``stix_data`` is none of the supported kinds of
                input.

        Note:
            ``stix_data`` can be a Bundle object, but each object in it will be
            saved separately; you will be able to retrieve any of the objects
            the Bundle contained, but not the Bundle itself.

        """
        if isinstance(stix_data, Bundle):
            # recursively add individual STIX objects
            for stix_obj in stix_data.get("objects", []):
                self.add(stix_obj, version=version)

        elif isinstance(stix_data, _STIXBase):
            # adding python STIX object
            self._check_path_and_write(stix_data)

        elif isinstance(stix_data, six.string_types + (dict,)):
            # six.string_types also covers Python 2 unicode strings, which a
            # plain "str" check would reject.
            stix_data = parse(stix_data, allow_custom=self.allow_custom, version=version)
            self.add(stix_data, version=version)

        elif isinstance(stix_data, list):
            # recursively add individual STIX objects
            for stix_obj in stix_data:
                self.add(stix_obj, version=version)

        else:
            raise TypeError("stix_data must be a STIX object (or list of), "
                            "JSON formatted STIX (or list of), "
                            "or a JSON formatted STIX bundle")
class FileSystemSource(DataSource):
    """Interface for searching/retrieving STIX objects from a STIX object file
    directory.

    Can be paired with a FileSystemSink, together as the two
    components of a FileSystemStore.

    Args:
        stix_dir (str): path to directory of STIX objects
        allow_custom (bool): Whether to allow custom STIX content to be
            retrieved from the FileSystemSource. Default: True

    Raises:
        ValueError: if ``stix_dir`` does not exist.

    """
    def __init__(self, stix_dir, allow_custom=True):
        super(FileSystemSource, self).__init__()
        self._stix_dir = os.path.abspath(stix_dir)
        self.allow_custom = allow_custom

        if not os.path.exists(self._stix_dir):
            raise ValueError("directory path for STIX data does not exist: %s" % self._stix_dir)

    @property
    def stix_dir(self):
        """The absolute path of the directory objects are read from."""
        return self._stix_dir

    def get(self, stix_id, version=None, _composite_filters=None):
        """Retrieve STIX object from file directory via STIX ID.

        Args:
            stix_id (str): The STIX ID of the STIX object to be retrieved.
            _composite_filters (FilterSet): collection of filters passed from the parent
                CompositeDataSource, not user supplied
            version (str): Which STIX2 version to use. (e.g. "2.0", "2.1"). If
                None, use latest version.

        Returns:
            (STIX object): STIX object that has the supplied STIX ID, or
                None if there is no such object. When several versions
                exist, the one with the greatest "modified" value is
                returned.

        """
        all_data = self.all_versions(stix_id, version=version, _composite_filters=_composite_filters)

        if not all_data:
            return None

        if get_type_from_id(stix_id) == "marking-definition":
            # Markings are unversioned and have no "modified" property to
            # sort on; there shouldn't be more than one result.
            return all_data[0]

        return sorted(all_data, key=lambda k: k['modified'])[-1]

    def all_versions(self, stix_id, version=None, _composite_filters=None):
        """Retrieve STIX object from file directory via STIX ID, all versions.

        This is a query() for everything having the given ID.

        Args:
            stix_id (str): The STIX ID of the STIX objects to be retrieved.
            _composite_filters (FilterSet): collection of filters passed from the parent
                CompositeDataSource, not user supplied
            version (str): Which STIX2 version to use. (e.g. "2.0", "2.1"). If
                None, use latest version.

        Returns:
            (list): of STIX objects that has the supplied STIX ID.
                The STIX objects are loaded from their json files, parsed into
                a python STIX objects and then returned

        """
        query = [Filter("id", "=", stix_id)]
        return self.query(query, version=version, _composite_filters=_composite_filters)

    def query(self, query=None, version=None, _composite_filters=None):
        """Search and retrieve STIX objects based on the complete query.

        A "complete query" includes the filters from the query, the filters
        attached to this FileSystemSource, and any filters passed from a
        CompositeDataSource (i.e. _composite_filters).

        Args:
            query (list): list of filters to search on
            _composite_filters (FilterSet): collection of filters passed from the
                CompositeDataSource, not user supplied
            version (str): Which STIX2 version to use. (e.g. "2.0", "2.1"). If
                None, use latest version.

        Returns:
            (list): list of STIX objects that matches the supplied
                query. The STIX objects are loaded from their json files,
                parsed into a python STIX objects and then returned.

        """
        all_data = []
        query = FilterSet(query)

        # combine all query filters
        if self.filters:
            query.add(self.filters)
        if _composite_filters:
            query.add(_composite_filters)

        # Type/ID filters are turned into white/blacklists so that only the
        # relevant directories and files need to be visited.
        auth_types, auth_ids = _find_search_optimizations(query)

        type_dirs = _get_matching_dir_entries(self._stix_dir, auth_types,
                                              stat.S_ISDIR)
        for type_dir in type_dirs:
            type_path = os.path.join(self._stix_dir, type_dir)

            # Markings are stored one file per ID; everything else is
            # stored one directory per ID with a file per version.
            if type_dir == "marking-definition":
                type_results = _search_markings(query, type_path, auth_ids)
            else:
                type_results = _search_versioned(query, type_path, auth_ids)

            all_data.extend(type_results)

        # parse python STIX objects from the STIX object dicts
        stix_objs = [
            parse(stix_obj_dict, allow_custom=self.allow_custom,
                  version=version)
            for stix_obj_dict in all_data
        ]

        return stix_objs