cti-python-stix2/stix2/datastore/filesystem.py

774 lines
28 KiB
Python

"""Python STIX2 FileSystem Source/Sink"""
import errno
import io
import json
import os
import re
import stat
from stix2 import v20, v21
from stix2.base import _STIXBase
from stix2.datastore import (
DataSink, DataSource, DataSourceError, DataStoreMixin,
)
from stix2.datastore.filters import Filter, FilterSet, apply_common_filters
from stix2.parsing import parse
from stix2.serialization import fp_serialize
from stix2.utils import format_datetime, get_type_from_id, parse_into_datetime
def _timestamp2filename(timestamp):
    """
    Derive the extension-less filename used to store one version of an
    object from that object's "modified" property value.

    Args:
        timestamp: A timestamp, as a datetime.datetime object or string.

    Returns:
        The formatted timestamp text with every "-", "T", ":", ".", "Z" and
        space character removed.
    """
    # Strings are converted to datetimes first, so that format_datetime() is
    # always the one which decides the level of precision in the output.
    as_datetime = (
        parse_into_datetime(timestamp)
        if isinstance(timestamp, str)
        else timestamp
    )
    formatted = format_datetime(as_datetime)
    return re.sub(r"[-T:\.Z ]", "", formatted)
class AuthSet(object):
    """
    A whitelist or a blacklist of values, describing where a search
    must (whitelist) or must not (blacklist) look in order to find objects
    matching a query.

    Either kind of list may be empty. An empty whitelist means there is
    nowhere to search: the query cannot match anything and the search can be
    skipped. An empty blacklist means nothing is excluded, so everything
    must be searched.
    """
    BLACK = 0
    WHITE = 1

    def __init__(self, allowed, prohibited):
        """
        Build the set from the allowed and/or prohibited values collected
        from a query. Whether the result is a whitelist or a blacklist is
        decided by whether any allow filters were seen.

        Args:
            allowed: A set of allowed values (or None if no allow filters
                were found in the query)
            prohibited: A set of prohibited values (not None)
        """
        if allowed is not None:
            # At least one allow filter exists, so this is a whitelist. A
            # value which is both allowed and prohibited can never match,
            # so it is dropped from the whitelist.
            self.__type = AuthSet.WHITE
            self.__values = allowed - prohibited
        else:
            # No allow filters: only the prohibitions matter. The given set
            # object itself is kept (it is not copied).
            self.__type = AuthSet.BLACK
            self.__values = prohibited

    @property
    def values(self):
        """
        The values in this white/blacklist, as a set.
        """
        return self.__values

    @property
    def auth_type(self):
        """
        The kind of list this is: AuthSet.WHITE or AuthSet.BLACK.
        """
        return self.__type

    def __repr__(self):
        if self.auth_type == AuthSet.WHITE:
            color = "white"
        else:
            color = "black"
        return "{}list: {}".format(color, self.values)
# A fixed, reusable AuthSet which accepts anything: it is an empty blacklist
# (no allow filters were given and nothing is prohibited). It is shared
# module-wide, so its value set should not be mutated.
_AUTHSET_ANY = AuthSet(None, set())
def _update_allow(allow_set, value):
"""
Updates the given set of "allow" values. The first time an update to the
set occurs, the value(s) are added. Thereafter, since all filters are
implicitly AND'd, the given values are intersected with the existing allow
set, which may remove values. At the end, it may even wind up empty.
Args:
allow_set: The allow set, or None
value: The value(s) to add (single value, or iterable of values)
Returns:
The updated allow set (not None)
"""
adding_seq = hasattr(value, "__iter__") and \
not isinstance(value, str)
if allow_set is None:
allow_set = set()
if adding_seq:
allow_set.update(value)
else:
allow_set.add(value)
else:
# strangely, the "&=" operator requires a set on the RHS
# whereas the method allows any iterable.
if adding_seq:
allow_set.intersection_update(value)
else:
allow_set.intersection_update({value})
return allow_set
def _find_search_optimizations(filters):
    """
    Scan a query's filters and derive white/blacklists of object types and
    object IDs, which the filesystem search uses to avoid visiting
    directories and files that cannot match.

    Args:
        filters: An iterable of filter objects representing a query

    Returns:
        A 2-tuple of AuthSet objects: the first is for object types, and
        the second is for object IDs.
    """
    # Allowed and prohibited values are gathered independently, and only
    # combined into the final white/blacklists once all filters are seen.
    type_allow = id_allow = None
    type_deny = set()
    id_deny = set()

    for flt in filters:
        prop = flt.property

        if prop == "type":
            op = flt.op
            if op in ("=", "in"):
                type_allow = _update_allow(type_allow, flt.value)
            elif op == "!=":
                type_deny.add(flt.value)

        elif prop == "id":
            op = flt.op
            if op == "!=":
                id_deny.add(flt.value)
            elif op in ("=", "in"):
                id_allow = _update_allow(id_allow, flt.value)

                # An "allow" ID filter implies a type filter too, since IDs
                # contain types within them.
                if op == "=":
                    implied_types = get_type_from_id(flt.value)
                else:
                    implied_types = (
                        get_type_from_id(id_) for id_ in flt.value
                    )
                type_allow = _update_allow(type_allow, implied_types)

    type_auth = AuthSet(type_allow, type_deny)
    id_auth = AuthSet(id_allow, id_deny)

    # With whitelists for both types and IDs, cross-check them once more.
    # Some cross-property narrowing already happened in the loop; this
    # second pass works on the final whitelists, so it also accounts for
    # the prohibitions applied when the AuthSets were built.
    both_whitelists = (
        type_auth.auth_type == AuthSet.WHITE
        and id_auth.auth_type == AuthSet.WHITE
    )
    if both_whitelists:
        # Keep only types which some whitelisted ID actually has...
        type_auth.values.intersection_update(
            get_type_from_id(id_) for id_ in id_auth.values
        )
        # ...then keep only IDs whose type is still whitelisted.
        surviving_ids = [
            id_ for id_ in id_auth.values
            if get_type_from_id(id_) in type_auth.values
        ]
        id_auth.values.intersection_update(surviving_ids)

    return type_auth, id_auth
def _get_matching_dir_entries(parent_dir, auth_set, st_mode_test=None, ext=""):
    """
    List the entries of a single directory (non-recursively) which satisfy
    a name white/blacklist, an optional entry-type test and an optional
    extension.

    Args:
        parent_dir: The directory to search
        auth_set: an AuthSet instance, which represents a black/whitelist
            filter on filenames
        st_mode_test: A callable used to filter on the type of directory
            entry (e.g. only directories, or only regular files). It is
            passed the st_mode field of a stat() structure and should
            return True to include the entry, or False to exclude it. The
            stat module functions, e.g. stat.S_ISREG, fit this directly.
            If None, entries are not filtered by type.
        ext: Relates auth_set values to directory entry names and filters
            by extension. The extension is appended to auth_set values to
            obtain entry names, and stripped from entry names to obtain
            auth_set values, so auth_set only needs to hold "basenames".
            Only entries having this extension are returned. If not empty,
            it MUST include a leading ".". The default, the empty string,
            means names are compared directly and no extension filtering
            is done.

    Returns:
        (list): A list of directory entries matching the criteria. These
            are bare names, with no path info included.

    Raises:
        OSError: If there are errors accessing directory contents or
            stat()'ing files
    """
    def passes_type_test(name):
        # Without a type test every entry qualifies, and no stat() is done.
        if not st_mode_test:
            return True
        try:
            mode = os.stat(os.path.join(parent_dir, name)).st_mode
            return st_mode_test(mode)
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise
            # file-not-found is ok, just skip the entry
            return False

    if auth_set.auth_type == AuthSet.WHITE:
        # Whitelist: the candidate names are known up front, so there is no
        # need to list the directory; just probe each candidate.
        candidates = (value + ext for value in auth_set.values)
        return [name for name in candidates if passes_type_test(name)]

    # Blacklist: walk the directory and drop whatever is excluded.
    excluded = auth_set.values
    matches = []
    for entry in os.listdir(parent_dir):
        if ext:
            auth_name, entry_ext = os.path.splitext(entry)
            if entry_ext != ext:
                continue
        else:
            auth_name = entry

        if auth_name not in excluded and passes_type_test(entry):
            matches.append(entry)

    return matches
def _check_object_from_file(query, filepath, allow_custom, version, encoding):
    """
    Read a STIX object from the given file, and check it against the given
    filters.

    If the file holds a bundle, only the first object in the bundle is
    checked. A bundle containing no objects is treated as "no match".

    Args:
        query: Iterable of filters
        filepath (str): Path to file to read
        allow_custom (bool): Whether to allow custom properties as well
            unknown custom objects.
        version (str): If present, it forces the parser to use the version
            provided. Otherwise, the library will make the best effort based
            on checking the "spec_version" property.
        encoding (str): The encoding to use when reading a file from the
            filesystem.

    Returns:
        The (parsed) STIX object, if the object passes the filters. If
        not, None is returned.

    Raises:
        TypeError: If the file had invalid JSON
        IOError: If there are problems opening/reading the file
        stix2.exceptions.STIXError: If there were problems creating a STIX
            object from the JSON
    """
    try:
        with io.open(filepath, "r", encoding=encoding) as f:
            stix_json = json.load(f)
    except ValueError as err:  # not a JSON file
        # Chain the original error so the underlying parse failure (and its
        # position information) is not lost.
        raise TypeError(
            "STIX JSON object at '{0}' could either not be parsed "
            "to JSON or was not valid STIX JSON".format(filepath),
        ) from err

    stix_obj = parse(stix_json, allow_custom, version)

    if stix_obj["type"] == "bundle":
        # An empty bundle has nothing to match against; report "no match"
        # instead of failing with a KeyError/IndexError, which would abort
        # the whole search.
        bundled_objects = stix_obj.get("objects")
        if not bundled_objects:
            return None
        stix_obj = bundled_objects[0]

    # check against other filters, add if match
    return next(apply_common_filters([stix_obj], query), None)
def _is_versioned_type_dir(type_path, type_name):
"""
Try to detect whether the given directory is for a versioned type of STIX
object. This is done by looking for a directory whose name is a STIX ID
of the appropriate type. If found, treat this type as versioned. This
doesn't work when a versioned type directory is empty (it will be
mis-classified as unversioned), but this detection is only necessary when
reading/querying data. If a directory is empty, you'll get no results
either way.
Args:
type_path: A path to a directory containing one type of STIX object.
type_name: The STIX type name.
Returns:
True if the directory looks like it contains versioned objects; False
if not.
Raises:
OSError: If there are errors accessing directory contents or stat()'ing
files
"""
id_regex = re.compile(
r"^" + re.escape(type_name) +
r"--[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}"
r"-[0-9a-f]{12}$",
re.I,
)
for entry in os.listdir(type_path):
s = os.stat(os.path.join(type_path, entry))
if stat.S_ISDIR(s.st_mode) and id_regex.match(entry):
is_versioned = True
break
else:
is_versioned = False
return is_versioned
def _search_versioned(query, type_path, auth_ids, allow_custom, version, encoding):
    """
    Search a directory holding STIX objects of one particular versioned
    type, and collect every object which matches the query.

    Args:
        query: The query to match against
        type_path: The directory with type-specific STIX object files
        auth_ids: Search optimization based on object ID
        allow_custom (bool): Whether to allow custom properties as well
            unknown custom objects.
        version (str): If present, it forces the parser to use the version
            provided. Otherwise, the library will make the best effort based
            on checking the "spec_version" property.
        encoding (str): The encoding to use when reading a file from the
            filesystem.

    Returns:
        A list of all matching objects

    Raises:
        stix2.exceptions.STIXError: If any objects had invalid content
        TypeError: If any objects had invalid content
        IOError: If there were any problems opening/reading files
        OSError: If there were any problems opening/reading files
    """
    matches = []

    # One sub-directory per object ID; inside it, one JSON file per version.
    for id_dir in _get_matching_dir_entries(type_path, auth_ids, stat.S_ISDIR):
        id_path = os.path.join(type_path, id_dir)

        # The general-purpose matcher is used with an accept-anything
        # AuthSet simply to list the JSON files; it also guarantees that
        # only regular files are returned.
        json_names = _get_matching_dir_entries(
            id_path, _AUTHSET_ANY, stat.S_ISREG, ".json",
        )

        for json_name in json_names:
            try:
                candidate = _check_object_from_file(
                    query, os.path.join(id_path, json_name),
                    allow_custom, version, encoding,
                )
            except IOError as e:
                if e.errno != errno.ENOENT:
                    raise
                # file-not-found is ok, just skip
                continue

            if candidate:
                matches.append(candidate)

    # For backward-compatibility, also search for plain files named after
    # object IDs, in the type directory.
    matches.extend(
        _search_unversioned(
            query, type_path, auth_ids, allow_custom, version, encoding,
        ),
    )

    return matches
def _search_unversioned(
    query, type_path, auth_ids, allow_custom, version, encoding,
):
    """
    Search a directory holding unversioned data (one JSON file per object,
    named after the object ID), and collect every object which matches the
    query.

    Args:
        query: The query to match against
        type_path: The directory with STIX files of unversioned type
        auth_ids: Search optimization based on object ID
        allow_custom (bool): Whether to allow custom properties as well
            unknown custom objects.
        version (str): If present, it forces the parser to use the version
            provided. Otherwise, the library will make the best effort based
            on checking the "spec_version" property.
        encoding (str): The encoding to use when reading a file from the
            filesystem.

    Returns:
        A list of all matching objects

    Raises:
        stix2.exceptions.STIXError: If any objects had invalid content
        TypeError: If any objects had invalid content
        IOError: If there were any problems opening/reading files
        OSError: If there were any problems opening/reading files
    """
    matches = []

    json_names = _get_matching_dir_entries(
        type_path, auth_ids, stat.S_ISREG, ".json",
    )
    for json_name in json_names:
        try:
            candidate = _check_object_from_file(
                query, os.path.join(type_path, json_name),
                allow_custom, version, encoding,
            )
        except IOError as e:
            if e.errno != errno.ENOENT:
                raise
            # file-not-found is ok, just skip
            continue

        if candidate:
            matches.append(candidate)

    return matches
class FileSystemStore(DataStoreMixin):
    """Interface to a file directory of STIX objects.

    FileSystemStore pairs a FileSystemSource with a FileSystemSink, both
    operating on the same directory.

    Args:
        stix_dir (str): path to directory of STIX objects
        allow_custom (bool): whether to allow custom STIX content to be
            pushed/retrieved. When left as None, the FileSystemSource side
            (retrieving data) allows it and the FileSystemSink side
            (pushing data) does not. When supplied, the value is applied
            to both the FileSystemSource and the FileSystemSink.
        bundlify (bool): whether to wrap objects in bundles when saving
            them. Default: False.
        encoding (str): The encoding to use when reading a file from the
            filesystem.

    Attributes:
        source (FileSystemSource): FileSystemSource
        sink (FileSystemSink): FileSystemSink

    """
    def __init__(self, stix_dir, allow_custom=None, bundlify=False, encoding='utf-8'):
        # None means "use each side's own default"; anything else is
        # applied to both sides.
        if allow_custom is not None:
            source_allows_custom = sink_allows_custom = allow_custom
        else:
            source_allows_custom, sink_allows_custom = True, False

        source = FileSystemSource(
            stix_dir=stix_dir,
            allow_custom=source_allows_custom,
            encoding=encoding,
        )
        sink = FileSystemSink(
            stix_dir=stix_dir,
            allow_custom=sink_allows_custom,
            bundlify=bundlify,
        )

        super(FileSystemStore, self).__init__(source=source, sink=sink)
class FileSystemSink(DataSink):
    """Interface for adding/pushing STIX objects to file directory of STIX
    objects.

    Can be paired with a FileSystemSource, together as the two
    components of a FileSystemStore.

    Args:
        stix_dir (str): path to directory of STIX objects.
        allow_custom (bool): Whether to allow custom STIX content to be
            added to the FileSystemSink. Default: False
        bundlify (bool): Whether to wrap objects in bundles when saving them.
            Default: False.

    Raises:
        ValueError: If ``stix_dir`` does not exist.

    """
    def __init__(self, stix_dir, allow_custom=False, bundlify=False):
        super(FileSystemSink, self).__init__()
        self._stix_dir = os.path.abspath(stix_dir)
        self.allow_custom = allow_custom
        self.bundlify = bundlify

        if not os.path.exists(self._stix_dir):
            raise ValueError("directory path for STIX data does not exist")

    @property
    def stix_dir(self):
        """The absolute path of the directory this sink writes to."""
        return self._stix_dir

    def _check_path_and_write(self, stix_obj, encoding='utf-8'):
        """Write the given STIX object to a file in the STIX file directory.

        Objects with a "modified" property are written to
        ``<stix_dir>/<type>/<id>/<modified timestamp>.json``; all others to
        ``<stix_dir>/<type>/<id>.json``. Missing directories are created.

        Args:
            stix_obj: The STIX object to write. It is indexed by "type",
                "id" and (if present) "modified".
            encoding (str): The encoding to use when writing the file.

        Raises:
            DataSourceError: If the target file already exists.
            OSError: If the directory or file could not be created.
        """
        type_dir = os.path.join(self._stix_dir, stix_obj["type"])

        # All versioned objects should have a "modified" property.
        if "modified" in stix_obj:
            filename = _timestamp2filename(stix_obj["modified"])
            obj_dir = os.path.join(type_dir, stix_obj["id"])
        else:
            filename = stix_obj["id"]
            obj_dir = type_dir

        file_path = os.path.join(obj_dir, filename + ".json")

        # exist_ok avoids the check-then-create race where another writer
        # creates the directory between an exists() test and makedirs().
        os.makedirs(obj_dir, exist_ok=True)

        if self.bundlify:
            if 'spec_version' in stix_obj:
                # Assuming future specs will allow multiple SDO/SROs
                # versions in a single bundle we won't need to check this
                # and just use the latest supported Bundle version.
                stix_obj = v21.Bundle(stix_obj, allow_custom=self.allow_custom)
            else:
                stix_obj = v20.Bundle(stix_obj, allow_custom=self.allow_custom)

        # Existing files are never replaced.
        if os.path.isfile(file_path):
            raise DataSourceError("Attempted to overwrite file (!) at: {}".format(file_path))

        with io.open(file_path, mode='w', encoding=encoding) as f:
            fp_serialize(stix_obj, f, pretty=True, encoding=encoding, ensure_ascii=False)

    def add(self, stix_data=None, version=None):
        """Add STIX objects to file directory.

        Args:
            stix_data (STIX object OR dict OR str OR list): valid STIX 2.0 content
                in a STIX object (or list of), dict (or list of), or a STIX 2.0
                json encoded string.
            version (str): If present, it forces the parser to use the version
                provided. Otherwise, the library will make the best effort based
                on checking the "spec_version" property.

        Raises:
            TypeError: If ``stix_data`` is not one of the supported kinds of
                input.

        Note:
            ``stix_data`` can be a Bundle object, but each object in it will be
            saved separately; you will be able to retrieve any of the objects
            the Bundle contained, but not the Bundle itself.

        """
        if isinstance(stix_data, (v20.Bundle, v21.Bundle)):
            # recursively add individual STIX objects
            for stix_obj in stix_data.get("objects", []):
                self.add(stix_obj, version=version)

        elif isinstance(stix_data, _STIXBase):
            # adding python STIX object
            self._check_path_and_write(stix_data)

        elif isinstance(stix_data, (str, dict)):
            parsed_data = parse(stix_data, allow_custom=self.allow_custom, version=version)
            if isinstance(parsed_data, _STIXBase):
                self.add(parsed_data, version=version)
            else:
                # custom unregistered object type
                self._check_path_and_write(parsed_data)

        elif isinstance(stix_data, list):
            # recursively add individual STIX objects; the requested version
            # must be forwarded, or str/dict elements would be parsed
            # without it.
            for stix_obj in stix_data:
                self.add(stix_obj, version=version)

        else:
            raise TypeError(
                "stix_data must be a STIX object (or list of), "
                "JSON formatted STIX (or list of), "
                "or a JSON formatted STIX bundle",
            )
class FileSystemSource(DataSource):
    """Interface for searching/retrieving STIX objects from a STIX object file
    directory.

    Can be paired with a FileSystemSink, together as the two
    components of a FileSystemStore.

    Args:
        stix_dir (str): path to directory of STIX objects
        allow_custom (bool): Whether to allow custom STIX content when
            parsing the objects read from the directory. Default: True
        encoding (str): The encoding to use when reading a file from the
            filesystem.

    """
    def __init__(self, stix_dir, allow_custom=True, encoding='utf-8'):
        super(FileSystemSource, self).__init__()
        self._stix_dir = os.path.abspath(stix_dir)
        self.allow_custom = allow_custom
        self.encoding = encoding

        directory_missing = not os.path.exists(self._stix_dir)
        if directory_missing:
            raise ValueError("directory path for STIX data does not exist: %s" % self._stix_dir)

    @property
    def stix_dir(self):
        """The absolute path of the directory this source reads from."""
        return self._stix_dir

    def get(self, stix_id, version=None, _composite_filters=None):
        """Retrieve STIX object from file directory via STIX ID.

        For objects which have a "modified" property, the one with the
        greatest "modified" value is returned.

        Args:
            stix_id (str): The STIX ID of the STIX object to be retrieved.
            _composite_filters (FilterSet): collection of filters passed from the parent
                CompositeDataSource, not user supplied
            version (str): If present, it forces the parser to use the version
                provided. Otherwise, the library will make the best effort based
                on checking the "spec_version" property.

        Returns:
            (STIX object): STIX object that has the supplied STIX ID, or
                None if no object was found. The STIX object is loaded from
                its json file and parsed into a python STIX object.

        """
        candidates = self.all_versions(
            stix_id, version=version, _composite_filters=_composite_filters,
        )
        if not candidates:
            return None

        # Simple check for a versioned STIX type: see if the objects have a
        # "modified" property. (Need only check one, since they are all of
        # the same type.)
        if "modified" in candidates[0]:
            by_modified = sorted(candidates, key=lambda obj: obj['modified'])
            return by_modified[-1]

        return candidates[0]

    def all_versions(self, stix_id, version=None, _composite_filters=None):
        """Retrieve STIX object from file directory via STIX ID, all versions.

        This is a query() with a single ID-equality filter.

        Args:
            stix_id (str): The STIX ID of the STIX objects to be retrieved.
            _composite_filters (FilterSet): collection of filters passed from
                the parent CompositeDataSource, not user supplied
            version (str): If present, it forces the parser to use the version
                provided. Otherwise, the library will make the best effort based
                on checking the "spec_version" property.

        Returns:
            (list): of STIX objects that have the supplied STIX ID.
                The STIX objects are loaded from their json files and parsed
                into python STIX objects.

        """
        id_filter = Filter("id", "=", stix_id)
        return self.query(
            [id_filter], version=version, _composite_filters=_composite_filters,
        )

    def query(self, query=None, version=None, _composite_filters=None):
        """Search and retrieve STIX objects based on the complete query.

        A "complete query" includes the filters from the query, the filters
        attached to this FileSystemSource, and any filters passed from a
        CompositeDataSource (i.e. _composite_filters).

        Args:
            query (list): list of filters to search on
            _composite_filters (FilterSet): collection of filters passed from
                the CompositeDataSource, not user supplied
            version (str): If present, it forces the parser to use the version
                provided. Otherwise, the library will make the best effort based
                on checking the "spec_version" property.

        Returns:
            (list): list of STIX objects that match the supplied
                query. The STIX objects are loaded from their json files
                and parsed into python STIX objects.

        """
        # combine all query filters
        complete_query = FilterSet(query)
        if self.filters:
            complete_query.add(self.filters)
        if _composite_filters:
            complete_query.add(_composite_filters)

        auth_types, auth_ids = _find_search_optimizations(complete_query)

        # Each sub-directory of the STIX directory holds one object type.
        type_dirs = _get_matching_dir_entries(
            self._stix_dir, auth_types, stat.S_ISDIR,
        )

        matches = []
        for type_dir in type_dirs:
            type_path = os.path.join(self._stix_dir, type_dir)

            # Both search functions take the same arguments, so pick one
            # according to the directory layout and call it.
            if _is_versioned_type_dir(type_path, type_dir):
                search = _search_versioned
            else:
                search = _search_unversioned

            matches.extend(
                search(
                    complete_query, type_path, auth_ids,
                    self.allow_custom, version, self.encoding,
                ),
            )

        return matches