cti-python-stix2/stix2/datastore/filesystem.py

"""Python STIX2 FileSystem Source/Sink"""
import errno
import io
import json
import os
import re
import stat

from stix2 import v20, v21
from stix2.base import _STIXBase
from stix2.datastore import (
    DataSink, DataSource, DataSourceError, DataStoreMixin,
)
from stix2.datastore.filters import Filter, FilterSet, apply_common_filters
from stix2.parsing import parse
from stix2.serialization import fp_serialize
from stix2.utils import format_datetime, get_type_from_id, parse_into_datetime


def _timestamp2filename(timestamp):
    """
    Encapsulates a way to create unique filenames based on an object's
    "modified" property value.  This should not include an extension.

    Args:
        timestamp: A timestamp, as a datetime.datetime object or string.

    """
    # The format_datetime will determine the correct level of precision.
    if isinstance(timestamp, str):
        timestamp = parse_into_datetime(timestamp)
    ts = format_datetime(timestamp)
    ts = re.sub(r"[-T:\.Z ]", "", ts)
    return ts


class AuthSet(object):
    """
    Represents either a whitelist or blacklist of values, where/what we
    must/must not search to find objects which match a query.  (Maybe "AuthSet"
    isn't the right name, but determining authorization is a typical context in
    which black/white lists are used.)

    The set may be empty.  For a whitelist, this means you mustn't search
    anywhere, which means the query was impossible to match, so you can skip
    searching altogether.  For a blacklist, this means nothing is excluded
    and you must search everywhere.

    """
    BLACK = 0
    WHITE = 1

    def __init__(self, allowed, prohibited):
        """
        Initialize this AuthSet from the given sets of allowed and/or
        prohibited values.  The type of set (black or white) is determined
        from the allowed and/or prohibited values given.

        Args:
            allowed: A set of allowed values (or None if no allow filters
                were found in the query)
            prohibited: A set of prohibited values (not None)

        """
        if allowed is None:
            self.__values = prohibited
            self.__type = AuthSet.BLACK

        else:
            # There was at least one allow filter, so create a whitelist.  But
            # any matching prohibited values create a combination of conditions
            # which can never match.  So exclude those.
            self.__values = allowed - prohibited
            self.__type = AuthSet.WHITE

    @property
    def values(self):
        """
        Get the values in this white/blacklist, as a set.
        """
        return self.__values

    @property
    def auth_type(self):
        """
        Get the type of set: AuthSet.WHITE or AuthSet.BLACK.
        """
        return self.__type

    def __repr__(self):
        return "{}list: {}".format(
            "white" if self.auth_type == AuthSet.WHITE else "black",
            self.values,
        )


# A fixed, reusable AuthSet which accepts anything.  It came in handy.
_AUTHSET_ANY = AuthSet(None, set())


def _update_allow(allow_set, value):
    """
    Updates the given set of "allow" values.  The first time an update to the
    set occurs, the value(s) are added.  Thereafter, since all filters are
    implicitly AND'd, the given values are intersected with the existing allow
    set, which may remove values.  At the end, it may even wind up empty.

    Args:
        allow_set: The allow set, or None
        value: The value(s) to add (single value, or iterable of values)

    Returns:
        The updated allow set (not None)

    """
    adding_seq = hasattr(value, "__iter__") and \
        not isinstance(value, str)

    if allow_set is None:
        allow_set = set()
        if adding_seq:
            allow_set.update(value)
        else:
            allow_set.add(value)
    else:
        # strangely, the "&=" operator requires a set on the RHS
        # whereas the method allows any iterable.
        if adding_seq:
            allow_set.intersection_update(value)
        else:
            allow_set.intersection_update({value})

    return allow_set


def _find_search_optimizations(filters):
    """
    Searches through all the filters, and creates white/blacklists of types and
    IDs, which can be used to optimize the filesystem search.

    Args:
        filters: An iterable of filter objects representing a query

    Returns:
        A 2-tuple of AuthSet objects: the first is for object types, and
        the second is for object IDs.

    """
    # The basic approach to this is to determine what is allowed and
    # prohibited, independently, and then combine them to create the final
    # white/blacklists.

    allowed_types = allowed_ids = None
    prohibited_types = set()
    prohibited_ids = set()

    for filter_ in filters:
        if filter_.property == "type":
            if filter_.op in ("=", "in"):
                allowed_types = _update_allow(allowed_types, filter_.value)
            elif filter_.op == "!=":
                prohibited_types.add(filter_.value)

        elif filter_.property == "id":
            if filter_.op == "=":
                # An "allow" ID filter implies a type filter too, since IDs
                # contain types within them.
                allowed_ids = _update_allow(allowed_ids, filter_.value)
                allowed_types = _update_allow(
                    allowed_types,
                    get_type_from_id(filter_.value),
                )
            elif filter_.op == "!=":
                prohibited_ids.add(filter_.value)
            elif filter_.op == "in":
                allowed_ids = _update_allow(allowed_ids, filter_.value)
                allowed_types = _update_allow(
                    allowed_types, (
                        get_type_from_id(id_) for id_ in filter_.value
                    ),
                )

    opt_types = AuthSet(allowed_types, prohibited_types)
    opt_ids = AuthSet(allowed_ids, prohibited_ids)

    # If we have both type and ID whitelists, perform a type-based intersection
    # on them, to further optimize.  (Some of the cross-property constraints
    # occur above; this is essentially a second pass which operates on the
    # final whitelists, which among other things, incorporates any of the
    # prohibitions found above.)
    if opt_types.auth_type == AuthSet.WHITE and \
            opt_ids.auth_type == AuthSet.WHITE:

        opt_types.values.intersection_update(
            get_type_from_id(id_) for id_ in opt_ids.values
        )

        opt_ids.values.intersection_update(
            id_ for id_ in opt_ids.values
            if get_type_from_id(id_) in opt_types.values
        )

    return opt_types, opt_ids


def _get_matching_dir_entries(parent_dir, auth_set, st_mode_test=None, ext=""):
    """
    Search a directory (non-recursively), and find entries which match the
    given criteria.

    Args:
        parent_dir: The directory to search
        auth_set: an AuthSet instance, which represents a black/whitelist
            filter on filenames
        st_mode_test: A callable allowing filtering based on the type of
            directory entry.  E.g. just get directories, or just get files.  It
            will be passed the st_mode field of a stat() structure and should
            return True to include the file, or False to exclude it.  Easy thing to
            do is pass one of the stat module functions, e.g. stat.S_ISREG.  If
            None, don't filter based on entry type.
        ext: Determines how names from auth_set match up to directory
            entries, and allows filtering by extension.  The extension is added
            to auth_set values to obtain directory entries; it is removed from
            directory entries to obtain auth_set values.  In this way, auth_set
            may be treated as having only "basenames" of the entries.  Only entries
            having the given extension will be included in the results.  If not
            empty, the extension MUST include a leading ".".  The default is the
            empty string, which will result in direct comparisons, and no
            extension-based filtering.

    Returns:
        (list): A list of directory entries matching the criteria.  These will not
            have any path info included; they will just be bare names.

    Raises:
        OSError: If there are errors accessing directory contents or stat()'ing
            files

    """
    results = []
    if auth_set.auth_type == AuthSet.WHITE:
        for value in auth_set.values:
            filename = value + ext
            try:
                if st_mode_test:
                    s = os.stat(os.path.join(parent_dir, filename))
                    type_pass = st_mode_test(s.st_mode)
                else:
                    type_pass = True

                if type_pass:
                    results.append(filename)
            except OSError as e:
                if e.errno != errno.ENOENT:
                    raise
                # else, file-not-found is ok, just skip
    else:  # auth_set is a blacklist
        for entry in os.listdir(parent_dir):
            if ext:
                auth_name, this_ext = os.path.splitext(entry)
                if this_ext != ext:
                    continue
            else:
                auth_name = entry

            if auth_name in auth_set.values:
                continue

            try:
                if st_mode_test:
                    s = os.stat(os.path.join(parent_dir, entry))
                    type_pass = st_mode_test(s.st_mode)
                else:
                    type_pass = True

                if type_pass:
                    results.append(entry)
            except OSError as e:
                if e.errno != errno.ENOENT:
                    raise
                # else, file-not-found is ok, just skip

    return results


def _check_object_from_file(query, filepath, allow_custom, version, encoding):
    """
    Read a STIX object from the given file, and check it against the given
    filters.

    Args:
        query: Iterable of filters
        filepath (str): Path to file to read
        allow_custom (bool): Whether to allow custom properties as well unknown
        custom objects.
        version (str): If present, it forces the parser to use the version
            provided. Otherwise, the library will make the best effort based
            on checking the "spec_version" property.
        encoding (str): The encoding to use when reading a file from the
            filesystem.

    Returns:
        The (parsed) STIX object, if the object passes the filters.  If
        not, None is returned.

    Raises:
        TypeError: If the file had invalid JSON
        IOError: If there are problems opening/reading the file
        stix2.exceptions.STIXError: If there were problems creating a STIX
            object from the JSON

    """
    try:
        with io.open(filepath, "r", encoding=encoding) as f:
            stix_json = json.load(f)
    except ValueError:  # not a JSON file
        raise TypeError(
            "STIX JSON object at '{0}' could either not be parsed "
            "to JSON or was not valid STIX JSON".format(filepath),
        )

    stix_obj = parse(stix_json, allow_custom, version)

    if stix_obj["type"] == "bundle":
        stix_obj = stix_obj["objects"][0]

    # check against other filters, add if match
    result = next(apply_common_filters([stix_obj], query), None)

    return result


def _is_versioned_type_dir(type_path, type_name):
    """
    Try to detect whether the given directory is for a versioned type of STIX
    object.  This is done by looking for a directory whose name is a STIX ID
    of the appropriate type.  If found, treat this type as versioned.  This
    doesn't work when a versioned type directory is empty (it will be
    mis-classified as unversioned), but this detection is only necessary when
    reading/querying data.  If a directory is empty, you'll get no results
    either way.

    Args:
        type_path: A path to a directory containing one type of STIX object.
        type_name: The STIX type name.

    Returns:
        True if the directory looks like it contains versioned objects; False
        if not.

    Raises:
        OSError: If there are errors accessing directory contents or stat()'ing
            files
    """
    id_regex = re.compile(
        r"^" + re.escape(type_name) +
        r"--[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}"
        r"-[0-9a-f]{12}$",
        re.I,
    )

    for entry in os.listdir(type_path):
        s = os.stat(os.path.join(type_path, entry))
        if stat.S_ISDIR(s.st_mode) and id_regex.match(entry):
            is_versioned = True
            break
    else:
        is_versioned = False

    return is_versioned


def _search_versioned(query, type_path, auth_ids, allow_custom, version, encoding):
    """
    Searches the given directory, which contains data for STIX objects of a
    particular versioned type, and return any which match the query.

    Args:
        query: The query to match against
        type_path: The directory with type-specific STIX object files
        auth_ids: Search optimization based on object ID
        allow_custom (bool): Whether to allow custom properties as well unknown
            custom objects.
        version (str): If present, it forces the parser to use the version
            provided. Otherwise, the library will make the best effort based
            on checking the "spec_version" property.
        encoding (str): The encoding to use when reading a file from the
            filesystem.

    Returns:
        A list of all matching objects

    Raises:
        stix2.exceptions.STIXError: If any objects had invalid content
        TypeError: If any objects had invalid content
        IOError: If there were any problems opening/reading files
        OSError: If there were any problems opening/reading files

    """
    results = []
    id_dirs = _get_matching_dir_entries(
        type_path, auth_ids,
        stat.S_ISDIR,
    )
    for id_dir in id_dirs:
        id_path = os.path.join(type_path, id_dir)

        # This leverages a more sophisticated function to do a simple thing:
        # get all the JSON files from a directory.  I guess it does give us
        # file type checking, ensuring we only get regular files.
        version_files = _get_matching_dir_entries(
            id_path, _AUTHSET_ANY,
            stat.S_ISREG, ".json",
        )
        for version_file in version_files:
            version_path = os.path.join(id_path, version_file)

            try:
                stix_obj = _check_object_from_file(
                    query, version_path,
                    allow_custom, version,
                    encoding,
                )
                if stix_obj:
                    results.append(stix_obj)
            except IOError as e:
                if e.errno != errno.ENOENT:
                    raise
                # else, file-not-found is ok, just skip

    # For backward-compatibility, also search for plain files named after
    # object IDs, in the type directory.
    backcompat_results = _search_unversioned(
        query, type_path, auth_ids, allow_custom, version, encoding,
    )
    results.extend(backcompat_results)

    return results


def _search_unversioned(
    query, type_path, auth_ids, allow_custom, version, encoding,
):
    """
    Searches the given directory, which contains unversioned data, and return
    any objects which match the query.

    Args:
        query: The query to match against
        type_path: The directory with STIX files of unversioned type
        auth_ids: Search optimization based on object ID
        allow_custom (bool): Whether to allow custom properties as well unknown
            custom objects.
        version (str): If present, it forces the parser to use the version
            provided. Otherwise, the library will make the best effort based
            on checking the "spec_version" property.
        encoding (str): The encoding to use when reading a file from the
            filesystem.

    Returns:
        A list of all matching objects

    Raises:
        stix2.exceptions.STIXError: If any objects had invalid content
        TypeError: If any objects had invalid content
        IOError: If there were any problems opening/reading files
        OSError: If there were any problems opening/reading files

    """
    results = []
    id_files = _get_matching_dir_entries(
        type_path, auth_ids, stat.S_ISREG,
        ".json",
    )
    for id_file in id_files:
        id_path = os.path.join(type_path, id_file)

        try:
            stix_obj = _check_object_from_file(
                query, id_path, allow_custom,
                version, encoding,
            )
            if stix_obj:
                results.append(stix_obj)
        except IOError as e:
            if e.errno != errno.ENOENT:
                raise
            # else, file-not-found is ok, just skip

    return results


class FileSystemStore(DataStoreMixin):
    """Interface to a file directory of STIX objects.

    FileSystemStore is a wrapper around a paired FileSystemSink
    and FileSystemSource.

    Args:
        stix_dir (str): path to directory of STIX objects
        allow_custom (bool): whether to allow custom STIX content to be
            pushed/retrieved. Defaults to True for FileSystemSource side
            (retrieving data) and False for FileSystemSink
            side(pushing data). However, when parameter is supplied, it
            will be applied to both FileSystemSource and FileSystemSink.
        bundlify (bool): whether to wrap objects in bundles when saving
            them. Default: False.
        encoding (str): The encoding to use when reading a file from the
            filesystem.

    Attributes:
        source (FileSystemSource): FileSystemSource
        sink (FileSystemSink): FileSystemSink

    """
    def __init__(self, stix_dir, allow_custom=None, bundlify=False, encoding='utf-8'):
        if allow_custom is None:
            allow_custom_source = True
            allow_custom_sink = False
        else:
            allow_custom_sink = allow_custom_source = allow_custom

        super(FileSystemStore, self).__init__(
            source=FileSystemSource(stix_dir=stix_dir, allow_custom=allow_custom_source, encoding=encoding),
            sink=FileSystemSink(stix_dir=stix_dir, allow_custom=allow_custom_sink, bundlify=bundlify),
        )


class FileSystemSink(DataSink):
    """Interface for adding/pushing STIX objects to file directory of STIX
    objects.

    Can be paired with a FileSystemSource, together as the two
    components of a FileSystemStore.

    Args:
        stix_dir (str): path to directory of STIX objects.
        allow_custom (bool): Whether to allow custom STIX content to be
            added to the FileSystemSource. Default: False
        bundlify (bool): Whether to wrap objects in bundles when saving them.
            Default: False.

    """
    def __init__(self, stix_dir, allow_custom=False, bundlify=False):
        super(FileSystemSink, self).__init__()
        self._stix_dir = os.path.abspath(stix_dir)
        self.allow_custom = allow_custom
        self.bundlify = bundlify

        if not os.path.exists(self._stix_dir):
            raise ValueError("directory path for STIX data does not exist")

    @property
    def stix_dir(self):
        return self._stix_dir

    def _check_path_and_write(self, stix_obj, encoding='utf-8'):
        """Write the given STIX object to a file in the STIX file directory.
        """
        type_dir = os.path.join(self._stix_dir, stix_obj["type"])

        # All versioned objects should have a "modified" property.
        if "modified" in stix_obj:
            filename = _timestamp2filename(stix_obj["modified"])
            obj_dir = os.path.join(type_dir, stix_obj["id"])
        else:
            filename = stix_obj["id"]
            obj_dir = type_dir

        file_path = os.path.join(obj_dir, filename + ".json")

        if not os.path.exists(obj_dir):
            os.makedirs(obj_dir)

        if self.bundlify:
            if 'spec_version' in stix_obj:
                # Assuming future specs will allow multiple SDO/SROs
                # versions in a single bundle we won't need to check this
                # and just use the latest supported Bundle version.
                stix_obj = v21.Bundle(stix_obj, allow_custom=self.allow_custom)
            else:
                stix_obj = v20.Bundle(stix_obj, allow_custom=self.allow_custom)

        if os.path.isfile(file_path):
            raise DataSourceError("Attempted to overwrite file (!) at: {}".format(file_path))

        with io.open(file_path, mode='w', encoding=encoding) as f:
            fp_serialize(stix_obj, f, pretty=True, encoding=encoding, ensure_ascii=False)

    def add(self, stix_data=None, version=None):
        """Add STIX objects to file directory.

        Args:
            stix_data (STIX object OR dict OR str OR list): valid STIX 2.0 content
                in a STIX object (or list of), dict (or list of), or a STIX 2.0
                json encoded string.
            version (str): If present, it forces the parser to use the version
                provided. Otherwise, the library will make the best effort based
                on checking the "spec_version" property.

        Note:
            ``stix_data`` can be a Bundle object, but each object in it will be
            saved separately; you will be able to retrieve any of the objects
            the Bundle contained, but not the Bundle itself.

        """
        if isinstance(stix_data, (v20.Bundle, v21.Bundle)):
            # recursively add individual STIX objects
            for stix_obj in stix_data.get("objects", []):
                self.add(stix_obj, version=version)

        elif isinstance(stix_data, _STIXBase):
            # adding python STIX object
            self._check_path_and_write(stix_data)

        elif isinstance(stix_data, (str, dict)):
            parsed_data = parse(stix_data, allow_custom=self.allow_custom, version=version)
            if isinstance(parsed_data, _STIXBase):
                self.add(parsed_data, version=version)
            else:
                # custom unregistered object type
                self._check_path_and_write(parsed_data)

        elif isinstance(stix_data, list):
            # recursively add individual STIX objects
            for stix_obj in stix_data:
                self.add(stix_obj)

        else:
            raise TypeError(
                "stix_data must be a STIX object (or list of), "
                "JSON formatted STIX (or list of), "
                "or a JSON formatted STIX bundle",
            )


class FileSystemSource(DataSource):
    """Interface for searching/retrieving STIX objects from a STIX object file
    directory.

    Can be paired with a FileSystemSink, together as the two
    components of a FileSystemStore.

    Args:
        stix_dir (str): path to directory of STIX objects
        allow_custom (bool): Whether to allow custom STIX content to be
            added to the FileSystemSink. Default: True
        encoding (str): The encoding to use when reading a file from the
            filesystem.

    """
    def __init__(self, stix_dir, allow_custom=True, encoding='utf-8'):
        super(FileSystemSource, self).__init__()
        self._stix_dir = os.path.abspath(stix_dir)
        self.allow_custom = allow_custom
        self.encoding = encoding

        if not os.path.exists(self._stix_dir):
            raise ValueError("directory path for STIX data does not exist: %s" % self._stix_dir)

    @property
    def stix_dir(self):
        return self._stix_dir

    def get(self, stix_id, version=None, _composite_filters=None):
        """Retrieve STIX object from file directory via STIX ID.

        Args:
            stix_id (str): The STIX ID of the STIX object to be retrieved.
            _composite_filters (FilterSet): collection of filters passed from the parent
                CompositeDataSource, not user supplied
            version (str): If present, it forces the parser to use the version
                provided. Otherwise, the library will make the best effort based
                on checking the "spec_version" property.

        Returns:
            (STIX object): STIX object that has the supplied STIX ID.
                The STIX object is loaded from its json file, parsed into
                a python STIX object and then returned

        """
        all_data = self.all_versions(stix_id, version=version, _composite_filters=_composite_filters)

        if all_data:
            # Simple check for a versioned STIX type: see if the objects have a
            # "modified" property.  (Need only check one, since they are all of
            # the same type.)
            is_versioned = "modified" in all_data[0]
            if is_versioned:
                stix_obj = sorted(all_data, key=lambda k: k['modified'])[-1]
            else:
                stix_obj = all_data[0]
        else:
            stix_obj = None

        return stix_obj

    def all_versions(self, stix_id, version=None, _composite_filters=None):
        """Retrieve STIX object from file directory via STIX ID, all versions.

        Note: Since FileSystem sources/sinks don't handle multiple versions
        of a STIX object, this operation is unnecessary. Pass call to get().

        Args:
            stix_id (str): The STIX ID of the STIX objects to be retrieved.
            _composite_filters (FilterSet): collection of filters passed from
                the parent CompositeDataSource, not user supplied
            version (str): If present, it forces the parser to use the version
                provided. Otherwise, the library will make the best effort based
                on checking the "spec_version" property.

        Returns:
            (list): of STIX objects that has the supplied STIX ID.
                The STIX objects are loaded from their json files, parsed into
                a python STIX objects and then returned

        """
        query = [Filter("id", "=", stix_id)]
        return self.query(query, version=version, _composite_filters=_composite_filters)

    def query(self, query=None, version=None, _composite_filters=None):
        """Search and retrieve STIX objects based on the complete query.

        A "complete query" includes the filters from the query, the filters
        attached to this FileSystemSource, and any filters passed from a
        CompositeDataSource (i.e. _composite_filters).

        Args:
            query (list): list of filters to search on
            _composite_filters (FilterSet): collection of filters passed from
                the CompositeDataSource, not user supplied
            version (str): If present, it forces the parser to use the version
                provided. Otherwise, the library will make the best effort based
                on checking the "spec_version" property.

        Returns:
            (list): list of STIX objects that matches the supplied
                query. The STIX objects are loaded from their json files,
                parsed into a python STIX objects and then returned.

        """
        all_data = []
        query = FilterSet(query)

        # combine all query filters
        if self.filters:
            query.add(self.filters)
        if _composite_filters:
            query.add(_composite_filters)

        auth_types, auth_ids = _find_search_optimizations(query)
        type_dirs = _get_matching_dir_entries(
            self._stix_dir, auth_types,
            stat.S_ISDIR,
        )
        for type_dir in type_dirs:
            type_path = os.path.join(self._stix_dir, type_dir)
            type_is_versioned = _is_versioned_type_dir(type_path, type_dir)
            if type_is_versioned:
                type_results = _search_versioned(
                    query, type_path, auth_ids,
                    self.allow_custom, version,
                    self.encoding,
                )
            else:
                type_results = _search_unversioned(
                    query, type_path, auth_ids,
                    self.allow_custom, version,
                    self.encoding,
                )
            all_data.extend(type_results)

        return all_data