Merge pull request #214 from chisholm/multi_version_memory_store

Multi version memory store
stix2.0
Chris Lenk 2018-10-23 12:01:10 -04:00 committed by GitHub
commit dbe3f7a000
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 233 additions and 92 deletions

View File

@ -123,8 +123,11 @@ def apply_common_filters(stix_objs, query):
Supports only STIX 2.0 common property properties.
Args:
stix_objs (list): list of STIX objects to apply the query to
query (set): set of filters (combined form complete query)
stix_objs (iterable): iterable of STIX objects to apply the query to
query (non-iterator iterable): iterable of filters. Can't be an
iterator (e.g. generator iterators won't work), since this is
used in an inner loop of a nested loop. So we require the ability
to traverse the filters repeatedly.
Yields:
STIX objects that successfully evaluate against the query.

View File

@ -1,27 +1,18 @@
"""
Python STIX 2.0 Memory Source/Sink
TODO:
Use deduplicate() calls only when memory corpus is dirty (been added to)
can save a lot of time for successive queries
Note:
Not worrying about STIX versioning. The in memory STIX data at anytime
will only hold one version of a STIX object. As such, when save() is called,
the single versions of all the STIX objects are what is written to file.
"""
import itertools
import json
import os
from stix2.base import _STIXBase
from stix2.core import Bundle, parse
from stix2.datastore import DataSink, DataSource, DataStoreMixin
from stix2.datastore.filters import Filter, FilterSet, apply_common_filters
from stix2.datastore.filters import FilterSet, apply_common_filters
def _add(store, stix_data=None, version=None):
def _add(store, stix_data=None, allow_custom=True, version=None):
"""Add STIX objects to MemoryStore/Sink.
Adds STIX objects to an in-memory dictionary for fast lookup.
@ -33,27 +24,77 @@ def _add(store, stix_data=None, version=None):
None, use latest version.
"""
if isinstance(stix_data, _STIXBase):
# adding a python STIX object
store._data[stix_data["id"]] = stix_data
elif isinstance(stix_data, dict):
if stix_data["type"] == "bundle":
# adding a json bundle - so just grab STIX objects
for stix_obj in stix_data.get("objects", []):
_add(store, stix_obj, version=version)
else:
# adding a json STIX object
store._data[stix_data["id"]] = stix_data
elif isinstance(stix_data, list):
if isinstance(stix_data, list):
# STIX objects are in a list- recurse on each object
for stix_obj in stix_data:
_add(store, stix_obj, version=version)
_add(store, stix_obj, allow_custom, version)
elif stix_data["type"] == "bundle":
# adding a json bundle - so just grab STIX objects
for stix_obj in stix_data.get("objects", []):
_add(store, stix_obj, allow_custom, version)
else:
raise TypeError("stix_data expected to be a python-stix2 object (or list of), JSON formatted STIX (or list of),"
" or a JSON formatted STIX bundle. stix_data was of type: " + str(type(stix_data)))
# Adding a single non-bundle object
if isinstance(stix_data, _STIXBase):
stix_obj = stix_data
else:
stix_obj = parse(stix_data, allow_custom, version)
# Map ID directly to the object, if it is a marking. Otherwise,
# map to a family, so we can track multiple versions.
if _is_marking(stix_obj):
store._data[stix_obj.id] = stix_obj
else:
if stix_obj.id in store._data:
obj_family = store._data[stix_obj.id]
else:
obj_family = _ObjectFamily()
store._data[stix_obj.id] = obj_family
obj_family.add(stix_obj)
def _is_marking(obj_or_id):
"""Determines whether the given object or object ID is/is for a marking
definition.
:param obj_or_id: A STIX object or object ID as a string.
:return: True if a marking definition, False otherwise.
"""
if isinstance(obj_or_id, _STIXBase):
id_ = obj_or_id.id
else:
id_ = obj_or_id
return id_.startswith("marking-definition--")
class _ObjectFamily(object):
"""
An internal implementation detail of memory sources/sinks/stores.
Represents a "family" of STIX objects: all objects with a particular
ID. (I.e. all versions.) The latest version is also tracked so that it
can be obtained quickly.
"""
def __init__(self):
self.all_versions = {}
self.latest_version = None
def add(self, obj):
self.all_versions[obj.modified] = obj
if self.latest_version is None or \
obj.modified > self.latest_version.modified:
self.latest_version = obj
def __str__(self):
return "<<{}; latest={}>>".format(self.all_versions,
self.latest_version.modified)
def __repr__(self):
return str(self)
class MemoryStore(DataStoreMixin):
@ -83,7 +124,7 @@ class MemoryStore(DataStoreMixin):
self._data = {}
if stix_data:
_add(self, stix_data, version=version)
_add(self, stix_data, allow_custom, version=version)
super(MemoryStore, self).__init__(
source=MemorySource(stix_data=self._data, allow_custom=allow_custom, version=version, _store=True),
@ -138,25 +179,32 @@ class MemorySink(DataSink):
"""
def __init__(self, stix_data=None, allow_custom=True, version=None, _store=False):
super(MemorySink, self).__init__()
self._data = {}
self.allow_custom = allow_custom
if _store:
self._data = stix_data
elif stix_data:
_add(self, stix_data, version=version)
else:
self._data = {}
if stix_data:
_add(self, stix_data, allow_custom, version=version)
def add(self, stix_data, version=None):
_add(self, stix_data, version=version)
_add(self, stix_data, self.allow_custom, version)
add.__doc__ = _add.__doc__
def save_to_file(self, file_path):
file_path = os.path.abspath(file_path)
all_objs = itertools.chain.from_iterable(
value.all_versions.values() if isinstance(value, _ObjectFamily)
else [value]
for value in self._data.values()
)
if not os.path.exists(os.path.dirname(file_path)):
os.makedirs(os.path.dirname(file_path))
with open(file_path, "w") as f:
f.write(str(Bundle(list(self._data.values()), allow_custom=self.allow_custom)))
f.write(str(Bundle(list(all_objs), allow_custom=self.allow_custom)))
save_to_file.__doc__ = MemoryStore.save_to_file.__doc__
@ -184,13 +232,14 @@ class MemorySource(DataSource):
"""
def __init__(self, stix_data=None, allow_custom=True, version=None, _store=False):
super(MemorySource, self).__init__()
self._data = {}
self.allow_custom = allow_custom
if _store:
self._data = stix_data
elif stix_data:
_add(self, stix_data, version=version)
else:
self._data = {}
if stix_data:
_add(self, stix_data, allow_custom, version=version)
def get(self, stix_id, _composite_filters=None):
"""Retrieve STIX object from in-memory dict via STIX ID.
@ -207,26 +256,26 @@ class MemorySource(DataSource):
is returned in the same form as it as added
"""
if _composite_filters is None:
# if get call is only based on 'id', no need to search, just retrieve from dict
try:
stix_obj = self._data[stix_id]
except KeyError:
stix_obj = None
return stix_obj
stix_obj = None
# if there are filters from the composite level, process full query
query = [Filter("id", "=", stix_id)]
all_data = self.query(query=query, _composite_filters=_composite_filters)
if all_data:
# reduce to most recent version
stix_obj = sorted(all_data, key=lambda k: k['modified'])[0]
return stix_obj
if _is_marking(stix_id):
stix_obj = self._data.get(stix_id)
else:
return None
object_family = self._data.get(stix_id)
if object_family:
stix_obj = object_family.latest_version
if stix_obj:
all_filters = list(
itertools.chain(
_composite_filters or [],
self.filters
)
)
stix_obj = next(apply_common_filters([stix_obj], all_filters), None)
return stix_obj
def all_versions(self, stix_id, _composite_filters=None):
"""Retrieve STIX objects from in-memory dict via STIX ID, all versions of it
@ -246,8 +295,30 @@ class MemorySource(DataSource):
is returned in the same form as it as added
"""
results = []
stix_objs_to_filter = None
if _is_marking(stix_id):
stix_obj = self._data.get(stix_id)
if stix_obj:
stix_objs_to_filter = [stix_obj]
else:
object_family = self._data.get(stix_id)
if object_family:
stix_objs_to_filter = object_family.all_versions.values()
return [self.get(stix_id=stix_id, _composite_filters=_composite_filters)]
if stix_objs_to_filter:
all_filters = list(
itertools.chain(
_composite_filters or [],
self.filters
)
)
results.extend(
apply_common_filters(stix_objs_to_filter, all_filters)
)
return results
def query(self, query=None, _composite_filters=None):
"""Search and retrieve STIX objects based on the complete query.
@ -265,7 +336,7 @@ class MemorySource(DataSource):
(list): list of STIX objects that matches the supplied
query. As the MemoryStore(i.e. MemorySink) adds STIX objects to memory
as they are supplied (either as python dictionary or STIX object), it
is returned in the same form as it as added.
is returned in the same form as it was added.
"""
query = FilterSet(query)
@ -276,17 +347,24 @@ class MemorySource(DataSource):
if _composite_filters:
query.add(_composite_filters)
all_objs = itertools.chain.from_iterable(
value.all_versions.values() if isinstance(value, _ObjectFamily)
else [value]
for value in self._data.values()
)
# Apply STIX common property filters.
all_data = list(apply_common_filters(self._data.values(), query))
all_data = list(apply_common_filters(all_objs, query))
return all_data
def load_from_file(self, file_path, version=None):
stix_data = json.load(open(os.path.abspath(file_path), "r"))
with open(os.path.abspath(file_path), "r") as f:
stix_data = json.load(f)
# Override user version selection if loading a bundle
if stix_data["type"] == "bundle":
for stix_obj in stix_data["objects"]:
_add(self, stix_data=parse(stix_obj, allow_custom=self.allow_custom, version=stix_data["spec_version"]))
else:
_add(self, stix_data=parse(stix_data, allow_custom=self.allow_custom, version=version))
version = stix_data["spec_version"]
_add(self, stix_data, self.allow_custom, version)
load_from_file.__doc__ = MemoryStore.load_from_file.__doc__

View File

@ -2,7 +2,9 @@ import pytest
from stix2.datastore import CompositeDataSource, make_id
from stix2.datastore.filters import Filter
from stix2.datastore.memory import MemorySink, MemorySource
from stix2.datastore.memory import MemorySink, MemorySource, MemoryStore
from stix2.utils import parse_into_datetime
from stix2.v20.common import TLP_GREEN
def test_add_remove_composite_datasource():
@ -44,14 +46,14 @@ def test_composite_datasource_operations(stix_objs1, stix_objs2):
indicators = cds1.all_versions("indicator--00000000-0000-4000-8000-000000000001")
# In STIX_OBJS2 changed the 'modified' property to a later time...
assert len(indicators) == 2
assert len(indicators) == 3
cds1.add_data_sources([cds2])
indicator = cds1.get("indicator--00000000-0000-4000-8000-000000000001")
assert indicator["id"] == "indicator--00000000-0000-4000-8000-000000000001"
assert indicator["modified"] == "2017-01-31T13:49:53.935Z"
assert indicator["modified"] == parse_into_datetime("2017-01-31T13:49:53.935Z")
assert indicator["type"] == "indicator"
query1 = [
@ -68,20 +70,80 @@ def test_composite_datasource_operations(stix_objs1, stix_objs2):
# STIX_OBJS2 has indicator with later time, one with different id, one with
# original time in STIX_OBJS1
assert len(results) == 3
assert len(results) == 4
indicator = cds1.get("indicator--00000000-0000-4000-8000-000000000001")
assert indicator["id"] == "indicator--00000000-0000-4000-8000-000000000001"
assert indicator["modified"] == "2017-01-31T13:49:53.935Z"
assert indicator["modified"] == parse_into_datetime("2017-01-31T13:49:53.935Z")
assert indicator["type"] == "indicator"
# There is only one indicator with different ID. Since we use the same data
# when deduplicated, only two indicators (one with different modified).
results = cds1.all_versions("indicator--00000000-0000-4000-8000-000000000001")
assert len(results) == 2
assert len(results) == 3
# Since we have filters already associated with our CompositeSource providing
# nothing returns the same as cds1.query(query1) (the associated query is query2)
results = cds1.query([])
assert len(results) == 3
assert len(results) == 4
def test_source_markings():
msrc = MemorySource(TLP_GREEN)
assert msrc.get(TLP_GREEN.id) == TLP_GREEN
assert msrc.all_versions(TLP_GREEN.id) == [TLP_GREEN]
assert msrc.query(Filter("id", "=", TLP_GREEN.id)) == [TLP_GREEN]
def test_sink_markings():
# just make sure there is no crash
msink = MemorySink(TLP_GREEN)
msink.add(TLP_GREEN)
def test_store_markings():
mstore = MemoryStore(TLP_GREEN)
assert mstore.get(TLP_GREEN.id) == TLP_GREEN
assert mstore.all_versions(TLP_GREEN.id) == [TLP_GREEN]
assert mstore.query(Filter("id", "=", TLP_GREEN.id)) == [TLP_GREEN]
def test_source_mixed(indicator):
msrc = MemorySource([TLP_GREEN, indicator])
assert msrc.get(TLP_GREEN.id) == TLP_GREEN
assert msrc.all_versions(TLP_GREEN.id) == [TLP_GREEN]
assert msrc.query(Filter("id", "=", TLP_GREEN.id)) == [TLP_GREEN]
assert msrc.get(indicator.id) == indicator
assert msrc.all_versions(indicator.id) == [indicator]
assert msrc.query(Filter("id", "=", indicator.id)) == [indicator]
all_objs = msrc.query()
assert TLP_GREEN in all_objs
assert indicator in all_objs
assert len(all_objs) == 2
def test_sink_mixed(indicator):
# just make sure there is no crash
msink = MemorySink([TLP_GREEN, indicator])
msink.add([TLP_GREEN, indicator])
def test_store_mixed(indicator):
mstore = MemoryStore([TLP_GREEN, indicator])
assert mstore.get(TLP_GREEN.id) == TLP_GREEN
assert mstore.all_versions(TLP_GREEN.id) == [TLP_GREEN]
assert mstore.query(Filter("id", "=", TLP_GREEN.id)) == [TLP_GREEN]
assert mstore.get(indicator.id) == indicator
assert mstore.all_versions(indicator.id) == [indicator]
assert mstore.query(Filter("id", "=", indicator.id)) == [indicator]
all_objs = mstore.query()
assert TLP_GREEN in all_objs
assert indicator in all_objs
assert len(all_objs) == 2

View File

@ -113,7 +113,7 @@ def test_environment_functions():
# Get both versions of the object
resp = env.all_versions(INDICATOR_ID)
assert len(resp) == 1 # should be 2, but MemoryStore only keeps 1 version of objects
assert len(resp) == 2
# Get just the most recent version of the object
resp = env.get(INDICATOR_ID)

View File

@ -7,6 +7,7 @@ from stix2 import (Bundle, Campaign, CustomObject, Filter, Identity, Indicator,
Malware, MemorySource, MemoryStore, Relationship,
properties)
from stix2.datastore import make_id
from stix2.utils import parse_into_datetime
from .constants import (CAMPAIGN_ID, CAMPAIGN_KWARGS, IDENTITY_ID,
IDENTITY_KWARGS, INDICATOR_ID, INDICATOR_KWARGS,
@ -167,7 +168,7 @@ def test_memory_store_all_versions(mem_store):
type="bundle"))
resp = mem_store.all_versions("indicator--00000000-0000-4000-8000-000000000001")
assert len(resp) == 1 # MemoryStore can only store 1 version of each object
assert len(resp) == 3
def test_memory_store_query(mem_store):
@ -179,25 +180,27 @@ def test_memory_store_query(mem_store):
def test_memory_store_query_single_filter(mem_store):
query = Filter('id', '=', 'indicator--00000000-0000-4000-8000-000000000001')
resp = mem_store.query(query)
assert len(resp) == 1
assert len(resp) == 2
def test_memory_store_query_empty_query(mem_store):
resp = mem_store.query()
# sort since returned in random order
resp = sorted(resp, key=lambda k: k['id'])
assert len(resp) == 2
resp = sorted(resp, key=lambda k: (k['id'], k['modified']))
assert len(resp) == 3
assert resp[0]['id'] == 'indicator--00000000-0000-4000-8000-000000000001'
assert resp[0]['modified'] == '2017-01-27T13:49:53.936Z'
assert resp[1]['id'] == 'indicator--00000000-0000-4000-8000-000000000002'
assert resp[1]['modified'] == '2017-01-27T13:49:53.935Z'
assert resp[0]['modified'] == parse_into_datetime('2017-01-27T13:49:53.935Z')
assert resp[1]['id'] == 'indicator--00000000-0000-4000-8000-000000000001'
assert resp[1]['modified'] == parse_into_datetime('2017-01-27T13:49:53.936Z')
assert resp[2]['id'] == 'indicator--00000000-0000-4000-8000-000000000002'
assert resp[2]['modified'] == parse_into_datetime('2017-01-27T13:49:53.935Z')
def test_memory_store_query_multiple_filters(mem_store):
mem_store.source.filters.add(Filter('type', '=', 'indicator'))
query = Filter('id', '=', 'indicator--00000000-0000-4000-8000-000000000001')
resp = mem_store.query(query)
assert len(resp) == 1
assert len(resp) == 2
def test_memory_store_save_load_file(mem_store, fs_mem_store):
@ -218,12 +221,8 @@ def test_memory_store_save_load_file(mem_store, fs_mem_store):
def test_memory_store_add_invalid_object(mem_store):
ind = ('indicator', IND1) # tuple isn't valid
with pytest.raises(TypeError) as excinfo:
with pytest.raises(TypeError):
mem_store.add(ind)
assert 'stix_data expected to be' in str(excinfo.value)
assert 'a python-stix2 object' in str(excinfo.value)
assert 'JSON formatted STIX' in str(excinfo.value)
assert 'JSON formatted STIX bundle' in str(excinfo.value)
def test_memory_store_object_with_custom_property(mem_store):
@ -246,10 +245,9 @@ def test_memory_store_object_with_custom_property_in_bundle(mem_store):
allow_custom=True)
bundle = Bundle(camp, allow_custom=True)
mem_store.add(bundle, True)
mem_store.add(bundle)
bundle_r = mem_store.get(bundle.id)
camp_r = bundle_r['objects'][0]
camp_r = mem_store.get(camp.id)
assert camp_r.id == camp.id
assert camp_r.x_empire == camp.x_empire