Data* suite reorg, fixing bugs

stix2.1
mbastian1135 2017-07-12 10:58:31 -04:00 committed by Greg Back
parent 20958b908a
commit 9d72d60706
4 changed files with 1380 additions and 602 deletions

File diff suppressed because it is too large Load Diff

256
stix2/sources/filesystem.py Normal file
View File

@ -0,0 +1,256 @@
"""
Python STIX 2.0 FileSystem Source/Sink
Classes:
FileSystemStore
FileSystemSink
FileSystemSource
TODO: Test everything
"""
import json
import os
from sources import DataSink, DataSource, DataStore, make_id
from stix2 import Bundle
class FileSystemStore(DataStore):
"""
"""
def __init__(self, stix_dir="stix_data", source=None, sink=None, name="FileSystemStore"):
self.name = name
self.id = make_id()
if source:
self.source = source
else:
self.source = FileSystemSource(stix_dir=stix_dir)
if sink:
self.sink = sink
else:
self.sink = FileSystemSink(stix_dir=stix_dir)
@property
def source(self):
return self.source
@source.setter
def source(self, source):
self.source = source
@property
def sink(self):
return self.sink
@sink.setter
def sink(self, sink):
self.sink = sink
# file system sink API calls
def add(self, stix_objs):
return self.sink.add(stix_objs=stix_objs)
def remove(self, stix_ids):
return self.sink.remove(stix_ids=stix_ids)
# file sytem source API calls
def get(self, stix_id):
return self.source.get(stix_id=stix_id)
def all_versions(self, stix_id):
return self.source.all_versions(stix_id=stix_id)
def query(self, query):
return self.source.query(query=query)
class FileSystemSink(DataSink):
"""
"""
def __init__(self, stix_dir="stix_data", name="FileSystemSink"):
super(FileSystemSink, self).__init__(name=name)
self.stix_dir = os.path.abspath(stix_dir)
# check directory path exists
if not os.path.exists(self.stix_dir):
print("Error: directory path for STIX data does not exist")
@property
def stix_dir(self):
return self.stix_dir
@stix_dir.setter
def stix_dir(self, dir):
self.stix_dir = dir
def add(self, stix_objs=None):
"""
Q: bundlify or no?
"""
if not stix_objs:
stix_objs = []
for stix_obj in stix_objs:
path = os.path.join(self.stix_dir, stix_obj["type"], stix_obj["id"])
json.dump(Bundle([stix_obj]), open(path, 'w+', indent=4))
def remove(self, stix_ids=None):
if not stix_ids:
stix_ids = []
for stix_id in stix_ids:
stix_type = stix_id.split("--")[0]
try:
os.remove(os.path.join(self.stix_dir, stix_type, stix_id))
except OSError:
# log error? nonexistent object in data with directory
continue
class FileSystemSource(DataSource):
"""
"""
def __init__(self, stix_dir="stix_data", name="FileSystemSource"):
super(FileSystemSource, self).__init__(name=name)
self.stix_dir = os.path.abspath(stix_dir)
# check directory path exists
if not os.path.exists(self.stix_dir):
print("Error: directory path for STIX data does not exist")
@property
def stix_dir(self):
return self.stix_dir
@stix_dir.setter
def stix_dir(self, dir):
self.stix_dir = dir
def get(self, stix_id, _composite_filters=None):
"""
"""
query = [
{
"field": "id",
"op": "=",
"value": stix_id
}
]
all_data = self.query(query=query, _composite_filters=_composite_filters)
stix_obj = sorted(all_data, key=lambda k: k['modified'])[0]
return stix_obj
def all_versions(self, stix_id, _composite_filters=None):
"""
NOTE: since FileSystem sources/sinks dont handle mutliple verions of a STIX object,
this operation is futile. Pass call to get(). (Appoved by G.B.)
"""
# query = [
# {
# "field": "id",
# "op": "=",
# "value": stix_id
# }
# ]
# all_data = self.query(query=query, _composite_filters=_composite_filters)
return [self.get(stix_id=stix_id, _composite_filters=_composite_filters)]
def query(self, query=None, _composite_filters=None):
"""
"""
all_data = []
if query is None:
query = []
# combine all query filters
if self.filters:
query.extend(self.filters.values())
if _composite_filters:
query.extend(_composite_filters)
# extract any filters that are for "type" or "id" , as we can then do
# filtering before reading in the STIX objects. A STIX 'type' filter
# can reduce the query to a single sub-directory. A STIX 'id' filter
# allows for the fast checking of the file names versus loading it.
file_filters = self._parse_file_filters(query)
# establish which subdirectories can be avoided in query
# by decluding as many as possible. A filter with "type" as the field
# means that certain STIX object types can be ruled out, and thus
# the corresponding subdirectories as well
include_paths = []
declude_paths = []
if "type" in [filter_['field'] for filter_ in file_filters]:
for filter_ in file_filters:
if filter_['field'] == "type":
if filter_['op'] == '=':
include_paths.append(os.path.join(self.stix_dir, filter_['value']))
elif filter_['op'] == "!=":
declude_paths.append(os.path.join(self.stix_dir, filter_['value']))
else:
# have to walk entire STIX directory
include_paths.append(self.stix_dir)
# if a user specifies a "type" filter like "type = <stix-object_type>",
# the filter is reducing the search space to single stix object types
# (and thus single directories). This makes such a filter more powerful
# than "type != <stix-object_type>" bc the latter is substracting
# only one type of stix object type (and thus only one directory),
# As such the former type of filters are given preference over the latter;
# i.e. if both exist in a query, that latter type will be ignored
if not include_paths:
# user has specified types that are not wanted (i.e. "!=")
# so query will look in all STIX directories that are not
# the specified type. Compile correct dir paths
for dir_ in os.listdir(self.stix_dir):
if os.path.abspath(dir_) not in declude_paths:
include_paths.append(os.path.abspath(dir_))
# grab stix object ID as well - if present in filters, as
# may forgo the loading of STIX content into memory
if "id" in [filter_['field'] for filter_ in file_filters]:
for filter_ in file_filters:
if filter_['field'] == 'id' and filter_['field'] == '=':
id_ = filter_['value']
else:
id_ = None
# now iterate through all STIX objs
for path in include_paths:
for root, dirs, files in os.walk(path):
for file in files:
if id_:
if id_ == file.split(".")[0]:
# since ID is specified in one of filters, can evaluate against filename first without loading
stix_obj = json.load(file)['objects']
# check against other filters, add if match
all_data.extend(self.apply_common_filters([stix_obj], query))
else:
# have to load into memory regardless to evaluate other filters
stix_obj = json.load(file)['objects']
all_data.extend(self.apply_common_filters([stix_obj], query))
all_data = self.deduplicate(all_data)
return all_data
def _parse_file_filters(self, query):
"""
"""
file_filters = []
for filter_ in query:
if filter_['field'] == "id" or filter_['field'] == "type":
file_filters.append(filter_)
return file_filters

268
stix2/sources/memory.py Normal file
View File

@ -0,0 +1,268 @@
"""
Python STIX 2.0 Memory Source/Sink
Classes:
MemoryStore
MemorySink
MemorySource
TODO: Test everything.
TODO: Use deduplicate() calls only when memory corpus is dirty (been added to)
can save a lot of time for successive queries
NOTE: Not worrying about STIX versioning. The in memory STIX data at anytime
will only hold one version of a STIX object. As such, when save() is called,
the single versions of all the STIX objects are what is written to file.
"""
import json
import os
from stix2 import Bundle
from stix2.sources import DataSink, DataSource, DataStore, make_id
from stix2validator import validate_string
class MemoryStore(DataStore):
"""
"""
def __init__(self, stix_data=None, source=None, sink=None, name="MemoryStore"):
self.name = name
self.id = make_id()
if source:
self.source = source
else:
self.source = MemorySource(stix_data=stix_data)
if sink:
self.sink = sink
else:
self.sink = MemorySink(stix_data=stix_data)
@property
def source(self):
return self.source
@source.setter
def source(self, source):
self.source = source
@property
def sink(self):
return self.sink
@sink.setter
def sink(self, sink):
self.sink = sink
# memory sink API calls
def add(self, stix_data):
return self.sink.add(stix_data=stix_data)
def remove(self, stix_ids):
return self.sink.remove(stix_ids=stix_ids)
def save(self):
return self.sink.save()
# memory source API calls
def get(self, stix_id):
return self.source.get(stix_id=stix_id)
def all_versions(self, stix_id):
return self.source.all_versions(stix_id=stix_id)
def query(self, query):
return self.source.query(query=query)
class MemorySink(DataSink):
"""
"""
def __init__(self, stix_data=None, name="MemorySink"):
"""
Args:
data (dictionary OR list): valid STIX 2.0 content in bundle or a list
name (string): optional name tag of the data source
"""
super(MemorySink, self).__init__(name=name)
self.data = {}
if stix_data:
if type(stix_data) == dict:
# stix objects are in a bundle
# verify STIX json data
r = validate_string(json.dumps(stix_data))
# make dictionary of the objects for easy lookup
if r.is_valid:
for stix_obj in stix_data["objects"]:
self.data[stix_obj["id"]] = stix_obj
else:
print("Error: json data passed to MemorySink() was found to not be validated by STIX 2 Validator")
print(r)
self.data = {}
elif type(stix_data) == list:
# stix objects are in a list
for stix_obj in stix_data:
r = validate_string(json.dumps(stix_obj))
if r.is_valid:
self.data[stix_obj["id"]] = stix_obj
else:
print("Error: STIX object %s is not valid under STIX 2 validator.") % stix_obj["id"]
print(r)
else:
raise ValueError("stix_data must be in bundle format or raw list")
def add(self, stix_data):
"""
"""
if type(stix_data) == dict:
# stix data is in bundle
r = validate_string(json.dumps(stix_data))
if r.is_valid:
for stix_obj in stix_data["objects"]:
self.data[stix_obj["id"]] = stix_obj
else:
print("Error: json data passed to MemorySink() was found to not be validated by STIX 2 Validator")
print(r)
elif type(stix_data) == list:
# stix data is in list
for stix_obj in stix_data:
r = validate_string(json.dumps(stix_obj))
if r.is_valid:
self.data[stix_obj["id"]] = stix_obj
else:
print("Error: STIX object %s is not valid under STIX 2 validator.") % stix_obj["id"]
print(r)
else:
raise ValueError("stix_data must be in bundle format or raw list")
def remove(self, stix_ids):
"""
"""
for stix_id in stix_ids:
try:
del self.data[stix_id]
except KeyError:
pass
def save(self, file_path=None):
"""
"""
if not file_path:
file_path = os.path.dirname(os.path.realpath(__file__))
json.dump(Bundle(self.data.values()), file_path, indent=4)
class MemorySource(DataSource):
def __init__(self, stix_data=None, name="MemorySource"):
"""
Args:
data (dictionary OR list): valid STIX 2.0 content in bundle or list
name (string): optional name tag of the data source
"""
super(MemorySource, self).__init__(name=name)
self.data = {}
if stix_data:
if type(stix_data) == dict:
# stix objects are in a bundle
# verify STIX json data
r = validate_string(json.dumps(stix_data))
# make dictionary of the objects for easy lookup
if r.is_valid:
for stix_obj in stix_data["objects"]:
self.data[stix_obj["id"]] = stix_obj
else:
print("Error: json data passed to MemorySink() was found to not be validated by STIX 2 Validator")
print(r)
self.data = {}
elif type(stix_data) == list:
# stix objects are in a list
for stix_obj in stix_data:
r = validate_string(json.dumps(stix_obj))
if r.is_valid:
self.data[stix_obj["id"]] = stix_obj
else:
print("Error: STIX object %s is not valid under STIX 2 validator.") % stix_obj["id"]
print(r)
else:
raise ValueError("stix_data must be in bundle format or raw list")
def get(self, stix_id, _composite_filters=None):
"""
"""
if _composite_filters is None:
# if get call is only based on 'id', no need to search, just retrieve from dict
try:
stix_obj = self.data[stix_id]
except KeyError:
stix_obj = None
return stix_obj
# if there are filters from the composite level, process full query
query = [
{
"field": "id",
"op": "=",
"value": stix_id
}
]
all_data = self.query(query=query, _composite_filters=_composite_filters)
# reduce to most recent version
stix_obj = sorted(all_data, key=lambda k: k['modified'])[0]
return stix_obj
def all_versions(self, stix_id, _composite_filters=None):
"""
NOTE: since Memory sources/sinks dont handle mutliple verions of a STIX object,
this operation is futile. Translate call to get(). (Appoved by G.B.)
"""
# query = [
# {
# "field": "id",
# "op": "=",
# "value": stix_id
# }
# ]
# all_data = self.query(query=query, _composite_filters=_composite_filters)
return [self.get(stix_id=stix_id, _composite_filters=_composite_filters)]
def query(self, query=None, _composite_filters=None):
"""
"""
if query is None:
query = []
# combine all query filters
if self.filters:
query.extend(self.filters.values())
if _composite_filters:
query.extend(_composite_filters)
# deduplicate data before filtering -> Deduplication is not required as Memory only ever holds one version of an object
# all_data = self.depuplicate(all_data)
# apply STIX common property filters
all_data = self.apply_common_filters(self.data.values(), query)
return all_data

View File

@ -1,131 +1,213 @@
import requests
from requests.auth import HTTPBasicAuth
"""
Python STIX 2.0 TAXII Source/Sink
from stix2.sources import DataSource
Classes:
TAXIICollectionStore
TAXIICollectionSink
TAXIICollectionSource
# TODO: -Should we make properties for the TAXIIDataSource address and other
# possible variables that are found in "self.taxii_info"
TODO: Test everything
"""
import json
import uuid
from stix2.sources import DataSink, DataSource, DataStore, make_id
from taxii2_client import TAXII2Client
TAXII_FILTERS = ['added_after', 'id', 'type', 'version']
test = True
class TAXIICollectionStore(DataStore):
"""
"""
def __init__(self,
source=None,
sink=None,
server_uri=None,
api_root_name=None,
collection_id=None,
user=None,
password=None,
name="TAXIICollectionStore"):
self.name = name
self.id = make_id()
if source:
self.source = source
else:
self.source = TAXIICollectionSource(server_uri, api_root_name, collection_id, user, password)
if sink:
self.sink = sink
else:
self.TAXIICollectionSink(server_uri, api_root_name, collection_id, user, password)
@property
def source(self):
return self.source
@source.setter
def source(self, source):
self.source = source
@property
def sink(self):
return self.sink
@sink.setter
def sink(self, sink):
self.sink = sink
# file system sink API calls
def add(self, stix_objs):
return self.sink.add(stix_objs=stix_objs)
# file sytem source API calls
def get(self, stix_id):
return self.source.get(stix_id=stix_id)
def all_versions(self, stix_id):
return self.source.all_versions(stix_id=stix_id)
def query(self, query):
return self.source.query(query=query)
class TAXIIDataSource(DataSource):
"""STIX 2.0 Data Source - TAXII 2.0 module"""
class TAXIICollectionSink(DataSink):
"""
"""
def __init__(self, api_root=None, auth=None, name="TAXII"):
super(TAXIIDataSource, self).__init__(name=name)
def __init__(self, server_uri=None, api_root_name=None, collection_id=None, user=None, password=None, name="TAXIICollectionSink"):
super(TAXIICollectionSink, self).__init__(name=name)
if not api_root:
api_root = "http://localhost:5000"
if not auth:
auth = {"user": "admin", "pass": "taxii"}
self.taxii_client = TAXII2Client(server_uri, user, password)
self.taxii_client.populate_available_information()
self.taxii_info = {
"api_root": {
"url": api_root
},
"auth": auth
}
if test:
return
try:
# check api-root is reachable/exists and grab api collections
coll_url = self.taxii_info['api_root']['url'] + "/collections/"
headers = {}
resp = requests.get(coll_url,
headers=headers,
auth=HTTPBasicAuth(self.taxii_info['auth']['user'],
self.taxii_info['auth']['pass']))
# TESTING
# print("\n-------__init__() ----\n")
# print(resp.text)
# print("\n")
# print(resp.status_code)
# END TESTING
# raise http error if request returned error code
resp.raise_for_status()
resp_json = resp.json()
try:
self.taxii_info['api_root']['collections'] = resp_json['collections']
except KeyError as e:
if e == "collections":
raise
# raise type(e), type(e)(e.message +
# "To connect to the TAXII collections, the API root
# resource must contain a collection endpoint URL.
# This was not found in the API root resource received
# from the API root" ), sys.exc_info()[2]
except requests.ConnectionError as e:
raise
# raise type(e), type(e)(e.message +
# "Attempting to connect to %s" % coll_url)
def get(self, id_, _composite_filters=None):
"""Get STIX 2.0 object from TAXII source by specified 'id'
Notes:
Just pass _composite_filters to the query() as they are applied
there. de-duplication of results is also done within query()
Args:
id_ (str): id of STIX object to retrieve
_composite_filters (list): filters passed from a Composite Data
Source (if this data source is attached to one)
Returns:
if not api_root_name:
raise ValueError("No api_root specified.")
else:
self.api_root = None
for a_r in self.taxii_client.api_roots:
if api_root_name == a_r.name:
self.api_root = a_r
break
if not self.api_root:
raise ValueError("The api_root %s is not found on this taxii server" % api_root_name)
if not collection_id:
raise ValueError("No collection specified.")
else:
self.collection = None
for c in self.api_root.collections:
if c.id_ == collection_id:
self.collection = c
break
if not self.collection:
raise ValueError("The collection %s is not found on the api_root %s of this taxii server" %
(collection_id, api_root_name))
def save(self, stix_obj):
"""
"""
self.collection.add_objects(self.create_bundle([json.loads(str(stix_obj))]))
# make query in TAXII query format since 'id' is TAXii field
query = [
{
"field": "match[id]",
"op": "=",
"value": id_
}
]
@staticmethod
def create_bundle(objects):
return dict(id="bundle--" + str(uuid.uuid4()),
objects=objects,
spec_version="2.0",
type="bundle")
all_data = self.query(query=query, _composite_filters=_composite_filters)
# utility functions for the current set collection and api root
def get_api_root_info(self):
"""
"""
return self.api_root.get_information()
# reduce to most recent version
stix_obj = sorted(all_data, key=lambda k: k['modified'])[0]
def get_api_root_collections(self):
"""
"""
return self.api_root.get_collections()
def get_collection_manifest(self):
"""
"""
return self.collection.get_collection_manifest()
class TAXIICollectionSource(DataSource):
"""
"""
def __init__(self, server_uri=None, api_root_name=None, collection_id=None, user=None, password=None, name="TAXIICollectionSourc"):
super(TAXIICollectionSource, self).__init__(name=name)
self.taxii_client = TAXII2Client(server_uri, user, password)
self.taxii_client.populate_available_information()
if not api_root_name:
raise ValueError("No api_root specified.")
else:
self.api_root = None
for a_r in self.taxii_client.api_roots:
if api_root_name == a_r.name:
self.api_root = a_r
break
if not self.api_root:
raise ValueError("The api_root %s is not found on this taxii server" % api_root_name)
if not collection_id:
raise ValueError("No collection specified.")
else:
self.collection = None
for c in self.api_root.collections:
if c.id_ == collection_id:
self.collection = c
break
if not self.collection:
raise ValueError("The collection %s is not found on the api_root %s of this taxii server" %
(collection_id, api_root_name))
def get(self, stix_id, _composite_filters=None):
"""
"""
# combine all query filters
query = []
if self.filters:
query.extend(self.filters.values())
if _composite_filters:
query.extend(_composite_filters)
# separate taxii query terms (can be done remotely)
taxii_filters = self._parse_taxii_filters(query)
stix_objs = self.collection.get_object(stix_id, taxii_filters)["objects"]
stix_obj = self.apply_common_filters(stix_objs, query)
if len(stix_obj) > 0:
stix_obj = stix_obj[0]
else:
stix_obj = None
return stix_obj
def all_versions(self, id_, _composite_filters=None):
"""Get all versions of STIX 2.0 object from TAXII source by
specified 'id'
Notes:
Just passes _composite_filters to the query() as they are applied
there. de-duplication of results is also done within query()
Args:
id_ (str): id of STIX objects to retrieve
_composite_filters (list): filters passed from a Composite Data
Source (if this data source is attached to one)
Returns:
The query results with filters applied.
def all_versions(self, stix_id, _composite_filters=None):
"""
"""
# make query in TAXII query format since 'id' is TAXII field
query = [
{
"field": "match[id]",
"op": "=",
"value": id_
"value": stix_id
},
{
"field": "match[version]",
"op": "=",
"value": "all"
}
]
@ -134,84 +216,22 @@ class TAXIIDataSource(DataSource):
return all_data
def query(self, query=None, _composite_filters=None):
"""Query the TAXII data source for STIX objects matching the query
The final full query could contain filters from:
-the current API call
-Composite Data source filters (that are passed in via
'_composite_filters')
-TAXII data source filters that are attached
TAXII filters ['added_after', 'match[<>]'] are extracted and sent
to TAXII if they are present
TODO: Authentication for TAXII
Args:
query(list): list of filters (dicts) to search on
_composite_filters (list): filters passed from a
Composite Data Source (if this data source is attached to one)
Returns:
"""
all_data = []
"""
if query is None:
query = []
# combine all query filters
if self.filters:
query += self.filters.values()
query.extend(self.filters.values())
if _composite_filters:
query += _composite_filters
query.extend(_composite_filters)
# separate taxii query terms (can be done remotely)
taxii_filters = self._parse_taxii_filters(query)
# for each collection endpoint - send query request
for collection in self.taxii_info['api_root']['collections']:
coll_obj_url = "/".join([self.taxii_info['api_root']['url'],
"collections", str(collection['id']),
"objects"])
headers = {}
try:
resp = requests.get(coll_obj_url,
params=taxii_filters,
headers=headers,
auth=HTTPBasicAuth(self.taxii_info['auth']['user'],
self.taxii_info['auth']['pass']))
# TESTING
# print("\n-------query() ----\n")
# print("Request that was sent: \n")
# print(resp.url)
# print("Response: \n")
# print(json.dumps(resp.json(),indent=4))
# print("\n")
# print(resp.status_code)
# print("------------------")
# END TESTING
# raise http error if request returned error code
resp.raise_for_status()
resp_json = resp.json()
# grab all STIX 2.0 objects in json response
for stix_obj in resp_json['objects']:
all_data.append(stix_obj)
except requests.exceptions.RequestException as e:
raise e
# raise type(e), type(e)(e.message +
# "Attempting to connect to %s" % coll_url)
# TODO: Is there a way to collect exceptions while carrying
# on then raise all of them at the end?
# query TAXII collection
all_data = self.collection.get_objects(filters=taxii_filters)["objects"]
# deduplicate data (before filtering as reduces wasted filtering)
all_data = self.deduplicate(all_data)
@ -224,11 +244,10 @@ class TAXIIDataSource(DataSource):
def _parse_taxii_filters(self, query):
"""Parse out TAXII filters that the TAXII server can filter on
TAXII filters should be analgous to how they are supplied
in the url to the TAXII endpoint. For instance
For instance
"?match[type]=indicator,sighting" should be in a query dict as follows
{
"field": "match[type]"
"field": "type"
"op": "=",
"value": "indicator,sighting"
}
@ -244,19 +263,62 @@ class TAXIIDataSource(DataSource):
params = {}
for q in query:
if q['field'] in TAXII_FILTERS:
if q['field'] == 'added_after':
params[q['field']] = q['value']
for filter_ in query:
if filter_["field"] in TAXII_FILTERS:
if filter_["field"] == "added_after":
params[filter_["field"]] = filter_["value"]
else:
taxii_field = 'match[' + q['field'] + ']'
params[taxii_field] = q['value']
taxii_field = "match[" + filter_["field"] + ']'
params[taxii_field] = filter_["value"]
return params
def close(self):
"""Close down the Data Source - if any clean up is required.
# utility functions for the current attached collection and api root
def get_api_root_info(self):
"""
pass
"""
return self.api_root.get_information()
# TODO: - getters/setters (properties) for TAXII config info
def get_api_root_collections(self):
"""
"""
return self.api_root.get_collections()
def get_collection_manifest(self):
"""
"""
return self.collection.get_collection_manifest()
def get_server_api_roots(taxii_client):
"""
"""
api_root_info = []
taxii_client.populate_available_information()
for api_root in taxii_client.api_roots:
api_root_info.append(api_root.information())
return api_root_info
def get_server_collections(taxii_client):
"""
"""
server_collections = []
taxii_client.populate_available_information()
for api_root in taxii_client.api_roots:
server_collections.extend(api_root.get_collections())
return server_collections
def get_api_root_collections(taxii_client, api_root_name):
"""
"""
taxii_client.populate_available_information()
for api_root in taxii_client.api_roots:
if api_root == api_root_name:
return api_root.get_collections()