From ace64c4042984439536ce4fcbbe500ca2215dfa7 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Mon, 21 Dec 2020 17:53:53 -0500 Subject: [PATCH 01/23] provide pagination support for requests in the TAXIICollectionSource --- stix2/datastore/taxii.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/stix2/datastore/taxii.py b/stix2/datastore/taxii.py index 41d1e54..1600253 100644 --- a/stix2/datastore/taxii.py +++ b/stix2/datastore/taxii.py @@ -12,6 +12,8 @@ from stix2.parsing import parse from stix2.utils import deduplicate try: + from taxii2client import v20 as tcv20 + from taxii2client import v21 as tcv21 from taxii2client.exceptions import ValidationError _taxii2_client = True except ImportError: @@ -144,9 +146,12 @@ class TAXIICollectionSource(DataSource): collection (taxii2.Collection): TAXII Collection instance allow_custom (bool): Whether to allow custom STIX content to be added to the FileSystemSink. Default: True + items_per_page (int): How many STIX objects to request per call + to TAXII Server. This value is tunable, but servers may override + if their internal limit is surpassed. """ - def __init__(self, collection, allow_custom=True): + def __init__(self, collection, allow_custom=True, items_per_page=5000): super(TAXIICollectionSource, self).__init__() if not _taxii2_client: raise ImportError("taxii2client library is required for usage of TAXIICollectionSource") @@ -167,6 +172,7 @@ class TAXIICollectionSource(DataSource): ) self.allow_custom = allow_custom + self.items_per_page = items_per_page def get(self, stix_id, version=None, _composite_filters=None): """Retrieve STIX object from local/remote STIX Collection @@ -286,8 +292,19 @@ class TAXIICollectionSource(DataSource): taxii_filters_dict = dict((f.property, f.value) for f in taxii_filters) # query TAXII collection + all_data = [] try: - all_data = self.collection.get_objects(**taxii_filters_dict).get('objects', []) + if isinstance(self.collection, tcv21.Collection): + envelope = self.collection.get_objects(**taxii_filters_dict) + all_data.extend(envelope.get("objects", [])) + + # The while loop will not be executed if the response is received in full. + while envelope.get("more", False): + envelope = self.collection.get_objects(limit=self.items_per_page, next=envelope.get("next", "")) + all_data.extend(envelope.get("objects", [])) + else: + for bundle in tcv20.as_pages(self.collection.get_objects, per_request=self.items_per_page): + all_data.extend(bundle.get("objects", [])) # deduplicate data (before filtering as reduces wasted filtering) all_data = deduplicate(all_data) From 76eebeb549bb8bfbedee944bae356eebbe236abe Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Tue, 22 Dec 2020 16:52:27 -0500 Subject: [PATCH 02/23] expose **taxii_filters_dict on requests --- stix2/datastore/taxii.py | 4 ++-- stix2/test/v20/test_datastore_taxii.py | 11 +++++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/stix2/datastore/taxii.py b/stix2/datastore/taxii.py index 1600253..61d2366 100644 --- a/stix2/datastore/taxii.py +++ b/stix2/datastore/taxii.py @@ -300,10 +300,10 @@ class TAXIICollectionSource(DataSource): # The while loop will not be executed if the response is received in full. while envelope.get("more", False): - envelope = self.collection.get_objects(limit=self.items_per_page, next=envelope.get("next", "")) + envelope = self.collection.get_objects(limit=self.items_per_page, next=envelope.get("next", ""), **taxii_filters_dict) all_data.extend(envelope.get("objects", [])) else: - for bundle in tcv20.as_pages(self.collection.get_objects, per_request=self.items_per_page): + for bundle in tcv20.as_pages(self.collection.get_objects, per_request=self.items_per_page, **taxii_filters_dict): all_data.extend(bundle.get("objects", [])) # deduplicate data (before filtering as reduces wasted filtering) diff --git a/stix2/test/v20/test_datastore_taxii.py b/stix2/test/v20/test_datastore_taxii.py index 0b21981..cd051f1 100644 --- a/stix2/test/v20/test_datastore_taxii.py +++ b/stix2/test/v20/test_datastore_taxii.py @@ -5,7 +5,7 @@ import pytest from requests.models import Response import six from taxii2client.common import _filter_kwargs_to_query_params -from taxii2client.v20 import Collection +from taxii2client.v20 import MEDIA_TYPE_STIX_V20, Collection import stix2 from stix2.datastore import DataSourceError @@ -35,12 +35,12 @@ class MockTAXIICollectionEndpoint(Collection): { "date_added": get_timestamp(), "id": object["id"], - "media_type": "application/stix+json;version=2.1", + "media_type": "application/stix+json;version=2.0", "version": object.get("modified", object.get("created", get_timestamp())), }, ) - def get_objects(self, **filter_kwargs): + def get_objects(self, accept=MEDIA_TYPE_STIX_V20, start=0, per_request=0, **filter_kwargs): self._verify_can_read() query_params = _filter_kwargs_to_query_params(filter_kwargs) assert isinstance(query_params, dict) @@ -52,7 +52,10 @@ class MockTAXIICollectionEndpoint(Collection): 100, )[0] if objs: - return stix2.v20.Bundle(objects=objs) + resp = Response() + resp.encoding = "utf-8" + resp._content = six.ensure_binary(stix2.v20.Bundle(objects=objs).serialize(ensure_ascii=False)) + return resp else: resp = Response() resp.status_code = 404 From a7eb4113deb931e8cf744d0711d66fa695d6405d Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Tue, 2 Feb 2021 00:04:04 -0500 Subject: [PATCH 03/23] minor change to align API --- stix2/datastore/taxii.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/stix2/datastore/taxii.py b/stix2/datastore/taxii.py index 61d2366..a0e2706 100644 --- a/stix2/datastore/taxii.py +++ b/stix2/datastore/taxii.py @@ -35,9 +35,12 @@ class TAXIICollectionStore(DataStoreMixin): side(retrieving data) and False for TAXIICollectionSink side(pushing data). However, when parameter is supplied, it will be applied to both TAXIICollectionSource/Sink. + items_per_page (int): How many STIX objects to request per call + to TAXII Server. The value can be tuned, but servers may override + if their internal limit is surpassed. Used by TAXIICollectionSource """ - def __init__(self, collection, allow_custom=None): + def __init__(self, collection, allow_custom=None, items_per_page=5000): if allow_custom is None: allow_custom_source = True allow_custom_sink = False @@ -45,7 +48,7 @@ class TAXIICollectionStore(DataStoreMixin): allow_custom_sink = allow_custom_source = allow_custom super(TAXIICollectionStore, self).__init__( - source=TAXIICollectionSource(collection, allow_custom=allow_custom_source), + source=TAXIICollectionSource(collection, allow_custom=allow_custom_source, items_per_page=items_per_page), sink=TAXIICollectionSink(collection, allow_custom=allow_custom_sink), ) @@ -147,7 +150,7 @@ class TAXIICollectionSource(DataSource): allow_custom (bool): Whether to allow custom STIX content to be added to the FileSystemSink. Default: True items_per_page (int): How many STIX objects to request per call - to TAXII Server. This value is tunable, but servers may override + to TAXII Server. The value can be tuned, but servers may override if their internal limit is surpassed. """ @@ -295,12 +298,7 @@ class TAXIICollectionSource(DataSource): all_data = [] try: if isinstance(self.collection, tcv21.Collection): - envelope = self.collection.get_objects(**taxii_filters_dict) - all_data.extend(envelope.get("objects", [])) - - # The while loop will not be executed if the response is received in full. - while envelope.get("more", False): - envelope = self.collection.get_objects(limit=self.items_per_page, next=envelope.get("next", ""), **taxii_filters_dict) + for envelope in tcv21.as_pages(self.collection.get_objects, per_request=self.items_per_page, **taxii_filters_dict): all_data.extend(envelope.get("objects", [])) else: for bundle in tcv20.as_pages(self.collection.get_objects, per_request=self.items_per_page, **taxii_filters_dict): From 30fd8c3464dd58895ad87d42dc584b2fee6ceb1c Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Tue, 2 Feb 2021 00:08:11 -0500 Subject: [PATCH 04/23] compact calls --- stix2/datastore/taxii.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/stix2/datastore/taxii.py b/stix2/datastore/taxii.py index a0e2706..9ad6df9 100644 --- a/stix2/datastore/taxii.py +++ b/stix2/datastore/taxii.py @@ -297,12 +297,10 @@ class TAXIICollectionSource(DataSource): # query TAXII collection all_data = [] try: - if isinstance(self.collection, tcv21.Collection): - for envelope in tcv21.as_pages(self.collection.get_objects, per_request=self.items_per_page, **taxii_filters_dict): - all_data.extend(envelope.get("objects", [])) - else: - for bundle in tcv20.as_pages(self.collection.get_objects, per_request=self.items_per_page, **taxii_filters_dict): - all_data.extend(bundle.get("objects", [])) + paged_request = tcv21.as_pages if isinstance(self.collection, tcv21.Collection) else tcv20.as_pages + + for resource in paged_request(self.collection.get_objects, per_request=self.items_per_page, **taxii_filters_dict): + all_data.extend(resource.get("objects", [])) # deduplicate data (before filtering as reduces wasted filtering) all_data = deduplicate(all_data) From f9a52eeed3236c9721d889d4b32e6f48cce1c120 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Fri, 26 Feb 2021 19:19:33 -0500 Subject: [PATCH 05/23] WIP: changes to both similarity functions, expose settings --- stix2/environment.py | 18 ++-- stix2/equivalence/graph/__init__.py | 62 ++++--------- stix2/equivalence/object/__init__.py | 126 ++++++++++++++++++++------- 3 files changed, 123 insertions(+), 83 deletions(-) diff --git a/stix2/environment.py b/stix2/environment.py index d0f694e..75e5fa5 100644 --- a/stix2/environment.py +++ b/stix2/environment.py @@ -189,7 +189,8 @@ class Environment(DataStoreMixin): return None @staticmethod - def object_similarity(obj1, obj2, prop_scores={}, **weight_dict): + def object_similarity(obj1, obj2, prop_scores={}, ignore_spec_version=False, + versioning_checks=False, max_depth=1, **weight_dict): """This method returns a measure of how similar the two objects are. Args: @@ -220,10 +221,12 @@ class Environment(DataStoreMixin): see `the Committee Note `__. """ - return object_similarity(obj1, obj2, prop_scores, **weight_dict) + return object_similarity(obj1, obj2, prop_scores, ignore_spec_version, + versioning_checks, max_depth, **weight_dict) @staticmethod - def object_equivalence(obj1, obj2, prop_scores={}, threshold=70, **weight_dict): + def object_equivalence(obj1, obj2, prop_scores={}, threshold=70, ignore_spec_version=False, + versioning_checks=False, max_depth=1, **weight_dict): """This method returns a true/false value if two objects are semantically equivalent. Internally, it calls the object_similarity function and compares it against the given threshold value. @@ -263,7 +266,8 @@ class Environment(DataStoreMixin): return object_equivalence(obj1, obj2, prop_scores, threshold, **weight_dict) @staticmethod - def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): + def graph_similarity(ds1, ds2, prop_scores={}, ignore_spec_version=False, + versioning_checks=False, max_depth=1, **weight_dict): """This method returns a similarity score for two given graphs. Each DataStore can contain a connected or disconnected graph and the final result is weighted over the amount of objects we managed to compare. @@ -298,10 +302,12 @@ class Environment(DataStoreMixin): see `the Committee Note `__. """ - return graph_similarity(ds1, ds2, prop_scores, **weight_dict) + return graph_similarity(ds1, ds2, prop_scores, ignore_spec_version, + versioning_checks, max_depth, **weight_dict) @staticmethod - def graph_equivalence(ds1, ds2, prop_scores={}, threshold=70, **weight_dict): + def graph_equivalence(ds1, ds2, prop_scores={}, threshold=70, ignore_spec_version=False, + versioning_checks=False, max_depth=1, **weight_dict): """This method returns a true/false value if two graphs are semantically equivalent. Internally, it calls the graph_similarity function and compares it against the given threshold value. diff --git a/stix2/equivalence/graph/__init__.py b/stix2/equivalence/graph/__init__.py index e78624e..1dcccf1 100644 --- a/stix2/equivalence/graph/__init__.py +++ b/stix2/equivalence/graph/__init__.py @@ -53,7 +53,8 @@ def graph_equivalence(ds1, ds2, prop_scores={}, threshold=70, **weight_dict): return False -def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): +def graph_similarity(ds1, ds2, prop_scores={}, ignore_spec_version=False, + versioning_checks=False, max_depth=1, **weight_dict): """This method returns a similarity score for two given graphs. Each DataStore can contain a connected or disconnected graph and the final result is weighted over the amount of objects we managed to compare. @@ -65,6 +66,9 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): ds2: A DataStore object instance representing your graph prop_scores: A dictionary that can hold individual property scores, weights, contributing score, matching score and sum of weights. + ignore_spec_version: As + versioning_checks: As + max_depth: As weight_dict: A dictionary that can be used to override settings in the similarity process @@ -90,13 +94,21 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): """ results = {} similarity_score = 0 - weights = GRAPH_WEIGHTS.copy() + weights = WEIGHTS.copy() if weight_dict: weights.update(weight_dict) + weights["_internal"] = { + "ignore_spec_version": ignore_spec_version, + "versioning_checks": versioning_checks, + "ds1": ds1, + "ds2": ds2, + "max_depth": max_depth, + } + if weights["_internal"]["max_depth"] <= 0: - raise ValueError("weight_dict['_internal']['max_depth'] must be greater than 0") + raise ValueError("'max_depth' must be greater than 0") pairs = _object_pairs( _bucket_per_type(ds1.query([])), @@ -104,16 +116,15 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): weights, ) - weights["_internal"]["ds1"] = ds1 - weights["_internal"]["ds2"] = ds2 - logger.debug("Starting graph similarity process between DataStores: '%s' and '%s'", ds1.id, ds2.id) for object1, object2 in pairs: iprop_score = {} object1_id = object1["id"] object2_id = object2["id"] - result = object_similarity(object1, object2, iprop_score, **weights) + result = object_similarity(object1, object2, iprop_score, ds1, ds2, + ignore_spec_version, versioning_checks, + max_depth, **weights) if object1_id not in results: results[object1_id] = {"lhs": object1_id, "rhs": object2_id, "prop_score": iprop_score, "value": result} @@ -141,40 +152,3 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): similarity_score, ) return similarity_score - - -# default weights used for the graph similarity process -GRAPH_WEIGHTS = WEIGHTS.copy() -GRAPH_WEIGHTS.update({ - "grouping": { - "name": (20, partial_string_based), - "context": (20, partial_string_based), - "object_refs": (60, list_reference_check), - }, - "relationship": { - "relationship_type": (20, exact_match), - "source_ref": (40, reference_check), - "target_ref": (40, reference_check), - }, - "report": { - "name": (30, partial_string_based), - "published": (10, partial_timestamp_based), - "object_refs": (60, list_reference_check), - "tdelta": 1, # One day interval - }, - "sighting": { - "first_seen": (5, partial_timestamp_based), - "last_seen": (5, partial_timestamp_based), - "sighting_of_ref": (40, reference_check), - "observed_data_refs": (20, list_reference_check), - "where_sighted_refs": (20, list_reference_check), - "summary": (10, exact_match), - }, - "_internal": { - "ignore_spec_version": False, - "versioning_checks": False, - "ds1": None, - "ds2": None, - "max_depth": 1, - }, -}) # :autodoc-skip: diff --git a/stix2/equivalence/object/__init__.py b/stix2/equivalence/object/__init__.py index e175938..8bae111 100644 --- a/stix2/equivalence/object/__init__.py +++ b/stix2/equivalence/object/__init__.py @@ -4,7 +4,7 @@ import itertools import logging import time -from ...datastore import Filter +from ...datastore import Filter, DataStoreMixin, DataSink, DataSource from ...utils import STIXdatetime, parse_into_datetime from ..pattern import equivalent_patterns @@ -54,7 +54,9 @@ def object_equivalence(obj1, obj2, prop_scores={}, threshold=70, **weight_dict): return False -def object_similarity(obj1, obj2, prop_scores={}, **weight_dict): +def object_similarity(obj1, obj2, prop_scores={}, ds1=None, ds2=None, + ignore_spec_version=False, versioning_checks=False, + max_depth=1, **weight_dict): """This method returns a measure of similarity depending on how similar the two objects are. @@ -63,6 +65,11 @@ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict): obj2: A stix2 object instance prop_scores: A dictionary that can hold individual property scores, weights, contributing score, matching score and sum of weights. + ds1: As + ds2: As + ignore_spec_version: As + versioning_checks: As + max_depth: As weight_dict: A dictionary that can be used to override settings in the similarity process @@ -91,6 +98,14 @@ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict): if weight_dict: weights.update(weight_dict) + weights["_internal"] = { + "ignore_spec_version": ignore_spec_version, + "versioning_checks": versioning_checks, + "ds1": ds1, + "ds2": ds2, + "max_depth": max_depth, + } + type1, type2 = obj1["type"], obj2["type"] ignore_spec_version = weights["_internal"]["ignore_spec_version"] @@ -117,6 +132,7 @@ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict): if check_property_present(prop, obj1, obj2): w = weights[type1][prop][0] comp_funct = weights[type1][prop][1] + prop_scores[prop] = {} if comp_funct == partial_timestamp_based: contributing_score = w * comp_funct(obj1[prop], obj2[prop], weights[type1]["tdelta"]) @@ -124,24 +140,30 @@ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict): threshold = weights[type1]["threshold"] contributing_score = w * comp_funct(obj1["latitude"], obj1["longitude"], obj2["latitude"], obj2["longitude"], threshold) elif comp_funct == reference_check or comp_funct == list_reference_check: - max_depth = weights["_internal"]["max_depth"] - if max_depth > 0: - weights["_internal"]["max_depth"] = max_depth - 1 + max_depth_i = weights["_internal"]["max_depth"] + if max_depth_i > 0: + weights["_internal"]["max_depth"] = max_depth_i - 1 ds1, ds2 = weights["_internal"]["ds1"], weights["_internal"]["ds2"] - contributing_score = w * comp_funct(obj1[prop], obj2[prop], ds1, ds2, **weights) + if _datastore_check(ds1, ds2): + contributing_score = w * comp_funct(obj1[prop], obj2[prop], ds1, ds2, **weights) + elif comp_funct == reference_check: + comp_funct = exact_match + contributing_score = w * comp_funct(obj1[prop], obj2[prop]) + elif comp_funct == list_reference_check: + comp_funct = partial_list_based + contributing_score = w * comp_funct(obj1[prop], obj2[prop]) + prop_scores[prop]["method"] = comp_funct.__name__ else: continue # prevent excessive recursion - weights["_internal"]["max_depth"] = max_depth + weights["_internal"]["max_depth"] = max_depth_i else: contributing_score = w * comp_funct(obj1[prop], obj2[prop]) sum_weights += w matching_score += contributing_score - prop_scores[prop] = { - "weight": w, - "contributing_score": contributing_score, - } + prop_scores[prop]["weight"] = w + prop_scores[prop]["contributing_score"] = contributing_score logger.debug("'%s' check -- weight: %s, contributing score: %s", prop, w, contributing_score) prop_scores["matching_score"] = matching_score @@ -196,7 +218,9 @@ def partial_timestamp_based(t1, t2, tdelta): def partial_list_based(l1, l2): - """Performs a partial list matching via finding the intersection between common values. + """Performs a partial list matching via finding the intersection between + common values. Repeated values are counted only once. This method can be + used for *_refs equality checks when de-reference is not possible. Args: l1: A list of values. @@ -213,7 +237,8 @@ def partial_list_based(l1, l2): def exact_match(val1, val2): - """Performs an exact value match based on two values + """Performs an exact value match based on two values. This method can be + used for *_ref equality check when de-reference is not possible. Args: val1: A value suitable for an equality test. @@ -275,15 +300,8 @@ def partial_external_reference_based(refs1, refs2): allowed = {"veris", "cve", "capec", "mitre-attack"} matches = 0 - if len(refs1) >= len(refs2): - l1 = refs1 - l2 = refs2 - else: - l1 = refs2 - l2 = refs1 - - for ext_ref1 in l1: - for ext_ref2 in l2: + for ext_ref1 in refs1: + for ext_ref2 in refs2: sn_match = False ei_match = False url_match = False @@ -352,17 +370,21 @@ def _versioned_checks(ref1, ref2, ds1, ds2, **weights): """Checks multiple object versions if present in graph. Maximizes for the similarity score of a particular version.""" results = {} - objects1 = ds1.query([Filter("id", "=", ref1)]) - objects2 = ds2.query([Filter("id", "=", ref2)]) pairs = _object_pairs( - _bucket_per_type(objects1), - _bucket_per_type(objects2), + _bucket_per_type(ds1.query([Filter("id", "=", ref1)])), + _bucket_per_type(ds2.query([Filter("id", "=", ref2)])), weights, ) + ignore_spec_version = weights["_internal"]["ignore_spec_version"] + versioning_checks = weights["_internal"]["versioning_checks"] + max_depth = weights["_internal"]["max_depth"] for object1, object2 in pairs: - result = object_similarity(object1, object2, **weights) + result = object_similarity(object1, object2, ds1=ds1, ds2=ds2, + ignore_spec_version=ignore_spec_version, + versioning_checks=versioning_checks, + max_depth=max_depth, **weights) if ref1 not in results: results[ref1] = {"matched": ref2, "value": result} elif result > results[ref1]["value"]: @@ -383,12 +405,18 @@ def reference_check(ref1, ref2, ds1, ds2, **weights): result = 0.0 if type1 == type2 and type1 in weights: - if weights["_internal"]["versioning_checks"]: + ignore_spec_version = weights["_internal"]["ignore_spec_version"] + versioning_checks = weights["_internal"]["versioning_checks"] + max_depth = weights["_internal"]["max_depth"] + if versioning_checks: result = _versioned_checks(ref1, ref2, ds1, ds2, **weights) / 100.0 else: o1, o2 = ds1.get(ref1), ds2.get(ref2) if o1 and o2: - result = object_similarity(o1, o2, **weights) / 100.0 + result = object_similarity(o1, o2, ds1=ds1, ds2=ds2, + ignore_spec_version=ignore_spec_version, + versioning_checks=versioning_checks, + max_depth=max_depth, **weights) / 100.0 logger.debug( "--\t\treference_check '%s' '%s'\tresult: '%s'", @@ -439,6 +467,13 @@ def list_reference_check(refs1, refs2, ds1, ds2, **weights): return result +def _datastore_check(ds1, ds2): + if (issubclass(ds1.__class__, (DataStoreMixin, DataSink, DataSource)) or + issubclass(ds2.__class__, (DataStoreMixin, DataSink, DataSource))): + return True + return False + + def _bucket_per_type(graph, mode="type"): """Given a list of objects or references, bucket them by type. Depending on the list type: extract from 'type' property or using @@ -480,11 +515,20 @@ WEIGHTS = { "name": (60, partial_string_based), "external_references": (40, partial_external_reference_based), }, + "grouping": { + "name": (20, partial_string_based), + "context": (20, partial_string_based), + "object_refs": (60, list_reference_check), + }, "identity": { "name": (60, partial_string_based), "identity_class": (20, exact_match), "sectors": (20, partial_list_based), }, + "incident": { + "name": (60, partial_string_based), + "external_references": (40, partial_external_reference_based), + }, "indicator": { "indicator_types": (15, partial_list_based), "pattern": (80, custom_pattern_based), @@ -511,6 +555,25 @@ WEIGHTS = { "definition": (60, exact_match), "definition_type": (20, exact_match), }, + "relationship": { + "relationship_type": (20, exact_match), + "source_ref": (40, reference_check), + "target_ref": (40, reference_check), + }, + "report": { + "name": (30, partial_string_based), + "published": (10, partial_timestamp_based), + "object_refs": (60, list_reference_check), + "tdelta": 1, # One day interval + }, + "sighting": { + "first_seen": (5, partial_timestamp_based), + "last_seen": (5, partial_timestamp_based), + "sighting_of_ref": (40, reference_check), + "observed_data_refs": (20, list_reference_check), + "where_sighted_refs": (20, list_reference_check), + "summary": (10, exact_match), + }, "threat-actor": { "name": (60, partial_string_based), "threat_actor_types": (20, partial_list_based), @@ -523,8 +586,5 @@ WEIGHTS = { "vulnerability": { "name": (30, partial_string_based), "external_references": (70, partial_external_reference_based), - }, - "_internal": { - "ignore_spec_version": False, - }, + } } # :autodoc-skip: From ff5014c606858053ad6eee6a13438a67dffe388f Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Mon, 1 Mar 2021 12:27:52 -0500 Subject: [PATCH 06/23] expose configuration options, combine weight dictionary, update tests --- docs/conf.py | 9 +- stix2/environment.py | 109 +++++++++++++---- stix2/equivalence/graph/__init__.py | 60 +++++++--- stix2/equivalence/object/__init__.py | 169 +++++++++++++++----------- stix2/test/v20/test_environment.py | 143 +++------------------- stix2/test/v21/test_environment.py | 172 ++++++--------------------- 6 files changed, 284 insertions(+), 378 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 5d12af3..62e829d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -66,16 +66,9 @@ object_default_sem_eq_weights = json.dumps(WEIGHTS, indent=4, default=lambda o: object_default_sem_eq_weights = object_default_sem_eq_weights.replace('\n', '\n ') object_default_sem_eq_weights = object_default_sem_eq_weights.replace(' "', ' ') object_default_sem_eq_weights = object_default_sem_eq_weights.replace('"\n', '\n') -with open('object_default_sem_eq_weights.rst', 'w') as f: +with open('similarity_weights.rst', 'w') as f: f.write(".. code-block:: python\n\n {}\n\n".format(object_default_sem_eq_weights)) -graph_default_sem_eq_weights = json.dumps(GRAPH_WEIGHTS, indent=4, default=lambda o: o.__name__) -graph_default_sem_eq_weights = graph_default_sem_eq_weights.replace('\n', '\n ') -graph_default_sem_eq_weights = graph_default_sem_eq_weights.replace(' "', ' ') -graph_default_sem_eq_weights = graph_default_sem_eq_weights.replace('"\n', '\n') -with open('graph_default_sem_eq_weights.rst', 'w') as f: - f.write(".. code-block:: python\n\n {}\n\n".format(graph_default_sem_eq_weights)) - def get_property_type(prop): """Convert property classname into pretty string name of property. diff --git a/stix2/environment.py b/stix2/environment.py index 75e5fa5..b37b485 100644 --- a/stix2/environment.py +++ b/stix2/environment.py @@ -189,8 +189,11 @@ class Environment(DataStoreMixin): return None @staticmethod - def object_similarity(obj1, obj2, prop_scores={}, ignore_spec_version=False, - versioning_checks=False, max_depth=1, **weight_dict): + def object_similarity( + obj1, obj2, prop_scores={}, ds1=None, ds2=None, + ignore_spec_version=False, versioning_checks=False, + max_depth=1, **weight_dict + ): """This method returns a measure of how similar the two objects are. Args: @@ -198,8 +201,19 @@ class Environment(DataStoreMixin): obj2: A stix2 object instance prop_scores: A dictionary that can hold individual property scores, weights, contributing score, matching score and sum of weights. - weight_dict: A dictionary that can be used to override settings - in the similarity process + ds1: A DataStore object instance representing your graph + ds2: A DataStore object instance representing your graph + ignore_spec_version: A boolean indicating whether to test object types + that belong to different spec versions (STIX 2.0 and STIX 2.1 for example). + If set to True this check will be skipped. + versioning_checks: A boolean indicating whether to test multiple revisions + of the same object (when present) to maximize similarity against a + particular version. If set to True the algorithm will perform this step. + max_depth: A positive integer indicating the maximum recursion depth the + algorithm can reach when de-referencing objects and performing the + object_similarity algorithm. + weight_dict: A dictionary that can be used to override what checks are done + to objects in the similarity process. Returns: float: A number between 0.0 and 100.0 as a measurement of similarity. @@ -221,12 +235,17 @@ class Environment(DataStoreMixin): see `the Committee Note `__. """ - return object_similarity(obj1, obj2, prop_scores, ignore_spec_version, - versioning_checks, max_depth, **weight_dict) + return object_similarity( + obj1, obj2, prop_scores, ds1, ds2, ignore_spec_version, + versioning_checks, max_depth, **weight_dict + ) @staticmethod - def object_equivalence(obj1, obj2, prop_scores={}, threshold=70, ignore_spec_version=False, - versioning_checks=False, max_depth=1, **weight_dict): + def object_equivalence( + obj1, obj2, prop_scores={}, threshold=70, ds1=None, ds2=None, + ignore_spec_version=False, versioning_checks=False, + max_depth=1, **weight_dict + ): """This method returns a true/false value if two objects are semantically equivalent. Internally, it calls the object_similarity function and compares it against the given threshold value. @@ -239,8 +258,19 @@ class Environment(DataStoreMixin): threshold: A numerical value between 0 and 100 to determine the minimum score to result in successfully calling both objects equivalent. This value can be tuned. - weight_dict: A dictionary that can be used to override settings - in the similarity process + ds1: A DataStore object instance representing your graph + ds2: A DataStore object instance representing your graph + ignore_spec_version: A boolean indicating whether to test object types + that belong to different spec versions (STIX 2.0 and STIX 2.1 for example). + If set to True this check will be skipped. + versioning_checks: A boolean indicating whether to test multiple revisions + of the same object (when present) to maximize similarity against a + particular version. If set to True the algorithm will perform this step. + max_depth: A positive integer indicating the maximum recursion depth the + algorithm can reach when de-referencing objects and performing the + object_similarity algorithm. + weight_dict: A dictionary that can be used to override what checks are done + to objects in the similarity process. Returns: bool: True if the result of the object similarity is greater than or equal to @@ -263,11 +293,16 @@ class Environment(DataStoreMixin): see `the Committee Note `__. """ - return object_equivalence(obj1, obj2, prop_scores, threshold, **weight_dict) + return object_equivalence( + obj1, obj2, prop_scores, threshold, ds1, ds2, + ignore_spec_version, versioning_checks, max_depth, **weight_dict + ) @staticmethod - def graph_similarity(ds1, ds2, prop_scores={}, ignore_spec_version=False, - versioning_checks=False, max_depth=1, **weight_dict): + def graph_similarity( + ds1, ds2, prop_scores={}, ignore_spec_version=False, + versioning_checks=False, max_depth=1, **weight_dict + ): """This method returns a similarity score for two given graphs. Each DataStore can contain a connected or disconnected graph and the final result is weighted over the amount of objects we managed to compare. @@ -279,8 +314,17 @@ class Environment(DataStoreMixin): ds2: A DataStore object instance representing your graph prop_scores: A dictionary that can hold individual property scores, weights, contributing score, matching score and sum of weights. - weight_dict: A dictionary that can be used to override settings - in the similarity process + ignore_spec_version: A boolean indicating whether to test object types + that belong to different spec versions (STIX 2.0 and STIX 2.1 for example). + If set to True this check will be skipped. + versioning_checks: A boolean indicating whether to test multiple revisions + of the same object (when present) to maximize similarity against a + particular version. If set to True the algorithm will perform this step. + max_depth: A positive integer indicating the maximum recursion depth the + algorithm can reach when de-referencing objects and performing the + object_similarity algorithm. + weight_dict: A dictionary that can be used to override what checks are done + to objects in the similarity process. Returns: float: A number between 0.0 and 100.0 as a measurement of similarity. @@ -295,19 +339,24 @@ class Environment(DataStoreMixin): Note: Default weight_dict: - .. include:: ../graph_default_sem_eq_weights.rst + .. include:: ../similarity_weights.rst Note: This implementation follows the Semantic Equivalence Committee Note. see `the Committee Note `__. """ - return graph_similarity(ds1, ds2, prop_scores, ignore_spec_version, - versioning_checks, max_depth, **weight_dict) + return graph_similarity( + ds1, ds2, prop_scores, ignore_spec_version, + versioning_checks, max_depth, **weight_dict + ) @staticmethod - def graph_equivalence(ds1, ds2, prop_scores={}, threshold=70, ignore_spec_version=False, - versioning_checks=False, max_depth=1, **weight_dict): + def graph_equivalence( + ds1, ds2, prop_scores={}, threshold=70, + ignore_spec_version=False, versioning_checks=False, + max_depth=1, **weight_dict + ): """This method returns a true/false value if two graphs are semantically equivalent. Internally, it calls the graph_similarity function and compares it against the given threshold value. @@ -320,8 +369,17 @@ class Environment(DataStoreMixin): threshold: A numerical value between 0 and 100 to determine the minimum score to result in successfully calling both graphs equivalent. This value can be tuned. - weight_dict: A dictionary that can be used to override settings - in the similarity process + ignore_spec_version: A boolean indicating whether to test object types + that belong to different spec versions (STIX 2.0 and STIX 2.1 for example). + If set to True this check will be skipped. + versioning_checks: A boolean indicating whether to test multiple revisions + of the same object (when present) to maximize similarity against a + particular version. If set to True the algorithm will perform this step. + max_depth: A positive integer indicating the maximum recursion depth the + algorithm can reach when de-referencing objects and performing the + object_similarity algorithm. + weight_dict: A dictionary that can be used to override what checks are done + to objects in the similarity process. Returns: bool: True if the result of the graph similarity is greater than or equal to @@ -337,11 +395,14 @@ class Environment(DataStoreMixin): Note: Default weight_dict: - .. include:: ../graph_default_sem_eq_weights.rst + .. include:: ../similarity_weights.rst Note: This implementation follows the Semantic Equivalence Committee Note. see `the Committee Note `__. """ - return graph_equivalence(ds1, ds2, prop_scores, threshold, **weight_dict) + return graph_equivalence( + ds1, ds2, prop_scores, threshold, ignore_spec_version, + versioning_checks, max_depth, **weight_dict + ) diff --git a/stix2/equivalence/graph/__init__.py b/stix2/equivalence/graph/__init__.py index 1dcccf1..1d43219 100644 --- a/stix2/equivalence/graph/__init__.py +++ b/stix2/equivalence/graph/__init__.py @@ -10,7 +10,11 @@ from ..object import ( logger = logging.getLogger(__name__) -def graph_equivalence(ds1, ds2, prop_scores={}, threshold=70, **weight_dict): +def graph_equivalence( + ds1, ds2, prop_scores={}, threshold=70, + ignore_spec_version=False, versioning_checks=False, + max_depth=1, **weight_dict +): """This method returns a true/false value if two graphs are semantically equivalent. Internally, it calls the graph_similarity function and compares it against the given threshold value. @@ -23,8 +27,17 @@ def graph_equivalence(ds1, ds2, prop_scores={}, threshold=70, **weight_dict): threshold: A numerical value between 0 and 100 to determine the minimum score to result in successfully calling both graphs equivalent. This value can be tuned. - weight_dict: A dictionary that can be used to override settings - in the similarity process + ignore_spec_version: A boolean indicating whether to test object types + that belong to different spec versions (STIX 2.0 and STIX 2.1 for example). + If set to True this check will be skipped. + versioning_checks: A boolean indicating whether to test multiple revisions + of the same object (when present) to maximize similarity against a + particular version. If set to True the algorithm will perform this step. + max_depth: A positive integer indicating the maximum recursion depth the + algorithm can reach when de-referencing objects and performing the + object_similarity algorithm. + weight_dict: A dictionary that can be used to override what checks are done + to objects in the similarity process. Returns: bool: True if the result of the graph similarity is greater than or equal to @@ -40,21 +53,26 @@ def graph_equivalence(ds1, ds2, prop_scores={}, threshold=70, **weight_dict): Note: Default weight_dict: - .. include:: ../../graph_default_sem_eq_weights.rst + .. include:: ../../similarity_weights.rst Note: This implementation follows the Semantic Equivalence Committee Note. see `the Committee Note `__. """ - similarity_result = graph_similarity(ds1, ds2, prop_scores, **weight_dict) + similarity_result = graph_similarity( + ds1, ds2, prop_scores, ignore_spec_version, + versioning_checks, max_depth, **weight_dict + ) if similarity_result >= threshold: return True return False -def graph_similarity(ds1, ds2, prop_scores={}, ignore_spec_version=False, - versioning_checks=False, max_depth=1, **weight_dict): +def graph_similarity( + ds1, ds2, prop_scores={}, ignore_spec_version=False, + versioning_checks=False, max_depth=1, **weight_dict +): """This method returns a similarity score for two given graphs. Each DataStore can contain a connected or disconnected graph and the final result is weighted over the amount of objects we managed to compare. @@ -66,11 +84,17 @@ def graph_similarity(ds1, ds2, prop_scores={}, ignore_spec_version=False, ds2: A DataStore object instance representing your graph prop_scores: A dictionary that can hold individual property scores, weights, contributing score, matching score and sum of weights. - ignore_spec_version: As - versioning_checks: As - max_depth: As - weight_dict: A dictionary that can be used to override settings - in the similarity process + ignore_spec_version: A boolean indicating whether to test object types + that belong to different spec versions (STIX 2.0 and STIX 2.1 for example). + If set to True this check will be skipped. + versioning_checks: A boolean indicating whether to test multiple revisions + of the same object (when present) to maximize similarity against a + particular version. If set to True the algorithm will perform this step. + max_depth: A positive integer indicating the maximum recursion depth the + algorithm can reach when de-referencing objects and performing the + object_similarity algorithm. + weight_dict: A dictionary that can be used to override what checks are done + to objects in the similarity process. Returns: float: A number between 0.0 and 100.0 as a measurement of similarity. @@ -85,7 +109,7 @@ def graph_similarity(ds1, ds2, prop_scores={}, ignore_spec_version=False, Note: Default weight_dict: - .. include:: ../../graph_default_sem_eq_weights.rst + .. include:: ../../similarity_weights.rst Note: This implementation follows the Semantic Equivalence Committee Note. @@ -107,7 +131,7 @@ def graph_similarity(ds1, ds2, prop_scores={}, ignore_spec_version=False, "max_depth": max_depth, } - if weights["_internal"]["max_depth"] <= 0: + if max_depth <= 0: raise ValueError("'max_depth' must be greater than 0") pairs = _object_pairs( @@ -122,9 +146,11 @@ def graph_similarity(ds1, ds2, prop_scores={}, ignore_spec_version=False, object1_id = object1["id"] object2_id = object2["id"] - result = object_similarity(object1, object2, iprop_score, ds1, ds2, - ignore_spec_version, versioning_checks, - max_depth, **weights) + result = object_similarity( + object1, object2, iprop_score, ds1, ds2, + ignore_spec_version, versioning_checks, + max_depth, **weights + ) if object1_id not in results: results[object1_id] = {"lhs": object1_id, "rhs": object2_id, "prop_score": iprop_score, "value": result} diff --git a/stix2/equivalence/object/__init__.py b/stix2/equivalence/object/__init__.py index 8bae111..71a263c 100644 --- a/stix2/equivalence/object/__init__.py +++ b/stix2/equivalence/object/__init__.py @@ -4,14 +4,18 @@ import itertools import logging import time -from ...datastore import Filter, DataStoreMixin, DataSink, DataSource +from ...datastore import DataSink, DataSource, DataStoreMixin, Filter from ...utils import STIXdatetime, parse_into_datetime from ..pattern import equivalent_patterns logger = logging.getLogger(__name__) -def object_equivalence(obj1, obj2, prop_scores={}, threshold=70, **weight_dict): +def object_equivalence( + obj1, obj2, prop_scores={}, threshold=70, ds1=None, + ds2=None, ignore_spec_version=False, + versioning_checks=False, max_depth=1, **weight_dict +): """This method returns a true/false value if two objects are semantically equivalent. Internally, it calls the object_similarity function and compares it against the given threshold value. @@ -24,8 +28,19 @@ def object_equivalence(obj1, obj2, prop_scores={}, threshold=70, **weight_dict): threshold: A numerical value between 0 and 100 to determine the minimum score to result in successfully calling both objects equivalent. This value can be tuned. - weight_dict: A dictionary that can be used to override settings - in the similarity process + ds1: A DataStore object instance representing your graph + ds2: A DataStore object instance representing your graph + ignore_spec_version: A boolean indicating whether to test object types + that belong to different spec versions (STIX 2.0 and STIX 2.1 for example). + If set to True this check will be skipped. + versioning_checks: A boolean indicating whether to test multiple revisions + of the same object (when present) to maximize similarity against a + particular version. If set to True the algorithm will perform this step. + max_depth: A positive integer indicating the maximum recursion depth the + algorithm can reach when de-referencing objects and performing the + object_similarity algorithm. + weight_dict: A dictionary that can be used to override what checks are done + to objects in the similarity process. Returns: bool: True if the result of the object similarity is greater than or equal to @@ -41,22 +56,27 @@ def object_equivalence(obj1, obj2, prop_scores={}, threshold=70, **weight_dict): Note: Default weight_dict: - .. include:: ../../object_default_sem_eq_weights.rst + .. include:: ../../similarity_weights.rst Note: This implementation follows the Semantic Equivalence Committee Note. see `the Committee Note `__. """ - similarity_result = object_similarity(obj1, obj2, prop_scores, **weight_dict) + similarity_result = object_similarity( + obj1, obj2, prop_scores, ds1, ds2, ignore_spec_version, + versioning_checks, max_depth, **weight_dict + ) if similarity_result >= threshold: return True return False -def object_similarity(obj1, obj2, prop_scores={}, ds1=None, ds2=None, - ignore_spec_version=False, versioning_checks=False, - max_depth=1, **weight_dict): +def object_similarity( + obj1, obj2, prop_scores={}, ds1=None, ds2=None, + ignore_spec_version=False, versioning_checks=False, + max_depth=1, **weight_dict +): """This method returns a measure of similarity depending on how similar the two objects are. @@ -65,13 +85,19 @@ def object_similarity(obj1, obj2, prop_scores={}, ds1=None, ds2=None, obj2: A stix2 object instance prop_scores: A dictionary that can hold individual property scores, weights, contributing score, matching score and sum of weights. - ds1: As - ds2: As - ignore_spec_version: As - versioning_checks: As - max_depth: As - weight_dict: A dictionary that can be used to override settings - in the similarity process + ds1: A DataStore object instance representing your graph + ds2: A DataStore object instance representing your graph + ignore_spec_version: A boolean indicating whether to test object types + that belong to different spec versions (STIX 2.0 and STIX 2.1 for example). + If set to True this check will be skipped. + versioning_checks: A boolean indicating whether to test multiple revisions + of the same object (when present) to maximize similarity against a + particular version. If set to True the algorithm will perform this step. + max_depth: A positive integer indicating the maximum recursion depth the + algorithm can reach when de-referencing objects and performing the + object_similarity algorithm. + weight_dict: A dictionary that can be used to override what checks are done + to objects in the similarity process. Returns: float: A number between 0.0 and 100.0 as a measurement of similarity. @@ -86,7 +112,7 @@ def object_similarity(obj1, obj2, prop_scores={}, ds1=None, ds2=None, Note: Default weight_dict: - .. include:: ../../object_default_sem_eq_weights.rst + .. include:: ../../similarity_weights.rst Note: This implementation follows the Semantic Equivalence Committee Note. @@ -107,7 +133,6 @@ def object_similarity(obj1, obj2, prop_scores={}, ds1=None, ds2=None, } type1, type2 = obj1["type"], obj2["type"] - ignore_spec_version = weights["_internal"]["ignore_spec_version"] if type1 != type2: raise ValueError('The objects to compare must be of the same type!') @@ -140,9 +165,8 @@ def object_similarity(obj1, obj2, prop_scores={}, ds1=None, ds2=None, threshold = weights[type1]["threshold"] contributing_score = w * comp_funct(obj1["latitude"], obj1["longitude"], obj2["latitude"], obj2["longitude"], threshold) elif comp_funct == reference_check or comp_funct == list_reference_check: - max_depth_i = weights["_internal"]["max_depth"] - if max_depth_i > 0: - weights["_internal"]["max_depth"] = max_depth_i - 1 + if max_depth > 0: + weights["_internal"]["max_depth"] = max_depth - 1 ds1, ds2 = weights["_internal"]["ds1"], weights["_internal"]["ds2"] if _datastore_check(ds1, ds2): contributing_score = w * comp_funct(obj1[prop], obj2[prop], ds1, ds2, **weights) @@ -155,7 +179,7 @@ def object_similarity(obj1, obj2, prop_scores={}, ds1=None, ds2=None, prop_scores[prop]["method"] = comp_funct.__name__ else: continue # prevent excessive recursion - weights["_internal"]["max_depth"] = max_depth_i + weights["_internal"]["max_depth"] = max_depth else: contributing_score = w * comp_funct(obj1[prop], obj2[prop]) @@ -187,7 +211,7 @@ def object_similarity(obj1, obj2, prop_scores={}, ds1=None, ds2=None, def check_property_present(prop, obj1, obj2): """Helper method checks if a property is present on both objects.""" if prop == "longitude_latitude": - if all(x in obj1 and x in obj2 for x in ['latitude', 'longitude']): + if all(x in obj1 and x in obj2 for x in ('latitude', 'longitude')): return True elif prop in obj1 and prop in obj2: return True @@ -286,12 +310,12 @@ def custom_pattern_based(pattern1, pattern2): return equivalent_patterns(pattern1, pattern2) -def partial_external_reference_based(refs1, refs2): +def partial_external_reference_based(ext_refs1, ext_refs2): """Performs a matching on External References. Args: - refs1: A list of external references. - refs2: A list of external references. + ext_refs1: A list of external references. + ext_refs2: A list of external references. Returns: float: Number between 0.0 and 1.0 depending on matches. @@ -300,44 +324,47 @@ def partial_external_reference_based(refs1, refs2): allowed = {"veris", "cve", "capec", "mitre-attack"} matches = 0 - for ext_ref1 in refs1: - for ext_ref2 in refs2: - sn_match = False - ei_match = False - url_match = False - source_name = None + ref_pairs = itertools.chain( + itertools.product(ext_refs1, ext_refs2), + ) - if check_property_present("source_name", ext_ref1, ext_ref2): - if ext_ref1["source_name"] == ext_ref2["source_name"]: - source_name = ext_ref1["source_name"] - sn_match = True - if check_property_present("external_id", ext_ref1, ext_ref2): - if ext_ref1["external_id"] == ext_ref2["external_id"]: - ei_match = True - if check_property_present("url", ext_ref1, ext_ref2): - if ext_ref1["url"] == ext_ref2["url"]: - url_match = True + for ext_ref1, ext_ref2 in ref_pairs: + sn_match = False + ei_match = False + url_match = False + source_name = None - # Special case: if source_name is a STIX defined name and either - # external_id or url match then its a perfect match and other entries - # can be ignored. - if sn_match and (ei_match or url_match) and source_name in allowed: - result = 1.0 - logger.debug( - "--\t\tpartial_external_reference_based '%s' '%s'\tresult: '%s'", - refs1, refs2, result, - ) - return result + if check_property_present("source_name", ext_ref1, ext_ref2): + if ext_ref1["source_name"] == ext_ref2["source_name"]: + source_name = ext_ref1["source_name"] + sn_match = True + if check_property_present("external_id", ext_ref1, ext_ref2): + if ext_ref1["external_id"] == ext_ref2["external_id"]: + ei_match = True + if check_property_present("url", ext_ref1, ext_ref2): + if ext_ref1["url"] == ext_ref2["url"]: + url_match = True - # Regular check. If the source_name (not STIX-defined) or external_id or - # url matches then we consider the entry a match. - if (sn_match or ei_match or url_match) and source_name not in allowed: - matches += 1 + # Special case: if source_name is a STIX defined name and either + # external_id or url match then its a perfect match and other entries + # can be ignored. + if sn_match and (ei_match or url_match) and source_name in allowed: + result = 1.0 + logger.debug( + "--\t\tpartial_external_reference_based '%s' '%s'\tresult: '%s'", + ext_refs1, ext_refs2, result, + ) + return result - result = matches / max(len(refs1), len(refs2)) + # Regular check. If the source_name (not STIX-defined) or external_id or + # url matches then we consider the entry a match. + if (sn_match or ei_match or url_match) and source_name not in allowed: + matches += 1 + + result = matches / max(len(ext_refs1), len(ext_refs2)) logger.debug( "--\t\tpartial_external_reference_based '%s' '%s'\tresult: '%s'", - refs1, refs2, result, + ext_refs1, ext_refs2, result, ) return result @@ -381,10 +408,11 @@ def _versioned_checks(ref1, ref2, ds1, ds2, **weights): max_depth = weights["_internal"]["max_depth"] for object1, object2 in pairs: - result = object_similarity(object1, object2, ds1=ds1, ds2=ds2, - ignore_spec_version=ignore_spec_version, - versioning_checks=versioning_checks, - max_depth=max_depth, **weights) + result = object_similarity( + object1, object2, ds1, ds2, + ignore_spec_version, versioning_checks, + max_depth, **weights + ) if ref1 not in results: results[ref1] = {"matched": ref2, "value": result} elif result > results[ref1]["value"]: @@ -413,10 +441,11 @@ def reference_check(ref1, ref2, ds1, ds2, **weights): else: o1, o2 = ds1.get(ref1), ds2.get(ref2) if o1 and o2: - result = object_similarity(o1, o2, ds1=ds1, ds2=ds2, - ignore_spec_version=ignore_spec_version, - versioning_checks=versioning_checks, - max_depth=max_depth, **weights) / 100.0 + result = object_similarity( + o1, o2, ds1, ds2, + ignore_spec_version, versioning_checks, + max_depth, **weights + ) / 100.0 logger.debug( "--\t\treference_check '%s' '%s'\tresult: '%s'", @@ -468,8 +497,10 @@ def list_reference_check(refs1, refs2, ds1, ds2, **weights): def _datastore_check(ds1, ds2): - if (issubclass(ds1.__class__, (DataStoreMixin, DataSink, DataSource)) or - issubclass(ds2.__class__, (DataStoreMixin, DataSink, DataSource))): + if ( + issubclass(ds1.__class__, (DataStoreMixin, DataSink, DataSource)) or + issubclass(ds2.__class__, (DataStoreMixin, DataSink, DataSource)) + ): return True return False @@ -586,5 +617,5 @@ WEIGHTS = { "vulnerability": { "name": (30, partial_string_based), "external_references": (70, partial_external_reference_based), - } + }, } # :autodoc-skip: diff --git a/stix2/test/v20/test_environment.py b/stix2/test/v20/test_environment.py index 33e0985..c8867b0 100644 --- a/stix2/test/v20/test_environment.py +++ b/stix2/test/v20/test_environment.py @@ -424,7 +424,7 @@ def test_related_to_by_target(ds): def test_versioned_checks(ds, ds2): - weights = stix2.equivalence.graph.GRAPH_WEIGHTS.copy() + weights = stix2.equivalence.graph.WEIGHTS.copy() weights.update({ "_internal": { "ignore_spec_version": True, @@ -437,7 +437,7 @@ def test_versioned_checks(ds, ds2): def test_semantic_check_with_versioning(ds, ds2): - weights = stix2.equivalence.graph.GRAPH_WEIGHTS.copy() + weights = stix2.equivalence.graph.WEIGHTS.copy() weights.update({ "_internal": { "ignore_spec_version": False, @@ -467,13 +467,11 @@ def test_semantic_check_with_versioning(ds, ds2): def test_list_semantic_check(ds, ds2): - weights = stix2.equivalence.graph.GRAPH_WEIGHTS.copy() + weights = stix2.equivalence.graph.WEIGHTS.copy() weights.update({ "_internal": { "ignore_spec_version": False, "versioning_checks": False, - "ds1": ds, - "ds2": ds2, "max_depth": 1, }, }) @@ -504,39 +502,18 @@ def test_list_semantic_check(ds, ds2): def test_graph_similarity_raises_value_error(ds): - weights = { - "_internal": { - "ignore_spec_version": False, - "versioning_checks": False, - "max_depth": -1, - }, - } with pytest.raises(ValueError): prop_scores1 = {} - stix2.Environment().graph_similarity(ds, ds2, prop_scores1, **weights) + stix2.Environment().graph_similarity(ds, ds2, prop_scores1, max_depth=-1) def test_graph_similarity_with_filesystem_source(ds, fs): - weights = { - "_internal": { - "ignore_spec_version": True, - "versioning_checks": False, - "max_depth": 1, - }, - } prop_scores1 = {} - env1 = stix2.Environment().graph_similarity(fs, ds, prop_scores1, **weights) + env1 = stix2.Environment().graph_similarity(fs, ds, prop_scores1, ignore_spec_version=True) # Switching parameters - weights = { - "_internal": { - "ignore_spec_version": True, - "versioning_checks": False, - "max_depth": 1, - }, - } prop_scores2 = {} - env2 = stix2.Environment().graph_similarity(ds, fs, prop_scores2, **weights) + env2 = stix2.Environment().graph_similarity(ds, fs, prop_scores2, ignore_spec_version=True) assert round(env1) == 25 assert round(prop_scores1["matching_score"]) == 451 @@ -552,41 +529,20 @@ def test_graph_similarity_with_filesystem_source(ds, fs): def test_graph_similarity_with_duplicate_graph(ds): - weights = { - "_internal": { - "ignore_spec_version": False, - "versioning_checks": False, - "max_depth": 1, - }, - } prop_scores = {} - env = stix2.Environment().graph_similarity(ds, ds, prop_scores, **weights) + env = stix2.Environment().graph_similarity(ds, ds, prop_scores) assert round(env) == 100 assert round(prop_scores["matching_score"]) == 800 assert round(prop_scores["len_pairs"]) == 8 def test_graph_similarity_with_versioning_check_on(ds2, ds): - weights = { - "_internal": { - "ignore_spec_version": False, - "versioning_checks": True, - "max_depth": 1, - }, - } prop_scores1 = {} - env1 = stix2.Environment().graph_similarity(ds, ds2, prop_scores1, **weights) + env1 = stix2.Environment().graph_similarity(ds, ds2, prop_scores1, versioning_checks=True) # Switching parameters - weights = { - "_internal": { - "ignore_spec_version": False, - "versioning_checks": True, - "max_depth": 1, - }, - } prop_scores2 = {} - env2 = stix2.Environment().graph_similarity(ds2, ds, prop_scores2, **weights) + env2 = stix2.Environment().graph_similarity(ds2, ds, prop_scores2, versioning_checks=True) assert round(env1) == 88 assert round(prop_scores1["matching_score"]) == 789 @@ -602,26 +558,12 @@ def test_graph_similarity_with_versioning_check_on(ds2, ds): def test_graph_similarity_with_versioning_check_off(ds2, ds): - weights = { - "_internal": { - "ignore_spec_version": False, - "versioning_checks": False, - "max_depth": 1, - }, - } prop_scores1 = {} - env1 = stix2.Environment().graph_similarity(ds, ds2, prop_scores1, **weights) + env1 = stix2.Environment().graph_similarity(ds, ds2, prop_scores1) # Switching parameters - weights = { - "_internal": { - "ignore_spec_version": False, - "versioning_checks": False, - "max_depth": 1, - }, - } prop_scores2 = {} - env2 = stix2.Environment().graph_similarity(ds2, ds, prop_scores2, **weights) + env2 = stix2.Environment().graph_similarity(ds2, ds, prop_scores2) assert round(env1) == 88 assert round(prop_scores1["matching_score"]) == 789 @@ -637,26 +579,12 @@ def test_graph_similarity_with_versioning_check_off(ds2, ds): def test_graph_equivalence_with_filesystem_source(ds, fs): - weights = { - "_internal": { - "ignore_spec_version": True, - "versioning_checks": False, - "max_depth": 1, - }, - } prop_scores1 = {} - env1 = stix2.Environment().graph_equivalence(fs, ds, prop_scores1, **weights) + env1 = stix2.Environment().graph_equivalence(fs, ds, prop_scores1, ignore_spec_version=True) # Switching parameters - weights = { - "_internal": { - "ignore_spec_version": True, - "versioning_checks": False, - "max_depth": 1, - }, - } prop_scores2 = {} - env2 = stix2.Environment().graph_equivalence(ds, fs, prop_scores2, **weights) + env2 = stix2.Environment().graph_equivalence(ds, fs, prop_scores2, ignore_spec_version=True) assert env1 is False assert round(prop_scores1["matching_score"]) == 451 @@ -672,41 +600,20 @@ def test_graph_equivalence_with_filesystem_source(ds, fs): def test_graph_equivalence_with_duplicate_graph(ds): - weights = { - "_internal": { - "ignore_spec_version": False, - "versioning_checks": False, - "max_depth": 1, - }, - } prop_scores = {} - env = stix2.Environment().graph_equivalence(ds, ds, prop_scores, **weights) + env = stix2.Environment().graph_equivalence(ds, ds, prop_scores) assert env is True assert round(prop_scores["matching_score"]) == 800 assert round(prop_scores["len_pairs"]) == 8 def test_graph_equivalence_with_versioning_check_on(ds2, ds): - weights = { - "_internal": { - "ignore_spec_version": False, - "versioning_checks": True, - "max_depth": 1, - }, - } prop_scores1 = {} - env1 = stix2.Environment().graph_equivalence(ds, ds2, prop_scores1, **weights) + env1 = stix2.Environment().graph_equivalence(ds, ds2, prop_scores1, versioning_checks=True) # Switching parameters - weights = { - "_internal": { - "ignore_spec_version": False, - "versioning_checks": True, - "max_depth": 1, - }, - } prop_scores2 = {} - env2 = stix2.Environment().graph_equivalence(ds2, ds, prop_scores2, **weights) + env2 = stix2.Environment().graph_equivalence(ds2, ds, prop_scores2, versioning_checks=True) assert env1 is True assert round(prop_scores1["matching_score"]) == 789 @@ -722,26 +629,12 @@ def test_graph_equivalence_with_versioning_check_on(ds2, ds): def test_graph_equivalence_with_versioning_check_off(ds2, ds): - weights = { - "_internal": { - "ignore_spec_version": False, - "versioning_checks": False, - "max_depth": 1, - }, - } prop_scores1 = {} - env1 = stix2.Environment().graph_equivalence(ds, ds2, prop_scores1, **weights) + env1 = stix2.Environment().graph_equivalence(ds, ds2, prop_scores1) # Switching parameters - weights = { - "_internal": { - "ignore_spec_version": False, - "versioning_checks": False, - "max_depth": 1, - }, - } prop_scores2 = {} - env2 = stix2.Environment().graph_equivalence(ds2, ds, prop_scores2, **weights) + env2 = stix2.Environment().graph_equivalence(ds2, ds, prop_scores2) assert env1 is True assert round(prop_scores1["matching_score"]) == 789 diff --git a/stix2/test/v21/test_environment.py b/stix2/test/v21/test_environment.py index e7bf4da..6a14bf3 100644 --- a/stix2/test/v21/test_environment.py +++ b/stix2/test/v21/test_environment.py @@ -760,16 +760,13 @@ def test_object_similarity_different_spec_version(): "valid_from": (5, stix2.equivalence.object.partial_timestamp_based), "tdelta": 1, # One day interval }, - "_internal": { - "ignore_spec_version": True, # Disables spec_version check. - }, } ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) ind2 = stix2.v20.Indicator(id=INDICATOR_ID, **IND_KWARGS) - env = stix2.Environment().object_similarity(ind1, ind2, **weights) + env = stix2.Environment().object_similarity(ind1, ind2, ignore_spec_version=True, **weights) assert round(env) == 0 - env = stix2.Environment().object_similarity(ind2, ind1, **weights) + env = stix2.Environment().object_similarity(ind2, ind1, ignore_spec_version=True, **weights) assert round(env) == 0 @@ -861,7 +858,9 @@ def test_object_similarity_exact_match(): def test_non_existent_config_for_object(): r1 = stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS) r2 = stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS) - assert stix2.Environment().object_similarity(r1, r2) == 0.0 + prop_scores = {} + assert stix2.Environment().object_similarity(r1, r2, prop_scores) == 100.0 + assert prop_scores["object_refs"]["method"] == "partial_list_based" def custom_semantic_equivalence_method(obj1, obj2, **weights): @@ -937,7 +936,8 @@ def test_object_similarity_prop_scores_method_provided(): def test_versioned_checks(ds, ds2): - weights = stix2.equivalence.graph.GRAPH_WEIGHTS.copy() + # Testing internal method + weights = stix2.equivalence.graph.WEIGHTS.copy() weights.update({ "_internal": { "ignore_spec_version": True, @@ -950,7 +950,7 @@ def test_versioned_checks(ds, ds2): def test_semantic_check_with_versioning(ds, ds2): - weights = stix2.equivalence.graph.GRAPH_WEIGHTS.copy() + weights = stix2.equivalence.graph.WEIGHTS.copy() weights.update({ "_internal": { "ignore_spec_version": False, @@ -981,7 +981,7 @@ def test_semantic_check_with_versioning(ds, ds2): def test_list_semantic_check(ds, ds2): - weights = stix2.equivalence.graph.GRAPH_WEIGHTS.copy() + weights = stix2.equivalence.graph.WEIGHTS.copy() weights.update({ "_internal": { "ignore_spec_version": False, @@ -1027,39 +1027,28 @@ def test_list_semantic_check(ds, ds2): def test_graph_similarity_raises_value_error(ds): - weights = { - "_internal": { - "ignore_spec_version": False, - "versioning_checks": False, - "max_depth": -1, - }, - } with pytest.raises(ValueError): prop_scores1 = {} - stix2.Environment().graph_similarity(ds, ds2, prop_scores1, **weights) + stix2.Environment().graph_similarity(ds, ds2, prop_scores1, max_depth=-1) def test_graph_similarity_with_filesystem_source(ds, fs): - weights = { - "_internal": { - "ignore_spec_version": True, - "versioning_checks": False, - "max_depth": 1, - }, - } prop_scores1 = {} - env1 = stix2.Environment().graph_similarity(fs, ds, prop_scores1, **weights) + env1 = stix2.Environment().graph_similarity( + fs, ds, prop_scores1, + ignore_spec_version=True, + versioning_checks=False, + max_depth=1, + ) # Switching parameters - weights = { - "_internal": { - "ignore_spec_version": True, - "versioning_checks": False, - "max_depth": 1, - }, - } prop_scores2 = {} - env2 = stix2.Environment().graph_similarity(ds, fs, prop_scores2, **weights) + env2 = stix2.Environment().graph_similarity( + ds, fs, prop_scores2, + ignore_spec_version=True, + versioning_checks=False, + max_depth=1, + ) assert round(env1) == 23 assert round(prop_scores1["matching_score"]) == 411 @@ -1154,14 +1143,11 @@ def test_depth_limiting(): "some2_ref": (33, stix2.equivalence.object.reference_check), "name": (34, stix2.equivalence.object.partial_string_based), }, - "_internal": { - "ignore_spec_version": False, - "versioning_checks": False, - "max_depth": 1, - }, } prop_scores1 = {} - env1 = stix2.equivalence.graph.graph_similarity(mem_store1, mem_store2, prop_scores1, **custom_weights) + env1 = stix2.equivalence.graph.graph_similarity( + mem_store1, mem_store2, prop_scores1, **custom_weights + ) assert round(env1) == 38 assert round(prop_scores1["matching_score"]) == 300 @@ -1185,44 +1171,23 @@ def test_depth_limiting(): def test_graph_similarity_with_duplicate_graph(ds): - weights = { - "_internal": { - "ignore_spec_version": False, - "versioning_checks": False, - "max_depth": 1, - }, - } prop_scores = {} - env = stix2.Environment().graph_similarity(ds, ds, prop_scores, **weights) + env = stix2.Environment().graph_similarity(ds, ds, prop_scores) assert round(env) == 100 assert round(prop_scores["matching_score"]) == 800 assert round(prop_scores["len_pairs"]) == 8 def test_graph_similarity_with_versioning_check_on(ds2, ds): - weights = { - "_internal": { - "ignore_spec_version": False, - "versioning_checks": True, - "max_depth": 1, - }, - } prop_scores1 = {} - env1 = stix2.Environment().graph_similarity(ds, ds2, prop_scores1, **weights) + env1 = stix2.Environment().graph_similarity(ds, ds2, prop_scores1, versioning_checks=True) assert round(env1) == 88 assert round(prop_scores1["matching_score"]) == 789 assert round(prop_scores1["len_pairs"]) == 9 # Switching parameters - weights = { - "_internal": { - "ignore_spec_version": False, - "versioning_checks": False, - "max_depth": 1, - }, - } prop_scores2 = {} - env2 = stix2.Environment().graph_similarity(ds2, ds, prop_scores2, **weights) + env2 = stix2.Environment().graph_similarity(ds2, ds, prop_scores2, versioning_checks=True) assert round(env2) == 88 assert round(prop_scores2["matching_score"]) == 789 assert round(prop_scores2["len_pairs"]) == 9 @@ -1233,29 +1198,15 @@ def test_graph_similarity_with_versioning_check_on(ds2, ds): def test_graph_similarity_with_versioning_check_off(ds2, ds): - weights = { - "_internal": { - "ignore_spec_version": False, - "versioning_checks": False, - "max_depth": 1, - }, - } prop_scores1 = {} - env1 = stix2.Environment().graph_similarity(ds, ds2, prop_scores1, **weights) + env1 = stix2.Environment().graph_similarity(ds, ds2, prop_scores1) assert round(env1) == 88 assert round(prop_scores1["matching_score"]) == 789 assert round(prop_scores1["len_pairs"]) == 9 # Switching parameters - weights = { - "_internal": { - "ignore_spec_version": False, - "versioning_checks": False, - "max_depth": 1, - }, - } prop_scores2 = {} - env2 = stix2.Environment().graph_similarity(ds2, ds, prop_scores2, **weights) + env2 = stix2.Environment().graph_similarity(ds2, ds, prop_scores2) assert round(env2) == 88 assert round(prop_scores2["matching_score"]) == 789 assert round(prop_scores2["len_pairs"]) == 9 @@ -1266,26 +1217,12 @@ def test_graph_similarity_with_versioning_check_off(ds2, ds): def test_graph_equivalence_with_filesystem_source(ds, fs): - weights = { - "_internal": { - "ignore_spec_version": True, - "versioning_checks": False, - "max_depth": 1, - }, - } prop_scores1 = {} - env1 = stix2.Environment().graph_equivalence(fs, ds, prop_scores1, **weights) + env1 = stix2.Environment().graph_equivalence(fs, ds, prop_scores1, ignore_spec_version=True) # Switching parameters - weights = { - "_internal": { - "ignore_spec_version": True, - "versioning_checks": False, - "max_depth": 1, - }, - } prop_scores2 = {} - env2 = stix2.Environment().graph_equivalence(ds, fs, prop_scores2, **weights) + env2 = stix2.Environment().graph_equivalence(ds, fs, prop_scores2, ignore_spec_version=True) assert env1 is False assert round(prop_scores1["matching_score"]) == 411 @@ -1301,41 +1238,20 @@ def test_graph_equivalence_with_filesystem_source(ds, fs): def test_graph_equivalence_with_duplicate_graph(ds): - weights = { - "_internal": { - "ignore_spec_version": False, - "versioning_checks": False, - "max_depth": 1, - }, - } prop_scores = {} - env = stix2.Environment().graph_equivalence(ds, ds, prop_scores, **weights) + env = stix2.Environment().graph_equivalence(ds, ds, prop_scores) assert env is True assert round(prop_scores["matching_score"]) == 800 assert round(prop_scores["len_pairs"]) == 8 def test_graph_equivalence_with_versioning_check_on(ds2, ds): - weights = { - "_internal": { - "ignore_spec_version": False, - "versioning_checks": True, - "max_depth": 1, - }, - } prop_scores1 = {} - env1 = stix2.Environment().graph_equivalence(ds, ds2, prop_scores1, **weights) + env1 = stix2.Environment().graph_equivalence(ds, ds2, prop_scores1, versioning_checks=True) # Switching parameters - weights = { - "_internal": { - "ignore_spec_version": False, - "versioning_checks": True, - "max_depth": 1, - }, - } prop_scores2 = {} - env2 = stix2.Environment().graph_equivalence(ds2, ds, prop_scores2, **weights) + env2 = stix2.Environment().graph_equivalence(ds2, ds, prop_scores2, versioning_checks=True) assert env1 is True assert round(prop_scores1["matching_score"]) == 789 @@ -1351,26 +1267,12 @@ def test_graph_equivalence_with_versioning_check_on(ds2, ds): def test_graph_equivalence_with_versioning_check_off(ds2, ds): - weights = { - "_internal": { - "ignore_spec_version": False, - "versioning_checks": False, - "max_depth": 1, - }, - } prop_scores1 = {} - env1 = stix2.Environment().graph_equivalence(ds, ds2, prop_scores1, **weights) + env1 = stix2.Environment().graph_equivalence(ds, ds2, prop_scores1) # Switching parameters - weights = { - "_internal": { - "ignore_spec_version": False, - "versioning_checks": False, - "max_depth": 1, - }, - } prop_scores2 = {} - env2 = stix2.Environment().graph_equivalence(ds2, ds, prop_scores2, **weights) + env2 = stix2.Environment().graph_equivalence(ds2, ds, prop_scores2) assert env1 is True assert round(prop_scores1["matching_score"]) == 789 From d2e867b52ead1793a32e021f7583ec37a6232bc3 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Mon, 1 Mar 2021 12:29:33 -0500 Subject: [PATCH 07/23] docstring corrections --- .gitignore | 3 +-- stix2/environment.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 72b31cd..4d16202 100644 --- a/.gitignore +++ b/.gitignore @@ -55,8 +55,7 @@ coverage.xml # Sphinx documentation docs/_build/ .ipynb_checkpoints -graph_default_sem_eq_weights.rst -object_default_sem_eq_weights.rst +similarity_weights.rst # PyBuilder target/ diff --git a/stix2/environment.py b/stix2/environment.py index b37b485..2905b9e 100644 --- a/stix2/environment.py +++ b/stix2/environment.py @@ -228,7 +228,7 @@ class Environment(DataStoreMixin): Note: Default weight_dict: - .. include:: ../object_default_sem_eq_weights.rst + .. include:: ../similarity_weights.rst Note: This implementation follows the Semantic Equivalence Committee Note. @@ -286,7 +286,7 @@ class Environment(DataStoreMixin): Note: Default weight_dict: - .. include:: ../object_default_sem_eq_weights.rst + .. include:: ../similarity_weights.rst Note: This implementation follows the Semantic Equivalence Committee Note. From 3efa4c1ce980afd54f21d5a77471b3d8eae3c464 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Mon, 1 Mar 2021 12:44:35 -0500 Subject: [PATCH 08/23] revert part changes --- stix2/equivalence/object/__init__.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/stix2/equivalence/object/__init__.py b/stix2/equivalence/object/__init__.py index 71a263c..81bf23c 100644 --- a/stix2/equivalence/object/__init__.py +++ b/stix2/equivalence/object/__init__.py @@ -176,7 +176,7 @@ def object_similarity( elif comp_funct == list_reference_check: comp_funct = partial_list_based contributing_score = w * comp_funct(obj1[prop], obj2[prop]) - prop_scores[prop]["method"] = comp_funct.__name__ + prop_scores[prop]["check_type"] = comp_funct.__name__ else: continue # prevent excessive recursion weights["_internal"]["max_depth"] = max_depth @@ -409,10 +409,11 @@ def _versioned_checks(ref1, ref2, ds1, ds2, **weights): for object1, object2 in pairs: result = object_similarity( - object1, object2, ds1, ds2, - ignore_spec_version, versioning_checks, - max_depth, **weights - ) + object1, object2, ds1=ds1, ds2=ds2, + ignore_spec_version=ignore_spec_version, + versioning_checks=versioning_checks, + max_depth=max_depth, **weights + ) if ref1 not in results: results[ref1] = {"matched": ref2, "value": result} elif result > results[ref1]["value"]: @@ -442,9 +443,10 @@ def reference_check(ref1, ref2, ds1, ds2, **weights): o1, o2 = ds1.get(ref1), ds2.get(ref2) if o1 and o2: result = object_similarity( - o1, o2, ds1, ds2, - ignore_spec_version, versioning_checks, - max_depth, **weights + o1, o2, ds1=ds1, ds2=ds2, + ignore_spec_version=ignore_spec_version, + versioning_checks=versioning_checks, + max_depth=max_depth, **weights ) / 100.0 logger.debug( From e4e6f46089db5c30698435a0e5b692faaf33b47a Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Mon, 1 Mar 2021 12:54:01 -0500 Subject: [PATCH 09/23] change key name for _refs check --- stix2/test/v21/test_environment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stix2/test/v21/test_environment.py b/stix2/test/v21/test_environment.py index 6a14bf3..2b781f4 100644 --- a/stix2/test/v21/test_environment.py +++ b/stix2/test/v21/test_environment.py @@ -860,7 +860,7 @@ def test_non_existent_config_for_object(): r2 = stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS) prop_scores = {} assert stix2.Environment().object_similarity(r1, r2, prop_scores) == 100.0 - assert prop_scores["object_refs"]["method"] == "partial_list_based" + assert prop_scores["object_refs"]["check_type"] == "partial_list_based" def custom_semantic_equivalence_method(obj1, obj2, **weights): From 72a12e96ba23f4da68d1cc2af5fd7b60481b5e0f Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Mon, 1 Mar 2021 13:14:03 -0500 Subject: [PATCH 10/23] update guide example, conf.py and remove some unused imports --- docs/conf.py | 1 - docs/guide/equivalence.ipynb | 11 +---------- stix2/equivalence/graph/__init__.py | 4 +--- stix2/equivalence/object/__init__.py | 6 +++--- 4 files changed, 5 insertions(+), 17 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 62e829d..b6dd6ea 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -7,7 +7,6 @@ import sys from sphinx.ext.autodoc import ClassDocumenter from stix2.base import _STIXBase -from stix2.equivalence.graph import GRAPH_WEIGHTS from stix2.equivalence.object import WEIGHTS from stix2.version import __version__ diff --git a/docs/guide/equivalence.ipynb b/docs/guide/equivalence.ipynb index e61e9ed..9e9c679 100644 --- a/docs/guide/equivalence.ipynb +++ b/docs/guide/equivalence.ipynb @@ -4607,20 +4607,11 @@ " ),\n", "]\n", "\n", - "\n", - "weights = {\n", - " \"_internal\": {\n", - " \"ignore_spec_version\": False,\n", - " \"versioning_checks\": False,\n", - " \"max_depth\": 1,\n", - " },\n", - "}\n", - "\n", "memstore1 = MemoryStore(g1)\n", "memstore2 = MemoryStore(g2)\n", "prop_scores = {}\n", "\n", - "similarity_result = env.graph_similarity(memstore1, memstore2, prop_scores, **weights)\n", + "similarity_result = env.graph_similarity(memstore1, memstore2, prop_scores)\n", "equivalence_result = env.graph_equivalence(memstore1, memstore2, threshold=60)\n", "\n", "print(similarity_result)\n", diff --git a/stix2/equivalence/graph/__init__.py b/stix2/equivalence/graph/__init__.py index 1d43219..1f46fd3 100644 --- a/stix2/equivalence/graph/__init__.py +++ b/stix2/equivalence/graph/__init__.py @@ -2,9 +2,7 @@ import logging from ..object import ( - WEIGHTS, _bucket_per_type, _object_pairs, exact_match, - list_reference_check, object_similarity, partial_string_based, - partial_timestamp_based, reference_check, + WEIGHTS, _bucket_per_type, _object_pairs, object_similarity, ) logger = logging.getLogger(__name__) diff --git a/stix2/equivalence/object/__init__.py b/stix2/equivalence/object/__init__.py index 81bf23c..c9bfb34 100644 --- a/stix2/equivalence/object/__init__.py +++ b/stix2/equivalence/object/__init__.py @@ -412,8 +412,8 @@ def _versioned_checks(ref1, ref2, ds1, ds2, **weights): object1, object2, ds1=ds1, ds2=ds2, ignore_spec_version=ignore_spec_version, versioning_checks=versioning_checks, - max_depth=max_depth, **weights - ) + max_depth=max_depth, **weights, + ) if ref1 not in results: results[ref1] = {"matched": ref2, "value": result} elif result > results[ref1]["value"]: @@ -446,7 +446,7 @@ def reference_check(ref1, ref2, ds1, ds2, **weights): o1, o2, ds1=ds1, ds2=ds2, ignore_spec_version=ignore_spec_version, versioning_checks=versioning_checks, - max_depth=max_depth, **weights + max_depth=max_depth, **weights, ) / 100.0 logger.debug( From bd996b8750eb6ba6612a5ba47e03c7ee9a45adc3 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Mon, 1 Mar 2021 14:40:05 -0500 Subject: [PATCH 11/23] Update __init__.py --- stix2/equivalence/object/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/stix2/equivalence/object/__init__.py b/stix2/equivalence/object/__init__.py index c9bfb34..8dcafb6 100644 --- a/stix2/equivalence/object/__init__.py +++ b/stix2/equivalence/object/__init__.py @@ -409,10 +409,10 @@ def _versioned_checks(ref1, ref2, ds1, ds2, **weights): for object1, object2 in pairs: result = object_similarity( - object1, object2, ds1=ds1, ds2=ds2, - ignore_spec_version=ignore_spec_version, - versioning_checks=versioning_checks, - max_depth=max_depth, **weights, + object1, object2, ds1=ds1, ds2=ds2, + ignore_spec_version=ignore_spec_version, + versioning_checks=versioning_checks, + max_depth=max_depth, **weights, ) if ref1 not in results: results[ref1] = {"matched": ref2, "value": result} From fb6852b38f6a7ff47a2161a8ca0a06dfae0e18f0 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Mon, 1 Mar 2021 20:47:30 -0500 Subject: [PATCH 12/23] update test suite --- stix2/test/v20/test_datastore_taxii.py | 2 ++ stix2/test/v21/test_datastore_taxii.py | 18 ++++++++++++------ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/stix2/test/v20/test_datastore_taxii.py b/stix2/test/v20/test_datastore_taxii.py index cd051f1..32aba92 100644 --- a/stix2/test/v20/test_datastore_taxii.py +++ b/stix2/test/v20/test_datastore_taxii.py @@ -53,6 +53,8 @@ class MockTAXIICollectionEndpoint(Collection): )[0] if objs: resp = Response() + resp.status_code = 200 + resp.headers["Content-Range"] = f"items 0-{len(objs)}/{len(objs)}" resp.encoding = "utf-8" resp._content = six.ensure_binary(stix2.v20.Bundle(objects=objs).serialize(ensure_ascii=False)) return resp diff --git a/stix2/test/v21/test_datastore_taxii.py b/stix2/test/v21/test_datastore_taxii.py index 92ae6dc..7a91f6f 100644 --- a/stix2/test/v21/test_datastore_taxii.py +++ b/stix2/test/v21/test_datastore_taxii.py @@ -29,14 +29,14 @@ class MockTAXIICollectionEndpoint(Collection): self._verify_can_write() if isinstance(bundle, six.string_types): bundle = json.loads(bundle) - for object in bundle.get("objects", []): - self.objects.append(object) + for obj in bundle.get("objects", []): + self.objects.append(obj) self.manifests.append( { "date_added": get_timestamp(), - "id": object["id"], + "id": obj["id"], "media_type": "application/stix+json;version=2.1", - "version": object.get("modified", object.get("created", get_timestamp())), + "version": obj.get("modified", obj.get("created", get_timestamp())), }, ) @@ -52,7 +52,10 @@ class MockTAXIICollectionEndpoint(Collection): 100, )[0] if objs: - return stix2.v21.Bundle(objects=objs) + return { + "objects": objs, + "more": False, + } else: resp = Response() resp.status_code = 404 @@ -76,7 +79,10 @@ class MockTAXIICollectionEndpoint(Collection): else: filtered_objects = [] if filtered_objects: - return stix2.v21.Bundle(objects=filtered_objects) + return { + "objects": filtered_objects, + "more": False, + } else: resp = Response() resp.status_code = 404 From 262284444ef6aafe7bba7a38568254d1489fffae Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Wed, 10 Mar 2021 09:52:15 -0500 Subject: [PATCH 13/23] Update stix2/environment.py Co-authored-by: Chris Lenk --- stix2/environment.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stix2/environment.py b/stix2/environment.py index 2905b9e..f8624c7 100644 --- a/stix2/environment.py +++ b/stix2/environment.py @@ -201,8 +201,8 @@ class Environment(DataStoreMixin): obj2: A stix2 object instance prop_scores: A dictionary that can hold individual property scores, weights, contributing score, matching score and sum of weights. - ds1: A DataStore object instance representing your graph - ds2: A DataStore object instance representing your graph + ds1 (optional): A DataStore object instance from which to pull related objects + ds2 (optional): A DataStore object instance from which to pull related objects ignore_spec_version: A boolean indicating whether to test object types that belong to different spec versions (STIX 2.0 and STIX 2.1 for example). If set to True this check will be skipped. From 7d7c56c64b73ce02ec73be819ab8dad87b9c973a Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Wed, 10 Mar 2021 09:52:24 -0500 Subject: [PATCH 14/23] Update stix2/environment.py --- stix2/environment.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stix2/environment.py b/stix2/environment.py index f8624c7..f7c13ee 100644 --- a/stix2/environment.py +++ b/stix2/environment.py @@ -258,8 +258,8 @@ class Environment(DataStoreMixin): threshold: A numerical value between 0 and 100 to determine the minimum score to result in successfully calling both objects equivalent. This value can be tuned. - ds1: A DataStore object instance representing your graph - ds2: A DataStore object instance representing your graph + ds1 (optional): A DataStore object instance from which to pull related objects + ds2 (optional): A DataStore object instance from which to pull related objects ignore_spec_version: A boolean indicating whether to test object types that belong to different spec versions (STIX 2.0 and STIX 2.1 for example). If set to True this check will be skipped. From c9e66def60c2791b3d3b5cde038de4ef8abcc7de Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Wed, 10 Mar 2021 13:32:02 -0500 Subject: [PATCH 15/23] rename test, update the rest of the docstrings for object_similarity() and object_equivalence() --- stix2/equivalence/object/__init__.py | 14 +++++++------- stix2/test/v21/test_environment.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/stix2/equivalence/object/__init__.py b/stix2/equivalence/object/__init__.py index 8dcafb6..da2097e 100644 --- a/stix2/equivalence/object/__init__.py +++ b/stix2/equivalence/object/__init__.py @@ -4,7 +4,7 @@ import itertools import logging import time -from ...datastore import DataSink, DataSource, DataStoreMixin, Filter +from ...datastore import DataSource, DataStoreMixin, Filter from ...utils import STIXdatetime, parse_into_datetime from ..pattern import equivalent_patterns @@ -28,8 +28,8 @@ def object_equivalence( threshold: A numerical value between 0 and 100 to determine the minimum score to result in successfully calling both objects equivalent. This value can be tuned. - ds1: A DataStore object instance representing your graph - ds2: A DataStore object instance representing your graph + ds1 (optional): A DataStore object instance from which to pull related objects + ds2 (optional): A DataStore object instance from which to pull related objects ignore_spec_version: A boolean indicating whether to test object types that belong to different spec versions (STIX 2.0 and STIX 2.1 for example). If set to True this check will be skipped. @@ -85,8 +85,8 @@ def object_similarity( obj2: A stix2 object instance prop_scores: A dictionary that can hold individual property scores, weights, contributing score, matching score and sum of weights. - ds1: A DataStore object instance representing your graph - ds2: A DataStore object instance representing your graph + ds1 (optional): A DataStore object instance from which to pull related objects + ds2 (optional): A DataStore object instance from which to pull related objects ignore_spec_version: A boolean indicating whether to test object types that belong to different spec versions (STIX 2.0 and STIX 2.1 for example). If set to True this check will be skipped. @@ -500,8 +500,8 @@ def list_reference_check(refs1, refs2, ds1, ds2, **weights): def _datastore_check(ds1, ds2): if ( - issubclass(ds1.__class__, (DataStoreMixin, DataSink, DataSource)) or - issubclass(ds2.__class__, (DataStoreMixin, DataSink, DataSource)) + issubclass(ds1.__class__, (DataStoreMixin, DataSource)) or + issubclass(ds2.__class__, (DataStoreMixin, DataSource)) ): return True return False diff --git a/stix2/test/v21/test_environment.py b/stix2/test/v21/test_environment.py index 2b781f4..7f6b71c 100644 --- a/stix2/test/v21/test_environment.py +++ b/stix2/test/v21/test_environment.py @@ -855,7 +855,7 @@ def test_object_similarity_exact_match(): assert stix2.equivalence.object.exact_match(t1, t2) == 0.0 -def test_non_existent_config_for_object(): +def test_no_datastore_fallsback_list_based_check_for_refs_check(): r1 = stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS) r2 = stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS) prop_scores = {} From 34e9da805fe849de38bb25bb2b6fae50406a903a Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Fri, 12 Mar 2021 16:31:28 -0500 Subject: [PATCH 16/23] move taxii2client dependency to 2.3.0 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2fc5d70..397b98f 100644 --- a/setup.py +++ b/setup.py @@ -60,7 +60,7 @@ setup( 'Bug Tracker': 'https://github.com/oasis-open/cti-python-stix2/issues/', }, extras_require={ - 'taxii': ['taxii2-client>=2.2.1'], + 'taxii': ['taxii2-client>=2.3.0'], 'semantic': ['haversine', 'rapidfuzz'], }, ) From 7e5d31742c020c6c6007683a133a0c0c26178fb3 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Fri, 12 Mar 2021 16:46:34 -0500 Subject: [PATCH 17/23] remove `six` from tests --- stix2/test/v20/test_datastore_taxii.py | 5 ++--- stix2/test/v21/test_datastore_taxii.py | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/stix2/test/v20/test_datastore_taxii.py b/stix2/test/v20/test_datastore_taxii.py index 32aba92..075f0a3 100644 --- a/stix2/test/v20/test_datastore_taxii.py +++ b/stix2/test/v20/test_datastore_taxii.py @@ -3,7 +3,6 @@ import json from medallion.filters.basic_filter import BasicFilter import pytest from requests.models import Response -import six from taxii2client.common import _filter_kwargs_to_query_params from taxii2client.v20 import MEDIA_TYPE_STIX_V20, Collection @@ -27,7 +26,7 @@ class MockTAXIICollectionEndpoint(Collection): def add_objects(self, bundle): self._verify_can_write() - if isinstance(bundle, six.string_types): + if isinstance(bundle, str): bundle = json.loads(bundle) for object in bundle.get("objects", []): self.objects.append(object) @@ -56,7 +55,7 @@ class MockTAXIICollectionEndpoint(Collection): resp.status_code = 200 resp.headers["Content-Range"] = f"items 0-{len(objs)}/{len(objs)}" resp.encoding = "utf-8" - resp._content = six.ensure_binary(stix2.v20.Bundle(objects=objs).serialize(ensure_ascii=False)) + resp._content = bytes(stix2.v20.Bundle(objects=objs).serialize(ensure_ascii=False), resp.encoding) return resp else: resp = Response() diff --git a/stix2/test/v21/test_datastore_taxii.py b/stix2/test/v21/test_datastore_taxii.py index 7a91f6f..62ddadc 100644 --- a/stix2/test/v21/test_datastore_taxii.py +++ b/stix2/test/v21/test_datastore_taxii.py @@ -3,7 +3,6 @@ import json from medallion.filters.basic_filter import BasicFilter import pytest from requests.models import Response -import six from taxii2client.common import _filter_kwargs_to_query_params from taxii2client.v21 import Collection @@ -27,7 +26,7 @@ class MockTAXIICollectionEndpoint(Collection): def add_objects(self, bundle): self._verify_can_write() - if isinstance(bundle, six.string_types): + if isinstance(bundle, str): bundle = json.loads(bundle) for obj in bundle.get("objects", []): self.objects.append(obj) From f7ebd34c8c9c2331dea5fe80389e6bfbff9fe124 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Fri, 12 Mar 2021 16:49:23 -0500 Subject: [PATCH 18/23] Update __init__.py fix incident entry --- stix2/equivalence/object/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stix2/equivalence/object/__init__.py b/stix2/equivalence/object/__init__.py index da2097e..dde52ec 100644 --- a/stix2/equivalence/object/__init__.py +++ b/stix2/equivalence/object/__init__.py @@ -559,8 +559,8 @@ WEIGHTS = { "sectors": (20, partial_list_based), }, "incident": { - "name": (60, partial_string_based), - "external_references": (40, partial_external_reference_based), + "name": (30, partial_string_based), + "external_references": (70, partial_external_reference_based), }, "indicator": { "indicator_types": (15, partial_list_based), From 827f622c045d7350d9dbbbc36f9899f692c51e43 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Wed, 17 Mar 2021 15:01:49 -0400 Subject: [PATCH 19/23] provide ability to stream STIX output to fp ``.write()`` file-like object --- stix2/base.py | 32 ++++++++++++++++++++++++++++- stix2/serialization.py | 38 +++++++++++++++++++++++++++++++++++ stix2/test/v20/test_bundle.py | 22 ++++++++++++++++++++ stix2/test/v21/test_bundle.py | 22 ++++++++++++++++++++ 4 files changed, 113 insertions(+), 1 deletion(-) diff --git a/stix2/base.py b/stix2/base.py index b158f06..03d8ea4 100644 --- a/stix2/base.py +++ b/stix2/base.py @@ -17,7 +17,8 @@ from .exceptions import ( from .markings import _MarkingsMixin from .markings.utils import validate from .serialization import ( - STIXJSONEncoder, STIXJSONIncludeOptionalDefaultsEncoder, serialize, + STIXJSONEncoder, STIXJSONIncludeOptionalDefaultsEncoder, fp_serialize, + serialize, ) from .utils import NOW, PREFIX_21_REGEX, get_timestamp from .versioning import new_version as _new_version @@ -260,6 +261,35 @@ class _STIXBase(Mapping): """ return serialize(self, *args, **kwargs) + def fp_serialize(self, *args, **kwargs): + """ + Serialize a STIX object to a file-like supporting object. + + Examples: + >>> import stix2 + >>> identity = stix2.Identity(name='Example Corp.', identity_class='organization') + >>> print(identity.serialize(sort_keys=True)) + {"created": "2018-06-08T19:03:54.066Z", ... "name": "Example Corp.", "type": "identity"} + >>> print(identity.serialize(sort_keys=True, indent=4)) + { + "created": "2018-06-08T19:03:54.066Z", + "id": "identity--d7f3e25a-ba1c-447a-ab71-6434b092b05e", + "identity_class": "organization", + "modified": "2018-06-08T19:03:54.066Z", + "name": "Example Corp.", + "type": "identity" + } + >>> with open("example.json", mode="w", encoding="utf-8") as f: + >>> identity.fp_serialize(f, pretty=True) + + Returns: + None + + See Also: + ``stix2.serialization.fp_serialize`` for options. + """ + fp_serialize(self, *args, **kwargs) + class _DomainObject(_STIXBase, _MarkingsMixin): pass diff --git a/stix2/serialization.py b/stix2/serialization.py index 7488eb5..8822f33 100644 --- a/stix2/serialization.py +++ b/stix2/serialization.py @@ -85,6 +85,44 @@ def serialize(obj, pretty=False, include_optional_defaults=False, **kwargs): return json.dumps(obj, cls=STIXJSONEncoder, **kwargs) +def fp_serialize(obj, fp, pretty=False, include_optional_defaults=False, **kwargs): + """ + Serialize a STIX object as a stream to file-like supporting object. + + Args: + obj: The STIX object to be serialized. + fp: A ``.write()``-supporting file-like object. + pretty (bool): If True, output properties following the STIX specs + formatting. This includes indentation. Refer to notes for more + details. (Default: ``False``) + include_optional_defaults (bool): Determines whether to include + optional properties set to the default value defined in the spec. + **kwargs: The arguments for a json.dumps() call. + + Returns: + None + + Note: + The argument ``pretty=True`` will output the STIX object following + spec order. Using this argument greatly impacts object serialization + performance. If your use case is centered across machine-to-machine + operation it is recommended to set ``pretty=False``. + + When ``pretty=True`` the following key-value pairs will be added or + overridden: indent=4, separators=(",", ": "), item_sort_key=sort_by. + """ + if pretty: + def sort_by(element): + return find_property_index(obj, *element) + + kwargs.update({'indent': 4, 'separators': (',', ': '), 'item_sort_key': sort_by}) + + if include_optional_defaults: + json.dump(obj, fp, cls=STIXJSONIncludeOptionalDefaultsEncoder, **kwargs) + else: + json.dump(obj, fp, cls=STIXJSONEncoder, **kwargs) + + def _find(seq, val): """ Search sequence 'seq' for val. This behaves like str.find(): if not found, diff --git a/stix2/test/v20/test_bundle.py b/stix2/test/v20/test_bundle.py index f53d0cb..fed91e1 100644 --- a/stix2/test/v20/test_bundle.py +++ b/stix2/test/v20/test_bundle.py @@ -1,3 +1,4 @@ +import io import json import pytest @@ -113,6 +114,27 @@ def test_bundle_id_must_start_with_bundle(): assert str(excinfo.value) == "Invalid value for Bundle 'id': must start with 'bundle--'." +def test_create_bundle_fp_serialize_true(indicator, malware, relationship): + bundle = stix2.v20.Bundle(objects=[indicator, malware, relationship]) + buffer = io.StringIO() + + bundle.fp_serialize(buffer, pretty=True) + + assert str(bundle) == EXPECTED_BUNDLE + assert bundle.serialize(pretty=True) == EXPECTED_BUNDLE + assert buffer.getvalue() == EXPECTED_BUNDLE + + +def test_create_bundle_fp_serialize_false(indicator, malware, relationship): + bundle = stix2.v20.Bundle(objects=[indicator, malware, relationship]) + buffer = io.StringIO() + + bundle.fp_serialize(buffer, sort_keys=True) + + assert bundle.serialize(sort_keys=True) == json.dumps(json.loads(EXPECTED_BUNDLE), sort_keys=True) + assert buffer.getvalue() == json.dumps(json.loads(EXPECTED_BUNDLE), sort_keys=True) + + def test_create_bundle1(indicator, malware, relationship): bundle = stix2.v20.Bundle(objects=[indicator, malware, relationship]) diff --git a/stix2/test/v21/test_bundle.py b/stix2/test/v21/test_bundle.py index 4e30c84..07014c6 100644 --- a/stix2/test/v21/test_bundle.py +++ b/stix2/test/v21/test_bundle.py @@ -1,3 +1,4 @@ +import io import json import pytest @@ -123,6 +124,27 @@ def test_bundle_id_must_start_with_bundle(): assert str(excinfo.value) == "Invalid value for Bundle 'id': must start with 'bundle--'." +def test_create_bundle_fp_serialize_true(indicator, malware, relationship): + bundle = stix2.v21.Bundle(objects=[indicator, malware, relationship]) + buffer = io.StringIO() + + bundle.fp_serialize(buffer, pretty=True) + + assert str(bundle) == EXPECTED_BUNDLE + assert bundle.serialize(pretty=True) == EXPECTED_BUNDLE + assert buffer.getvalue() == EXPECTED_BUNDLE + + +def test_create_bundle_fp_serialize_false(indicator, malware, relationship): + bundle = stix2.v21.Bundle(objects=[indicator, malware, relationship]) + buffer = io.StringIO() + + bundle.fp_serialize(buffer, sort_keys=True) + + assert bundle.serialize(sort_keys=True) == json.dumps(json.loads(EXPECTED_BUNDLE), sort_keys=True) + assert buffer.getvalue() == json.dumps(json.loads(EXPECTED_BUNDLE), sort_keys=True) + + def test_create_bundle1(indicator, malware, relationship): bundle = stix2.v21.Bundle(objects=[indicator, malware, relationship]) From 2ea9c0c63c1e40e9ecbf78dddf296e2bec28992a Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Wed, 17 Mar 2021 15:15:26 -0400 Subject: [PATCH 20/23] use it on filesystem.py data sink --- stix2/datastore/filesystem.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/stix2/datastore/filesystem.py b/stix2/datastore/filesystem.py index d844115..2209116 100644 --- a/stix2/datastore/filesystem.py +++ b/stix2/datastore/filesystem.py @@ -13,7 +13,7 @@ from stix2.datastore import ( ) from stix2.datastore.filters import Filter, FilterSet, apply_common_filters from stix2.parsing import parse -from stix2.serialization import serialize +from stix2.serialization import fp_serialize from stix2.utils import format_datetime, get_type_from_id, parse_into_datetime @@ -584,9 +584,8 @@ class FileSystemSink(DataSink): if os.path.isfile(file_path): raise DataSourceError("Attempted to overwrite file (!) at: {}".format(file_path)) - with io.open(file_path, 'w', encoding=encoding) as f: - stix_obj = serialize(stix_obj, pretty=True, encoding=encoding, ensure_ascii=False) - f.write(stix_obj) + with io.open(file_path, mode='w', encoding=encoding) as f: + fp_serialize(stix_obj, f, pretty=True, encoding=encoding, ensure_ascii=False) def add(self, stix_data=None, version=None): """Add STIX objects to file directory. From 922de111ed965bb0f1b24c78d77a4e9dccacc7d8 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Thu, 18 Mar 2021 10:14:36 -0400 Subject: [PATCH 21/23] minor tweaks to docstrings --- stix2/base.py | 2 +- stix2/serialization.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/stix2/base.py b/stix2/base.py index 03d8ea4..2c48ef6 100644 --- a/stix2/base.py +++ b/stix2/base.py @@ -263,7 +263,7 @@ class _STIXBase(Mapping): def fp_serialize(self, *args, **kwargs): """ - Serialize a STIX object to a file-like supporting object. + Serialize a STIX object to ``fp`` (a text stream file-like supporting object). Examples: >>> import stix2 diff --git a/stix2/serialization.py b/stix2/serialization.py index 8822f33..660bba6 100644 --- a/stix2/serialization.py +++ b/stix2/serialization.py @@ -87,11 +87,11 @@ def serialize(obj, pretty=False, include_optional_defaults=False, **kwargs): def fp_serialize(obj, fp, pretty=False, include_optional_defaults=False, **kwargs): """ - Serialize a STIX object as a stream to file-like supporting object. + Serialize a STIX object to ``fp`` (a text stream file-like supporting object). Args: obj: The STIX object to be serialized. - fp: A ``.write()``-supporting file-like object. + fp: A text stream file-like object supporting ``.write()``. pretty (bool): If True, output properties following the STIX specs formatting. This includes indentation. Refer to notes for more details. (Default: ``False``) From c2d360d22345475ee39ca642735c4a2e338f8b50 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Thu, 18 Mar 2021 18:08:31 -0400 Subject: [PATCH 22/23] apply fp_serialize() changes on main serialize() method --- stix2/serialization.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/stix2/serialization.py b/stix2/serialization.py index 660bba6..2784d39 100644 --- a/stix2/serialization.py +++ b/stix2/serialization.py @@ -2,6 +2,7 @@ import copy import datetime as dt +import io import simplejson as json @@ -73,16 +74,9 @@ def serialize(obj, pretty=False, include_optional_defaults=False, **kwargs): When ``pretty=True`` the following key-value pairs will be added or overridden: indent=4, separators=(",", ": "), item_sort_key=sort_by. """ - if pretty: - def sort_by(element): - return find_property_index(obj, *element) - - kwargs.update({'indent': 4, 'separators': (',', ': '), 'item_sort_key': sort_by}) - - if include_optional_defaults: - return json.dumps(obj, cls=STIXJSONIncludeOptionalDefaultsEncoder, **kwargs) - else: - return json.dumps(obj, cls=STIXJSONEncoder, **kwargs) + with io.StringIO() as fp: + fp_serialize(obj, fp, pretty, include_optional_defaults, **kwargs) + return fp.getvalue() def fp_serialize(obj, fp, pretty=False, include_optional_defaults=False, **kwargs): From 19196654c5137def1dada0b3b4a8c95d5842b322 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Fri, 19 Mar 2021 15:31:01 -0400 Subject: [PATCH 23/23] Apply suggestions from code review Co-authored-by: Chris Lenk --- stix2/test/v20/test_bundle.py | 4 ++-- stix2/test/v21/test_bundle.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/stix2/test/v20/test_bundle.py b/stix2/test/v20/test_bundle.py index fed91e1..ac5d239 100644 --- a/stix2/test/v20/test_bundle.py +++ b/stix2/test/v20/test_bundle.py @@ -114,7 +114,7 @@ def test_bundle_id_must_start_with_bundle(): assert str(excinfo.value) == "Invalid value for Bundle 'id': must start with 'bundle--'." -def test_create_bundle_fp_serialize_true(indicator, malware, relationship): +def test_create_bundle_fp_serialize_pretty(indicator, malware, relationship): bundle = stix2.v20.Bundle(objects=[indicator, malware, relationship]) buffer = io.StringIO() @@ -125,7 +125,7 @@ def test_create_bundle_fp_serialize_true(indicator, malware, relationship): assert buffer.getvalue() == EXPECTED_BUNDLE -def test_create_bundle_fp_serialize_false(indicator, malware, relationship): +def test_create_bundle_fp_serialize_nonpretty(indicator, malware, relationship): bundle = stix2.v20.Bundle(objects=[indicator, malware, relationship]) buffer = io.StringIO() diff --git a/stix2/test/v21/test_bundle.py b/stix2/test/v21/test_bundle.py index 07014c6..1cf30d0 100644 --- a/stix2/test/v21/test_bundle.py +++ b/stix2/test/v21/test_bundle.py @@ -124,7 +124,7 @@ def test_bundle_id_must_start_with_bundle(): assert str(excinfo.value) == "Invalid value for Bundle 'id': must start with 'bundle--'." -def test_create_bundle_fp_serialize_true(indicator, malware, relationship): +def test_create_bundle_fp_serialize_pretty(indicator, malware, relationship): bundle = stix2.v21.Bundle(objects=[indicator, malware, relationship]) buffer = io.StringIO() @@ -135,7 +135,7 @@ def test_create_bundle_fp_serialize_true(indicator, malware, relationship): assert buffer.getvalue() == EXPECTED_BUNDLE -def test_create_bundle_fp_serialize_false(indicator, malware, relationship): +def test_create_bundle_fp_serialize_nonpretty(indicator, malware, relationship): bundle = stix2.v21.Bundle(objects=[indicator, malware, relationship]) buffer = io.StringIO()