From 489970718f038a67011f468c3649225971689415 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Mon, 1 Feb 2021 22:35:37 -0500 Subject: [PATCH 01/19] WIP: changes to graph_similarity busted main loop, symmetrical properties not present --- stix2/environment.py | 12 ++--- stix2/equivalence/graph/__init__.py | 73 +++++++++++++++---------- stix2/equivalence/object/__init__.py | 6 +-- stix2/test/v21/test_environment.py | 79 +++++++++++++++------------- 4 files changed, 97 insertions(+), 73 deletions(-) diff --git a/stix2/environment.py b/stix2/environment.py index 4dc6ff0..bc7fcaf 100644 --- a/stix2/environment.py +++ b/stix2/environment.py @@ -2,12 +2,12 @@ import copy from .datastore import CompositeDataSource, DataStoreMixin -from .equivalence.graph import graphically_equivalent +from .equivalence.graph import graph_similarity from .equivalence.object import ( # noqa: F401 WEIGHTS, check_property_present, custom_pattern_based, exact_match, list_reference_check, partial_external_reference_based, partial_list_based, partial_location_distance, partial_string_based, partial_timestamp_based, - reference_check, semantically_equivalent, + reference_check, object_similarity, ) from .parsing import parse as _parse @@ -197,7 +197,7 @@ class Environment(DataStoreMixin): return None @staticmethod - def semantically_equivalent(obj1, obj2, prop_scores={}, **weight_dict): + def object_similarity(obj1, obj2, prop_scores={}, **weight_dict): """This method verifies if two objects of the same type are semantically equivalent. @@ -229,10 +229,10 @@ class Environment(DataStoreMixin): see `the Committee Note `__. """ - return semantically_equivalent(obj1, obj2, prop_scores, **weight_dict) + return object_similarity(obj1, obj2, prop_scores, **weight_dict) @staticmethod - def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict): + def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): """This method verifies if two graphs are semantically equivalent. Each DataStore can contain a connected or disconnected graph and the final result is weighted over the amount of objects we managed to compare. @@ -267,4 +267,4 @@ class Environment(DataStoreMixin): see `the Committee Note `__. """ - return graphically_equivalent(ds1, ds2, prop_scores, **weight_dict) + return graph_similarity(ds1, ds2, prop_scores, **weight_dict) diff --git a/stix2/equivalence/graph/__init__.py b/stix2/equivalence/graph/__init__.py index 680f42f..cff99d0 100644 --- a/stix2/equivalence/graph/__init__.py +++ b/stix2/equivalence/graph/__init__.py @@ -1,15 +1,17 @@ """Python APIs for STIX 2 Graph-based Semantic Equivalence.""" +import collections +import itertools import logging from ..object import ( WEIGHTS, exact_match, list_reference_check, partial_string_based, - partial_timestamp_based, reference_check, semantically_equivalent, + partial_timestamp_based, reference_check, object_similarity, ) logger = logging.getLogger(__name__) -def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict): +def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): """This method verifies if two graphs are semantically equivalent. Each DataStore can contain a connected or disconnected graph and the final result is weighted over the amount of objects we managed to compare. @@ -44,49 +46,48 @@ def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict): see `the Committee Note `__. 
""" + results = {} + equivalence_score = 0 weights = GRAPH_WEIGHTS.copy() if weight_dict: weights.update(weight_dict) - results = {} depth = weights["_internal"]["max_depth"] - graph1 = ds1.query([]) - graph2 = ds2.query([]) + graph1 = bucket_per_type(ds1.query([])) + graph2 = bucket_per_type(ds2.query([])) + pairs = object_pairs(graph1, graph2, weights) - graph1.sort(key=lambda x: x["type"]) - graph2.sort(key=lambda x: x["type"]) - - if len(graph1) < len(graph2): + for object1, object2 in pairs: + iprop_score1 = {} + iprop_score2 = {} + object1_id = object1["id"] + object2_id = object2["id"] + weights["_internal"]["max_depth"] = depth weights["_internal"]["ds1"] = ds1 weights["_internal"]["ds2"] = ds2 - g1 = graph1 - g2 = graph2 - else: + result1 = object_similarity(object1, object2, iprop_score1, **weights) + weights["_internal"]["ds1"] = ds2 weights["_internal"]["ds2"] = ds1 - g1 = graph2 - g2 = graph1 + result2 = object_similarity(object2, object1, iprop_score2, **weights) - for object1 in g1: - for object2 in g2: - if object1["type"] == object2["type"] and object1["type"] in weights: - iprop_score = {} - result = semantically_equivalent(object1, object2, iprop_score, **weights) - objects1_id = object1["id"] - weights["_internal"]["max_depth"] = depth + if object1_id not in results: + results[object1_id] = {"lhs": object1["id"], "rhs": object2["id"], "prop_score": iprop_score1, "value": result1} + elif result1 > results[object1_id]["value"]: + results[object1_id] = {"lhs": object1["id"], "rhs": object2["id"], "prop_score": iprop_score1, "value": result1} - if objects1_id not in results: - results[objects1_id] = {"matched": object2["id"], "prop_score": iprop_score, "value": result} - elif result > results[objects1_id]["value"]: - results[objects1_id] = {"matched": object2["id"], "prop_score": iprop_score, "value": result} + if object2_id not in results: + results[object2_id] = {"lhs": object2["id"], "rhs": object1["id"], "prop_score": iprop_score2, "value": result2} + elif result1 > results[object2_id]["value"]: + results[object2_id] = {"lhs": object2["id"], "rhs": object1["id"], "prop_score": iprop_score2, "value": result2} - equivalence_score = 0 matching_score = sum(x["value"] for x in results.values()) - sum_weights = len(results) * 100.0 + sum_weights = len(results) if sum_weights > 0: - equivalence_score = (matching_score / sum_weights) * 100 + equivalence_score = matching_score / sum_weights + prop_scores["matching_score"] = matching_score prop_scores["sum_weights"] = sum_weights prop_scores["summary"] = results @@ -100,6 +101,22 @@ def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict): return equivalence_score +def bucket_per_type(g): + buckets = collections.defaultdict(list) + [buckets[obj["type"]].append(obj) for obj in g] + return buckets + + +def object_pairs(g1, g2, w): + types_in_common = set(g1.keys()).intersection(g2.keys()) + testable_types = types_in_common.intersection(w.keys()) + + return itertools.chain.from_iterable( + itertools.product(g1[stix_type], g2[stix_type]) + for stix_type in testable_types + ) + + # default weights used for the graph semantic equivalence process GRAPH_WEIGHTS = WEIGHTS.copy() GRAPH_WEIGHTS.update({ diff --git a/stix2/equivalence/object/__init__.py b/stix2/equivalence/object/__init__.py index 0225788..8b1ceaa 100644 --- a/stix2/equivalence/object/__init__.py +++ b/stix2/equivalence/object/__init__.py @@ -9,7 +9,7 @@ from ..pattern import equivalent_patterns logger = logging.getLogger(__name__) -def 
semantically_equivalent(obj1, obj2, prop_scores={}, **weight_dict): +def object_similarity(obj1, obj2, prop_scores={}, **weight_dict): """This method verifies if two objects of the same type are semantically equivalent. @@ -312,7 +312,7 @@ def _versioned_checks(ref1, ref2, ds1, ds2, **weights): if len(objects1) > 0 and len(objects2) > 0: for o1 in objects1: for o2 in objects2: - result = semantically_equivalent(o1, o2, **weights) + result = object_similarity(o1, o2, **weights) if ref1 not in results: results[ref1] = {"matched": ref2, "value": result} elif result > results[ref1]["value"]: @@ -337,7 +337,7 @@ def reference_check(ref1, ref2, ds1, ds2, **weights): else: o1, o2 = ds1.get(ref1), ds2.get(ref2) if o1 and o2: - result = semantically_equivalent(o1, o2, **weights) / 100.0 + result = object_similarity(o1, o2, **weights) / 100.0 logger.debug( "--\t\treference_check '%s' '%s'\tresult: '%s'", diff --git a/stix2/test/v21/test_environment.py b/stix2/test/v21/test_environment.py index 0da01d1..5682ad1 100644 --- a/stix2/test/v21/test_environment.py +++ b/stix2/test/v21/test_environment.py @@ -429,7 +429,7 @@ def test_related_to_by_target(ds): def test_semantic_equivalence_on_same_attack_pattern1(): ap1 = stix2.v21.AttackPattern(id=ATTACK_PATTERN_ID, **ATTACK_PATTERN_KWARGS) ap2 = stix2.v21.AttackPattern(id=ATTACK_PATTERN_ID, **ATTACK_PATTERN_KWARGS) - env = stix2.Environment().semantically_equivalent(ap1, ap2) + env = stix2.Environment().object_similarity(ap1, ap2) assert round(env) == 100 @@ -445,14 +445,14 @@ def test_semantic_equivalence_on_same_attack_pattern2(): ) ap1 = stix2.v21.AttackPattern(id=ATTACK_PATTERN_ID, **ATTACK_KWARGS) ap2 = stix2.v21.AttackPattern(id=ATTACK_PATTERN_ID, **ATTACK_KWARGS) - env = stix2.Environment().semantically_equivalent(ap1, ap2) + env = stix2.Environment().object_similarity(ap1, ap2) assert round(env) == 100 def test_semantic_equivalence_on_same_campaign1(): camp1 = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMPAIGN_KWARGS) camp2 = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMPAIGN_KWARGS) - env = stix2.Environment().semantically_equivalent(camp1, camp2) + env = stix2.Environment().object_similarity(camp1, camp2) assert round(env) == 100 @@ -464,14 +464,14 @@ def test_semantic_equivalence_on_same_campaign2(): ) camp1 = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMP_KWARGS) camp2 = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMP_KWARGS) - env = stix2.Environment().semantically_equivalent(camp1, camp2) + env = stix2.Environment().object_similarity(camp1, camp2) assert round(env) == 100 def test_semantic_equivalence_on_same_identity1(): iden1 = stix2.v21.Identity(id=IDENTITY_ID, **IDENTITY_KWARGS) iden2 = stix2.v21.Identity(id=IDENTITY_ID, **IDENTITY_KWARGS) - env = stix2.Environment().semantically_equivalent(iden1, iden2) + env = stix2.Environment().object_similarity(iden1, iden2) assert round(env) == 100 @@ -483,14 +483,14 @@ def test_semantic_equivalence_on_same_identity2(): ) iden1 = stix2.v21.Identity(id=IDENTITY_ID, **IDEN_KWARGS) iden2 = stix2.v21.Identity(id=IDENTITY_ID, **IDEN_KWARGS) - env = stix2.Environment().semantically_equivalent(iden1, iden2) + env = stix2.Environment().object_similarity(iden1, iden2) assert round(env) == 100 def test_semantic_equivalence_on_same_indicator(): ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) ind2 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) - env = stix2.Environment().semantically_equivalent(ind1, ind2) + env = stix2.Environment().object_similarity(ind1, ind2) assert round(env) == 100 @@ -498,7 +498,7 
@@ def test_semantic_equivalence_on_same_location1(): location_kwargs = dict(latitude=45, longitude=179) loc1 = stix2.v21.Location(id=LOCATION_ID, **location_kwargs) loc2 = stix2.v21.Location(id=LOCATION_ID, **location_kwargs) - env = stix2.Environment().semantically_equivalent(loc1, loc2) + env = stix2.Environment().object_similarity(loc1, loc2) assert round(env) == 100 @@ -511,7 +511,7 @@ def test_semantic_equivalence_on_same_location2(): ) loc1 = stix2.v21.Location(id=LOCATION_ID, **location_kwargs) loc2 = stix2.v21.Location(id=LOCATION_ID, **location_kwargs) - env = stix2.Environment().semantically_equivalent(loc1, loc2) + env = stix2.Environment().object_similarity(loc1, loc2) assert round(env) == 100 @@ -519,21 +519,21 @@ def test_semantic_equivalence_location_with_no_latlong(): loc_kwargs = dict(country="US", administrative_area="US-DC") loc1 = stix2.v21.Location(id=LOCATION_ID, **LOCATION_KWARGS) loc2 = stix2.v21.Location(id=LOCATION_ID, **loc_kwargs) - env = stix2.Environment().semantically_equivalent(loc1, loc2) + env = stix2.Environment().object_similarity(loc1, loc2) assert round(env) != 100 def test_semantic_equivalence_on_same_malware(): malw1 = stix2.v21.Malware(id=MALWARE_ID, **MALWARE_KWARGS) malw2 = stix2.v21.Malware(id=MALWARE_ID, **MALWARE_KWARGS) - env = stix2.Environment().semantically_equivalent(malw1, malw2) + env = stix2.Environment().object_similarity(malw1, malw2) assert round(env) == 100 def test_semantic_equivalence_on_same_threat_actor1(): ta1 = stix2.v21.ThreatActor(id=THREAT_ACTOR_ID, **THREAT_ACTOR_KWARGS) ta2 = stix2.v21.ThreatActor(id=THREAT_ACTOR_ID, **THREAT_ACTOR_KWARGS) - env = stix2.Environment().semantically_equivalent(ta1, ta2) + env = stix2.Environment().object_similarity(ta1, ta2) assert round(env) == 100 @@ -545,21 +545,21 @@ def test_semantic_equivalence_on_same_threat_actor2(): ) ta1 = stix2.v21.ThreatActor(id=THREAT_ACTOR_ID, **THREAT_KWARGS) ta2 = stix2.v21.ThreatActor(id=THREAT_ACTOR_ID, **THREAT_KWARGS) - env = stix2.Environment().semantically_equivalent(ta1, ta2) + env = stix2.Environment().object_similarity(ta1, ta2) assert round(env) == 100 def test_semantic_equivalence_on_same_tool(): tool1 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS) tool2 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS) - env = stix2.Environment().semantically_equivalent(tool1, tool2) + env = stix2.Environment().object_similarity(tool1, tool2) assert round(env) == 100 def test_semantic_equivalence_on_same_vulnerability1(): vul1 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULNERABILITY_KWARGS) vul2 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULNERABILITY_KWARGS) - env = stix2.Environment().semantically_equivalent(vul1, vul2) + env = stix2.Environment().object_similarity(vul1, vul2) assert round(env) == 100 @@ -584,7 +584,7 @@ def test_semantic_equivalence_on_same_vulnerability2(): ) vul1 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULN_KWARGS1) vul2 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULN_KWARGS2) - env = stix2.Environment().semantically_equivalent(vul1, vul2) + env = stix2.Environment().object_similarity(vul1, vul2) assert round(env) == 0.0 @@ -640,7 +640,7 @@ def test_semantic_equivalence_on_unknown_object(): } cust1 = stix2.parse(CUSTOM_KWARGS1, allow_custom=True) cust2 = stix2.parse(CUSTOM_KWARGS2, allow_custom=True) - env = stix2.Environment().semantically_equivalent(cust1, cust2, **weights) + env = stix2.Environment().object_similarity(cust1, cust2, **weights) assert round(env) == 0 @@ -648,7 +648,7 @@ def 
test_semantic_equivalence_different_type_raises(): with pytest.raises(ValueError) as excinfo: vul1 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULNERABILITY_KWARGS) ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) - stix2.Environment().semantically_equivalent(vul1, ind1) + stix2.Environment().object_similarity(vul1, ind1) assert str(excinfo.value) == "The objects to compare must be of the same type!" @@ -661,7 +661,7 @@ def test_semantic_equivalence_different_spec_version_raises(): ) ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) ind2 = stix2.v20.Indicator(id=INDICATOR_ID, **V20_KWARGS) - stix2.Environment().semantically_equivalent(ind1, ind2) + stix2.Environment().object_similarity(ind1, ind2) assert str(excinfo.value) == "The objects to compare must be of the same spec version!" @@ -686,7 +686,7 @@ def test_semantic_equivalence_zero_match(): } ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) ind2 = stix2.v21.Indicator(id=INDICATOR_ID, **IND_KWARGS) - env = stix2.Environment().semantically_equivalent(ind1, ind2, **weights) + env = stix2.Environment().object_similarity(ind1, ind2, **weights) assert round(env) == 0 @@ -708,7 +708,7 @@ def test_semantic_equivalence_different_spec_version(): } ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) ind2 = stix2.v20.Indicator(id=INDICATOR_ID, **IND_KWARGS) - env = stix2.Environment().semantically_equivalent(ind1, ind2, **weights) + env = stix2.Environment().object_similarity(ind1, ind2, **weights) assert round(env) == 0 @@ -800,7 +800,7 @@ def test_semantic_equivalence_exact_match(): def test_non_existent_config_for_object(): r1 = stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS) r2 = stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS) - assert stix2.Environment().semantically_equivalent(r1, r2) == 0.0 + assert stix2.Environment().object_similarity(r1, r2) == 0.0 def custom_semantic_equivalence_method(obj1, obj2, **weights): @@ -824,7 +824,7 @@ def test_semantic_equivalence_method_provided(): tool1 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS) tool2 = stix2.v21.Tool(id=TOOL_ID, **TOOL2_KWARGS) - env = stix2.Environment().semantically_equivalent(tool1, tool2, **weights) + env = stix2.Environment().object_similarity(tool1, tool2, **weights) assert round(env) == 96 @@ -838,7 +838,7 @@ def test_semantic_equivalence_prop_scores(): tool1 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS) tool2 = stix2.v21.Tool(id=TOOL_ID, **TOOL2_KWARGS) - stix2.Environment().semantically_equivalent(tool1, tool2, prop_scores) + stix2.Environment().object_similarity(tool1, tool2, prop_scores) assert len(prop_scores) == 4 assert round(prop_scores["matching_score"], 1) == 8.9 assert round(prop_scores["sum_weights"], 1) == 100.0 @@ -868,7 +868,7 @@ def test_semantic_equivalence_prop_scores_method_provided(): tool1 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS) tool2 = stix2.v21.Tool(id=TOOL_ID, **TOOL2_KWARGS) - env = stix2.Environment().semantically_equivalent(tool1, tool2, prop_scores, **weights) + env = stix2.Environment().object_similarity(tool1, tool2, prop_scores, **weights) assert round(env) == 96 assert len(prop_scores) == 2 assert prop_scores["matching_score"] == 96.0 @@ -964,12 +964,19 @@ def test_graph_equivalence_with_filesystem_source(ds): "max_depth": 1, }, } - prop_scores = {} + prop_scores1 = {} + prop_scores2 = {} fs = stix2.FileSystemSource(FS_PATH) - env = stix2.Environment().graphically_equivalent(fs, ds, prop_scores, **weights) - assert round(env) == 24 - assert round(prop_scores["matching_score"]) == 122 - 
assert round(prop_scores["sum_weights"]) == 500 + env = stix2.Environment().graph_similarity(fs, ds, prop_scores1, **weights) + assert round(env) == 26 + assert round(prop_scores1["matching_score"]) == 460 + assert round(prop_scores1["sum_weights"]) == 18 + + env = stix2.Environment().graph_similarity(ds, fs, prop_scores2, **weights) + assert round(env) == 47 + assert round(prop_scores2["matching_score"]) == 852 + assert round(prop_scores2["sum_weights"]) == 18 + assert prop_scores1 == prop_scores2 def test_graph_equivalence_with_duplicate_graph(ds): @@ -981,10 +988,10 @@ def test_graph_equivalence_with_duplicate_graph(ds): }, } prop_scores = {} - env = stix2.Environment().graphically_equivalent(ds, ds, prop_scores, **weights) + env = stix2.Environment().graph_similarity(ds, ds, prop_scores, **weights) assert round(env) == 100 assert round(prop_scores["matching_score"]) == 800 - assert round(prop_scores["sum_weights"]) == 800 + assert round(prop_scores["sum_weights"]) == 8 def test_graph_equivalence_with_versioning_check_on(ds2, ds): @@ -996,10 +1003,10 @@ def test_graph_equivalence_with_versioning_check_on(ds2, ds): }, } prop_scores = {} - env = stix2.Environment().graphically_equivalent(ds, ds2, prop_scores, **weights) + env = stix2.Environment().graph_similarity(ds, ds2, prop_scores, **weights) assert round(env) == 93 assert round(prop_scores["matching_score"]) == 745 - assert round(prop_scores["sum_weights"]) == 800 + assert round(prop_scores["sum_weights"]) == 8 def test_graph_equivalence_with_versioning_check_off(ds2, ds): @@ -1011,7 +1018,7 @@ def test_graph_equivalence_with_versioning_check_off(ds2, ds): }, } prop_scores = {} - env = stix2.Environment().graphically_equivalent(ds, ds2, prop_scores, **weights) + env = stix2.Environment().graph_similarity(ds, ds2, prop_scores, **weights) assert round(env) == 93 assert round(prop_scores["matching_score"]) == 745 - assert round(prop_scores["sum_weights"]) == 800 + assert round(prop_scores["sum_weights"]) == 8 From 02b076b3bb60e1c8fd659d56010780acc32e4ce0 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Tue, 16 Feb 2021 00:57:26 -0500 Subject: [PATCH 02/19] resolve issues with graph similarity - new methods for graph equivalence and similarity - remove sorting and len comparisons - rename some variables --- stix2/equivalence/graph/__init__.py | 111 ++++++++++++++++--------- stix2/equivalence/object/__init__.py | 119 ++++++++++++++++++++------- 2 files changed, 160 insertions(+), 70 deletions(-) diff --git a/stix2/equivalence/graph/__init__.py b/stix2/equivalence/graph/__init__.py index cff99d0..797aa23 100644 --- a/stix2/equivalence/graph/__init__.py +++ b/stix2/equivalence/graph/__init__.py @@ -1,21 +1,62 @@ -"""Python APIs for STIX 2 Graph-based Semantic Equivalence.""" -import collections -import itertools +"""Python APIs for STIX 2 Graph-based Semantic Equivalence and Similarity.""" import logging from ..object import ( WEIGHTS, exact_match, list_reference_check, partial_string_based, - partial_timestamp_based, reference_check, object_similarity, + partial_timestamp_based, reference_check, object_similarity, object_pairs, bucket_per_type ) logger = logging.getLogger(__name__) +def graph_equivalence(ds1, ds2, prop_scores={}, threshold=70, **weight_dict): + """This method returns a true/false value if two graphs are semantically equivalent. + Internally, it calls the graph_similarity function and compares it against the given + threshold value. 
+ + Args: + ds1: A DataStore object instance representing your graph + ds2: A DataStore object instance representing your graph + prop_scores: A dictionary that can hold individual property scores, + weights, contributing score, matching score and sum of weights. + threshold: A numerical value between 0 and 100 to determine the minimum + score to result in successfully calling both graphs equivalent. This + value can be tuned. + weight_dict: A dictionary that can be used to override settings + in the similarity process + + Returns: + bool: True if the result of the graph similarity is greater than or equal to + the threshold value. False otherwise. + + Warning: + Object types need to have property weights defined for the similarity process. + Otherwise, those objects will not influence the final score. The WEIGHTS + dictionary under `stix2.equivalence.graph` can give you an idea on how to add + new entries and pass them via the `weight_dict` argument. Similarly, the values + or methods can be fine tuned for a particular use case. + + Note: + Default weight_dict: + + .. include:: ../../graph_default_sem_eq_weights.rst + + Note: + This implementation follows the Semantic Equivalence Committee Note. + see `the Committee Note `__. + + """ + similarity_result = graph_similarity(ds1, ds2, prop_scores, **weight_dict) + if similarity_result >= threshold: + return True + return False + + def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): - """This method verifies if two graphs are semantically equivalent. + """This method returns a similarity score for two given graphs. Each DataStore can contain a connected or disconnected graph and the final result is weighted over the amount of objects we managed to compare. - This approach builds on top of the object-based semantic equivalence process + This approach builds on top of the object-based similarity process and each comparison can return a value between 0 and 100. Args: @@ -24,20 +65,20 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): prop_scores: A dictionary that can hold individual property scores, weights, contributing score, matching score and sum of weights. weight_dict: A dictionary that can be used to override settings - in the semantic equivalence process + in the similarity process Returns: - float: A number between 0.0 and 100.0 as a measurement of equivalence. + float: A number between 0.0 and 100.0 as a measurement of similarity. Warning: - Object types need to have property weights defined for the equivalence process. + Object types need to have property weights defined for the similarity process. Otherwise, those objects will not influence the final score. The WEIGHTS dictionary under `stix2.equivalence.graph` can give you an idea on how to add new entries and pass them via the `weight_dict` argument. Similarly, the values or methods can be fine tuned for a particular use case. Note: - Default weights_dict: + Default weight_dict: .. 
include:: ../../graph_default_sem_eq_weights.rst @@ -47,12 +88,14 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): """ results = {} - equivalence_score = 0 + similarity_score = 0 weights = GRAPH_WEIGHTS.copy() if weight_dict: weights.update(weight_dict) + if weights["_internal"]["max_depth"] <= 0: + raise ValueError("weight_dict['_internal']['max_depth'] must be greater than 0") depth = weights["_internal"]["max_depth"] graph1 = bucket_per_type(ds1.query([])) @@ -64,60 +107,46 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): iprop_score2 = {} object1_id = object1["id"] object2_id = object2["id"] + weights["_internal"]["max_depth"] = depth weights["_internal"]["ds1"] = ds1 weights["_internal"]["ds2"] = ds2 result1 = object_similarity(object1, object2, iprop_score1, **weights) + weights["_internal"]["max_depth"] = depth weights["_internal"]["ds1"] = ds2 weights["_internal"]["ds2"] = ds1 result2 = object_similarity(object2, object1, iprop_score2, **weights) if object1_id not in results: - results[object1_id] = {"lhs": object1["id"], "rhs": object2["id"], "prop_score": iprop_score1, "value": result1} + results[object1_id] = {"lhs": object1_id, "rhs": object2_id, "prop_score": iprop_score1, "value": result1} elif result1 > results[object1_id]["value"]: - results[object1_id] = {"lhs": object1["id"], "rhs": object2["id"], "prop_score": iprop_score1, "value": result1} + results[object1_id] = {"lhs": object1_id, "rhs": object2_id, "prop_score": iprop_score1, "value": result1} if object2_id not in results: - results[object2_id] = {"lhs": object2["id"], "rhs": object1["id"], "prop_score": iprop_score2, "value": result2} - elif result1 > results[object2_id]["value"]: - results[object2_id] = {"lhs": object2["id"], "rhs": object1["id"], "prop_score": iprop_score2, "value": result2} + results[object2_id] = {"lhs": object2_id, "rhs": object1_id, "prop_score": iprop_score2, "value": result2} + elif result2 > results[object2_id]["value"]: + results[object2_id] = {"lhs": object2_id, "rhs": object1_id, "prop_score": iprop_score2, "value": result2} matching_score = sum(x["value"] for x in results.values()) - sum_weights = len(results) - if sum_weights > 0: - equivalence_score = matching_score / sum_weights + len_pairs = len(results) + if len_pairs > 0: + similarity_score = matching_score / len_pairs prop_scores["matching_score"] = matching_score - prop_scores["sum_weights"] = sum_weights + prop_scores["len_pairs"] = len_pairs prop_scores["summary"] = results logger.debug( - "DONE\t\tSUM_WEIGHT: %.2f\tMATCHING_SCORE: %.2f\t SCORE: %.2f", - sum_weights, + "DONE\t\tSUM_PAIRS: %.2f\tMATCHING_SCORE: %.2f\t SIMILARITY_SCORE: %.2f", + len_pairs, matching_score, - equivalence_score, + similarity_score, ) - return equivalence_score + return similarity_score -def bucket_per_type(g): - buckets = collections.defaultdict(list) - [buckets[obj["type"]].append(obj) for obj in g] - return buckets - - -def object_pairs(g1, g2, w): - types_in_common = set(g1.keys()).intersection(g2.keys()) - testable_types = types_in_common.intersection(w.keys()) - - return itertools.chain.from_iterable( - itertools.product(g1[stix_type], g2[stix_type]) - for stix_type in testable_types - ) - - -# default weights used for the graph semantic equivalence process +# default weights used for the graph similarity process GRAPH_WEIGHTS = WEIGHTS.copy() GRAPH_WEIGHTS.update({ "grouping": { diff --git a/stix2/equivalence/object/__init__.py b/stix2/equivalence/object/__init__.py index 8b1ceaa..13e029c 100644 --- 
a/stix2/equivalence/object/__init__.py +++ b/stix2/equivalence/object/__init__.py @@ -1,4 +1,6 @@ -"""Python APIs for STIX 2 Object-based Semantic Equivalence.""" +"""Python APIs for STIX 2 Object-based Semantic Equivalence and Similarity.""" +import collections +import itertools import logging import time @@ -9,9 +11,52 @@ from ..pattern import equivalent_patterns logger = logging.getLogger(__name__) +def object_equivalence(obj1, obj2, prop_scores={}, threshold=70, **weight_dict): + """This method returns a true/false value if two objects are semantically equivalent. + Internally, it calls the object_similarity function and compares it against the given + threshold value. + + Args: + obj1: A stix2 object instance + obj2: A stix2 object instance + prop_scores: A dictionary that can hold individual property scores, + weights, contributing score, matching score and sum of weights. + threshold: A numerical value between 0 and 100 to determine the minimum + score to result in successfully calling both objects equivalent. This + value can be tuned. + weight_dict: A dictionary that can be used to override settings + in the semantic equivalence process + + Returns: + bool: True if the result of the object similarity is greater than or equal to + the threshold value. False otherwise. + + Warning: + Object types need to have property weights defined for the similarity process. + Otherwise, those objects will not influence the final score. The WEIGHTS + dictionary under `stix2.equivalence.object` can give you an idea on how to add + new entries and pass them via the `weight_dict` argument. Similarly, the values + or methods can be fine tuned for a particular use case. + + Note: + Default weight_dict: + + .. include:: ../../object_default_sem_eq_weights.rst + + Note: + This implementation follows the Semantic Equivalence Committee Note. + see `the Committee Note `__. + + """ + similarity_result = object_similarity(obj1, obj2, prop_scores, **weight_dict) + if similarity_result >= threshold: + return True + return False + + def object_similarity(obj1, obj2, prop_scores={}, **weight_dict): - """This method verifies if two objects of the same type are - semantically equivalent. + """This method returns a measure of similarity depending on how + similar the two objects are. Args: obj1: A stix2 object instance @@ -22,17 +67,17 @@ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict): in the semantic equivalence process Returns: - float: A number between 0.0 and 100.0 as a measurement of equivalence. + float: A number between 0.0 and 100.0 as a measurement of similarity. Warning: - Object types need to have property weights defined for the equivalence process. + Object types need to have property weights defined for the similarity process. Otherwise, those objects will not influence the final score. The WEIGHTS dictionary under `stix2.equivalence.object` can give you an idea on how to add new entries and pass them via the `weight_dict` argument. Similarly, the values or methods can be fine tuned for a particular use case. Note: - Default weights_dict: + Default weight_dict: .. include:: ../../object_default_sem_eq_weights.rst @@ -352,34 +397,31 @@ def list_reference_check(refs1, refs2, ds1, ds2, **weights): The score influences the objects containing these references. 
The result is weighted on the amount of unique objects that could 1) be de-referenced 2) """ results = {} - if len(refs1) >= len(refs2): - l1 = refs1 - l2 = refs2 - b1 = ds1 - b2 = ds2 - else: - l1 = refs2 - l2 = refs1 - b1 = ds2 - b2 = ds1 - l1.sort() - l2.sort() + pairs = object_pairs( + bucket_per_type(refs1, "id-split"), + bucket_per_type(refs2, "id-split"), + weights + ) - for ref1 in l1: - for ref2 in l2: - type1, type2 = ref1.split("--")[0], ref2.split("--")[0] - if type1 == type2: - score = reference_check(ref1, ref2, b1, b2, **weights) * 100.0 + for ref1, ref2 in pairs: + type1, type2 = ref1.split("--")[0], ref2.split("--")[0] + if type1 == type2: + score = reference_check(ref1, ref2, ds1, ds2, **weights) - if ref1 not in results: - results[ref1] = {"matched": ref2, "value": score} - elif score > results[ref1]["value"]: - results[ref1] = {"matched": ref2, "value": score} + if ref1 not in results: + results[ref1] = {"matched": ref2, "value": score} + elif score > results[ref1]["value"]: + results[ref1] = {"matched": ref2, "value": score} + + if ref2 not in results: + results[ref2] = {"matched": ref1, "value": score} + elif score > results[ref2]["value"]: + results[ref2] = {"matched": ref1, "value": score} result = 0.0 total_sum = sum(x["value"] for x in results.values()) - max_score = len(results) * 100.0 + max_score = len(results) if max_score > 0: result = total_sum / max_score @@ -391,7 +433,26 @@ def list_reference_check(refs1, refs2, ds1, ds2, **weights): return result -# default weights used for the semantic equivalence process +def bucket_per_type(g, mode="type"): + buckets = collections.defaultdict(list) + if mode == "type": + [buckets[obj["type"]].append(obj) for obj in g] + elif mode == "id-split": + [buckets[obj.split("--")[0]].append(obj) for obj in g] + return buckets + + +def object_pairs(g1, g2, w): + types_in_common = set(g1.keys()).intersection(g2.keys()) + testable_types = types_in_common.intersection(w.keys()) + + return itertools.chain.from_iterable( + itertools.product(g1[stix_type], g2[stix_type]) + for stix_type in testable_types + ) + + +# default weights used for the similarity process WEIGHTS = { "attack-pattern": { "name": (30, partial_string_based), From 690a515f0063232528db1a232ec2a11437361b0b Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Tue, 16 Feb 2021 00:58:33 -0500 Subject: [PATCH 03/19] add methods to environment.py --- stix2/environment.py | 104 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 92 insertions(+), 12 deletions(-) diff --git a/stix2/environment.py b/stix2/environment.py index bc7fcaf..61751f9 100644 --- a/stix2/environment.py +++ b/stix2/environment.py @@ -2,12 +2,12 @@ import copy from .datastore import CompositeDataSource, DataStoreMixin -from .equivalence.graph import graph_similarity +from .equivalence.graph import graph_equivalence, graph_similarity from .equivalence.object import ( # noqa: F401 WEIGHTS, check_property_present, custom_pattern_based, exact_match, list_reference_check, partial_external_reference_based, partial_list_based, partial_location_distance, partial_string_based, partial_timestamp_based, - reference_check, object_similarity, + reference_check, object_equivalence, object_similarity, ) from .parsing import parse as _parse @@ -198,8 +198,8 @@ class Environment(DataStoreMixin): @staticmethod def object_similarity(obj1, obj2, prop_scores={}, **weight_dict): - """This method verifies if two objects of the same type are - semantically equivalent. 
+ """This method returns a measure of similarity depending on how + similar the two objects are. Args: obj1: A stix2 object instance @@ -210,10 +210,50 @@ class Environment(DataStoreMixin): in the semantic equivalence process Returns: - float: A number between 0.0 and 100.0 as a measurement of equivalence. + float: A number between 0.0 and 100.0 as a measurement of similarity. Warning: - Object types need to have property weights defined for the equivalence process. + Object types need to have property weights defined for the similarity process. + Otherwise, those objects will not influence the final score. The WEIGHTS + dictionary under `stix2.equivalence.object` can give you an idea on how to add + new entries and pass them via the `weight_dict` argument. Similarly, the values + or methods can be fine tuned for a particular use case. + + Note: + Default weight_dict: + + .. include:: ../../object_default_sem_eq_weights.rst + + Note: + This implementation follows the Semantic Equivalence Committee Note. + see `the Committee Note `__. + + """ + return object_similarity(obj1, obj2, prop_scores, **weight_dict) + + @staticmethod + def object_equivalence(obj1, obj2, prop_scores={}, threshold=70, **weight_dict): + """This method returns a true/false value if two objects are semantically equivalent. + Internally, it calls the object_similarity function and compares it against the given + threshold value. + + Args: + obj1: A stix2 object instance + obj2: A stix2 object instance + prop_scores: A dictionary that can hold individual property scores, + weights, contributing score, matching score and sum of weights. + threshold: A numerical value between 0 and 100 to determine the minimum + score to result in successfully calling both objects equivalent. This + value can be tuned. + weight_dict: A dictionary that can be used to override settings + in the semantic equivalence process + + Returns: + bool: True if the result of the object similarity is greater than or equal to + the threshold value. False otherwise. + + Warning: + Object types need to have property weights defined for the similarity process. Otherwise, those objects will not influence the final score. The WEIGHTS dictionary under `stix2.equivalence.object` can give you an idea on how to add new entries and pass them via the `weight_dict` argument. Similarly, the values @@ -229,14 +269,14 @@ class Environment(DataStoreMixin): see `the Committee Note `__. """ - return object_similarity(obj1, obj2, prop_scores, **weight_dict) + return object_equivalence(obj1, obj2, prop_scores, threshold, **weight_dict) @staticmethod def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): - """This method verifies if two graphs are semantically equivalent. + """This method returns a similarity score for two given graphs. Each DataStore can contain a connected or disconnected graph and the final result is weighted over the amount of objects we managed to compare. - This approach builds on top of the object-based semantic equivalence process + This approach builds on top of the object-based similarity process and each comparison can return a value between 0 and 100. Args: @@ -245,13 +285,13 @@ class Environment(DataStoreMixin): prop_scores: A dictionary that can hold individual property scores, weights, contributing score, matching score and sum of weights. 
weight_dict: A dictionary that can be used to override settings - in the semantic equivalence process + in the similarity process Returns: - float: A number between 0.0 and 100.0 as a measurement of equivalence. + float: A number between 0.0 and 100.0 as a measurement of similarity. Warning: - Object types need to have property weights defined for the equivalence process. + Object types need to have property weights defined for the similarity process. Otherwise, those objects will not influence the final score. The WEIGHTS dictionary under `stix2.equivalence.graph` can give you an idea on how to add new entries and pass them via the `weight_dict` argument. Similarly, the values @@ -268,3 +308,43 @@ class Environment(DataStoreMixin): """ return graph_similarity(ds1, ds2, prop_scores, **weight_dict) + + @staticmethod + def graph_equivalence(ds1, ds2, prop_scores={}, threshold=70, **weight_dict): + """This method returns a true/false value if two graphs are semantically equivalent. + Internally, it calls the graph_similarity function and compares it against the given + threshold value. + + Args: + ds1: A DataStore object instance representing your graph + ds2: A DataStore object instance representing your graph + prop_scores: A dictionary that can hold individual property scores, + weights, contributing score, matching score and sum of weights. + threshold: A numerical value between 0 and 100 to determine the minimum + score to result in successfully calling both graphs equivalent. This + value can be tuned. + weight_dict: A dictionary that can be used to override settings + in the similarity process + + Returns: + bool: True if the result of the graph similarity is greater than or equal to + the threshold value. False otherwise. + + Warning: + Object types need to have property weights defined for the similarity process. + Otherwise, those objects will not influence the final score. The WEIGHTS + dictionary under `stix2.equivalence.graph` can give you an idea on how to add + new entries and pass them via the `weight_dict` argument. Similarly, the values + or methods can be fine tuned for a particular use case. + + Note: + Default weight_dict: + + .. include:: ../graph_default_sem_eq_weights.rst + + Note: + This implementation follows the Semantic Equivalence Committee Note. + see `the Committee Note `__. 
+ + """ + return graph_equivalence(ds1, ds2, prop_scores, threshold, **weight_dict) From f966c64b40b941ae0c8df7c61760d23e59db0e49 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Tue, 16 Feb 2021 01:05:46 -0500 Subject: [PATCH 04/19] update test suite for environments --- stix2/test/v20/test_environment.py | 116 ++++++++++++++++++++++----- stix2/test/v21/test_environment.py | 122 +++++++++++++++++++++++------ 2 files changed, 196 insertions(+), 42 deletions(-) diff --git a/stix2/test/v20/test_environment.py b/stix2/test/v20/test_environment.py index e572aee..bcc2b60 100644 --- a/stix2/test/v20/test_environment.py +++ b/stix2/test/v20/test_environment.py @@ -1,3 +1,4 @@ +import json import os import pytest @@ -67,6 +68,11 @@ def ds2(): yield stix2.MemoryStore(stix_objs) +@pytest.fixture +def fs(): + yield stix2.FileSystemSource(FS_PATH) + + def test_object_factory_created_by_ref_str(): factory = stix2.ObjectFactory(created_by_ref=IDENTITY_ID) ind = factory.create(stix2.v20.Indicator, **INDICATOR_KWARGS) @@ -497,7 +503,20 @@ def test_list_semantic_check(ds, ds2): assert round(score) == 1 -def test_graph_equivalence_with_filesystem_source(ds): +def test_graph_similarity_raises_value_error(ds): + weights = { + "_internal": { + "ignore_spec_version": False, + "versioning_checks": False, + "max_depth": -1, + }, + } + with pytest.raises(ValueError): + prop_scores1 = {} + stix2.Environment().graph_similarity(ds, ds2, prop_scores1, **weights) + + +def test_graph_equivalence_with_filesystem_source(ds, fs): weights = { "_internal": { "ignore_spec_version": True, @@ -505,12 +524,31 @@ def test_graph_equivalence_with_filesystem_source(ds): "max_depth": 1, }, } - prop_scores = {} - fs = stix2.FileSystemSource(FS_PATH) - env = stix2.Environment().graphically_equivalent(fs, ds, prop_scores, **weights) - assert round(env) == 28 - assert round(prop_scores["matching_score"]) == 139 - assert round(prop_scores["sum_weights"]) == 500 + prop_scores1 = {} + env1 = stix2.Environment().graph_similarity(fs, ds, prop_scores1, **weights) + + # Switching parameters + weights = { + "_internal": { + "ignore_spec_version": True, + "versioning_checks": False, + "max_depth": 1, + }, + } + prop_scores2 = {} + env2 = stix2.Environment().graph_similarity(ds, fs, prop_scores2, **weights) + + assert round(env1) == 25 + assert round(prop_scores1["matching_score"]) == 451 + assert round(prop_scores1["len_pairs"]) == 18 + + assert round(env2) == 25 + assert round(prop_scores2["matching_score"]) == 451 + assert round(prop_scores2["len_pairs"]) == 18 + + prop_scores1["matching_score"] = round(prop_scores1["matching_score"], 3) + prop_scores2["matching_score"] = round(prop_scores2["matching_score"], 3) + assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) def test_graph_equivalence_with_duplicate_graph(ds): @@ -522,10 +560,10 @@ def test_graph_equivalence_with_duplicate_graph(ds): }, } prop_scores = {} - env = stix2.Environment().graphically_equivalent(ds, ds, prop_scores, **weights) + env = stix2.Environment().graph_similarity(ds, ds, prop_scores, **weights) assert round(env) == 100 assert round(prop_scores["matching_score"]) == 800 - assert round(prop_scores["sum_weights"]) == 800 + assert round(prop_scores["len_pairs"]) == 8 def test_graph_equivalence_with_versioning_check_on(ds2, ds): @@ -536,11 +574,31 @@ def test_graph_equivalence_with_versioning_check_on(ds2, ds): "max_depth": 1, }, } - prop_scores = {} - env = 
stix2.Environment().graphically_equivalent(ds, ds2, prop_scores, **weights) - assert round(env) == 93 - assert round(prop_scores["matching_score"]) == 745 - assert round(prop_scores["sum_weights"]) == 800 + prop_scores1 = {} + env1 = stix2.Environment().graph_similarity(ds, ds2, prop_scores1, **weights) + + # Switching parameters + weights = { + "_internal": { + "ignore_spec_version": False, + "versioning_checks": True, + "max_depth": 1, + }, + } + prop_scores2 = {} + env2 = stix2.Environment().graph_similarity(ds2, ds, prop_scores2, **weights) + + assert round(env1) == 88 + assert round(prop_scores1["matching_score"]) == 789 + assert round(prop_scores1["len_pairs"]) == 9 + + assert round(env2) == 88 + assert round(prop_scores2["matching_score"]) == 789 + assert round(prop_scores2["len_pairs"]) == 9 + + prop_scores1["matching_score"] = round(prop_scores1["matching_score"], 3) + prop_scores2["matching_score"] = round(prop_scores2["matching_score"], 3) + assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) def test_graph_equivalence_with_versioning_check_off(ds2, ds): @@ -551,8 +609,28 @@ def test_graph_equivalence_with_versioning_check_off(ds2, ds): "max_depth": 1, }, } - prop_scores = {} - env = stix2.Environment().graphically_equivalent(ds, ds2, prop_scores, **weights) - assert round(env) == 93 - assert round(prop_scores["matching_score"]) == 745 - assert round(prop_scores["sum_weights"]) == 800 + prop_scores1 = {} + env1 = stix2.Environment().graph_similarity(ds, ds2, prop_scores1, **weights) + + # Switching parameters + weights = { + "_internal": { + "ignore_spec_version": False, + "versioning_checks": False, + "max_depth": 1, + }, + } + prop_scores2 = {} + env2 = stix2.Environment().graph_similarity(ds2, ds, prop_scores2, **weights) + + assert round(env1) == 88 + assert round(prop_scores1["matching_score"]) == 789 + assert round(prop_scores1["len_pairs"]) == 9 + + assert round(env2) == 88 + assert round(prop_scores2["matching_score"]) == 789 + assert round(prop_scores2["len_pairs"]) == 9 + + prop_scores1["matching_score"] = round(prop_scores1["matching_score"], 3) + prop_scores2["matching_score"] = round(prop_scores2["matching_score"], 3) + assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) diff --git a/stix2/test/v21/test_environment.py b/stix2/test/v21/test_environment.py index 5682ad1..774d09a 100644 --- a/stix2/test/v21/test_environment.py +++ b/stix2/test/v21/test_environment.py @@ -1,3 +1,4 @@ +import json import os import pytest @@ -71,6 +72,11 @@ def ds2(): yield stix2.MemoryStore(stix_objs) +@pytest.fixture +def fs(): + yield stix2.FileSystemSource(FS_PATH) + + def test_object_factory_created_by_ref_str(): factory = stix2.ObjectFactory(created_by_ref=IDENTITY_ID) ind = factory.create(stix2.v21.Indicator, **INDICATOR_KWARGS) @@ -955,8 +961,30 @@ def test_list_semantic_check(ds, ds2): ) assert round(score) == 1 + score = stix2.equivalence.object.list_reference_check( + object_refs2, + object_refs1, + ds2, + ds, + **weights, + ) + assert round(score) == 1 -def test_graph_equivalence_with_filesystem_source(ds): + +def test_graph_similarity_raises_value_error(ds): + weights = { + "_internal": { + "ignore_spec_version": False, + "versioning_checks": False, + "max_depth": -1, + }, + } + with pytest.raises(ValueError): + prop_scores1 = {} + stix2.Environment().graph_similarity(ds, ds2, prop_scores1, **weights) + + +def 
test_graph_equivalence_with_filesystem_source(ds, fs): weights = { "_internal": { "ignore_spec_version": True, @@ -965,18 +993,30 @@ def test_graph_equivalence_with_filesystem_source(ds): }, } prop_scores1 = {} - prop_scores2 = {} - fs = stix2.FileSystemSource(FS_PATH) - env = stix2.Environment().graph_similarity(fs, ds, prop_scores1, **weights) - assert round(env) == 26 - assert round(prop_scores1["matching_score"]) == 460 - assert round(prop_scores1["sum_weights"]) == 18 + env1 = stix2.Environment().graph_similarity(fs, ds, prop_scores1, **weights) - env = stix2.Environment().graph_similarity(ds, fs, prop_scores2, **weights) - assert round(env) == 47 - assert round(prop_scores2["matching_score"]) == 852 - assert round(prop_scores2["sum_weights"]) == 18 - assert prop_scores1 == prop_scores2 + # Switching parameters + weights = { + "_internal": { + "ignore_spec_version": True, + "versioning_checks": False, + "max_depth": 1, + }, + } + prop_scores2 = {} + env2 = stix2.Environment().graph_similarity(ds, fs, prop_scores2, **weights) + + assert round(env1) == 23 + assert round(prop_scores1["matching_score"]) == 411 + assert round(prop_scores1["len_pairs"]) == 18 + + assert round(env2) == 23 + assert round(prop_scores2["matching_score"]) == 411 + assert round(prop_scores2["len_pairs"]) == 18 + + prop_scores1["matching_score"] = round(prop_scores1["matching_score"], 3) + prop_scores2["matching_score"] = round(prop_scores2["matching_score"], 3) + assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) def test_graph_equivalence_with_duplicate_graph(ds): @@ -991,7 +1031,7 @@ def test_graph_equivalence_with_duplicate_graph(ds): env = stix2.Environment().graph_similarity(ds, ds, prop_scores, **weights) assert round(env) == 100 assert round(prop_scores["matching_score"]) == 800 - assert round(prop_scores["sum_weights"]) == 8 + assert round(prop_scores["len_pairs"]) == 8 def test_graph_equivalence_with_versioning_check_on(ds2, ds): @@ -1002,11 +1042,29 @@ def test_graph_equivalence_with_versioning_check_on(ds2, ds): "max_depth": 1, }, } - prop_scores = {} - env = stix2.Environment().graph_similarity(ds, ds2, prop_scores, **weights) - assert round(env) == 93 - assert round(prop_scores["matching_score"]) == 745 - assert round(prop_scores["sum_weights"]) == 8 + prop_scores1 = {} + env1 = stix2.Environment().graph_similarity(ds, ds2, prop_scores1, **weights) + assert round(env1) == 88 + assert round(prop_scores1["matching_score"]) == 789 + assert round(prop_scores1["len_pairs"]) == 9 + + # Switching parameters + weights = { + "_internal": { + "ignore_spec_version": False, + "versioning_checks": False, + "max_depth": 1, + }, + } + prop_scores2 = {} + env2 = stix2.Environment().graph_similarity(ds2, ds, prop_scores2, **weights) + assert round(env2) == 88 + assert round(prop_scores2["matching_score"]) == 789 + assert round(prop_scores2["len_pairs"]) == 9 + + prop_scores1["matching_score"] = round(prop_scores1["matching_score"], 3) + prop_scores2["matching_score"] = round(prop_scores2["matching_score"], 3) + assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) def test_graph_equivalence_with_versioning_check_off(ds2, ds): @@ -1017,8 +1075,26 @@ def test_graph_equivalence_with_versioning_check_off(ds2, ds): "max_depth": 1, }, } - prop_scores = {} - env = stix2.Environment().graph_similarity(ds, ds2, prop_scores, **weights) - assert round(env) == 93 - assert round(prop_scores["matching_score"]) == 
745 - assert round(prop_scores["sum_weights"]) == 8 + prop_scores1 = {} + env1 = stix2.Environment().graph_similarity(ds, ds2, prop_scores1, **weights) + assert round(env1) == 88 + assert round(prop_scores1["matching_score"]) == 789 + assert round(prop_scores1["len_pairs"]) == 9 + + # Switching parameters + weights = { + "_internal": { + "ignore_spec_version": False, + "versioning_checks": False, + "max_depth": 1, + }, + } + prop_scores2 = {} + env2 = stix2.Environment().graph_similarity(ds2, ds, prop_scores2, **weights) + assert round(env2) == 88 + assert round(prop_scores2["matching_score"]) == 789 + assert round(prop_scores2["len_pairs"]) == 9 + + prop_scores1["matching_score"] = round(prop_scores1["matching_score"], 3) + prop_scores2["matching_score"] = round(prop_scores2["matching_score"], 3) + assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) From ef610ec8d3c0bad28aa7401b33571210449d7f97 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Tue, 16 Feb 2021 09:36:09 -0500 Subject: [PATCH 05/19] small docstring fix --- stix2/environment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stix2/environment.py b/stix2/environment.py index 61751f9..bd4445f 100644 --- a/stix2/environment.py +++ b/stix2/environment.py @@ -222,7 +222,7 @@ class Environment(DataStoreMixin): Note: Default weight_dict: - .. include:: ../../object_default_sem_eq_weights.rst + .. include:: ../object_default_sem_eq_weights.rst Note: This implementation follows the Semantic Equivalence Committee Note. From fbea229004ced319e5c34357679ca4f1001f54b7 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Tue, 16 Feb 2021 09:44:03 -0500 Subject: [PATCH 06/19] add styling changes --- stix2/environment.py | 5 +++-- stix2/equivalence/graph/__init__.py | 5 +++-- stix2/equivalence/object/__init__.py | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/stix2/environment.py b/stix2/environment.py index bd4445f..246d279 100644 --- a/stix2/environment.py +++ b/stix2/environment.py @@ -5,9 +5,10 @@ from .datastore import CompositeDataSource, DataStoreMixin from .equivalence.graph import graph_equivalence, graph_similarity from .equivalence.object import ( # noqa: F401 WEIGHTS, check_property_present, custom_pattern_based, exact_match, - list_reference_check, partial_external_reference_based, partial_list_based, + list_reference_check, object_equivalence, object_similarity, + partial_external_reference_based, partial_list_based, partial_location_distance, partial_string_based, partial_timestamp_based, - reference_check, object_equivalence, object_similarity, + reference_check, ) from .parsing import parse as _parse diff --git a/stix2/equivalence/graph/__init__.py b/stix2/equivalence/graph/__init__.py index 797aa23..d9d6e0c 100644 --- a/stix2/equivalence/graph/__init__.py +++ b/stix2/equivalence/graph/__init__.py @@ -2,8 +2,9 @@ import logging from ..object import ( - WEIGHTS, exact_match, list_reference_check, partial_string_based, - partial_timestamp_based, reference_check, object_similarity, object_pairs, bucket_per_type + WEIGHTS, bucket_per_type, exact_match, list_reference_check, object_pairs, + object_similarity, partial_string_based, partial_timestamp_based, + reference_check, ) logger = logging.getLogger(__name__) diff --git a/stix2/equivalence/object/__init__.py b/stix2/equivalence/object/__init__.py index 13e029c..29e3c4f 100644 --- a/stix2/equivalence/object/__init__.py +++ 
b/stix2/equivalence/object/__init__.py @@ -401,7 +401,7 @@ def list_reference_check(refs1, refs2, ds1, ds2, **weights): pairs = object_pairs( bucket_per_type(refs1, "id-split"), bucket_per_type(refs2, "id-split"), - weights + weights, ) for ref1, ref2 in pairs: From 09fd8c060bb5a42236efc4dd40641ff98e1bbbdb Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Tue, 16 Feb 2021 10:23:02 -0500 Subject: [PATCH 07/19] clear debug message --- stix2/equivalence/graph/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stix2/equivalence/graph/__init__.py b/stix2/equivalence/graph/__init__.py index d9d6e0c..1a25484 100644 --- a/stix2/equivalence/graph/__init__.py +++ b/stix2/equivalence/graph/__init__.py @@ -139,7 +139,7 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): prop_scores["summary"] = results logger.debug( - "DONE\t\tSUM_PAIRS: %.2f\tMATCHING_SCORE: %.2f\t SIMILARITY_SCORE: %.2f", + "DONE\t\tLEN_PAIRS: %.2f\tMATCHING_SCORE: %.2f\t SIMILARITY_SCORE: %.2f", len_pairs, matching_score, similarity_score, From d2d85badb2c297ea1abdfc1612d4ad847fd8a2a6 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Tue, 16 Feb 2021 11:01:07 -0500 Subject: [PATCH 08/19] make some functions internal, add some docs for them --- stix2/equivalence/graph/__init__.py | 14 ++++++++------ stix2/equivalence/object/__init__.py | 16 +++++++++++----- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/stix2/equivalence/graph/__init__.py b/stix2/equivalence/graph/__init__.py index 1a25484..3d892f4 100644 --- a/stix2/equivalence/graph/__init__.py +++ b/stix2/equivalence/graph/__init__.py @@ -2,9 +2,9 @@ import logging from ..object import ( - WEIGHTS, bucket_per_type, exact_match, list_reference_check, object_pairs, - object_similarity, partial_string_based, partial_timestamp_based, - reference_check, + WEIGHTS, _bucket_per_type, _object_pairs, exact_match, + list_reference_check, object_similarity, partial_string_based, + partial_timestamp_based, reference_check, ) logger = logging.getLogger(__name__) @@ -99,9 +99,11 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): raise ValueError("weight_dict['_internal']['max_depth'] must be greater than 0") depth = weights["_internal"]["max_depth"] - graph1 = bucket_per_type(ds1.query([])) - graph2 = bucket_per_type(ds2.query([])) - pairs = object_pairs(graph1, graph2, weights) + pairs = _object_pairs( + _bucket_per_type(ds1.query([])), + _bucket_per_type(ds2.query([])), + weights, + ) for object1, object2 in pairs: iprop_score1 = {} diff --git a/stix2/equivalence/object/__init__.py b/stix2/equivalence/object/__init__.py index 29e3c4f..39eb99a 100644 --- a/stix2/equivalence/object/__init__.py +++ b/stix2/equivalence/object/__init__.py @@ -398,9 +398,9 @@ def list_reference_check(refs1, refs2, ds1, ds2, **weights): weighted on the amount of unique objects that could 1) be de-referenced 2) """ results = {} - pairs = object_pairs( - bucket_per_type(refs1, "id-split"), - bucket_per_type(refs2, "id-split"), + pairs = _object_pairs( + _bucket_per_type(refs1, "id-split"), + _bucket_per_type(refs2, "id-split"), weights, ) @@ -433,7 +433,10 @@ def list_reference_check(refs1, refs2, ds1, ds2, **weights): return result -def bucket_per_type(g, mode="type"): +def _bucket_per_type(g, mode="type"): + """Given a list of objects or references, bucket them by type. 
+ Depending on the list type: extract from 'type' property or using + the 'id'""" buckets = collections.defaultdict(list) if mode == "type": [buckets[obj["type"]].append(obj) for obj in g] @@ -442,7 +445,10 @@ def bucket_per_type(g, mode="type"): return buckets -def object_pairs(g1, g2, w): +def _object_pairs(g1, g2, w): + """Returns a generator with the product of the comparable + objects for the graph similarity process. It determines + objects in common between graphs and objects with weights.""" types_in_common = set(g1.keys()).intersection(g2.keys()) testable_types = types_in_common.intersection(w.keys()) From c656d35da5b934d3152160000e7a3c65e427f8d5 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Tue, 16 Feb 2021 13:40:42 -0500 Subject: [PATCH 09/19] add more test coverage for new functions --- stix2/test/v20/test_environment.py | 134 +++++++++++++++- stix2/test/v21/test_environment.py | 239 ++++++++++++++++++++++++----- 2 files changed, 331 insertions(+), 42 deletions(-) diff --git a/stix2/test/v20/test_environment.py b/stix2/test/v20/test_environment.py index bcc2b60..33e0985 100644 --- a/stix2/test/v20/test_environment.py +++ b/stix2/test/v20/test_environment.py @@ -516,7 +516,7 @@ def test_graph_similarity_raises_value_error(ds): stix2.Environment().graph_similarity(ds, ds2, prop_scores1, **weights) -def test_graph_equivalence_with_filesystem_source(ds, fs): +def test_graph_similarity_with_filesystem_source(ds, fs): weights = { "_internal": { "ignore_spec_version": True, @@ -551,7 +551,7 @@ def test_graph_equivalence_with_filesystem_source(ds, fs): assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) -def test_graph_equivalence_with_duplicate_graph(ds): +def test_graph_similarity_with_duplicate_graph(ds): weights = { "_internal": { "ignore_spec_version": False, @@ -566,7 +566,7 @@ def test_graph_equivalence_with_duplicate_graph(ds): assert round(prop_scores["len_pairs"]) == 8 -def test_graph_equivalence_with_versioning_check_on(ds2, ds): +def test_graph_similarity_with_versioning_check_on(ds2, ds): weights = { "_internal": { "ignore_spec_version": False, @@ -601,6 +601,126 @@ def test_graph_equivalence_with_versioning_check_on(ds2, ds): assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) +def test_graph_similarity_with_versioning_check_off(ds2, ds): + weights = { + "_internal": { + "ignore_spec_version": False, + "versioning_checks": False, + "max_depth": 1, + }, + } + prop_scores1 = {} + env1 = stix2.Environment().graph_similarity(ds, ds2, prop_scores1, **weights) + + # Switching parameters + weights = { + "_internal": { + "ignore_spec_version": False, + "versioning_checks": False, + "max_depth": 1, + }, + } + prop_scores2 = {} + env2 = stix2.Environment().graph_similarity(ds2, ds, prop_scores2, **weights) + + assert round(env1) == 88 + assert round(prop_scores1["matching_score"]) == 789 + assert round(prop_scores1["len_pairs"]) == 9 + + assert round(env2) == 88 + assert round(prop_scores2["matching_score"]) == 789 + assert round(prop_scores2["len_pairs"]) == 9 + + prop_scores1["matching_score"] = round(prop_scores1["matching_score"], 3) + prop_scores2["matching_score"] = round(prop_scores2["matching_score"], 3) + assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) + + +def test_graph_equivalence_with_filesystem_source(ds, fs): + weights = { + "_internal": { + "ignore_spec_version": 
True, + "versioning_checks": False, + "max_depth": 1, + }, + } + prop_scores1 = {} + env1 = stix2.Environment().graph_equivalence(fs, ds, prop_scores1, **weights) + + # Switching parameters + weights = { + "_internal": { + "ignore_spec_version": True, + "versioning_checks": False, + "max_depth": 1, + }, + } + prop_scores2 = {} + env2 = stix2.Environment().graph_equivalence(ds, fs, prop_scores2, **weights) + + assert env1 is False + assert round(prop_scores1["matching_score"]) == 451 + assert round(prop_scores1["len_pairs"]) == 18 + + assert env2 is False + assert round(prop_scores2["matching_score"]) == 451 + assert round(prop_scores2["len_pairs"]) == 18 + + prop_scores1["matching_score"] = round(prop_scores1["matching_score"], 3) + prop_scores2["matching_score"] = round(prop_scores2["matching_score"], 3) + assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) + + +def test_graph_equivalence_with_duplicate_graph(ds): + weights = { + "_internal": { + "ignore_spec_version": False, + "versioning_checks": False, + "max_depth": 1, + }, + } + prop_scores = {} + env = stix2.Environment().graph_equivalence(ds, ds, prop_scores, **weights) + assert env is True + assert round(prop_scores["matching_score"]) == 800 + assert round(prop_scores["len_pairs"]) == 8 + + +def test_graph_equivalence_with_versioning_check_on(ds2, ds): + weights = { + "_internal": { + "ignore_spec_version": False, + "versioning_checks": True, + "max_depth": 1, + }, + } + prop_scores1 = {} + env1 = stix2.Environment().graph_equivalence(ds, ds2, prop_scores1, **weights) + + # Switching parameters + weights = { + "_internal": { + "ignore_spec_version": False, + "versioning_checks": True, + "max_depth": 1, + }, + } + prop_scores2 = {} + env2 = stix2.Environment().graph_equivalence(ds2, ds, prop_scores2, **weights) + + assert env1 is True + assert round(prop_scores1["matching_score"]) == 789 + assert round(prop_scores1["len_pairs"]) == 9 + + assert env2 is True + assert round(prop_scores2["matching_score"]) == 789 + assert round(prop_scores2["len_pairs"]) == 9 + + prop_scores1["matching_score"] = round(prop_scores1["matching_score"], 3) + prop_scores2["matching_score"] = round(prop_scores2["matching_score"], 3) + assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) + + def test_graph_equivalence_with_versioning_check_off(ds2, ds): weights = { "_internal": { @@ -610,7 +730,7 @@ def test_graph_equivalence_with_versioning_check_off(ds2, ds): }, } prop_scores1 = {} - env1 = stix2.Environment().graph_similarity(ds, ds2, prop_scores1, **weights) + env1 = stix2.Environment().graph_equivalence(ds, ds2, prop_scores1, **weights) # Switching parameters weights = { @@ -621,13 +741,13 @@ def test_graph_equivalence_with_versioning_check_off(ds2, ds): }, } prop_scores2 = {} - env2 = stix2.Environment().graph_similarity(ds2, ds, prop_scores2, **weights) + env2 = stix2.Environment().graph_equivalence(ds2, ds, prop_scores2, **weights) - assert round(env1) == 88 + assert env1 is True assert round(prop_scores1["matching_score"]) == 789 assert round(prop_scores1["len_pairs"]) == 9 - assert round(env2) == 88 + assert env2 is True assert round(prop_scores2["matching_score"]) == 789 assert round(prop_scores2["len_pairs"]) == 9 diff --git a/stix2/test/v21/test_environment.py b/stix2/test/v21/test_environment.py index 774d09a..80c4ba8 100644 --- a/stix2/test/v21/test_environment.py +++ b/stix2/test/v21/test_environment.py @@ -38,7 +38,7 @@ def 
ds(): @pytest.fixture -def ds2(): +def ds2_objects(): cam = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMPAIGN_KWARGS) idy = stix2.v21.Identity(id=IDENTITY_ID, **IDENTITY_KWARGS) ind = stix2.v21.Indicator(id=INDICATOR_ID, created_by_ref=idy.id, **INDICATOR_KWARGS) @@ -69,7 +69,12 @@ def ds2(): published="2021-04-09T08:22:22Z", object_refs=stix_objs, ) stix_objs.append(reprt) - yield stix2.MemoryStore(stix_objs) + yield stix_objs + + +@pytest.fixture +def ds2(ds2_objects): + yield stix2.MemoryStore(ds2_objects) @pytest.fixture @@ -432,14 +437,14 @@ def test_related_to_by_target(ds): assert any(x['id'] == INDICATOR_ID for x in resp) -def test_semantic_equivalence_on_same_attack_pattern1(): +def test_object_similarity_on_same_attack_pattern1(): ap1 = stix2.v21.AttackPattern(id=ATTACK_PATTERN_ID, **ATTACK_PATTERN_KWARGS) ap2 = stix2.v21.AttackPattern(id=ATTACK_PATTERN_ID, **ATTACK_PATTERN_KWARGS) env = stix2.Environment().object_similarity(ap1, ap2) assert round(env) == 100 -def test_semantic_equivalence_on_same_attack_pattern2(): +def test_object_similarity_on_same_attack_pattern2(): ATTACK_KWARGS = dict( name="Phishing", external_references=[ @@ -455,14 +460,14 @@ def test_semantic_equivalence_on_same_attack_pattern2(): assert round(env) == 100 -def test_semantic_equivalence_on_same_campaign1(): +def test_object_similarity_on_same_campaign1(): camp1 = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMPAIGN_KWARGS) camp2 = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMPAIGN_KWARGS) env = stix2.Environment().object_similarity(camp1, camp2) assert round(env) == 100 -def test_semantic_equivalence_on_same_campaign2(): +def test_object_similarity_on_same_campaign2(): CAMP_KWARGS = dict( name="Green Group Attacks Against Finance", description="Campaign by Green Group against a series of targets in the financial services sector.", @@ -474,14 +479,14 @@ def test_semantic_equivalence_on_same_campaign2(): assert round(env) == 100 -def test_semantic_equivalence_on_same_identity1(): +def test_object_similarity_on_same_identity1(): iden1 = stix2.v21.Identity(id=IDENTITY_ID, **IDENTITY_KWARGS) iden2 = stix2.v21.Identity(id=IDENTITY_ID, **IDENTITY_KWARGS) env = stix2.Environment().object_similarity(iden1, iden2) assert round(env) == 100 -def test_semantic_equivalence_on_same_identity2(): +def test_object_similarity_on_same_identity2(): IDEN_KWARGS = dict( name="John Smith", identity_class="individual", @@ -493,14 +498,14 @@ def test_semantic_equivalence_on_same_identity2(): assert round(env) == 100 -def test_semantic_equivalence_on_same_indicator(): +def test_object_similarity_on_same_indicator(): ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) ind2 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) env = stix2.Environment().object_similarity(ind1, ind2) assert round(env) == 100 -def test_semantic_equivalence_on_same_location1(): +def test_object_similarity_on_same_location1(): location_kwargs = dict(latitude=45, longitude=179) loc1 = stix2.v21.Location(id=LOCATION_ID, **location_kwargs) loc2 = stix2.v21.Location(id=LOCATION_ID, **location_kwargs) @@ -508,7 +513,7 @@ def test_semantic_equivalence_on_same_location1(): assert round(env) == 100 -def test_semantic_equivalence_on_same_location2(): +def test_object_similarity_on_same_location2(): location_kwargs = dict( latitude=38.889, longitude=-77.023, @@ -521,7 +526,7 @@ def test_semantic_equivalence_on_same_location2(): assert round(env) == 100 -def test_semantic_equivalence_location_with_no_latlong(): +def 
test_object_similarity_location_with_no_latlong(): loc_kwargs = dict(country="US", administrative_area="US-DC") loc1 = stix2.v21.Location(id=LOCATION_ID, **LOCATION_KWARGS) loc2 = stix2.v21.Location(id=LOCATION_ID, **loc_kwargs) @@ -529,21 +534,21 @@ def test_semantic_equivalence_location_with_no_latlong(): assert round(env) != 100 -def test_semantic_equivalence_on_same_malware(): +def test_object_similarity_on_same_malware(): malw1 = stix2.v21.Malware(id=MALWARE_ID, **MALWARE_KWARGS) malw2 = stix2.v21.Malware(id=MALWARE_ID, **MALWARE_KWARGS) env = stix2.Environment().object_similarity(malw1, malw2) assert round(env) == 100 -def test_semantic_equivalence_on_same_threat_actor1(): +def test_object_similarity_on_same_threat_actor1(): ta1 = stix2.v21.ThreatActor(id=THREAT_ACTOR_ID, **THREAT_ACTOR_KWARGS) ta2 = stix2.v21.ThreatActor(id=THREAT_ACTOR_ID, **THREAT_ACTOR_KWARGS) env = stix2.Environment().object_similarity(ta1, ta2) assert round(env) == 100 -def test_semantic_equivalence_on_same_threat_actor2(): +def test_object_similarity_on_same_threat_actor2(): THREAT_KWARGS = dict( threat_actor_types=["crime-syndicate"], aliases=["super-evil"], @@ -555,21 +560,34 @@ def test_semantic_equivalence_on_same_threat_actor2(): assert round(env) == 100 -def test_semantic_equivalence_on_same_tool(): +def test_object_similarity_on_same_tool(): tool1 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS) tool2 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS) env = stix2.Environment().object_similarity(tool1, tool2) assert round(env) == 100 -def test_semantic_equivalence_on_same_vulnerability1(): +def test_object_similarity_on_same_vulnerability1(): vul1 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULNERABILITY_KWARGS) vul2 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULNERABILITY_KWARGS) - env = stix2.Environment().object_similarity(vul1, vul2) + prop_scores = {} + env = stix2.Environment().object_similarity(vul1, vul2, prop_scores) assert round(env) == 100 + assert round(prop_scores["matching_score"]) == 30 + assert round(prop_scores["sum_weights"]) == 30 -def test_semantic_equivalence_on_same_vulnerability2(): +def test_object_equivalence_on_same_vulnerability1(): + vul1 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULNERABILITY_KWARGS) + vul2 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULNERABILITY_KWARGS) + prop_scores = {} + env = stix2.Environment().object_equivalence(vul1, vul2, prop_scores) + assert env is True + assert round(prop_scores["matching_score"]) == 30 + assert round(prop_scores["sum_weights"]) == 30 + + +def test_object_similarity_on_same_vulnerability2(): VULN_KWARGS1 = dict( name="Heartbleed", external_references=[ @@ -590,11 +608,42 @@ def test_semantic_equivalence_on_same_vulnerability2(): ) vul1 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULN_KWARGS1) vul2 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULN_KWARGS2) - env = stix2.Environment().object_similarity(vul1, vul2) + prop_scores = {} + env = stix2.Environment().object_similarity(vul1, vul2, prop_scores) assert round(env) == 0.0 + assert round(prop_scores["matching_score"]) == 0 + assert round(prop_scores["sum_weights"]) == 100 -def test_semantic_equivalence_on_unknown_object(): +def test_object_equivalence_on_same_vulnerability2(): + VULN_KWARGS1 = dict( + name="Heartbleed", + external_references=[ + { + "url": "https://example", + "source_name": "some-source", + }, + ], + ) + VULN_KWARGS2 = dict( + name="Foo", + external_references=[ + { + "url": "https://example2", + "source_name": "some-source2", + }, + ], + ) 
+ vul1 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULN_KWARGS1) + vul2 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULN_KWARGS2) + prop_scores = {} + env = stix2.Environment().object_equivalence(vul1, vul2, prop_scores) + assert env is False + assert round(prop_scores["matching_score"]) == 0 + assert round(prop_scores["sum_weights"]) == 100 + + +def test_object_similarity_on_unknown_object(): CUSTOM_KWARGS1 = dict( type="x-foobar", id="x-foobar--0c7b5b88-8ff7-4a4d-aa9d-feb398cd0061", @@ -650,7 +699,7 @@ def test_semantic_equivalence_on_unknown_object(): assert round(env) == 0 -def test_semantic_equivalence_different_type_raises(): +def test_object_similarity_different_type_raises(): with pytest.raises(ValueError) as excinfo: vul1 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULNERABILITY_KWARGS) ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) @@ -659,7 +708,7 @@ def test_semantic_equivalence_different_type_raises(): assert str(excinfo.value) == "The objects to compare must be of the same type!" -def test_semantic_equivalence_different_spec_version_raises(): +def test_object_similarity_different_spec_version_raises(): with pytest.raises(ValueError) as excinfo: V20_KWARGS = dict( labels=['malicious-activity'], @@ -672,7 +721,7 @@ def test_semantic_equivalence_different_spec_version_raises(): assert str(excinfo.value) == "The objects to compare must be of the same spec version!" -def test_semantic_equivalence_zero_match(): +def test_object_similarity_zero_match(): IND_KWARGS = dict( indicator_types=["APTX"], pattern="[ipv4-addr:value = '192.168.1.1']", @@ -696,7 +745,7 @@ def test_semantic_equivalence_zero_match(): assert round(env) == 0 -def test_semantic_equivalence_different_spec_version(): +def test_object_similarity_different_spec_version(): IND_KWARGS = dict( labels=["APTX"], pattern="[ipv4-addr:value = '192.168.1.1']", @@ -786,18 +835,18 @@ def test_semantic_equivalence_different_spec_version(): ), ], ) -def test_semantic_equivalence_external_references(refs1, refs2, ret_val): +def test_object_similarity_external_references(refs1, refs2, ret_val): value = stix2.environment.partial_external_reference_based(refs1, refs2) assert value == ret_val -def test_semantic_equivalence_timestamp(): +def test_object_similarity_timestamp(): t1 = "2018-10-17T00:14:20.652Z" t2 = "2018-10-17T12:14:20.652Z" assert stix2.environment.partial_timestamp_based(t1, t2, 1) == 0.5 -def test_semantic_equivalence_exact_match(): +def test_object_similarity_exact_match(): t1 = "2018-10-17T00:14:20.652Z" t2 = "2018-10-17T12:14:20.652Z" assert stix2.environment.exact_match(t1, t2) == 0.0 @@ -813,7 +862,7 @@ def custom_semantic_equivalence_method(obj1, obj2, **weights): return 96.0, 100.0 -def test_semantic_equivalence_method_provided(): +def test_object_similarity_method_provided(): # Because `method` is provided, `partial_list_based` will be ignored TOOL2_KWARGS = dict( name="Random Software", @@ -834,7 +883,7 @@ def test_semantic_equivalence_method_provided(): assert round(env) == 96 -def test_semantic_equivalence_prop_scores(): +def test_object_similarity_prop_scores(): TOOL2_KWARGS = dict( name="Random Software", tool_types=["information-gathering"], @@ -856,7 +905,7 @@ def custom_semantic_equivalence_method_prop_scores(obj1, obj2, prop_scores, **we return 96.0, 100.0 -def test_semantic_equivalence_prop_scores_method_provided(): +def test_object_similarity_prop_scores_method_provided(): TOOL2_KWARGS = dict( name="Random Software", tool_types=["information-gathering"], @@ -984,7 
+1033,7 @@ def test_graph_similarity_raises_value_error(ds): stix2.Environment().graph_similarity(ds, ds2, prop_scores1, **weights) -def test_graph_equivalence_with_filesystem_source(ds, fs): +def test_graph_similarity_with_filesystem_source(ds, fs): weights = { "_internal": { "ignore_spec_version": True, @@ -1019,7 +1068,7 @@ def test_graph_equivalence_with_filesystem_source(ds, fs): assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) -def test_graph_equivalence_with_duplicate_graph(ds): +def test_graph_similarity_with_duplicate_graph(ds): weights = { "_internal": { "ignore_spec_version": False, @@ -1034,7 +1083,7 @@ def test_graph_equivalence_with_duplicate_graph(ds): assert round(prop_scores["len_pairs"]) == 8 -def test_graph_equivalence_with_versioning_check_on(ds2, ds): +def test_graph_similarity_with_versioning_check_on(ds2, ds): weights = { "_internal": { "ignore_spec_version": False, @@ -1067,7 +1116,7 @@ def test_graph_equivalence_with_versioning_check_on(ds2, ds): assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) -def test_graph_equivalence_with_versioning_check_off(ds2, ds): +def test_graph_similarity_with_versioning_check_off(ds2, ds): weights = { "_internal": { "ignore_spec_version": False, @@ -1098,3 +1147,123 @@ def test_graph_equivalence_with_versioning_check_off(ds2, ds): prop_scores1["matching_score"] = round(prop_scores1["matching_score"], 3) prop_scores2["matching_score"] = round(prop_scores2["matching_score"], 3) assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) + + +def test_graph_equivalence_with_filesystem_source(ds, fs): + weights = { + "_internal": { + "ignore_spec_version": True, + "versioning_checks": False, + "max_depth": 1, + }, + } + prop_scores1 = {} + env1 = stix2.Environment().graph_equivalence(fs, ds, prop_scores1, **weights) + + # Switching parameters + weights = { + "_internal": { + "ignore_spec_version": True, + "versioning_checks": False, + "max_depth": 1, + }, + } + prop_scores2 = {} + env2 = stix2.Environment().graph_equivalence(ds, fs, prop_scores2, **weights) + + assert env1 is False + assert round(prop_scores1["matching_score"]) == 411 + assert round(prop_scores1["len_pairs"]) == 18 + + assert env2 is False + assert round(prop_scores2["matching_score"]) == 411 + assert round(prop_scores2["len_pairs"]) == 18 + + prop_scores1["matching_score"] = round(prop_scores1["matching_score"], 3) + prop_scores2["matching_score"] = round(prop_scores2["matching_score"], 3) + assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) + + +def test_graph_equivalence_with_duplicate_graph(ds): + weights = { + "_internal": { + "ignore_spec_version": False, + "versioning_checks": False, + "max_depth": 1, + }, + } + prop_scores = {} + env = stix2.Environment().graph_equivalence(ds, ds, prop_scores, **weights) + assert env is True + assert round(prop_scores["matching_score"]) == 800 + assert round(prop_scores["len_pairs"]) == 8 + + +def test_graph_equivalence_with_versioning_check_on(ds2, ds): + weights = { + "_internal": { + "ignore_spec_version": False, + "versioning_checks": True, + "max_depth": 1, + }, + } + prop_scores1 = {} + env1 = stix2.Environment().graph_equivalence(ds, ds2, prop_scores1, **weights) + + # Switching parameters + weights = { + "_internal": { + "ignore_spec_version": False, + "versioning_checks": True, + 
"max_depth": 1, + }, + } + prop_scores2 = {} + env2 = stix2.Environment().graph_equivalence(ds2, ds, prop_scores2, **weights) + + assert env1 is True + assert round(prop_scores1["matching_score"]) == 789 + assert round(prop_scores1["len_pairs"]) == 9 + + assert env2 is True + assert round(prop_scores2["matching_score"]) == 789 + assert round(prop_scores2["len_pairs"]) == 9 + + prop_scores1["matching_score"] = round(prop_scores1["matching_score"], 3) + prop_scores2["matching_score"] = round(prop_scores2["matching_score"], 3) + assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) + + +def test_graph_equivalence_with_versioning_check_off(ds2, ds): + weights = { + "_internal": { + "ignore_spec_version": False, + "versioning_checks": False, + "max_depth": 1, + }, + } + prop_scores1 = {} + env1 = stix2.Environment().graph_equivalence(ds, ds2, prop_scores1, **weights) + + # Switching parameters + weights = { + "_internal": { + "ignore_spec_version": False, + "versioning_checks": False, + "max_depth": 1, + }, + } + prop_scores2 = {} + env2 = stix2.Environment().graph_equivalence(ds2, ds, prop_scores2, **weights) + + assert env1 is True + assert round(prop_scores1["matching_score"]) == 789 + assert round(prop_scores1["len_pairs"]) == 9 + + assert env2 is True + assert round(prop_scores2["matching_score"]) == 789 + assert round(prop_scores2["len_pairs"]) == 9 + + prop_scores1["matching_score"] = round(prop_scores1["matching_score"], 3) + prop_scores2["matching_score"] = round(prop_scores2["matching_score"], 3) + assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) From ee63e9faf4b017d2a43560db70ec1f287a504e0c Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Wed, 17 Feb 2021 21:30:14 -0500 Subject: [PATCH 10/19] resolve issue regarding reference_check or list_reference_check, remove redundant object_similarity call update test suite --- stix2/equivalence/graph/__init__.py | 27 +++--- stix2/equivalence/object/__init__.py | 13 +-- stix2/test/v21/test_environment.py | 120 ++++++++++++++++++++++++++- 3 files changed, 136 insertions(+), 24 deletions(-) diff --git a/stix2/equivalence/graph/__init__.py b/stix2/equivalence/graph/__init__.py index 3d892f4..402bcb2 100644 --- a/stix2/equivalence/graph/__init__.py +++ b/stix2/equivalence/graph/__init__.py @@ -105,31 +105,26 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): weights, ) + weights["_internal"]["ds1"] = ds1 + weights["_internal"]["ds2"] = ds2 + for object1, object2 in pairs: - iprop_score1 = {} - iprop_score2 = {} + iprop_score = {} object1_id = object1["id"] object2_id = object2["id"] + result = object_similarity(object1, object2, iprop_score, **weights) weights["_internal"]["max_depth"] = depth - weights["_internal"]["ds1"] = ds1 - weights["_internal"]["ds2"] = ds2 - result1 = object_similarity(object1, object2, iprop_score1, **weights) - - weights["_internal"]["max_depth"] = depth - weights["_internal"]["ds1"] = ds2 - weights["_internal"]["ds2"] = ds1 - result2 = object_similarity(object2, object1, iprop_score2, **weights) if object1_id not in results: - results[object1_id] = {"lhs": object1_id, "rhs": object2_id, "prop_score": iprop_score1, "value": result1} - elif result1 > results[object1_id]["value"]: - results[object1_id] = {"lhs": object1_id, "rhs": object2_id, "prop_score": iprop_score1, "value": result1} + results[object1_id] = {"lhs": object1_id, "rhs": object2_id, "prop_score": iprop_score, 
"value": result} + elif result > results[object1_id]["value"]: + results[object1_id] = {"lhs": object1_id, "rhs": object2_id, "prop_score": iprop_score, "value": result} if object2_id not in results: - results[object2_id] = {"lhs": object2_id, "rhs": object1_id, "prop_score": iprop_score2, "value": result2} - elif result2 > results[object2_id]["value"]: - results[object2_id] = {"lhs": object2_id, "rhs": object1_id, "prop_score": iprop_score2, "value": result2} + results[object2_id] = {"lhs": object2_id, "rhs": object1_id, "prop_score": iprop_score, "value": result} + elif result > results[object2_id]["value"]: + results[object2_id] = {"lhs": object2_id, "rhs": object1_id, "prop_score": iprop_score, "value": result} matching_score = sum(x["value"] for x in results.values()) len_pairs = len(results) diff --git a/stix2/equivalence/object/__init__.py b/stix2/equivalence/object/__init__.py index 39eb99a..7f348b6 100644 --- a/stix2/equivalence/object/__init__.py +++ b/stix2/equivalence/object/__init__.py @@ -125,12 +125,13 @@ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict): contributing_score = w * comp_funct(obj1["latitude"], obj1["longitude"], obj2["latitude"], obj2["longitude"], threshold) elif comp_funct == reference_check or comp_funct == list_reference_check: max_depth = weights["_internal"]["max_depth"] - if max_depth < 0: - continue # prevent excessive recursion + if max_depth > 0: + weights["_internal"]["max_depth"] = max_depth - 1 + ds1, ds2 = weights["_internal"]["ds1"], weights["_internal"]["ds2"] + contributing_score = w * comp_funct(obj1[prop], obj2[prop], ds1, ds2, **weights) + weights["_internal"]["max_depth"] = max_depth + 1 else: - weights["_internal"]["max_depth"] -= 1 - ds1, ds2 = weights["_internal"]["ds1"], weights["_internal"]["ds2"] - contributing_score = w * comp_funct(obj1[prop], obj2[prop], ds1, ds2, **weights) + continue # prevent excessive recursion else: contributing_score = w * comp_funct(obj1[prop], obj2[prop]) @@ -376,7 +377,7 @@ def reference_check(ref1, ref2, ds1, ds2, **weights): type1, type2 = ref1.split("--")[0], ref2.split("--")[0] result = 0.0 - if type1 == type2: + if type1 == type2 and type1 in weights: if weights["_internal"]["versioning_checks"]: result = _versioned_checks(ref1, ref2, ds1, ds2, **weights) / 100.0 else: diff --git a/stix2/test/v21/test_environment.py b/stix2/test/v21/test_environment.py index 80c4ba8..fb651af 100644 --- a/stix2/test/v21/test_environment.py +++ b/stix2/test/v21/test_environment.py @@ -723,10 +723,11 @@ def test_object_similarity_different_spec_version_raises(): def test_object_similarity_zero_match(): IND_KWARGS = dict( - indicator_types=["APTX"], + indicator_types=["malicious-activity", "bar"], pattern="[ipv4-addr:value = '192.168.1.1']", pattern_type="stix", valid_from="2019-01-01T12:34:56Z", + labels=["APTX", "foo"], ) weights = { "indicator": { @@ -742,7 +743,9 @@ def test_object_similarity_zero_match(): ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) ind2 = stix2.v21.Indicator(id=INDICATOR_ID, **IND_KWARGS) env = stix2.Environment().object_similarity(ind1, ind2, **weights) - assert round(env) == 0 + assert round(env) == 8 + env = stix2.Environment().object_similarity(ind2, ind1, **weights) + assert round(env) == 8 def test_object_similarity_different_spec_version(): @@ -766,6 +769,9 @@ def test_object_similarity_different_spec_version(): env = stix2.Environment().object_similarity(ind1, ind2, **weights) assert round(env) == 0 + env = stix2.Environment().object_similarity(ind2, ind1, 
**weights) + assert round(env) == 0 + @pytest.mark.parametrize( "refs1,refs2,ret_val", [ @@ -1068,6 +1074,116 @@ def test_graph_similarity_with_filesystem_source(ds, fs): assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) +def test_depth_limiting(): + g1 = [ + { + "type": "foo", + "id": "foo--07f9dd2a-1cce-45bb-8cbe-dba3f007aafd", + "spec_version": "2.1", + "created": "1986-02-08T00:20:17Z", + "modified": "1989-12-11T06:54:29Z", + "some1_ref": "foo--700a8a3c-9936-412f-b4eb-ede466476180", + "some2_ref": "foo--f4a999a3-df94-499d-9cac-6c02e21775ee", + }, + { + "type": "foo", + "id": "foo--700a8a3c-9936-412f-b4eb-ede466476180", + "spec_version": "2.1", + "created": "1989-01-06T10:31:54Z", + "modified": "1995-06-18T10:25:01Z", + "some1_ref": "foo--705afd45-eb56-43fc-a214-313d63d199a3", + }, + { + "type": "foo", + "id": "foo--705afd45-eb56-43fc-a214-313d63d199a3", + "spec_version": "2.1", + "created": "1977-11-06T21:19:29Z", + "modified": "1997-12-02T20:33:34Z", + }, + { + "type": "foo", + "id": "foo--f4a999a3-df94-499d-9cac-6c02e21775ee", + "spec_version": "2.1", + "created": "1991-09-17T00:40:52Z", + "modified": "1992-12-06T11:02:47Z", + "name": "alice", + }, + ] + + g2 = [ + { + "type": "foo", + "id": "foo--71570479-3e6e-48d2-81fb-897454dec55d", + "spec_version": "2.1", + "created": "1975-12-22T05:20:38Z", + "modified": "1980-11-11T01:09:03Z", + "some1_ref": "foo--4aeda39b-31fa-4ffb-a847-d8edc175a579", + "some2_ref": "foo--941e48d6-3100-4419-9e8c-cf1eb59e71b2", + }, + { + "type": "foo", + "id": "foo--4aeda39b-31fa-4ffb-a847-d8edc175a579", + "spec_version": "2.1", + "created": "1976-01-05T08:32:03Z", + "modified": "1980-11-09T05:41:02Z", + "some1_ref": "foo--689252c3-5d20-43ff-bbf7-c8e45d713768", + }, + { + "type": "foo", + "id": "foo--689252c3-5d20-43ff-bbf7-c8e45d713768", + "spec_version": "2.1", + "created": "1974-09-11T18:56:30Z", + "modified": "1976-10-31T11:59:43Z", + }, + { + "type": "foo", + "id": "foo--941e48d6-3100-4419-9e8c-cf1eb59e71b2", + "spec_version": "2.1", + "created": "1985-01-03T01:07:03Z", + "modified": "1992-07-20T21:32:31Z", + "name": "alice", + } + ] + + mem_store1 = stix2.MemorySource(g1) + mem_store2 = stix2.MemorySource(g2) + + custom_weights = { + "foo": { + "some1_ref": (33, stix2.equivalence.object.reference_check), + "some2_ref": (33, stix2.equivalence.object.reference_check), + "name": (34, stix2.equivalence.object.partial_string_based), + }, + "_internal": { + "ignore_spec_version": False, + "versioning_checks": False, + "max_depth": 1, + }, + } + prop_scores1 = {} + env1 = stix2.equivalence.graph.graph_similarity(mem_store1, mem_store2, prop_scores1, **custom_weights) + + assert round(env1) == 38 + assert round(prop_scores1["matching_score"]) == 300 + assert round(prop_scores1["len_pairs"]) == 8 + # from 'alice' check in de-reference + assert prop_scores1['summary']['foo--71570479-3e6e-48d2-81fb-897454dec55d']['prop_score']['some2_ref']['weight'] == 33 + assert prop_scores1['summary']['foo--07f9dd2a-1cce-45bb-8cbe-dba3f007aafd']['prop_score']['some2_ref']['weight'] == 33 + + # Switching parameters + prop_scores2 = {} + env2 = stix2.equivalence.graph.graph_similarity( + mem_store2, mem_store1, prop_scores2, **custom_weights + ) + + assert round(env2) == 38 + assert round(prop_scores2["matching_score"]) == 300 + assert round(prop_scores2["len_pairs"]) == 8 + # from 'alice' check in de-reference + assert 
prop_scores2['summary']['foo--71570479-3e6e-48d2-81fb-897454dec55d']['prop_score']['some2_ref']['weight'] == 33 + assert prop_scores2['summary']['foo--07f9dd2a-1cce-45bb-8cbe-dba3f007aafd']['prop_score']['some2_ref']['weight'] == 33 + + def test_graph_similarity_with_duplicate_graph(ds): weights = { "_internal": { From 702c80bd5361f3ca70b56a58972cb8a2307e9303 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Wed, 17 Feb 2021 21:47:56 -0500 Subject: [PATCH 11/19] add styling changes --- stix2/test/v21/test_environment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stix2/test/v21/test_environment.py b/stix2/test/v21/test_environment.py index fb651af..c3c2701 100644 --- a/stix2/test/v21/test_environment.py +++ b/stix2/test/v21/test_environment.py @@ -1142,7 +1142,7 @@ def test_depth_limiting(): "created": "1985-01-03T01:07:03Z", "modified": "1992-07-20T21:32:31Z", "name": "alice", - } + }, ] mem_store1 = stix2.MemorySource(g1) From 52c5f3ad296054db47937b92bb772baf6679bd2d Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Thu, 18 Feb 2021 09:14:10 -0500 Subject: [PATCH 12/19] Update stix2/equivalence/object/__init__.py Co-authored-by: Chris Lenk --- stix2/equivalence/object/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stix2/equivalence/object/__init__.py b/stix2/equivalence/object/__init__.py index 7f348b6..0f2ae54 100644 --- a/stix2/equivalence/object/__init__.py +++ b/stix2/equivalence/object/__init__.py @@ -64,7 +64,7 @@ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict): prop_scores: A dictionary that can hold individual property scores, weights, contributing score, matching score and sum of weights. weight_dict: A dictionary that can be used to override settings - in the semantic equivalence process + in the similarity process Returns: float: A number between 0.0 and 100.0 as a measurement of similarity. From 154fc4e236d65ff39618ee39e30875b63eecd37e Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Thu, 18 Feb 2021 09:14:34 -0500 Subject: [PATCH 13/19] Update stix2/equivalence/object/__init__.py update variable names Co-authored-by: Chris Lenk --- stix2/equivalence/object/__init__.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/stix2/equivalence/object/__init__.py b/stix2/equivalence/object/__init__.py index 0f2ae54..20b60a2 100644 --- a/stix2/equivalence/object/__init__.py +++ b/stix2/equivalence/object/__init__.py @@ -434,27 +434,29 @@ def list_reference_check(refs1, refs2, ds1, ds2, **weights): return result -def _bucket_per_type(g, mode="type"): +def _bucket_per_type(graph, mode="type"): """Given a list of objects or references, bucket them by type. Depending on the list type: extract from 'type' property or using - the 'id'""" + the 'id'. + """ buckets = collections.defaultdict(list) if mode == "type": - [buckets[obj["type"]].append(obj) for obj in g] + [buckets[obj["type"]].append(obj) for obj in graph] elif mode == "id-split": - [buckets[obj.split("--")[0]].append(obj) for obj in g] + [buckets[obj.split("--")[0]].append(obj) for obj in graph] return buckets -def _object_pairs(g1, g2, w): +def _object_pairs(graph1, graph2, weights): """Returns a generator with the product of the comparable objects for the graph similarity process. 
It determines - objects in common between graphs and objects with weights.""" - types_in_common = set(g1.keys()).intersection(g2.keys()) - testable_types = types_in_common.intersection(w.keys()) + objects in common between graphs and objects with weights. + """ + types_in_common = set(graph1.keys()).intersection(graph2.keys()) + testable_types = types_in_common.intersection(weights.keys()) return itertools.chain.from_iterable( - itertools.product(g1[stix_type], g2[stix_type]) + itertools.product(graph1[stix_type], graph2[stix_type]) for stix_type in testable_types ) From 34feac6ae78d0dccc69729198decb4e8fbe1a2f2 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Thu, 18 Feb 2021 09:14:47 -0500 Subject: [PATCH 14/19] Update stix2/environment.py Co-authored-by: Chris Lenk --- stix2/environment.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/stix2/environment.py b/stix2/environment.py index 246d279..99b4a5e 100644 --- a/stix2/environment.py +++ b/stix2/environment.py @@ -199,8 +199,7 @@ class Environment(DataStoreMixin): @staticmethod def object_similarity(obj1, obj2, prop_scores={}, **weight_dict): - """This method returns a measure of similarity depending on how - similar the two objects are. + """This method returns a measure of how similar the two objects are. Args: obj1: A stix2 object instance From 75574c94273c45beed691a3c912240fc35c69141 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Thu, 18 Feb 2021 09:15:09 -0500 Subject: [PATCH 15/19] Update stix2/equivalence/object/__init__.py docstrings Co-authored-by: Chris Lenk --- stix2/equivalence/object/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stix2/equivalence/object/__init__.py b/stix2/equivalence/object/__init__.py index 20b60a2..790fc87 100644 --- a/stix2/equivalence/object/__init__.py +++ b/stix2/equivalence/object/__init__.py @@ -25,7 +25,7 @@ def object_equivalence(obj1, obj2, prop_scores={}, threshold=70, **weight_dict): score to result in successfully calling both objects equivalent. This value can be tuned. weight_dict: A dictionary that can be used to override settings - in the semantic equivalence process + in the similarity process Returns: bool: True if the result of the object similarity is greater than or equal to From 75b411df85ed539655307c328f515ef4204d477b Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Thu, 18 Feb 2021 09:15:26 -0500 Subject: [PATCH 16/19] Update stix2/environment.py docstrings Co-authored-by: Chris Lenk --- stix2/environment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stix2/environment.py b/stix2/environment.py index 99b4a5e..672fb11 100644 --- a/stix2/environment.py +++ b/stix2/environment.py @@ -207,7 +207,7 @@ class Environment(DataStoreMixin): prop_scores: A dictionary that can hold individual property scores, weights, contributing score, matching score and sum of weights. weight_dict: A dictionary that can be used to override settings - in the semantic equivalence process + in the similarity process Returns: float: A number between 0.0 and 100.0 as a measurement of similarity. 
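
For quick reviewer reference, a minimal usage sketch of the renamed API, mirroring the vulnerability and graph tests earlier in this series. It assumes the default WEIGHTS table covers the "vulnerability" type (as those tests rely on); the printed scores are illustrative and depend on the fixture data, and the weights dict carries only the "_internal" settings, exactly as in the tests.

import stix2

vul1 = stix2.v21.Vulnerability(
    name="Heartbleed",
    external_references=[{"url": "https://example", "source_name": "some-source"}],
)
vul2 = stix2.v21.Vulnerability(
    name="Foo",
    external_references=[{"url": "https://example2", "source_name": "some-source2"}],
)

# Object-level comparison: returns a float between 0.0 and 100.0 and fills
# prop_scores with "matching_score" and "sum_weights".
prop_scores = {}
score = stix2.Environment().object_similarity(vul1, vul2, prop_scores)
print(score, prop_scores["matching_score"], prop_scores["sum_weights"])

# Graph-level comparison over two single-object data stores; prop_scores gains
# "matching_score", "len_pairs" and a per-object "summary".
weights = {
    "_internal": {
        "ignore_spec_version": False,
        "versioning_checks": False,
        "max_depth": 1,
    },
}
graph_scores = {}
similarity = stix2.Environment().graph_similarity(
    stix2.MemoryStore([vul1]), stix2.MemoryStore([vul2]), graph_scores, **weights
)
print(similarity, graph_scores["matching_score"], graph_scores["len_pairs"])
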
From 010593345ccc3420788de22c518dbae929e4ade3 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Thu, 18 Feb 2021 09:16:19 -0500 Subject: [PATCH 17/19] Update stix2/environment.py docstrings Co-authored-by: Chris Lenk --- stix2/environment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stix2/environment.py b/stix2/environment.py index 672fb11..cebb080 100644 --- a/stix2/environment.py +++ b/stix2/environment.py @@ -246,7 +246,7 @@ class Environment(DataStoreMixin): score to result in successfully calling both objects equivalent. This value can be tuned. weight_dict: A dictionary that can be used to override settings - in the semantic equivalence process + in the similarity process Returns: bool: True if the result of the object similarity is greater than or equal to From fa6978969bd63d93e2b2b51c60d720a46e630e49 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Thu, 18 Feb 2021 10:20:42 -0500 Subject: [PATCH 18/19] removing unused imports (backwards breaking) --- stix2/environment.py | 11 +---------- stix2/test/v21/test_environment.py | 30 +++++++++++++++--------------- 2 files changed, 16 insertions(+), 25 deletions(-) diff --git a/stix2/environment.py b/stix2/environment.py index cebb080..d0f694e 100644 --- a/stix2/environment.py +++ b/stix2/environment.py @@ -3,18 +3,9 @@ import copy from .datastore import CompositeDataSource, DataStoreMixin from .equivalence.graph import graph_equivalence, graph_similarity -from .equivalence.object import ( # noqa: F401 - WEIGHTS, check_property_present, custom_pattern_based, exact_match, - list_reference_check, object_equivalence, object_similarity, - partial_external_reference_based, partial_list_based, - partial_location_distance, partial_string_based, partial_timestamp_based, - reference_check, -) +from .equivalence.object import object_equivalence, object_similarity from .parsing import parse as _parse -# TODO: Remove all unused imports that now belong to the equivalence module in the next major release. -# Kept for backwards compatibility. - class ObjectFactory(object): """Easily create STIX objects with default values for certain properties. 
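
Because the comparison-helper re-exports are dropped from stix2.environment here (the backwards-breaking part of this change), downstream code should import them from stix2.equivalence.object instead; a minimal before/after sketch, using the same timestamps as the tests updated below:

# Before: from stix2.environment import exact_match, partial_timestamp_based
# After:
from stix2.equivalence.object import exact_match, partial_timestamp_based

t1 = "2018-10-17T00:14:20.652Z"
t2 = "2018-10-17T12:14:20.652Z"
assert partial_timestamp_based(t1, t2, 1) == 0.5  # 12 hours apart with a one-day tdelta
assert exact_match(t1, t2) == 0.0                 # different strings, so no exact match
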
diff --git a/stix2/test/v21/test_environment.py b/stix2/test/v21/test_environment.py index c3c2701..e7bf4da 100644 --- a/stix2/test/v21/test_environment.py +++ b/stix2/test/v21/test_environment.py @@ -670,17 +670,17 @@ def test_object_similarity_on_unknown_object(): def _x_foobar_checks(obj1, obj2, **weights): matching_score = 0.0 sum_weights = 0.0 - if stix2.environment.check_property_present("external_references", obj1, obj2): + if stix2.equivalence.object.check_property_present("external_references", obj1, obj2): w = weights["external_references"] sum_weights += w - matching_score += w * stix2.environment.partial_external_reference_based( + matching_score += w * stix2.equivalence.object.partial_external_reference_based( obj1["external_references"], obj2["external_references"], ) - if stix2.environment.check_property_present("name", obj1, obj2): + if stix2.equivalence.object.check_property_present("name", obj1, obj2): w = weights["name"] sum_weights += w - matching_score += w * stix2.environment.partial_string_based(obj1["name"], obj2["name"]) + matching_score += w * stix2.equivalence.object.partial_string_based(obj1["name"], obj2["name"]) return matching_score, sum_weights weights = { @@ -731,9 +731,9 @@ def test_object_similarity_zero_match(): ) weights = { "indicator": { - "indicator_types": (15, stix2.environment.partial_list_based), - "pattern": (80, stix2.environment.custom_pattern_based), - "valid_from": (5, stix2.environment.partial_timestamp_based), + "indicator_types": (15, stix2.equivalence.object.partial_list_based), + "pattern": (80, stix2.equivalence.object.custom_pattern_based), + "valid_from": (5, stix2.equivalence.object.partial_timestamp_based), "tdelta": 1, # One day interval }, "_internal": { @@ -755,9 +755,9 @@ def test_object_similarity_different_spec_version(): ) weights = { "indicator": { - "indicator_types": (15, stix2.environment.partial_list_based), - "pattern": (80, stix2.environment.custom_pattern_based), - "valid_from": (5, stix2.environment.partial_timestamp_based), + "indicator_types": (15, stix2.equivalence.object.partial_list_based), + "pattern": (80, stix2.equivalence.object.custom_pattern_based), + "valid_from": (5, stix2.equivalence.object.partial_timestamp_based), "tdelta": 1, # One day interval }, "_internal": { @@ -842,20 +842,20 @@ def test_object_similarity_different_spec_version(): ], ) def test_object_similarity_external_references(refs1, refs2, ret_val): - value = stix2.environment.partial_external_reference_based(refs1, refs2) + value = stix2.equivalence.object.partial_external_reference_based(refs1, refs2) assert value == ret_val def test_object_similarity_timestamp(): t1 = "2018-10-17T00:14:20.652Z" t2 = "2018-10-17T12:14:20.652Z" - assert stix2.environment.partial_timestamp_based(t1, t2, 1) == 0.5 + assert stix2.equivalence.object.partial_timestamp_based(t1, t2, 1) == 0.5 def test_object_similarity_exact_match(): t1 = "2018-10-17T00:14:20.652Z" t2 = "2018-10-17T12:14:20.652Z" - assert stix2.environment.exact_match(t1, t2) == 0.0 + assert stix2.equivalence.object.exact_match(t1, t2) == 0.0 def test_non_existent_config_for_object(): @@ -877,8 +877,8 @@ def test_object_similarity_method_provided(): weights = { "tool": { - "tool_types": (20, stix2.environment.partial_list_based), - "name": (80, stix2.environment.partial_string_based), + "tool_types": (20, stix2.equivalence.object.partial_list_based), + "name": (80, stix2.equivalence.object.partial_string_based), "method": custom_semantic_equivalence_method, }, } From 
99453770cfe0d072d6335b395308b5a6b3337fcd Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Thu, 18 Feb 2021 10:37:34 -0500 Subject: [PATCH 19/19] doctring changes, _versioned_checks changes --- stix2/equivalence/__init__.py | 2 +- stix2/equivalence/graph/__init__.py | 3 +-- stix2/equivalence/object/__init__.py | 37 ++++++++++++++++------------ 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/stix2/equivalence/__init__.py b/stix2/equivalence/__init__.py index f175024..0ca9d83 100644 --- a/stix2/equivalence/__init__.py +++ b/stix2/equivalence/__init__.py @@ -1,4 +1,4 @@ -"""Python APIs for STIX 2 Semantic Equivalence. +"""Python APIs for STIX 2 Semantic Equivalence and Similarity. .. autosummary:: :toctree: equivalence diff --git a/stix2/equivalence/graph/__init__.py b/stix2/equivalence/graph/__init__.py index 402bcb2..e78624e 100644 --- a/stix2/equivalence/graph/__init__.py +++ b/stix2/equivalence/graph/__init__.py @@ -97,7 +97,6 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): if weights["_internal"]["max_depth"] <= 0: raise ValueError("weight_dict['_internal']['max_depth'] must be greater than 0") - depth = weights["_internal"]["max_depth"] pairs = _object_pairs( _bucket_per_type(ds1.query([])), @@ -108,13 +107,13 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): weights["_internal"]["ds1"] = ds1 weights["_internal"]["ds2"] = ds2 + logger.debug("Starting graph similarity process between DataStores: '%s' and '%s'", ds1.id, ds2.id) for object1, object2 in pairs: iprop_score = {} object1_id = object1["id"] object2_id = object2["id"] result = object_similarity(object1, object2, iprop_score, **weights) - weights["_internal"]["max_depth"] = depth if object1_id not in results: results[object1_id] = {"lhs": object1_id, "rhs": object2_id, "prop_score": iprop_score, "value": result} diff --git a/stix2/equivalence/object/__init__.py b/stix2/equivalence/object/__init__.py index 790fc87..e175938 100644 --- a/stix2/equivalence/object/__init__.py +++ b/stix2/equivalence/object/__init__.py @@ -103,13 +103,13 @@ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict): try: weights[type1] except KeyError: - logger.warning("'%s' type has no 'weights' dict specified & thus no semantic equivalence method to call!", type1) + logger.warning("'%s' type has no 'weights' dict specified & thus no object similarity method to call!", type1) sum_weights = matching_score = 0 else: try: method = weights[type1]["method"] except KeyError: - logger.debug("Starting semantic equivalence process between: '%s' and '%s'", obj1["id"], obj2["id"]) + logger.debug("Starting object similarity process between: '%s' and '%s'", obj1["id"], obj2["id"]) matching_score = 0.0 sum_weights = 0.0 @@ -129,9 +129,9 @@ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict): weights["_internal"]["max_depth"] = max_depth - 1 ds1, ds2 = weights["_internal"]["ds1"], weights["_internal"]["ds2"] contributing_score = w * comp_funct(obj1[prop], obj2[prop], ds1, ds2, **weights) - weights["_internal"]["max_depth"] = max_depth + 1 else: continue # prevent excessive recursion + weights["_internal"]["max_depth"] = max_depth else: contributing_score = w * comp_funct(obj1[prop], obj2[prop]) @@ -148,7 +148,7 @@ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict): prop_scores["sum_weights"] = sum_weights logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) else: - logger.debug("Starting semantic equivalence process between: '%s' and '%s'", 
obj1["id"], obj2["id"]) + logger.debug("Starting object similarity process between: '%s' and '%s'", obj1["id"], obj2["id"]) try: matching_score, sum_weights = method(obj1, obj2, prop_scores, **weights[type1]) except TypeError: @@ -350,19 +350,24 @@ def partial_location_distance(lat1, long1, lat2, long2, threshold): def _versioned_checks(ref1, ref2, ds1, ds2, **weights): """Checks multiple object versions if present in graph. - Maximizes for the semantic equivalence score of a particular version.""" + Maximizes for the similarity score of a particular version.""" results = {} objects1 = ds1.query([Filter("id", "=", ref1)]) objects2 = ds2.query([Filter("id", "=", ref2)]) - if len(objects1) > 0 and len(objects2) > 0: - for o1 in objects1: - for o2 in objects2: - result = object_similarity(o1, o2, **weights) - if ref1 not in results: - results[ref1] = {"matched": ref2, "value": result} - elif result > results[ref1]["value"]: - results[ref1] = {"matched": ref2, "value": result} + pairs = _object_pairs( + _bucket_per_type(objects1), + _bucket_per_type(objects2), + weights, + ) + + for object1, object2 in pairs: + result = object_similarity(object1, object2, **weights) + if ref1 not in results: + results[ref1] = {"matched": ref2, "value": result} + elif result > results[ref1]["value"]: + results[ref1] = {"matched": ref2, "value": result} + result = results.get(ref1, {}).get("value", 0.0) logger.debug( "--\t\t_versioned_checks '%s' '%s'\tresult: '%s'", @@ -372,8 +377,8 @@ def _versioned_checks(ref1, ref2, ds1, ds2, **weights): def reference_check(ref1, ref2, ds1, ds2, **weights): - """For two references, de-reference the object and perform object-based - semantic equivalence. The score influences the result of an edge check.""" + """For two references, de-reference the object and perform object_similarity. + The score influences the result of an edge check.""" type1, type2 = ref1.split("--")[0], ref2.split("--")[0] result = 0.0 @@ -394,7 +399,7 @@ def reference_check(ref1, ref2, ds1, ds2, **weights): def list_reference_check(refs1, refs2, ds1, ds2, **weights): """For objects that contain multiple references (i.e., object_refs) perform - the same de-reference procedure and perform object-based semantic equivalence. + the same de-reference procedure and perform object_similarity. The score influences the objects containing these references. The result is weighted on the amount of unique objects that could 1) be de-referenced 2) """ results = {}