From 489970718f038a67011f468c3649225971689415 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Mon, 1 Feb 2021 22:35:37 -0500 Subject: [PATCH 01/27] WIP: changes to graph_similarity busted main loop, symmetrical properties not present --- stix2/environment.py | 12 ++--- stix2/equivalence/graph/__init__.py | 73 +++++++++++++++---------- stix2/equivalence/object/__init__.py | 6 +-- stix2/test/v21/test_environment.py | 79 +++++++++++++++------------- 4 files changed, 97 insertions(+), 73 deletions(-) diff --git a/stix2/environment.py b/stix2/environment.py index 4dc6ff0..bc7fcaf 100644 --- a/stix2/environment.py +++ b/stix2/environment.py @@ -2,12 +2,12 @@ import copy from .datastore import CompositeDataSource, DataStoreMixin -from .equivalence.graph import graphically_equivalent +from .equivalence.graph import graph_similarity from .equivalence.object import ( # noqa: F401 WEIGHTS, check_property_present, custom_pattern_based, exact_match, list_reference_check, partial_external_reference_based, partial_list_based, partial_location_distance, partial_string_based, partial_timestamp_based, - reference_check, semantically_equivalent, + reference_check, object_similarity, ) from .parsing import parse as _parse @@ -197,7 +197,7 @@ class Environment(DataStoreMixin): return None @staticmethod - def semantically_equivalent(obj1, obj2, prop_scores={}, **weight_dict): + def object_similarity(obj1, obj2, prop_scores={}, **weight_dict): """This method verifies if two objects of the same type are semantically equivalent. @@ -229,10 +229,10 @@ class Environment(DataStoreMixin): see `the Committee Note `__. """ - return semantically_equivalent(obj1, obj2, prop_scores, **weight_dict) + return object_similarity(obj1, obj2, prop_scores, **weight_dict) @staticmethod - def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict): + def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): """This method verifies if two graphs are semantically equivalent. Each DataStore can contain a connected or disconnected graph and the final result is weighted over the amount of objects we managed to compare. @@ -267,4 +267,4 @@ class Environment(DataStoreMixin): see `the Committee Note `__. """ - return graphically_equivalent(ds1, ds2, prop_scores, **weight_dict) + return graph_similarity(ds1, ds2, prop_scores, **weight_dict) diff --git a/stix2/equivalence/graph/__init__.py b/stix2/equivalence/graph/__init__.py index 680f42f..cff99d0 100644 --- a/stix2/equivalence/graph/__init__.py +++ b/stix2/equivalence/graph/__init__.py @@ -1,15 +1,17 @@ """Python APIs for STIX 2 Graph-based Semantic Equivalence.""" +import collections +import itertools import logging from ..object import ( WEIGHTS, exact_match, list_reference_check, partial_string_based, - partial_timestamp_based, reference_check, semantically_equivalent, + partial_timestamp_based, reference_check, object_similarity, ) logger = logging.getLogger(__name__) -def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict): +def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): """This method verifies if two graphs are semantically equivalent. Each DataStore can contain a connected or disconnected graph and the final result is weighted over the amount of objects we managed to compare. @@ -44,49 +46,48 @@ def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict): see `the Committee Note `__. 
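For orientation, here is a minimal sketch of how the renamed entry point is intended to be called once this patch lands. It assumes the patched library is importable as stix2; the id, name, and store contents below are made up.

    import uuid
    import stix2

    # Two toy single-object graphs; the id and name are invented.
    tool_id = "tool--" + str(uuid.uuid4())
    ds1 = stix2.MemoryStore([stix2.v21.Tool(id=tool_id, name="VNC")])
    ds2 = stix2.MemoryStore([stix2.v21.Tool(id=tool_id, name="VNC")])

    prop_scores = {}
    weights = {
        "_internal": {
            "ignore_spec_version": False,
            "versioning_checks": False,
            "max_depth": 1,
        },
    }
    score = stix2.Environment().graph_similarity(ds1, ds2, prop_scores, **weights)
    print(round(score))  # identical graphs are expected to score 100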
""" + results = {} + equivalence_score = 0 weights = GRAPH_WEIGHTS.copy() if weight_dict: weights.update(weight_dict) - results = {} depth = weights["_internal"]["max_depth"] - graph1 = ds1.query([]) - graph2 = ds2.query([]) + graph1 = bucket_per_type(ds1.query([])) + graph2 = bucket_per_type(ds2.query([])) + pairs = object_pairs(graph1, graph2, weights) - graph1.sort(key=lambda x: x["type"]) - graph2.sort(key=lambda x: x["type"]) - - if len(graph1) < len(graph2): + for object1, object2 in pairs: + iprop_score1 = {} + iprop_score2 = {} + object1_id = object1["id"] + object2_id = object2["id"] + weights["_internal"]["max_depth"] = depth weights["_internal"]["ds1"] = ds1 weights["_internal"]["ds2"] = ds2 - g1 = graph1 - g2 = graph2 - else: + result1 = object_similarity(object1, object2, iprop_score1, **weights) + weights["_internal"]["ds1"] = ds2 weights["_internal"]["ds2"] = ds1 - g1 = graph2 - g2 = graph1 + result2 = object_similarity(object2, object1, iprop_score2, **weights) - for object1 in g1: - for object2 in g2: - if object1["type"] == object2["type"] and object1["type"] in weights: - iprop_score = {} - result = semantically_equivalent(object1, object2, iprop_score, **weights) - objects1_id = object1["id"] - weights["_internal"]["max_depth"] = depth + if object1_id not in results: + results[object1_id] = {"lhs": object1["id"], "rhs": object2["id"], "prop_score": iprop_score1, "value": result1} + elif result1 > results[object1_id]["value"]: + results[object1_id] = {"lhs": object1["id"], "rhs": object2["id"], "prop_score": iprop_score1, "value": result1} - if objects1_id not in results: - results[objects1_id] = {"matched": object2["id"], "prop_score": iprop_score, "value": result} - elif result > results[objects1_id]["value"]: - results[objects1_id] = {"matched": object2["id"], "prop_score": iprop_score, "value": result} + if object2_id not in results: + results[object2_id] = {"lhs": object2["id"], "rhs": object1["id"], "prop_score": iprop_score2, "value": result2} + elif result1 > results[object2_id]["value"]: + results[object2_id] = {"lhs": object2["id"], "rhs": object1["id"], "prop_score": iprop_score2, "value": result2} - equivalence_score = 0 matching_score = sum(x["value"] for x in results.values()) - sum_weights = len(results) * 100.0 + sum_weights = len(results) if sum_weights > 0: - equivalence_score = (matching_score / sum_weights) * 100 + equivalence_score = matching_score / sum_weights + prop_scores["matching_score"] = matching_score prop_scores["sum_weights"] = sum_weights prop_scores["summary"] = results @@ -100,6 +101,22 @@ def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict): return equivalence_score +def bucket_per_type(g): + buckets = collections.defaultdict(list) + [buckets[obj["type"]].append(obj) for obj in g] + return buckets + + +def object_pairs(g1, g2, w): + types_in_common = set(g1.keys()).intersection(g2.keys()) + testable_types = types_in_common.intersection(w.keys()) + + return itertools.chain.from_iterable( + itertools.product(g1[stix_type], g2[stix_type]) + for stix_type in testable_types + ) + + # default weights used for the graph semantic equivalence process GRAPH_WEIGHTS = WEIGHTS.copy() GRAPH_WEIGHTS.update({ diff --git a/stix2/equivalence/object/__init__.py b/stix2/equivalence/object/__init__.py index 0225788..8b1ceaa 100644 --- a/stix2/equivalence/object/__init__.py +++ b/stix2/equivalence/object/__init__.py @@ -9,7 +9,7 @@ from ..pattern import equivalent_patterns logger = logging.getLogger(__name__) -def 
semantically_equivalent(obj1, obj2, prop_scores={}, **weight_dict): +def object_similarity(obj1, obj2, prop_scores={}, **weight_dict): """This method verifies if two objects of the same type are semantically equivalent. @@ -312,7 +312,7 @@ def _versioned_checks(ref1, ref2, ds1, ds2, **weights): if len(objects1) > 0 and len(objects2) > 0: for o1 in objects1: for o2 in objects2: - result = semantically_equivalent(o1, o2, **weights) + result = object_similarity(o1, o2, **weights) if ref1 not in results: results[ref1] = {"matched": ref2, "value": result} elif result > results[ref1]["value"]: @@ -337,7 +337,7 @@ def reference_check(ref1, ref2, ds1, ds2, **weights): else: o1, o2 = ds1.get(ref1), ds2.get(ref2) if o1 and o2: - result = semantically_equivalent(o1, o2, **weights) / 100.0 + result = object_similarity(o1, o2, **weights) / 100.0 logger.debug( "--\t\treference_check '%s' '%s'\tresult: '%s'", diff --git a/stix2/test/v21/test_environment.py b/stix2/test/v21/test_environment.py index 0da01d1..5682ad1 100644 --- a/stix2/test/v21/test_environment.py +++ b/stix2/test/v21/test_environment.py @@ -429,7 +429,7 @@ def test_related_to_by_target(ds): def test_semantic_equivalence_on_same_attack_pattern1(): ap1 = stix2.v21.AttackPattern(id=ATTACK_PATTERN_ID, **ATTACK_PATTERN_KWARGS) ap2 = stix2.v21.AttackPattern(id=ATTACK_PATTERN_ID, **ATTACK_PATTERN_KWARGS) - env = stix2.Environment().semantically_equivalent(ap1, ap2) + env = stix2.Environment().object_similarity(ap1, ap2) assert round(env) == 100 @@ -445,14 +445,14 @@ def test_semantic_equivalence_on_same_attack_pattern2(): ) ap1 = stix2.v21.AttackPattern(id=ATTACK_PATTERN_ID, **ATTACK_KWARGS) ap2 = stix2.v21.AttackPattern(id=ATTACK_PATTERN_ID, **ATTACK_KWARGS) - env = stix2.Environment().semantically_equivalent(ap1, ap2) + env = stix2.Environment().object_similarity(ap1, ap2) assert round(env) == 100 def test_semantic_equivalence_on_same_campaign1(): camp1 = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMPAIGN_KWARGS) camp2 = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMPAIGN_KWARGS) - env = stix2.Environment().semantically_equivalent(camp1, camp2) + env = stix2.Environment().object_similarity(camp1, camp2) assert round(env) == 100 @@ -464,14 +464,14 @@ def test_semantic_equivalence_on_same_campaign2(): ) camp1 = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMP_KWARGS) camp2 = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMP_KWARGS) - env = stix2.Environment().semantically_equivalent(camp1, camp2) + env = stix2.Environment().object_similarity(camp1, camp2) assert round(env) == 100 def test_semantic_equivalence_on_same_identity1(): iden1 = stix2.v21.Identity(id=IDENTITY_ID, **IDENTITY_KWARGS) iden2 = stix2.v21.Identity(id=IDENTITY_ID, **IDENTITY_KWARGS) - env = stix2.Environment().semantically_equivalent(iden1, iden2) + env = stix2.Environment().object_similarity(iden1, iden2) assert round(env) == 100 @@ -483,14 +483,14 @@ def test_semantic_equivalence_on_same_identity2(): ) iden1 = stix2.v21.Identity(id=IDENTITY_ID, **IDEN_KWARGS) iden2 = stix2.v21.Identity(id=IDENTITY_ID, **IDEN_KWARGS) - env = stix2.Environment().semantically_equivalent(iden1, iden2) + env = stix2.Environment().object_similarity(iden1, iden2) assert round(env) == 100 def test_semantic_equivalence_on_same_indicator(): ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) ind2 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) - env = stix2.Environment().semantically_equivalent(ind1, ind2) + env = stix2.Environment().object_similarity(ind1, ind2) assert round(env) == 100 @@ -498,7 +498,7 
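As the renamed tests here suggest, identical objects should still score 100 under the new name. A minimal usage sketch, with a made-up id:

    import uuid
    import stix2

    ap_id = "attack-pattern--" + str(uuid.uuid4())
    ap1 = stix2.v21.AttackPattern(id=ap_id, name="Phishing")
    ap2 = stix2.v21.AttackPattern(id=ap_id, name="Phishing")
    assert round(stix2.Environment().object_similarity(ap1, ap2)) == 100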
@@ def test_semantic_equivalence_on_same_location1(): location_kwargs = dict(latitude=45, longitude=179) loc1 = stix2.v21.Location(id=LOCATION_ID, **location_kwargs) loc2 = stix2.v21.Location(id=LOCATION_ID, **location_kwargs) - env = stix2.Environment().semantically_equivalent(loc1, loc2) + env = stix2.Environment().object_similarity(loc1, loc2) assert round(env) == 100 @@ -511,7 +511,7 @@ def test_semantic_equivalence_on_same_location2(): ) loc1 = stix2.v21.Location(id=LOCATION_ID, **location_kwargs) loc2 = stix2.v21.Location(id=LOCATION_ID, **location_kwargs) - env = stix2.Environment().semantically_equivalent(loc1, loc2) + env = stix2.Environment().object_similarity(loc1, loc2) assert round(env) == 100 @@ -519,21 +519,21 @@ def test_semantic_equivalence_location_with_no_latlong(): loc_kwargs = dict(country="US", administrative_area="US-DC") loc1 = stix2.v21.Location(id=LOCATION_ID, **LOCATION_KWARGS) loc2 = stix2.v21.Location(id=LOCATION_ID, **loc_kwargs) - env = stix2.Environment().semantically_equivalent(loc1, loc2) + env = stix2.Environment().object_similarity(loc1, loc2) assert round(env) != 100 def test_semantic_equivalence_on_same_malware(): malw1 = stix2.v21.Malware(id=MALWARE_ID, **MALWARE_KWARGS) malw2 = stix2.v21.Malware(id=MALWARE_ID, **MALWARE_KWARGS) - env = stix2.Environment().semantically_equivalent(malw1, malw2) + env = stix2.Environment().object_similarity(malw1, malw2) assert round(env) == 100 def test_semantic_equivalence_on_same_threat_actor1(): ta1 = stix2.v21.ThreatActor(id=THREAT_ACTOR_ID, **THREAT_ACTOR_KWARGS) ta2 = stix2.v21.ThreatActor(id=THREAT_ACTOR_ID, **THREAT_ACTOR_KWARGS) - env = stix2.Environment().semantically_equivalent(ta1, ta2) + env = stix2.Environment().object_similarity(ta1, ta2) assert round(env) == 100 @@ -545,21 +545,21 @@ def test_semantic_equivalence_on_same_threat_actor2(): ) ta1 = stix2.v21.ThreatActor(id=THREAT_ACTOR_ID, **THREAT_KWARGS) ta2 = stix2.v21.ThreatActor(id=THREAT_ACTOR_ID, **THREAT_KWARGS) - env = stix2.Environment().semantically_equivalent(ta1, ta2) + env = stix2.Environment().object_similarity(ta1, ta2) assert round(env) == 100 def test_semantic_equivalence_on_same_tool(): tool1 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS) tool2 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS) - env = stix2.Environment().semantically_equivalent(tool1, tool2) + env = stix2.Environment().object_similarity(tool1, tool2) assert round(env) == 100 def test_semantic_equivalence_on_same_vulnerability1(): vul1 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULNERABILITY_KWARGS) vul2 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULNERABILITY_KWARGS) - env = stix2.Environment().semantically_equivalent(vul1, vul2) + env = stix2.Environment().object_similarity(vul1, vul2) assert round(env) == 100 @@ -584,7 +584,7 @@ def test_semantic_equivalence_on_same_vulnerability2(): ) vul1 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULN_KWARGS1) vul2 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULN_KWARGS2) - env = stix2.Environment().semantically_equivalent(vul1, vul2) + env = stix2.Environment().object_similarity(vul1, vul2) assert round(env) == 0.0 @@ -640,7 +640,7 @@ def test_semantic_equivalence_on_unknown_object(): } cust1 = stix2.parse(CUSTOM_KWARGS1, allow_custom=True) cust2 = stix2.parse(CUSTOM_KWARGS2, allow_custom=True) - env = stix2.Environment().semantically_equivalent(cust1, cust2, **weights) + env = stix2.Environment().object_similarity(cust1, cust2, **weights) assert round(env) == 0 @@ -648,7 +648,7 @@ def 
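The mismatched-type behavior exercised by the next test can be sketched as follows; the objects and ids are invented, and the assertion relies on the error message shown in the test:

    import uuid
    import pytest
    import stix2

    vul = stix2.v21.Vulnerability(
        id="vulnerability--" + str(uuid.uuid4()), name="Heartbleed")
    camp = stix2.v21.Campaign(
        id="campaign--" + str(uuid.uuid4()), name="Green Group")
    with pytest.raises(ValueError) as excinfo:
        stix2.Environment().object_similarity(vul, camp)
    assert "same type" in str(excinfo.value)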
test_semantic_equivalence_different_type_raises(): with pytest.raises(ValueError) as excinfo: vul1 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULNERABILITY_KWARGS) ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) - stix2.Environment().semantically_equivalent(vul1, ind1) + stix2.Environment().object_similarity(vul1, ind1) assert str(excinfo.value) == "The objects to compare must be of the same type!" @@ -661,7 +661,7 @@ def test_semantic_equivalence_different_spec_version_raises(): ) ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) ind2 = stix2.v20.Indicator(id=INDICATOR_ID, **V20_KWARGS) - stix2.Environment().semantically_equivalent(ind1, ind2) + stix2.Environment().object_similarity(ind1, ind2) assert str(excinfo.value) == "The objects to compare must be of the same spec version!" @@ -686,7 +686,7 @@ def test_semantic_equivalence_zero_match(): } ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) ind2 = stix2.v21.Indicator(id=INDICATOR_ID, **IND_KWARGS) - env = stix2.Environment().semantically_equivalent(ind1, ind2, **weights) + env = stix2.Environment().object_similarity(ind1, ind2, **weights) assert round(env) == 0 @@ -708,7 +708,7 @@ def test_semantic_equivalence_different_spec_version(): } ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) ind2 = stix2.v20.Indicator(id=INDICATOR_ID, **IND_KWARGS) - env = stix2.Environment().semantically_equivalent(ind1, ind2, **weights) + env = stix2.Environment().object_similarity(ind1, ind2, **weights) assert round(env) == 0 @@ -800,7 +800,7 @@ def test_semantic_equivalence_exact_match(): def test_non_existent_config_for_object(): r1 = stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS) r2 = stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS) - assert stix2.Environment().semantically_equivalent(r1, r2) == 0.0 + assert stix2.Environment().object_similarity(r1, r2) == 0.0 def custom_semantic_equivalence_method(obj1, obj2, **weights): @@ -824,7 +824,7 @@ def test_semantic_equivalence_method_provided(): tool1 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS) tool2 = stix2.v21.Tool(id=TOOL_ID, **TOOL2_KWARGS) - env = stix2.Environment().semantically_equivalent(tool1, tool2, **weights) + env = stix2.Environment().object_similarity(tool1, tool2, **weights) assert round(env) == 96 @@ -838,7 +838,7 @@ def test_semantic_equivalence_prop_scores(): tool1 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS) tool2 = stix2.v21.Tool(id=TOOL_ID, **TOOL2_KWARGS) - stix2.Environment().semantically_equivalent(tool1, tool2, prop_scores) + stix2.Environment().object_similarity(tool1, tool2, prop_scores) assert len(prop_scores) == 4 assert round(prop_scores["matching_score"], 1) == 8.9 assert round(prop_scores["sum_weights"], 1) == 100.0 @@ -868,7 +868,7 @@ def test_semantic_equivalence_prop_scores_method_provided(): tool1 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS) tool2 = stix2.v21.Tool(id=TOOL_ID, **TOOL2_KWARGS) - env = stix2.Environment().semantically_equivalent(tool1, tool2, prop_scores, **weights) + env = stix2.Environment().object_similarity(tool1, tool2, prop_scores, **weights) assert round(env) == 96 assert len(prop_scores) == 2 assert prop_scores["matching_score"] == 96.0 @@ -964,12 +964,19 @@ def test_graph_equivalence_with_filesystem_source(ds): "max_depth": 1, }, } - prop_scores = {} + prop_scores1 = {} + prop_scores2 = {} fs = stix2.FileSystemSource(FS_PATH) - env = stix2.Environment().graphically_equivalent(fs, ds, prop_scores, **weights) - assert round(env) == 24 - assert round(prop_scores["matching_score"]) == 122 - 
assert round(prop_scores["sum_weights"]) == 500 + env = stix2.Environment().graph_similarity(fs, ds, prop_scores1, **weights) + assert round(env) == 26 + assert round(prop_scores1["matching_score"]) == 460 + assert round(prop_scores1["sum_weights"]) == 18 + + env = stix2.Environment().graph_similarity(ds, fs, prop_scores2, **weights) + assert round(env) == 47 + assert round(prop_scores2["matching_score"]) == 852 + assert round(prop_scores2["sum_weights"]) == 18 + assert prop_scores1 == prop_scores2 def test_graph_equivalence_with_duplicate_graph(ds): @@ -981,10 +988,10 @@ def test_graph_equivalence_with_duplicate_graph(ds): }, } prop_scores = {} - env = stix2.Environment().graphically_equivalent(ds, ds, prop_scores, **weights) + env = stix2.Environment().graph_similarity(ds, ds, prop_scores, **weights) assert round(env) == 100 assert round(prop_scores["matching_score"]) == 800 - assert round(prop_scores["sum_weights"]) == 800 + assert round(prop_scores["sum_weights"]) == 8 def test_graph_equivalence_with_versioning_check_on(ds2, ds): @@ -996,10 +1003,10 @@ def test_graph_equivalence_with_versioning_check_on(ds2, ds): }, } prop_scores = {} - env = stix2.Environment().graphically_equivalent(ds, ds2, prop_scores, **weights) + env = stix2.Environment().graph_similarity(ds, ds2, prop_scores, **weights) assert round(env) == 93 assert round(prop_scores["matching_score"]) == 745 - assert round(prop_scores["sum_weights"]) == 800 + assert round(prop_scores["sum_weights"]) == 8 def test_graph_equivalence_with_versioning_check_off(ds2, ds): @@ -1011,7 +1018,7 @@ def test_graph_equivalence_with_versioning_check_off(ds2, ds): }, } prop_scores = {} - env = stix2.Environment().graphically_equivalent(ds, ds2, prop_scores, **weights) + env = stix2.Environment().graph_similarity(ds, ds2, prop_scores, **weights) assert round(env) == 93 assert round(prop_scores["matching_score"]) == 745 - assert round(prop_scores["sum_weights"]) == 800 + assert round(prop_scores["sum_weights"]) == 8 From 02b076b3bb60e1c8fd659d56010780acc32e4ce0 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Tue, 16 Feb 2021 00:57:26 -0500 Subject: [PATCH 02/27] resolve issues with graph similarity - new methods for graph equivalence and similarity - remove sorting and len comparisons - rename some variables --- stix2/equivalence/graph/__init__.py | 111 ++++++++++++++++--------- stix2/equivalence/object/__init__.py | 119 ++++++++++++++++++++------- 2 files changed, 160 insertions(+), 70 deletions(-) diff --git a/stix2/equivalence/graph/__init__.py b/stix2/equivalence/graph/__init__.py index cff99d0..797aa23 100644 --- a/stix2/equivalence/graph/__init__.py +++ b/stix2/equivalence/graph/__init__.py @@ -1,21 +1,62 @@ -"""Python APIs for STIX 2 Graph-based Semantic Equivalence.""" -import collections -import itertools +"""Python APIs for STIX 2 Graph-based Semantic Equivalence and Similarity.""" import logging from ..object import ( WEIGHTS, exact_match, list_reference_check, partial_string_based, - partial_timestamp_based, reference_check, object_similarity, + partial_timestamp_based, reference_check, object_similarity, object_pairs, bucket_per_type ) logger = logging.getLogger(__name__) +def graph_equivalence(ds1, ds2, prop_scores={}, threshold=70, **weight_dict): + """This method returns a true/false value if two graphs are semantically equivalent. + Internally, it calls the graph_similarity function and compares it against the given + threshold value. 
+ + Args: + ds1: A DataStore object instance representing your graph + ds2: A DataStore object instance representing your graph + prop_scores: A dictionary that can hold individual property scores, + weights, contributing score, matching score and sum of weights. + threshold: A numerical value between 0 and 100 to determine the minimum + score to result in successfully calling both graphs equivalent. This + value can be tuned. + weight_dict: A dictionary that can be used to override settings + in the similarity process + + Returns: + bool: True if the result of the graph similarity is greater than or equal to + the threshold value. False otherwise. + + Warning: + Object types need to have property weights defined for the similarity process. + Otherwise, those objects will not influence the final score. The WEIGHTS + dictionary under `stix2.equivalence.graph` can give you an idea on how to add + new entries and pass them via the `weight_dict` argument. Similarly, the values + or methods can be fine tuned for a particular use case. + + Note: + Default weight_dict: + + .. include:: ../../graph_default_sem_eq_weights.rst + + Note: + This implementation follows the Semantic Equivalence Committee Note. + see `the Committee Note `__. + + """ + similarity_result = graph_similarity(ds1, ds2, prop_scores, **weight_dict) + if similarity_result >= threshold: + return True + return False + + def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): - """This method verifies if two graphs are semantically equivalent. + """This method returns a similarity score for two given graphs. Each DataStore can contain a connected or disconnected graph and the final result is weighted over the amount of objects we managed to compare. - This approach builds on top of the object-based semantic equivalence process + This approach builds on top of the object-based similarity process and each comparison can return a value between 0 and 100. Args: @@ -24,20 +65,20 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): prop_scores: A dictionary that can hold individual property scores, weights, contributing score, matching score and sum of weights. weight_dict: A dictionary that can be used to override settings - in the semantic equivalence process + in the similarity process Returns: - float: A number between 0.0 and 100.0 as a measurement of equivalence. + float: A number between 0.0 and 100.0 as a measurement of similarity. Warning: - Object types need to have property weights defined for the equivalence process. + Object types need to have property weights defined for the similarity process. Otherwise, those objects will not influence the final score. The WEIGHTS dictionary under `stix2.equivalence.graph` can give you an idea on how to add new entries and pass them via the `weight_dict` argument. Similarly, the values or methods can be fine tuned for a particular use case. Note: - Default weights_dict: + Default weight_dict: .. 
include:: ../../graph_default_sem_eq_weights.rst @@ -47,12 +88,14 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): """ results = {} - equivalence_score = 0 + similarity_score = 0 weights = GRAPH_WEIGHTS.copy() if weight_dict: weights.update(weight_dict) + if weights["_internal"]["max_depth"] <= 0: + raise ValueError("weight_dict['_internal']['max_depth'] must be greater than 0") depth = weights["_internal"]["max_depth"] graph1 = bucket_per_type(ds1.query([])) @@ -64,60 +107,46 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): iprop_score2 = {} object1_id = object1["id"] object2_id = object2["id"] + weights["_internal"]["max_depth"] = depth weights["_internal"]["ds1"] = ds1 weights["_internal"]["ds2"] = ds2 result1 = object_similarity(object1, object2, iprop_score1, **weights) + weights["_internal"]["max_depth"] = depth weights["_internal"]["ds1"] = ds2 weights["_internal"]["ds2"] = ds1 result2 = object_similarity(object2, object1, iprop_score2, **weights) if object1_id not in results: - results[object1_id] = {"lhs": object1["id"], "rhs": object2["id"], "prop_score": iprop_score1, "value": result1} + results[object1_id] = {"lhs": object1_id, "rhs": object2_id, "prop_score": iprop_score1, "value": result1} elif result1 > results[object1_id]["value"]: - results[object1_id] = {"lhs": object1["id"], "rhs": object2["id"], "prop_score": iprop_score1, "value": result1} + results[object1_id] = {"lhs": object1_id, "rhs": object2_id, "prop_score": iprop_score1, "value": result1} if object2_id not in results: - results[object2_id] = {"lhs": object2["id"], "rhs": object1["id"], "prop_score": iprop_score2, "value": result2} - elif result1 > results[object2_id]["value"]: - results[object2_id] = {"lhs": object2["id"], "rhs": object1["id"], "prop_score": iprop_score2, "value": result2} + results[object2_id] = {"lhs": object2_id, "rhs": object1_id, "prop_score": iprop_score2, "value": result2} + elif result2 > results[object2_id]["value"]: + results[object2_id] = {"lhs": object2_id, "rhs": object1_id, "prop_score": iprop_score2, "value": result2} matching_score = sum(x["value"] for x in results.values()) - sum_weights = len(results) - if sum_weights > 0: - equivalence_score = matching_score / sum_weights + len_pairs = len(results) + if len_pairs > 0: + similarity_score = matching_score / len_pairs prop_scores["matching_score"] = matching_score - prop_scores["sum_weights"] = sum_weights + prop_scores["len_pairs"] = len_pairs prop_scores["summary"] = results logger.debug( - "DONE\t\tSUM_WEIGHT: %.2f\tMATCHING_SCORE: %.2f\t SCORE: %.2f", - sum_weights, + "DONE\t\tSUM_PAIRS: %.2f\tMATCHING_SCORE: %.2f\t SIMILARITY_SCORE: %.2f", + len_pairs, matching_score, - equivalence_score, + similarity_score, ) - return equivalence_score + return similarity_score -def bucket_per_type(g): - buckets = collections.defaultdict(list) - [buckets[obj["type"]].append(obj) for obj in g] - return buckets - - -def object_pairs(g1, g2, w): - types_in_common = set(g1.keys()).intersection(g2.keys()) - testable_types = types_in_common.intersection(w.keys()) - - return itertools.chain.from_iterable( - itertools.product(g1[stix_type], g2[stix_type]) - for stix_type in testable_types - ) - - -# default weights used for the graph semantic equivalence process +# default weights used for the graph similarity process GRAPH_WEIGHTS = WEIGHTS.copy() GRAPH_WEIGHTS.update({ "grouping": { diff --git a/stix2/equivalence/object/__init__.py b/stix2/equivalence/object/__init__.py index 8b1ceaa..13e029c 100644 --- 
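The rewritten loop's bookkeeping (keep the best-valued match per object id, in both directions, then average over the matched ids) can be isolated as a small sketch with invented ids and scores:

    def record_best(results, lhs_id, rhs_id, prop_score, value):
        # Keep only the highest-valued match seen for lhs_id.
        current = results.get(lhs_id)
        if current is None or value > current["value"]:
            results[lhs_id] = {"lhs": lhs_id, "rhs": rhs_id,
                               "prop_score": prop_score, "value": value}

    results = {}
    record_best(results, "tool--a", "tool--x", {}, 40.0)
    record_best(results, "tool--a", "tool--y", {}, 75.0)  # replaces the 40.0 match
    record_best(results, "tool--y", "tool--a", {}, 75.0)  # reverse direction too
    matching_score = sum(entry["value"] for entry in results.values())
    similarity_score = matching_score / len(results)
    print(similarity_score)  # 75.0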
a/stix2/equivalence/object/__init__.py +++ b/stix2/equivalence/object/__init__.py @@ -1,4 +1,6 @@ -"""Python APIs for STIX 2 Object-based Semantic Equivalence.""" +"""Python APIs for STIX 2 Object-based Semantic Equivalence and Similarity.""" +import collections +import itertools import logging import time @@ -9,9 +11,52 @@ from ..pattern import equivalent_patterns logger = logging.getLogger(__name__) +def object_equivalence(obj1, obj2, prop_scores={}, threshold=70, **weight_dict): + """This method returns a true/false value if two objects are semantically equivalent. + Internally, it calls the object_similarity function and compares it against the given + threshold value. + + Args: + obj1: A stix2 object instance + obj2: A stix2 object instance + prop_scores: A dictionary that can hold individual property scores, + weights, contributing score, matching score and sum of weights. + threshold: A numerical value between 0 and 100 to determine the minimum + score to result in successfully calling both objects equivalent. This + value can be tuned. + weight_dict: A dictionary that can be used to override settings + in the semantic equivalence process + + Returns: + bool: True if the result of the object similarity is greater than or equal to + the threshold value. False otherwise. + + Warning: + Object types need to have property weights defined for the similarity process. + Otherwise, those objects will not influence the final score. The WEIGHTS + dictionary under `stix2.equivalence.object` can give you an idea on how to add + new entries and pass them via the `weight_dict` argument. Similarly, the values + or methods can be fine tuned for a particular use case. + + Note: + Default weight_dict: + + .. include:: ../../object_default_sem_eq_weights.rst + + Note: + This implementation follows the Semantic Equivalence Committee Note. + see `the Committee Note `__. + + """ + similarity_result = object_similarity(obj1, obj2, prop_scores, **weight_dict) + if similarity_result >= threshold: + return True + return False + + def object_similarity(obj1, obj2, prop_scores={}, **weight_dict): - """This method verifies if two objects of the same type are - semantically equivalent. + """This method returns a measure of similarity depending on how + similar the two objects are. Args: obj1: A stix2 object instance @@ -22,17 +67,17 @@ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict): in the semantic equivalence process Returns: - float: A number between 0.0 and 100.0 as a measurement of equivalence. + float: A number between 0.0 and 100.0 as a measurement of similarity. Warning: - Object types need to have property weights defined for the equivalence process. + Object types need to have property weights defined for the similarity process. Otherwise, those objects will not influence the final score. The WEIGHTS dictionary under `stix2.equivalence.object` can give you an idea on how to add new entries and pass them via the `weight_dict` argument. Similarly, the values or methods can be fine tuned for a particular use case. Note: - Default weights_dict: + Default weight_dict: .. include:: ../../object_default_sem_eq_weights.rst @@ -352,34 +397,31 @@ def list_reference_check(refs1, refs2, ds1, ds2, **weights): The score influences the objects containing these references. 
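A minimal usage sketch for the new module-level function, assuming the import path shown in this diff and a made-up id:

    import uuid
    import stix2
    from stix2.equivalence.object import object_equivalence

    tool_id = "tool--" + str(uuid.uuid4())
    tool1 = stix2.v21.Tool(id=tool_id, name="VNC")
    tool2 = stix2.v21.Tool(id=tool_id, name="VNC")
    assert object_equivalence(tool1, tool2, threshold=70) is True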
The result is weighted on the amount of unique objects that could 1) be de-referenced 2) """ results = {} - if len(refs1) >= len(refs2): - l1 = refs1 - l2 = refs2 - b1 = ds1 - b2 = ds2 - else: - l1 = refs2 - l2 = refs1 - b1 = ds2 - b2 = ds1 - l1.sort() - l2.sort() + pairs = object_pairs( + bucket_per_type(refs1, "id-split"), + bucket_per_type(refs2, "id-split"), + weights + ) - for ref1 in l1: - for ref2 in l2: - type1, type2 = ref1.split("--")[0], ref2.split("--")[0] - if type1 == type2: - score = reference_check(ref1, ref2, b1, b2, **weights) * 100.0 + for ref1, ref2 in pairs: + type1, type2 = ref1.split("--")[0], ref2.split("--")[0] + if type1 == type2: + score = reference_check(ref1, ref2, ds1, ds2, **weights) - if ref1 not in results: - results[ref1] = {"matched": ref2, "value": score} - elif score > results[ref1]["value"]: - results[ref1] = {"matched": ref2, "value": score} + if ref1 not in results: + results[ref1] = {"matched": ref2, "value": score} + elif score > results[ref1]["value"]: + results[ref1] = {"matched": ref2, "value": score} + + if ref2 not in results: + results[ref2] = {"matched": ref1, "value": score} + elif score > results[ref2]["value"]: + results[ref2] = {"matched": ref1, "value": score} result = 0.0 total_sum = sum(x["value"] for x in results.values()) - max_score = len(results) * 100.0 + max_score = len(results) if max_score > 0: result = total_sum / max_score @@ -391,7 +433,26 @@ def list_reference_check(refs1, refs2, ds1, ds2, **weights): return result -# default weights used for the semantic equivalence process +def bucket_per_type(g, mode="type"): + buckets = collections.defaultdict(list) + if mode == "type": + [buckets[obj["type"]].append(obj) for obj in g] + elif mode == "id-split": + [buckets[obj.split("--")[0]].append(obj) for obj in g] + return buckets + + +def object_pairs(g1, g2, w): + types_in_common = set(g1.keys()).intersection(g2.keys()) + testable_types = types_in_common.intersection(w.keys()) + + return itertools.chain.from_iterable( + itertools.product(g1[stix_type], g2[stix_type]) + for stix_type in testable_types + ) + + +# default weights used for the similarity process WEIGHTS = { "attack-pattern": { "name": (30, partial_string_based), From 690a515f0063232528db1a232ec2a11437361b0b Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Tue, 16 Feb 2021 00:58:33 -0500 Subject: [PATCH 03/27] add methods to environment.py --- stix2/environment.py | 104 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 92 insertions(+), 12 deletions(-) diff --git a/stix2/environment.py b/stix2/environment.py index bc7fcaf..61751f9 100644 --- a/stix2/environment.py +++ b/stix2/environment.py @@ -2,12 +2,12 @@ import copy from .datastore import CompositeDataSource, DataStoreMixin -from .equivalence.graph import graph_similarity +from .equivalence.graph import graph_equivalence, graph_similarity from .equivalence.object import ( # noqa: F401 WEIGHTS, check_property_present, custom_pattern_based, exact_match, list_reference_check, partial_external_reference_based, partial_list_based, partial_location_distance, partial_string_based, partial_timestamp_based, - reference_check, object_similarity, + reference_check, object_equivalence, object_similarity, ) from .parsing import parse as _parse @@ -198,8 +198,8 @@ class Environment(DataStoreMixin): @staticmethod def object_similarity(obj1, obj2, prop_scores={}, **weight_dict): - """This method verifies if two objects of the same type are - semantically equivalent. 
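The "id-split" mode introduced above groups reference strings by the type prefix of the id; a toy illustration with invented references:

    import collections

    refs = ["tool--a", "malware--m", "tool--b"]  # toy reference ids
    buckets = collections.defaultdict(list)
    for ref in refs:
        buckets[ref.split("--")[0]].append(ref)
    assert dict(buckets) == {"tool": ["tool--a", "tool--b"],
                             "malware": ["malware--m"]}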
+ """This method returns a measure of similarity depending on how + similar the two objects are. Args: obj1: A stix2 object instance @@ -210,10 +210,50 @@ class Environment(DataStoreMixin): in the semantic equivalence process Returns: - float: A number between 0.0 and 100.0 as a measurement of equivalence. + float: A number between 0.0 and 100.0 as a measurement of similarity. Warning: - Object types need to have property weights defined for the equivalence process. + Object types need to have property weights defined for the similarity process. + Otherwise, those objects will not influence the final score. The WEIGHTS + dictionary under `stix2.equivalence.object` can give you an idea on how to add + new entries and pass them via the `weight_dict` argument. Similarly, the values + or methods can be fine tuned for a particular use case. + + Note: + Default weight_dict: + + .. include:: ../../object_default_sem_eq_weights.rst + + Note: + This implementation follows the Semantic Equivalence Committee Note. + see `the Committee Note `__. + + """ + return object_similarity(obj1, obj2, prop_scores, **weight_dict) + + @staticmethod + def object_equivalence(obj1, obj2, prop_scores={}, threshold=70, **weight_dict): + """This method returns a true/false value if two objects are semantically equivalent. + Internally, it calls the object_similarity function and compares it against the given + threshold value. + + Args: + obj1: A stix2 object instance + obj2: A stix2 object instance + prop_scores: A dictionary that can hold individual property scores, + weights, contributing score, matching score and sum of weights. + threshold: A numerical value between 0 and 100 to determine the minimum + score to result in successfully calling both objects equivalent. This + value can be tuned. + weight_dict: A dictionary that can be used to override settings + in the semantic equivalence process + + Returns: + bool: True if the result of the object similarity is greater than or equal to + the threshold value. False otherwise. + + Warning: + Object types need to have property weights defined for the similarity process. Otherwise, those objects will not influence the final score. The WEIGHTS dictionary under `stix2.equivalence.object` can give you an idea on how to add new entries and pass them via the `weight_dict` argument. Similarly, the values @@ -229,14 +269,14 @@ class Environment(DataStoreMixin): see `the Committee Note `__. """ - return object_similarity(obj1, obj2, prop_scores, **weight_dict) + return object_equivalence(obj1, obj2, prop_scores, threshold, **weight_dict) @staticmethod def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): - """This method verifies if two graphs are semantically equivalent. + """This method returns a similarity score for two given graphs. Each DataStore can contain a connected or disconnected graph and the final result is weighted over the amount of objects we managed to compare. - This approach builds on top of the object-based semantic equivalence process + This approach builds on top of the object-based similarity process and each comparison can return a value between 0 and 100. Args: @@ -245,13 +285,13 @@ class Environment(DataStoreMixin): prop_scores: A dictionary that can hold individual property scores, weights, contributing score, matching score and sum of weights. 
weight_dict: A dictionary that can be used to override settings - in the semantic equivalence process + in the similarity process Returns: - float: A number between 0.0 and 100.0 as a measurement of equivalence. + float: A number between 0.0 and 100.0 as a measurement of similarity. Warning: - Object types need to have property weights defined for the equivalence process. + Object types need to have property weights defined for the similarity process. Otherwise, those objects will not influence the final score. The WEIGHTS dictionary under `stix2.equivalence.graph` can give you an idea on how to add new entries and pass them via the `weight_dict` argument. Similarly, the values @@ -268,3 +308,43 @@ class Environment(DataStoreMixin): """ return graph_similarity(ds1, ds2, prop_scores, **weight_dict) + + @staticmethod + def graph_equivalence(ds1, ds2, prop_scores={}, threshold=70, **weight_dict): + """This method returns a true/false value if two graphs are semantically equivalent. + Internally, it calls the graph_similarity function and compares it against the given + threshold value. + + Args: + ds1: A DataStore object instance representing your graph + ds2: A DataStore object instance representing your graph + prop_scores: A dictionary that can hold individual property scores, + weights, contributing score, matching score and sum of weights. + threshold: A numerical value between 0 and 100 to determine the minimum + score to result in successfully calling both graphs equivalent. This + value can be tuned. + weight_dict: A dictionary that can be used to override settings + in the similarity process + + Returns: + bool: True if the result of the graph similarity is greater than or equal to + the threshold value. False otherwise. + + Warning: + Object types need to have property weights defined for the similarity process. + Otherwise, those objects will not influence the final score. The WEIGHTS + dictionary under `stix2.equivalence.graph` can give you an idea on how to add + new entries and pass them via the `weight_dict` argument. Similarly, the values + or methods can be fine tuned for a particular use case. + + Note: + Default weight_dict: + + .. include:: ../graph_default_sem_eq_weights.rst + + Note: + This implementation follows the Semantic Equivalence Committee Note. + see `the Committee Note `__. 
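A sketch of what the per-run prop_scores dictionary carries after this series, using made-up single-object graphs:

    import uuid
    import stix2

    camp_id = "campaign--" + str(uuid.uuid4())
    ds1 = stix2.MemoryStore([stix2.v21.Campaign(id=camp_id, name="Green Group")])
    ds2 = stix2.MemoryStore([stix2.v21.Campaign(id=camp_id, name="Green Group")])

    prop_scores = {}
    weights = {"_internal": {"ignore_spec_version": False,
                             "versioning_checks": False,
                             "max_depth": 1}}
    stix2.Environment().graph_similarity(ds1, ds2, prop_scores, **weights)
    # prop_scores now holds "matching_score", "len_pairs", and a per-id
    # "summary" of the best matches found in either direction.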
+ + """ + return graph_equivalence(ds1, ds2, prop_scores, threshold, **weight_dict) From f966c64b40b941ae0c8df7c61760d23e59db0e49 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Tue, 16 Feb 2021 01:05:46 -0500 Subject: [PATCH 04/27] update test suite for environments --- stix2/test/v20/test_environment.py | 116 ++++++++++++++++++++++----- stix2/test/v21/test_environment.py | 122 +++++++++++++++++++++++------ 2 files changed, 196 insertions(+), 42 deletions(-) diff --git a/stix2/test/v20/test_environment.py b/stix2/test/v20/test_environment.py index e572aee..bcc2b60 100644 --- a/stix2/test/v20/test_environment.py +++ b/stix2/test/v20/test_environment.py @@ -1,3 +1,4 @@ +import json import os import pytest @@ -67,6 +68,11 @@ def ds2(): yield stix2.MemoryStore(stix_objs) +@pytest.fixture +def fs(): + yield stix2.FileSystemSource(FS_PATH) + + def test_object_factory_created_by_ref_str(): factory = stix2.ObjectFactory(created_by_ref=IDENTITY_ID) ind = factory.create(stix2.v20.Indicator, **INDICATOR_KWARGS) @@ -497,7 +503,20 @@ def test_list_semantic_check(ds, ds2): assert round(score) == 1 -def test_graph_equivalence_with_filesystem_source(ds): +def test_graph_similarity_raises_value_error(ds): + weights = { + "_internal": { + "ignore_spec_version": False, + "versioning_checks": False, + "max_depth": -1, + }, + } + with pytest.raises(ValueError): + prop_scores1 = {} + stix2.Environment().graph_similarity(ds, ds2, prop_scores1, **weights) + + +def test_graph_equivalence_with_filesystem_source(ds, fs): weights = { "_internal": { "ignore_spec_version": True, @@ -505,12 +524,31 @@ def test_graph_equivalence_with_filesystem_source(ds): "max_depth": 1, }, } - prop_scores = {} - fs = stix2.FileSystemSource(FS_PATH) - env = stix2.Environment().graphically_equivalent(fs, ds, prop_scores, **weights) - assert round(env) == 28 - assert round(prop_scores["matching_score"]) == 139 - assert round(prop_scores["sum_weights"]) == 500 + prop_scores1 = {} + env1 = stix2.Environment().graph_similarity(fs, ds, prop_scores1, **weights) + + # Switching parameters + weights = { + "_internal": { + "ignore_spec_version": True, + "versioning_checks": False, + "max_depth": 1, + }, + } + prop_scores2 = {} + env2 = stix2.Environment().graph_similarity(ds, fs, prop_scores2, **weights) + + assert round(env1) == 25 + assert round(prop_scores1["matching_score"]) == 451 + assert round(prop_scores1["len_pairs"]) == 18 + + assert round(env2) == 25 + assert round(prop_scores2["matching_score"]) == 451 + assert round(prop_scores2["len_pairs"]) == 18 + + prop_scores1["matching_score"] = round(prop_scores1["matching_score"], 3) + prop_scores2["matching_score"] = round(prop_scores2["matching_score"], 3) + assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) def test_graph_equivalence_with_duplicate_graph(ds): @@ -522,10 +560,10 @@ def test_graph_equivalence_with_duplicate_graph(ds): }, } prop_scores = {} - env = stix2.Environment().graphically_equivalent(ds, ds, prop_scores, **weights) + env = stix2.Environment().graph_similarity(ds, ds, prop_scores, **weights) assert round(env) == 100 assert round(prop_scores["matching_score"]) == 800 - assert round(prop_scores["sum_weights"]) == 800 + assert round(prop_scores["len_pairs"]) == 8 def test_graph_equivalence_with_versioning_check_on(ds2, ds): @@ -536,11 +574,31 @@ def test_graph_equivalence_with_versioning_check_on(ds2, ds): "max_depth": 1, }, } - prop_scores = {} - env = 
stix2.Environment().graphically_equivalent(ds, ds2, prop_scores, **weights) - assert round(env) == 93 - assert round(prop_scores["matching_score"]) == 745 - assert round(prop_scores["sum_weights"]) == 800 + prop_scores1 = {} + env1 = stix2.Environment().graph_similarity(ds, ds2, prop_scores1, **weights) + + # Switching parameters + weights = { + "_internal": { + "ignore_spec_version": False, + "versioning_checks": True, + "max_depth": 1, + }, + } + prop_scores2 = {} + env2 = stix2.Environment().graph_similarity(ds2, ds, prop_scores2, **weights) + + assert round(env1) == 88 + assert round(prop_scores1["matching_score"]) == 789 + assert round(prop_scores1["len_pairs"]) == 9 + + assert round(env2) == 88 + assert round(prop_scores2["matching_score"]) == 789 + assert round(prop_scores2["len_pairs"]) == 9 + + prop_scores1["matching_score"] = round(prop_scores1["matching_score"], 3) + prop_scores2["matching_score"] = round(prop_scores2["matching_score"], 3) + assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) def test_graph_equivalence_with_versioning_check_off(ds2, ds): @@ -551,8 +609,28 @@ def test_graph_equivalence_with_versioning_check_off(ds2, ds): "max_depth": 1, }, } - prop_scores = {} - env = stix2.Environment().graphically_equivalent(ds, ds2, prop_scores, **weights) - assert round(env) == 93 - assert round(prop_scores["matching_score"]) == 745 - assert round(prop_scores["sum_weights"]) == 800 + prop_scores1 = {} + env1 = stix2.Environment().graph_similarity(ds, ds2, prop_scores1, **weights) + + # Switching parameters + weights = { + "_internal": { + "ignore_spec_version": False, + "versioning_checks": False, + "max_depth": 1, + }, + } + prop_scores2 = {} + env2 = stix2.Environment().graph_similarity(ds2, ds, prop_scores2, **weights) + + assert round(env1) == 88 + assert round(prop_scores1["matching_score"]) == 789 + assert round(prop_scores1["len_pairs"]) == 9 + + assert round(env2) == 88 + assert round(prop_scores2["matching_score"]) == 789 + assert round(prop_scores2["len_pairs"]) == 9 + + prop_scores1["matching_score"] = round(prop_scores1["matching_score"], 3) + prop_scores2["matching_score"] = round(prop_scores2["matching_score"], 3) + assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) diff --git a/stix2/test/v21/test_environment.py b/stix2/test/v21/test_environment.py index 5682ad1..774d09a 100644 --- a/stix2/test/v21/test_environment.py +++ b/stix2/test/v21/test_environment.py @@ -1,3 +1,4 @@ +import json import os import pytest @@ -71,6 +72,11 @@ def ds2(): yield stix2.MemoryStore(stix_objs) +@pytest.fixture +def fs(): + yield stix2.FileSystemSource(FS_PATH) + + def test_object_factory_created_by_ref_str(): factory = stix2.ObjectFactory(created_by_ref=IDENTITY_ID) ind = factory.create(stix2.v21.Indicator, **INDICATOR_KWARGS) @@ -955,8 +961,30 @@ def test_list_semantic_check(ds, ds2): ) assert round(score) == 1 + score = stix2.equivalence.object.list_reference_check( + object_refs2, + object_refs1, + ds2, + ds, + **weights, + ) + assert round(score) == 1 -def test_graph_equivalence_with_filesystem_source(ds): + +def test_graph_similarity_raises_value_error(ds): + weights = { + "_internal": { + "ignore_spec_version": False, + "versioning_checks": False, + "max_depth": -1, + }, + } + with pytest.raises(ValueError): + prop_scores1 = {} + stix2.Environment().graph_similarity(ds, ds2, prop_scores1, **weights) + + +def 
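The symmetry checks used in these updated tests follow one pattern: round the floating-point field, then compare stable JSON renderings of both runs. A self-contained sketch with invented scores:

    import json

    prop_scores1 = {"matching_score": 451.00441, "len_pairs": 18}
    prop_scores2 = {"matching_score": 451.00442, "len_pairs": 18}
    for scores in (prop_scores1, prop_scores2):
        scores["matching_score"] = round(scores["matching_score"], 3)
    assert json.dumps(prop_scores1, sort_keys=True, indent=4) == \
        json.dumps(prop_scores2, sort_keys=True, indent=4)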
test_graph_equivalence_with_filesystem_source(ds, fs): weights = { "_internal": { "ignore_spec_version": True, @@ -965,18 +993,30 @@ def test_graph_equivalence_with_filesystem_source(ds): }, } prop_scores1 = {} - prop_scores2 = {} - fs = stix2.FileSystemSource(FS_PATH) - env = stix2.Environment().graph_similarity(fs, ds, prop_scores1, **weights) - assert round(env) == 26 - assert round(prop_scores1["matching_score"]) == 460 - assert round(prop_scores1["sum_weights"]) == 18 + env1 = stix2.Environment().graph_similarity(fs, ds, prop_scores1, **weights) - env = stix2.Environment().graph_similarity(ds, fs, prop_scores2, **weights) - assert round(env) == 47 - assert round(prop_scores2["matching_score"]) == 852 - assert round(prop_scores2["sum_weights"]) == 18 - assert prop_scores1 == prop_scores2 + # Switching parameters + weights = { + "_internal": { + "ignore_spec_version": True, + "versioning_checks": False, + "max_depth": 1, + }, + } + prop_scores2 = {} + env2 = stix2.Environment().graph_similarity(ds, fs, prop_scores2, **weights) + + assert round(env1) == 23 + assert round(prop_scores1["matching_score"]) == 411 + assert round(prop_scores1["len_pairs"]) == 18 + + assert round(env2) == 23 + assert round(prop_scores2["matching_score"]) == 411 + assert round(prop_scores2["len_pairs"]) == 18 + + prop_scores1["matching_score"] = round(prop_scores1["matching_score"], 3) + prop_scores2["matching_score"] = round(prop_scores2["matching_score"], 3) + assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) def test_graph_equivalence_with_duplicate_graph(ds): @@ -991,7 +1031,7 @@ def test_graph_equivalence_with_duplicate_graph(ds): env = stix2.Environment().graph_similarity(ds, ds, prop_scores, **weights) assert round(env) == 100 assert round(prop_scores["matching_score"]) == 800 - assert round(prop_scores["sum_weights"]) == 8 + assert round(prop_scores["len_pairs"]) == 8 def test_graph_equivalence_with_versioning_check_on(ds2, ds): @@ -1002,11 +1042,29 @@ def test_graph_equivalence_with_versioning_check_on(ds2, ds): "max_depth": 1, }, } - prop_scores = {} - env = stix2.Environment().graph_similarity(ds, ds2, prop_scores, **weights) - assert round(env) == 93 - assert round(prop_scores["matching_score"]) == 745 - assert round(prop_scores["sum_weights"]) == 8 + prop_scores1 = {} + env1 = stix2.Environment().graph_similarity(ds, ds2, prop_scores1, **weights) + assert round(env1) == 88 + assert round(prop_scores1["matching_score"]) == 789 + assert round(prop_scores1["len_pairs"]) == 9 + + # Switching parameters + weights = { + "_internal": { + "ignore_spec_version": False, + "versioning_checks": False, + "max_depth": 1, + }, + } + prop_scores2 = {} + env2 = stix2.Environment().graph_similarity(ds2, ds, prop_scores2, **weights) + assert round(env2) == 88 + assert round(prop_scores2["matching_score"]) == 789 + assert round(prop_scores2["len_pairs"]) == 9 + + prop_scores1["matching_score"] = round(prop_scores1["matching_score"], 3) + prop_scores2["matching_score"] = round(prop_scores2["matching_score"], 3) + assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) def test_graph_equivalence_with_versioning_check_off(ds2, ds): @@ -1017,8 +1075,26 @@ def test_graph_equivalence_with_versioning_check_off(ds2, ds): "max_depth": 1, }, } - prop_scores = {} - env = stix2.Environment().graph_similarity(ds, ds2, prop_scores, **weights) - assert round(env) == 93 - assert round(prop_scores["matching_score"]) == 
745 - assert round(prop_scores["sum_weights"]) == 8 + prop_scores1 = {} + env1 = stix2.Environment().graph_similarity(ds, ds2, prop_scores1, **weights) + assert round(env1) == 88 + assert round(prop_scores1["matching_score"]) == 789 + assert round(prop_scores1["len_pairs"]) == 9 + + # Switching parameters + weights = { + "_internal": { + "ignore_spec_version": False, + "versioning_checks": False, + "max_depth": 1, + }, + } + prop_scores2 = {} + env2 = stix2.Environment().graph_similarity(ds2, ds, prop_scores2, **weights) + assert round(env2) == 88 + assert round(prop_scores2["matching_score"]) == 789 + assert round(prop_scores2["len_pairs"]) == 9 + + prop_scores1["matching_score"] = round(prop_scores1["matching_score"], 3) + prop_scores2["matching_score"] = round(prop_scores2["matching_score"], 3) + assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) From ef610ec8d3c0bad28aa7401b33571210449d7f97 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Tue, 16 Feb 2021 09:36:09 -0500 Subject: [PATCH 05/27] small docstring fix --- stix2/environment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stix2/environment.py b/stix2/environment.py index 61751f9..bd4445f 100644 --- a/stix2/environment.py +++ b/stix2/environment.py @@ -222,7 +222,7 @@ class Environment(DataStoreMixin): Note: Default weight_dict: - .. include:: ../../object_default_sem_eq_weights.rst + .. include:: ../object_default_sem_eq_weights.rst Note: This implementation follows the Semantic Equivalence Committee Note. From fbea229004ced319e5c34357679ca4f1001f54b7 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Tue, 16 Feb 2021 09:44:03 -0500 Subject: [PATCH 06/27] add styling changes --- stix2/environment.py | 5 +++-- stix2/equivalence/graph/__init__.py | 5 +++-- stix2/equivalence/object/__init__.py | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/stix2/environment.py b/stix2/environment.py index bd4445f..246d279 100644 --- a/stix2/environment.py +++ b/stix2/environment.py @@ -5,9 +5,10 @@ from .datastore import CompositeDataSource, DataStoreMixin from .equivalence.graph import graph_equivalence, graph_similarity from .equivalence.object import ( # noqa: F401 WEIGHTS, check_property_present, custom_pattern_based, exact_match, - list_reference_check, partial_external_reference_based, partial_list_based, + list_reference_check, object_equivalence, object_similarity, + partial_external_reference_based, partial_list_based, partial_location_distance, partial_string_based, partial_timestamp_based, - reference_check, object_equivalence, object_similarity, + reference_check, ) from .parsing import parse as _parse diff --git a/stix2/equivalence/graph/__init__.py b/stix2/equivalence/graph/__init__.py index 797aa23..d9d6e0c 100644 --- a/stix2/equivalence/graph/__init__.py +++ b/stix2/equivalence/graph/__init__.py @@ -2,8 +2,9 @@ import logging from ..object import ( - WEIGHTS, exact_match, list_reference_check, partial_string_based, - partial_timestamp_based, reference_check, object_similarity, object_pairs, bucket_per_type + WEIGHTS, bucket_per_type, exact_match, list_reference_check, object_pairs, + object_similarity, partial_string_based, partial_timestamp_based, + reference_check, ) logger = logging.getLogger(__name__) diff --git a/stix2/equivalence/object/__init__.py b/stix2/equivalence/object/__init__.py index 13e029c..29e3c4f 100644 --- a/stix2/equivalence/object/__init__.py +++ 
b/stix2/equivalence/object/__init__.py @@ -401,7 +401,7 @@ def list_reference_check(refs1, refs2, ds1, ds2, **weights): pairs = object_pairs( bucket_per_type(refs1, "id-split"), bucket_per_type(refs2, "id-split"), - weights + weights, ) for ref1, ref2 in pairs: From 09fd8c060bb5a42236efc4dd40641ff98e1bbbdb Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Tue, 16 Feb 2021 10:23:02 -0500 Subject: [PATCH 07/27] clear debug message --- stix2/equivalence/graph/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stix2/equivalence/graph/__init__.py b/stix2/equivalence/graph/__init__.py index d9d6e0c..1a25484 100644 --- a/stix2/equivalence/graph/__init__.py +++ b/stix2/equivalence/graph/__init__.py @@ -139,7 +139,7 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): prop_scores["summary"] = results logger.debug( - "DONE\t\tSUM_PAIRS: %.2f\tMATCHING_SCORE: %.2f\t SIMILARITY_SCORE: %.2f", + "DONE\t\tLEN_PAIRS: %.2f\tMATCHING_SCORE: %.2f\t SIMILARITY_SCORE: %.2f", len_pairs, matching_score, similarity_score, From d2d85badb2c297ea1abdfc1612d4ad847fd8a2a6 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Tue, 16 Feb 2021 11:01:07 -0500 Subject: [PATCH 08/27] make some functions internal, add some docs for them --- stix2/equivalence/graph/__init__.py | 14 ++++++++------ stix2/equivalence/object/__init__.py | 16 +++++++++++----- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/stix2/equivalence/graph/__init__.py b/stix2/equivalence/graph/__init__.py index 1a25484..3d892f4 100644 --- a/stix2/equivalence/graph/__init__.py +++ b/stix2/equivalence/graph/__init__.py @@ -2,9 +2,9 @@ import logging from ..object import ( - WEIGHTS, bucket_per_type, exact_match, list_reference_check, object_pairs, - object_similarity, partial_string_based, partial_timestamp_based, - reference_check, + WEIGHTS, _bucket_per_type, _object_pairs, exact_match, + list_reference_check, object_similarity, partial_string_based, + partial_timestamp_based, reference_check, ) logger = logging.getLogger(__name__) @@ -99,9 +99,11 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): raise ValueError("weight_dict['_internal']['max_depth'] must be greater than 0") depth = weights["_internal"]["max_depth"] - graph1 = bucket_per_type(ds1.query([])) - graph2 = bucket_per_type(ds2.query([])) - pairs = object_pairs(graph1, graph2, weights) + pairs = _object_pairs( + _bucket_per_type(ds1.query([])), + _bucket_per_type(ds2.query([])), + weights, + ) for object1, object2 in pairs: iprop_score1 = {} diff --git a/stix2/equivalence/object/__init__.py b/stix2/equivalence/object/__init__.py index 29e3c4f..39eb99a 100644 --- a/stix2/equivalence/object/__init__.py +++ b/stix2/equivalence/object/__init__.py @@ -398,9 +398,9 @@ def list_reference_check(refs1, refs2, ds1, ds2, **weights): weighted on the amount of unique objects that could 1) be de-referenced 2) """ results = {} - pairs = object_pairs( - bucket_per_type(refs1, "id-split"), - bucket_per_type(refs2, "id-split"), + pairs = _object_pairs( + _bucket_per_type(refs1, "id-split"), + _bucket_per_type(refs2, "id-split"), weights, ) @@ -433,7 +433,10 @@ def list_reference_check(refs1, refs2, ds1, ds2, **weights): return result -def bucket_per_type(g, mode="type"): +def _bucket_per_type(g, mode="type"): + """Given a list of objects or references, bucket them by type. 
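The max_depth guard added earlier in this series can be exercised directly; a sketch mirroring the new raises test, with empty stores:

    import pytest
    import stix2

    ds1, ds2 = stix2.MemoryStore(), stix2.MemoryStore()
    weights = {
        "_internal": {
            "ignore_spec_version": False,
            "versioning_checks": False,
            "max_depth": -1,
        },
    }
    with pytest.raises(ValueError):
        stix2.Environment().graph_similarity(ds1, ds2, {}, **weights)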
+ Depending on the list type: extract from 'type' property or using + the 'id'""" buckets = collections.defaultdict(list) if mode == "type": [buckets[obj["type"]].append(obj) for obj in g] @@ -442,7 +445,10 @@ def bucket_per_type(g, mode="type"): return buckets -def object_pairs(g1, g2, w): +def _object_pairs(g1, g2, w): + """Returns a generator with the product of the comparable + objects for the graph similarity process. It determines + objects in common between graphs and objects with weights.""" types_in_common = set(g1.keys()).intersection(g2.keys()) testable_types = types_in_common.intersection(w.keys()) From c656d35da5b934d3152160000e7a3c65e427f8d5 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Tue, 16 Feb 2021 13:40:42 -0500 Subject: [PATCH 09/27] add more test coverage for new functions --- stix2/test/v20/test_environment.py | 134 +++++++++++++++- stix2/test/v21/test_environment.py | 239 ++++++++++++++++++++++++----- 2 files changed, 331 insertions(+), 42 deletions(-) diff --git a/stix2/test/v20/test_environment.py b/stix2/test/v20/test_environment.py index bcc2b60..33e0985 100644 --- a/stix2/test/v20/test_environment.py +++ b/stix2/test/v20/test_environment.py @@ -516,7 +516,7 @@ def test_graph_similarity_raises_value_error(ds): stix2.Environment().graph_similarity(ds, ds2, prop_scores1, **weights) -def test_graph_equivalence_with_filesystem_source(ds, fs): +def test_graph_similarity_with_filesystem_source(ds, fs): weights = { "_internal": { "ignore_spec_version": True, @@ -551,7 +551,7 @@ def test_graph_equivalence_with_filesystem_source(ds, fs): assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) -def test_graph_equivalence_with_duplicate_graph(ds): +def test_graph_similarity_with_duplicate_graph(ds): weights = { "_internal": { "ignore_spec_version": False, @@ -566,7 +566,7 @@ def test_graph_equivalence_with_duplicate_graph(ds): assert round(prop_scores["len_pairs"]) == 8 -def test_graph_equivalence_with_versioning_check_on(ds2, ds): +def test_graph_similarity_with_versioning_check_on(ds2, ds): weights = { "_internal": { "ignore_spec_version": False, @@ -601,6 +601,126 @@ def test_graph_equivalence_with_versioning_check_on(ds2, ds): assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) +def test_graph_similarity_with_versioning_check_off(ds2, ds): + weights = { + "_internal": { + "ignore_spec_version": False, + "versioning_checks": False, + "max_depth": 1, + }, + } + prop_scores1 = {} + env1 = stix2.Environment().graph_similarity(ds, ds2, prop_scores1, **weights) + + # Switching parameters + weights = { + "_internal": { + "ignore_spec_version": False, + "versioning_checks": False, + "max_depth": 1, + }, + } + prop_scores2 = {} + env2 = stix2.Environment().graph_similarity(ds2, ds, prop_scores2, **weights) + + assert round(env1) == 88 + assert round(prop_scores1["matching_score"]) == 789 + assert round(prop_scores1["len_pairs"]) == 9 + + assert round(env2) == 88 + assert round(prop_scores2["matching_score"]) == 789 + assert round(prop_scores2["len_pairs"]) == 9 + + prop_scores1["matching_score"] = round(prop_scores1["matching_score"], 3) + prop_scores2["matching_score"] = round(prop_scores2["matching_score"], 3) + assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) + + +def test_graph_equivalence_with_filesystem_source(ds, fs): + weights = { + "_internal": { + "ignore_spec_version": 
True, + "versioning_checks": False, + "max_depth": 1, + }, + } + prop_scores1 = {} + env1 = stix2.Environment().graph_equivalence(fs, ds, prop_scores1, **weights) + + # Switching parameters + weights = { + "_internal": { + "ignore_spec_version": True, + "versioning_checks": False, + "max_depth": 1, + }, + } + prop_scores2 = {} + env2 = stix2.Environment().graph_equivalence(ds, fs, prop_scores2, **weights) + + assert env1 is False + assert round(prop_scores1["matching_score"]) == 451 + assert round(prop_scores1["len_pairs"]) == 18 + + assert env2 is False + assert round(prop_scores2["matching_score"]) == 451 + assert round(prop_scores2["len_pairs"]) == 18 + + prop_scores1["matching_score"] = round(prop_scores1["matching_score"], 3) + prop_scores2["matching_score"] = round(prop_scores2["matching_score"], 3) + assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) + + +def test_graph_equivalence_with_duplicate_graph(ds): + weights = { + "_internal": { + "ignore_spec_version": False, + "versioning_checks": False, + "max_depth": 1, + }, + } + prop_scores = {} + env = stix2.Environment().graph_equivalence(ds, ds, prop_scores, **weights) + assert env is True + assert round(prop_scores["matching_score"]) == 800 + assert round(prop_scores["len_pairs"]) == 8 + + +def test_graph_equivalence_with_versioning_check_on(ds2, ds): + weights = { + "_internal": { + "ignore_spec_version": False, + "versioning_checks": True, + "max_depth": 1, + }, + } + prop_scores1 = {} + env1 = stix2.Environment().graph_equivalence(ds, ds2, prop_scores1, **weights) + + # Switching parameters + weights = { + "_internal": { + "ignore_spec_version": False, + "versioning_checks": True, + "max_depth": 1, + }, + } + prop_scores2 = {} + env2 = stix2.Environment().graph_equivalence(ds2, ds, prop_scores2, **weights) + + assert env1 is True + assert round(prop_scores1["matching_score"]) == 789 + assert round(prop_scores1["len_pairs"]) == 9 + + assert env2 is True + assert round(prop_scores2["matching_score"]) == 789 + assert round(prop_scores2["len_pairs"]) == 9 + + prop_scores1["matching_score"] = round(prop_scores1["matching_score"], 3) + prop_scores2["matching_score"] = round(prop_scores2["matching_score"], 3) + assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) + + def test_graph_equivalence_with_versioning_check_off(ds2, ds): weights = { "_internal": { @@ -610,7 +730,7 @@ def test_graph_equivalence_with_versioning_check_off(ds2, ds): }, } prop_scores1 = {} - env1 = stix2.Environment().graph_similarity(ds, ds2, prop_scores1, **weights) + env1 = stix2.Environment().graph_equivalence(ds, ds2, prop_scores1, **weights) # Switching parameters weights = { @@ -621,13 +741,13 @@ def test_graph_equivalence_with_versioning_check_off(ds2, ds): }, } prop_scores2 = {} - env2 = stix2.Environment().graph_similarity(ds2, ds, prop_scores2, **weights) + env2 = stix2.Environment().graph_equivalence(ds2, ds, prop_scores2, **weights) - assert round(env1) == 88 + assert env1 is True assert round(prop_scores1["matching_score"]) == 789 assert round(prop_scores1["len_pairs"]) == 9 - assert round(env2) == 88 + assert env2 is True assert round(prop_scores2["matching_score"]) == 789 assert round(prop_scores2["len_pairs"]) == 9 diff --git a/stix2/test/v21/test_environment.py b/stix2/test/v21/test_environment.py index 774d09a..80c4ba8 100644 --- a/stix2/test/v21/test_environment.py +++ b/stix2/test/v21/test_environment.py @@ -38,7 +38,7 @@ def 
ds(): @pytest.fixture -def ds2(): +def ds2_objects(): cam = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMPAIGN_KWARGS) idy = stix2.v21.Identity(id=IDENTITY_ID, **IDENTITY_KWARGS) ind = stix2.v21.Indicator(id=INDICATOR_ID, created_by_ref=idy.id, **INDICATOR_KWARGS) @@ -69,7 +69,12 @@ def ds2(): published="2021-04-09T08:22:22Z", object_refs=stix_objs, ) stix_objs.append(reprt) - yield stix2.MemoryStore(stix_objs) + yield stix_objs + + +@pytest.fixture +def ds2(ds2_objects): + yield stix2.MemoryStore(ds2_objects) @pytest.fixture @@ -432,14 +437,14 @@ def test_related_to_by_target(ds): assert any(x['id'] == INDICATOR_ID for x in resp) -def test_semantic_equivalence_on_same_attack_pattern1(): +def test_object_similarity_on_same_attack_pattern1(): ap1 = stix2.v21.AttackPattern(id=ATTACK_PATTERN_ID, **ATTACK_PATTERN_KWARGS) ap2 = stix2.v21.AttackPattern(id=ATTACK_PATTERN_ID, **ATTACK_PATTERN_KWARGS) env = stix2.Environment().object_similarity(ap1, ap2) assert round(env) == 100 -def test_semantic_equivalence_on_same_attack_pattern2(): +def test_object_similarity_on_same_attack_pattern2(): ATTACK_KWARGS = dict( name="Phishing", external_references=[ @@ -455,14 +460,14 @@ def test_semantic_equivalence_on_same_attack_pattern2(): assert round(env) == 100 -def test_semantic_equivalence_on_same_campaign1(): +def test_object_similarity_on_same_campaign1(): camp1 = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMPAIGN_KWARGS) camp2 = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMPAIGN_KWARGS) env = stix2.Environment().object_similarity(camp1, camp2) assert round(env) == 100 -def test_semantic_equivalence_on_same_campaign2(): +def test_object_similarity_on_same_campaign2(): CAMP_KWARGS = dict( name="Green Group Attacks Against Finance", description="Campaign by Green Group against a series of targets in the financial services sector.", @@ -474,14 +479,14 @@ def test_semantic_equivalence_on_same_campaign2(): assert round(env) == 100 -def test_semantic_equivalence_on_same_identity1(): +def test_object_similarity_on_same_identity1(): iden1 = stix2.v21.Identity(id=IDENTITY_ID, **IDENTITY_KWARGS) iden2 = stix2.v21.Identity(id=IDENTITY_ID, **IDENTITY_KWARGS) env = stix2.Environment().object_similarity(iden1, iden2) assert round(env) == 100 -def test_semantic_equivalence_on_same_identity2(): +def test_object_similarity_on_same_identity2(): IDEN_KWARGS = dict( name="John Smith", identity_class="individual", @@ -493,14 +498,14 @@ def test_semantic_equivalence_on_same_identity2(): assert round(env) == 100 -def test_semantic_equivalence_on_same_indicator(): +def test_object_similarity_on_same_indicator(): ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) ind2 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) env = stix2.Environment().object_similarity(ind1, ind2) assert round(env) == 100 -def test_semantic_equivalence_on_same_location1(): +def test_object_similarity_on_same_location1(): location_kwargs = dict(latitude=45, longitude=179) loc1 = stix2.v21.Location(id=LOCATION_ID, **location_kwargs) loc2 = stix2.v21.Location(id=LOCATION_ID, **location_kwargs) @@ -508,7 +513,7 @@ def test_semantic_equivalence_on_same_location1(): assert round(env) == 100 -def test_semantic_equivalence_on_same_location2(): +def test_object_similarity_on_same_location2(): location_kwargs = dict( latitude=38.889, longitude=-77.023, @@ -521,7 +526,7 @@ def test_semantic_equivalence_on_same_location2(): assert round(env) == 100 -def test_semantic_equivalence_location_with_no_latlong(): +def 
test_object_similarity_location_with_no_latlong(): loc_kwargs = dict(country="US", administrative_area="US-DC") loc1 = stix2.v21.Location(id=LOCATION_ID, **LOCATION_KWARGS) loc2 = stix2.v21.Location(id=LOCATION_ID, **loc_kwargs) @@ -529,21 +534,21 @@ def test_semantic_equivalence_location_with_no_latlong(): assert round(env) != 100 -def test_semantic_equivalence_on_same_malware(): +def test_object_similarity_on_same_malware(): malw1 = stix2.v21.Malware(id=MALWARE_ID, **MALWARE_KWARGS) malw2 = stix2.v21.Malware(id=MALWARE_ID, **MALWARE_KWARGS) env = stix2.Environment().object_similarity(malw1, malw2) assert round(env) == 100 -def test_semantic_equivalence_on_same_threat_actor1(): +def test_object_similarity_on_same_threat_actor1(): ta1 = stix2.v21.ThreatActor(id=THREAT_ACTOR_ID, **THREAT_ACTOR_KWARGS) ta2 = stix2.v21.ThreatActor(id=THREAT_ACTOR_ID, **THREAT_ACTOR_KWARGS) env = stix2.Environment().object_similarity(ta1, ta2) assert round(env) == 100 -def test_semantic_equivalence_on_same_threat_actor2(): +def test_object_similarity_on_same_threat_actor2(): THREAT_KWARGS = dict( threat_actor_types=["crime-syndicate"], aliases=["super-evil"], @@ -555,21 +560,34 @@ def test_semantic_equivalence_on_same_threat_actor2(): assert round(env) == 100 -def test_semantic_equivalence_on_same_tool(): +def test_object_similarity_on_same_tool(): tool1 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS) tool2 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS) env = stix2.Environment().object_similarity(tool1, tool2) assert round(env) == 100 -def test_semantic_equivalence_on_same_vulnerability1(): +def test_object_similarity_on_same_vulnerability1(): vul1 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULNERABILITY_KWARGS) vul2 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULNERABILITY_KWARGS) - env = stix2.Environment().object_similarity(vul1, vul2) + prop_scores = {} + env = stix2.Environment().object_similarity(vul1, vul2, prop_scores) assert round(env) == 100 + assert round(prop_scores["matching_score"]) == 30 + assert round(prop_scores["sum_weights"]) == 30 -def test_semantic_equivalence_on_same_vulnerability2(): +def test_object_equivalence_on_same_vulnerability1(): + vul1 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULNERABILITY_KWARGS) + vul2 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULNERABILITY_KWARGS) + prop_scores = {} + env = stix2.Environment().object_equivalence(vul1, vul2, prop_scores) + assert env is True + assert round(prop_scores["matching_score"]) == 30 + assert round(prop_scores["sum_weights"]) == 30 + + +def test_object_similarity_on_same_vulnerability2(): VULN_KWARGS1 = dict( name="Heartbleed", external_references=[ @@ -590,11 +608,42 @@ def test_semantic_equivalence_on_same_vulnerability2(): ) vul1 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULN_KWARGS1) vul2 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULN_KWARGS2) - env = stix2.Environment().object_similarity(vul1, vul2) + prop_scores = {} + env = stix2.Environment().object_similarity(vul1, vul2, prop_scores) assert round(env) == 0.0 + assert round(prop_scores["matching_score"]) == 0 + assert round(prop_scores["sum_weights"]) == 100 -def test_semantic_equivalence_on_unknown_object(): +def test_object_equivalence_on_same_vulnerability2(): + VULN_KWARGS1 = dict( + name="Heartbleed", + external_references=[ + { + "url": "https://example", + "source_name": "some-source", + }, + ], + ) + VULN_KWARGS2 = dict( + name="Foo", + external_references=[ + { + "url": "https://example2", + "source_name": "some-source2", + }, + ], + ) 
+ vul1 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULN_KWARGS1) + vul2 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULN_KWARGS2) + prop_scores = {} + env = stix2.Environment().object_equivalence(vul1, vul2, prop_scores) + assert env is False + assert round(prop_scores["matching_score"]) == 0 + assert round(prop_scores["sum_weights"]) == 100 + + +def test_object_similarity_on_unknown_object(): CUSTOM_KWARGS1 = dict( type="x-foobar", id="x-foobar--0c7b5b88-8ff7-4a4d-aa9d-feb398cd0061", @@ -650,7 +699,7 @@ def test_semantic_equivalence_on_unknown_object(): assert round(env) == 0 -def test_semantic_equivalence_different_type_raises(): +def test_object_similarity_different_type_raises(): with pytest.raises(ValueError) as excinfo: vul1 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULNERABILITY_KWARGS) ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) @@ -659,7 +708,7 @@ def test_semantic_equivalence_different_type_raises(): assert str(excinfo.value) == "The objects to compare must be of the same type!" -def test_semantic_equivalence_different_spec_version_raises(): +def test_object_similarity_different_spec_version_raises(): with pytest.raises(ValueError) as excinfo: V20_KWARGS = dict( labels=['malicious-activity'], @@ -672,7 +721,7 @@ def test_semantic_equivalence_different_spec_version_raises(): assert str(excinfo.value) == "The objects to compare must be of the same spec version!" -def test_semantic_equivalence_zero_match(): +def test_object_similarity_zero_match(): IND_KWARGS = dict( indicator_types=["APTX"], pattern="[ipv4-addr:value = '192.168.1.1']", @@ -696,7 +745,7 @@ def test_semantic_equivalence_zero_match(): assert round(env) == 0 -def test_semantic_equivalence_different_spec_version(): +def test_object_similarity_different_spec_version(): IND_KWARGS = dict( labels=["APTX"], pattern="[ipv4-addr:value = '192.168.1.1']", @@ -786,18 +835,18 @@ def test_semantic_equivalence_different_spec_version(): ), ], ) -def test_semantic_equivalence_external_references(refs1, refs2, ret_val): +def test_object_similarity_external_references(refs1, refs2, ret_val): value = stix2.environment.partial_external_reference_based(refs1, refs2) assert value == ret_val -def test_semantic_equivalence_timestamp(): +def test_object_similarity_timestamp(): t1 = "2018-10-17T00:14:20.652Z" t2 = "2018-10-17T12:14:20.652Z" assert stix2.environment.partial_timestamp_based(t1, t2, 1) == 0.5 -def test_semantic_equivalence_exact_match(): +def test_object_similarity_exact_match(): t1 = "2018-10-17T00:14:20.652Z" t2 = "2018-10-17T12:14:20.652Z" assert stix2.environment.exact_match(t1, t2) == 0.0 @@ -813,7 +862,7 @@ def custom_semantic_equivalence_method(obj1, obj2, **weights): return 96.0, 100.0 -def test_semantic_equivalence_method_provided(): +def test_object_similarity_method_provided(): # Because `method` is provided, `partial_list_based` will be ignored TOOL2_KWARGS = dict( name="Random Software", @@ -834,7 +883,7 @@ def test_semantic_equivalence_method_provided(): assert round(env) == 96 -def test_semantic_equivalence_prop_scores(): +def test_object_similarity_prop_scores(): TOOL2_KWARGS = dict( name="Random Software", tool_types=["information-gathering"], @@ -856,7 +905,7 @@ def custom_semantic_equivalence_method_prop_scores(obj1, obj2, prop_scores, **we return 96.0, 100.0 -def test_semantic_equivalence_prop_scores_method_provided(): +def test_object_similarity_prop_scores_method_provided(): TOOL2_KWARGS = dict( name="Random Software", tool_types=["information-gathering"], @@ -984,7 
+1033,7 @@ def test_graph_similarity_raises_value_error(ds): stix2.Environment().graph_similarity(ds, ds2, prop_scores1, **weights) -def test_graph_equivalence_with_filesystem_source(ds, fs): +def test_graph_similarity_with_filesystem_source(ds, fs): weights = { "_internal": { "ignore_spec_version": True, @@ -1019,7 +1068,7 @@ def test_graph_equivalence_with_filesystem_source(ds, fs): assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) -def test_graph_equivalence_with_duplicate_graph(ds): +def test_graph_similarity_with_duplicate_graph(ds): weights = { "_internal": { "ignore_spec_version": False, @@ -1034,7 +1083,7 @@ def test_graph_equivalence_with_duplicate_graph(ds): assert round(prop_scores["len_pairs"]) == 8 -def test_graph_equivalence_with_versioning_check_on(ds2, ds): +def test_graph_similarity_with_versioning_check_on(ds2, ds): weights = { "_internal": { "ignore_spec_version": False, @@ -1067,7 +1116,7 @@ def test_graph_equivalence_with_versioning_check_on(ds2, ds): assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) -def test_graph_equivalence_with_versioning_check_off(ds2, ds): +def test_graph_similarity_with_versioning_check_off(ds2, ds): weights = { "_internal": { "ignore_spec_version": False, @@ -1098,3 +1147,123 @@ def test_graph_equivalence_with_versioning_check_off(ds2, ds): prop_scores1["matching_score"] = round(prop_scores1["matching_score"], 3) prop_scores2["matching_score"] = round(prop_scores2["matching_score"], 3) assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) + + +def test_graph_equivalence_with_filesystem_source(ds, fs): + weights = { + "_internal": { + "ignore_spec_version": True, + "versioning_checks": False, + "max_depth": 1, + }, + } + prop_scores1 = {} + env1 = stix2.Environment().graph_equivalence(fs, ds, prop_scores1, **weights) + + # Switching parameters + weights = { + "_internal": { + "ignore_spec_version": True, + "versioning_checks": False, + "max_depth": 1, + }, + } + prop_scores2 = {} + env2 = stix2.Environment().graph_equivalence(ds, fs, prop_scores2, **weights) + + assert env1 is False + assert round(prop_scores1["matching_score"]) == 411 + assert round(prop_scores1["len_pairs"]) == 18 + + assert env2 is False + assert round(prop_scores2["matching_score"]) == 411 + assert round(prop_scores2["len_pairs"]) == 18 + + prop_scores1["matching_score"] = round(prop_scores1["matching_score"], 3) + prop_scores2["matching_score"] = round(prop_scores2["matching_score"], 3) + assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) + + +def test_graph_equivalence_with_duplicate_graph(ds): + weights = { + "_internal": { + "ignore_spec_version": False, + "versioning_checks": False, + "max_depth": 1, + }, + } + prop_scores = {} + env = stix2.Environment().graph_equivalence(ds, ds, prop_scores, **weights) + assert env is True + assert round(prop_scores["matching_score"]) == 800 + assert round(prop_scores["len_pairs"]) == 8 + + +def test_graph_equivalence_with_versioning_check_on(ds2, ds): + weights = { + "_internal": { + "ignore_spec_version": False, + "versioning_checks": True, + "max_depth": 1, + }, + } + prop_scores1 = {} + env1 = stix2.Environment().graph_equivalence(ds, ds2, prop_scores1, **weights) + + # Switching parameters + weights = { + "_internal": { + "ignore_spec_version": False, + "versioning_checks": True, + 
"max_depth": 1, + }, + } + prop_scores2 = {} + env2 = stix2.Environment().graph_equivalence(ds2, ds, prop_scores2, **weights) + + assert env1 is True + assert round(prop_scores1["matching_score"]) == 789 + assert round(prop_scores1["len_pairs"]) == 9 + + assert env2 is True + assert round(prop_scores2["matching_score"]) == 789 + assert round(prop_scores2["len_pairs"]) == 9 + + prop_scores1["matching_score"] = round(prop_scores1["matching_score"], 3) + prop_scores2["matching_score"] = round(prop_scores2["matching_score"], 3) + assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) + + +def test_graph_equivalence_with_versioning_check_off(ds2, ds): + weights = { + "_internal": { + "ignore_spec_version": False, + "versioning_checks": False, + "max_depth": 1, + }, + } + prop_scores1 = {} + env1 = stix2.Environment().graph_equivalence(ds, ds2, prop_scores1, **weights) + + # Switching parameters + weights = { + "_internal": { + "ignore_spec_version": False, + "versioning_checks": False, + "max_depth": 1, + }, + } + prop_scores2 = {} + env2 = stix2.Environment().graph_equivalence(ds2, ds, prop_scores2, **weights) + + assert env1 is True + assert round(prop_scores1["matching_score"]) == 789 + assert round(prop_scores1["len_pairs"]) == 9 + + assert env2 is True + assert round(prop_scores2["matching_score"]) == 789 + assert round(prop_scores2["len_pairs"]) == 9 + + prop_scores1["matching_score"] = round(prop_scores1["matching_score"], 3) + prop_scores2["matching_score"] = round(prop_scores2["matching_score"], 3) + assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) From ee63e9faf4b017d2a43560db70ec1f287a504e0c Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Wed, 17 Feb 2021 21:30:14 -0500 Subject: [PATCH 10/27] resolve issue regarding reference_check or list_reference_check, remove redundant object_similarity call update test suite --- stix2/equivalence/graph/__init__.py | 27 +++--- stix2/equivalence/object/__init__.py | 13 +-- stix2/test/v21/test_environment.py | 120 ++++++++++++++++++++++++++- 3 files changed, 136 insertions(+), 24 deletions(-) diff --git a/stix2/equivalence/graph/__init__.py b/stix2/equivalence/graph/__init__.py index 3d892f4..402bcb2 100644 --- a/stix2/equivalence/graph/__init__.py +++ b/stix2/equivalence/graph/__init__.py @@ -105,31 +105,26 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): weights, ) + weights["_internal"]["ds1"] = ds1 + weights["_internal"]["ds2"] = ds2 + for object1, object2 in pairs: - iprop_score1 = {} - iprop_score2 = {} + iprop_score = {} object1_id = object1["id"] object2_id = object2["id"] + result = object_similarity(object1, object2, iprop_score, **weights) weights["_internal"]["max_depth"] = depth - weights["_internal"]["ds1"] = ds1 - weights["_internal"]["ds2"] = ds2 - result1 = object_similarity(object1, object2, iprop_score1, **weights) - - weights["_internal"]["max_depth"] = depth - weights["_internal"]["ds1"] = ds2 - weights["_internal"]["ds2"] = ds1 - result2 = object_similarity(object2, object1, iprop_score2, **weights) if object1_id not in results: - results[object1_id] = {"lhs": object1_id, "rhs": object2_id, "prop_score": iprop_score1, "value": result1} - elif result1 > results[object1_id]["value"]: - results[object1_id] = {"lhs": object1_id, "rhs": object2_id, "prop_score": iprop_score1, "value": result1} + results[object1_id] = {"lhs": object1_id, "rhs": object2_id, "prop_score": iprop_score, 
"value": result} + elif result > results[object1_id]["value"]: + results[object1_id] = {"lhs": object1_id, "rhs": object2_id, "prop_score": iprop_score, "value": result} if object2_id not in results: - results[object2_id] = {"lhs": object2_id, "rhs": object1_id, "prop_score": iprop_score2, "value": result2} - elif result2 > results[object2_id]["value"]: - results[object2_id] = {"lhs": object2_id, "rhs": object1_id, "prop_score": iprop_score2, "value": result2} + results[object2_id] = {"lhs": object2_id, "rhs": object1_id, "prop_score": iprop_score, "value": result} + elif result > results[object2_id]["value"]: + results[object2_id] = {"lhs": object2_id, "rhs": object1_id, "prop_score": iprop_score, "value": result} matching_score = sum(x["value"] for x in results.values()) len_pairs = len(results) diff --git a/stix2/equivalence/object/__init__.py b/stix2/equivalence/object/__init__.py index 39eb99a..7f348b6 100644 --- a/stix2/equivalence/object/__init__.py +++ b/stix2/equivalence/object/__init__.py @@ -125,12 +125,13 @@ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict): contributing_score = w * comp_funct(obj1["latitude"], obj1["longitude"], obj2["latitude"], obj2["longitude"], threshold) elif comp_funct == reference_check or comp_funct == list_reference_check: max_depth = weights["_internal"]["max_depth"] - if max_depth < 0: - continue # prevent excessive recursion + if max_depth > 0: + weights["_internal"]["max_depth"] = max_depth - 1 + ds1, ds2 = weights["_internal"]["ds1"], weights["_internal"]["ds2"] + contributing_score = w * comp_funct(obj1[prop], obj2[prop], ds1, ds2, **weights) + weights["_internal"]["max_depth"] = max_depth + 1 else: - weights["_internal"]["max_depth"] -= 1 - ds1, ds2 = weights["_internal"]["ds1"], weights["_internal"]["ds2"] - contributing_score = w * comp_funct(obj1[prop], obj2[prop], ds1, ds2, **weights) + continue # prevent excessive recursion else: contributing_score = w * comp_funct(obj1[prop], obj2[prop]) @@ -376,7 +377,7 @@ def reference_check(ref1, ref2, ds1, ds2, **weights): type1, type2 = ref1.split("--")[0], ref2.split("--")[0] result = 0.0 - if type1 == type2: + if type1 == type2 and type1 in weights: if weights["_internal"]["versioning_checks"]: result = _versioned_checks(ref1, ref2, ds1, ds2, **weights) / 100.0 else: diff --git a/stix2/test/v21/test_environment.py b/stix2/test/v21/test_environment.py index 80c4ba8..fb651af 100644 --- a/stix2/test/v21/test_environment.py +++ b/stix2/test/v21/test_environment.py @@ -723,10 +723,11 @@ def test_object_similarity_different_spec_version_raises(): def test_object_similarity_zero_match(): IND_KWARGS = dict( - indicator_types=["APTX"], + indicator_types=["malicious-activity", "bar"], pattern="[ipv4-addr:value = '192.168.1.1']", pattern_type="stix", valid_from="2019-01-01T12:34:56Z", + labels=["APTX", "foo"], ) weights = { "indicator": { @@ -742,7 +743,9 @@ def test_object_similarity_zero_match(): ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) ind2 = stix2.v21.Indicator(id=INDICATOR_ID, **IND_KWARGS) env = stix2.Environment().object_similarity(ind1, ind2, **weights) - assert round(env) == 0 + assert round(env) == 8 + env = stix2.Environment().object_similarity(ind2, ind1, **weights) + assert round(env) == 8 def test_object_similarity_different_spec_version(): @@ -766,6 +769,9 @@ def test_object_similarity_different_spec_version(): env = stix2.Environment().object_similarity(ind1, ind2, **weights) assert round(env) == 0 + env = stix2.Environment().object_similarity(ind2, ind1, 
**weights) + assert round(env) == 0 + @pytest.mark.parametrize( "refs1,refs2,ret_val", [ @@ -1068,6 +1074,116 @@ def test_graph_similarity_with_filesystem_source(ds, fs): assert json.dumps(prop_scores1, sort_keys=True, indent=4) == json.dumps(prop_scores2, sort_keys=True, indent=4) +def test_depth_limiting(): + g1 = [ + { + "type": "foo", + "id": "foo--07f9dd2a-1cce-45bb-8cbe-dba3f007aafd", + "spec_version": "2.1", + "created": "1986-02-08T00:20:17Z", + "modified": "1989-12-11T06:54:29Z", + "some1_ref": "foo--700a8a3c-9936-412f-b4eb-ede466476180", + "some2_ref": "foo--f4a999a3-df94-499d-9cac-6c02e21775ee", + }, + { + "type": "foo", + "id": "foo--700a8a3c-9936-412f-b4eb-ede466476180", + "spec_version": "2.1", + "created": "1989-01-06T10:31:54Z", + "modified": "1995-06-18T10:25:01Z", + "some1_ref": "foo--705afd45-eb56-43fc-a214-313d63d199a3", + }, + { + "type": "foo", + "id": "foo--705afd45-eb56-43fc-a214-313d63d199a3", + "spec_version": "2.1", + "created": "1977-11-06T21:19:29Z", + "modified": "1997-12-02T20:33:34Z", + }, + { + "type": "foo", + "id": "foo--f4a999a3-df94-499d-9cac-6c02e21775ee", + "spec_version": "2.1", + "created": "1991-09-17T00:40:52Z", + "modified": "1992-12-06T11:02:47Z", + "name": "alice", + }, + ] + + g2 = [ + { + "type": "foo", + "id": "foo--71570479-3e6e-48d2-81fb-897454dec55d", + "spec_version": "2.1", + "created": "1975-12-22T05:20:38Z", + "modified": "1980-11-11T01:09:03Z", + "some1_ref": "foo--4aeda39b-31fa-4ffb-a847-d8edc175a579", + "some2_ref": "foo--941e48d6-3100-4419-9e8c-cf1eb59e71b2", + }, + { + "type": "foo", + "id": "foo--4aeda39b-31fa-4ffb-a847-d8edc175a579", + "spec_version": "2.1", + "created": "1976-01-05T08:32:03Z", + "modified": "1980-11-09T05:41:02Z", + "some1_ref": "foo--689252c3-5d20-43ff-bbf7-c8e45d713768", + }, + { + "type": "foo", + "id": "foo--689252c3-5d20-43ff-bbf7-c8e45d713768", + "spec_version": "2.1", + "created": "1974-09-11T18:56:30Z", + "modified": "1976-10-31T11:59:43Z", + }, + { + "type": "foo", + "id": "foo--941e48d6-3100-4419-9e8c-cf1eb59e71b2", + "spec_version": "2.1", + "created": "1985-01-03T01:07:03Z", + "modified": "1992-07-20T21:32:31Z", + "name": "alice", + } + ] + + mem_store1 = stix2.MemorySource(g1) + mem_store2 = stix2.MemorySource(g2) + + custom_weights = { + "foo": { + "some1_ref": (33, stix2.equivalence.object.reference_check), + "some2_ref": (33, stix2.equivalence.object.reference_check), + "name": (34, stix2.equivalence.object.partial_string_based), + }, + "_internal": { + "ignore_spec_version": False, + "versioning_checks": False, + "max_depth": 1, + }, + } + prop_scores1 = {} + env1 = stix2.equivalence.graph.graph_similarity(mem_store1, mem_store2, prop_scores1, **custom_weights) + + assert round(env1) == 38 + assert round(prop_scores1["matching_score"]) == 300 + assert round(prop_scores1["len_pairs"]) == 8 + # from 'alice' check in de-reference + assert prop_scores1['summary']['foo--71570479-3e6e-48d2-81fb-897454dec55d']['prop_score']['some2_ref']['weight'] == 33 + assert prop_scores1['summary']['foo--07f9dd2a-1cce-45bb-8cbe-dba3f007aafd']['prop_score']['some2_ref']['weight'] == 33 + + # Switching parameters + prop_scores2 = {} + env2 = stix2.equivalence.graph.graph_similarity( + mem_store2, mem_store1, prop_scores2, **custom_weights + ) + + assert round(env2) == 38 + assert round(prop_scores2["matching_score"]) == 300 + assert round(prop_scores2["len_pairs"]) == 8 + # from 'alice' check in de-reference + assert 
prop_scores2['summary']['foo--71570479-3e6e-48d2-81fb-897454dec55d']['prop_score']['some2_ref']['weight'] == 33 + assert prop_scores2['summary']['foo--07f9dd2a-1cce-45bb-8cbe-dba3f007aafd']['prop_score']['some2_ref']['weight'] == 33 + + def test_graph_similarity_with_duplicate_graph(ds): weights = { "_internal": { From 702c80bd5361f3ca70b56a58972cb8a2307e9303 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Wed, 17 Feb 2021 21:47:56 -0500 Subject: [PATCH 11/27] add styling changes --- stix2/test/v21/test_environment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stix2/test/v21/test_environment.py b/stix2/test/v21/test_environment.py index fb651af..c3c2701 100644 --- a/stix2/test/v21/test_environment.py +++ b/stix2/test/v21/test_environment.py @@ -1142,7 +1142,7 @@ def test_depth_limiting(): "created": "1985-01-03T01:07:03Z", "modified": "1992-07-20T21:32:31Z", "name": "alice", - } + }, ] mem_store1 = stix2.MemorySource(g1) From 52c5f3ad296054db47937b92bb772baf6679bd2d Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Thu, 18 Feb 2021 09:14:10 -0500 Subject: [PATCH 12/27] Update stix2/equivalence/object/__init__.py Co-authored-by: Chris Lenk --- stix2/equivalence/object/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stix2/equivalence/object/__init__.py b/stix2/equivalence/object/__init__.py index 7f348b6..0f2ae54 100644 --- a/stix2/equivalence/object/__init__.py +++ b/stix2/equivalence/object/__init__.py @@ -64,7 +64,7 @@ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict): prop_scores: A dictionary that can hold individual property scores, weights, contributing score, matching score and sum of weights. weight_dict: A dictionary that can be used to override settings - in the semantic equivalence process + in the similarity process Returns: float: A number between 0.0 and 100.0 as a measurement of similarity. From 154fc4e236d65ff39618ee39e30875b63eecd37e Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Thu, 18 Feb 2021 09:14:34 -0500 Subject: [PATCH 13/27] Update stix2/equivalence/object/__init__.py update variable names Co-authored-by: Chris Lenk --- stix2/equivalence/object/__init__.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/stix2/equivalence/object/__init__.py b/stix2/equivalence/object/__init__.py index 0f2ae54..20b60a2 100644 --- a/stix2/equivalence/object/__init__.py +++ b/stix2/equivalence/object/__init__.py @@ -434,27 +434,29 @@ def list_reference_check(refs1, refs2, ds1, ds2, **weights): return result -def _bucket_per_type(g, mode="type"): +def _bucket_per_type(graph, mode="type"): """Given a list of objects or references, bucket them by type. Depending on the list type: extract from 'type' property or using - the 'id'""" + the 'id'. + """ buckets = collections.defaultdict(list) if mode == "type": - [buckets[obj["type"]].append(obj) for obj in g] + [buckets[obj["type"]].append(obj) for obj in graph] elif mode == "id-split": - [buckets[obj.split("--")[0]].append(obj) for obj in g] + [buckets[obj.split("--")[0]].append(obj) for obj in graph] return buckets -def _object_pairs(g1, g2, w): +def _object_pairs(graph1, graph2, weights): """Returns a generator with the product of the comparable objects for the graph similarity process. 
It determines - objects in common between graphs and objects with weights.""" - types_in_common = set(g1.keys()).intersection(g2.keys()) - testable_types = types_in_common.intersection(w.keys()) + objects in common between graphs and objects with weights. + """ + types_in_common = set(graph1.keys()).intersection(graph2.keys()) + testable_types = types_in_common.intersection(weights.keys()) return itertools.chain.from_iterable( - itertools.product(g1[stix_type], g2[stix_type]) + itertools.product(graph1[stix_type], graph2[stix_type]) for stix_type in testable_types ) From 34feac6ae78d0dccc69729198decb4e8fbe1a2f2 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Thu, 18 Feb 2021 09:14:47 -0500 Subject: [PATCH 14/27] Update stix2/environment.py Co-authored-by: Chris Lenk --- stix2/environment.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/stix2/environment.py b/stix2/environment.py index 246d279..99b4a5e 100644 --- a/stix2/environment.py +++ b/stix2/environment.py @@ -199,8 +199,7 @@ class Environment(DataStoreMixin): @staticmethod def object_similarity(obj1, obj2, prop_scores={}, **weight_dict): - """This method returns a measure of similarity depending on how - similar the two objects are. + """This method returns a measure of how similar the two objects are. Args: obj1: A stix2 object instance From 75574c94273c45beed691a3c912240fc35c69141 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Thu, 18 Feb 2021 09:15:09 -0500 Subject: [PATCH 15/27] Update stix2/equivalence/object/__init__.py docstrings Co-authored-by: Chris Lenk --- stix2/equivalence/object/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stix2/equivalence/object/__init__.py b/stix2/equivalence/object/__init__.py index 20b60a2..790fc87 100644 --- a/stix2/equivalence/object/__init__.py +++ b/stix2/equivalence/object/__init__.py @@ -25,7 +25,7 @@ def object_equivalence(obj1, obj2, prop_scores={}, threshold=70, **weight_dict): score to result in successfully calling both objects equivalent. This value can be tuned. weight_dict: A dictionary that can be used to override settings - in the semantic equivalence process + in the similarity process Returns: bool: True if the result of the object similarity is greater than or equal to From 75b411df85ed539655307c328f515ef4204d477b Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Thu, 18 Feb 2021 09:15:26 -0500 Subject: [PATCH 16/27] Update stix2/environment.py docstrings Co-authored-by: Chris Lenk --- stix2/environment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stix2/environment.py b/stix2/environment.py index 99b4a5e..672fb11 100644 --- a/stix2/environment.py +++ b/stix2/environment.py @@ -207,7 +207,7 @@ class Environment(DataStoreMixin): prop_scores: A dictionary that can hold individual property scores, weights, contributing score, matching score and sum of weights. weight_dict: A dictionary that can be used to override settings - in the semantic equivalence process + in the similarity process Returns: float: A number between 0.0 and 100.0 as a measurement of similarity. 
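

Reviewer's note (not part of the patch set): for anyone following the refactor in the
patches above, the two pairing helpers are small enough to sketch standalone. The
following is an illustrative re-implementation mirroring the behavior of
`_bucket_per_type` and `_object_pairs` as they appear in the diffs; the sample
objects and the `weights` entry are hypothetical:

    import collections
    import itertools

    def bucket_per_type(graph, mode="type"):
        # Group full objects by their "type" property, or bare references
        # ("type--uuid" strings) by the prefix before the "--".
        buckets = collections.defaultdict(list)
        for obj in graph:
            key = obj["type"] if mode == "type" else obj.split("--")[0]
            buckets[key].append(obj)
        return buckets

    def object_pairs(graph1, graph2, weights):
        # Only types present in BOTH graphs and configured in the weights
        # dict are comparable; everything else is skipped entirely.
        testable_types = set(graph1) & set(graph2) & set(weights)
        return itertools.chain.from_iterable(
            itertools.product(graph1[t], graph2[t]) for t in testable_types
        )

    g1 = bucket_per_type([
        {"type": "tool", "id": "tool--aaa"},
        {"type": "malware", "id": "malware--bbb"},
    ])
    g2 = bucket_per_type([{"type": "tool", "id": "tool--ccc"}])
    weights = {"tool": {"name": None}}  # hypothetical weights entry
    print(list(object_pairs(g1, g2, weights)))
    # -> [({'type': 'tool', 'id': 'tool--aaa'}, {'type': 'tool', 'id': 'tool--ccc'})]

Taking the product per type, rather than over the whole graphs, is what keeps
unrelated objects from ever being scored against each other.
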
From 010593345ccc3420788de22c518dbae929e4ade3 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Thu, 18 Feb 2021 09:16:19 -0500 Subject: [PATCH 17/27] Update stix2/environment.py docstrings Co-authored-by: Chris Lenk --- stix2/environment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stix2/environment.py b/stix2/environment.py index 672fb11..cebb080 100644 --- a/stix2/environment.py +++ b/stix2/environment.py @@ -246,7 +246,7 @@ class Environment(DataStoreMixin): score to result in successfully calling both objects equivalent. This value can be tuned. weight_dict: A dictionary that can be used to override settings - in the semantic equivalence process + in the similarity process Returns: bool: True if the result of the object similarity is greater than or equal to From fa6978969bd63d93e2b2b51c60d720a46e630e49 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Thu, 18 Feb 2021 10:20:42 -0500 Subject: [PATCH 18/27] removing unused imports (backwards breaking) --- stix2/environment.py | 11 +---------- stix2/test/v21/test_environment.py | 30 +++++++++++++++--------------- 2 files changed, 16 insertions(+), 25 deletions(-) diff --git a/stix2/environment.py b/stix2/environment.py index cebb080..d0f694e 100644 --- a/stix2/environment.py +++ b/stix2/environment.py @@ -3,18 +3,9 @@ import copy from .datastore import CompositeDataSource, DataStoreMixin from .equivalence.graph import graph_equivalence, graph_similarity -from .equivalence.object import ( # noqa: F401 - WEIGHTS, check_property_present, custom_pattern_based, exact_match, - list_reference_check, object_equivalence, object_similarity, - partial_external_reference_based, partial_list_based, - partial_location_distance, partial_string_based, partial_timestamp_based, - reference_check, -) +from .equivalence.object import object_equivalence, object_similarity from .parsing import parse as _parse -# TODO: Remove all unused imports that now belong to the equivalence module in the next major release. -# Kept for backwards compatibility. - class ObjectFactory(object): """Easily create STIX objects with default values for certain properties. 
diff --git a/stix2/test/v21/test_environment.py b/stix2/test/v21/test_environment.py index c3c2701..e7bf4da 100644 --- a/stix2/test/v21/test_environment.py +++ b/stix2/test/v21/test_environment.py @@ -670,17 +670,17 @@ def test_object_similarity_on_unknown_object(): def _x_foobar_checks(obj1, obj2, **weights): matching_score = 0.0 sum_weights = 0.0 - if stix2.environment.check_property_present("external_references", obj1, obj2): + if stix2.equivalence.object.check_property_present("external_references", obj1, obj2): w = weights["external_references"] sum_weights += w - matching_score += w * stix2.environment.partial_external_reference_based( + matching_score += w * stix2.equivalence.object.partial_external_reference_based( obj1["external_references"], obj2["external_references"], ) - if stix2.environment.check_property_present("name", obj1, obj2): + if stix2.equivalence.object.check_property_present("name", obj1, obj2): w = weights["name"] sum_weights += w - matching_score += w * stix2.environment.partial_string_based(obj1["name"], obj2["name"]) + matching_score += w * stix2.equivalence.object.partial_string_based(obj1["name"], obj2["name"]) return matching_score, sum_weights weights = { @@ -731,9 +731,9 @@ def test_object_similarity_zero_match(): ) weights = { "indicator": { - "indicator_types": (15, stix2.environment.partial_list_based), - "pattern": (80, stix2.environment.custom_pattern_based), - "valid_from": (5, stix2.environment.partial_timestamp_based), + "indicator_types": (15, stix2.equivalence.object.partial_list_based), + "pattern": (80, stix2.equivalence.object.custom_pattern_based), + "valid_from": (5, stix2.equivalence.object.partial_timestamp_based), "tdelta": 1, # One day interval }, "_internal": { @@ -755,9 +755,9 @@ def test_object_similarity_different_spec_version(): ) weights = { "indicator": { - "indicator_types": (15, stix2.environment.partial_list_based), - "pattern": (80, stix2.environment.custom_pattern_based), - "valid_from": (5, stix2.environment.partial_timestamp_based), + "indicator_types": (15, stix2.equivalence.object.partial_list_based), + "pattern": (80, stix2.equivalence.object.custom_pattern_based), + "valid_from": (5, stix2.equivalence.object.partial_timestamp_based), "tdelta": 1, # One day interval }, "_internal": { @@ -842,20 +842,20 @@ def test_object_similarity_different_spec_version(): ], ) def test_object_similarity_external_references(refs1, refs2, ret_val): - value = stix2.environment.partial_external_reference_based(refs1, refs2) + value = stix2.equivalence.object.partial_external_reference_based(refs1, refs2) assert value == ret_val def test_object_similarity_timestamp(): t1 = "2018-10-17T00:14:20.652Z" t2 = "2018-10-17T12:14:20.652Z" - assert stix2.environment.partial_timestamp_based(t1, t2, 1) == 0.5 + assert stix2.equivalence.object.partial_timestamp_based(t1, t2, 1) == 0.5 def test_object_similarity_exact_match(): t1 = "2018-10-17T00:14:20.652Z" t2 = "2018-10-17T12:14:20.652Z" - assert stix2.environment.exact_match(t1, t2) == 0.0 + assert stix2.equivalence.object.exact_match(t1, t2) == 0.0 def test_non_existent_config_for_object(): @@ -877,8 +877,8 @@ def test_object_similarity_method_provided(): weights = { "tool": { - "tool_types": (20, stix2.environment.partial_list_based), - "name": (80, stix2.environment.partial_string_based), + "tool_types": (20, stix2.equivalence.object.partial_list_based), + "name": (80, stix2.equivalence.object.partial_string_based), "method": custom_semantic_equivalence_method, }, } From 
99453770cfe0d072d6335b395308b5a6b3337fcd Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Thu, 18 Feb 2021 10:37:34 -0500 Subject: [PATCH 19/27] doctring changes, _versioned_checks changes --- stix2/equivalence/__init__.py | 2 +- stix2/equivalence/graph/__init__.py | 3 +-- stix2/equivalence/object/__init__.py | 37 ++++++++++++++++------------ 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/stix2/equivalence/__init__.py b/stix2/equivalence/__init__.py index f175024..0ca9d83 100644 --- a/stix2/equivalence/__init__.py +++ b/stix2/equivalence/__init__.py @@ -1,4 +1,4 @@ -"""Python APIs for STIX 2 Semantic Equivalence. +"""Python APIs for STIX 2 Semantic Equivalence and Similarity. .. autosummary:: :toctree: equivalence diff --git a/stix2/equivalence/graph/__init__.py b/stix2/equivalence/graph/__init__.py index 402bcb2..e78624e 100644 --- a/stix2/equivalence/graph/__init__.py +++ b/stix2/equivalence/graph/__init__.py @@ -97,7 +97,6 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): if weights["_internal"]["max_depth"] <= 0: raise ValueError("weight_dict['_internal']['max_depth'] must be greater than 0") - depth = weights["_internal"]["max_depth"] pairs = _object_pairs( _bucket_per_type(ds1.query([])), @@ -108,13 +107,13 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): weights["_internal"]["ds1"] = ds1 weights["_internal"]["ds2"] = ds2 + logger.debug("Starting graph similarity process between DataStores: '%s' and '%s'", ds1.id, ds2.id) for object1, object2 in pairs: iprop_score = {} object1_id = object1["id"] object2_id = object2["id"] result = object_similarity(object1, object2, iprop_score, **weights) - weights["_internal"]["max_depth"] = depth if object1_id not in results: results[object1_id] = {"lhs": object1_id, "rhs": object2_id, "prop_score": iprop_score, "value": result} diff --git a/stix2/equivalence/object/__init__.py b/stix2/equivalence/object/__init__.py index 790fc87..e175938 100644 --- a/stix2/equivalence/object/__init__.py +++ b/stix2/equivalence/object/__init__.py @@ -103,13 +103,13 @@ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict): try: weights[type1] except KeyError: - logger.warning("'%s' type has no 'weights' dict specified & thus no semantic equivalence method to call!", type1) + logger.warning("'%s' type has no 'weights' dict specified & thus no object similarity method to call!", type1) sum_weights = matching_score = 0 else: try: method = weights[type1]["method"] except KeyError: - logger.debug("Starting semantic equivalence process between: '%s' and '%s'", obj1["id"], obj2["id"]) + logger.debug("Starting object similarity process between: '%s' and '%s'", obj1["id"], obj2["id"]) matching_score = 0.0 sum_weights = 0.0 @@ -129,9 +129,9 @@ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict): weights["_internal"]["max_depth"] = max_depth - 1 ds1, ds2 = weights["_internal"]["ds1"], weights["_internal"]["ds2"] contributing_score = w * comp_funct(obj1[prop], obj2[prop], ds1, ds2, **weights) - weights["_internal"]["max_depth"] = max_depth + 1 else: continue # prevent excessive recursion + weights["_internal"]["max_depth"] = max_depth else: contributing_score = w * comp_funct(obj1[prop], obj2[prop]) @@ -148,7 +148,7 @@ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict): prop_scores["sum_weights"] = sum_weights logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) else: - logger.debug("Starting semantic equivalence process between: '%s' and '%s'", 
obj1["id"], obj2["id"]) + logger.debug("Starting object similarity process between: '%s' and '%s'", obj1["id"], obj2["id"]) try: matching_score, sum_weights = method(obj1, obj2, prop_scores, **weights[type1]) except TypeError: @@ -350,19 +350,24 @@ def partial_location_distance(lat1, long1, lat2, long2, threshold): def _versioned_checks(ref1, ref2, ds1, ds2, **weights): """Checks multiple object versions if present in graph. - Maximizes for the semantic equivalence score of a particular version.""" + Maximizes for the similarity score of a particular version.""" results = {} objects1 = ds1.query([Filter("id", "=", ref1)]) objects2 = ds2.query([Filter("id", "=", ref2)]) - if len(objects1) > 0 and len(objects2) > 0: - for o1 in objects1: - for o2 in objects2: - result = object_similarity(o1, o2, **weights) - if ref1 not in results: - results[ref1] = {"matched": ref2, "value": result} - elif result > results[ref1]["value"]: - results[ref1] = {"matched": ref2, "value": result} + pairs = _object_pairs( + _bucket_per_type(objects1), + _bucket_per_type(objects2), + weights, + ) + + for object1, object2 in pairs: + result = object_similarity(object1, object2, **weights) + if ref1 not in results: + results[ref1] = {"matched": ref2, "value": result} + elif result > results[ref1]["value"]: + results[ref1] = {"matched": ref2, "value": result} + result = results.get(ref1, {}).get("value", 0.0) logger.debug( "--\t\t_versioned_checks '%s' '%s'\tresult: '%s'", @@ -372,8 +377,8 @@ def _versioned_checks(ref1, ref2, ds1, ds2, **weights): def reference_check(ref1, ref2, ds1, ds2, **weights): - """For two references, de-reference the object and perform object-based - semantic equivalence. The score influences the result of an edge check.""" + """For two references, de-reference the object and perform object_similarity. + The score influences the result of an edge check.""" type1, type2 = ref1.split("--")[0], ref2.split("--")[0] result = 0.0 @@ -394,7 +399,7 @@ def reference_check(ref1, ref2, ds1, ds2, **weights): def list_reference_check(refs1, refs2, ds1, ds2, **weights): """For objects that contain multiple references (i.e., object_refs) perform - the same de-reference procedure and perform object-based semantic equivalence. + the same de-reference procedure and perform object_similarity. The score influences the objects containing these references. 
The result is weighted on the amount of unique objects that could 1) be de-referenced 2) """ results = {} From 490251dd85e1f79095b0f7b53053bd05632c3775 Mon Sep 17 00:00:00 2001 From: Chris Lenk Date: Thu, 18 Feb 2021 12:21:52 -0500 Subject: [PATCH 20/27] Revert JSON canonicalization code Undo our Python 2 compatibility fixes --- .pre-commit-config.yaml | 1 + stix2/canonicalization/Canonicalize.py | 55 ++++++--------------- stix2/canonicalization/NumberToJson.py | 67 ++++++++++++++++---------- 3 files changed, 58 insertions(+), 65 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d127dd6..434eb95 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -23,3 +23,4 @@ repos: args: ["-c", "--diff"] - id: isort name: Sort python imports (fixes files) +exclude: ^stix2/canonicalization/ diff --git a/stix2/canonicalization/Canonicalize.py b/stix2/canonicalization/Canonicalize.py index 78145be..72910ab 100644 --- a/stix2/canonicalization/Canonicalize.py +++ b/stix2/canonicalization/Canonicalize.py @@ -20,12 +20,8 @@ # JCS compatible JSON serializer for Python 3.x # ################################################# -# This file has been modified to be compatible with Python 2.x as well - import re -import six - from stix2.canonicalization.NumberToJson import convert2Es6Format try: @@ -55,10 +51,10 @@ ESCAPE_DCT = { } for i in range(0x20): ESCAPE_DCT.setdefault(chr(i), '\\u{0:04x}'.format(i)) + #ESCAPE_DCT.setdefault(chr(i), '\\u%04x' % (i,)) INFINITY = float('inf') - def py_encode_basestring(s): """Return a JSON representation of a Python string @@ -70,7 +66,6 @@ def py_encode_basestring(s): encode_basestring = (c_encode_basestring or py_encode_basestring) - def py_encode_basestring_ascii(s): """Return an ASCII-only JSON representation of a Python string @@ -83,6 +78,7 @@ def py_encode_basestring_ascii(s): n = ord(s) if n < 0x10000: return '\\u{0:04x}'.format(n) + #return '\\u%04x' % (n,) else: # surrogate pair n -= 0x10000 @@ -96,7 +92,6 @@ encode_basestring_ascii = ( c_encode_basestring_ascii or py_encode_basestring_ascii ) - class JSONEncoder(object): """Extensible JSON encoder for Python data structures. @@ -128,11 +123,10 @@ class JSONEncoder(object): """ item_separator = ', ' key_separator = ': ' - def __init__( - self, skipkeys=False, ensure_ascii=False, + self, *, skipkeys=False, ensure_ascii=False, check_circular=True, allow_nan=True, sort_keys=True, - indent=None, separators=(',', ':'), default=None, + indent=None, separators=(',', ':'), default=None ): """Constructor for JSONEncoder, with sensible defaults. 
@@ -277,6 +271,7 @@ class JSONEncoder(object): return text + if ( _one_shot and c_make_encoder is not None and self.indent is None @@ -294,11 +289,10 @@ class JSONEncoder(object): ) return _iterencode(o, 0) - def _make_iterencode( markers, _default, _encoder, _indent, _floatstr, _key_separator, _item_separator, _sort_keys, _skipkeys, _one_shot, - # HACK: hand-optimized bytecode; turn globals into locals + ## HACK: hand-optimized bytecode; turn globals into locals ValueError=ValueError, dict=dict, float=float, @@ -362,10 +356,7 @@ def _make_iterencode( chunks = _iterencode_dict(value, _current_indent_level) else: chunks = _iterencode(value, _current_indent_level) - # Below line commented-out for python2 compatibility - # yield from chunks - for chunk in chunks: - yield chunk + yield from chunks if newline_indent is not None: _current_indent_level -= 1 yield '\n' + _indent * _current_indent_level @@ -397,8 +388,7 @@ def _make_iterencode( else: items = dct.items() for key, value in items: - # Replaced isinstance(key, str) with below to enable simultaneous python 2 & 3 compatibility - if isinstance(key, six.string_types) or isinstance(key, six.binary_type): + if isinstance(key, str): pass # JavaScript is weakly typed for these, so it makes sense to # also allow them. Many encoders seem to do something like this. @@ -445,10 +435,7 @@ def _make_iterencode( chunks = _iterencode_dict(value, _current_indent_level) else: chunks = _iterencode(value, _current_indent_level) - # Below line commented-out for python2 compatibility - # yield from chunks - for chunk in chunks: - yield chunk + yield from chunks if newline_indent is not None: _current_indent_level -= 1 yield '\n' + _indent * _current_indent_level @@ -457,8 +444,7 @@ def _make_iterencode( del markers[markerid] def _iterencode(o, _current_indent_level): - # Replaced isinstance(o, str) with below to enable simultaneous python 2 & 3 compatibility - if isinstance(o, six.string_types) or isinstance(o, six.binary_type): + if isinstance(o, str): yield _encoder(o) elif o is None: yield 'null' @@ -473,15 +459,9 @@ def _make_iterencode( # see comment for int/float in _make_iterencode yield convert2Es6Format(o) elif isinstance(o, (list, tuple)): - # Below line commented-out for python2 compatibility - # yield from _iterencode_list(o, _current_indent_level) - for thing in _iterencode_list(o, _current_indent_level): - yield thing + yield from _iterencode_list(o, _current_indent_level) elif isinstance(o, dict): - # Below line commented-out for python2 compatibility - # yield from _iterencode_dict(o, _current_indent_level) - for thing in _iterencode_dict(o, _current_indent_level): - yield thing + yield from _iterencode_dict(o, _current_indent_level) else: if markers is not None: markerid = id(o) @@ -489,23 +469,18 @@ def _make_iterencode( raise ValueError("Circular reference detected") markers[markerid] = o o = _default(o) - # Below line commented-out for python2 compatibility - # yield from _iterencode(o, _current_indent_level) - for thing in _iterencode(o, _current_indent_level): - yield thing + yield from _iterencode(o, _current_indent_level) if markers is not None: del markers[markerid] return _iterencode - -def canonicalize(obj, utf8=True): +def canonicalize(obj,utf8=True): textVal = JSONEncoder(sort_keys=True).encode(obj) if utf8: return textVal.encode() return textVal - -def serialize(obj, utf8=True): +def serialize(obj,utf8=True): textVal = JSONEncoder(sort_keys=False).encode(obj) if utf8: return textVal.encode() diff --git 
a/stix2/canonicalization/NumberToJson.py b/stix2/canonicalization/NumberToJson.py index cea54d0..132af3f 100644 --- a/stix2/canonicalization/NumberToJson.py +++ b/stix2/canonicalization/NumberToJson.py @@ -21,40 +21,50 @@ # Convert a Python double/float into an ES6/V8 compatible string # ################################################################## def convert2Es6Format(value): - # Convert double/float to str using the native Python formatter +# Convert double/float to str using the native Python formatter fvalue = float(value) - - # Zero is a special case. The following line takes "-0" case as well +# +# Zero is a special case. The following line takes "-0" case as well +# if fvalue == 0: return '0' - - # The rest of the algorithm works on the textual representation only +# +# The rest of the algorithm works on the textual representation only +# pyDouble = str(fvalue) - - # The following line catches the "inf" and "nan" values returned by str(fvalue) +# +# The following line catches the "inf" and "nan" values returned by str(fvalue) +# if pyDouble.find('n') >= 0: raise ValueError("Invalid JSON number: " + pyDouble) - - # Save sign separately, it doesn't have any role in the algorithm +# +# Save sign separately, it doesn't have any role in the algorithm +# pySign = '' if pyDouble.find('-') == 0: pySign = '-' pyDouble = pyDouble[1:] - - # Now we should only have valid non-zero values +# +# Now we should only have valid non-zero values +# pyExpStr = '' pyExpVal = 0 q = pyDouble.find('e') if q > 0: - # Grab the exponent and remove it from the number +# +# Grab the exponent and remove it from the number +# pyExpStr = pyDouble[q:] if pyExpStr[2:3] == '0': - # Supress leading zero on exponents +# +# Supress leading zero on exponents +# pyExpStr = pyExpStr[:2] + pyExpStr[3:] pyDouble = pyDouble[0:q] pyExpVal = int(pyExpStr[1:]) - - # Split number in pyFirst + pyDot + pyLast +# +# Split number in pyFirst + pyDot + pyLast +# pyFirst = pyDouble pyDot = '' pyLast = '' @@ -63,33 +73,40 @@ def convert2Es6Format(value): pyDot = '.' pyFirst = pyDouble[:q] pyLast = pyDouble[q + 1:] - - # Now the string is split into: pySign + pyFirst + pyDot + pyLast + pyExpStr +# +# Now the string is split into: pySign + pyFirst + pyDot + pyLast + pyExpStr +# if pyLast == '0': - # Always remove trailing .0 +# +# Always remove trailing .0 +# pyDot = '' pyLast = '' - if pyExpVal > 0 and pyExpVal < 21: - # Integers are shown as is with up to 21 digits +# +# Integers are shown as is with up to 21 digits +# pyFirst += pyLast pyLast = '' pyDot = '' pyExpStr = '' q = pyExpVal - len(pyFirst) while q >= 0: - q -= 1 + q -= 1; pyFirst += '0' elif pyExpVal < 0 and pyExpVal > -7: - # Small numbers are shown as 0.etc with e-6 as lower limit +# +# Small numbers are shown as 0.etc with e-6 as lower limit +# pyLast = pyFirst + pyLast pyFirst = '0' pyDot = '.' pyExpStr = '' q = pyExpVal while q < -1: - q += 1 + q += 1; pyLast = '0' + pyLast - - # The resulting sub-strings are concatenated +# +# The resulting sub-strings are concatenated +# return pySign + pyFirst + pyDot + pyLast + pyExpStr From b4a0a9ea10c32c13ae730871867364d2711af3ff Mon Sep 17 00:00:00 2001 From: Chris Lenk Date: Thu, 18 Feb 2021 12:26:54 -0500 Subject: [PATCH 21/27] Remove six dependency (backwards breaking) We've already removed Python 2 from our test harness, but this removes all python 2 compatibility code left in. 
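

Note for reviewers less familiar with six: the changes below are mechanical
translations of three idioms. A hedged standalone sketch of the pattern
(`DataSourceLike` and `parse_strict` are made-up stand-ins, not code from
this diff):

    from abc import ABCMeta, abstractmethod

    # six.with_metaclass(ABCMeta) becomes the plain metaclass keyword:
    class DataSourceLike(metaclass=ABCMeta):
        @abstractmethod
        def get(self, stix_id):
            ...

    # six.raise_from(NewError(...), exc) becomes native exception chaining:
    def parse_strict(value):
        try:
            return int(value)
        except ValueError as exc:
            raise RuntimeError("not a number: {!r}".format(value)) from exc

    # isinstance(x, six.string_types) collapses to isinstance(x, str),
    # since only Python 3 is supported now:
    assert isinstance("indicator--00000000", str)
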
--- setup.py | 1 - stix2/base.py | 27 ++++++------------------ stix2/custom.py | 15 +++++-------- stix2/datastore/__init__.py | 6 ++---- stix2/datastore/filesystem.py | 4 +--- stix2/datastore/filters.py | 7 ++---- stix2/markings/utils.py | 6 ++---- stix2/pattern_visitor.py | 5 ++--- stix2/patterns.py | 4 +--- stix2/properties.py | 8 +++---- stix2/test/v20/test_datastore_filters.py | 7 +++--- stix2/test/v20/test_datastore_taxii.py | 3 +-- stix2/test/v21/test_datastore_filters.py | 7 +++--- stix2/test/v21/test_datastore_taxii.py | 3 +-- stix2/test/v21/test_deterministic_ids.py | 8 +------ stix2/utils.py | 3 +-- stix2/v20/common.py | 4 +--- stix2/v21/sdo.py | 2 +- 18 files changed, 36 insertions(+), 84 deletions(-) diff --git a/setup.py b/setup.py index 4bfc191..175c32d 100644 --- a/setup.py +++ b/setup.py @@ -51,7 +51,6 @@ setup( 'pytz', 'requests', 'simplejson', - 'six>=1.13.0', 'stix2-patterns>=1.2.0', ], project_urls={ diff --git a/stix2/base.py b/stix2/base.py index 38a5997..b158f06 100644 --- a/stix2/base.py +++ b/stix2/base.py @@ -5,7 +5,6 @@ import re import uuid import simplejson as json -import six import stix2 from stix2.canonicalization.Canonicalize import canonicalize @@ -70,12 +69,9 @@ class _STIXBase(Mapping): # InvalidValueError... so let those propagate. raise except Exception as exc: - six.raise_from( - InvalidValueError( - self.__class__, prop_name, reason=str(exc), - ), - exc, - ) + raise InvalidValueError( + self.__class__, prop_name, reason=str(exc), + ) from exc # interproperty constraint methods @@ -369,19 +365,8 @@ class _Observable(_STIXBase): if json_serializable_object: data = canonicalize(json_serializable_object, utf8=False) - - # The situation is complicated w.r.t. python 2/3 behavior, so - # I'd rather not rely on particular exceptions being raised to - # determine what to do. Better to just check the python version - # directly. - if six.PY3: - uuid_ = uuid.uuid5(SCO_DET_ID_NAMESPACE, data) - else: - uuid_ = uuid.uuid5( - SCO_DET_ID_NAMESPACE, data.encode("utf-8"), - ) - - id_ = "{}--{}".format(self._type, six.text_type(uuid_)) + uuid_ = uuid.uuid5(SCO_DET_ID_NAMESPACE, data) + id_ = "{}--{}".format(self._type, str(uuid_)) return id_ @@ -447,7 +432,7 @@ def _make_json_serializable(value): for v in value ] - elif not isinstance(value, (int, float, six.string_types, bool)): + elif not isinstance(value, (int, float, str, bool)): # If a "simple" value which is not already JSON-serializable, # JSON-serialize to a string and use that as our JSON-serializable # value. This applies to our datetime objects currently (timestamp diff --git a/stix2/custom.py b/stix2/custom.py index b012f37..f4dea7f 100644 --- a/stix2/custom.py +++ b/stix2/custom.py @@ -1,7 +1,5 @@ from collections import OrderedDict -import six - from .base import _cls_init from .registration import ( _register_marking, _register_object, _register_observable, @@ -13,14 +11,11 @@ def _get_properties_dict(properties): try: return OrderedDict(properties) except TypeError as e: - six.raise_from( - ValueError( - "properties must be dict-like, e.g. a list " - "containing tuples. For example, " - "[('property1', IntegerProperty())]", - ), - e, - ) + raise ValueError( + "properties must be dict-like, e.g. a list " + "containing tuples. 
For example, " + "[('property1', IntegerProperty())]", + ) from e def _custom_object_builder(cls, type, properties, version, base_class): diff --git a/stix2/datastore/__init__.py b/stix2/datastore/__init__.py index 1ff0769..715c6e6 100644 --- a/stix2/datastore/__init__.py +++ b/stix2/datastore/__init__.py @@ -15,8 +15,6 @@ Python STIX2 DataStore API. from abc import ABCMeta, abstractmethod import uuid -from six import with_metaclass - from stix2.datastore.filters import Filter, FilterSet from stix2.utils import deduplicate @@ -219,7 +217,7 @@ class DataStoreMixin(object): raise AttributeError(msg % self.__class__.__name__) -class DataSink(with_metaclass(ABCMeta)): +class DataSink(metaclass=ABCMeta): """An implementer will create a concrete subclass from this class for the specific DataSink. @@ -245,7 +243,7 @@ class DataSink(with_metaclass(ABCMeta)): """ -class DataSource(with_metaclass(ABCMeta)): +class DataSource(metaclass=ABCMeta): """An implementer will create a concrete subclass from this class for the specific DataSource. diff --git a/stix2/datastore/filesystem.py b/stix2/datastore/filesystem.py index d865768..d844115 100644 --- a/stix2/datastore/filesystem.py +++ b/stix2/datastore/filesystem.py @@ -6,8 +6,6 @@ import os import re import stat -import six - from stix2 import v20, v21 from stix2.base import _STIXBase from stix2.datastore import ( @@ -116,7 +114,7 @@ def _update_allow(allow_set, value): """ adding_seq = hasattr(value, "__iter__") and \ - not isinstance(value, six.string_types) + not isinstance(value, str) if allow_set is None: allow_set = set() diff --git a/stix2/datastore/filters.py b/stix2/datastore/filters.py index 4f72b82..6d9273e 100644 --- a/stix2/datastore/filters.py +++ b/stix2/datastore/filters.py @@ -3,8 +3,6 @@ import collections from datetime import datetime -import six - import stix2.utils """Supported filter operations""" @@ -12,8 +10,7 @@ FILTER_OPS = ['=', '!=', 'in', '>', '<', '>=', '<=', 'contains'] """Supported filter value types""" FILTER_VALUE_TYPES = ( - bool, dict, float, int, list, tuple, six.string_types, - datetime, + bool, dict, float, int, list, tuple, str, datetime, ) @@ -84,7 +81,7 @@ class Filter(collections.namedtuple('Filter', ['property', 'op', 'value'])): # If filtering on a timestamp property and the filter value is a string, # try to convert the filter value to a datetime instance. 
if isinstance(stix_obj_property, datetime) and \ - isinstance(self.value, six.string_types): + isinstance(self.value, str): filter_value = stix2.utils.parse_into_datetime(self.value) else: filter_value = self.value diff --git a/stix2/markings/utils.py b/stix2/markings/utils.py index 41516cc..39d3185 100644 --- a/stix2/markings/utils.py +++ b/stix2/markings/utils.py @@ -2,8 +2,6 @@ import collections -import six - from stix2 import exceptions, utils @@ -129,7 +127,7 @@ def compress_markings(granular_markings): {'marking_ref': item, 'selectors': sorted(selectors)} if utils.is_marking(item) else {'lang': item, 'selectors': sorted(selectors)} - for item, selectors in six.iteritems(map_) + for item, selectors in map_.items() ] return compressed @@ -230,7 +228,7 @@ def iterpath(obj, path=None): if path is None: path = [] - for varname, varobj in iter(sorted(six.iteritems(obj))): + for varname, varobj in iter(sorted(obj.items())): path.append(varname) yield (path, varobj) diff --git a/stix2/pattern_visitor.py b/stix2/pattern_visitor.py index 93eb083..c4a616b 100644 --- a/stix2/pattern_visitor.py +++ b/stix2/pattern_visitor.py @@ -3,7 +3,6 @@ import importlib import inspect -from six import text_type from stix2patterns.exceptions import ParseException from stix2patterns.grammars.STIXPatternParser import TerminalNode from stix2patterns.v20.grammars.STIXPatternParser import \ @@ -263,7 +262,7 @@ class STIXPatternVisitorForSTIX2(): property_path.append( self.instantiate( "ListObjectPathComponent", - current.property_name if isinstance(current, BasicObjectPathComponent) else text_type(current), + current.property_name if isinstance(current, BasicObjectPathComponent) else str(current), next.value, ), ) @@ -286,7 +285,7 @@ class STIXPatternVisitorForSTIX2(): if isinstance(first_component, TerminalNode): step = first_component.getText() else: - step = text_type(first_component) + step = str(first_component) # if step.endswith("_ref"): # return stix2.ReferenceObjectPathComponent(step) # else: diff --git a/stix2/patterns.py b/stix2/patterns.py index ce07637..f9f451e 100644 --- a/stix2/patterns.py +++ b/stix2/patterns.py @@ -5,8 +5,6 @@ import binascii import datetime import re -import six - from .utils import parse_into_datetime @@ -15,7 +13,7 @@ def escape_quotes_and_backslashes(s): def quote_if_needed(x): - if isinstance(x, six.string_types): + if isinstance(x, str): if x.find("-") != -1: if not x.startswith("'"): return "'" + x + "'" diff --git a/stix2/properties.py b/stix2/properties.py index bf7fc8c..dbbe667 100644 --- a/stix2/properties.py +++ b/stix2/properties.py @@ -7,8 +7,6 @@ import inspect import re import uuid -from six import string_types, text_type - from .base import _STIXBase from .exceptions import ( CustomContentError, DictionaryKeyError, MissingPropertiesError, @@ -227,7 +225,7 @@ class ListProperty(Property): except TypeError: raise ValueError("must be an iterable.") - if isinstance(value, (_STIXBase, string_types)): + if isinstance(value, (_STIXBase, str)): value = [value] if isinstance(self.contained, Property): @@ -268,8 +266,8 @@ class StringProperty(Property): super(StringProperty, self).__init__(**kwargs) def clean(self, value): - if not isinstance(value, string_types): - return text_type(value) + if not isinstance(value, str): + return str(value) return value diff --git a/stix2/test/v20/test_datastore_filters.py b/stix2/test/v20/test_datastore_filters.py index c5d26c1..e8945d1 100644 --- a/stix2/test/v20/test_datastore_filters.py +++ b/stix2/test/v20/test_datastore_filters.py @@ 
-128,18 +128,17 @@ def test_filter_value_type_check(): with pytest.raises(TypeError) as excinfo: Filter('created', '=', object()) - # On Python 2, the type of object() is `` On Python 3, it's ``. - assert any([s in str(excinfo.value) for s in ["", "''"]]) + assert "''" in str(excinfo.value) assert "is not supported. The type must be a Python immutable type or dictionary" in str(excinfo.value) with pytest.raises(TypeError) as excinfo: Filter("type", "=", complex(2, -1)) - assert any([s in str(excinfo.value) for s in ["", "''"]]) + assert "''" in str(excinfo.value) assert "is not supported. The type must be a Python immutable type or dictionary" in str(excinfo.value) with pytest.raises(TypeError) as excinfo: Filter("type", "=", set([16, 23])) - assert any([s in str(excinfo.value) for s in ["", "''"]]) + assert "''" in str(excinfo.value) assert "is not supported. The type must be a Python immutable type or dictionary" in str(excinfo.value) diff --git a/stix2/test/v20/test_datastore_taxii.py b/stix2/test/v20/test_datastore_taxii.py index 0b21981..34daa80 100644 --- a/stix2/test/v20/test_datastore_taxii.py +++ b/stix2/test/v20/test_datastore_taxii.py @@ -3,7 +3,6 @@ import json from medallion.filters.basic_filter import BasicFilter import pytest from requests.models import Response -import six from taxii2client.common import _filter_kwargs_to_query_params from taxii2client.v20 import Collection @@ -27,7 +26,7 @@ class MockTAXIICollectionEndpoint(Collection): def add_objects(self, bundle): self._verify_can_write() - if isinstance(bundle, six.string_types): + if isinstance(bundle, str): bundle = json.loads(bundle) for object in bundle.get("objects", []): self.objects.append(object) diff --git a/stix2/test/v21/test_datastore_filters.py b/stix2/test/v21/test_datastore_filters.py index b7b41a0..a6a50a7 100644 --- a/stix2/test/v21/test_datastore_filters.py +++ b/stix2/test/v21/test_datastore_filters.py @@ -146,18 +146,17 @@ def test_filter_value_type_check(): with pytest.raises(TypeError) as excinfo: Filter('created', '=', object()) - # On Python 2, the type of object() is `` On Python 3, it's ``. - assert any([s in str(excinfo.value) for s in ["", "''"]]) + assert "''" in str(excinfo.value) assert "is not supported. The type must be a Python immutable type or dictionary" in str(excinfo.value) with pytest.raises(TypeError) as excinfo: Filter("type", "=", complex(2, -1)) - assert any([s in str(excinfo.value) for s in ["", "''"]]) + assert "''" in str(excinfo.value) assert "is not supported. The type must be a Python immutable type or dictionary" in str(excinfo.value) with pytest.raises(TypeError) as excinfo: Filter("type", "=", set([16, 23])) - assert any([s in str(excinfo.value) for s in ["", "''"]]) + assert "''" in str(excinfo.value) assert "is not supported. 
The type must be a Python immutable type or dictionary" in str(excinfo.value) diff --git a/stix2/test/v21/test_datastore_taxii.py b/stix2/test/v21/test_datastore_taxii.py index 92ae6dc..4b7e299 100644 --- a/stix2/test/v21/test_datastore_taxii.py +++ b/stix2/test/v21/test_datastore_taxii.py @@ -3,7 +3,6 @@ import json from medallion.filters.basic_filter import BasicFilter import pytest from requests.models import Response -import six from taxii2client.common import _filter_kwargs_to_query_params from taxii2client.v21 import Collection @@ -27,7 +26,7 @@ class MockTAXIICollectionEndpoint(Collection): def add_objects(self, bundle): self._verify_can_write() - if isinstance(bundle, six.string_types): + if isinstance(bundle, str): bundle = json.loads(bundle) for object in bundle.get("objects", []): self.objects.append(object) diff --git a/stix2/test/v21/test_deterministic_ids.py b/stix2/test/v21/test_deterministic_ids.py index 1e6e2d4..56b2e8a 100644 --- a/stix2/test/v21/test_deterministic_ids.py +++ b/stix2/test/v21/test_deterministic_ids.py @@ -3,7 +3,6 @@ import datetime import uuid import pytest -import six import stix2.base import stix2.canonicalization.Canonicalize @@ -31,12 +30,7 @@ def _make_uuid5(name): """ Make a STIX 2.1+ compliant UUIDv5 from a "name". """ - if six.PY3: - uuid_ = uuid.uuid5(SCO_DET_ID_NAMESPACE, name) - else: - uuid_ = uuid.uuid5( - SCO_DET_ID_NAMESPACE, name.encode("utf-8"), - ) + uuid_ = uuid.uuid5(SCO_DET_ID_NAMESPACE, name) return uuid_ diff --git a/stix2/utils.py b/stix2/utils.py index 3e272f8..08e272d 100644 --- a/stix2/utils.py +++ b/stix2/utils.py @@ -7,7 +7,6 @@ import json import re import pytz -import six import stix2.registry as mappings import stix2.version @@ -70,7 +69,7 @@ def _to_enum(value, enum_type, enum_default=None): if not isinstance(value, enum_type): if value is None and enum_default is not None: value = enum_default - elif isinstance(value, six.string_types): + elif isinstance(value, str): value = enum_type[value.upper()] else: raise TypeError( diff --git a/stix2/v20/common.py b/stix2/v20/common.py index 720f14f..6695c9a 100644 --- a/stix2/v20/common.py +++ b/stix2/v20/common.py @@ -3,8 +3,6 @@ from collections import OrderedDict import copy -import six - from ..custom import _custom_marking_builder from ..markings import _MarkingsMixin from ..markings.utils import check_tlp_marking @@ -21,7 +19,7 @@ def _should_set_millisecond(cr, marking_type): if marking_type == TLPMarking: return True # otherwise, precision is kept from how it was given - if isinstance(cr, six.string_types): + if isinstance(cr, str): if '.' in cr: return True else: diff --git a/stix2/v21/sdo.py b/stix2/v21/sdo.py index c078967..35a878e 100644 --- a/stix2/v21/sdo.py +++ b/stix2/v21/sdo.py @@ -2,9 +2,9 @@ from collections import OrderedDict import itertools +from urllib.parse import quote_plus import warnings -from six.moves.urllib.parse import quote_plus from stix2patterns.validator import run_validator from ..custom import _custom_object_builder From 528d956f4a03ba8d43185a151b86124fff50ce96 Mon Sep 17 00:00:00 2001 From: Chris Lenk Date: Thu, 18 Feb 2021 20:45:38 -0500 Subject: [PATCH 22/27] Specify lowest supported Python version --- README.rst | 2 ++ setup.py | 1 + 2 files changed, 3 insertions(+) diff --git a/README.rst b/README.rst index 6055cf9..662e020 100644 --- a/README.rst +++ b/README.rst @@ -21,6 +21,8 @@ Install with `pip `__: $ pip install stix2 +Note: The library requires Python 3.6+. 
+ Usage ----- diff --git a/setup.py b/setup.py index 175c32d..3f82733 100644 --- a/setup.py +++ b/setup.py @@ -47,6 +47,7 @@ setup( ], keywords='stix stix2 json cti cyber threat intelligence', packages=find_packages(exclude=['*.test', '*.test.*']), + python_requires='>=3.6', install_requires=[ 'pytz', 'requests', From b7b1b54afb7c91d65a73f480f6c78218faed9fce Mon Sep 17 00:00:00 2001 From: Chris Lenk Date: Thu, 18 Feb 2021 23:39:24 -0500 Subject: [PATCH 23/27] Remove 'six' from config --- .isort.cfg | 1 - docs/conf.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.isort.cfg b/.isort.cfg index e409e00..5e5af7f 100644 --- a/.isort.cfg +++ b/.isort.cfg @@ -10,7 +10,6 @@ known_third_party = pytz, requests, simplejson, - six, sphinx, stix2patterns, taxii2client, diff --git a/docs/conf.py b/docs/conf.py index 9723e39..5d12af3 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -4,7 +4,6 @@ import os import re import sys -from six import class_types from sphinx.ext.autodoc import ClassDocumenter from stix2.base import _STIXBase @@ -107,7 +106,7 @@ class STIXPropertyDocumenter(ClassDocumenter): @classmethod def can_document_member(cls, member, membername, isattr, parent): - return isinstance(member, class_types) and \ + return isinstance(member, type) and \ issubclass(member, _STIXBase) and \ hasattr(member, '_properties') From 35f4bb0443e1bbc844c29febb05471dea180aac1 Mon Sep 17 00:00:00 2001 From: Chris Lenk Date: Thu, 18 Feb 2021 23:42:06 -0500 Subject: [PATCH 24/27] Update the semantic equivalence user guide page Differentiate between similarity and equivalence, and update functions to their new names and locations. --- docs/guide/equivalence.ipynb | 1778 ++++++++++++++++++++++++++-------- 1 file changed, 1350 insertions(+), 428 deletions(-) diff --git a/docs/guide/equivalence.ipynb b/docs/guide/equivalence.ipynb index 8393495..7427747 100644 --- a/docs/guide/equivalence.ipynb +++ b/docs/guide/equivalence.ipynb @@ -59,7 +59,9 @@ "source": [ "## Checking Semantic Equivalence\n", "\n", - "The [Environment](../api/stix2.environment.rst#stix2.environment.Environment) has a function for checking if two STIX Objects are semantically equivalent. For each supported object type, the algorithm checks if the values for a specific set of properties match. Then each matching property is weighted since every property doesn't represent the same level of importance for semantic equivalence. The result will be the sum of these weighted values, in the range of 0 to 100. A result of 0 means that the the two objects are not equivalent, and a result of 100 means that they are equivalent.\n", + "The [Environment](../api/stix2.environment.rst#stix2.environment.Environment) has functions for checking if two STIX Objects are very similar or identical. The functions differentiate between equivalence, which is a binary concept (two things are either equivalent or they are not), and similarity, which is a continuum (an object can be more similar to one object than to another). The similarity function answers the question, “How similar are these two objects?” while the equivalence function uses the similarity function to answer the question, “Are these two objects equivalent?”\n", + "\n", + "For each supported object type, the [object_similarity()](../api/stix2.environment.rst#stix2.environment.Environment.object_similarity) function checks if the values for a specific set of properties match. 
Then each matching property is weighted since every property doesn't represent the same level of importance for semantic similarity. The result will be the sum of these weighted values, in the range of 0 to 100. A result of 0 means that the the two objects are not equivalent, and a result of 100 means that they are equivalent. Values in between mean the two objects are more or less similar, and can be used to determine if they should be considered equivalent or not. The [object_equivalence()](../api/stix2.environment.rst#stix2.environment.Environment.object_equivalence) calls [object_similarity()](../api/stix2.environment.rst#stix2.environment.Environment.object_similarity) and compares the result to a threshold to determine if the objects are equivalent. Different organizations or users may use different thresholds.\n", "\n", "TODO: Add a link to the committee note when it is released.\n", "\n", @@ -67,7 +69,7 @@ "\n", "Another use case for this functionality is to identify identical or near-identical content, such as a vulnerability shared under three different nicknames by three different STIX producers. A third use case involves a feed that aggregates data from multiple other sources. It will want to make sure that it is not publishing duplicate data.\n", "\n", - "Below we will show examples of the semantic equivalence results of various objects. Unless otherwise specified, the ID of each object will be generated by the library, so the two objects will not have the same ID. This demonstrates that the semantic equivalence algorithm only looks at specific properties for each object type.\n", + "Below we will show examples of the semantic similarity results of various objects. Unless otherwise specified, the ID of each object will be generated by the library, so the two objects will not have the same ID. This demonstrates that the semantic similarity algorithm only looks at specific properties for each object type. Each example also shows the result of calling the equivalence function, with a threshold value of `90`.\n", "\n", "**Please note** that you will need to install a few extra dependencies in order to use the semantic equivalence functions. You can do this using:\n", "\n", @@ -75,19 +77,24 @@ "\n", "### Attack Pattern Example\n", "\n", - "For Attack Patterns, the only properties that contribute to semantic equivalence are `name` and `external_references`, with weights of 30 and 70, respectively. In this example, both attack patterns have the same external reference but the second has a slightly different yet still similar name." + "For Attack Patterns, the only properties that contribute to semantic similarity are `name` and `external_references`, with weights of 30 and 70, respectively. In this example, both attack patterns have the same external reference but the second has a slightly different yet still similar name." ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
91.9\n",
+       ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */
91.81818181818181\n",
        "
\n" ], "text/plain": [ "" ] }, - "execution_count": 3, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/html": [ + "
True\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -190,7 +284,8 @@ " },\n", " ],\n", ")\n", - "print(env.semantically_equivalent(ap1, ap2))" + "print(env.object_similarity(ap1, ap2))\n", + "print(env.object_equivalence(ap1, ap2, threshold=90))" ] }, { @@ -199,19 +294,24 @@ "source": [ "### Campaign Example\n", "\n", - "For Campaigns, the only properties that contribute to semantic equivalence are `name` and `aliases`, with weights of 60 and 40, respectively. In this example, the two campaigns have completely different names, but slightly similar descriptions. The result may be higher than expected because the Jaro-Winkler algorithm used to compare string properties looks at the edit distance of the two strings rather than just the words in them." + "For Campaigns, the only properties that contribute to semantic similarity are `name` and `aliases`, with weights of 60 and 40, respectively. In this example, the two campaigns have completely different names, but slightly similar descriptions." ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
False\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -298,7 +485,8 @@ "\n", "c2 = Campaign(\n", " name=\"Another Campaign\",)\n", - "print(env.semantically_equivalent(c1, c2))" + "print(env.object_similarity(c1, c2))\n", + "print(env.object_equivalence(c1, c2, threshold=90))" ] }, { @@ -307,19 +495,24 @@ "source": [ "### Identity Example\n", "\n", - "For Identities, the only properties that contribute to semantic equivalence are `name`, `identity_class`, and `sectors`, with weights of 60, 20, and 20, respectively. In this example, the two identities are identical, but are missing one of the contributing properties. The algorithm only compares properties that are actually present on the objects. Also note that they have completely different description properties, but because description is not one of the properties considered for semantic equivalence, this difference has no effect on the result." + "For Identities, the only properties that contribute to semantic similarity are `name`, `identity_class`, and `sectors`, with weights of 60, 20, and 20, respectively. In this example, the two identities are identical, but are missing one of the contributing properties. The algorithm only compares properties that are actually present on the objects. Also note that they have completely different description properties, but because description is not one of the properties considered for semantic similarity, this difference has no effect on the result." ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
True\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -411,7 +691,8 @@ " identity_class=\"individual\",\n", " description=\"A person\",\n", ")\n", - "print(env.semantically_equivalent(id1, id2))" + "print(env.object_similarity(id1, id2))\n", + "print(env.object_equivalence(id1, id2, threshold=90))" ] }, { @@ -420,28 +701,26 @@ "source": [ "### Indicator Example\n", "\n", - "For Indicators, the only properties that contribute to semantic equivalence are `indicator_types`, `pattern`, and `valid_from`, with weights of 15, 80, and 5, respectively. In this example, the two indicators have patterns with different hashes but the same indicator_type and valid_from. For patterns, the algorithm currently only checks if they are identical." + "For Indicators, the only properties that contribute to semantic similarity are `indicator_types`, `pattern`, and `valid_from`, with weights of 15, 80, and 5, respectively. In this example, the two indicators have patterns with different hashes but the same indicator_type and valid_from. For patterns, the algorithm currently only checks if they are identical." ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": { "scrolled": true }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Indicator pattern equivalence is not fully defined; will default to zero if not completely identical\n" - ] - }, { "data": { "text/html": [ - "
False\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -535,7 +901,8 @@ " pattern=\"[file:hashes.MD5 = '79054025255fb1a26e4bc422aef54eb4']\",\n", " valid_from=\"2017-01-01T12:34:56Z\",\n", ")\n", - "print(env.semantically_equivalent(ind1, ind2))" + "print(env.object_similarity(ind1, ind2))\n", + "print(env.object_equivalence(ind1, ind2, threshold=90))" ] }, { @@ -551,12 +918,12 @@ "source": [ "### Location Example\n", "\n", - "For Locations, the only properties that contribute to semantic equivalence are `longitude`/`latitude`, `region`, and `country`, with weights of 34, 33, and 33, respectively. In this example, the two locations are Washington, D.C. and New York City. The algorithm computes the distance between two locations using the haversine formula and uses that to influence equivalence." + "For Locations, the only properties that contribute to semantic similarity are `longitude`/`latitude`, `region`, and `country`, with weights of 34, 33, and 33, respectively. In this example, the two locations are Washington, D.C. and New York City. The algorithm computes the distance between two locations using the haversine formula and uses that to influence similarity." ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": { "scrolled": true }, @@ -564,8 +931,13 @@ { "data": { "text/html": [ - "
False\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -655,7 +1114,8 @@ " latitude=40.713,\n", " longitude=-74.006,\n", ")\n", - "print(env.semantically_equivalent(loc1, loc2))" + "print(env.object_similarity(loc1, loc2))\n", + "print(env.object_equivalence(loc1, loc2, threshold=90))" ] }, { @@ -664,12 +1124,12 @@ "source": [ "### Malware Example\n", "\n", - "For Malware, the only properties that contribute to semantic equivalence are `malware_types` and `name`, with weights of 20 and 80, respectively. In this example, the two malware objects only differ in the strings in their malware_types lists. For lists, the algorithm bases its calculations on the intersection of the two lists. An empty intersection will result in a 0, and a complete intersection will result in a 1 for that property." + "For Malware, the only properties that contribute to semantic similarity are `malware_types` and `name`, with weights of 20 and 80, respectively. In this example, the two malware objects only differ in the strings in their malware_types lists. For lists, the algorithm bases its calculations on the intersection of the two lists. An empty intersection will result in a 0, and a complete intersection will result in a 1 for that property." ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": { "scrolled": true }, @@ -677,8 +1137,13 @@ { "data": { "text/html": [ - "
True\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -772,7 +1324,8 @@ " name=\"Cryptolocker\",\n", " is_family=False,\n", " )\n", - "print(env.semantically_equivalent(mal1, mal2))" + "print(env.object_similarity(mal1, mal2))\n", + "print(env.object_equivalence(mal1, mal2, threshold=90))" ] }, { @@ -781,12 +1334,12 @@ "source": [ "### Threat Actor Example\n", "\n", - "For Threat Actors, the only properties that contribute to semantic equivalence are `threat_actor_types`, `name`, and `aliases`, with weights of 20, 60, and 20, respectively. In this example, the two threat actors have the same id properties but everything else is different. Since the id property does not factor into semantic equivalence, the result is not very high. The result is not zero because of the \"Token Sort Ratio\" algorithm used to compare the `name` property." + "For Threat Actors, the only properties that contribute to semantic similarity are `threat_actor_types`, `name`, and `aliases`, with weights of 20, 60, and 20, respectively. In this example, the two threat actors have the same id properties but everything else is different. Since the id property does not factor into semantic similarity, the result is not very high. The result is not zero because of the \"Token Sort Ratio\" algorithm used to compare the `name` property." ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": { "scrolled": true }, @@ -794,8 +1347,13 @@ { "data": { "text/html": [ - "
6.6000000000000005\n",
+       ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */
6.66666666666667\n",
        "
\n" ], "text/plain": [ "" ] }, - "execution_count": 9, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/html": [ + "
False\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -889,7 +1534,8 @@ " name=\"James Bond\",\n", " aliases=[\"007\"],\n", ")\n", - "print(env.semantically_equivalent(ta1, ta2))" + "print(env.object_similarity(ta1, ta2))\n", + "print(env.object_equivalence(ta1, ta2, threshold=90))" ] }, { @@ -898,12 +1544,12 @@ "source": [ "### Tool Example\n", "\n", - "For Tools, the only properties that contribute to semantic equivalence are `tool_types` and `name`, with weights of 20 and 80, respectively. In this example, the two tools have the same values for properties that contribute to semantic equivalence but one has an additional, non-contributing property." + "For Tools, the only properties that contribute to semantic similarity are `tool_types` and `name`, with weights of 20 and 80, respectively. In this example, the two tools have the same values for properties that contribute to semantic similarity but one has an additional, non-contributing property." ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": { "scrolled": true }, @@ -911,120 +1557,13 @@ { "data": { "text/html": [ - "
100.0\n",
-       "
\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from stix2 import Tool\n", - "\n", - "t1 = Tool(\n", - " tool_types=[\"remote-access\"],\n", - " name=\"VNC\",\n", - ")\n", - "t2 = Tool(\n", - " tool_types=[\"remote-access\"],\n", - " name=\"VNC\",\n", - " description=\"This is a tool\"\n", - ")\n", - "print(env.semantically_equivalent(t1, t2))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Vulnerability Example\n", - "\n", - "For Vulnerabilities, the only properties that contribute to semantic equivalence are `name` and `external_references`, with weights of 30 and 70, respectively. In this example, the two vulnerabilities have the same name but one also has an external reference. The algorithm doesn't take into account any semantic equivalence contributing properties that are not present on both objects." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
0\n",
+       ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */
True\n",
        "
\n" ], "text/plain": [ "" ] }, - "execution_count": 13, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from stix2 import Report\n", + "from stix2 import Tool\n", "\n", - "r1 = Report(\n", - " report_types=[\"campaign\"],\n", - " name=\"Bad Cybercrime\",\n", - " published=\"2016-04-06T20:03:00.000Z\",\n", - " object_refs=[\"indicator--a740531e-63ff-4e49-a9e1-a0a3eed0e3e7\"],\n", + "t1 = Tool(\n", + " tool_types=[\"remote-access\"],\n", + " name=\"VNC\",\n", ")\n", - "r2 = Report(\n", - " report_types=[\"campaign\"],\n", - " name=\"Bad Cybercrime\",\n", - " published=\"2016-04-06T20:03:00.000Z\",\n", - " object_refs=[\"indicator--a740531e-63ff-4e49-a9e1-a0a3eed0e3e7\"],\n", + "t2 = Tool(\n", + " tool_types=[\"remote-access\"],\n", + " name=\"VNC\",\n", + " description=\"This is a tool\"\n", ")\n", - "print(env.semantically_equivalent(r1, r2))" + "print(env.object_similarity(t1, t2))\n", + "print(env.object_equivalence(t1, t2, threshold=90))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "By default, comparing objects of different spec versions will result in a `ValueError`." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "ename": "ValueError", - "evalue": "The objects to compare must be of the same spec version!", - "output_type": "error", - "traceback": [ - "\u001b[0;31mValueError\u001b[0m\u001b[0;31m:\u001b[0m The objects to compare must be of the same spec version!\n" - ] - } - ], - "source": [ - "from stix2.v20 import Identity as Identity20\n", + "### Vulnerability Example\n", "\n", - "id20 = Identity20(\n", - " name=\"John Smith\",\n", - " identity_class=\"individual\",\n", - ")\n", - "print(env.semantically_equivalent(id2, id20))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can optionally allow comparing across spec versions by providing a configuration dictionary using `ignore_spec_version` like in the next example:" + "For Vulnerabilities, the only properties that contribute to semantic similarity are `name` and `external_references`, with weights of 30 and 70, respectively. In this example, the two vulnerabilities have the same name but one also has an external reference. The algorithm doesn't take into account any semantic similarity contributing properties that are not present on both objects." ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
6.6000000000000005\n",
+       ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */
True\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from stix2 import Vulnerability\n", + "\n", + "vuln1 = Vulnerability(\n", + " name=\"Heartbleed\",\n", + " external_references=[\n", + " {\n", + " \"url\": \"https://example\",\n", + " \"source_name\": \"some-source\",\n", + " },\n", + " ],\n", + ")\n", + "vuln2 = Vulnerability(\n", + " name=\"Heartbleed\",\n", + ")\n", + "print(env.object_similarity(vuln1, vuln2))\n", + "print(env.object_equivalence(vuln1, vuln2, threshold=90))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Other Examples\n", + "\n", + "Comparing objects of different types will result in a `ValueError`." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "The objects to compare must be of the same type!", + "output_type": "error", + "traceback": [ + "\u001b[0;31mValueError\u001b[0m\u001b[0;31m:\u001b[0m The objects to compare must be of the same type!\n" + ] + } + ], + "source": [ + "print(env.object_similarity(ind1, vuln1))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Some object types do not have a defined method for calculating semantic similarity and by default will give a warning and a result of zero." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "'report' type has no 'weights' dict specified & thus no object similarity method to call!\n" + ] + }, + { + "data": { + "text/html": [ + "
0\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from stix2 import Report\n", + "\n", + "r1 = Report(\n", + " report_types=[\"campaign\"],\n", + " name=\"Bad Cybercrime\",\n", + " published=\"2016-04-06T20:03:00.000Z\",\n", + " object_refs=[\"indicator--a740531e-63ff-4e49-a9e1-a0a3eed0e3e7\"],\n", + ")\n", + "r2 = Report(\n", + " report_types=[\"campaign\"],\n", + " name=\"Bad Cybercrime\",\n", + " published=\"2016-04-06T20:03:00.000Z\",\n", + " object_refs=[\"indicator--a740531e-63ff-4e49-a9e1-a0a3eed0e3e7\"],\n", + ")\n", + "print(env.object_similarity(r1, r2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By default, comparing objects of different spec versions will result in a `ValueError`." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "The objects to compare must be of the same spec version!", + "output_type": "error", + "traceback": [ + "\u001b[0;31mValueError\u001b[0m\u001b[0;31m:\u001b[0m The objects to compare must be of the same spec version!\n" + ] + } + ], + "source": [ + "from stix2.v20 import Identity as Identity20\n", + "\n", + "id20 = Identity20(\n", + " name=\"John Smith\",\n", + " identity_class=\"individual\",\n", + ")\n", + "print(env.object_similarity(id2, id20))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can optionally allow comparing across spec versions by providing a configuration dictionary using `ignore_spec_version` like in the next example:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
100.0\n",
        "
\n" ], "text/plain": [ @@ -1515,6 +2236,132 @@ "output_type": "execute_result" } ], + "source": [ + "from stix2.v20 import Identity as Identity20\n", + "\n", + "id20 = Identity20(\n", + " name=\"John Smith\",\n", + " identity_class=\"individual\",\n", + ")\n", + "print(env.object_similarity(id2, id20, **{\"_internal\": {\"ignore_spec_version\": True}}))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Detailed Results\n", + "\n", + "If your logging level is set to `DEBUG` or higher, the function will log more detailed results. These show the semantic similarity and weighting for each property that is checked, to show how the final result was arrived at." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Starting object similarity process between: 'threat-actor--b70cc8d6-33d9-41e2-9beb-7bce7af08a76' and 'threat-actor--e325f521-ce0f-48b3-b4d5-65fa699c0e16'\n", + "--\t\tpartial_string_based 'Evil Org' 'James Bond'\tresult: '11.111111111111114'\n", + "'name' check -- weight: 60, contributing score: 6.666666666666669\n", + "--\t\tpartial_list_based '['crime-syndicate']' '['spy']'\tresult: '0.0'\n", + "'threat_actor_types' check -- weight: 20, contributing score: 0.0\n", + "--\t\tpartial_list_based '['super-evil']' '['007']'\tresult: '0.0'\n", + "'aliases' check -- weight: 20, contributing score: 0.0\n", + "Matching Score: 6.666666666666669, Sum of Weights: 100.0\n" + ] + }, + { + "data": { + "text/html": [ + "
6.66666666666667\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import logging\n", "logging.basicConfig(format='%(message)s')\n", @@ -1531,7 +2378,7 @@ " name=\"James Bond\",\n", " aliases=[\"007\"],\n", ")\n", - "print(env.semantically_equivalent(ta3, ta4))\n", + "print(env.object_similarity(ta3, ta4))\n", "\n", "logger.setLevel(logging.ERROR)" ] @@ -1540,23 +2387,28 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You can also retrieve the detailed results in a dictionary so the detailed results information can be accessed and used more programatically. The [semantically_equivalent()](../api/stix2.environment.rst#stix2.environment.Environment.semantically_equivalent) function takes an optional third argument, called `prop_scores`. This argument should be a dictionary into which the detailed debugging information will be stored.\n", + "You can also retrieve the detailed results in a dictionary so the detailed results information can be accessed and used more programatically. The [object_similarity()](../api/stix2.environment.rst#stix2.environment.Environment.object_similarity) function takes an optional third argument, called `prop_scores`. This argument should be a dictionary into which the detailed debugging information will be stored.\n", "\n", - "Using `prop_scores` is simple: simply pass in a dictionary to `semantically_equivalent()`, and after the function is done executing, the dictionary will have the various scores in it. Specifically, it will have the overall `matching_score` and `sum_weights`, along with the weight and contributing score for each of the semantic equivalence contributing properties.\n", + "Using `prop_scores` is simple: simply pass in a dictionary to [object_similarity()](../api/stix2.environment.rst#stix2.environment.Environment.object_similarity), and after the function has finished executing, the dictionary will contain the various scores. Specifically, it will have the overall `matching_score` and `sum_weights`, along with the weight and contributing score for each of the semantic similarity contributing properties.\n", "\n", "For example:" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
Semantic equivalence score using standard weights: 16.6\n",
+       ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */
Semantic equivalence score using standard weights: 16.666666666666668\n",
        "
\n" ], "text/plain": [ "" ] }, - "execution_count": 17, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/html": [ - "
{'name': {'weight': 60, 'contributing_score': 6.6}, 'threat_actor_types': {'weight': 20, 'contributing_score': 10.0}, 'aliases': {'weight': 20, 'contributing_score': 0.0}, 'matching_score': 16.6, 'sum_weights': 100.0}\n",
+       ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */
{'name': {'weight': 60, 'contributing_score': 6.666666666666669}, 'threat_actor_types': {'weight': 20, 'contributing_score': 10.0}, 'aliases': {'weight': 20, 'contributing_score': 0.0}, 'matching_score': 16.666666666666668, 'sum_weights': 100.0}\n",
        "
\n" ], "text/plain": [ "" ] }, - "execution_count": 17, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/html": [ - "
Prop: name | weight: 60 | contributing_score: 6.6\n",
+       ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */
Prop: name | weight: 60 | contributing_score: 6.666666666666669\n",
        "
\n" ], "text/plain": [ "" ] }, - "execution_count": 17, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/html": [ - "
matching_score: 16.6\n",
+       ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */
matching_score: 16.666666666666668\n",
        "
\n" ], "text/plain": [ "" ] }, - "execution_count": 17, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/html": [ - "
Using standard weights: 16.6\n",
+       ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */
Using standard weights: 16.666666666666668\n",
        "
\n" ], "text/plain": [ "" ] }, - "execution_count": 18, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/html": [ - "
Using custom weights: 28.300000000000004\n",
+       ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */
Using custom weights: 28.33333333333334\n",
        "
\n" ], "text/plain": [ "" ] }, - "execution_count": 18, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "weights = {\n", - " \"threat-actor\": { # You must specify the object type\n", - " \"name\": (30, stix2.environment.partial_string_based), # Each property's value must be a tuple\n", - " \"threat_actor_types\": (50, stix2.environment.partial_list_based), # The 1st component must be the weight\n", - " \"aliases\": (20, stix2.environment.partial_list_based) # The 2nd component must be the comparison function\n", + " \"threat-actor\": { # You must specify the object type\n", + " \"name\": (30, stix2.equivalence.object.partial_string_based), # Each property's value must be a tuple\n", + " \"threat_actor_types\": (50, stix2.equivalence.object.partial_list_based), # The 1st component must be the weight\n", + " \"aliases\": (20, stix2.equivalence.object.partial_list_based) # The 2nd component must be the comparison function\n", " }\n", "}\n", "\n", - "print(\"Using standard weights: %s\" % (env.semantically_equivalent(ta5, ta6)))\n", - "print(\"Using custom weights: %s\" % (env.semantically_equivalent(ta5, ta6, **weights)))" + "print(\"Using standard weights: %s\" % (env.object_similarity(ta5, ta6)))\n", + "print(\"Using custom weights: %s\" % (env.object_similarity(ta5, ta6, **weights)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Notice how there is a difference in the semantic equivalence scores, simply due to the fact that custom weights were used.\n", + "Notice how there is a difference in the semantic similarity scores, simply due to the fact that custom weights were used.\n", "\n", "#### Custom Weights With prop_scores\n", - "If we want to use both `prop_scores` and `weights`, then they would be the third and fourth arguments, respectively, to `sematically_equivalent()`:" + "If we want to use both `prop_scores` and `weights`, then they would be the third and fourth arguments, respectively, to [object_similarity()](../api/stix2.environment.rst#stix2.environment.Environment.object_similarity):" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "9.95" + "10.000000000000002" ] }, - "execution_count": 19, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/html": [ - "
{'name': {'weight': 45, 'contributing_score': 4.95}, 'threat_actor_types': {'weight': 10, 'contributing_score': 5.0}, 'aliases': {'weight': 45, 'contributing_score': 0.0}, 'matching_score': 9.95, 'sum_weights': 100.0}\n",
+       ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */
{'name': {'weight': 45, 'contributing_score': 5.000000000000002}, 'threat_actor_types': {'weight': 10, 'contributing_score': 5.0}, 'aliases': {'weight': 45, 'contributing_score': 0.0}, 'matching_score': 10.000000000000002, 'sum_weights': 100.0}\n",
        "
\n" ], "text/plain": [ "" ] }, - "execution_count": 19, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -2475,12 +3372,12 @@ "prop_scores = {}\n", "weights = {\n", " \"threat-actor\": {\n", - " \"name\": (45, stix2.environment.partial_string_based),\n", - " \"threat_actor_types\": (10, stix2.environment.partial_list_based),\n", - " \"aliases\": (45, stix2.environment.partial_list_based),\n", + " \"name\": (45, stix2.equivalence.object.partial_string_based),\n", + " \"threat_actor_types\": (10, stix2.equivalence.object.partial_list_based),\n", + " \"aliases\": (45, stix2.equivalence.object.partial_list_based),\n", " },\n", "}\n", - "env.semantically_equivalent(ta5, ta6, prop_scores, **weights)\n", + "env.object_similarity(ta5, ta6, prop_scores, **weights)\n", "print(prop_scores)" ] }, @@ -2488,20 +3385,25 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Custom Semantic Equivalence Functions\n", + "#### Custom Semantic Similarity Functions\n", "You can also write and use your own semantic equivalence functions. In the examples above, you could replace the built-in comparison functions for any or all properties. For example, here we use a custom string comparison function just for the `'name'` property:" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
Using standard weights: 16.6\n",
+       ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */
Using standard weights: 16.666666666666668\n",
        "
\n" ], "text/plain": [ "" ] }, - "execution_count": 21, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/html": [ - "
Using a custom method: 6.6000000000000005\n",
+       ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */
Using a custom method: 6.66666666666667\n",
        "
\n" ], "text/plain": [ "" ] }, - "execution_count": 21, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "def custom_semantic_equivalence_method(obj1, obj2, **weights):\n", + "def custom_semantic_similarity_method(obj1, obj2, **weights):\n", " sum_weights = 0\n", " matching_score = 0\n", " # Compare name\n", " w = weights['name']\n", " sum_weights += w\n", - " contributing_score = w * stix2.environment.partial_string_based(obj1['name'], obj2['name'])\n", + " contributing_score = w * stix2.equivalence.object.partial_string_based(obj1['name'], obj2['name'])\n", " matching_score += contributing_score\n", " # Compare aliases only for spies\n", " if 'spy' in obj1['threat_actor_types'] + obj2['threat_actor_types']:\n", " w = weights['aliases']\n", " sum_weights += w\n", - " contributing_score = w * stix2.environment.partial_list_based(obj1['aliases'], obj2['aliases'])\n", + " contributing_score = w * stix2.equivalence.object.partial_list_based(obj1['aliases'], obj2['aliases'])\n", " matching_score += contributing_score\n", " \n", " return matching_score, sum_weights\n", @@ -2802,31 +3714,36 @@ " \"threat-actor\": {\n", " \"name\": 60,\n", " \"aliases\": 40,\n", - " \"method\": custom_semantic_equivalence_method\n", + " \"method\": custom_semantic_similarity_method\n", " }\n", "}\n", "\n", - "print(\"Using standard weights: %s\" % (env.semantically_equivalent(ta5, ta6)))\n", - "print(\"Using a custom method: %s\" % (env.semantically_equivalent(ta5, ta6, **weights)))" + "print(\"Using standard weights: %s\" % (env.object_similarity(ta5, ta6)))\n", + "print(\"Using a custom method: %s\" % (env.object_similarity(ta5, ta6, **weights)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "You can also write custom functions for comparing objects of your own custom types. Like in the previous example, you can use the built-in functions listed above to help with this, or write your own. In the following example we define semantic equivalence for our new `x-foobar` object type. Notice that this time we have included support for detailed results with `prop_scores`." + "You can also write custom functions for comparing objects of your own custom types. Like in the previous example, you can use the built-in functions listed above to help with this, or write your own. In the following example we define semantic similarity for our new `x-foobar` object type. Notice that this time we have included support for detailed results with `prop_scores`." ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
71.6\n",
+       ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */
71.42857142857143\n",
        "
\n" ], "text/plain": [ "" ] }, - "execution_count": 22, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/html": [ - "
{'name': (60, 60.0), 'color': (40, 11.6), 'matching_score': 71.6, 'sum_weights': 100.0}\n",
+       ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */
{'name': (60, 60.0), 'color': (40, 11.428571428571427), 'matching_score': 71.42857142857143, 'sum_weights': 100.0}\n",
        "
\n" ], "text/plain": [ "" ] }, - "execution_count": 22, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -2991,16 +3913,16 @@ "def _x_foobar_checks(obj1, obj2, prop_scores, **weights):\n", " matching_score = 0.0\n", " sum_weights = 0.0\n", - " if stix2.environment.check_property_present(\"name\", obj1, obj2):\n", + " if stix2.equivalence.object.check_property_present(\"name\", obj1, obj2):\n", " w = weights[\"name\"]\n", " sum_weights += w\n", - " contributing_score = w * stix2.environment.partial_string_based(obj1[\"name\"], obj2[\"name\"])\n", + " contributing_score = w * stix2.equivalence.object.partial_string_based(obj1[\"name\"], obj2[\"name\"])\n", " matching_score += contributing_score\n", " prop_scores[\"name\"] = (w, contributing_score)\n", - " if stix2.environment.check_property_present(\"color\", obj1, obj2):\n", + " if stix2.equivalence.object.check_property_present(\"color\", obj1, obj2):\n", " w = weights[\"color\"]\n", " sum_weights += w\n", - " contributing_score = w * stix2.environment.partial_string_based(obj1[\"color\"], obj2[\"color\"])\n", + " contributing_score = w * stix2.equivalence.object.partial_string_based(obj1[\"color\"], obj2[\"color\"])\n", " matching_score += contributing_score\n", " prop_scores[\"color\"] = (w, contributing_score)\n", " \n", @@ -3031,7 +3953,7 @@ " \"name\": \"Zot\",\n", " \"color\": \"blue\",\n", "}\n", - "print(env.semantically_equivalent(foo1, foo2, prop_scores, **weights))\n", + "print(env.object_similarity(foo1, foo2, prop_scores, **weights))\n", "print(prop_scores)" ] } From 230852895731e26839307dfa0e5a44d4064a6662 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Fri, 19 Feb 2021 14:48:23 -0500 Subject: [PATCH 25/27] add text and example for equivalence.ipynb --- docs/guide/equivalence.ipynb | 430 ++++++++++++++++++++++++++++++++++- 1 file changed, 428 insertions(+), 2 deletions(-) diff --git a/docs/guide/equivalence.ipynb b/docs/guide/equivalence.ipynb index 7427747..5ac460a 100644 --- a/docs/guide/equivalence.ipynb +++ b/docs/guide/equivalence.ipynb @@ -57,11 +57,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Checking Semantic Equivalence\n", + "## Checking Object Similarity and Equivalence\n", "\n", "The [Environment](../api/stix2.environment.rst#stix2.environment.Environment) has functions for checking if two STIX Objects are very similar or identical. The functions differentiate between equivalence, which is a binary concept (two things are either equivalent or they are not), and similarity, which is a continuum (an object can be more similar to one object than to another). The similarity function answers the question, “How similar are these two objects?” while the equivalence function uses the similarity function to answer the question, “Are these two objects equivalent?”\n", "\n", - "For each supported object type, the [object_similarity()](../api/stix2.environment.rst#stix2.environment.Environment.object_similarity) function checks if the values for a specific set of properties match. Then each matching property is weighted since every property doesn't represent the same level of importance for semantic similarity. The result will be the sum of these weighted values, in the range of 0 to 100. A result of 0 means that the the two objects are not equivalent, and a result of 100 means that they are equivalent. Values in between mean the two objects are more or less similar, and can be used to determine if they should be considered equivalent or not. 
The [object_equivalence()](../api/stix2.environment.rst#stix2.environment.Environment.object_equivalence) calls [object_similarity()](../api/stix2.environment.rst#stix2.environment.Environment.object_similarity) and compares the result to a threshold to determine if the objects are equivalent. Different organizations or users may use different thresholds.\n",
+    "For each supported object type, the [object_similarity()](../api/stix2.environment.rst#stix2.environment.Environment.object_similarity) function checks if the values for a specific set of properties match. Then each matching property is weighted since every property does not represent the same level of importance for semantic similarity. The result will be the sum of these weighted values, in the range of 0 to 100. A result of 0 means that the two objects are not equivalent, and a result of 100 means that they are equivalent. Values in between mean the two objects are more or less similar and can be used to determine if they should be considered equivalent or not. The [object_equivalence()](../api/stix2.environment.rst#stix2.environment.Environment.object_equivalence) calls [object_similarity()](../api/stix2.environment.rst#stix2.environment.Environment.object_similarity) and compares the result to a threshold to determine if the objects are equivalent. Different organizations or users may use different thresholds.\n",
     "\n",
     "TODO: Add a link to the committee note when it is released.\n",
     "\n",
@@ -3956,6 +3956,432 @@
     "print(env.object_similarity(foo1, foo2, prop_scores, **weights))\n",
     "print(prop_scores)"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Checking Graph Similarity and Equivalence\n",
+    "\n",
+    "The next logical step for checking if two individual objects are similar or equivalent is to check all relevant neighbors or equal type objects for the best match. It can help you determine if you have seen similar intelligence in the past and builds upon the same foundation of the local object similarity comparisons. The [Environment](../api/stix2.environment.rst#stix2.environment.Environment) has two functions with similar requirements for graph-based checks.\n",
+    "\n",
+    "For each supported object type, the [graph_similarity()](../api/stix2.environment.rst#stix2.environment.Environment.graph_similarity) function checks if the values for a specific set of objects to match and will compare against all of the same type objects maximizing for score obtained from the properties match. It requires two DataStore instances which will serve as our graph representation and will allow the algorithm to make additional checks like de-referencing objects. Internally it calls [object_similarity()](../api/stix2.environment.rst#stix2.environment.Environment.object_similarity). \n",
+    "\n",
+    "Some limitations exist in the implementation that are important to be understood by those analyzing the results of this algorithm.\n",
+    "- Only STIX types with weights defined will be checked. This could result in a maximal sub-graph and score that is smaller than expected. We recommend looking at the prop_scores or logging output for details and to understand how the result was calculated.\n",
+    "- Failure to de-reference an object for checks will result in a 0 for that property. This applies to `*_ref` or `*_refs` properties.\n",
+    "- Keep reasonable expectations in running-time, especially with DataStores that require network communication or when the number of items in the graphs is high. 
You can also tune how much depth the algorithm should check in de-reference calls; this can affect your running-time.\n", + "\n", + "**Please note** that you will need to install the TAXII dependencies in addition to the semantic requirements if you plan on using the TAXII DataStore classes. You can do this using:\n", + "\n", + "```pip install stix2[taxii]```\n", + "\n", + "#### Graph Similarity and Equivalence Example\n", + "\n", + "By default, it use default weights defined here [object_similarity()](../api/stix2.environment.rst#stix2.environment.Environment.object_similarity) in combination with [graph_similarity()](../api/stix2.environment.rst#stix2.environment.Environment.graph_similarity)." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "59.68831168831168\n", + "False\n", + "{\n", + " \"matching_score\": 835.6363636363635,\n", + " \"len_pairs\": 14,\n", + " \"summary\": {\n", + " \"campaign--a8c85d5d-bdc6-4613-8e0b-b836ff450c28\": {\n", + " \"lhs\": \"campaign--a8c85d5d-bdc6-4613-8e0b-b836ff450c28\",\n", + " \"rhs\": \"campaign--caf3f196-1d91-4b87-9f3b-855967af6782\",\n", + " \"prop_score\": {\n", + " \"name\": {\n", + " \"weight\": 60,\n", + " \"contributing_score\": 18.000000000000004\n", + " },\n", + " \"matching_score\": 18.000000000000004,\n", + " \"sum_weights\": 60.0\n", + " },\n", + " \"value\": 30.000000000000004\n", + " },\n", + " \"campaign--caf3f196-1d91-4b87-9f3b-855967af6782\": {\n", + " \"lhs\": \"campaign--caf3f196-1d91-4b87-9f3b-855967af6782\",\n", + " \"rhs\": \"campaign--a8c85d5d-bdc6-4613-8e0b-b836ff450c28\",\n", + " \"prop_score\": {\n", + " \"name\": {\n", + " \"weight\": 60,\n", + " \"contributing_score\": 18.000000000000004\n", + " },\n", + " \"matching_score\": 18.000000000000004,\n", + " \"sum_weights\": 60.0\n", + " },\n", + " \"value\": 30.000000000000004\n", + " },\n", + " \"attack-pattern--eb837f70-9798-4907-8c8a-bf883f7f4ec3\": {\n", + " \"lhs\": \"attack-pattern--eb837f70-9798-4907-8c8a-bf883f7f4ec3\",\n", + " \"rhs\": \"attack-pattern--94caa050-50d1-4c20-9891-a1b9f47d2448\",\n", + " \"prop_score\": {\n", + " \"name\": {\n", + " \"weight\": 30,\n", + " \"contributing_score\": 21.81818181818182\n", + " },\n", + " \"external_references\": {\n", + " \"weight\": 70,\n", + " \"contributing_score\": 70.0\n", + " },\n", + " \"matching_score\": 91.81818181818181,\n", + " \"sum_weights\": 100.0\n", + " },\n", + " \"value\": 91.81818181818181\n", + " },\n", + " \"attack-pattern--94caa050-50d1-4c20-9891-a1b9f47d2448\": {\n", + " \"lhs\": \"attack-pattern--94caa050-50d1-4c20-9891-a1b9f47d2448\",\n", + " \"rhs\": \"attack-pattern--eb837f70-9798-4907-8c8a-bf883f7f4ec3\",\n", + " \"prop_score\": {\n", + " \"name\": {\n", + " \"weight\": 30,\n", + " \"contributing_score\": 21.81818181818182\n", + " },\n", + " \"external_references\": {\n", + " \"weight\": 70,\n", + " \"contributing_score\": 70.0\n", + " },\n", + " \"matching_score\": 91.81818181818181,\n", + " \"sum_weights\": 100.0\n", + " },\n", + " \"value\": 91.81818181818181\n", + " },\n", + " \"identity--8d29b554-9904-430c-bc78-82c97750350a\": {\n", + " \"lhs\": \"identity--8d29b554-9904-430c-bc78-82c97750350a\",\n", + " \"rhs\": \"identity--4a4daf92-7c94-407c-a303-3a51924c32a0\",\n", + " \"prop_score\": {\n", + " \"name\": {\n", + " \"weight\": 60,\n", + " \"contributing_score\": 60.0\n", + " },\n", + " \"identity_class\": {\n", + " \"weight\": 20,\n", + " \"contributing_score\": 20.0\n", + " },\n", + 
" \"matching_score\": 80.0,\n", + " \"sum_weights\": 80.0\n", + " },\n", + " \"value\": 100.0\n", + " },\n", + " \"identity--4a4daf92-7c94-407c-a303-3a51924c32a0\": {\n", + " \"lhs\": \"identity--4a4daf92-7c94-407c-a303-3a51924c32a0\",\n", + " \"rhs\": \"identity--8d29b554-9904-430c-bc78-82c97750350a\",\n", + " \"prop_score\": {\n", + " \"name\": {\n", + " \"weight\": 60,\n", + " \"contributing_score\": 60.0\n", + " },\n", + " \"identity_class\": {\n", + " \"weight\": 20,\n", + " \"contributing_score\": 20.0\n", + " },\n", + " \"matching_score\": 80.0,\n", + " \"sum_weights\": 80.0\n", + " },\n", + " \"value\": 100.0\n", + " },\n", + " \"threat-actor--8e2e2d2b-17d4-4cbf-938f-98ee46b3cd3f\": {\n", + " \"lhs\": \"threat-actor--8e2e2d2b-17d4-4cbf-938f-98ee46b3cd3f\",\n", + " \"rhs\": \"threat-actor--8e2e2d2b-17d4-4cbf-938f-98ee46b3cd3f\",\n", + " \"prop_score\": {\n", + " \"name\": {\n", + " \"weight\": 60,\n", + " \"contributing_score\": 6.66666666666667\n", + " },\n", + " \"threat_actor_types\": {\n", + " \"weight\": 20,\n", + " \"contributing_score\": 0.0\n", + " },\n", + " \"aliases\": {\n", + " \"weight\": 20,\n", + " \"contributing_score\": 0.0\n", + " },\n", + " \"matching_score\": 6.66666666666667,\n", + " \"sum_weights\": 100.0\n", + " },\n", + " \"value\": 6.66666666666667\n", + " },\n", + " \"indicator--9884b67b-5a83-4377-a941-1821f705a6aa\": {\n", + " \"lhs\": \"indicator--9884b67b-5a83-4377-a941-1821f705a6aa\",\n", + " \"rhs\": \"indicator--94467f25-2857-4d86-9c4d-f7a5a2cd20f4\",\n", + " \"prop_score\": {\n", + " \"indicator_types\": {\n", + " \"weight\": 15,\n", + " \"contributing_score\": 15.0\n", + " },\n", + " \"pattern\": {\n", + " \"weight\": 80,\n", + " \"contributing_score\": 0\n", + " },\n", + " \"valid_from\": {\n", + " \"weight\": 5,\n", + " \"contributing_score\": 5.0\n", + " },\n", + " \"matching_score\": 20.0,\n", + " \"sum_weights\": 100.0\n", + " },\n", + " \"value\": 20.0\n", + " },\n", + " \"indicator--94467f25-2857-4d86-9c4d-f7a5a2cd20f4\": {\n", + " \"lhs\": \"indicator--94467f25-2857-4d86-9c4d-f7a5a2cd20f4\",\n", + " \"rhs\": \"indicator--9884b67b-5a83-4377-a941-1821f705a6aa\",\n", + " \"prop_score\": {\n", + " \"indicator_types\": {\n", + " \"weight\": 15,\n", + " \"contributing_score\": 15.0\n", + " },\n", + " \"pattern\": {\n", + " \"weight\": 80,\n", + " \"contributing_score\": 0\n", + " },\n", + " \"valid_from\": {\n", + " \"weight\": 5,\n", + " \"contributing_score\": 5.0\n", + " },\n", + " \"matching_score\": 20.0,\n", + " \"sum_weights\": 100.0\n", + " },\n", + " \"value\": 20.0\n", + " },\n", + " \"report--5205b115-eb30-4ec3-89df-bd0b7ab3da7d\": {\n", + " \"lhs\": \"report--5205b115-eb30-4ec3-89df-bd0b7ab3da7d\",\n", + " \"rhs\": \"report--230450a3-f484-4555-ab1a-9bd67665d359\",\n", + " \"prop_score\": {\n", + " \"name\": {\n", + " \"weight\": 30,\n", + " \"contributing_score\": 30.0\n", + " },\n", + " \"published\": {\n", + " \"weight\": 10,\n", + " \"contributing_score\": 10.0\n", + " },\n", + " \"object_refs\": {\n", + " \"weight\": 60,\n", + " \"contributing_score\": 29.0\n", + " },\n", + " \"matching_score\": 69.0,\n", + " \"sum_weights\": 100.0\n", + " },\n", + " \"value\": 69.0\n", + " },\n", + " \"report--230450a3-f484-4555-ab1a-9bd67665d359\": {\n", + " \"lhs\": \"report--230450a3-f484-4555-ab1a-9bd67665d359\",\n", + " \"rhs\": \"report--5205b115-eb30-4ec3-89df-bd0b7ab3da7d\",\n", + " \"prop_score\": {\n", + " \"name\": {\n", + " \"weight\": 30,\n", + " \"contributing_score\": 30.0\n", + " },\n", + " \"published\": {\n", + " \"weight\": 
10,\n", + " \"contributing_score\": 10.0\n", + " },\n", + " \"object_refs\": {\n", + " \"weight\": 60,\n", + " \"contributing_score\": 29.0\n", + " },\n", + " \"matching_score\": 69.0,\n", + " \"sum_weights\": 100.0\n", + " },\n", + " \"value\": 69.0\n", + " },\n", + " \"malware--9c4638ec-f1de-4ddb-abf4-1b760417654e\": {\n", + " \"lhs\": \"malware--9c4638ec-f1de-4ddb-abf4-1b760417654e\",\n", + " \"rhs\": \"malware--9c4638ec-f1de-4ddb-abf4-1b760417654e\",\n", + " \"prop_score\": {\n", + " \"malware_types\": {\n", + " \"weight\": 20,\n", + " \"contributing_score\": 10.0\n", + " },\n", + " \"name\": {\n", + " \"weight\": 80,\n", + " \"contributing_score\": 80.0\n", + " },\n", + " \"matching_score\": 90.0,\n", + " \"sum_weights\": 100.0\n", + " },\n", + " \"value\": 90.0\n", + " },\n", + " \"relationship--c9dea34f-fe7c-43a1-a496-766a4290d63d\": {\n", + " \"lhs\": \"relationship--c9dea34f-fe7c-43a1-a496-766a4290d63d\",\n", + " \"rhs\": \"relationship--2e8ec6c1-7934-416c-a471-d572ec84e1e7\",\n", + " \"prop_score\": {\n", + " \"relationship_type\": {\n", + " \"weight\": 20,\n", + " \"contributing_score\": 20.0\n", + " },\n", + " \"source_ref\": {\n", + " \"weight\": 40,\n", + " \"contributing_score\": 2.666666666666668\n", + " },\n", + " \"target_ref\": {\n", + " \"weight\": 40,\n", + " \"contributing_score\": 36.0\n", + " },\n", + " \"matching_score\": 58.66666666666667,\n", + " \"sum_weights\": 100.0\n", + " },\n", + " \"value\": 58.666666666666664\n", + " },\n", + " \"relationship--2e8ec6c1-7934-416c-a471-d572ec84e1e7\": {\n", + " \"lhs\": \"relationship--2e8ec6c1-7934-416c-a471-d572ec84e1e7\",\n", + " \"rhs\": \"relationship--c9dea34f-fe7c-43a1-a496-766a4290d63d\",\n", + " \"prop_score\": {\n", + " \"relationship_type\": {\n", + " \"weight\": 20,\n", + " \"contributing_score\": 20.0\n", + " },\n", + " \"source_ref\": {\n", + " \"weight\": 40,\n", + " \"contributing_score\": 2.666666666666668\n", + " },\n", + " \"target_ref\": {\n", + " \"weight\": 40,\n", + " \"contributing_score\": 36.0\n", + " },\n", + " \"matching_score\": 58.66666666666667,\n", + " \"sum_weights\": 100.0\n", + " },\n", + " \"value\": 58.666666666666664\n", + " }\n", + " }\n", + "}\n" + ] + } + ], + "source": [ + "import json\n", + "\n", + "from stix2 import Relationship\n", + "\n", + "\n", + "g1 = [\n", + " AttackPattern(\n", + " name=\"Phishing\",\n", + " external_references=[\n", + " {\n", + " \"url\": \"https://example2\",\n", + " \"source_name\": \"some-source2\",\n", + " },\n", + " ],\n", + " ),\n", + " Campaign(name=\"Someone Attacks Somebody\"),\n", + " Identity(\n", + " name=\"John Smith\",\n", + " identity_class=\"individual\",\n", + " description=\"Just some guy\",\n", + " ),\n", + " Indicator(\n", + " indicator_types=['malicious-activity'],\n", + " pattern_type=\"stix\",\n", + " pattern=\"[file:hashes.MD5 = 'd41d8cd98f00b204e9800998ecf8427e']\",\n", + " valid_from=\"2017-01-01T12:34:56Z\",\n", + " ),\n", + " Malware(id=MALWARE_ID,\n", + " malware_types=['ransomware'],\n", + " name=\"Cryptolocker\",\n", + " is_family=False,\n", + " ),\n", + " ThreatActor(id=THREAT_ACTOR_ID,\n", + " threat_actor_types=[\"crime-syndicate\"],\n", + " name=\"Evil Org\",\n", + " aliases=[\"super-evil\"],\n", + " ),\n", + " Relationship(\n", + " source_ref=THREAT_ACTOR_ID,\n", + " target_ref=MALWARE_ID,\n", + " relationship_type=\"uses\",\n", + " ),\n", + " Report(\n", + " report_types=[\"campaign\"],\n", + " name=\"Bad Cybercrime\",\n", + " published=\"2016-04-06T20:03:00.000Z\",\n", + " object_refs=[THREAT_ACTOR_ID, MALWARE_ID],\n", 
+ " ),\n", + "]\n", + "\n", + "g2 = [\n", + " AttackPattern(\n", + " name=\"Spear phishing\",\n", + " external_references=[\n", + " {\n", + " \"url\": \"https://example2\",\n", + " \"source_name\": \"some-source2\",\n", + " },\n", + " ],\n", + " ),\n", + " Campaign(name=\"Another Campaign\"),\n", + " Identity(\n", + " name=\"John Smith\",\n", + " identity_class=\"individual\",\n", + " description=\"A person\",\n", + " ),\n", + " Indicator(\n", + " indicator_types=['malicious-activity'],\n", + " pattern_type=\"stix\",\n", + " pattern=\"[file:hashes.MD5 = '79054025255fb1a26e4bc422aef54eb4']\",\n", + " valid_from=\"2017-01-01T12:34:56Z\",\n", + " ),\n", + " Malware(id=MALWARE_ID,\n", + " malware_types=['ransomware', 'dropper'],\n", + " name=\"Cryptolocker\",\n", + " is_family=False,\n", + " ),\n", + " ThreatActor(id=THREAT_ACTOR_ID,\n", + " threat_actor_types=[\"spy\"],\n", + " name=\"James Bond\",\n", + " aliases=[\"007\"],\n", + " ),\n", + " Relationship(\n", + " source_ref=THREAT_ACTOR_ID,\n", + " target_ref=MALWARE_ID,\n", + " relationship_type=\"uses\",\n", + " ),\n", + " Report(\n", + " report_types=[\"campaign\"],\n", + " name=\"Bad Cybercrime\",\n", + " published=\"2016-04-06T20:03:00.000Z\",\n", + " object_refs=[THREAT_ACTOR_ID, MALWARE_ID],\n", + " ),\n", + "]\n", + "\n", + "\n", + "weights = {\n", + " \"_internal\": {\n", + " \"ignore_spec_version\": False,\n", + " \"versioning_checks\": False,\n", + " \"max_depth\": 1,\n", + " },\n", + "}\n", + "\n", + "memstore1 = MemoryStore(g1)\n", + "memstore2 = MemoryStore(g2)\n", + "prop_scores = {}\n", + "\n", + "similarity_result = env.graph_similarity(memstore1, memstore2, prop_scores, **weights)\n", + "equivalence_result = env.graph_equivalence(memstore1, memstore2, threshold=60)\n", + "\n", + "print(similarity_result)\n", + "print(equivalence_result)\n", + "print(json.dumps(prop_scores, indent=4, sort_keys=False))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The example above uses the same objects found in this guide to demonstrate the graph similarity and equivalence use. Under this approach Grouping, Relationship, Report and Sighting have default weights defined allowing object de-reference. The Report and Relationship objects respectively show their `*_ref` and `*_refs` properties checked in the summary output. Analyzing the similarity output we can observe that objects checked individually rated high, but as we take into account the rest of the graph discrepancies add up and produced a lower score." + ] } ], "metadata": { From ecf952c07babd27f67306baf814b26d99cbac598 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Fri, 19 Feb 2021 14:50:52 -0500 Subject: [PATCH 26/27] it updated itself... 
--- docs/guide/equivalence.ipynb | 792 +++++++++++++++++++++++------------ 1 file changed, 522 insertions(+), 270 deletions(-) diff --git a/docs/guide/equivalence.ipynb b/docs/guide/equivalence.ipynb index 5ac460a..44e2e4d 100644 --- a/docs/guide/equivalence.ipynb +++ b/docs/guide/equivalence.ipynb @@ -3987,276 +3987,528 @@ "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "59.68831168831168\n", - "False\n", - "{\n", - " \"matching_score\": 835.6363636363635,\n", - " \"len_pairs\": 14,\n", - " \"summary\": {\n", - " \"campaign--a8c85d5d-bdc6-4613-8e0b-b836ff450c28\": {\n", - " \"lhs\": \"campaign--a8c85d5d-bdc6-4613-8e0b-b836ff450c28\",\n", - " \"rhs\": \"campaign--caf3f196-1d91-4b87-9f3b-855967af6782\",\n", - " \"prop_score\": {\n", - " \"name\": {\n", - " \"weight\": 60,\n", - " \"contributing_score\": 18.000000000000004\n", - " },\n", - " \"matching_score\": 18.000000000000004,\n", - " \"sum_weights\": 60.0\n", - " },\n", - " \"value\": 30.000000000000004\n", - " },\n", - " \"campaign--caf3f196-1d91-4b87-9f3b-855967af6782\": {\n", - " \"lhs\": \"campaign--caf3f196-1d91-4b87-9f3b-855967af6782\",\n", - " \"rhs\": \"campaign--a8c85d5d-bdc6-4613-8e0b-b836ff450c28\",\n", - " \"prop_score\": {\n", - " \"name\": {\n", - " \"weight\": 60,\n", - " \"contributing_score\": 18.000000000000004\n", - " },\n", - " \"matching_score\": 18.000000000000004,\n", - " \"sum_weights\": 60.0\n", - " },\n", - " \"value\": 30.000000000000004\n", - " },\n", - " \"attack-pattern--eb837f70-9798-4907-8c8a-bf883f7f4ec3\": {\n", - " \"lhs\": \"attack-pattern--eb837f70-9798-4907-8c8a-bf883f7f4ec3\",\n", - " \"rhs\": \"attack-pattern--94caa050-50d1-4c20-9891-a1b9f47d2448\",\n", - " \"prop_score\": {\n", - " \"name\": {\n", - " \"weight\": 30,\n", - " \"contributing_score\": 21.81818181818182\n", - " },\n", - " \"external_references\": {\n", - " \"weight\": 70,\n", - " \"contributing_score\": 70.0\n", - " },\n", - " \"matching_score\": 91.81818181818181,\n", - " \"sum_weights\": 100.0\n", - " },\n", - " \"value\": 91.81818181818181\n", - " },\n", - " \"attack-pattern--94caa050-50d1-4c20-9891-a1b9f47d2448\": {\n", - " \"lhs\": \"attack-pattern--94caa050-50d1-4c20-9891-a1b9f47d2448\",\n", - " \"rhs\": \"attack-pattern--eb837f70-9798-4907-8c8a-bf883f7f4ec3\",\n", - " \"prop_score\": {\n", - " \"name\": {\n", - " \"weight\": 30,\n", - " \"contributing_score\": 21.81818181818182\n", - " },\n", - " \"external_references\": {\n", - " \"weight\": 70,\n", - " \"contributing_score\": 70.0\n", - " },\n", - " \"matching_score\": 91.81818181818181,\n", - " \"sum_weights\": 100.0\n", - " },\n", - " \"value\": 91.81818181818181\n", - " },\n", - " \"identity--8d29b554-9904-430c-bc78-82c97750350a\": {\n", - " \"lhs\": \"identity--8d29b554-9904-430c-bc78-82c97750350a\",\n", - " \"rhs\": \"identity--4a4daf92-7c94-407c-a303-3a51924c32a0\",\n", - " \"prop_score\": {\n", - " \"name\": {\n", - " \"weight\": 60,\n", - " \"contributing_score\": 60.0\n", - " },\n", - " \"identity_class\": {\n", - " \"weight\": 20,\n", - " \"contributing_score\": 20.0\n", - " },\n", - " \"matching_score\": 80.0,\n", - " \"sum_weights\": 80.0\n", - " },\n", - " \"value\": 100.0\n", - " },\n", - " \"identity--4a4daf92-7c94-407c-a303-3a51924c32a0\": {\n", - " \"lhs\": \"identity--4a4daf92-7c94-407c-a303-3a51924c32a0\",\n", - " \"rhs\": \"identity--8d29b554-9904-430c-bc78-82c97750350a\",\n", - " \"prop_score\": {\n", - " \"name\": {\n", - " \"weight\": 60,\n", - " \"contributing_score\": 60.0\n", - " },\n", - " 
\"identity_class\": {\n", - " \"weight\": 20,\n", - " \"contributing_score\": 20.0\n", - " },\n", - " \"matching_score\": 80.0,\n", - " \"sum_weights\": 80.0\n", - " },\n", - " \"value\": 100.0\n", - " },\n", - " \"threat-actor--8e2e2d2b-17d4-4cbf-938f-98ee46b3cd3f\": {\n", - " \"lhs\": \"threat-actor--8e2e2d2b-17d4-4cbf-938f-98ee46b3cd3f\",\n", - " \"rhs\": \"threat-actor--8e2e2d2b-17d4-4cbf-938f-98ee46b3cd3f\",\n", - " \"prop_score\": {\n", - " \"name\": {\n", - " \"weight\": 60,\n", - " \"contributing_score\": 6.66666666666667\n", - " },\n", - " \"threat_actor_types\": {\n", - " \"weight\": 20,\n", - " \"contributing_score\": 0.0\n", - " },\n", - " \"aliases\": {\n", - " \"weight\": 20,\n", - " \"contributing_score\": 0.0\n", - " },\n", - " \"matching_score\": 6.66666666666667,\n", - " \"sum_weights\": 100.0\n", - " },\n", - " \"value\": 6.66666666666667\n", - " },\n", - " \"indicator--9884b67b-5a83-4377-a941-1821f705a6aa\": {\n", - " \"lhs\": \"indicator--9884b67b-5a83-4377-a941-1821f705a6aa\",\n", - " \"rhs\": \"indicator--94467f25-2857-4d86-9c4d-f7a5a2cd20f4\",\n", - " \"prop_score\": {\n", - " \"indicator_types\": {\n", - " \"weight\": 15,\n", - " \"contributing_score\": 15.0\n", - " },\n", - " \"pattern\": {\n", - " \"weight\": 80,\n", - " \"contributing_score\": 0\n", - " },\n", - " \"valid_from\": {\n", - " \"weight\": 5,\n", - " \"contributing_score\": 5.0\n", - " },\n", - " \"matching_score\": 20.0,\n", - " \"sum_weights\": 100.0\n", - " },\n", - " \"value\": 20.0\n", - " },\n", - " \"indicator--94467f25-2857-4d86-9c4d-f7a5a2cd20f4\": {\n", - " \"lhs\": \"indicator--94467f25-2857-4d86-9c4d-f7a5a2cd20f4\",\n", - " \"rhs\": \"indicator--9884b67b-5a83-4377-a941-1821f705a6aa\",\n", - " \"prop_score\": {\n", - " \"indicator_types\": {\n", - " \"weight\": 15,\n", - " \"contributing_score\": 15.0\n", - " },\n", - " \"pattern\": {\n", - " \"weight\": 80,\n", - " \"contributing_score\": 0\n", - " },\n", - " \"valid_from\": {\n", - " \"weight\": 5,\n", - " \"contributing_score\": 5.0\n", - " },\n", - " \"matching_score\": 20.0,\n", - " \"sum_weights\": 100.0\n", - " },\n", - " \"value\": 20.0\n", - " },\n", - " \"report--5205b115-eb30-4ec3-89df-bd0b7ab3da7d\": {\n", - " \"lhs\": \"report--5205b115-eb30-4ec3-89df-bd0b7ab3da7d\",\n", - " \"rhs\": \"report--230450a3-f484-4555-ab1a-9bd67665d359\",\n", - " \"prop_score\": {\n", - " \"name\": {\n", - " \"weight\": 30,\n", - " \"contributing_score\": 30.0\n", - " },\n", - " \"published\": {\n", - " \"weight\": 10,\n", - " \"contributing_score\": 10.0\n", - " },\n", - " \"object_refs\": {\n", - " \"weight\": 60,\n", - " \"contributing_score\": 29.0\n", - " },\n", - " \"matching_score\": 69.0,\n", - " \"sum_weights\": 100.0\n", - " },\n", - " \"value\": 69.0\n", - " },\n", - " \"report--230450a3-f484-4555-ab1a-9bd67665d359\": {\n", - " \"lhs\": \"report--230450a3-f484-4555-ab1a-9bd67665d359\",\n", - " \"rhs\": \"report--5205b115-eb30-4ec3-89df-bd0b7ab3da7d\",\n", - " \"prop_score\": {\n", - " \"name\": {\n", - " \"weight\": 30,\n", - " \"contributing_score\": 30.0\n", - " },\n", - " \"published\": {\n", - " \"weight\": 10,\n", - " \"contributing_score\": 10.0\n", - " },\n", - " \"object_refs\": {\n", - " \"weight\": 60,\n", - " \"contributing_score\": 29.0\n", - " },\n", - " \"matching_score\": 69.0,\n", - " \"sum_weights\": 100.0\n", - " },\n", - " \"value\": 69.0\n", - " },\n", - " \"malware--9c4638ec-f1de-4ddb-abf4-1b760417654e\": {\n", - " \"lhs\": \"malware--9c4638ec-f1de-4ddb-abf4-1b760417654e\",\n", - " \"rhs\": 
\"malware--9c4638ec-f1de-4ddb-abf4-1b760417654e\",\n", - " \"prop_score\": {\n", - " \"malware_types\": {\n", - " \"weight\": 20,\n", - " \"contributing_score\": 10.0\n", - " },\n", - " \"name\": {\n", - " \"weight\": 80,\n", - " \"contributing_score\": 80.0\n", - " },\n", - " \"matching_score\": 90.0,\n", - " \"sum_weights\": 100.0\n", - " },\n", - " \"value\": 90.0\n", - " },\n", - " \"relationship--c9dea34f-fe7c-43a1-a496-766a4290d63d\": {\n", - " \"lhs\": \"relationship--c9dea34f-fe7c-43a1-a496-766a4290d63d\",\n", - " \"rhs\": \"relationship--2e8ec6c1-7934-416c-a471-d572ec84e1e7\",\n", - " \"prop_score\": {\n", - " \"relationship_type\": {\n", - " \"weight\": 20,\n", - " \"contributing_score\": 20.0\n", - " },\n", - " \"source_ref\": {\n", - " \"weight\": 40,\n", - " \"contributing_score\": 2.666666666666668\n", - " },\n", - " \"target_ref\": {\n", - " \"weight\": 40,\n", - " \"contributing_score\": 36.0\n", - " },\n", - " \"matching_score\": 58.66666666666667,\n", - " \"sum_weights\": 100.0\n", - " },\n", - " \"value\": 58.666666666666664\n", - " },\n", - " \"relationship--2e8ec6c1-7934-416c-a471-d572ec84e1e7\": {\n", - " \"lhs\": \"relationship--2e8ec6c1-7934-416c-a471-d572ec84e1e7\",\n", - " \"rhs\": \"relationship--c9dea34f-fe7c-43a1-a496-766a4290d63d\",\n", - " \"prop_score\": {\n", - " \"relationship_type\": {\n", - " \"weight\": 20,\n", - " \"contributing_score\": 20.0\n", - " },\n", - " \"source_ref\": {\n", - " \"weight\": 40,\n", - " \"contributing_score\": 2.666666666666668\n", - " },\n", - " \"target_ref\": {\n", - " \"weight\": 40,\n", - " \"contributing_score\": 36.0\n", - " },\n", - " \"matching_score\": 58.66666666666667,\n", - " \"sum_weights\": 100.0\n", - " },\n", - " \"value\": 58.666666666666664\n", - " }\n", - " }\n", - "}\n" - ] + "data": { + "text/html": [ + "
59.68831168831168\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/html": [ + "
False\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/html": [ + "
{\n",
+       "    "matching_score": 835.6363636363635,\n",
+       "    "len_pairs": 14,\n",
+       "    "summary": {\n",
+       "        "malware--9c4638ec-f1de-4ddb-abf4-1b760417654e": {\n",
+       "            "lhs": "malware--9c4638ec-f1de-4ddb-abf4-1b760417654e",\n",
+       "            "rhs": "malware--9c4638ec-f1de-4ddb-abf4-1b760417654e",\n",
+       "            "prop_score": {\n",
+       "                "malware_types": {\n",
+       "                    "weight": 20,\n",
+       "                    "contributing_score": 10.0\n",
+       "                },\n",
+       "                "name": {\n",
+       "                    "weight": 80,\n",
+       "                    "contributing_score": 80.0\n",
+       "                },\n",
+       "                "matching_score": 90.0,\n",
+       "                "sum_weights": 100.0\n",
+       "            },\n",
+       "            "value": 90.0\n",
+       "        },\n",
+       "        "indicator--e75264b9-9e47-4215-87f3-3da1d3f7c802": {\n",
+       "            "lhs": "indicator--e75264b9-9e47-4215-87f3-3da1d3f7c802",\n",
+       "            "rhs": "indicator--0f44a5fd-d70a-4add-bb95-748d048582f6",\n",
+       "            "prop_score": {\n",
+       "                "indicator_types": {\n",
+       "                    "weight": 15,\n",
+       "                    "contributing_score": 15.0\n",
+       "                },\n",
+       "                "pattern": {\n",
+       "                    "weight": 80,\n",
+       "                    "contributing_score": 0\n",
+       "                },\n",
+       "                "valid_from": {\n",
+       "                    "weight": 5,\n",
+       "                    "contributing_score": 5.0\n",
+       "                },\n",
+       "                "matching_score": 20.0,\n",
+       "                "sum_weights": 100.0\n",
+       "            },\n",
+       "            "value": 20.0\n",
+       "        },\n",
+       "        "indicator--0f44a5fd-d70a-4add-bb95-748d048582f6": {\n",
+       "            "lhs": "indicator--0f44a5fd-d70a-4add-bb95-748d048582f6",\n",
+       "            "rhs": "indicator--e75264b9-9e47-4215-87f3-3da1d3f7c802",\n",
+       "            "prop_score": {\n",
+       "                "indicator_types": {\n",
+       "                    "weight": 15,\n",
+       "                    "contributing_score": 15.0\n",
+       "                },\n",
+       "                "pattern": {\n",
+       "                    "weight": 80,\n",
+       "                    "contributing_score": 0\n",
+       "                },\n",
+       "                "valid_from": {\n",
+       "                    "weight": 5,\n",
+       "                    "contributing_score": 5.0\n",
+       "                },\n",
+       "                "matching_score": 20.0,\n",
+       "                "sum_weights": 100.0\n",
+       "            },\n",
+       "            "value": 20.0\n",
+       "        },\n",
+       "        "attack-pattern--15f804f5-b2ca-48b9-86fc-86367b64313c": {\n",
+       "            "lhs": "attack-pattern--15f804f5-b2ca-48b9-86fc-86367b64313c",\n",
+       "            "rhs": "attack-pattern--4278bbc2-9fda-428c-91b8-63a4c146c8dc",\n",
+       "            "prop_score": {\n",
+       "                "name": {\n",
+       "                    "weight": 30,\n",
+       "                    "contributing_score": 21.81818181818182\n",
+       "                },\n",
+       "                "external_references": {\n",
+       "                    "weight": 70,\n",
+       "                    "contributing_score": 70.0\n",
+       "                },\n",
+       "                "matching_score": 91.81818181818181,\n",
+       "                "sum_weights": 100.0\n",
+       "            },\n",
+       "            "value": 91.81818181818181\n",
+       "        },\n",
+       "        "attack-pattern--4278bbc2-9fda-428c-91b8-63a4c146c8dc": {\n",
+       "            "lhs": "attack-pattern--4278bbc2-9fda-428c-91b8-63a4c146c8dc",\n",
+       "            "rhs": "attack-pattern--15f804f5-b2ca-48b9-86fc-86367b64313c",\n",
+       "            "prop_score": {\n",
+       "                "name": {\n",
+       "                    "weight": 30,\n",
+       "                    "contributing_score": 21.81818181818182\n",
+       "                },\n",
+       "                "external_references": {\n",
+       "                    "weight": 70,\n",
+       "                    "contributing_score": 70.0\n",
+       "                },\n",
+       "                "matching_score": 91.81818181818181,\n",
+       "                "sum_weights": 100.0\n",
+       "            },\n",
+       "            "value": 91.81818181818181\n",
+       "        },\n",
+       "        "campaign--4e49de06-0bac-4548-a38c-49dde7ec979b": {\n",
+       "            "lhs": "campaign--4e49de06-0bac-4548-a38c-49dde7ec979b",\n",
+       "            "rhs": "campaign--f1e6443b-62d1-431c-80d7-238a1754de74",\n",
+       "            "prop_score": {\n",
+       "                "name": {\n",
+       "                    "weight": 60,\n",
+       "                    "contributing_score": 18.000000000000004\n",
+       "                },\n",
+       "                "matching_score": 18.000000000000004,\n",
+       "                "sum_weights": 60.0\n",
+       "            },\n",
+       "            "value": 30.000000000000004\n",
+       "        },\n",
+       "        "campaign--f1e6443b-62d1-431c-80d7-238a1754de74": {\n",
+       "            "lhs": "campaign--f1e6443b-62d1-431c-80d7-238a1754de74",\n",
+       "            "rhs": "campaign--4e49de06-0bac-4548-a38c-49dde7ec979b",\n",
+       "            "prop_score": {\n",
+       "                "name": {\n",
+       "                    "weight": 60,\n",
+       "                    "contributing_score": 18.000000000000004\n",
+       "                },\n",
+       "                "matching_score": 18.000000000000004,\n",
+       "                "sum_weights": 60.0\n",
+       "            },\n",
+       "            "value": 30.000000000000004\n",
+       "        },\n",
+       "        "report--52324d35-fa15-45a3-9548-819aea99dcbb": {\n",
+       "            "lhs": "report--52324d35-fa15-45a3-9548-819aea99dcbb",\n",
+       "            "rhs": "report--a3822512-0b59-4d5d-849c-023133fe173f",\n",
+       "            "prop_score": {\n",
+       "                "name": {\n",
+       "                    "weight": 30,\n",
+       "                    "contributing_score": 30.0\n",
+       "                },\n",
+       "                "published": {\n",
+       "                    "weight": 10,\n",
+       "                    "contributing_score": 10.0\n",
+       "                },\n",
+       "                "object_refs": {\n",
+       "                    "weight": 60,\n",
+       "                    "contributing_score": 29.0\n",
+       "                },\n",
+       "                "matching_score": 69.0,\n",
+       "                "sum_weights": 100.0\n",
+       "            },\n",
+       "            "value": 69.0\n",
+       "        },\n",
+       "        "report--a3822512-0b59-4d5d-849c-023133fe173f": {\n",
+       "            "lhs": "report--a3822512-0b59-4d5d-849c-023133fe173f",\n",
+       "            "rhs": "report--52324d35-fa15-45a3-9548-819aea99dcbb",\n",
+       "            "prop_score": {\n",
+       "                "name": {\n",
+       "                    "weight": 30,\n",
+       "                    "contributing_score": 30.0\n",
+       "                },\n",
+       "                "published": {\n",
+       "                    "weight": 10,\n",
+       "                    "contributing_score": 10.0\n",
+       "                },\n",
+       "                "object_refs": {\n",
+       "                    "weight": 60,\n",
+       "                    "contributing_score": 29.0\n",
+       "                },\n",
+       "                "matching_score": 69.0,\n",
+       "                "sum_weights": 100.0\n",
+       "            },\n",
+       "            "value": 69.0\n",
+       "        },\n",
+       "        "relationship--a4e44e34-689e-43f4-b837-13c8cd2fb48e": {\n",
+       "            "lhs": "relationship--a4e44e34-689e-43f4-b837-13c8cd2fb48e",\n",
+       "            "rhs": "relationship--456b79c5-ae81-4d97-bf8d-9e3b5ac6132b",\n",
+       "            "prop_score": {\n",
+       "                "relationship_type": {\n",
+       "                    "weight": 20,\n",
+       "                    "contributing_score": 20.0\n",
+       "                },\n",
+       "                "source_ref": {\n",
+       "                    "weight": 40,\n",
+       "                    "contributing_score": 2.666666666666668\n",
+       "                },\n",
+       "                "target_ref": {\n",
+       "                    "weight": 40,\n",
+       "                    "contributing_score": 36.0\n",
+       "                },\n",
+       "                "matching_score": 58.66666666666667,\n",
+       "                "sum_weights": 100.0\n",
+       "            },\n",
+       "            "value": 58.666666666666664\n",
+       "        },\n",
+       "        "relationship--456b79c5-ae81-4d97-bf8d-9e3b5ac6132b": {\n",
+       "            "lhs": "relationship--456b79c5-ae81-4d97-bf8d-9e3b5ac6132b",\n",
+       "            "rhs": "relationship--a4e44e34-689e-43f4-b837-13c8cd2fb48e",\n",
+       "            "prop_score": {\n",
+       "                "relationship_type": {\n",
+       "                    "weight": 20,\n",
+       "                    "contributing_score": 20.0\n",
+       "                },\n",
+       "                "source_ref": {\n",
+       "                    "weight": 40,\n",
+       "                    "contributing_score": 2.666666666666668\n",
+       "                },\n",
+       "                "target_ref": {\n",
+       "                    "weight": 40,\n",
+       "                    "contributing_score": 36.0\n",
+       "                },\n",
+       "                "matching_score": 58.66666666666667,\n",
+       "                "sum_weights": 100.0\n",
+       "            },\n",
+       "            "value": 58.666666666666664\n",
+       "        },\n",
+       "        "threat-actor--8e2e2d2b-17d4-4cbf-938f-98ee46b3cd3f": {\n",
+       "            "lhs": "threat-actor--8e2e2d2b-17d4-4cbf-938f-98ee46b3cd3f",\n",
+       "            "rhs": "threat-actor--8e2e2d2b-17d4-4cbf-938f-98ee46b3cd3f",\n",
+       "            "prop_score": {\n",
+       "                "name": {\n",
+       "                    "weight": 60,\n",
+       "                    "contributing_score": 6.66666666666667\n",
+       "                },\n",
+       "                "threat_actor_types": {\n",
+       "                    "weight": 20,\n",
+       "                    "contributing_score": 0.0\n",
+       "                },\n",
+       "                "aliases": {\n",
+       "                    "weight": 20,\n",
+       "                    "contributing_score": 0.0\n",
+       "                },\n",
+       "                "matching_score": 6.66666666666667,\n",
+       "                "sum_weights": 100.0\n",
+       "            },\n",
+       "            "value": 6.66666666666667\n",
+       "        },\n",
+       "        "identity--6fb2922c-b2f0-4c87-97a8-2281e8f7eb2d": {\n",
+       "            "lhs": "identity--6fb2922c-b2f0-4c87-97a8-2281e8f7eb2d",\n",
+       "            "rhs": "identity--eef09301-a76e-4e99-a5ca-362fa9aba0b7",\n",
+       "            "prop_score": {\n",
+       "                "name": {\n",
+       "                    "weight": 60,\n",
+       "                    "contributing_score": 60.0\n",
+       "                },\n",
+       "                "identity_class": {\n",
+       "                    "weight": 20,\n",
+       "                    "contributing_score": 20.0\n",
+       "                },\n",
+       "                "matching_score": 80.0,\n",
+       "                "sum_weights": 80.0\n",
+       "            },\n",
+       "            "value": 100.0\n",
+       "        },\n",
+       "        "identity--eef09301-a76e-4e99-a5ca-362fa9aba0b7": {\n",
+       "            "lhs": "identity--eef09301-a76e-4e99-a5ca-362fa9aba0b7",\n",
+       "            "rhs": "identity--6fb2922c-b2f0-4c87-97a8-2281e8f7eb2d",\n",
+       "            "prop_score": {\n",
+       "                "name": {\n",
+       "                    "weight": 60,\n",
+       "                    "contributing_score": 60.0\n",
+       "                },\n",
+       "                "identity_class": {\n",
+       "                    "weight": 20,\n",
+       "                    "contributing_score": 20.0\n",
+       "                },\n",
+       "                "matching_score": 80.0,\n",
+       "                "sum_weights": 80.0\n",
+       "            },\n",
+       "            "value": 100.0\n",
+       "        }\n",
+       "    }\n",
+       "}\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ From 4825f5e30384d0d14065f8d02ea1af81c983247a Mon Sep 17 00:00:00 2001 From: Chris Lenk Date: Fri, 19 Feb 2021 16:57:15 -0500 Subject: [PATCH 27/27] Small touchups --- docs/guide/equivalence.ipynb | 642 +++++++++++++++++------------------ 1 file changed, 321 insertions(+), 321 deletions(-) diff --git a/docs/guide/equivalence.ipynb b/docs/guide/equivalence.ipynb index 44e2e4d..e61e9ed 100644 --- a/docs/guide/equivalence.ipynb +++ b/docs/guide/equivalence.ipynb @@ -82,7 +82,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -168,7 +168,7 @@ "" ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" }, @@ -255,7 +255,7 @@ "" ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -299,7 +299,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -385,7 +385,7 @@ "" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" }, @@ -472,7 +472,7 @@ "" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -500,7 +500,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -586,7 +586,7 @@ "" ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" }, @@ -673,7 +673,7 @@ "" ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -706,7 +706,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": { "scrolled": true }, @@ -794,7 +794,7 @@ "" ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" }, @@ -881,7 +881,7 @@ "" ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -923,7 +923,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": { "scrolled": true }, @@ -1011,7 +1011,7 @@ "" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" }, @@ -1098,7 +1098,7 @@ "" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -1129,7 +1129,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": { "scrolled": true }, @@ -1217,7 +1217,7 @@ "" ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" }, @@ -1304,7 +1304,7 @@ "" ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -1339,7 +1339,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": { "scrolled": true }, @@ -1427,7 +1427,7 @@ "" ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" }, @@ -1514,7 +1514,7 @@ "" ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -1549,7 +1549,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": { "scrolled": true }, @@ -1637,7 +1637,7 @@ "" ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" }, @@ -1724,7 
+1724,7 @@ "" ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -1756,7 +1756,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -1842,7 +1842,7 @@ "" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" }, @@ -1929,7 +1929,7 @@ "" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -1964,7 +1964,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -1989,7 +1989,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -2082,7 +2082,7 @@ "" ] }, - "execution_count": 14, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -2114,7 +2114,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -2145,7 +2145,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -2231,7 +2231,7 @@ "" ] }, - "execution_count": 16, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -2257,14 +2257,14 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Starting object similarity process between: 'threat-actor--b70cc8d6-33d9-41e2-9beb-7bce7af08a76' and 'threat-actor--e325f521-ce0f-48b3-b4d5-65fa699c0e16'\n", + "Starting object similarity process between: 'threat-actor--54040762-8540-4c37-8f6d-6ebcc20da2b5' and 'threat-actor--b2a6f234-5594-42d9-9cdb-f4b82bc575a6'\n", "--\t\tpartial_string_based 'Evil Org' 'James Bond'\tresult: '11.111111111111114'\n", "'name' check -- weight: 60, contributing score: 6.666666666666669\n", "--\t\tpartial_list_based '['crime-syndicate']' '['spy']'\tresult: '0.0'\n", @@ -2357,7 +2357,7 @@ "" ] }, - "execution_count": 17, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -2396,7 +2396,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -2482,7 +2482,7 @@ "" ] }, - "execution_count": 18, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" }, @@ -2569,7 +2569,7 @@ "" ] }, - "execution_count": 18, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" }, @@ -2656,7 +2656,7 @@ "" ] }, - "execution_count": 18, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" }, @@ -2743,7 +2743,7 @@ "" ] }, - "execution_count": 18, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" }, @@ -2830,7 +2830,7 @@ "" ] }, - "execution_count": 18, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" }, @@ -2917,7 +2917,7 @@ "" ] }, - "execution_count": 18, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" }, @@ -3004,7 +3004,7 @@ "" ] }, - "execution_count": 18, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -3064,7 +3064,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -3150,7 +3150,7 @@ "" ] }, - "execution_count": 20, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" }, @@ -3237,7 +3237,7 @@ "" ] }, - "execution_count": 20, + "execution_count": 18, "metadata": 
{}, "output_type": "execute_result" } @@ -3267,7 +3267,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -3276,7 +3276,7 @@ "10.000000000000002" ] }, - "execution_count": 21, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" }, @@ -3363,7 +3363,7 @@ "" ] }, - "execution_count": 21, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -3391,7 +3391,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -3477,7 +3477,7 @@ "" ] }, - "execution_count": 22, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -3514,7 +3514,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -3600,7 +3600,7 @@ "" ] }, - "execution_count": 23, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" }, @@ -3687,7 +3687,7 @@ "" ] }, - "execution_count": 23, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -3731,7 +3731,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -3817,7 +3817,7 @@ "" ] }, - "execution_count": 25, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" }, @@ -3904,7 +3904,7 @@ "" ] }, - "execution_count": 25, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -3963,14 +3963,14 @@ "source": [ "## Checking Graph Similarity and Equivalence\n", "\n", - "The next logical step for checking if two individual objects are similar or equivalent is to check all relevant neighbors or equal type objects for the best match. It can help you determine if you have seen similar intelligence in the past and builds upon the same foundation of the local object similarity comparisons. The [Environment](../api/stix2.environment.rst#stix2.environment.Environment) has two functions with similar requirements for graph-based checks.\n", + "The next logical step for checking if two individual objects are similar or equivalent is to check all relevant neighbors and related objects for the best matches. It can help you determine if you have seen similar intelligence in the past and builds upon the foundation of the local object similarity comparisons described above. The [Environment](../api/stix2.environment.rst#stix2.environment.Environment) has two functions with similar requirements for graph-based checks.\n", "\n", - "For each supported object type, the [graph_similarity()](../api/stix2.environment.rst#stix2.environment.Environment.graph_similarity) function checks if the values for a specific set of objects to match and will compare against all of the same type objects maximizing for score obtained from the properties match. It requires two DataStore instances which will serve as our graph representation and will allow the algorithm to make additional checks like de-referencing objects. Internally it calls [object_similarity()](../api/stix2.environment.rst#stix2.environment.Environment.object_similarity). \n", + "For each supported object type, the [graph_similarity()](../api/stix2.environment.rst#stix2.environment.Environment.graph_similarity) function checks if the values for a specific set of objects match and will compare against all of the same type objects, maximizing for score obtained from the properties match. 
It requires two DataStore instances which represent the two graphs to be compared and allows the algorithm to make additional checks like de-referencing objects. Internally it calls [object_similarity()](../api/stix2.environment.rst#stix2.environment.Environment.object_similarity). \n",
     "\n",
-    "Some limitations exist in the implementation that are important to be understood by those analyzing the results of this algorithm.\n",
+    "Some limitations exist that are important to understand when analyzing the results of this algorithm.\n",
     "- Only STIX types with weights defined will be checked. This could result in a maximal sub-graph and score that is smaller than expected. We recommend looking at the prop_scores or logging output for details and to understand how the result was calculated.\n",
     "- Failure to de-reference an object for checks will result in a 0 for that property. This applies to `*_ref` or `*_refs` properties.\n",
-    "- Keep reasonable expectations in running-time, especially with DataStores that require network communication or when the number of items in the graphs is high. You can also tune how much depth the algorithm should check in de-reference calls; this can affect your running-time.\n",
+    "- Keep reasonable expectations in terms of how long it takes to run, especially with DataStores that require network communication or when the number of items in the graphs is high. You can also tune how much depth the algorithm should check in de-reference calls; this can affect your running-time.\n",
     "\n",
     "**Please note** that you will need to install the TAXII dependencies in addition to the semantic requirements if you plan on using the TAXII DataStore classes. You can do this using:\n",
     "\n",
@@ -3978,22 +3978,22 @@
     "\n",
     "#### Graph Similarity and Equivalence Example\n",
     "\n",
-    "By default, it use default weights defined here [object_similarity()](../api/stix2.environment.rst#stix2.environment.Environment.object_similarity) in combination with [graph_similarity()](../api/stix2.environment.rst#stix2.environment.Environment.graph_similarity)."
+    "By default, the algorithm uses the default weights defined in [object_similarity()](../api/stix2.environment.rst#stix2.environment.Environment.object_similarity) in combination with [graph_similarity()](../api/stix2.environment.rst#stix2.environment.Environment.graph_similarity)."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 23,
    "metadata": {},
    "outputs": [
    {
     "data": {
      "text/html": [
       "