WIP: changes to graph_similarity

busted main loop, symmetrical properties not present
2021-02-01 22:35:37 -05:00 · 2021-02-01 22:35:37 -05:00 · 489970718f
parent 03b3423cbb
commit 489970718f
4 changed files with 97 additions and 73 deletions
--- a/stix2/environment.py
+++ b/stix2/environment.py
@ -2,12 +2,12 @@
 import copy

 from .datastore import CompositeDataSource, DataStoreMixin
-from .equivalence.graph import graphically_equivalent
+from .equivalence.graph import graph_similarity
 from .equivalence.object import (  # noqa: F401
    WEIGHTS, check_property_present, custom_pattern_based, exact_match,
    list_reference_check, partial_external_reference_based, partial_list_based,
    partial_location_distance, partial_string_based, partial_timestamp_based,
-    reference_check, semantically_equivalent,
+    reference_check, object_similarity,
 )
 from .parsing import parse as _parse

@ -197,7 +197,7 @@ class Environment(DataStoreMixin):
            return None

    @staticmethod
-    def semantically_equivalent(obj1, obj2, prop_scores={}, **weight_dict):
+    def object_similarity(obj1, obj2, prop_scores={}, **weight_dict):
        """This method verifies if two objects of the same type are
        semantically equivalent.

@ -229,10 +229,10 @@ class Environment(DataStoreMixin):
            see `the Committee Note <link here>`__.

        """
-        return semantically_equivalent(obj1, obj2, prop_scores, **weight_dict)
+        return object_similarity(obj1, obj2, prop_scores, **weight_dict)

    @staticmethod
-    def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict):
+    def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
        """This method verifies if two graphs are semantically equivalent.
        Each DataStore can contain a connected or disconnected graph and the
        final result is weighted over the amount of objects we managed to compare.
@ -267,4 +267,4 @@ class Environment(DataStoreMixin):
            see `the Committee Note <link here>`__.

        """
-        return graphically_equivalent(ds1, ds2, prop_scores, **weight_dict)
+        return graph_similarity(ds1, ds2, prop_scores, **weight_dict)
--- a/stix2/equivalence/graph/init.py
+++ b/stix2/equivalence/graph/init.py
@ -1,15 +1,17 @@
 """Python APIs for STIX 2 Graph-based Semantic Equivalence."""
+import collections
+import itertools
 import logging

 from ..object import (
    WEIGHTS, exact_match, list_reference_check, partial_string_based,
-    partial_timestamp_based, reference_check, semantically_equivalent,
+    partial_timestamp_based, reference_check, object_similarity,
 )

 logger = logging.getLogger(__name__)


-def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict):
+def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
    """This method verifies if two graphs are semantically equivalent.
    Each DataStore can contain a connected or disconnected graph and the
    final result is weighted over the amount of objects we managed to compare.
@ -44,49 +46,48 @@ def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict):
        see `the Committee Note <link here>`__.

    """
+    results = {}
+    equivalence_score = 0
    weights = GRAPH_WEIGHTS.copy()

    if weight_dict:
        weights.update(weight_dict)

-    results = {}
    depth = weights["_internal"]["max_depth"]

-    graph1 = ds1.query([])
-    graph2 = ds2.query([])
+    graph1 = bucket_per_type(ds1.query([]))
+    graph2 = bucket_per_type(ds2.query([]))
+    pairs = object_pairs(graph1, graph2, weights)

-    graph1.sort(key=lambda x: x["type"])
-    graph2.sort(key=lambda x: x["type"])
-
-    if len(graph1) < len(graph2):
+    for object1, object2 in pairs:
+        iprop_score1 = {}
+        iprop_score2 = {}
+        object1_id = object1["id"]
+        object2_id = object2["id"]
+        weights["_internal"]["max_depth"] = depth
        weights["_internal"]["ds1"] = ds1
        weights["_internal"]["ds2"] = ds2
-        g1 = graph1
-        g2 = graph2
-    else:
+        result1 = object_similarity(object1, object2, iprop_score1, **weights)
+
        weights["_internal"]["ds1"] = ds2
        weights["_internal"]["ds2"] = ds1
-        g1 = graph2
-        g2 = graph1
+        result2 = object_similarity(object2, object1, iprop_score2, **weights)

-    for object1 in g1:
-        for object2 in g2:
-            if object1["type"] == object2["type"] and object1["type"] in weights:
-                iprop_score = {}
-                result = semantically_equivalent(object1, object2, iprop_score, **weights)
-                objects1_id = object1["id"]
-                weights["_internal"]["max_depth"] = depth
+        if object1_id not in results:
+            results[object1_id] = {"lhs": object1["id"], "rhs": object2["id"], "prop_score": iprop_score1, "value": result1}
+        elif result1 > results[object1_id]["value"]:
+            results[object1_id] = {"lhs": object1["id"], "rhs": object2["id"], "prop_score": iprop_score1, "value": result1}

-                if objects1_id not in results:
-                    results[objects1_id] = {"matched": object2["id"], "prop_score": iprop_score, "value": result}
-                elif result > results[objects1_id]["value"]:
-                    results[objects1_id] = {"matched": object2["id"], "prop_score": iprop_score, "value": result}
+        # Judge the reverse direction by its own score (result2), not result1,
+        # so the entry stored for object2 really is its best-scoring match.
+        if object2_id not in results or result2 > results[object2_id]["value"]:
+            results[object2_id] = {"lhs": object2["id"], "rhs": object1["id"], "prop_score": iprop_score2, "value": result2}

-    equivalence_score = 0
    matching_score = sum(x["value"] for x in results.values())
-    sum_weights = len(results) * 100.0
+    sum_weights = len(results)
    if sum_weights > 0:
-        equivalence_score = (matching_score / sum_weights) * 100
+        equivalence_score = matching_score / sum_weights
+
    prop_scores["matching_score"] = matching_score
    prop_scores["sum_weights"] = sum_weights
    prop_scores["summary"] = results
@ -100,6 +101,22 @@ def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict):
    return equivalence_score


+def bucket_per_type(g):
+    """Group the objects of a graph by their ``type`` value.
+
+    Args:
+        g: An iterable of objects that can be indexed with ``"type"``.
+
+    Returns:
+        A ``collections.defaultdict`` mapping each type value to the list of
+        objects carrying it, in the order they appear in ``g``.
+
+    """
+    buckets = collections.defaultdict(list)
+    # A plain loop: the appends are side effects, so there is no reason to
+    # build and discard a list of ``None`` results.
+    for obj in g:
+        buckets[obj["type"]].append(obj)
+    return buckets
+
+
+def object_pairs(g1, g2, w):
+    """Lazily produce every same-type pairing of objects from two graphs.
+
+    Only types that are keys of ``g1``, ``g2`` and ``w`` at the same time
+    are considered; for each of them, every object bucketed under that type
+    in ``g1`` is paired with every object bucketed under it in ``g2``.
+
+    Args:
+        g1: Mapping of type to list of objects for the first graph.
+        g2: Mapping of type to list of objects for the second graph.
+        w: Mapping whose keys name the types that can be compared.
+
+    Returns:
+        An iterator of ``(object_from_g1, object_from_g2)`` tuples.
+
+    """
+    comparable = set(g1.keys()).intersection(g2.keys()).intersection(w.keys())
+
+    return (
+        (left, right)
+        for stix_type in comparable
+        for left, right in itertools.product(g1[stix_type], g2[stix_type])
+    )
+
+
 # default weights used for the graph semantic equivalence process
 GRAPH_WEIGHTS = WEIGHTS.copy()
 GRAPH_WEIGHTS.update({
--- a/stix2/equivalence/object/init.py
+++ b/stix2/equivalence/object/init.py
@ -9,7 +9,7 @@ from ..pattern import equivalent_patterns
 logger = logging.getLogger(__name__)


-def semantically_equivalent(obj1, obj2, prop_scores={}, **weight_dict):
+def object_similarity(obj1, obj2, prop_scores={}, **weight_dict):
    """This method verifies if two objects of the same type are
    semantically equivalent.

@ -312,7 +312,7 @@ def _versioned_checks(ref1, ref2, ds1, ds2, **weights):
    if len(objects1) > 0 and len(objects2) > 0:
        for o1 in objects1:
            for o2 in objects2:
-                result = semantically_equivalent(o1, o2, **weights)
+                result = object_similarity(o1, o2, **weights)
                if ref1 not in results:
                    results[ref1] = {"matched": ref2, "value": result}
                elif result > results[ref1]["value"]:
@ -337,7 +337,7 @@ def reference_check(ref1, ref2, ds1, ds2, **weights):
        else:
            o1, o2 = ds1.get(ref1), ds2.get(ref2)
            if o1 and o2:
-                result = semantically_equivalent(o1, o2, **weights) / 100.0
+                result = object_similarity(o1, o2, **weights) / 100.0

    logger.debug(
        "--\t\treference_check '%s' '%s'\tresult: '%s'",
--- a/stix2/test/v21/test_environment.py
+++ b/stix2/test/v21/test_environment.py
@ -429,7 +429,7 @@ def test_related_to_by_target(ds):
 def test_semantic_equivalence_on_same_attack_pattern1():
    ap1 = stix2.v21.AttackPattern(id=ATTACK_PATTERN_ID, **ATTACK_PATTERN_KWARGS)
    ap2 = stix2.v21.AttackPattern(id=ATTACK_PATTERN_ID, **ATTACK_PATTERN_KWARGS)
-    env = stix2.Environment().semantically_equivalent(ap1, ap2)
+    env = stix2.Environment().object_similarity(ap1, ap2)
    assert round(env) == 100


@ -445,14 +445,14 @@ def test_semantic_equivalence_on_same_attack_pattern2():
    )
    ap1 = stix2.v21.AttackPattern(id=ATTACK_PATTERN_ID, **ATTACK_KWARGS)
    ap2 = stix2.v21.AttackPattern(id=ATTACK_PATTERN_ID, **ATTACK_KWARGS)
-    env = stix2.Environment().semantically_equivalent(ap1, ap2)
+    env = stix2.Environment().object_similarity(ap1, ap2)
    assert round(env) == 100


 def test_semantic_equivalence_on_same_campaign1():
    camp1 = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMPAIGN_KWARGS)
    camp2 = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMPAIGN_KWARGS)
-    env = stix2.Environment().semantically_equivalent(camp1, camp2)
+    env = stix2.Environment().object_similarity(camp1, camp2)
    assert round(env) == 100


@ -464,14 +464,14 @@ def test_semantic_equivalence_on_same_campaign2():
    )
    camp1 = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMP_KWARGS)
    camp2 = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMP_KWARGS)
-    env = stix2.Environment().semantically_equivalent(camp1, camp2)
+    env = stix2.Environment().object_similarity(camp1, camp2)
    assert round(env) == 100


 def test_semantic_equivalence_on_same_identity1():
    iden1 = stix2.v21.Identity(id=IDENTITY_ID, **IDENTITY_KWARGS)
    iden2 = stix2.v21.Identity(id=IDENTITY_ID, **IDENTITY_KWARGS)
-    env = stix2.Environment().semantically_equivalent(iden1, iden2)
+    env = stix2.Environment().object_similarity(iden1, iden2)
    assert round(env) == 100


@ -483,14 +483,14 @@ def test_semantic_equivalence_on_same_identity2():
    )
    iden1 = stix2.v21.Identity(id=IDENTITY_ID, **IDEN_KWARGS)
    iden2 = stix2.v21.Identity(id=IDENTITY_ID, **IDEN_KWARGS)
-    env = stix2.Environment().semantically_equivalent(iden1, iden2)
+    env = stix2.Environment().object_similarity(iden1, iden2)
    assert round(env) == 100


 def test_semantic_equivalence_on_same_indicator():
    ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS)
    ind2 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS)
-    env = stix2.Environment().semantically_equivalent(ind1, ind2)
+    env = stix2.Environment().object_similarity(ind1, ind2)
    assert round(env) == 100


@ -498,7 +498,7 @@ def test_semantic_equivalence_on_same_location1():
    location_kwargs = dict(latitude=45, longitude=179)
    loc1 = stix2.v21.Location(id=LOCATION_ID, **location_kwargs)
    loc2 = stix2.v21.Location(id=LOCATION_ID, **location_kwargs)
-    env = stix2.Environment().semantically_equivalent(loc1, loc2)
+    env = stix2.Environment().object_similarity(loc1, loc2)
    assert round(env) == 100


@ -511,7 +511,7 @@ def test_semantic_equivalence_on_same_location2():
    )
    loc1 = stix2.v21.Location(id=LOCATION_ID, **location_kwargs)
    loc2 = stix2.v21.Location(id=LOCATION_ID, **location_kwargs)
-    env = stix2.Environment().semantically_equivalent(loc1, loc2)
+    env = stix2.Environment().object_similarity(loc1, loc2)
    assert round(env) == 100


@ -519,21 +519,21 @@ def test_semantic_equivalence_location_with_no_latlong():
    loc_kwargs = dict(country="US", administrative_area="US-DC")
    loc1 = stix2.v21.Location(id=LOCATION_ID, **LOCATION_KWARGS)
    loc2 = stix2.v21.Location(id=LOCATION_ID, **loc_kwargs)
-    env = stix2.Environment().semantically_equivalent(loc1, loc2)
+    env = stix2.Environment().object_similarity(loc1, loc2)
    assert round(env) != 100


 def test_semantic_equivalence_on_same_malware():
    malw1 = stix2.v21.Malware(id=MALWARE_ID, **MALWARE_KWARGS)
    malw2 = stix2.v21.Malware(id=MALWARE_ID, **MALWARE_KWARGS)
-    env = stix2.Environment().semantically_equivalent(malw1, malw2)
+    env = stix2.Environment().object_similarity(malw1, malw2)
    assert round(env) == 100


 def test_semantic_equivalence_on_same_threat_actor1():
    ta1 = stix2.v21.ThreatActor(id=THREAT_ACTOR_ID, **THREAT_ACTOR_KWARGS)
    ta2 = stix2.v21.ThreatActor(id=THREAT_ACTOR_ID, **THREAT_ACTOR_KWARGS)
-    env = stix2.Environment().semantically_equivalent(ta1, ta2)
+    env = stix2.Environment().object_similarity(ta1, ta2)
    assert round(env) == 100


@ -545,21 +545,21 @@ def test_semantic_equivalence_on_same_threat_actor2():
    )
    ta1 = stix2.v21.ThreatActor(id=THREAT_ACTOR_ID, **THREAT_KWARGS)
    ta2 = stix2.v21.ThreatActor(id=THREAT_ACTOR_ID, **THREAT_KWARGS)
-    env = stix2.Environment().semantically_equivalent(ta1, ta2)
+    env = stix2.Environment().object_similarity(ta1, ta2)
    assert round(env) == 100


 def test_semantic_equivalence_on_same_tool():
    tool1 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS)
    tool2 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS)
-    env = stix2.Environment().semantically_equivalent(tool1, tool2)
+    env = stix2.Environment().object_similarity(tool1, tool2)
    assert round(env) == 100


 def test_semantic_equivalence_on_same_vulnerability1():
    vul1 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULNERABILITY_KWARGS)
    vul2 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULNERABILITY_KWARGS)
-    env = stix2.Environment().semantically_equivalent(vul1, vul2)
+    env = stix2.Environment().object_similarity(vul1, vul2)
    assert round(env) == 100


@ -584,7 +584,7 @@ def test_semantic_equivalence_on_same_vulnerability2():
    )
    vul1 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULN_KWARGS1)
    vul2 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULN_KWARGS2)
-    env = stix2.Environment().semantically_equivalent(vul1, vul2)
+    env = stix2.Environment().object_similarity(vul1, vul2)
    assert round(env) == 0.0


@ -640,7 +640,7 @@ def test_semantic_equivalence_on_unknown_object():
    }
    cust1 = stix2.parse(CUSTOM_KWARGS1, allow_custom=True)
    cust2 = stix2.parse(CUSTOM_KWARGS2, allow_custom=True)
-    env = stix2.Environment().semantically_equivalent(cust1, cust2, **weights)
+    env = stix2.Environment().object_similarity(cust1, cust2, **weights)
    assert round(env) == 0


@ -648,7 +648,7 @@ def test_semantic_equivalence_different_type_raises():
    with pytest.raises(ValueError) as excinfo:
        vul1 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULNERABILITY_KWARGS)
        ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS)
-        stix2.Environment().semantically_equivalent(vul1, ind1)
+        stix2.Environment().object_similarity(vul1, ind1)

    assert str(excinfo.value) == "The objects to compare must be of the same type!"

@ -661,7 +661,7 @@ def test_semantic_equivalence_different_spec_version_raises():
        )
        ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS)
        ind2 = stix2.v20.Indicator(id=INDICATOR_ID, **V20_KWARGS)
-        stix2.Environment().semantically_equivalent(ind1, ind2)
+        stix2.Environment().object_similarity(ind1, ind2)

    assert str(excinfo.value) == "The objects to compare must be of the same spec version!"

@ -686,7 +686,7 @@ def test_semantic_equivalence_zero_match():
    }
    ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS)
    ind2 = stix2.v21.Indicator(id=INDICATOR_ID, **IND_KWARGS)
-    env = stix2.Environment().semantically_equivalent(ind1, ind2, **weights)
+    env = stix2.Environment().object_similarity(ind1, ind2, **weights)
    assert round(env) == 0


@ -708,7 +708,7 @@ def test_semantic_equivalence_different_spec_version():
    }
    ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS)
    ind2 = stix2.v20.Indicator(id=INDICATOR_ID, **IND_KWARGS)
-    env = stix2.Environment().semantically_equivalent(ind1, ind2, **weights)
+    env = stix2.Environment().object_similarity(ind1, ind2, **weights)
    assert round(env) == 0


@ -800,7 +800,7 @@ def test_semantic_equivalence_exact_match():
 def test_non_existent_config_for_object():
    r1 = stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS)
    r2 = stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS)
-    assert stix2.Environment().semantically_equivalent(r1, r2) == 0.0
+    assert stix2.Environment().object_similarity(r1, r2) == 0.0


 def custom_semantic_equivalence_method(obj1, obj2, **weights):
@ -824,7 +824,7 @@ def test_semantic_equivalence_method_provided():

    tool1 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS)
    tool2 = stix2.v21.Tool(id=TOOL_ID, **TOOL2_KWARGS)
-    env = stix2.Environment().semantically_equivalent(tool1, tool2, **weights)
+    env = stix2.Environment().object_similarity(tool1, tool2, **weights)
    assert round(env) == 96


@ -838,7 +838,7 @@ def test_semantic_equivalence_prop_scores():

    tool1 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS)
    tool2 = stix2.v21.Tool(id=TOOL_ID, **TOOL2_KWARGS)
-    stix2.Environment().semantically_equivalent(tool1, tool2, prop_scores)
+    stix2.Environment().object_similarity(tool1, tool2, prop_scores)
    assert len(prop_scores) == 4
    assert round(prop_scores["matching_score"], 1) == 8.9
    assert round(prop_scores["sum_weights"], 1) == 100.0
@ -868,7 +868,7 @@ def test_semantic_equivalence_prop_scores_method_provided():

    tool1 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS)
    tool2 = stix2.v21.Tool(id=TOOL_ID, **TOOL2_KWARGS)
-    env = stix2.Environment().semantically_equivalent(tool1, tool2, prop_scores, **weights)
+    env = stix2.Environment().object_similarity(tool1, tool2, prop_scores, **weights)
    assert round(env) == 96
    assert len(prop_scores) == 2
    assert prop_scores["matching_score"] == 96.0
@ -964,12 +964,19 @@ def test_graph_equivalence_with_filesystem_source(ds):
            "max_depth": 1,
        },
    }
-    prop_scores = {}
+    prop_scores1 = {}
+    prop_scores2 = {}
    fs = stix2.FileSystemSource(FS_PATH)
-    env = stix2.Environment().graphically_equivalent(fs, ds, prop_scores, **weights)
-    assert round(env) == 24
-    assert round(prop_scores["matching_score"]) == 122
-    assert round(prop_scores["sum_weights"]) == 500
+    env = stix2.Environment().graph_similarity(fs, ds, prop_scores1, **weights)
+    assert round(env) == 26
+    assert round(prop_scores1["matching_score"]) == 460
+    assert round(prop_scores1["sum_weights"]) == 18
+
+    env = stix2.Environment().graph_similarity(ds, fs, prop_scores2, **weights)
+    assert round(env) == 47
+    assert round(prop_scores2["matching_score"]) == 852
+    assert round(prop_scores2["sum_weights"]) == 18
+    assert prop_scores1 == prop_scores2


 def test_graph_equivalence_with_duplicate_graph(ds):
@ -981,10 +988,10 @@ def test_graph_equivalence_with_duplicate_graph(ds):
        },
    }
    prop_scores = {}
-    env = stix2.Environment().graphically_equivalent(ds, ds, prop_scores, **weights)
+    env = stix2.Environment().graph_similarity(ds, ds, prop_scores, **weights)
    assert round(env) == 100
    assert round(prop_scores["matching_score"]) == 800
-    assert round(prop_scores["sum_weights"]) == 800
+    assert round(prop_scores["sum_weights"]) == 8


 def test_graph_equivalence_with_versioning_check_on(ds2, ds):
@ -996,10 +1003,10 @@ def test_graph_equivalence_with_versioning_check_on(ds2, ds):
        },
    }
    prop_scores = {}
-    env = stix2.Environment().graphically_equivalent(ds, ds2, prop_scores, **weights)
+    env = stix2.Environment().graph_similarity(ds, ds2, prop_scores, **weights)
    assert round(env) == 93
    assert round(prop_scores["matching_score"]) == 745
-    assert round(prop_scores["sum_weights"]) == 800
+    assert round(prop_scores["sum_weights"]) == 8


 def test_graph_equivalence_with_versioning_check_off(ds2, ds):
@ -1011,7 +1018,7 @@ def test_graph_equivalence_with_versioning_check_off(ds2, ds):
        },
    }
    prop_scores = {}
-    env = stix2.Environment().graphically_equivalent(ds, ds2, prop_scores, **weights)
+    env = stix2.Environment().graph_similarity(ds, ds2, prop_scores, **weights)
    assert round(env) == 93
    assert round(prop_scores["matching_score"]) == 745
-    assert round(prop_scores["sum_weights"]) == 800
+    assert round(prop_scores["sum_weights"]) == 8