resolve issues with graph similarity

- new methods for graph equivalence and similarity
- remove sorting and len comparisons
- rename some variables
pull/1/head
Emmanuelle Vargas-Gonzalez 2021-02-16 00:57:26 -05:00
parent 489970718f
commit 02b076b3bb
2 changed files with 160 additions and 70 deletions

stix2/equivalence/graph/__init__.py

@@ -1,21 +1,62 @@
"""Python APIs for STIX 2 Graph-based Semantic Equivalence."""
import collections
import itertools
"""Python APIs for STIX 2 Graph-based Semantic Equivalence and Similarity."""
import logging
from ..object import (
WEIGHTS, exact_match, list_reference_check, partial_string_based,
partial_timestamp_based, reference_check, object_similarity,
partial_timestamp_based, reference_check, object_similarity, object_pairs, bucket_per_type
)
logger = logging.getLogger(__name__)
def graph_equivalence(ds1, ds2, prop_scores={}, threshold=70, **weight_dict):
"""This method returns a true/false value if two graphs are semantically equivalent.
Internally, it calls the graph_similarity function and compares it against the given
threshold value.
Args:
ds1: A DataStore object instance representing your graph
ds2: A DataStore object instance representing your graph
prop_scores: A dictionary that can hold individual property scores,
weights, contributing score, matching score and sum of weights.
threshold: A numerical value between 0 and 100 that sets the minimum
score required to call both graphs equivalent. This
value can be tuned.
weight_dict: A dictionary that can be used to override settings
in the similarity process
Returns:
bool: True if the result of the graph similarity is greater than or equal to
the threshold value. False otherwise.
Warning:
Object types need to have property weights defined for the similarity process.
Otherwise, those objects will not influence the final score. The WEIGHTS
dictionary under `stix2.equivalence.graph` can give you an idea on how to add
new entries and pass them via the `weight_dict` argument. Similarly, the values
or methods can be fine tuned for a particular use case.
Note:
Default weight_dict:
.. include:: ../../graph_default_sem_eq_weights.rst
Note:
This implementation follows the Semantic Equivalence Committee Note;
see `the Committee Note <link here>`__.
"""
similarity_result = graph_similarity(ds1, ds2, prop_scores, **weight_dict)
if similarity_result >= threshold:
return True
return False
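For illustration (not part of the diff), a minimal usage sketch for the new graph_equivalence entry point, assuming the stix2 v2.1 object API; the two one-object graphs are made up:

from stix2 import MemoryStore
from stix2.v21 import Malware
from stix2.equivalence.graph import graph_equivalence

# Two tiny, illustrative graphs: one Malware object each
ds1 = MemoryStore([Malware(name="Cryptolocker", is_family=False)])
ds2 = MemoryStore([Malware(name="CryptoLocker", is_family=False)])
prop_scores = {}
print(graph_equivalence(ds1, ds2, prop_scores, threshold=70))
print(prop_scores["matching_score"], prop_scores["len_pairs"])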
def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
"""This method verifies if two graphs are semantically equivalent.
"""This method returns a similarity score for two given graphs.
Each DataStore can contain a connected or disconnected graph, and the
final result is weighted over the number of objects that could be compared.
This approach builds on top of the object-based semantic equivalence process
This approach builds on top of the object-based similarity process
and each comparison can return a value between 0 and 100.
Args:
@@ -24,20 +65,20 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
prop_scores: A dictionary that can hold individual property scores,
weights, contributing score, matching score and sum of weights.
weight_dict: A dictionary that can be used to override settings
in the semantic equivalence process
in the similarity process
Returns:
float: A number between 0.0 and 100.0 as a measurement of equivalence.
float: A number between 0.0 and 100.0 as a measurement of similarity.
Warning:
Object types need to have property weights defined for the equivalence process.
Object types need to have property weights defined for the similarity process.
Otherwise, those objects will not influence the final score. The WEIGHTS
dictionary under `stix2.equivalence.graph` can give you an idea on how to add
new entries and pass them via the `weight_dict` argument. Similarly, the values
or methods can be fine tuned for a particular use case.
Note:
Default weights_dict:
Default weight_dict:
.. include:: ../../graph_default_sem_eq_weights.rst
@@ -47,12 +88,14 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
"""
results = {}
equivalence_score = 0
similarity_score = 0
weights = GRAPH_WEIGHTS.copy()
if weight_dict:
weights.update(weight_dict)
if weights["_internal"]["max_depth"] <= 0:
raise ValueError("weight_dict['_internal']['max_depth'] must be greater than 0")
depth = weights["_internal"]["max_depth"]
graph1 = bucket_per_type(ds1.query([]))
@@ -64,60 +107,46 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
iprop_score2 = {}
object1_id = object1["id"]
object2_id = object2["id"]
weights["_internal"]["max_depth"] = depth
weights["_internal"]["ds1"] = ds1
weights["_internal"]["ds2"] = ds2
result1 = object_similarity(object1, object2, iprop_score1, **weights)
weights["_internal"]["max_depth"] = depth
weights["_internal"]["ds1"] = ds2
weights["_internal"]["ds2"] = ds1
result2 = object_similarity(object2, object1, iprop_score2, **weights)
if object1_id not in results:
results[object1_id] = {"lhs": object1["id"], "rhs": object2["id"], "prop_score": iprop_score1, "value": result1}
results[object1_id] = {"lhs": object1_id, "rhs": object2_id, "prop_score": iprop_score1, "value": result1}
elif result1 > results[object1_id]["value"]:
results[object1_id] = {"lhs": object1["id"], "rhs": object2["id"], "prop_score": iprop_score1, "value": result1}
results[object1_id] = {"lhs": object1_id, "rhs": object2_id, "prop_score": iprop_score1, "value": result1}
if object2_id not in results:
results[object2_id] = {"lhs": object2["id"], "rhs": object1["id"], "prop_score": iprop_score2, "value": result2}
elif result1 > results[object2_id]["value"]:
results[object2_id] = {"lhs": object2["id"], "rhs": object1["id"], "prop_score": iprop_score2, "value": result2}
results[object2_id] = {"lhs": object2_id, "rhs": object1_id, "prop_score": iprop_score2, "value": result2}
elif result2 > results[object2_id]["value"]:
results[object2_id] = {"lhs": object2_id, "rhs": object1_id, "prop_score": iprop_score2, "value": result2}
matching_score = sum(x["value"] for x in results.values())
sum_weights = len(results)
if sum_weights > 0:
equivalence_score = matching_score / sum_weights
len_pairs = len(results)
if len_pairs > 0:
similarity_score = matching_score / len_pairs
prop_scores["matching_score"] = matching_score
prop_scores["sum_weights"] = sum_weights
prop_scores["len_pairs"] = len_pairs
prop_scores["summary"] = results
logger.debug(
"DONE\t\tSUM_WEIGHT: %.2f\tMATCHING_SCORE: %.2f\t SCORE: %.2f",
sum_weights,
"DONE\t\tSUM_PAIRS: %.2f\tMATCHING_SCORE: %.2f\t SIMILARITY_SCORE: %.2f",
len_pairs,
matching_score,
equivalence_score,
similarity_score,
)
return equivalence_score
return similarity_score
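Since the commit adds the max_depth guard above, here is a hedged sketch of overriding the internal settings; the key names beyond max_depth/ds1/ds2 are taken from the library defaults and should be treated as assumptions:

from stix2.equivalence.graph import graph_similarity

prop_scores = {}
weight_dict = {
    "_internal": {
        "ignore_spec_version": False,  # assumed default key
        "versioning_checks": False,    # assumed default key
        "ds1": None,  # graph_similarity fills these in per comparison
        "ds2": None,
        "max_depth": 3,  # must be > 0, otherwise ValueError is raised
    },
}
score = graph_similarity(ds1, ds2, prop_scores, **weight_dict)  # ds1/ds2 as above

Note that weights.update(weight_dict) replaces the whole "_internal" entry, so an override must supply every internal key, not just max_depth.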
def bucket_per_type(g):
buckets = collections.defaultdict(list)
[buckets[obj["type"]].append(obj) for obj in g]
return buckets
def object_pairs(g1, g2, w):
types_in_common = set(g1.keys()).intersection(g2.keys())
testable_types = types_in_common.intersection(w.keys())
return itertools.chain.from_iterable(
itertools.product(g1[stix_type], g2[stix_type])
for stix_type in testable_types
)
# default weights used for the graph semantic equivalence process
# default weights used for the graph similarity process
GRAPH_WEIGHTS = WEIGHTS.copy()
GRAPH_WEIGHTS.update({
"grouping": {

stix2/equivalence/object/__init__.py

@@ -1,4 +1,6 @@
"""Python APIs for STIX 2 Object-based Semantic Equivalence."""
"""Python APIs for STIX 2 Object-based Semantic Equivalence and Similarity."""
import collections
import itertools
import logging
import time
@@ -9,9 +11,52 @@ from ..pattern import equivalent_patterns
logger = logging.getLogger(__name__)
def object_equivalence(obj1, obj2, prop_scores={}, threshold=70, **weight_dict):
"""This method returns a true/false value if two objects are semantically equivalent.
Internally, it calls the object_similarity function and compares it against the given
threshold value.
Args:
obj1: A stix2 object instance
obj2: A stix2 object instance
prop_scores: A dictionary that can hold individual property scores,
weights, contributing score, matching score and sum of weights.
threshold: A numerical value between 0 and 100 that sets the minimum
score required to call both objects equivalent. This
value can be tuned.
weight_dict: A dictionary that can be used to override settings
in the similarity process
Returns:
bool: True if the result of the object similarity is greater than or equal to
the threshold value. False otherwise.
Warning:
Object types need to have property weights defined for the similarity process.
Otherwise, those objects will not influence the final score. The WEIGHTS
dictionary under `stix2.equivalence.object` can give you an idea on how to add
new entries and pass them via the `weight_dict` argument. Similarly, the values
or methods can be fine tuned for a particular use case.
Note:
Default weight_dict:
.. include:: ../../object_default_sem_eq_weights.rst
Note:
This implementation follows the Semantic Equivalence Committee Note;
see `the Committee Note <link here>`__.
"""
similarity_result = object_similarity(obj1, obj2, prop_scores, **weight_dict)
if similarity_result >= threshold:
return True
return False
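As with the graph version, a minimal usage sketch (the Campaign objects are illustrative):

from stix2.v21 import Campaign
from stix2.equivalence.object import object_equivalence

camp1 = Campaign(name="Green Group Attacks Against Finance")
camp2 = Campaign(name="Green Group Attacks Against Finance", aliases=["GreenGroup"])
prop_scores = {}
print(object_equivalence(camp1, camp2, prop_scores, threshold=70))  # True: names match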
def object_similarity(obj1, obj2, prop_scores={}, **weight_dict):
"""This method verifies if two objects of the same type are
semantically equivalent.
"""This method returns a measure of similarity depending on how
similar the two objects are.
Args:
obj1: A stix2 object instance
@@ -22,17 +67,17 @@ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict):
in the semantic equivalence process
Returns:
float: A number between 0.0 and 100.0 as a measurement of equivalence.
float: A number between 0.0 and 100.0 as a measurement of similarity.
Warning:
Object types need to have property weights defined for the equivalence process.
Object types need to have property weights defined for the similarity process.
Otherwise, those objects will not influence the final score. The WEIGHTS
dictionary under `stix2.equivalence.object` can give you an idea on how to add
new entries and pass them via the `weight_dict` argument. Similarly, the values
or methods can be fine tuned for a particular use case.
Note:
Default weights_dict:
Default weight_dict:
.. include:: ../../object_default_sem_eq_weights.rst
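A hedged sketch of a per-type override through weight_dict; this particular weighting is illustrative, not something the commit prescribes:

from stix2.equivalence.object import object_similarity, partial_string_based

# Score campaigns on their name alone; per-type weights should sum to 100
weight_dict = {
    "campaign": {
        "name": (100, partial_string_based),
    },
}
prop_scores = {}
score = object_similarity(camp1, camp2, prop_scores, **weight_dict)  # camp1/camp2 as above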
@@ -352,34 +397,31 @@ def list_reference_check(refs1, refs2, ds1, ds2, **weights):
The score influences the objects containing these references. The result is
weighted on the number of unique objects that could 1) be de-referenced 2) """
results = {}
if len(refs1) >= len(refs2):
l1 = refs1
l2 = refs2
b1 = ds1
b2 = ds2
else:
l1 = refs2
l2 = refs1
b1 = ds2
b2 = ds1
l1.sort()
l2.sort()
pairs = object_pairs(
bucket_per_type(refs1, "id-split"),
bucket_per_type(refs2, "id-split"),
weights
)
for ref1 in l1:
for ref2 in l2:
type1, type2 = ref1.split("--")[0], ref2.split("--")[0]
if type1 == type2:
score = reference_check(ref1, ref2, b1, b2, **weights) * 100.0
for ref1, ref2 in pairs:
type1, type2 = ref1.split("--")[0], ref2.split("--")[0]
if type1 == type2:
score = reference_check(ref1, ref2, ds1, ds2, **weights)
if ref1 not in results:
results[ref1] = {"matched": ref2, "value": score}
elif score > results[ref1]["value"]:
results[ref1] = {"matched": ref2, "value": score}
if ref1 not in results:
results[ref1] = {"matched": ref2, "value": score}
elif score > results[ref1]["value"]:
results[ref1] = {"matched": ref2, "value": score}
if ref2 not in results:
results[ref2] = {"matched": ref1, "value": score}
elif score > results[ref2]["value"]:
results[ref2] = {"matched": ref1, "value": score}
result = 0.0
total_sum = sum(x["value"] for x in results.values())
max_score = len(results) * 100.0
max_score = len(results)
if max_score > 0:
result = total_sum / max_score
@@ -391,7 +433,26 @@ def list_reference_check(refs1, refs2, ds1, ds2, **weights):
return result
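To make the weighting concrete, a small worked sketch of the final ratio; the values are made up and assume reference_check returns a fraction between 0 and 1:

# Best-match value recorded per unique reference (hypothetical scores)
results = {
    "malware--aaa": {"matched": "malware--ccc", "value": 0.8},
    "malware--ccc": {"matched": "malware--aaa", "value": 0.8},
    "indicator--bbb": {"matched": "indicator--ddd", "value": 0.4},
}
total_sum = sum(x["value"] for x in results.values())  # 2.0
result = total_sum / len(results)                      # ~0.67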
# default weights used for the semantic equivalence process
def bucket_per_type(g, mode="type"):
    """Given a list of objects or references, bucket them by STIX type."""
    buckets = collections.defaultdict(list)
    if mode == "type":
        for obj in g:
            buckets[obj["type"]].append(obj)
    elif mode == "id-split":
        for ref in g:
            buckets[ref.split("--")[0]].append(ref)
    return buckets
def object_pairs(g1, g2, w):
    """Returns a generator of candidate pairs: the cross product of the
    buckets whose type appears in both graphs and in the weights dictionary."""
    types_in_common = set(g1.keys()).intersection(g2.keys())
    testable_types = types_in_common.intersection(w.keys())
    return itertools.chain.from_iterable(
        itertools.product(g1[stix_type], g2[stix_type])
        for stix_type in testable_types
    )
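A quick sketch of what the two helpers produce together for reference lists; the identifiers are hypothetical and truncated for readability:

from stix2.equivalence.object import WEIGHTS, bucket_per_type, object_pairs

refs1 = ["malware--aaa", "indicator--bbb"]
refs2 = ["malware--ccc"]
pairs = list(object_pairs(
    bucket_per_type(refs1, mode="id-split"),
    bucket_per_type(refs2, mode="id-split"),
    WEIGHTS,
))
# [("malware--aaa", "malware--ccc")]: only types present on both sides
# and listed in the weights dictionary are paired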
# default weights used for the similarity process
WEIGHTS = {
"attack-pattern": {
"name": (30, partial_string_based),