docstring changes, _versioned_checks changes

pull/1/head
Emmanuelle Vargas-Gonzalez 2021-02-18 10:37:34 -05:00
parent fa6978969b
commit 99453770cf
3 changed files with 23 additions and 19 deletions

View File

@@ -1,4 +1,4 @@
- """Python APIs for STIX 2 Semantic Equivalence.
+ """Python APIs for STIX 2 Semantic Equivalence and Similarity.
.. autosummary::
:toctree: equivalence

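For orientation, a minimal sketch (not part of this commit) of the object-level API this module exposes. Malware carries a default weights entry in this era of the library, so only the properties both objects share (here, "name") contribute to the score:

from stix2.v21 import Malware
from stix2.equivalence.object import object_similarity

malware1 = Malware(name="Cryptolocker", is_family=False)
malware2 = Malware(name="CryptoLocker", is_family=False)

prop_scores = {}  # filled in place with per-property contributions
score = object_similarity(malware1, malware2, prop_scores)
print(score, prop_scores)  # a 0-100 score plus the breakdown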
View File

@@ -97,7 +97,6 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
if weights["_internal"]["max_depth"] <= 0:
raise ValueError("weight_dict['_internal']['max_depth'] must be greater than 0")
- depth = weights["_internal"]["max_depth"]
pairs = _object_pairs(
_bucket_per_type(ds1.query([])),
@@ -108,13 +107,13 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
weights["_internal"]["ds1"] = ds1
weights["_internal"]["ds2"] = ds2
logger.debug("Starting graph similarity process between DataStores: '%s' and '%s'", ds1.id, ds2.id)
for object1, object2 in pairs:
iprop_score = {}
object1_id = object1["id"]
object2_id = object2["id"]
result = object_similarity(object1, object2, iprop_score, **weights)
- weights["_internal"]["max_depth"] = depth
if object1_id not in results:
results[object1_id] = {"lhs": object1_id, "rhs": object2_id, "prop_score": iprop_score, "value": result}

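A hedged sketch of the graph-level entry point changed above, reusing the two Malware objects from the earlier sketch. With no weight_dict override, the library defaults apply, including the "_internal"/"max_depth" setting that this commit stops re-saving per comparison pair:

from stix2 import MemoryStore
from stix2.equivalence.graph import graph_similarity

ds1 = MemoryStore([malware1])  # one producer's object graph
ds2 = MemoryStore([malware2])  # the other producer's object graph

prop_scores = {}
score = graph_similarity(ds1, ds2, prop_scores)  # buckets objects per type,
print(score)                                     # pairs them, scores each pair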
View File

@@ -103,13 +103,13 @@ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict):
try:
weights[type1]
except KeyError:
- logger.warning("'%s' type has no 'weights' dict specified & thus no semantic equivalence method to call!", type1)
+ logger.warning("'%s' type has no 'weights' dict specified & thus no object similarity method to call!", type1)
sum_weights = matching_score = 0
else:
try:
method = weights[type1]["method"]
except KeyError:
- logger.debug("Starting semantic equivalence process between: '%s' and '%s'", obj1["id"], obj2["id"])
+ logger.debug("Starting object similarity process between: '%s' and '%s'", obj1["id"], obj2["id"])
matching_score = 0.0
sum_weights = 0.0
@@ -129,9 +129,9 @@ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict):
weights["_internal"]["max_depth"] = max_depth - 1
ds1, ds2 = weights["_internal"]["ds1"], weights["_internal"]["ds2"]
contributing_score = w * comp_funct(obj1[prop], obj2[prop], ds1, ds2, **weights)
- weights["_internal"]["max_depth"] = max_depth + 1
else:
continue # prevent excessive recursion
+ weights["_internal"]["max_depth"] = max_depth
else:
contributing_score = w * comp_funct(obj1[prop], obj2[prop])
@@ -148,7 +148,7 @@ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict):
prop_scores["sum_weights"] = sum_weights
logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights)
else:
- logger.debug("Starting semantic equivalence process between: '%s' and '%s'", obj1["id"], obj2["id"])
+ logger.debug("Starting object similarity process between: '%s' and '%s'", obj1["id"], obj2["id"])
try:
matching_score, sum_weights = method(obj1, obj2, prop_scores, **weights[type1])
except TypeError:
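The else-branch above dispatches to a per-type "method" callable when one is present in the weights entry. A hedged sketch with a hypothetical scorer; note the callable must swallow extra keyword arguments, since object_similarity passes **weights[type1] (which still contains "method") back into the call:

def exact_name_method(obj1, obj2, prop_scores, **weights):
    # Hypothetical all-or-nothing scorer; returns (matching_score, sum_weights).
    matching_score = 100.0 if obj1["name"] == obj2["name"] else 0.0
    prop_scores["name"] = matching_score
    return matching_score, 100.0

score = object_similarity(
    malware1, malware2, {},
    malware={"method": exact_name_method},  # replaces the default malware entry
)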
@@ -350,19 +350,24 @@ def partial_location_distance(lat1, long1, lat2, long2, threshold):
def _versioned_checks(ref1, ref2, ds1, ds2, **weights):
"""Checks multiple object versions if present in graph.
- Maximizes for the semantic equivalence score of a particular version."""
+ Maximizes for the similarity score of a particular version."""
results = {}
objects1 = ds1.query([Filter("id", "=", ref1)])
objects2 = ds2.query([Filter("id", "=", ref2)])
- if len(objects1) > 0 and len(objects2) > 0:
- for o1 in objects1:
- for o2 in objects2:
- result = object_similarity(o1, o2, **weights)
- if ref1 not in results:
- results[ref1] = {"matched": ref2, "value": result}
- elif result > results[ref1]["value"]:
- results[ref1] = {"matched": ref2, "value": result}
+ pairs = _object_pairs(
+ _bucket_per_type(objects1),
+ _bucket_per_type(objects2),
+ weights,
+ )
+ for object1, object2 in pairs:
+ result = object_similarity(object1, object2, **weights)
+ if ref1 not in results:
+ results[ref1] = {"matched": ref2, "value": result}
+ elif result > results[ref1]["value"]:
+ results[ref1] = {"matched": ref2, "value": result}
result = results.get(ref1, {}).get("value", 0.0)
logger.debug(
"--\t\t_versioned_checks '%s' '%s'\tresult: '%s'",
@@ -372,8 +377,8 @@ def _versioned_checks(ref1, ref2, ds1, ds2, **weights):
def reference_check(ref1, ref2, ds1, ds2, **weights):
"""For two references, de-reference the object and perform object-based
semantic equivalence. The score influences the result of an edge check."""
"""For two references, de-reference the object and perform object_similarity.
The score influences the result of an edge check."""
type1, type2 = ref1.split("--")[0], ref2.split("--")[0]
result = 0.0
@@ -394,7 +399,7 @@ def list_reference_check(refs1, refs2, ds1, ds2, **weights):
def list_reference_check(refs1, refs2, ds1, ds2, **weights):
"""For objects that contain multiple references (i.e., object_refs) perform
- the same de-reference procedure and perform object-based semantic equivalence.
+ the same de-reference procedure and perform object_similarity.
The score influences the objects containing these references. The result is
weighted on the amount of unique objects that could 1) be de-referenced 2) """
results = {}
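An end-to-end way to exercise list_reference_check, as a hedged sketch assuming the default graph weights of this era wire report's "object_refs" property to it: comparing two reports then de-references and scores the objects each report cites. This reuses MemoryStore, graph_similarity, and the Malware objects from the earlier sketches.

from stix2.v21 import Report

report1 = Report(name="Campaign X", published="2021-02-18T00:00:00Z",
                 object_refs=[malware1.id])
report2 = Report(name="Campaign X", published="2021-02-18T00:00:00Z",
                 object_refs=[malware2.id])

score = graph_similarity(
    MemoryStore([report1, malware1]),
    MemoryStore([report2, malware2]),
    {},
)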