WIP: changes to both similarity functions, expose settings

2021-02-26 19:19:33 -05:00 · 2021-02-26 19:19:33 -05:00 · f9a52eeed3
parent 173575205a
commit f9a52eeed3
3 changed files with 123 additions and 83 deletions
--- a/stix2/environment.py
+++ b/stix2/environment.py
@ -189,7 +189,8 @@ class Environment(DataStoreMixin):
            return None

    @staticmethod
-    def object_similarity(obj1, obj2, prop_scores={}, **weight_dict):
+    def object_similarity(obj1, obj2, prop_scores={}, ignore_spec_version=False,
+                     versioning_checks=False, max_depth=1, **weight_dict):
        """This method returns a measure of how similar the two objects are.

        Args:
@ -220,10 +221,12 @@ class Environment(DataStoreMixin):
            see `the Committee Note <link here>`__.

        """
-        return object_similarity(obj1, obj2, prop_scores, **weight_dict)
+        return object_similarity(obj1, obj2, prop_scores, ignore_spec_version,
+                                versioning_checks, max_depth, **weight_dict)

    @staticmethod
-    def object_equivalence(obj1, obj2, prop_scores={}, threshold=70, **weight_dict):
+    def object_equivalence(obj1, obj2, prop_scores={}, threshold=70, ignore_spec_version=False,
+                     versioning_checks=False, max_depth=1, **weight_dict):
        """This method returns a true/false value if two objects are semantically equivalent.
        Internally, it calls the object_similarity function and compares it against the given
        threshold value.
@ -263,7 +266,8 @@ class Environment(DataStoreMixin):
        return object_equivalence(obj1, obj2, prop_scores, threshold, **weight_dict)

    @staticmethod
-    def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
+    def graph_similarity(ds1, ds2, prop_scores={}, ignore_spec_version=False,
+                     versioning_checks=False, max_depth=1, **weight_dict):
        """This method returns a similarity score for two given graphs.
        Each DataStore can contain a connected or disconnected graph and the
        final result is weighted over the amount of objects we managed to compare.
@ -298,10 +302,12 @@ class Environment(DataStoreMixin):
            see `the Committee Note <link here>`__.

        """
-        return graph_similarity(ds1, ds2, prop_scores, **weight_dict)
+        return graph_similarity(ds1, ds2, prop_scores, ignore_spec_version,
+                                versioning_checks, max_depth, **weight_dict)

    @staticmethod
-    def graph_equivalence(ds1, ds2, prop_scores={}, threshold=70, **weight_dict):
+    def graph_equivalence(ds1, ds2, prop_scores={}, threshold=70, ignore_spec_version=False,
+                     versioning_checks=False, max_depth=1, **weight_dict):
        """This method returns a true/false value if two graphs are semantically equivalent.
        Internally, it calls the graph_similarity function and compares it against the given
        threshold value.
--- a/stix2/equivalence/graph/__init__.py
+++ b/stix2/equivalence/graph/__init__.py
@ -53,7 +53,8 @@ def graph_equivalence(ds1, ds2, prop_scores={}, threshold=70, **weight_dict):
    return False


-def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
+def graph_similarity(ds1, ds2, prop_scores={}, ignore_spec_version=False,
+                     versioning_checks=False, max_depth=1, **weight_dict):
    """This method returns a similarity score for two given graphs.
    Each DataStore can contain a connected or disconnected graph and the
    final result is weighted over the amount of objects we managed to compare.
@ -65,6 +66,9 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
        ds2: A DataStore object instance representing your graph
        prop_scores: A dictionary that can hold individual property scores,
            weights, contributing score, matching score and sum of weights.
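+        ignore_spec_version: If True, do not require both objects to share a spec version.
+        versioning_checks: If True, compare all stored versions of referenced objects.
+        max_depth: Maximum recursion depth for reference checks; must be greater than 0.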
        weight_dict: A dictionary that can be used to override settings
            in the similarity process

@ -90,13 +94,21 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
    """
    results = {}
    similarity_score = 0
-    weights = GRAPH_WEIGHTS.copy()
+    weights = WEIGHTS.copy()

    if weight_dict:
        weights.update(weight_dict)

+    weights["_internal"] = {
+        "ignore_spec_version": ignore_spec_version,
+        "versioning_checks": versioning_checks,
+        "ds1": ds1,
+        "ds2": ds2,
+        "max_depth": max_depth,
+    }
+
    if weights["_internal"]["max_depth"] <= 0:
-        raise ValueError("weight_dict['_internal']['max_depth'] must be greater than 0")
+        raise ValueError("'max_depth' must be greater than 0")

    pairs = _object_pairs(
        _bucket_per_type(ds1.query([])),
@ -104,16 +116,15 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
        weights,
    )

-    weights["_internal"]["ds1"] = ds1
-    weights["_internal"]["ds2"] = ds2
-
    logger.debug("Starting graph similarity process between DataStores: '%s' and '%s'", ds1.id, ds2.id)
    for object1, object2 in pairs:
        iprop_score = {}
        object1_id = object1["id"]
        object2_id = object2["id"]

-        result = object_similarity(object1, object2, iprop_score, **weights)
+        result = object_similarity(object1, object2, iprop_score, ds1, ds2,
+                                   ignore_spec_version, versioning_checks,
+                                   max_depth, **weights)

        if object1_id not in results:
            results[object1_id] = {"lhs": object1_id, "rhs": object2_id, "prop_score": iprop_score, "value": result}
@ -141,40 +152,3 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
        similarity_score,
    )
    return similarity_score
-
-
-# default weights used for the graph similarity process
-GRAPH_WEIGHTS = WEIGHTS.copy()
-GRAPH_WEIGHTS.update({
-    "grouping": {
-        "name": (20, partial_string_based),
-        "context": (20, partial_string_based),
-        "object_refs": (60, list_reference_check),
-    },
-    "relationship": {
-        "relationship_type": (20, exact_match),
-        "source_ref": (40, reference_check),
-        "target_ref": (40, reference_check),
-    },
-    "report": {
-        "name": (30, partial_string_based),
-        "published": (10, partial_timestamp_based),
-        "object_refs": (60, list_reference_check),
-        "tdelta": 1,  # One day interval
-    },
-    "sighting": {
-        "first_seen": (5, partial_timestamp_based),
-        "last_seen": (5, partial_timestamp_based),
-        "sighting_of_ref": (40, reference_check),
-        "observed_data_refs": (20, list_reference_check),
-        "where_sighted_refs": (20, list_reference_check),
-        "summary": (10, exact_match),
-    },
-    "_internal": {
-        "ignore_spec_version": False,
-        "versioning_checks": False,
-        "ds1": None,
-        "ds2": None,
-        "max_depth": 1,
-    },
-})  # :autodoc-skip:
--- a/stix2/equivalence/object/__init__.py
+++ b/stix2/equivalence/object/__init__.py
@ -4,7 +4,7 @@ import itertools
 import logging
 import time

-from ...datastore import Filter
+from ...datastore import Filter, DataStoreMixin, DataSink, DataSource
 from ...utils import STIXdatetime, parse_into_datetime
 from ..pattern import equivalent_patterns

@ -54,7 +54,9 @@ def object_equivalence(obj1, obj2, prop_scores={}, threshold=70, **weight_dict):
    return False


-def object_similarity(obj1, obj2, prop_scores={}, **weight_dict):
+def object_similarity(obj1, obj2, prop_scores={}, ds1=None, ds2=None,
+                      ignore_spec_version=False, versioning_checks=False,
+                      max_depth=1, **weight_dict):
    """This method returns a measure of similarity depending on how
    similar the two objects are.

@ -63,6 +65,11 @@ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict):
        obj2: A stix2 object instance
        prop_scores: A dictionary that can hold individual property scores,
            weights, contributing score, matching score and sum of weights.
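+        ds1: Optional DataStore used to de-reference ids found in obj1.
+        ds2: Optional DataStore used to de-reference ids found in obj2.
+        ignore_spec_version: If True, do not require both objects to share a spec version.
+        versioning_checks: If True, compare all stored versions of referenced objects.
+        max_depth: Maximum recursion depth for reference checks; at 0 they are skipped.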
        weight_dict: A dictionary that can be used to override settings
            in the similarity process

@ -91,6 +98,14 @@ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict):
    if weight_dict:
        weights.update(weight_dict)

+    weights["_internal"] = {
+        "ignore_spec_version": ignore_spec_version,
+        "versioning_checks": versioning_checks,
+        "ds1": ds1,
+        "ds2": ds2,
+        "max_depth": max_depth,
+    }
+
    type1, type2 = obj1["type"], obj2["type"]
    ignore_spec_version = weights["_internal"]["ignore_spec_version"]

@ -117,6 +132,7 @@ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict):
                if check_property_present(prop, obj1, obj2):
                    w = weights[type1][prop][0]
                    comp_funct = weights[type1][prop][1]
+                    prop_scores[prop] = {}

                    if comp_funct == partial_timestamp_based:
                        contributing_score = w * comp_funct(obj1[prop], obj2[prop], weights[type1]["tdelta"])
@ -124,24 +140,30 @@ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict):
                        threshold = weights[type1]["threshold"]
                        contributing_score = w * comp_funct(obj1["latitude"], obj1["longitude"], obj2["latitude"], obj2["longitude"], threshold)
                    elif comp_funct == reference_check or comp_funct == list_reference_check:
-                        max_depth = weights["_internal"]["max_depth"]
-                        if max_depth > 0:
-                            weights["_internal"]["max_depth"] = max_depth - 1
+                        max_depth_i = weights["_internal"]["max_depth"]
+                        if max_depth_i > 0:
+                            weights["_internal"]["max_depth"] = max_depth_i - 1
                            ds1, ds2 = weights["_internal"]["ds1"], weights["_internal"]["ds2"]
-                            contributing_score = w * comp_funct(obj1[prop], obj2[prop], ds1, ds2, **weights)
+                            if _datastore_check(ds1, ds2):
+                                contributing_score = w * comp_funct(obj1[prop], obj2[prop], ds1, ds2, **weights)
+                            elif comp_funct == reference_check:
+                                comp_funct = exact_match
+                                contributing_score = w * comp_funct(obj1[prop], obj2[prop])
+                            elif comp_funct == list_reference_check:
+                                comp_funct = partial_list_based
+                                contributing_score = w * comp_funct(obj1[prop], obj2[prop])
+                            prop_scores[prop]["method"] = comp_funct.__name__
                        else:
                            continue  # prevent excessive recursion
-                        weights["_internal"]["max_depth"] = max_depth
+                        weights["_internal"]["max_depth"] = max_depth_i
                    else:
                        contributing_score = w * comp_funct(obj1[prop], obj2[prop])

                    sum_weights += w
                    matching_score += contributing_score

-                    prop_scores[prop] = {
-                        "weight": w,
-                        "contributing_score": contributing_score,
-                    }
+                    prop_scores[prop]["weight"] = w
+                    prop_scores[prop]["contributing_score"] = contributing_score
                    logger.debug("'%s' check -- weight: %s, contributing score: %s", prop, w, contributing_score)

            prop_scores["matching_score"] = matching_score
@ -196,7 +218,9 @@ def partial_timestamp_based(t1, t2, tdelta):


 def partial_list_based(l1, l2):
-    """Performs a partial list matching via finding the intersection between common values.
+    """Performs a partial list matching via finding the intersection between
+    common values. Repeated values are counted only once. This method can be
+    used for *_refs equality checks when de-referencing is not possible.

    Args:
        l1: A list of values.
@ -213,7 +237,8 @@ def partial_list_based(l1, l2):


 def exact_match(val1, val2):
-    """Performs an exact value match based on two values
+    """Performs an exact value match based on two values. This method can be
+    used for *_ref equality checks when de-referencing is not possible.

    Args:
        val1: A value suitable for an equality test.
@ -275,15 +300,8 @@ def partial_external_reference_based(refs1, refs2):
    allowed = {"veris", "cve", "capec", "mitre-attack"}
    matches = 0

-    if len(refs1) >= len(refs2):
-        l1 = refs1
-        l2 = refs2
-    else:
-        l1 = refs2
-        l2 = refs1
-
-    for ext_ref1 in l1:
-        for ext_ref2 in l2:
+    for ext_ref1 in refs1:
+        for ext_ref2 in refs2:
            sn_match = False
            ei_match = False
            url_match = False
@ -352,17 +370,21 @@ def _versioned_checks(ref1, ref2, ds1, ds2, **weights):
    """Checks multiple object versions if present in graph.
    Maximizes for the similarity score of a particular version."""
    results = {}
-    objects1 = ds1.query([Filter("id", "=", ref1)])
-    objects2 = ds2.query([Filter("id", "=", ref2)])

    pairs = _object_pairs(
-        _bucket_per_type(objects1),
-        _bucket_per_type(objects2),
+        _bucket_per_type(ds1.query([Filter("id", "=", ref1)])),
+        _bucket_per_type(ds2.query([Filter("id", "=", ref2)])),
        weights,
    )
+    ignore_spec_version = weights["_internal"]["ignore_spec_version"]
+    versioning_checks = weights["_internal"]["versioning_checks"]
+    max_depth = weights["_internal"]["max_depth"]

    for object1, object2 in pairs:
-        result = object_similarity(object1, object2, **weights)
+        result = object_similarity(object1, object2, ds1=ds1, ds2=ds2,
+                                   ignore_spec_version=ignore_spec_version,
+                                   versioning_checks=versioning_checks,
+                                   max_depth=max_depth, **weights)
        if ref1 not in results:
            results[ref1] = {"matched": ref2, "value": result}
        elif result > results[ref1]["value"]:
@ -383,12 +405,18 @@ def reference_check(ref1, ref2, ds1, ds2, **weights):
    result = 0.0

    if type1 == type2 and type1 in weights:
-        if weights["_internal"]["versioning_checks"]:
+        ignore_spec_version = weights["_internal"]["ignore_spec_version"]
+        versioning_checks = weights["_internal"]["versioning_checks"]
+        max_depth = weights["_internal"]["max_depth"]
+        if versioning_checks:
            result = _versioned_checks(ref1, ref2, ds1, ds2, **weights) / 100.0
        else:
            o1, o2 = ds1.get(ref1), ds2.get(ref2)
            if o1 and o2:
-                result = object_similarity(o1, o2, **weights) / 100.0
+                result = object_similarity(o1, o2, ds1=ds1, ds2=ds2,
+                                           ignore_spec_version=ignore_spec_version,
+                                           versioning_checks=versioning_checks,
+                                           max_depth=max_depth, **weights) / 100.0

    logger.debug(
        "--\t\treference_check '%s' '%s'\tresult: '%s'",
@ -439,6 +467,13 @@ def list_reference_check(refs1, refs2, ds1, ds2, **weights):
    return result


+def _datastore_check(ds1, ds2):
+    if (issubclass(ds1.__class__, (DataStoreMixin, DataSink, DataSource)) or
+            issubclass(ds2.__class__, (DataStoreMixin, DataSink, DataSource))):
+        return True
+    return False
+
+
 def _bucket_per_type(graph, mode="type"):
    """Given a list of objects or references, bucket them by type.
    Depending on the list type: extract from 'type' property or using
@ -480,11 +515,20 @@ WEIGHTS = {
        "name": (60, partial_string_based),
        "external_references": (40, partial_external_reference_based),
    },
+    "grouping": {
+        "name": (20, partial_string_based),
+        "context": (20, partial_string_based),
+        "object_refs": (60, list_reference_check),
+    },
    "identity": {
        "name": (60, partial_string_based),
        "identity_class": (20, exact_match),
        "sectors": (20, partial_list_based),
    },
+    "incident": {
+        "name": (60, partial_string_based),
+        "external_references": (40, partial_external_reference_based),
+    },
    "indicator": {
        "indicator_types": (15, partial_list_based),
        "pattern": (80, custom_pattern_based),
@ -511,6 +555,25 @@ WEIGHTS = {
        "definition": (60, exact_match),
        "definition_type": (20, exact_match),
    },
+    "relationship": {
+        "relationship_type": (20, exact_match),
+        "source_ref": (40, reference_check),
+        "target_ref": (40, reference_check),
+    },
+    "report": {
+        "name": (30, partial_string_based),
+        "published": (10, partial_timestamp_based),
+        "object_refs": (60, list_reference_check),
+        "tdelta": 1,  # One day interval
+    },
+    "sighting": {
+        "first_seen": (5, partial_timestamp_based),
+        "last_seen": (5, partial_timestamp_based),
+        "sighting_of_ref": (40, reference_check),
+        "observed_data_refs": (20, list_reference_check),
+        "where_sighted_refs": (20, list_reference_check),
+        "summary": (10, exact_match),
+    },
    "threat-actor": {
        "name": (60, partial_string_based),
        "threat_actor_types": (20, partial_list_based),
@ -523,8 +586,5 @@ WEIGHTS = {
    "vulnerability": {
        "name": (30, partial_string_based),
        "external_references": (70, partial_external_reference_based),
-    },
-    "_internal": {
-        "ignore_spec_version": False,
-    },
+    }
 }  # :autodoc-skip: