WIP: changes to both similarity functions, expose settings

pull/1/head
Emmanuelle Vargas-Gonzalez 2021-02-26 19:19:33 -05:00
parent 173575205a
commit f9a52eeed3
3 changed files with 123 additions and 83 deletions

View File

@ -189,7 +189,8 @@ class Environment(DataStoreMixin):
return None return None
@staticmethod @staticmethod
def object_similarity(obj1, obj2, prop_scores={}, **weight_dict): def object_similarity(obj1, obj2, prop_scores={}, ignore_spec_version=False,
versioning_checks=False, max_depth=1, **weight_dict):
"""This method returns a measure of how similar the two objects are. """This method returns a measure of how similar the two objects are.
Args: Args:
@ -220,10 +221,12 @@ class Environment(DataStoreMixin):
see `the Committee Note <link here>`__. see `the Committee Note <link here>`__.
""" """
return object_similarity(obj1, obj2, prop_scores, **weight_dict) return object_similarity(obj1, obj2, prop_scores, ignore_spec_version,
versioning_checks, max_depth, **weight_dict)
@staticmethod @staticmethod
def object_equivalence(obj1, obj2, prop_scores={}, threshold=70, **weight_dict): def object_equivalence(obj1, obj2, prop_scores={}, threshold=70, ignore_spec_version=False,
versioning_checks=False, max_depth=1, **weight_dict):
"""This method returns a true/false value if two objects are semantically equivalent. """This method returns a true/false value if two objects are semantically equivalent.
Internally, it calls the object_similarity function and compares it against the given Internally, it calls the object_similarity function and compares it against the given
threshold value. threshold value.
@ -263,7 +266,8 @@ class Environment(DataStoreMixin):
return object_equivalence(obj1, obj2, prop_scores, threshold, **weight_dict) return object_equivalence(obj1, obj2, prop_scores, threshold, **weight_dict)
@staticmethod @staticmethod
def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): def graph_similarity(ds1, ds2, prop_scores={}, ignore_spec_version=False,
versioning_checks=False, max_depth=1, **weight_dict):
"""This method returns a similarity score for two given graphs. """This method returns a similarity score for two given graphs.
Each DataStore can contain a connected or disconnected graph and the Each DataStore can contain a connected or disconnected graph and the
final result is weighted over the amount of objects we managed to compare. final result is weighted over the amount of objects we managed to compare.
@ -298,10 +302,12 @@ class Environment(DataStoreMixin):
see `the Committee Note <link here>`__. see `the Committee Note <link here>`__.
""" """
return graph_similarity(ds1, ds2, prop_scores, **weight_dict) return graph_similarity(ds1, ds2, prop_scores, ignore_spec_version,
versioning_checks, max_depth, **weight_dict)
@staticmethod @staticmethod
def graph_equivalence(ds1, ds2, prop_scores={}, threshold=70, **weight_dict): def graph_equivalence(ds1, ds2, prop_scores={}, threshold=70, ignore_spec_version=False,
versioning_checks=False, max_depth=1, **weight_dict):
"""This method returns a true/false value if two graphs are semantically equivalent. """This method returns a true/false value if two graphs are semantically equivalent.
Internally, it calls the graph_similarity function and compares it against the given Internally, it calls the graph_similarity function and compares it against the given
threshold value. threshold value.

View File

@ -53,7 +53,8 @@ def graph_equivalence(ds1, ds2, prop_scores={}, threshold=70, **weight_dict):
return False return False
def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): def graph_similarity(ds1, ds2, prop_scores={}, ignore_spec_version=False,
versioning_checks=False, max_depth=1, **weight_dict):
"""This method returns a similarity score for two given graphs. """This method returns a similarity score for two given graphs.
Each DataStore can contain a connected or disconnected graph and the Each DataStore can contain a connected or disconnected graph and the
final result is weighted over the amount of objects we managed to compare. final result is weighted over the amount of objects we managed to compare.
@ -65,6 +66,9 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
ds2: A DataStore object instance representing your graph ds2: A DataStore object instance representing your graph
prop_scores: A dictionary that can hold individual property scores, prop_scores: A dictionary that can hold individual property scores,
weights, contributing score, matching score and sum of weights. weights, contributing score, matching score and sum of weights.
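ignore_spec_version: A boolean flag (default False) forwarded to every
object_similarity call; when True, a difference in spec version between
the two compared objects is meant to be ignored.
versioning_checks: A boolean flag (default False); when True, reference
checks compare every version of the referenced objects found in the
DataStores and keep the highest similarity score.
max_depth: A positive integer (default 1) limiting how many levels of
references are followed while de-referencing. A value of 0 or less
raises ValueError.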
weight_dict: A dictionary that can be used to override settings weight_dict: A dictionary that can be used to override settings
in the similarity process in the similarity process
@ -90,13 +94,21 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
""" """
results = {} results = {}
similarity_score = 0 similarity_score = 0
weights = GRAPH_WEIGHTS.copy() weights = WEIGHTS.copy()
if weight_dict: if weight_dict:
weights.update(weight_dict) weights.update(weight_dict)
weights["_internal"] = {
"ignore_spec_version": ignore_spec_version,
"versioning_checks": versioning_checks,
"ds1": ds1,
"ds2": ds2,
"max_depth": max_depth,
}
if weights["_internal"]["max_depth"] <= 0: if weights["_internal"]["max_depth"] <= 0:
raise ValueError("weight_dict['_internal']['max_depth'] must be greater than 0") raise ValueError("'max_depth' must be greater than 0")
pairs = _object_pairs( pairs = _object_pairs(
_bucket_per_type(ds1.query([])), _bucket_per_type(ds1.query([])),
@ -104,16 +116,15 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
weights, weights,
) )
weights["_internal"]["ds1"] = ds1
weights["_internal"]["ds2"] = ds2
logger.debug("Starting graph similarity process between DataStores: '%s' and '%s'", ds1.id, ds2.id) logger.debug("Starting graph similarity process between DataStores: '%s' and '%s'", ds1.id, ds2.id)
for object1, object2 in pairs: for object1, object2 in pairs:
iprop_score = {} iprop_score = {}
object1_id = object1["id"] object1_id = object1["id"]
object2_id = object2["id"] object2_id = object2["id"]
result = object_similarity(object1, object2, iprop_score, **weights) result = object_similarity(object1, object2, iprop_score, ds1, ds2,
ignore_spec_version, versioning_checks,
max_depth, **weights)
if object1_id not in results: if object1_id not in results:
results[object1_id] = {"lhs": object1_id, "rhs": object2_id, "prop_score": iprop_score, "value": result} results[object1_id] = {"lhs": object1_id, "rhs": object2_id, "prop_score": iprop_score, "value": result}
@ -141,40 +152,3 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
similarity_score, similarity_score,
) )
return similarity_score return similarity_score
# Default weights used for the graph similarity process.
#
# Starts from a copy of the object-level WEIGHTS table and layers on entries
# for the object types whose comparison follows references (reference_check /
# list_reference_check). Each property maps to a
# (weight, comparison function) tuple; other keys in a type's entry, such as
# "tdelta", are extra arguments for the comparison functions.
GRAPH_WEIGHTS = WEIGHTS.copy()
GRAPH_WEIGHTS.update({
"grouping": {
"name": (20, partial_string_based),
"context": (20, partial_string_based),
"object_refs": (60, list_reference_check),
},
"relationship": {
"relationship_type": (20, exact_match),
"source_ref": (40, reference_check),
"target_ref": (40, reference_check),
},
"report": {
"name": (30, partial_string_based),
"published": (10, partial_timestamp_based),
"object_refs": (60, list_reference_check),
"tdelta": 1, # One day interval
},
"sighting": {
"first_seen": (5, partial_timestamp_based),
"last_seen": (5, partial_timestamp_based),
"sighting_of_ref": (40, reference_check),
"observed_data_refs": (20, list_reference_check),
"where_sighted_refs": (20, list_reference_check),
"summary": (10, exact_match),
},
# Settings read by the similarity functions; these are not property weights.
# ds1/ds2 are filled in by graph_similarity with the DataStores being compared.
"_internal": {
"ignore_spec_version": False,
"versioning_checks": False,
"ds1": None,
"ds2": None,
"max_depth": 1,
},
}) # :autodoc-skip:

View File

@ -4,7 +4,7 @@ import itertools
import logging import logging
import time import time
from ...datastore import Filter from ...datastore import Filter, DataStoreMixin, DataSink, DataSource
from ...utils import STIXdatetime, parse_into_datetime from ...utils import STIXdatetime, parse_into_datetime
from ..pattern import equivalent_patterns from ..pattern import equivalent_patterns
@ -54,7 +54,9 @@ def object_equivalence(obj1, obj2, prop_scores={}, threshold=70, **weight_dict):
return False return False
def object_similarity(obj1, obj2, prop_scores={}, **weight_dict): def object_similarity(obj1, obj2, prop_scores={}, ds1=None, ds2=None,
ignore_spec_version=False, versioning_checks=False,
max_depth=1, **weight_dict):
"""This method returns a measure of similarity depending on how """This method returns a measure of similarity depending on how
similar the two objects are. similar the two objects are.
@ -63,6 +65,11 @@ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict):
obj2: A stix2 object instance obj2: A stix2 object instance
prop_scores: A dictionary that can hold individual property scores, prop_scores: A dictionary that can hold individual property scores,
weights, contributing score, matching score and sum of weights. weights, contributing score, matching score and sum of weights.
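ds1: Optional DataStore used to de-reference the *_ref / *_refs
properties of obj1. When no usable DataStores are provided, reference
properties are compared by value (exact_match / partial_list_based).
ds2: Optional DataStore used to de-reference the *_ref / *_refs
properties of obj2. See ds1 for the fallback when it is not provided.
ignore_spec_version: A boolean flag (default False); when True, a
difference in spec version between obj1 and obj2 is meant to be ignored.
versioning_checks: A boolean flag (default False); when True, reference
checks compare every version of the referenced objects found in the
DataStores and keep the highest similarity score.
max_depth: An integer (default 1) giving how many levels of references
may still be followed; once it reaches 0, reference properties are
skipped to prevent excessive recursion.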
weight_dict: A dictionary that can be used to override settings weight_dict: A dictionary that can be used to override settings
in the similarity process in the similarity process
@ -91,6 +98,14 @@ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict):
if weight_dict: if weight_dict:
weights.update(weight_dict) weights.update(weight_dict)
weights["_internal"] = {
"ignore_spec_version": ignore_spec_version,
"versioning_checks": versioning_checks,
"ds1": ds1,
"ds2": ds2,
"max_depth": max_depth,
}
type1, type2 = obj1["type"], obj2["type"] type1, type2 = obj1["type"], obj2["type"]
ignore_spec_version = weights["_internal"]["ignore_spec_version"] ignore_spec_version = weights["_internal"]["ignore_spec_version"]
@ -117,6 +132,7 @@ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict):
if check_property_present(prop, obj1, obj2): if check_property_present(prop, obj1, obj2):
w = weights[type1][prop][0] w = weights[type1][prop][0]
comp_funct = weights[type1][prop][1] comp_funct = weights[type1][prop][1]
prop_scores[prop] = {}
if comp_funct == partial_timestamp_based: if comp_funct == partial_timestamp_based:
contributing_score = w * comp_funct(obj1[prop], obj2[prop], weights[type1]["tdelta"]) contributing_score = w * comp_funct(obj1[prop], obj2[prop], weights[type1]["tdelta"])
@ -124,24 +140,30 @@ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict):
threshold = weights[type1]["threshold"] threshold = weights[type1]["threshold"]
contributing_score = w * comp_funct(obj1["latitude"], obj1["longitude"], obj2["latitude"], obj2["longitude"], threshold) contributing_score = w * comp_funct(obj1["latitude"], obj1["longitude"], obj2["latitude"], obj2["longitude"], threshold)
elif comp_funct == reference_check or comp_funct == list_reference_check: elif comp_funct == reference_check or comp_funct == list_reference_check:
max_depth = weights["_internal"]["max_depth"] max_depth_i = weights["_internal"]["max_depth"]
if max_depth > 0: if max_depth_i > 0:
weights["_internal"]["max_depth"] = max_depth - 1 weights["_internal"]["max_depth"] = max_depth_i - 1
ds1, ds2 = weights["_internal"]["ds1"], weights["_internal"]["ds2"] ds1, ds2 = weights["_internal"]["ds1"], weights["_internal"]["ds2"]
contributing_score = w * comp_funct(obj1[prop], obj2[prop], ds1, ds2, **weights) if _datastore_check(ds1, ds2):
contributing_score = w * comp_funct(obj1[prop], obj2[prop], ds1, ds2, **weights)
elif comp_funct == reference_check:
comp_funct = exact_match
contributing_score = w * comp_funct(obj1[prop], obj2[prop])
elif comp_funct == list_reference_check:
comp_funct = partial_list_based
contributing_score = w * comp_funct(obj1[prop], obj2[prop])
prop_scores[prop]["method"] = comp_funct.__name__
else: else:
continue # prevent excessive recursion continue # prevent excessive recursion
weights["_internal"]["max_depth"] = max_depth weights["_internal"]["max_depth"] = max_depth_i
else: else:
contributing_score = w * comp_funct(obj1[prop], obj2[prop]) contributing_score = w * comp_funct(obj1[prop], obj2[prop])
sum_weights += w sum_weights += w
matching_score += contributing_score matching_score += contributing_score
prop_scores[prop] = { prop_scores[prop]["weight"] = w
"weight": w, prop_scores[prop]["contributing_score"] = contributing_score
"contributing_score": contributing_score,
}
logger.debug("'%s' check -- weight: %s, contributing score: %s", prop, w, contributing_score) logger.debug("'%s' check -- weight: %s, contributing score: %s", prop, w, contributing_score)
prop_scores["matching_score"] = matching_score prop_scores["matching_score"] = matching_score
@ -196,7 +218,9 @@ def partial_timestamp_based(t1, t2, tdelta):
def partial_list_based(l1, l2): def partial_list_based(l1, l2):
"""Performs a partial list matching via finding the intersection between common values. """Performs a partial list matching via finding the intersection between
common values. Repeated values are counted only once. This method can be
used for *_refs equality checks when de-reference is not possible.
Args: Args:
l1: A list of values. l1: A list of values.
@ -213,7 +237,8 @@ def partial_list_based(l1, l2):
def exact_match(val1, val2): def exact_match(val1, val2):
"""Performs an exact value match based on two values """Performs an exact value match based on two values. This method can be
used for *_ref equality check when de-reference is not possible.
Args: Args:
val1: A value suitable for an equality test. val1: A value suitable for an equality test.
@ -275,15 +300,8 @@ def partial_external_reference_based(refs1, refs2):
allowed = {"veris", "cve", "capec", "mitre-attack"} allowed = {"veris", "cve", "capec", "mitre-attack"}
matches = 0 matches = 0
if len(refs1) >= len(refs2): for ext_ref1 in refs1:
l1 = refs1 for ext_ref2 in refs2:
l2 = refs2
else:
l1 = refs2
l2 = refs1
for ext_ref1 in l1:
for ext_ref2 in l2:
sn_match = False sn_match = False
ei_match = False ei_match = False
url_match = False url_match = False
@ -352,17 +370,21 @@ def _versioned_checks(ref1, ref2, ds1, ds2, **weights):
"""Checks multiple object versions if present in graph. """Checks multiple object versions if present in graph.
Maximizes for the similarity score of a particular version.""" Maximizes for the similarity score of a particular version."""
results = {} results = {}
objects1 = ds1.query([Filter("id", "=", ref1)])
objects2 = ds2.query([Filter("id", "=", ref2)])
pairs = _object_pairs( pairs = _object_pairs(
_bucket_per_type(objects1), _bucket_per_type(ds1.query([Filter("id", "=", ref1)])),
_bucket_per_type(objects2), _bucket_per_type(ds2.query([Filter("id", "=", ref2)])),
weights, weights,
) )
ignore_spec_version = weights["_internal"]["ignore_spec_version"]
versioning_checks = weights["_internal"]["versioning_checks"]
max_depth = weights["_internal"]["max_depth"]
for object1, object2 in pairs: for object1, object2 in pairs:
result = object_similarity(object1, object2, **weights) result = object_similarity(object1, object2, ds1=ds1, ds2=ds2,
ignore_spec_version=ignore_spec_version,
versioning_checks=versioning_checks,
max_depth=max_depth, **weights)
if ref1 not in results: if ref1 not in results:
results[ref1] = {"matched": ref2, "value": result} results[ref1] = {"matched": ref2, "value": result}
elif result > results[ref1]["value"]: elif result > results[ref1]["value"]:
@ -383,12 +405,18 @@ def reference_check(ref1, ref2, ds1, ds2, **weights):
result = 0.0 result = 0.0
if type1 == type2 and type1 in weights: if type1 == type2 and type1 in weights:
if weights["_internal"]["versioning_checks"]: ignore_spec_version = weights["_internal"]["ignore_spec_version"]
versioning_checks = weights["_internal"]["versioning_checks"]
max_depth = weights["_internal"]["max_depth"]
if versioning_checks:
result = _versioned_checks(ref1, ref2, ds1, ds2, **weights) / 100.0 result = _versioned_checks(ref1, ref2, ds1, ds2, **weights) / 100.0
else: else:
o1, o2 = ds1.get(ref1), ds2.get(ref2) o1, o2 = ds1.get(ref1), ds2.get(ref2)
if o1 and o2: if o1 and o2:
result = object_similarity(o1, o2, **weights) / 100.0 result = object_similarity(o1, o2, ds1=ds1, ds2=ds2,
ignore_spec_version=ignore_spec_version,
versioning_checks=versioning_checks,
max_depth=max_depth, **weights) / 100.0
logger.debug( logger.debug(
"--\t\treference_check '%s' '%s'\tresult: '%s'", "--\t\treference_check '%s' '%s'\tresult: '%s'",
@ -439,6 +467,13 @@ def list_reference_check(refs1, refs2, ds1, ds2, **weights):
return result return result
def _datastore_check(ds1, ds2):
    """Tell whether references can be de-referenced through ds1 and ds2.

    The reference comparison functions call ``get``/``query`` on *both*
    arguments, so de-referencing is only safe when both are datastore
    objects. Returning True when just one of them qualified let a missing
    (e.g. ``None``) datastore reach those calls and fail with an
    AttributeError instead of falling back to a by-value comparison.

    Args:
        ds1: Candidate datastore for the first object, or None.
        ds2: Candidate datastore for the second object, or None.

    Returns:
        bool: True if both ds1 and ds2 are instances of DataStoreMixin,
            DataSink or DataSource; False otherwise.

    """
    datastore_types = (DataStoreMixin, DataSink, DataSource)
    return isinstance(ds1, datastore_types) and isinstance(ds2, datastore_types)
def _bucket_per_type(graph, mode="type"): def _bucket_per_type(graph, mode="type"):
"""Given a list of objects or references, bucket them by type. """Given a list of objects or references, bucket them by type.
Depending on the list type: extract from 'type' property or using Depending on the list type: extract from 'type' property or using
@ -480,11 +515,20 @@ WEIGHTS = {
"name": (60, partial_string_based), "name": (60, partial_string_based),
"external_references": (40, partial_external_reference_based), "external_references": (40, partial_external_reference_based),
}, },
"grouping": {
"name": (20, partial_string_based),
"context": (20, partial_string_based),
"object_refs": (60, list_reference_check),
},
"identity": { "identity": {
"name": (60, partial_string_based), "name": (60, partial_string_based),
"identity_class": (20, exact_match), "identity_class": (20, exact_match),
"sectors": (20, partial_list_based), "sectors": (20, partial_list_based),
}, },
"incident": {
"name": (60, partial_string_based),
"external_references": (40, partial_external_reference_based),
},
"indicator": { "indicator": {
"indicator_types": (15, partial_list_based), "indicator_types": (15, partial_list_based),
"pattern": (80, custom_pattern_based), "pattern": (80, custom_pattern_based),
@ -511,6 +555,25 @@ WEIGHTS = {
"definition": (60, exact_match), "definition": (60, exact_match),
"definition_type": (20, exact_match), "definition_type": (20, exact_match),
}, },
"relationship": {
"relationship_type": (20, exact_match),
"source_ref": (40, reference_check),
"target_ref": (40, reference_check),
},
"report": {
"name": (30, partial_string_based),
"published": (10, partial_timestamp_based),
"object_refs": (60, list_reference_check),
"tdelta": 1, # One day interval
},
"sighting": {
"first_seen": (5, partial_timestamp_based),
"last_seen": (5, partial_timestamp_based),
"sighting_of_ref": (40, reference_check),
"observed_data_refs": (20, list_reference_check),
"where_sighted_refs": (20, list_reference_check),
"summary": (10, exact_match),
},
"threat-actor": { "threat-actor": {
"name": (60, partial_string_based), "name": (60, partial_string_based),
"threat_actor_types": (20, partial_list_based), "threat_actor_types": (20, partial_list_based),
@ -523,8 +586,5 @@ WEIGHTS = {
"vulnerability": { "vulnerability": {
"name": (30, partial_string_based), "name": (30, partial_string_based),
"external_references": (70, partial_external_reference_based), "external_references": (70, partial_external_reference_based),
}, }
"_internal": {
"ignore_spec_version": False,
},
} # :autodoc-skip: } # :autodoc-skip: