resolve issues with graph similarity

- new methods for graph equivalence and similarity
- remove sorting and len comparisons
- rename some variables
pull/1/head
Emmanuelle Vargas-Gonzalez 2021-02-16 00:57:26 -05:00
parent 489970718f
commit 02b076b3bb
2 changed files with 160 additions and 70 deletions

View File

@ -1,21 +1,62 @@
"""Python APIs for STIX 2 Graph-based Semantic Equivalence.""" """Python APIs for STIX 2 Graph-based Semantic Equivalence and Similarity."""
import collections
import itertools
import logging import logging
from ..object import ( from ..object import (
WEIGHTS, exact_match, list_reference_check, partial_string_based, WEIGHTS, exact_match, list_reference_check, partial_string_based,
partial_timestamp_based, reference_check, object_similarity, partial_timestamp_based, reference_check, object_similarity, object_pairs, bucket_per_type
) )
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def graph_equivalence(ds1, ds2, prop_scores=None, threshold=70, **weight_dict):
    """Return whether two graphs are semantically equivalent.

    Internally, this calls the graph_similarity function and compares its
    score against the given threshold value.

    Args:
        ds1: A DataStore object instance representing your graph
        ds2: A DataStore object instance representing your graph
        prop_scores: A dictionary that can hold individual property scores,
            weights, contributing score, matching score and sum of weights.
            When omitted, a fresh dictionary is used for this call only.
        threshold: A numerical value between 0 and 100 to determine the minimum
            score to result in successfully calling both graphs equivalent. This
            value can be tuned.
        weight_dict: A dictionary that can be used to override settings
            in the similarity process

    Returns:
        bool: True if the result of the graph similarity is greater than or equal to
            the threshold value. False otherwise.

    Warning:
        Object types need to have property weights defined for the similarity process.
        Otherwise, those objects will not influence the final score. The WEIGHTS
        dictionary under `stix2.equivalence.graph` can give you an idea on how to add
        new entries and pass them via the `weight_dict` argument. Similarly, the values
        or methods can be fine tuned for a particular use case.

    Note:
        Default weight_dict:

        .. include:: ../../graph_default_sem_eq_weights.rst

    Note:
        This implementation follows the Semantic Equivalence Committee Note.
        see `the Committee Note <link here>`__.

    """
    # A literal `{}` default would be created once and shared by every call;
    # the similarity step writes into this dictionary, so results from one
    # call would leak into the next.
    if prop_scores is None:
        prop_scores = {}
    similarity_result = graph_similarity(ds1, ds2, prop_scores, **weight_dict)
    return similarity_result >= threshold
def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict): def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
"""This method verifies if two graphs are semantically equivalent. """This method returns a similarity score for two given graphs.
Each DataStore can contain a connected or disconnected graph and the Each DataStore can contain a connected or disconnected graph and the
final result is weighted over the amount of objects we managed to compare. final result is weighted over the amount of objects we managed to compare.
This approach builds on top of the object-based semantic equivalence process This approach builds on top of the object-based similarity process
and each comparison can return a value between 0 and 100. and each comparison can return a value between 0 and 100.
Args: Args:
@ -24,20 +65,20 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
prop_scores: A dictionary that can hold individual property scores, prop_scores: A dictionary that can hold individual property scores,
weights, contributing score, matching score and sum of weights. weights, contributing score, matching score and sum of weights.
weight_dict: A dictionary that can be used to override settings weight_dict: A dictionary that can be used to override settings
in the semantic equivalence process in the similarity process
Returns: Returns:
float: A number between 0.0 and 100.0 as a measurement of equivalence. float: A number between 0.0 and 100.0 as a measurement of similarity.
Warning: Warning:
Object types need to have property weights defined for the equivalence process. Object types need to have property weights defined for the similarity process.
Otherwise, those objects will not influence the final score. The WEIGHTS Otherwise, those objects will not influence the final score. The WEIGHTS
dictionary under `stix2.equivalence.graph` can give you an idea on how to add dictionary under `stix2.equivalence.graph` can give you an idea on how to add
new entries and pass them via the `weight_dict` argument. Similarly, the values new entries and pass them via the `weight_dict` argument. Similarly, the values
or methods can be fine tuned for a particular use case. or methods can be fine tuned for a particular use case.
Note: Note:
Default weights_dict: Default weight_dict:
.. include:: ../../graph_default_sem_eq_weights.rst .. include:: ../../graph_default_sem_eq_weights.rst
@ -47,12 +88,14 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
""" """
results = {} results = {}
equivalence_score = 0 similarity_score = 0
weights = GRAPH_WEIGHTS.copy() weights = GRAPH_WEIGHTS.copy()
if weight_dict: if weight_dict:
weights.update(weight_dict) weights.update(weight_dict)
if weights["_internal"]["max_depth"] <= 0:
raise ValueError("weight_dict['_internal']['max_depth'] must be greater than 0")
depth = weights["_internal"]["max_depth"] depth = weights["_internal"]["max_depth"]
graph1 = bucket_per_type(ds1.query([])) graph1 = bucket_per_type(ds1.query([]))
@ -64,60 +107,46 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
iprop_score2 = {} iprop_score2 = {}
object1_id = object1["id"] object1_id = object1["id"]
object2_id = object2["id"] object2_id = object2["id"]
weights["_internal"]["max_depth"] = depth weights["_internal"]["max_depth"] = depth
weights["_internal"]["ds1"] = ds1 weights["_internal"]["ds1"] = ds1
weights["_internal"]["ds2"] = ds2 weights["_internal"]["ds2"] = ds2
result1 = object_similarity(object1, object2, iprop_score1, **weights) result1 = object_similarity(object1, object2, iprop_score1, **weights)
weights["_internal"]["max_depth"] = depth
weights["_internal"]["ds1"] = ds2 weights["_internal"]["ds1"] = ds2
weights["_internal"]["ds2"] = ds1 weights["_internal"]["ds2"] = ds1
result2 = object_similarity(object2, object1, iprop_score2, **weights) result2 = object_similarity(object2, object1, iprop_score2, **weights)
if object1_id not in results: if object1_id not in results:
results[object1_id] = {"lhs": object1["id"], "rhs": object2["id"], "prop_score": iprop_score1, "value": result1} results[object1_id] = {"lhs": object1_id, "rhs": object2_id, "prop_score": iprop_score1, "value": result1}
elif result1 > results[object1_id]["value"]: elif result1 > results[object1_id]["value"]:
results[object1_id] = {"lhs": object1["id"], "rhs": object2["id"], "prop_score": iprop_score1, "value": result1} results[object1_id] = {"lhs": object1_id, "rhs": object2_id, "prop_score": iprop_score1, "value": result1}
if object2_id not in results: if object2_id not in results:
results[object2_id] = {"lhs": object2["id"], "rhs": object1["id"], "prop_score": iprop_score2, "value": result2} results[object2_id] = {"lhs": object2_id, "rhs": object1_id, "prop_score": iprop_score2, "value": result2}
elif result1 > results[object2_id]["value"]: elif result2 > results[object2_id]["value"]:
results[object2_id] = {"lhs": object2["id"], "rhs": object1["id"], "prop_score": iprop_score2, "value": result2} results[object2_id] = {"lhs": object2_id, "rhs": object1_id, "prop_score": iprop_score2, "value": result2}
matching_score = sum(x["value"] for x in results.values()) matching_score = sum(x["value"] for x in results.values())
sum_weights = len(results) len_pairs = len(results)
if sum_weights > 0: if len_pairs > 0:
equivalence_score = matching_score / sum_weights similarity_score = matching_score / len_pairs
prop_scores["matching_score"] = matching_score prop_scores["matching_score"] = matching_score
prop_scores["sum_weights"] = sum_weights prop_scores["len_pairs"] = len_pairs
prop_scores["summary"] = results prop_scores["summary"] = results
logger.debug( logger.debug(
"DONE\t\tSUM_WEIGHT: %.2f\tMATCHING_SCORE: %.2f\t SCORE: %.2f", "DONE\t\tSUM_PAIRS: %.2f\tMATCHING_SCORE: %.2f\t SIMILARITY_SCORE: %.2f",
sum_weights, len_pairs,
matching_score, matching_score,
equivalence_score, similarity_score,
) )
return equivalence_score return similarity_score
# Group the objects yielded by `g` into lists keyed by each object's "type"
# value. Returns a collections.defaultdict(list) mapping type -> objects, with
# objects kept in the order they were encountered.
# NOTE(review): the trailing comment on the `def` line below reads like the
# header for the GRAPH_WEIGHTS definition further down, not a description of
# this function -- confirm where it belongs.
def bucket_per_type(g): # default weights used for the graph similarity process
buckets = collections.defaultdict(list)
# The list produced by this comprehension is discarded; only the appends matter.
[buckets[obj["type"]].append(obj) for obj in g]
return buckets
# Build the candidate pairs to compare between two bucketed graphs.
# g1, g2: mappings of type -> list of items (indexed by type below).
# w: mapping whose keys select which types are eligible -- presumably the
#    weights dictionary; verify against the caller.
# Returns a lazy iterator of (item_from_g1, item_from_g2) tuples covering, for
# every eligible type, the full cross product of both buckets.
def object_pairs(g1, g2, w):
# Only types present in both graphs can be paired.
types_in_common = set(g1.keys()).intersection(g2.keys())
# Of those, keep only the types that also appear as keys of `w`.
testable_types = types_in_common.intersection(w.keys())
return itertools.chain.from_iterable(
itertools.product(g1[stix_type], g2[stix_type])
for stix_type in testable_types
)
# default weights used for the graph semantic equivalence process
GRAPH_WEIGHTS = WEIGHTS.copy() GRAPH_WEIGHTS = WEIGHTS.copy()
GRAPH_WEIGHTS.update({ GRAPH_WEIGHTS.update({
"grouping": { "grouping": {

View File

@ -1,4 +1,6 @@
"""Python APIs for STIX 2 Object-based Semantic Equivalence.""" """Python APIs for STIX 2 Object-based Semantic Equivalence and Similarity."""
import collections
import itertools
import logging import logging
import time import time
@ -9,9 +11,52 @@ from ..pattern import equivalent_patterns
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def object_equivalence(obj1, obj2, prop_scores=None, threshold=70, **weight_dict):
    """Return whether two objects are semantically equivalent.

    Internally, this calls the object_similarity function and compares its
    score against the given threshold value.

    Args:
        obj1: A stix2 object instance
        obj2: A stix2 object instance
        prop_scores: A dictionary that can hold individual property scores,
            weights, contributing score, matching score and sum of weights.
            When omitted, a fresh dictionary is used for this call only.
        threshold: A numerical value between 0 and 100 to determine the minimum
            score to result in successfully calling both objects equivalent. This
            value can be tuned.
        weight_dict: A dictionary that can be used to override settings
            in the similarity process

    Returns:
        bool: True if the result of the object similarity is greater than or equal to
            the threshold value. False otherwise.

    Warning:
        Object types need to have property weights defined for the similarity process.
        Otherwise, those objects will not influence the final score. The WEIGHTS
        dictionary under `stix2.equivalence.object` can give you an idea on how to add
        new entries and pass them via the `weight_dict` argument. Similarly, the values
        or methods can be fine tuned for a particular use case.

    Note:
        Default weight_dict:

        .. include:: ../../object_default_sem_eq_weights.rst

    Note:
        This implementation follows the Semantic Equivalence Committee Note.
        see `the Committee Note <link here>`__.

    """
    # A literal `{}` default would be created once and shared by every call,
    # letting scores recorded during one comparison leak into the next.
    if prop_scores is None:
        prop_scores = {}
    similarity_result = object_similarity(obj1, obj2, prop_scores, **weight_dict)
    return similarity_result >= threshold
def object_similarity(obj1, obj2, prop_scores={}, **weight_dict): def object_similarity(obj1, obj2, prop_scores={}, **weight_dict):
"""This method verifies if two objects of the same type are """This method returns a measure of similarity depending on how
semantically equivalent. similar the two objects are.
Args: Args:
obj1: A stix2 object instance obj1: A stix2 object instance
@ -22,17 +67,17 @@ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict):
in the semantic equivalence process in the semantic equivalence process
Returns: Returns:
float: A number between 0.0 and 100.0 as a measurement of equivalence. float: A number between 0.0 and 100.0 as a measurement of similarity.
Warning: Warning:
Object types need to have property weights defined for the equivalence process. Object types need to have property weights defined for the similarity process.
Otherwise, those objects will not influence the final score. The WEIGHTS Otherwise, those objects will not influence the final score. The WEIGHTS
dictionary under `stix2.equivalence.object` can give you an idea on how to add dictionary under `stix2.equivalence.object` can give you an idea on how to add
new entries and pass them via the `weight_dict` argument. Similarly, the values new entries and pass them via the `weight_dict` argument. Similarly, the values
or methods can be fine tuned for a particular use case. or methods can be fine tuned for a particular use case.
Note: Note:
Default weights_dict: Default weight_dict:
.. include:: ../../object_default_sem_eq_weights.rst .. include:: ../../object_default_sem_eq_weights.rst
@ -352,34 +397,31 @@ def list_reference_check(refs1, refs2, ds1, ds2, **weights):
The score influences the objects containing these references. The result is The score influences the objects containing these references. The result is
weighted on the amount of unique objects that could 1) be de-referenced 2) """ weighted on the amount of unique objects that could 1) be de-referenced 2) """
results = {} results = {}
if len(refs1) >= len(refs2):
l1 = refs1
l2 = refs2
b1 = ds1
b2 = ds2
else:
l1 = refs2
l2 = refs1
b1 = ds2
b2 = ds1
l1.sort() pairs = object_pairs(
l2.sort() bucket_per_type(refs1, "id-split"),
bucket_per_type(refs2, "id-split"),
weights
)
for ref1 in l1: for ref1, ref2 in pairs:
for ref2 in l2: type1, type2 = ref1.split("--")[0], ref2.split("--")[0]
type1, type2 = ref1.split("--")[0], ref2.split("--")[0] if type1 == type2:
if type1 == type2: score = reference_check(ref1, ref2, ds1, ds2, **weights)
score = reference_check(ref1, ref2, b1, b2, **weights) * 100.0
if ref1 not in results: if ref1 not in results:
results[ref1] = {"matched": ref2, "value": score} results[ref1] = {"matched": ref2, "value": score}
elif score > results[ref1]["value"]: elif score > results[ref1]["value"]:
results[ref1] = {"matched": ref2, "value": score} results[ref1] = {"matched": ref2, "value": score}
if ref2 not in results:
results[ref2] = {"matched": ref1, "value": score}
elif score > results[ref2]["value"]:
results[ref2] = {"matched": ref1, "value": score}
result = 0.0 result = 0.0
total_sum = sum(x["value"] for x in results.values()) total_sum = sum(x["value"] for x in results.values())
max_score = len(results) * 100.0 max_score = len(results)
if max_score > 0: if max_score > 0:
result = total_sum / max_score result = total_sum / max_score
@ -391,7 +433,26 @@ def list_reference_check(refs1, refs2, ds1, ds2, **weights):
return result return result
def bucket_per_type(g, mode="type"):
    """Group the items of ``g`` into lists keyed by their type.

    Args:
        g: An iterable of items to group.
        mode: How the key of each item is obtained. With ``"type"`` (the
            default) every item is indexed as ``item["type"]``. With
            ``"id-split"`` every item is a string and the text before the
            first ``"--"`` is used as the key.

    Returns:
        collections.defaultdict: Maps each key to the list of items that
            produced it, in the order they were encountered. Any other
            ``mode`` value leaves the mapping empty.

    """
    buckets = collections.defaultdict(list)
    if mode == "type":
        for obj in g:
            buckets[obj["type"]].append(obj)
    elif mode == "id-split":
        for obj in g:
            buckets[obj.split("--")[0]].append(obj)
    return buckets
def object_pairs(g1, g2, w):
    """Build every candidate pair of items to compare between two graphs.

    Args:
        g1: A mapping of type to a list of items, e.g. as returned by
            bucket_per_type.
        g2: A mapping of type to a list of items, e.g. as returned by
            bucket_per_type.
        w: A mapping whose keys name the types that may be compared.

    Returns:
        iterator: A lazy iterator of ``(item_from_g1, item_from_g2)`` tuples.
            For each type that is a key of ``g1``, ``g2`` and ``w`` it yields
            the full cross product of both buckets. Types are visited in
            sorted order so the sequence of pairs is reproducible.

    """
    # Set iteration order for strings changes between interpreter runs
    # (hash randomization); sorting keeps the pair order stable.
    testable_types = sorted(set(g1).intersection(g2, w))
    return itertools.chain.from_iterable(
        itertools.product(g1[stix_type], g2[stix_type])
        for stix_type in testable_types
    )
# default weights used for the similarity process
WEIGHTS = { WEIGHTS = {
"attack-pattern": { "attack-pattern": {
"name": (30, partial_string_based), "name": (30, partial_string_based),