resolve issues with graph similarity
- new methods for graph equivalence and similarity - remove sorting and len comparisons - rename some variables
pull/1/head
parent
489970718f
commit
02b076b3bb
|
@ -1,21 +1,62 @@
|
||||||
"""Python APIs for STIX 2 Graph-based Semantic Equivalence."""
|
"""Python APIs for STIX 2 Graph-based Semantic Equivalence and Similarity."""
|
||||||
import collections
|
|
||||||
import itertools
|
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from ..object import (
|
from ..object import (
|
||||||
WEIGHTS, exact_match, list_reference_check, partial_string_based,
|
WEIGHTS, exact_match, list_reference_check, partial_string_based,
|
||||||
partial_timestamp_based, reference_check, object_similarity,
|
partial_timestamp_based, reference_check, object_similarity, object_pairs, bucket_per_type
|
||||||
)
|
)
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def graph_equivalence(ds1, ds2, prop_scores=None, threshold=70, **weight_dict):
    """Return whether two graphs are semantically equivalent.

    Internally, it calls the graph_similarity function and compares its
    result against the given threshold value.

    Args:
        ds1: A DataStore object instance representing your graph
        ds2: A DataStore object instance representing your graph
        prop_scores: A dictionary that can hold individual property scores,
            weights, contributing score, matching score and sum of weights.
            It is forwarded to graph_similarity, which fills it in. When
            omitted, a fresh dictionary is created for this call so that
            results are never shared between unrelated calls.
        threshold: A numerical value between 0 and 100 to determine the minimum
            score to result in successfully calling both graphs equivalent. This
            value can be tuned.
        weight_dict: A dictionary that can be used to override settings
            in the similarity process

    Returns:
        bool: True if the result of the graph similarity is greater than or equal to
            the threshold value. False otherwise.

    Warning:
        Object types need to have property weights defined for the similarity process.
        Otherwise, those objects will not influence the final score. The WEIGHTS
        dictionary under `stix2.equivalence.graph` can give you an idea on how to add
        new entries and pass them via the `weight_dict` argument. Similarly, the values
        or methods can be fine tuned for a particular use case.

    Note:
        Default weight_dict:

        .. include:: ../../graph_default_sem_eq_weights.rst

    Note:
        This implementation follows the Semantic Equivalence Committee Note.
        see `the Committee Note <link here>`__.

    """
    # A literal `{}` default would be one dictionary shared by every call
    # that omits the argument; build a new one per call instead.
    if prop_scores is None:
        prop_scores = {}

    similarity_result = graph_similarity(ds1, ds2, prop_scores, **weight_dict)
    return similarity_result >= threshold
|
||||||
|
|
||||||
|
|
||||||
def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
|
def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
|
||||||
"""This method verifies if two graphs are semantically equivalent.
|
"""This method returns a similarity score for two given graphs.
|
||||||
Each DataStore can contain a connected or disconnected graph and the
|
Each DataStore can contain a connected or disconnected graph and the
|
||||||
final result is weighted over the amount of objects we managed to compare.
|
final result is weighted over the amount of objects we managed to compare.
|
||||||
This approach builds on top of the object-based semantic equivalence process
|
This approach builds on top of the object-based similarity process
|
||||||
and each comparison can return a value between 0 and 100.
|
and each comparison can return a value between 0 and 100.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
@ -24,20 +65,20 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
|
||||||
prop_scores: A dictionary that can hold individual property scores,
|
prop_scores: A dictionary that can hold individual property scores,
|
||||||
weights, contributing score, matching score and sum of weights.
|
weights, contributing score, matching score and sum of weights.
|
||||||
weight_dict: A dictionary that can be used to override settings
|
weight_dict: A dictionary that can be used to override settings
|
||||||
in the semantic equivalence process
|
in the similarity process
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
float: A number between 0.0 and 100.0 as a measurement of equivalence.
|
float: A number between 0.0 and 100.0 as a measurement of similarity.
|
||||||
|
|
||||||
Warning:
|
Warning:
|
||||||
Object types need to have property weights defined for the equivalence process.
|
Object types need to have property weights defined for the similarity process.
|
||||||
Otherwise, those objects will not influence the final score. The WEIGHTS
|
Otherwise, those objects will not influence the final score. The WEIGHTS
|
||||||
dictionary under `stix2.equivalence.graph` can give you an idea on how to add
|
dictionary under `stix2.equivalence.graph` can give you an idea on how to add
|
||||||
new entries and pass them via the `weight_dict` argument. Similarly, the values
|
new entries and pass them via the `weight_dict` argument. Similarly, the values
|
||||||
or methods can be fine tuned for a particular use case.
|
or methods can be fine tuned for a particular use case.
|
||||||
|
|
||||||
Note:
|
Note:
|
||||||
Default weights_dict:
|
Default weight_dict:
|
||||||
|
|
||||||
.. include:: ../../graph_default_sem_eq_weights.rst
|
.. include:: ../../graph_default_sem_eq_weights.rst
|
||||||
|
|
||||||
|
@ -47,12 +88,14 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
results = {}
|
results = {}
|
||||||
equivalence_score = 0
|
similarity_score = 0
|
||||||
weights = GRAPH_WEIGHTS.copy()
|
weights = GRAPH_WEIGHTS.copy()
|
||||||
|
|
||||||
if weight_dict:
|
if weight_dict:
|
||||||
weights.update(weight_dict)
|
weights.update(weight_dict)
|
||||||
|
|
||||||
|
if weights["_internal"]["max_depth"] <= 0:
|
||||||
|
raise ValueError("weight_dict['_internal']['max_depth'] must be greater than 0")
|
||||||
depth = weights["_internal"]["max_depth"]
|
depth = weights["_internal"]["max_depth"]
|
||||||
|
|
||||||
graph1 = bucket_per_type(ds1.query([]))
|
graph1 = bucket_per_type(ds1.query([]))
|
||||||
|
@ -64,60 +107,46 @@ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
|
||||||
iprop_score2 = {}
|
iprop_score2 = {}
|
||||||
object1_id = object1["id"]
|
object1_id = object1["id"]
|
||||||
object2_id = object2["id"]
|
object2_id = object2["id"]
|
||||||
|
|
||||||
weights["_internal"]["max_depth"] = depth
|
weights["_internal"]["max_depth"] = depth
|
||||||
weights["_internal"]["ds1"] = ds1
|
weights["_internal"]["ds1"] = ds1
|
||||||
weights["_internal"]["ds2"] = ds2
|
weights["_internal"]["ds2"] = ds2
|
||||||
result1 = object_similarity(object1, object2, iprop_score1, **weights)
|
result1 = object_similarity(object1, object2, iprop_score1, **weights)
|
||||||
|
|
||||||
|
weights["_internal"]["max_depth"] = depth
|
||||||
weights["_internal"]["ds1"] = ds2
|
weights["_internal"]["ds1"] = ds2
|
||||||
weights["_internal"]["ds2"] = ds1
|
weights["_internal"]["ds2"] = ds1
|
||||||
result2 = object_similarity(object2, object1, iprop_score2, **weights)
|
result2 = object_similarity(object2, object1, iprop_score2, **weights)
|
||||||
|
|
||||||
if object1_id not in results:
|
if object1_id not in results:
|
||||||
results[object1_id] = {"lhs": object1["id"], "rhs": object2["id"], "prop_score": iprop_score1, "value": result1}
|
results[object1_id] = {"lhs": object1_id, "rhs": object2_id, "prop_score": iprop_score1, "value": result1}
|
||||||
elif result1 > results[object1_id]["value"]:
|
elif result1 > results[object1_id]["value"]:
|
||||||
results[object1_id] = {"lhs": object1["id"], "rhs": object2["id"], "prop_score": iprop_score1, "value": result1}
|
results[object1_id] = {"lhs": object1_id, "rhs": object2_id, "prop_score": iprop_score1, "value": result1}
|
||||||
|
|
||||||
if object2_id not in results:
|
if object2_id not in results:
|
||||||
results[object2_id] = {"lhs": object2["id"], "rhs": object1["id"], "prop_score": iprop_score2, "value": result2}
|
results[object2_id] = {"lhs": object2_id, "rhs": object1_id, "prop_score": iprop_score2, "value": result2}
|
||||||
elif result1 > results[object2_id]["value"]:
|
elif result2 > results[object2_id]["value"]:
|
||||||
results[object2_id] = {"lhs": object2["id"], "rhs": object1["id"], "prop_score": iprop_score2, "value": result2}
|
results[object2_id] = {"lhs": object2_id, "rhs": object1_id, "prop_score": iprop_score2, "value": result2}
|
||||||
|
|
||||||
matching_score = sum(x["value"] for x in results.values())
|
matching_score = sum(x["value"] for x in results.values())
|
||||||
sum_weights = len(results)
|
len_pairs = len(results)
|
||||||
if sum_weights > 0:
|
if len_pairs > 0:
|
||||||
equivalence_score = matching_score / sum_weights
|
similarity_score = matching_score / len_pairs
|
||||||
|
|
||||||
prop_scores["matching_score"] = matching_score
|
prop_scores["matching_score"] = matching_score
|
||||||
prop_scores["sum_weights"] = sum_weights
|
prop_scores["len_pairs"] = len_pairs
|
||||||
prop_scores["summary"] = results
|
prop_scores["summary"] = results
|
||||||
|
|
||||||
logger.debug(
|
logger.debug(
|
||||||
"DONE\t\tSUM_WEIGHT: %.2f\tMATCHING_SCORE: %.2f\t SCORE: %.2f",
|
"DONE\t\tSUM_PAIRS: %.2f\tMATCHING_SCORE: %.2f\t SIMILARITY_SCORE: %.2f",
|
||||||
sum_weights,
|
len_pairs,
|
||||||
matching_score,
|
matching_score,
|
||||||
equivalence_score,
|
similarity_score,
|
||||||
)
|
)
|
||||||
return equivalence_score
|
return similarity_score
|
||||||
|
|
||||||
|
|
||||||
def bucket_per_type(g):
|
# default weights used for the graph similarity process
|
||||||
buckets = collections.defaultdict(list)
|
|
||||||
[buckets[obj["type"]].append(obj) for obj in g]
|
|
||||||
return buckets
|
|
||||||
|
|
||||||
|
|
||||||
def object_pairs(g1, g2, w):
|
|
||||||
types_in_common = set(g1.keys()).intersection(g2.keys())
|
|
||||||
testable_types = types_in_common.intersection(w.keys())
|
|
||||||
|
|
||||||
return itertools.chain.from_iterable(
|
|
||||||
itertools.product(g1[stix_type], g2[stix_type])
|
|
||||||
for stix_type in testable_types
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# default weights used for the graph semantic equivalence process
|
|
||||||
GRAPH_WEIGHTS = WEIGHTS.copy()
|
GRAPH_WEIGHTS = WEIGHTS.copy()
|
||||||
GRAPH_WEIGHTS.update({
|
GRAPH_WEIGHTS.update({
|
||||||
"grouping": {
|
"grouping": {
|
||||||
|
|
|
@ -1,4 +1,6 @@
|
||||||
"""Python APIs for STIX 2 Object-based Semantic Equivalence."""
|
"""Python APIs for STIX 2 Object-based Semantic Equivalence and Similarity."""
|
||||||
|
import collections
|
||||||
|
import itertools
|
||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
@ -9,9 +11,52 @@ from ..pattern import equivalent_patterns
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def object_equivalence(obj1, obj2, prop_scores=None, threshold=70, **weight_dict):
    """Return whether two objects are semantically equivalent.

    Internally, it calls the object_similarity function and compares its
    result against the given threshold value.

    Args:
        obj1: A stix2 object instance
        obj2: A stix2 object instance
        prop_scores: A dictionary that can hold individual property scores,
            weights, contributing score, matching score and sum of weights.
            It is forwarded to object_similarity. When omitted, a fresh
            dictionary is created for this call so that no state is shared
            between unrelated calls.
        threshold: A numerical value between 0 and 100 to determine the minimum
            score to result in successfully calling both objects equivalent. This
            value can be tuned.
        weight_dict: A dictionary that can be used to override settings
            in the similarity process

    Returns:
        bool: True if the result of the object similarity is greater than or equal to
            the threshold value. False otherwise.

    Warning:
        Object types need to have property weights defined for the similarity process.
        Otherwise, those objects will not influence the final score. The WEIGHTS
        dictionary under `stix2.equivalence.object` can give you an idea on how to add
        new entries and pass them via the `weight_dict` argument. Similarly, the values
        or methods can be fine tuned for a particular use case.

    Note:
        Default weight_dict:

        .. include:: ../../object_default_sem_eq_weights.rst

    Note:
        This implementation follows the Semantic Equivalence Committee Note.
        see `the Committee Note <link here>`__.

    """
    # A literal `{}` default would be one dictionary shared by every call
    # that omits the argument; build a new one per call instead.
    if prop_scores is None:
        prop_scores = {}

    similarity_result = object_similarity(obj1, obj2, prop_scores, **weight_dict)
    return similarity_result >= threshold
|
||||||
|
|
||||||
|
|
||||||
def object_similarity(obj1, obj2, prop_scores={}, **weight_dict):
|
def object_similarity(obj1, obj2, prop_scores={}, **weight_dict):
|
||||||
"""This method verifies if two objects of the same type are
|
"""This method returns a measure of similarity depending on how
|
||||||
semantically equivalent.
|
similar the two objects are.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
obj1: A stix2 object instance
|
obj1: A stix2 object instance
|
||||||
|
@ -22,17 +67,17 @@ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict):
|
||||||
in the semantic equivalence process
|
in the semantic equivalence process
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
float: A number between 0.0 and 100.0 as a measurement of equivalence.
|
float: A number between 0.0 and 100.0 as a measurement of similarity.
|
||||||
|
|
||||||
Warning:
|
Warning:
|
||||||
Object types need to have property weights defined for the equivalence process.
|
Object types need to have property weights defined for the similarity process.
|
||||||
Otherwise, those objects will not influence the final score. The WEIGHTS
|
Otherwise, those objects will not influence the final score. The WEIGHTS
|
||||||
dictionary under `stix2.equivalence.object` can give you an idea on how to add
|
dictionary under `stix2.equivalence.object` can give you an idea on how to add
|
||||||
new entries and pass them via the `weight_dict` argument. Similarly, the values
|
new entries and pass them via the `weight_dict` argument. Similarly, the values
|
||||||
or methods can be fine tuned for a particular use case.
|
or methods can be fine tuned for a particular use case.
|
||||||
|
|
||||||
Note:
|
Note:
|
||||||
Default weights_dict:
|
Default weight_dict:
|
||||||
|
|
||||||
.. include:: ../../object_default_sem_eq_weights.rst
|
.. include:: ../../object_default_sem_eq_weights.rst
|
||||||
|
|
||||||
|
@ -352,34 +397,31 @@ def list_reference_check(refs1, refs2, ds1, ds2, **weights):
|
||||||
The score influences the objects containing these references. The result is
|
The score influences the objects containing these references. The result is
|
||||||
weighted on the amount of unique objects that could 1) be de-referenced 2) """
|
weighted on the amount of unique objects that could 1) be de-referenced 2) """
|
||||||
results = {}
|
results = {}
|
||||||
if len(refs1) >= len(refs2):
|
|
||||||
l1 = refs1
|
|
||||||
l2 = refs2
|
|
||||||
b1 = ds1
|
|
||||||
b2 = ds2
|
|
||||||
else:
|
|
||||||
l1 = refs2
|
|
||||||
l2 = refs1
|
|
||||||
b1 = ds2
|
|
||||||
b2 = ds1
|
|
||||||
|
|
||||||
l1.sort()
|
pairs = object_pairs(
|
||||||
l2.sort()
|
bucket_per_type(refs1, "id-split"),
|
||||||
|
bucket_per_type(refs2, "id-split"),
|
||||||
|
weights
|
||||||
|
)
|
||||||
|
|
||||||
for ref1 in l1:
|
for ref1, ref2 in pairs:
|
||||||
for ref2 in l2:
|
type1, type2 = ref1.split("--")[0], ref2.split("--")[0]
|
||||||
type1, type2 = ref1.split("--")[0], ref2.split("--")[0]
|
if type1 == type2:
|
||||||
if type1 == type2:
|
score = reference_check(ref1, ref2, ds1, ds2, **weights)
|
||||||
score = reference_check(ref1, ref2, b1, b2, **weights) * 100.0
|
|
||||||
|
|
||||||
if ref1 not in results:
|
if ref1 not in results:
|
||||||
results[ref1] = {"matched": ref2, "value": score}
|
results[ref1] = {"matched": ref2, "value": score}
|
||||||
elif score > results[ref1]["value"]:
|
elif score > results[ref1]["value"]:
|
||||||
results[ref1] = {"matched": ref2, "value": score}
|
results[ref1] = {"matched": ref2, "value": score}
|
||||||
|
|
||||||
|
if ref2 not in results:
|
||||||
|
results[ref2] = {"matched": ref1, "value": score}
|
||||||
|
elif score > results[ref2]["value"]:
|
||||||
|
results[ref2] = {"matched": ref1, "value": score}
|
||||||
|
|
||||||
result = 0.0
|
result = 0.0
|
||||||
total_sum = sum(x["value"] for x in results.values())
|
total_sum = sum(x["value"] for x in results.values())
|
||||||
max_score = len(results) * 100.0
|
max_score = len(results)
|
||||||
|
|
||||||
if max_score > 0:
|
if max_score > 0:
|
||||||
result = total_sum / max_score
|
result = total_sum / max_score
|
||||||
|
@ -391,7 +433,26 @@ def list_reference_check(refs1, refs2, ds1, ds2, **weights):
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
# default weights used for the semantic equivalence process
|
def bucket_per_type(g, mode="type"):
    """Group the items of an iterable into lists keyed by their type.

    Args:
        g: An iterable of items to group.
        mode: How the grouping key is obtained from each item.
            ``"type"`` (the default) reads the item's ``"type"`` entry.
            ``"id-split"`` treats each item as a string and uses the text
            before its first ``"--"`` separator.

    Returns:
        collections.defaultdict: Maps each key to the list of items that
            produced it, in the order they were encountered. Any other
            ``mode`` value leaves the mapping empty.

    """
    buckets = collections.defaultdict(list)

    # Plain loops: the appends are side effects, so a list comprehension
    # would only build a throwaway list of None values.
    if mode == "type":
        for obj in g:
            buckets[obj["type"]].append(obj)
    elif mode == "id-split":
        for obj in g:
            buckets[obj.split("--")[0]].append(obj)

    return buckets
|
||||||
|
|
||||||
|
|
||||||
|
def object_pairs(g1, g2, w):
    """Yield every same-type pairing of items from two bucketed collections.

    Only types that appear as keys in ``g1``, ``g2`` and ``w`` are paired;
    anything else is ignored.

    Args:
        g1: A mapping of type name to a list of items.
        g2: A mapping of type name to a list of items.
        w: A mapping whose keys name the types that may be compared.

    Returns:
        An iterator of ``(item_from_g1, item_from_g2)`` tuples covering the
        Cartesian product of the two lists for each shared type. Types are
        visited in sorted order so the sequence of pairs is reproducible
        between runs.

    """
    # Sorting removes the dependence on set iteration order, which varies
    # between interpreter runs for string keys.
    testable_types = sorted(set(g1).intersection(g2, w))

    return itertools.chain.from_iterable(
        itertools.product(g1[stix_type], g2[stix_type])
        for stix_type in testable_types
    )
|
||||||
|
|
||||||
|
|
||||||
|
# default weights used for the similarity process
|
||||||
WEIGHTS = {
|
WEIGHTS = {
|
||||||
"attack-pattern": {
|
"attack-pattern": {
|
||||||
"name": (30, partial_string_based),
|
"name": (30, partial_string_based),
|
||||||
|
|
Loading…
Reference in New Issue