diff --git a/stix2/environment.py b/stix2/environment.py
index 4dc6ff0..bc7fcaf 100644
--- a/stix2/environment.py
+++ b/stix2/environment.py
@@ -2,12 +2,12 @@
import copy
from .datastore import CompositeDataSource, DataStoreMixin
-from .equivalence.graph import graphically_equivalent
+from .equivalence.graph import graph_similarity
from .equivalence.object import ( # noqa: F401
WEIGHTS, check_property_present, custom_pattern_based, exact_match,
list_reference_check, partial_external_reference_based, partial_list_based,
partial_location_distance, partial_string_based, partial_timestamp_based,
- reference_check, semantically_equivalent,
+ reference_check, object_similarity,
)
from .parsing import parse as _parse
@@ -197,7 +197,7 @@ class Environment(DataStoreMixin):
return None
@staticmethod
- def semantically_equivalent(obj1, obj2, prop_scores={}, **weight_dict):
+ def object_similarity(obj1, obj2, prop_scores={}, **weight_dict):
"""This method verifies if two objects of the same type are
semantically equivalent.
@@ -229,10 +229,10 @@ class Environment(DataStoreMixin):
see `the Committee Note `__.
"""
- return semantically_equivalent(obj1, obj2, prop_scores, **weight_dict)
+ return object_similarity(obj1, obj2, prop_scores, **weight_dict)
@staticmethod
- def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict):
+ def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
"""This method verifies if two graphs are semantically equivalent.
Each DataStore can contain a connected or disconnected graph and the
final result is weighted over the amount of objects we managed to compare.
@@ -267,4 +267,4 @@ class Environment(DataStoreMixin):
see `the Committee Note `__.
"""
- return graphically_equivalent(ds1, ds2, prop_scores, **weight_dict)
+ return graph_similarity(ds1, ds2, prop_scores, **weight_dict)
diff --git a/stix2/equivalence/graph/__init__.py b/stix2/equivalence/graph/__init__.py
index 680f42f..cff99d0 100644
--- a/stix2/equivalence/graph/__init__.py
+++ b/stix2/equivalence/graph/__init__.py
@@ -1,15 +1,17 @@
"""Python APIs for STIX 2 Graph-based Semantic Equivalence."""
+import collections
+import itertools
import logging
from ..object import (
WEIGHTS, exact_match, list_reference_check, partial_string_based,
- partial_timestamp_based, reference_check, semantically_equivalent,
+ partial_timestamp_based, reference_check, object_similarity,
)
logger = logging.getLogger(__name__)
-def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict):
+def graph_similarity(ds1, ds2, prop_scores={}, **weight_dict):
"""This method verifies if two graphs are semantically equivalent.
Each DataStore can contain a connected or disconnected graph and the
final result is weighted over the amount of objects we managed to compare.
@@ -44,49 +46,48 @@ def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict):
see `the Committee Note `__.
"""
+ results = {}
+ equivalence_score = 0
weights = GRAPH_WEIGHTS.copy()
if weight_dict:
weights.update(weight_dict)
- results = {}
depth = weights["_internal"]["max_depth"]
- graph1 = ds1.query([])
- graph2 = ds2.query([])
+ graph1 = bucket_per_type(ds1.query([]))
+ graph2 = bucket_per_type(ds2.query([]))
+ pairs = object_pairs(graph1, graph2, weights)
- graph1.sort(key=lambda x: x["type"])
- graph2.sort(key=lambda x: x["type"])
-
- if len(graph1) < len(graph2):
+ for object1, object2 in pairs:
+ iprop_score1 = {}
+ iprop_score2 = {}
+ object1_id = object1["id"]
+ object2_id = object2["id"]
+ weights["_internal"]["max_depth"] = depth
weights["_internal"]["ds1"] = ds1
weights["_internal"]["ds2"] = ds2
- g1 = graph1
- g2 = graph2
- else:
+ result1 = object_similarity(object1, object2, iprop_score1, **weights)
+
weights["_internal"]["ds1"] = ds2
weights["_internal"]["ds2"] = ds1
- g1 = graph2
- g2 = graph1
+        # Restore the recursion budget before the reverse comparison. The per-pair
+        # reset above implies object_similarity() consumes max_depth
+        # (NOTE(review): confirm against object_similarity); without this reset the
+        # second call could run with less depth and score differently by direction.
+        weights["_internal"]["max_depth"] = depth
+        result2 = object_similarity(object2, object1, iprop_score2, **weights)
- for object1 in g1:
- for object2 in g2:
- if object1["type"] == object2["type"] and object1["type"] in weights:
- iprop_score = {}
- result = semantically_equivalent(object1, object2, iprop_score, **weights)
- objects1_id = object1["id"]
- weights["_internal"]["max_depth"] = depth
+ if object1_id not in results:
+ results[object1_id] = {"lhs": object1["id"], "rhs": object2["id"], "prop_score": iprop_score1, "value": result1}
+ elif result1 > results[object1_id]["value"]:
+ results[object1_id] = {"lhs": object1["id"], "rhs": object2["id"], "prop_score": iprop_score1, "value": result1}
- if objects1_id not in results:
- results[objects1_id] = {"matched": object2["id"], "prop_score": iprop_score, "value": result}
- elif result > results[objects1_id]["value"]:
- results[objects1_id] = {"matched": object2["id"], "prop_score": iprop_score, "value": result}
+        # The entry keyed by object2 holds the reverse-direction score, so it must be
+        # replaced only when result2 (not result1) beats the stored value.
+        if object2_id not in results:
+            results[object2_id] = {"lhs": object2["id"], "rhs": object1["id"], "prop_score": iprop_score2, "value": result2}
+        elif result2 > results[object2_id]["value"]:
+            results[object2_id] = {"lhs": object2["id"], "rhs": object1["id"], "prop_score": iprop_score2, "value": result2}
- equivalence_score = 0
matching_score = sum(x["value"] for x in results.values())
- sum_weights = len(results) * 100.0
+ sum_weights = len(results)
if sum_weights > 0:
- equivalence_score = (matching_score / sum_weights) * 100
+ equivalence_score = matching_score / sum_weights
+
prop_scores["matching_score"] = matching_score
prop_scores["sum_weights"] = sum_weights
prop_scores["summary"] = results
@@ -100,6 +101,22 @@ def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict):
return equivalence_score
+def bucket_per_type(g):
+    """Group the objects of a graph by their ``"type"`` value.
+
+    Args:
+        g: An iterable of objects that support ``obj["type"]`` lookup.
+
+    Returns:
+        A ``collections.defaultdict(list)`` mapping each type to the list of
+        objects of that type, in the order they were encountered.
+
+    """
+    buckets = collections.defaultdict(list)
+    # A plain loop: the appends are side effects, so a comprehension would only
+    # build and discard a list of ``None`` values.
+    for obj in g:
+        buckets[obj["type"]].append(obj)
+    return buckets
+
+
+def object_pairs(g1, g2, w):
+    """Lazily yield every same-type pair of objects drawn from two graphs.
+
+    Args:
+        g1: Mapping of type name to the objects of that type in the first graph.
+        g2: Mapping of type name to the objects of that type in the second graph.
+        w: Mapping whose keys name the types that can be compared.
+
+    Returns:
+        An iterator of ``(object_from_g1, object_from_g2)`` tuples covering the
+        cartesian product of each type present in ``g1``, ``g2`` and ``w``.
+
+    """
+    # Only types found in both graphs AND listed in the weights are comparable.
+    comparable = set(g1.keys()).intersection(g2.keys()).intersection(w.keys())
+    per_type_products = (
+        itertools.product(g1[kind], g2[kind])
+        for kind in comparable
+    )
+    return itertools.chain.from_iterable(per_type_products)
+
+
# default weights used for the graph semantic equivalence process
GRAPH_WEIGHTS = WEIGHTS.copy()
GRAPH_WEIGHTS.update({
diff --git a/stix2/equivalence/object/__init__.py b/stix2/equivalence/object/__init__.py
index 0225788..8b1ceaa 100644
--- a/stix2/equivalence/object/__init__.py
+++ b/stix2/equivalence/object/__init__.py
@@ -9,7 +9,7 @@ from ..pattern import equivalent_patterns
logger = logging.getLogger(__name__)
-def semantically_equivalent(obj1, obj2, prop_scores={}, **weight_dict):
+def object_similarity(obj1, obj2, prop_scores={}, **weight_dict):
"""This method verifies if two objects of the same type are
semantically equivalent.
@@ -312,7 +312,7 @@ def _versioned_checks(ref1, ref2, ds1, ds2, **weights):
if len(objects1) > 0 and len(objects2) > 0:
for o1 in objects1:
for o2 in objects2:
- result = semantically_equivalent(o1, o2, **weights)
+ result = object_similarity(o1, o2, **weights)
if ref1 not in results:
results[ref1] = {"matched": ref2, "value": result}
elif result > results[ref1]["value"]:
@@ -337,7 +337,7 @@ def reference_check(ref1, ref2, ds1, ds2, **weights):
else:
o1, o2 = ds1.get(ref1), ds2.get(ref2)
if o1 and o2:
- result = semantically_equivalent(o1, o2, **weights) / 100.0
+ result = object_similarity(o1, o2, **weights) / 100.0
logger.debug(
"--\t\treference_check '%s' '%s'\tresult: '%s'",
diff --git a/stix2/test/v21/test_environment.py b/stix2/test/v21/test_environment.py
index 0da01d1..5682ad1 100644
--- a/stix2/test/v21/test_environment.py
+++ b/stix2/test/v21/test_environment.py
@@ -429,7 +429,7 @@ def test_related_to_by_target(ds):
def test_semantic_equivalence_on_same_attack_pattern1():
ap1 = stix2.v21.AttackPattern(id=ATTACK_PATTERN_ID, **ATTACK_PATTERN_KWARGS)
ap2 = stix2.v21.AttackPattern(id=ATTACK_PATTERN_ID, **ATTACK_PATTERN_KWARGS)
- env = stix2.Environment().semantically_equivalent(ap1, ap2)
+ env = stix2.Environment().object_similarity(ap1, ap2)
assert round(env) == 100
@@ -445,14 +445,14 @@ def test_semantic_equivalence_on_same_attack_pattern2():
)
ap1 = stix2.v21.AttackPattern(id=ATTACK_PATTERN_ID, **ATTACK_KWARGS)
ap2 = stix2.v21.AttackPattern(id=ATTACK_PATTERN_ID, **ATTACK_KWARGS)
- env = stix2.Environment().semantically_equivalent(ap1, ap2)
+ env = stix2.Environment().object_similarity(ap1, ap2)
assert round(env) == 100
def test_semantic_equivalence_on_same_campaign1():
camp1 = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMPAIGN_KWARGS)
camp2 = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMPAIGN_KWARGS)
- env = stix2.Environment().semantically_equivalent(camp1, camp2)
+ env = stix2.Environment().object_similarity(camp1, camp2)
assert round(env) == 100
@@ -464,14 +464,14 @@ def test_semantic_equivalence_on_same_campaign2():
)
camp1 = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMP_KWARGS)
camp2 = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMP_KWARGS)
- env = stix2.Environment().semantically_equivalent(camp1, camp2)
+ env = stix2.Environment().object_similarity(camp1, camp2)
assert round(env) == 100
def test_semantic_equivalence_on_same_identity1():
iden1 = stix2.v21.Identity(id=IDENTITY_ID, **IDENTITY_KWARGS)
iden2 = stix2.v21.Identity(id=IDENTITY_ID, **IDENTITY_KWARGS)
- env = stix2.Environment().semantically_equivalent(iden1, iden2)
+ env = stix2.Environment().object_similarity(iden1, iden2)
assert round(env) == 100
@@ -483,14 +483,14 @@ def test_semantic_equivalence_on_same_identity2():
)
iden1 = stix2.v21.Identity(id=IDENTITY_ID, **IDEN_KWARGS)
iden2 = stix2.v21.Identity(id=IDENTITY_ID, **IDEN_KWARGS)
- env = stix2.Environment().semantically_equivalent(iden1, iden2)
+ env = stix2.Environment().object_similarity(iden1, iden2)
assert round(env) == 100
def test_semantic_equivalence_on_same_indicator():
ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS)
ind2 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS)
- env = stix2.Environment().semantically_equivalent(ind1, ind2)
+ env = stix2.Environment().object_similarity(ind1, ind2)
assert round(env) == 100
@@ -498,7 +498,7 @@ def test_semantic_equivalence_on_same_location1():
location_kwargs = dict(latitude=45, longitude=179)
loc1 = stix2.v21.Location(id=LOCATION_ID, **location_kwargs)
loc2 = stix2.v21.Location(id=LOCATION_ID, **location_kwargs)
- env = stix2.Environment().semantically_equivalent(loc1, loc2)
+ env = stix2.Environment().object_similarity(loc1, loc2)
assert round(env) == 100
@@ -511,7 +511,7 @@ def test_semantic_equivalence_on_same_location2():
)
loc1 = stix2.v21.Location(id=LOCATION_ID, **location_kwargs)
loc2 = stix2.v21.Location(id=LOCATION_ID, **location_kwargs)
- env = stix2.Environment().semantically_equivalent(loc1, loc2)
+ env = stix2.Environment().object_similarity(loc1, loc2)
assert round(env) == 100
@@ -519,21 +519,21 @@ def test_semantic_equivalence_location_with_no_latlong():
loc_kwargs = dict(country="US", administrative_area="US-DC")
loc1 = stix2.v21.Location(id=LOCATION_ID, **LOCATION_KWARGS)
loc2 = stix2.v21.Location(id=LOCATION_ID, **loc_kwargs)
- env = stix2.Environment().semantically_equivalent(loc1, loc2)
+ env = stix2.Environment().object_similarity(loc1, loc2)
assert round(env) != 100
def test_semantic_equivalence_on_same_malware():
malw1 = stix2.v21.Malware(id=MALWARE_ID, **MALWARE_KWARGS)
malw2 = stix2.v21.Malware(id=MALWARE_ID, **MALWARE_KWARGS)
- env = stix2.Environment().semantically_equivalent(malw1, malw2)
+ env = stix2.Environment().object_similarity(malw1, malw2)
assert round(env) == 100
def test_semantic_equivalence_on_same_threat_actor1():
ta1 = stix2.v21.ThreatActor(id=THREAT_ACTOR_ID, **THREAT_ACTOR_KWARGS)
ta2 = stix2.v21.ThreatActor(id=THREAT_ACTOR_ID, **THREAT_ACTOR_KWARGS)
- env = stix2.Environment().semantically_equivalent(ta1, ta2)
+ env = stix2.Environment().object_similarity(ta1, ta2)
assert round(env) == 100
@@ -545,21 +545,21 @@ def test_semantic_equivalence_on_same_threat_actor2():
)
ta1 = stix2.v21.ThreatActor(id=THREAT_ACTOR_ID, **THREAT_KWARGS)
ta2 = stix2.v21.ThreatActor(id=THREAT_ACTOR_ID, **THREAT_KWARGS)
- env = stix2.Environment().semantically_equivalent(ta1, ta2)
+ env = stix2.Environment().object_similarity(ta1, ta2)
assert round(env) == 100
def test_semantic_equivalence_on_same_tool():
tool1 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS)
tool2 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS)
- env = stix2.Environment().semantically_equivalent(tool1, tool2)
+ env = stix2.Environment().object_similarity(tool1, tool2)
assert round(env) == 100
def test_semantic_equivalence_on_same_vulnerability1():
vul1 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULNERABILITY_KWARGS)
vul2 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULNERABILITY_KWARGS)
- env = stix2.Environment().semantically_equivalent(vul1, vul2)
+ env = stix2.Environment().object_similarity(vul1, vul2)
assert round(env) == 100
@@ -584,7 +584,7 @@ def test_semantic_equivalence_on_same_vulnerability2():
)
vul1 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULN_KWARGS1)
vul2 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULN_KWARGS2)
- env = stix2.Environment().semantically_equivalent(vul1, vul2)
+ env = stix2.Environment().object_similarity(vul1, vul2)
assert round(env) == 0.0
@@ -640,7 +640,7 @@ def test_semantic_equivalence_on_unknown_object():
}
cust1 = stix2.parse(CUSTOM_KWARGS1, allow_custom=True)
cust2 = stix2.parse(CUSTOM_KWARGS2, allow_custom=True)
- env = stix2.Environment().semantically_equivalent(cust1, cust2, **weights)
+ env = stix2.Environment().object_similarity(cust1, cust2, **weights)
assert round(env) == 0
@@ -648,7 +648,7 @@ def test_semantic_equivalence_different_type_raises():
with pytest.raises(ValueError) as excinfo:
vul1 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULNERABILITY_KWARGS)
ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS)
- stix2.Environment().semantically_equivalent(vul1, ind1)
+ stix2.Environment().object_similarity(vul1, ind1)
assert str(excinfo.value) == "The objects to compare must be of the same type!"
@@ -661,7 +661,7 @@ def test_semantic_equivalence_different_spec_version_raises():
)
ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS)
ind2 = stix2.v20.Indicator(id=INDICATOR_ID, **V20_KWARGS)
- stix2.Environment().semantically_equivalent(ind1, ind2)
+ stix2.Environment().object_similarity(ind1, ind2)
assert str(excinfo.value) == "The objects to compare must be of the same spec version!"
@@ -686,7 +686,7 @@ def test_semantic_equivalence_zero_match():
}
ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS)
ind2 = stix2.v21.Indicator(id=INDICATOR_ID, **IND_KWARGS)
- env = stix2.Environment().semantically_equivalent(ind1, ind2, **weights)
+ env = stix2.Environment().object_similarity(ind1, ind2, **weights)
assert round(env) == 0
@@ -708,7 +708,7 @@ def test_semantic_equivalence_different_spec_version():
}
ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS)
ind2 = stix2.v20.Indicator(id=INDICATOR_ID, **IND_KWARGS)
- env = stix2.Environment().semantically_equivalent(ind1, ind2, **weights)
+ env = stix2.Environment().object_similarity(ind1, ind2, **weights)
assert round(env) == 0
@@ -800,7 +800,7 @@ def test_semantic_equivalence_exact_match():
def test_non_existent_config_for_object():
r1 = stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS)
r2 = stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS)
- assert stix2.Environment().semantically_equivalent(r1, r2) == 0.0
+ assert stix2.Environment().object_similarity(r1, r2) == 0.0
def custom_semantic_equivalence_method(obj1, obj2, **weights):
@@ -824,7 +824,7 @@ def test_semantic_equivalence_method_provided():
tool1 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS)
tool2 = stix2.v21.Tool(id=TOOL_ID, **TOOL2_KWARGS)
- env = stix2.Environment().semantically_equivalent(tool1, tool2, **weights)
+ env = stix2.Environment().object_similarity(tool1, tool2, **weights)
assert round(env) == 96
@@ -838,7 +838,7 @@ def test_semantic_equivalence_prop_scores():
tool1 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS)
tool2 = stix2.v21.Tool(id=TOOL_ID, **TOOL2_KWARGS)
- stix2.Environment().semantically_equivalent(tool1, tool2, prop_scores)
+ stix2.Environment().object_similarity(tool1, tool2, prop_scores)
assert len(prop_scores) == 4
assert round(prop_scores["matching_score"], 1) == 8.9
assert round(prop_scores["sum_weights"], 1) == 100.0
@@ -868,7 +868,7 @@ def test_semantic_equivalence_prop_scores_method_provided():
tool1 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS)
tool2 = stix2.v21.Tool(id=TOOL_ID, **TOOL2_KWARGS)
- env = stix2.Environment().semantically_equivalent(tool1, tool2, prop_scores, **weights)
+ env = stix2.Environment().object_similarity(tool1, tool2, prop_scores, **weights)
assert round(env) == 96
assert len(prop_scores) == 2
assert prop_scores["matching_score"] == 96.0
@@ -964,12 +964,19 @@ def test_graph_equivalence_with_filesystem_source(ds):
"max_depth": 1,
},
}
- prop_scores = {}
+ prop_scores1 = {}
+ prop_scores2 = {}
fs = stix2.FileSystemSource(FS_PATH)
- env = stix2.Environment().graphically_equivalent(fs, ds, prop_scores, **weights)
- assert round(env) == 24
- assert round(prop_scores["matching_score"]) == 122
- assert round(prop_scores["sum_weights"]) == 500
+ env = stix2.Environment().graph_similarity(fs, ds, prop_scores1, **weights)
+ assert round(env) == 26
+ assert round(prop_scores1["matching_score"]) == 460
+ assert round(prop_scores1["sum_weights"]) == 18
+
+ env = stix2.Environment().graph_similarity(ds, fs, prop_scores2, **weights)
+ assert round(env) == 47
+ assert round(prop_scores2["matching_score"]) == 852
+ assert round(prop_scores2["sum_weights"]) == 18
+ assert prop_scores1 == prop_scores2
def test_graph_equivalence_with_duplicate_graph(ds):
@@ -981,10 +988,10 @@ def test_graph_equivalence_with_duplicate_graph(ds):
},
}
prop_scores = {}
- env = stix2.Environment().graphically_equivalent(ds, ds, prop_scores, **weights)
+ env = stix2.Environment().graph_similarity(ds, ds, prop_scores, **weights)
assert round(env) == 100
assert round(prop_scores["matching_score"]) == 800
- assert round(prop_scores["sum_weights"]) == 800
+ assert round(prop_scores["sum_weights"]) == 8
def test_graph_equivalence_with_versioning_check_on(ds2, ds):
@@ -996,10 +1003,10 @@ def test_graph_equivalence_with_versioning_check_on(ds2, ds):
},
}
prop_scores = {}
- env = stix2.Environment().graphically_equivalent(ds, ds2, prop_scores, **weights)
+ env = stix2.Environment().graph_similarity(ds, ds2, prop_scores, **weights)
assert round(env) == 93
assert round(prop_scores["matching_score"]) == 745
- assert round(prop_scores["sum_weights"]) == 800
+ assert round(prop_scores["sum_weights"]) == 8
def test_graph_equivalence_with_versioning_check_off(ds2, ds):
@@ -1011,7 +1018,7 @@ def test_graph_equivalence_with_versioning_check_off(ds2, ds):
},
}
prop_scores = {}
- env = stix2.Environment().graphically_equivalent(ds, ds2, prop_scores, **weights)
+ env = stix2.Environment().graph_similarity(ds, ds2, prop_scores, **weights)
assert round(env) == 93
assert round(prop_scores["matching_score"]) == 745
- assert round(prop_scores["sum_weights"]) == 800
+ assert round(prop_scores["sum_weights"]) == 8