From 93aa709b684d3b3b3a483107a02c9ad9aa4c69d3 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Fri, 26 Jul 2019 16:01:45 -0400 Subject: [PATCH 01/23] write down some of the semantic-equivalence work. WIP --- setup.py | 1 + stix2/environment.py | 124 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+) diff --git a/setup.py b/setup.py index 07de2a4..185c76c 100644 --- a/setup.py +++ b/setup.py @@ -63,5 +63,6 @@ setup( }, extras_require={ 'taxii': ['taxii2-client'], + 'semantic': ['pyjarowinkler'], }, ) diff --git a/stix2/environment.py b/stix2/environment.py index 104fdb2..8049c0d 100644 --- a/stix2/environment.py +++ b/stix2/environment.py @@ -186,3 +186,127 @@ class Environment(DataStoreMixin): return self.get(creator_id) else: return None + + def semantically_equivalent(self, obj1, obj2): + """This method is meant to verify if two objects of the same type are + semantically equivalent. + + Args: + obj1: A stix2 object instance + obj2: A stix2 object instance + + Returns: + float: A number between 0.0 and 1.0 as a measurement of equivalence. + + Warnings: + Not all objects are supported. + + Notes: + This implementation follows the Committee Note on semantic equivalence. + see `the Committee Note `__. 
+ + """ + equivalence_score = 0.0 + type1, type2 = obj1["type"], obj2["type"] + + if type1 != type2: + raise ValueError('The objects to compare must be of the same type!') + + if obj1.get("spec_version", "") != obj2.get("spec_version", ""): + raise ValueError('The objects to compare must be of the same spec version!') + + if type1 == "attack-pattern": + if _check_property_present("name", obj1, obj2): + _partial_string_based(obj1["name"], obj2["name"]) + if _check_property_present("external_references", obj1, obj2): + _partial_external_reference_based(obj1["external_references"], obj2["external_references"]) + elif type1 == "campaign": + if _check_property_present("name", obj1, obj2): + _partial_string_based(obj1["name"], obj2["name"]) + if _check_property_present("aliases", obj1, obj2): + _partial_list_based(obj1["aliases"], obj2["aliases"]) + elif type1 == "course-of-action": + pass + elif type1 == "identity": + if _check_property_present("name", obj1, obj2): + _exact_match(obj1["name"], obj2["name"]) + if _check_property_present("identity_class", obj1, obj2): + _exact_match(obj1["identity_class"], obj2["identity_class"]) + if _check_property_present("sectors", obj1, obj2): + _partial_list_based(obj1["sectors"], obj2["sectors"]) + elif type1 == "indicator": + if _check_property_present("indicator_types", obj1, obj2): + _partial_list_based(obj1["indicator_types"], obj2["indicator_types"]) + if _check_property_present("pattern", obj1, obj2): + pass # TODO: needs to be done + if _check_property_present("valid_from", obj1, obj2): + _partial_timestamp_based(obj1["valid_from"], obj2["valid_from"]) + elif type1 == "instrusion-set": + pass + elif type1 == "location": + pass + elif type1 == "malware": + if _check_property_present("malware_types", obj1, obj2): + _partial_list_based(obj1["malware_types"], obj2["malware_types"]) + if _check_property_present("name", obj1, obj2): + _partial_string_based(obj1["name"], obj2["name"]) + elif type1 == "observed-data": + pass + elif 
type1 == "report": + pass + elif type1 == "threat-actor": + if _check_property_present("name", obj1, obj2): + _partial_string_based(obj1["name"], obj2["name"]) + if _check_property_present("threat_actor_types", obj1, obj2): + _partial_list_based(obj1["threat_actor_types"], obj2["threat_actor_types"]) + if _check_property_present("aliases", obj1, obj2): + _partial_list_based(obj1["aliases"], obj2["aliases"]) + elif type1 == "tool": + if _check_property_present("tool_types", obj1, obj2): + _partial_list_based(obj1["tool_types"], obj2["tool_types"]) + if _check_property_present("name", obj1, obj2): + _partial_string_based(obj1["name"], obj2["name"]) + elif type1 == "vulnerability": + if _check_property_present("name", obj1, obj2): + _partial_string_based(obj1["name"], obj2["name"]) + if _check_property_present("external_references", obj1, obj2): + _partial_external_reference_based(obj1["external_references"], obj2["external_references"]) + # TODO: need to actually calculate the value + return equivalence_score + + +def _check_property_present(prop, obj1, obj2): + if prop in obj1 and prop in obj2: + return True + return False + + +def _partial_timestamp_based(t1, t2): + from .utils import parse_into_datetime + tdelta = 1 # One day... 
+ stix_t1, stix_t2 = parse_into_datetime(t1), parse_into_datetime(t2) + return 1 - min(abs(stix_t1.timestamp() - stix_t2.timestamp()) / (86400 * tdelta), 1) + + +def _partial_list_based(l1, l2): + l1_set, l2_set = set(l1), set(l2) + return len(l1_set.intersection(l2_set)) / max(len(l1_set), len(l2_set)) + + +def _exact_match(val1, val2): + if val1 == val2: + return 1.0 + return 0.0 + + +def _partial_string_based(str1, str2): + from pyjarowinkler import distance + return distance.get_jaro_distance(str1, str2) + + +def _partial_external_reference_based(refs1, refs2): + pass # TODO: needs to be done + + +def _partial_location_distance(loc1, loc2): + pass # TODO: needs to be done From 6fa77adfe3a38c391b6a873c35600ff6d8d2a46a Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Tue, 10 Sep 2019 15:04:07 -0400 Subject: [PATCH 02/23] wrote all default weights, actually computing the equivalence score logging for unsupported objects, finished implementing some methods. Missing to implement patterning. --- stix2/environment.py | 238 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 205 insertions(+), 33 deletions(-) diff --git a/stix2/environment.py b/stix2/environment.py index 8049c0d..41d9de1 100644 --- a/stix2/environment.py +++ b/stix2/environment.py @@ -1,10 +1,14 @@ """Python STIX2 Environment API.""" import copy +import logging +import math from .core import parse as _parse from .datastore import CompositeDataSource, DataStoreMixin +logger = logging.getLogger(__name__) + class ObjectFactory(object): """Easily create STIX objects with default values for certain properties. @@ -187,13 +191,16 @@ class Environment(DataStoreMixin): else: return None - def semantically_equivalent(self, obj1, obj2): + @staticmethod + def semantically_equivalent(obj1, obj2, **weight_dict): """This method is meant to verify if two objects of the same type are semantically equivalent. 
Args: obj1: A stix2 object instance obj2: A stix2 object instance + weight_dict: A dictionary that can be used to override settings + in the semantic equivalence process Returns: float: A number between 0.0 and 1.0 as a measurement of equivalence. @@ -206,7 +213,58 @@ class Environment(DataStoreMixin): see `the Committee Note `__. """ - equivalence_score = 0.0 + # default weights used for the semantic equivalence process + weigths = { + "attack-pattern": { + "name": 30, + "external_references": 70, + }, + "campaign": { + "name": 60, + "aliases": 40, + }, + "identity": { + "name": 60, + "identity_class": 20, + "sectors": 20, + }, + "indicator": { + "indicator_types": 15, + "pattern": 80, + "valid_from": 5, + }, + "location": { + "longitude_latitude": 34, + "region": 33, + "country": 33, + }, + "malware": { + "malware_types": 20, + "name": 80, + }, + "threat-actor": { + "name": 60, + "threat_actor_types": 20, + "aliases": 20, + }, + "tool": { + "tool_types": 20, + "name": 80, + }, + "vulnerability": { + "name": 30, + "external_references": 70, + }, + "_internal": { + "tdelta": 1, + }, + } + + if weight_dict: + weigths.update(weight_dict) + + matching_score = 0.0 + sum_weights = 0.0 type1, type2 = obj1["type"], obj2["type"] if type1 != type2: @@ -217,61 +275,132 @@ class Environment(DataStoreMixin): if type1 == "attack-pattern": if _check_property_present("name", obj1, obj2): - _partial_string_based(obj1["name"], obj2["name"]) + w = weigths["attack-pattern"]["name"] + sum_weights += w + matching_score += w * _partial_string_based(obj1["name"], obj2["name"]) if _check_property_present("external_references", obj1, obj2): - _partial_external_reference_based(obj1["external_references"], obj2["external_references"]) + w = weigths["attack-pattern"]["external_references"] + sum_weights += w + matching_score += ( + w * + _partial_external_reference_based(obj1["external_references"], obj2["external_references"]) + ) + elif type1 == "campaign": if 
_check_property_present("name", obj1, obj2): - _partial_string_based(obj1["name"], obj2["name"]) + w = weigths["campaign"]["name"] + sum_weights += w + matching_score += w * _partial_string_based(obj1["name"], obj2["name"]) if _check_property_present("aliases", obj1, obj2): - _partial_list_based(obj1["aliases"], obj2["aliases"]) + w = weigths["campaign"]["aliases"] + sum_weights += w + matching_score += w * _partial_list_based(obj1["aliases"], obj2["aliases"]) + elif type1 == "course-of-action": - pass + logger.warning("%s type is not supported for semantic equivalence", type1) + elif type1 == "identity": if _check_property_present("name", obj1, obj2): - _exact_match(obj1["name"], obj2["name"]) + w = weigths["identity"]["name"] + sum_weights += w + matching_score += w * _exact_match(obj1["name"], obj2["name"]) if _check_property_present("identity_class", obj1, obj2): - _exact_match(obj1["identity_class"], obj2["identity_class"]) + w = weigths["identity"]["identity_class"] + sum_weights += w + matching_score += w * _exact_match(obj1["identity_class"], obj2["identity_class"]) if _check_property_present("sectors", obj1, obj2): - _partial_list_based(obj1["sectors"], obj2["sectors"]) + w = weigths["identity"]["sectors"] + sum_weights += w + matching_score += w * _partial_list_based(obj1["sectors"], obj2["sectors"]) + elif type1 == "indicator": if _check_property_present("indicator_types", obj1, obj2): - _partial_list_based(obj1["indicator_types"], obj2["indicator_types"]) + w = weigths["indicator"]["indicator_types"] + sum_weights += w + matching_score += w * _partial_list_based(obj1["indicator_types"], obj2["indicator_types"]) if _check_property_present("pattern", obj1, obj2): - pass # TODO: needs to be done + w = weigths["indicator"]["pattern"] + sum_weights += w + matching_score += w * _custom_pattern_based(obj1["pattern"], obj2["pattern"]) if _check_property_present("valid_from", obj1, obj2): - _partial_timestamp_based(obj1["valid_from"], obj2["valid_from"]) + w = 
weigths["indicator"]["valid_from"] + sum_weights += w + matching_score += ( + w * + _partial_timestamp_based(obj1["valid_from"], obj2["valid_from"], weigths["_internal"]["tdelta"]) + ) + elif type1 == "instrusion-set": - pass + logger.warning("%s type is not supported for semantic equivalence", type1) + elif type1 == "location": - pass + if _check_property_present("latitude", obj1, obj2) and _check_property_present("longitude", obj1, obj2): + w = weigths["location"]["longitude_latitude"] + sum_weights += w + matching_score += ( + w * + _partial_location_distance(obj1["latitude"], obj1["longitude"], obj2["latitude"], obj2["longitude"]) + ) + if _check_property_present("region", obj1, obj2): + w = weigths["location"]["region"] + sum_weights += w + matching_score += w * _exact_match(obj1["region"], obj2["region"]) + if _check_property_present("country", obj1, obj2): + w = weigths["location"]["country"] + sum_weights += w + matching_score += w * _exact_match(obj1["country"], obj2["country"]) + elif type1 == "malware": if _check_property_present("malware_types", obj1, obj2): - _partial_list_based(obj1["malware_types"], obj2["malware_types"]) + w = weigths["malware"]["malware_types"] + sum_weights += w + matching_score += w * _partial_list_based(obj1["malware_types"], obj2["malware_types"]) if _check_property_present("name", obj1, obj2): - _partial_string_based(obj1["name"], obj2["name"]) + w = weigths["malware"]["name"] + sum_weights += w + matching_score += w * _partial_string_based(obj1["name"], obj2["name"]) + elif type1 == "observed-data": - pass + logger.warning("%s type is not supported for semantic equivalence", type1) + elif type1 == "report": - pass + logger.warning("%s type is not supported for semantic equivalence", type1) + elif type1 == "threat-actor": if _check_property_present("name", obj1, obj2): - _partial_string_based(obj1["name"], obj2["name"]) + w = weigths["threat-actor"]["name"] + sum_weights += w + matching_score += w * 
_partial_string_based(obj1["name"], obj2["name"]) if _check_property_present("threat_actor_types", obj1, obj2): - _partial_list_based(obj1["threat_actor_types"], obj2["threat_actor_types"]) + w = weigths["threat-actor"]["threat_actor_types"] + sum_weights += w + matching_score += w * _partial_list_based(obj1["threat_actor_types"], obj2["threat_actor_types"]) if _check_property_present("aliases", obj1, obj2): - _partial_list_based(obj1["aliases"], obj2["aliases"]) + w = weigths["threat-actor"]["aliases"] + sum_weights += w + matching_score += w * _partial_list_based(obj1["aliases"], obj2["aliases"]) + elif type1 == "tool": if _check_property_present("tool_types", obj1, obj2): - _partial_list_based(obj1["tool_types"], obj2["tool_types"]) + w = weigths["tool"]["tool_types"] + sum_weights += w + matching_score += w * _partial_list_based(obj1["tool_types"], obj2["tool_types"]) if _check_property_present("name", obj1, obj2): - _partial_string_based(obj1["name"], obj2["name"]) + w = weigths["tool"]["name"] + sum_weights += w + matching_score += w * _partial_string_based(obj1["name"], obj2["name"]) + elif type1 == "vulnerability": if _check_property_present("name", obj1, obj2): - _partial_string_based(obj1["name"], obj2["name"]) + w = weigths["vulnerability"]["name"] + sum_weights += w + matching_score += w * _partial_string_based(obj1["name"], obj2["name"]) if _check_property_present("external_references", obj1, obj2): - _partial_external_reference_based(obj1["external_references"], obj2["external_references"]) - # TODO: need to actually calculate the value + w = weigths["vulnerability"]["external_references"] + sum_weights += w + matching_score += w * _partial_external_reference_based(obj1["external_references"], obj2["external_references"]) + + equivalence_score = (matching_score / sum_weights) * 100.0 return equivalence_score @@ -281,16 +410,15 @@ def _check_property_present(prop, obj1, obj2): return False -def _partial_timestamp_based(t1, t2): +def 
_partial_timestamp_based(t1, t2, tdelta): from .utils import parse_into_datetime - tdelta = 1 # One day... stix_t1, stix_t2 = parse_into_datetime(t1), parse_into_datetime(t2) return 1 - min(abs(stix_t1.timestamp() - stix_t2.timestamp()) / (86400 * tdelta), 1) def _partial_list_based(l1, l2): l1_set, l2_set = set(l1), set(l2) - return len(l1_set.intersection(l2_set)) / max(len(l1_set), len(l2_set)) + return len(l1_set.intersection(l2_set)) / max(len(l1), len(l2)) def _exact_match(val1, val2): @@ -304,9 +432,53 @@ def _partial_string_based(str1, str2): return distance.get_jaro_distance(str1, str2) +def _custom_pattern_based(pattern1, pattern2): + return 0 # TODO: Needs to be implemented + + def _partial_external_reference_based(refs1, refs2): - pass # TODO: needs to be done + allowed = set(("veris", "cve", "capec", "mitre-attack")) + matches = 0 + + if len(refs1) >= len(refs2): + l1 = refs1 + l2 = refs2 + else: + l1 = refs2 + l2 = refs1 + + for ext_ref1 in l1: + for ext_ref2 in l2: + sn_match = False + ei_match = False + url_match = False + source_name = None + + if _check_property_present("source_name", ext_ref1, ext_ref2): + if ext_ref1["source_name"] == ext_ref2["source_name"]: + source_name = ext_ref1["source_name"] + sn_match = True + if _check_property_present("external_id", ext_ref1, ext_ref2): + if ext_ref1["external_id"] == ext_ref2["external_id"]: + ei_match = True + if _check_property_present("url", ext_ref1, ext_ref2): + if ext_ref1["url"] == ext_ref2["url"]: + url_match = True + + # Special case: if source_name is a STIX defined name and either + # external_id or url match then its a perfect match and other entries + # can be ignored. + if sn_match and (ei_match or url_match) and source_name in allowed: + return 1.0 + + # Regular check. If the source_name (not STIX-defined) or external_id or + # url matches then we consider the entry a match. 
+ if (sn_match or ei_match or url_match) and source_name not in allowed: + matches += 1 + + return matches / max(len(refs1), len(refs2)) -def _partial_location_distance(loc1, loc2): - pass # TODO: needs to be done +def _partial_location_distance(lat1, long1, lat2, long2): + distance = math.sqrt(((lat2 - lat1) ** 2) + ((long2 - long1) ** 2)) + return 1 - (distance / 1000.0) From e8eb7bcca20c8b6a350bf009c17f821d829ae96b Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Mon, 16 Sep 2019 14:35:14 -0400 Subject: [PATCH 03/23] fix logging messages, typos and add tests for the semantic equivalence method --- stix2/environment.py | 24 ++-- stix2/test/v21/test_environment.py | 170 ++++++++++++++++++++++++++++- 2 files changed, 183 insertions(+), 11 deletions(-) diff --git a/stix2/environment.py b/stix2/environment.py index 41d9de1..4c7d2b0 100644 --- a/stix2/environment.py +++ b/stix2/environment.py @@ -203,7 +203,7 @@ class Environment(DataStoreMixin): in the semantic equivalence process Returns: - float: A number between 0.0 and 1.0 as a measurement of equivalence. + float: A number between 0.0 and 100.0 as a measurement of equivalence. Warnings: Not all objects are supported. 
@@ -256,7 +256,7 @@ class Environment(DataStoreMixin): "external_references": 70, }, "_internal": { - "tdelta": 1, + "tdelta": 1, # One day interval }, } @@ -270,7 +270,7 @@ class Environment(DataStoreMixin): if type1 != type2: raise ValueError('The objects to compare must be of the same type!') - if obj1.get("spec_version", "") != obj2.get("spec_version", ""): + if obj1.get("spec_version", "2.0") != obj2.get("spec_version", "2.0"): raise ValueError('The objects to compare must be of the same spec version!') if type1 == "attack-pattern": @@ -297,7 +297,8 @@ class Environment(DataStoreMixin): matching_score += w * _partial_list_based(obj1["aliases"], obj2["aliases"]) elif type1 == "course-of-action": - logger.warning("%s type is not supported for semantic equivalence", type1) + logger.warning("%s type has no semantic equivalence implementation", type1) + return 0 elif type1 == "identity": if _check_property_present("name", obj1, obj2): @@ -330,8 +331,9 @@ class Environment(DataStoreMixin): _partial_timestamp_based(obj1["valid_from"], obj2["valid_from"], weigths["_internal"]["tdelta"]) ) - elif type1 == "instrusion-set": - logger.warning("%s type is not supported for semantic equivalence", type1) + elif type1 == "intrusion-set": + logger.warning("%s type has no semantic equivalence implementation", type1) + return 0 elif type1 == "location": if _check_property_present("latitude", obj1, obj2) and _check_property_present("longitude", obj1, obj2): @@ -361,10 +363,12 @@ class Environment(DataStoreMixin): matching_score += w * _partial_string_based(obj1["name"], obj2["name"]) elif type1 == "observed-data": - logger.warning("%s type is not supported for semantic equivalence", type1) + logger.warning("%s type has no semantic equivalence implementation", type1) + return 0 elif type1 == "report": - logger.warning("%s type is not supported for semantic equivalence", type1) + logger.warning("%s type has no semantic equivalence implementation", type1) + return 0 elif type1 == 
"threat-actor": if _check_property_present("name", obj1, obj2): @@ -400,6 +404,9 @@ class Environment(DataStoreMixin): sum_weights += w matching_score += w * _partial_external_reference_based(obj1["external_references"], obj2["external_references"]) + if sum_weights <= 0: + return 0 + equivalence_score = (matching_score / sum_weights) * 100.0 return equivalence_score @@ -433,6 +440,7 @@ def _partial_string_based(str1, str2): def _custom_pattern_based(pattern1, pattern2): + logger.warning("Checking for Indicator pattern equivalence is currently not implemented!") return 0 # TODO: Needs to be implemented diff --git a/stix2/test/v21/test_environment.py b/stix2/test/v21/test_environment.py index e08971e..9dee464 100644 --- a/stix2/test/v21/test_environment.py +++ b/stix2/test/v21/test_environment.py @@ -3,9 +3,13 @@ import pytest import stix2 from .constants import ( - CAMPAIGN_ID, CAMPAIGN_KWARGS, FAKE_TIME, IDENTITY_ID, IDENTITY_KWARGS, - INDICATOR_ID, INDICATOR_KWARGS, MALWARE_ID, MALWARE_KWARGS, - RELATIONSHIP_IDS, + ATTACK_PATTERN_ID, ATTACK_PATTERN_KWARGS, CAMPAIGN_ID, CAMPAIGN_KWARGS, + COURSE_OF_ACTION_ID, COURSE_OF_ACTION_KWARGS, FAKE_TIME, IDENTITY_ID, + IDENTITY_KWARGS, INDICATOR_ID, INDICATOR_KWARGS, INTRUSION_SET_ID, + INTRUSION_SET_KWARGS, LOCATION_ID, MALWARE_ID, MALWARE_KWARGS, + OBSERVED_DATA_ID, OBSERVED_DATA_KWARGS, RELATIONSHIP_IDS, REPORT_ID, + REPORT_KWARGS, THREAT_ACTOR_ID, THREAT_ACTOR_KWARGS, TOOL_ID, TOOL_KWARGS, + VULNERABILITY_ID, VULNERABILITY_KWARGS, ) @@ -372,3 +376,163 @@ def test_related_to_by_target(ds): assert len(resp) == 2 assert any(x['id'] == CAMPAIGN_ID for x in resp) assert any(x['id'] == INDICATOR_ID for x in resp) + + +def test_semantic_equivalence_on_same_attack_pattern(): + ap1 = stix2.v21.AttackPattern(id=ATTACK_PATTERN_ID, **ATTACK_PATTERN_KWARGS) + ap2 = stix2.v21.AttackPattern(id=ATTACK_PATTERN_ID, **ATTACK_PATTERN_KWARGS) + env = stix2.Environment().semantically_equivalent(ap1, ap2) + assert round(env) == 100 + + +def 
test_semantic_equivalence_on_same_campaign(): + camp1 = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMPAIGN_KWARGS) + camp2 = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMPAIGN_KWARGS) + env = stix2.Environment().semantically_equivalent(camp1, camp2) + assert round(env) == 100 + + +def test_semantic_equivalence_on_same_identity(): + iden1 = stix2.v21.Identity(id=IDENTITY_ID, **IDENTITY_KWARGS) + iden2 = stix2.v21.Identity(id=IDENTITY_ID, **IDENTITY_KWARGS) + env = stix2.Environment().semantically_equivalent(iden1, iden2) + assert round(env) == 100 + + +def test_semantic_equivalence_on_same_indicator(): + ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) + ind2 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) + env = stix2.Environment().semantically_equivalent(ind1, ind2) + assert round(env) == 20 # No support for pattern, hence the 20 + + +def test_semantic_equivalence_on_same_location(): + LOCATION_KWARGS = dict(latitude=45, longitude=179) + loc1 = stix2.v21.Location(id=LOCATION_ID, **LOCATION_KWARGS) + loc2 = stix2.v21.Location(id=LOCATION_ID, **LOCATION_KWARGS) + env = stix2.Environment().semantically_equivalent(loc1, loc2) + assert round(env) == 100 + + +def test_semantic_equivalence_on_same_malware(): + malw1 = stix2.v21.Malware(id=MALWARE_ID, **MALWARE_KWARGS) + malw2 = stix2.v21.Malware(id=MALWARE_ID, **MALWARE_KWARGS) + env = stix2.Environment().semantically_equivalent(malw1, malw2) + assert round(env) == 100 + + +def test_semantic_equivalence_on_same_threat_actor(): + ta1 = stix2.v21.ThreatActor(id=THREAT_ACTOR_ID, **THREAT_ACTOR_KWARGS) + ta2 = stix2.v21.ThreatActor(id=THREAT_ACTOR_ID, **THREAT_ACTOR_KWARGS) + env = stix2.Environment().semantically_equivalent(ta1, ta2) + assert round(env) == 100 + + +def test_semantic_equivalence_on_same_tool(): + tool1 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS) + tool2 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS) + env = stix2.Environment().semantically_equivalent(tool1, tool2) + assert round(env) == 100 + + 
+def test_semantic_equivalence_on_same_vulnerability(): + vul1 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULNERABILITY_KWARGS) + vul2 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULNERABILITY_KWARGS) + env = stix2.Environment().semantically_equivalent(vul1, vul2) + assert round(env) == 100 + + +def test_semantic_equivalence_different_type_raises(): + with pytest.raises(ValueError) as excinfo: + vul1 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULNERABILITY_KWARGS) + ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) + stix2.Environment().semantically_equivalent(vul1, ind1) + + assert str(excinfo.value) == "The objects to compare must be of the same type!" + + +def test_semantic_equivalence_different_spec_version_raises(): + with pytest.raises(ValueError) as excinfo: + V20_KWARGS = dict( + labels=['malicious-activity'], + pattern="[file:hashes.MD5 = 'd41d8cd98f00b204e9800998ecf8427e']", + ) + ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) + ind2 = stix2.v20.Indicator(id=INDICATOR_ID, **V20_KWARGS) + stix2.Environment().semantically_equivalent(ind1, ind2) + + assert str(excinfo.value) == "The objects to compare must be of the same spec version!" 
+ + +def test_semantic_equivalence_on_unsupported_types(): + coa1 = stix2.v21.CourseOfAction(id=COURSE_OF_ACTION_ID, **COURSE_OF_ACTION_KWARGS) + ints1 = stix2.v21.IntrusionSet(id=INTRUSION_SET_ID, **INTRUSION_SET_KWARGS) + obs1 = stix2.v21.ObservedData(id=OBSERVED_DATA_ID, **OBSERVED_DATA_KWARGS) + rep1 = stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS) + + coa2 = stix2.v21.CourseOfAction(id=COURSE_OF_ACTION_ID, **COURSE_OF_ACTION_KWARGS) + ints2 = stix2.v21.IntrusionSet(id=INTRUSION_SET_ID, **INTRUSION_SET_KWARGS) + obs2 = stix2.v21.ObservedData(id=OBSERVED_DATA_ID, **OBSERVED_DATA_KWARGS) + rep2 = stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS) + + obj_list = [(coa1, coa2), (ints1, ints2), (obs1, obs2), (rep1, rep2)] + + for obj1, obj2 in obj_list: + env = stix2.Environment().semantically_equivalent(obj1, obj2) + assert round(env) == 0 + + +def test_semantic_equivalence_zero_match(): + IND_KWARGS = dict( + indicator_types=["APTX"], + pattern="[ipv4-addr:value = '192.168.1.1']", + ) + weigths = { + "attack-pattern": { + "name": 30, + "external_references": 70, + }, + "campaign": { + "name": 60, + "aliases": 40, + }, + "identity": { + "name": 60, + "identity_class": 20, + "sectors": 20, + }, + "indicator": { + "indicator_types": 15, + "pattern": 85, + "valid_from": 0, + }, + "location": { + "longitude_latitude": 34, + "region": 33, + "country": 33, + }, + "malware": { + "malware_types": 20, + "name": 80, + }, + "threat-actor": { + "name": 60, + "threat_actor_types": 20, + "aliases": 20, + }, + "tool": { + "tool_types": 20, + "name": 80, + }, + "vulnerability": { + "name": 30, + "external_references": 70, + }, + "_internal": { + "tdelta": 1, + }, + } + ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) + ind2 = stix2.v21.Indicator(id=INDICATOR_ID, **IND_KWARGS) + env = stix2.Environment().semantically_equivalent(ind1, ind2, **weigths) + assert round(env) == 0 From ea0df7080600b51f79bd9420ee40461ed87ae853 Mon Sep 17 00:00:00 2001 From: Emmanuelle 
Vargas-Gonzalez Date: Mon, 16 Sep 2019 14:45:01 -0400 Subject: [PATCH 04/23] update test environment requirements --- .isort.cfg | 1 + .travis.yml | 7 ++----- tox.ini | 1 + 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.isort.cfg b/.isort.cfg index d644f60..0f2fca3 100644 --- a/.isort.cfg +++ b/.isort.cfg @@ -5,6 +5,7 @@ known_third_party = antlr4, dateutil, medallion, + pyjarowinkler, pytest, pytz, requests, diff --git a/.travis.yml b/.travis.yml index 261f125..c05ec72 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,16 +1,13 @@ sudo: false language: python cache: pip +dist: xenial python: - "2.7" - "3.4" - "3.5" - "3.6" -matrix: - include: - - python: 3.7 # https://github.com/travis-ci/travis-ci/issues/9069#issuecomment-425720905 - dist: xenial - sudo: true + - "3.7" install: - pip install -U pip setuptools - pip install tox-travis pre-commit diff --git a/tox.ini b/tox.ini index f3a10fb..2bdae15 100644 --- a/tox.ini +++ b/tox.ini @@ -9,6 +9,7 @@ deps = pytest-cov coverage taxii2-client + pyjarowinkler medallion commands = pytest --ignore=stix2/test/v20/test_workbench.py --ignore=stix2/test/v21/test_workbench.py --cov=stix2 stix2/test/ --cov-report term-missing From 98ecdf53e389938dce90c97b93866b6871b70381 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Tue, 17 Sep 2019 11:08:01 -0400 Subject: [PATCH 05/23] update timestamp comparison method --- stix2/environment.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/stix2/environment.py b/stix2/environment.py index 4c7d2b0..3616f72 100644 --- a/stix2/environment.py +++ b/stix2/environment.py @@ -3,9 +3,11 @@ import copy import logging import math +import time from .core import parse as _parse from .datastore import CompositeDataSource, DataStoreMixin +from .utils import STIXdatetime, parse_into_datetime logger = logging.getLogger(__name__) @@ -418,9 +420,12 @@ def _check_property_present(prop, obj1, obj2): def _partial_timestamp_based(t1, t2, tdelta): - 
from .utils import parse_into_datetime - stix_t1, stix_t2 = parse_into_datetime(t1), parse_into_datetime(t2) - return 1 - min(abs(stix_t1.timestamp() - stix_t2.timestamp()) / (86400 * tdelta), 1) + if not isinstance(t1, STIXdatetime): + t1 = parse_into_datetime(t1) + if not isinstance(t2, STIXdatetime): + t2 = parse_into_datetime(t2) + t1, t2 = time.mktime(t1.timetuple()), time.mktime(t2.timetuple()) + return 1 - min(abs(t1 - t2) / (86400 * tdelta), 1) def _partial_list_based(l1, l2): From 09858ba2633bf4eb39ac040099ed2519957e93eb Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Tue, 17 Sep 2019 15:28:37 -0400 Subject: [PATCH 06/23] create more tests to improve coverage --- stix2/test/v21/test_environment.py | 171 ++++++++++++++++++++++++++++- 1 file changed, 166 insertions(+), 5 deletions(-) diff --git a/stix2/test/v21/test_environment.py b/stix2/test/v21/test_environment.py index 9dee464..3738b75 100644 --- a/stix2/test/v21/test_environment.py +++ b/stix2/test/v21/test_environment.py @@ -1,6 +1,7 @@ import pytest import stix2 +import stix2.environment from .constants import ( ATTACK_PATTERN_ID, ATTACK_PATTERN_KWARGS, CAMPAIGN_ID, CAMPAIGN_KWARGS, @@ -385,20 +386,45 @@ def test_semantic_equivalence_on_same_attack_pattern(): assert round(env) == 100 -def test_semantic_equivalence_on_same_campaign(): +def test_semantic_equivalence_on_same_campaign1(): camp1 = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMPAIGN_KWARGS) camp2 = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMPAIGN_KWARGS) env = stix2.Environment().semantically_equivalent(camp1, camp2) assert round(env) == 100 -def test_semantic_equivalence_on_same_identity(): +def test_semantic_equivalence_on_same_campaign2(): + CAMP_KWARGS = dict( + name="Green Group Attacks Against Finance", + description="Campaign by Green Group against a series of targets in the financial services sector.", + aliases=["super-green", "some-green"], + ) + + camp1 = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMP_KWARGS) + camp2 = 
stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMP_KWARGS) + env = stix2.Environment().semantically_equivalent(camp1, camp2) + assert round(env) == 100 + + +def test_semantic_equivalence_on_same_identity1(): iden1 = stix2.v21.Identity(id=IDENTITY_ID, **IDENTITY_KWARGS) iden2 = stix2.v21.Identity(id=IDENTITY_ID, **IDENTITY_KWARGS) env = stix2.Environment().semantically_equivalent(iden1, iden2) assert round(env) == 100 +def test_semantic_equivalence_on_same_identity2(): + IDEN_KWARGS = dict( + name="John Smith", + identity_class="individual", + sectors=["government", "critical-infrastructure"], + ) + iden1 = stix2.v21.Identity(id=IDENTITY_ID, **IDEN_KWARGS) + iden2 = stix2.v21.Identity(id=IDENTITY_ID, **IDEN_KWARGS) + env = stix2.Environment().semantically_equivalent(iden1, iden2) + assert round(env) == 100 + + def test_semantic_equivalence_on_same_indicator(): ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) ind2 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) @@ -406,7 +432,7 @@ def test_semantic_equivalence_on_same_indicator(): assert round(env) == 20 # No support for pattern, hence the 20 -def test_semantic_equivalence_on_same_location(): +def test_semantic_equivalence_on_same_location1(): LOCATION_KWARGS = dict(latitude=45, longitude=179) loc1 = stix2.v21.Location(id=LOCATION_ID, **LOCATION_KWARGS) loc2 = stix2.v21.Location(id=LOCATION_ID, **LOCATION_KWARGS) @@ -414,6 +440,19 @@ def test_semantic_equivalence_on_same_location(): assert round(env) == 100 +def test_semantic_equivalence_on_same_location2(): + LOCATION_KWARGS = dict( + latitude=38.889, + longitude=-77.023, + region="northern-america", + country="us", + ) + loc1 = stix2.v21.Location(id=LOCATION_ID, **LOCATION_KWARGS) + loc2 = stix2.v21.Location(id=LOCATION_ID, **LOCATION_KWARGS) + env = stix2.Environment().semantically_equivalent(loc1, loc2) + assert round(env) == 100 + + def test_semantic_equivalence_on_same_malware(): malw1 = stix2.v21.Malware(id=MALWARE_ID, **MALWARE_KWARGS) malw2 
= stix2.v21.Malware(id=MALWARE_ID, **MALWARE_KWARGS) @@ -421,13 +460,25 @@ def test_semantic_equivalence_on_same_malware(): assert round(env) == 100 -def test_semantic_equivalence_on_same_threat_actor(): +def test_semantic_equivalence_on_same_threat_actor1(): ta1 = stix2.v21.ThreatActor(id=THREAT_ACTOR_ID, **THREAT_ACTOR_KWARGS) ta2 = stix2.v21.ThreatActor(id=THREAT_ACTOR_ID, **THREAT_ACTOR_KWARGS) env = stix2.Environment().semantically_equivalent(ta1, ta2) assert round(env) == 100 +def test_semantic_equivalence_on_same_threat_actor2(): + THREAT_KWARGS = dict( + threat_actor_types=["crime-syndicate"], + aliases=["super-evil"], + name="Evil Org", + ) + ta1 = stix2.v21.ThreatActor(id=THREAT_ACTOR_ID, **THREAT_KWARGS) + ta2 = stix2.v21.ThreatActor(id=THREAT_ACTOR_ID, **THREAT_KWARGS) + env = stix2.Environment().semantically_equivalent(ta1, ta2) + assert round(env) == 100 + + def test_semantic_equivalence_on_same_tool(): tool1 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS) tool2 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS) @@ -435,13 +486,38 @@ def test_semantic_equivalence_on_same_tool(): assert round(env) == 100 -def test_semantic_equivalence_on_same_vulnerability(): +def test_semantic_equivalence_on_same_vulnerability1(): vul1 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULNERABILITY_KWARGS) vul2 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULNERABILITY_KWARGS) env = stix2.Environment().semantically_equivalent(vul1, vul2) assert round(env) == 100 +def test_semantic_equivalence_on_same_vulnerability2(): + VULN_KWARGS1 = dict( + name="Heartbleed", + external_references=[ + { + "url": "https://example", + "source_name": "some-source", + }, + ], + ) + VULN_KWARGS2 = dict( + name="Zot", + external_references=[ + { + "url": "https://example2", + "source_name": "some-source2", + }, + ], + ) + vul1 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULN_KWARGS1) + vul2 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULN_KWARGS2) + env = 
stix2.Environment().semantically_equivalent(vul1, vul2) + assert round(env) == 0.0 + + def test_semantic_equivalence_different_type_raises(): with pytest.raises(ValueError) as excinfo: vul1 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULNERABILITY_KWARGS) @@ -536,3 +612,88 @@ def test_semantic_equivalence_zero_match(): ind2 = stix2.v21.Indicator(id=INDICATOR_ID, **IND_KWARGS) env = stix2.Environment().semantically_equivalent(ind1, ind2, **weigths) assert round(env) == 0 + + +@pytest.mark.parametrize( + "refs1,refs2,ret_val", [ + ( + [ + { + "url": "https://attack.mitre.org/techniques/T1150", + "source_name": "mitre-attack", + "external_id": "T1150", + }, + { + "url": "https://researchcenter.paloaltonetworks.com/2016/09/unit42-sofacys-komplex-os-x-trojan/", + "source_name": "Sofacy Komplex Trojan", + "description": "Dani Creus, Tyler Halfpop, Robert Falcone. (2016, September 26). Sofacy's 'Komplex' OS X Trojan. Retrieved July 8, 2017.", + }, + ], + [ + { + "url": "https://attack.mitre.org/techniques/T1129", + "source_name": "mitre-attack", + "external_id": "T1129", + }, + { + "url": "https://en.wikipedia.org/wiki/Microsoft_Windows_library_files", + "source_name": "Wikipedia Windows Library Files", + "description": "Wikipedia. (2017, January 31). Microsoft Windows library files. Retrieved February 13, 2017.", + }, + ], + 0.0, + ), + ( + [ + { + "url": "https://attack.mitre.org/techniques/T1129", + "source_name": "mitre-attack", + "external_id": "T1129", + }, + ], + [ + { + "url": "https://attack.mitre.org/techniques/T1129", + "source_name": "mitre-attack", + "external_id": "T1129", + }, + { + "url": "https://en.wikipedia.org/wiki/Microsoft_Windows_library_files", + "source_name": "Wikipedia Windows Library Files", + "description": "Wikipedia. (2017, January 31). Microsoft Windows library files. 
Retrieved February 13, 2017.", + }, + ], + 1.0, + ), + ( + [ + { + "url": "https://example", + "source_name": "some-source", + }, + ], + [ + { + "url": "https://example", + "source_name": "some-source", + }, + ], + 1.0, + ), + ], +) +def test_semantic_equivalence_external_references(refs1, refs2, ret_val): + value = stix2.environment._partial_external_reference_based(refs1, refs2) + assert value == ret_val + + +def test_semantic_equivalence_timetamp(): + t1 = "2018-10-17T00:14:20.652Z" + t2 = "2018-10-17T12:14:20.652Z" + assert stix2.environment._partial_timestamp_based(t1, t2, 1) == 0.5 + + +def test_semantic_equivalence_exact_match(): + t1 = "2018-10-17T00:14:20.652Z" + t2 = "2018-10-17T12:14:20.652Z" + assert stix2.environment._exact_match(t1, t2) == 0.0 From 351362ae3301f89aa350ab78358a943ec7c5e07c Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Tue, 17 Sep 2019 15:55:12 -0400 Subject: [PATCH 07/23] more tests for coverage --- stix2/test/v21/test_environment.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/stix2/test/v21/test_environment.py b/stix2/test/v21/test_environment.py index 3738b75..0ac27ce 100644 --- a/stix2/test/v21/test_environment.py +++ b/stix2/test/v21/test_environment.py @@ -379,13 +379,29 @@ def test_related_to_by_target(ds): assert any(x['id'] == INDICATOR_ID for x in resp) -def test_semantic_equivalence_on_same_attack_pattern(): +def test_semantic_equivalence_on_same_attack_pattern1(): ap1 = stix2.v21.AttackPattern(id=ATTACK_PATTERN_ID, **ATTACK_PATTERN_KWARGS) ap2 = stix2.v21.AttackPattern(id=ATTACK_PATTERN_ID, **ATTACK_PATTERN_KWARGS) env = stix2.Environment().semantically_equivalent(ap1, ap2) assert round(env) == 100 +def test_semantic_equivalence_on_same_attack_pattern2(): + ATTACK_KWARGS = dict( + name="Phishing", + external_references=[ + { + "url": "https://example2", + "source_name": "some-source2", + }, + ], + ) + ap1 = stix2.v21.AttackPattern(id=ATTACK_PATTERN_ID, 
**ATTACK_KWARGS) + ap2 = stix2.v21.AttackPattern(id=ATTACK_PATTERN_ID, **ATTACK_KWARGS) + env = stix2.Environment().semantically_equivalent(ap1, ap2) + assert round(env) == 100 + + def test_semantic_equivalence_on_same_campaign1(): camp1 = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMPAIGN_KWARGS) camp2 = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMPAIGN_KWARGS) @@ -399,7 +415,6 @@ def test_semantic_equivalence_on_same_campaign2(): description="Campaign by Green Group against a series of targets in the financial services sector.", aliases=["super-green", "some-green"], ) - camp1 = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMP_KWARGS) camp2 = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMP_KWARGS) env = stix2.Environment().semantically_equivalent(camp1, camp2) From e138753576add9694f7d3fc2d1babf936c4d4e82 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Tue, 17 Sep 2019 16:10:54 -0400 Subject: [PATCH 08/23] add another test --- stix2/test/v21/test_environment.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/stix2/test/v21/test_environment.py b/stix2/test/v21/test_environment.py index 0ac27ce..7427109 100644 --- a/stix2/test/v21/test_environment.py +++ b/stix2/test/v21/test_environment.py @@ -533,6 +533,35 @@ def test_semantic_equivalence_on_same_vulnerability2(): assert round(env) == 0.0 +def test_semantic_equivalence_on_unknown_object(): + CUSTOM_KWARGS1 = dict( + type="x-foobar", + id="x-foobar--0c7b5b88-8ff7-4a4d-aa9d-feb398cd0061", + name="Heartbleed", + external_references=[ + { + "url": "https://example", + "source_name": "some-source", + }, + ], + ) + CUSTOM_KWARGS2 = dict( + type="x-foobar", + id="x-foobar--0c7b5b88-8ff7-4a4d-aa9d-feb398cd0061", + name="Zot", + external_references=[ + { + "url": "https://example2", + "source_name": "some-source2", + }, + ], + ) + cust1 = stix2.parse(CUSTOM_KWARGS1, allow_custom=True) + cust2 = stix2.parse(CUSTOM_KWARGS2, allow_custom=True) + env = 
stix2.Environment().semantically_equivalent(cust1, cust2) + assert round(env) == 0 + + def test_semantic_equivalence_different_type_raises(): with pytest.raises(ValueError) as excinfo: vul1 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULNERABILITY_KWARGS) From 4eaaee89dc62ef71d65ba0565b3a732e69c0b5cf Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Mon, 23 Sep 2019 09:44:09 -0400 Subject: [PATCH 09/23] make changes according to feedback. allow for custom objects to be supplied to method --- stix2/environment.py | 369 +++++++++++++++++------------ stix2/exceptions.py | 7 + stix2/test/v21/test_environment.py | 128 +++++----- 3 files changed, 292 insertions(+), 212 deletions(-) diff --git a/stix2/environment.py b/stix2/environment.py index 3616f72..c013ae2 100644 --- a/stix2/environment.py +++ b/stix2/environment.py @@ -7,6 +7,7 @@ import time from .core import parse as _parse from .datastore import CompositeDataSource, DataStoreMixin +from .exceptions import SemanticEquivalenceUnsupportedTypeError from .utils import STIXdatetime, parse_into_datetime logger = logging.getLogger(__name__) @@ -207,204 +208,98 @@ class Environment(DataStoreMixin): Returns: float: A number between 0.0 and 100.0 as a measurement of equivalence. - Warnings: - Not all objects are supported. + Warning: + Course of Action, Intrusion-Set, Observed-Data, Report are not supported + by this implementation. Indicator pattern check is also limited. - Notes: + Note: This implementation follows the Committee Note on semantic equivalence. see `the Committee Note `__. 
""" # default weights used for the semantic equivalence process - weigths = { + weights = { "attack-pattern": { "name": 30, "external_references": 70, + "method": _attack_pattern_checks, }, "campaign": { "name": 60, "aliases": 40, + "method": _campaign_checks, + }, + "course-of-action": { + "method": _course_of_action_checks, }, "identity": { "name": 60, "identity_class": 20, "sectors": 20, + "method": _identity_checks, }, "indicator": { "indicator_types": 15, "pattern": 80, "valid_from": 5, + "tdelta": 1, # One day interval + "method": _indicator_checks, + }, + "intrusion-set": { + "method": _intrusion_set_checks, }, "location": { "longitude_latitude": 34, "region": 33, "country": 33, + "method": _location_checks, }, "malware": { "malware_types": 20, "name": 80, + "method": _malware_checks, + }, + "observed-data": { + "method": _observed_data_checks, + }, + "report": { + "method": _report_checks, }, "threat-actor": { "name": 60, "threat_actor_types": 20, "aliases": 20, + "method": _threat_actor_checks, }, "tool": { "tool_types": 20, "name": 80, + "method": _tool_checks, }, "vulnerability": { "name": 30, "external_references": 70, + "method": _vulnerability_checks, }, "_internal": { - "tdelta": 1, # One day interval + "ignore_spec_version": False, }, } if weight_dict: - weigths.update(weight_dict) + weights.update(weight_dict) - matching_score = 0.0 - sum_weights = 0.0 type1, type2 = obj1["type"], obj2["type"] + ignore_spec_version = weights["_internal"]["ignore_spec_version"] if type1 != type2: raise ValueError('The objects to compare must be of the same type!') - if obj1.get("spec_version", "2.0") != obj2.get("spec_version", "2.0"): + if ignore_spec_version is False and obj1.get("spec_version", "2.0") != obj2.get("spec_version", "2.0"): raise ValueError('The objects to compare must be of the same spec version!') - if type1 == "attack-pattern": - if _check_property_present("name", obj1, obj2): - w = weigths["attack-pattern"]["name"] - sum_weights += w - 
matching_score += w * _partial_string_based(obj1["name"], obj2["name"]) - if _check_property_present("external_references", obj1, obj2): - w = weigths["attack-pattern"]["external_references"] - sum_weights += w - matching_score += ( - w * - _partial_external_reference_based(obj1["external_references"], obj2["external_references"]) - ) - - elif type1 == "campaign": - if _check_property_present("name", obj1, obj2): - w = weigths["campaign"]["name"] - sum_weights += w - matching_score += w * _partial_string_based(obj1["name"], obj2["name"]) - if _check_property_present("aliases", obj1, obj2): - w = weigths["campaign"]["aliases"] - sum_weights += w - matching_score += w * _partial_list_based(obj1["aliases"], obj2["aliases"]) - - elif type1 == "course-of-action": - logger.warning("%s type has no semantic equivalence implementation", type1) - return 0 - - elif type1 == "identity": - if _check_property_present("name", obj1, obj2): - w = weigths["identity"]["name"] - sum_weights += w - matching_score += w * _exact_match(obj1["name"], obj2["name"]) - if _check_property_present("identity_class", obj1, obj2): - w = weigths["identity"]["identity_class"] - sum_weights += w - matching_score += w * _exact_match(obj1["identity_class"], obj2["identity_class"]) - if _check_property_present("sectors", obj1, obj2): - w = weigths["identity"]["sectors"] - sum_weights += w - matching_score += w * _partial_list_based(obj1["sectors"], obj2["sectors"]) - - elif type1 == "indicator": - if _check_property_present("indicator_types", obj1, obj2): - w = weigths["indicator"]["indicator_types"] - sum_weights += w - matching_score += w * _partial_list_based(obj1["indicator_types"], obj2["indicator_types"]) - if _check_property_present("pattern", obj1, obj2): - w = weigths["indicator"]["pattern"] - sum_weights += w - matching_score += w * _custom_pattern_based(obj1["pattern"], obj2["pattern"]) - if _check_property_present("valid_from", obj1, obj2): - w = weigths["indicator"]["valid_from"] - 
sum_weights += w - matching_score += ( - w * - _partial_timestamp_based(obj1["valid_from"], obj2["valid_from"], weigths["_internal"]["tdelta"]) - ) - - elif type1 == "intrusion-set": - logger.warning("%s type has no semantic equivalence implementation", type1) - return 0 - - elif type1 == "location": - if _check_property_present("latitude", obj1, obj2) and _check_property_present("longitude", obj1, obj2): - w = weigths["location"]["longitude_latitude"] - sum_weights += w - matching_score += ( - w * - _partial_location_distance(obj1["latitude"], obj1["longitude"], obj2["latitude"], obj2["longitude"]) - ) - if _check_property_present("region", obj1, obj2): - w = weigths["location"]["region"] - sum_weights += w - matching_score += w * _exact_match(obj1["region"], obj2["region"]) - if _check_property_present("country", obj1, obj2): - w = weigths["location"]["country"] - sum_weights += w - matching_score += w * _exact_match(obj1["country"], obj2["country"]) - - elif type1 == "malware": - if _check_property_present("malware_types", obj1, obj2): - w = weigths["malware"]["malware_types"] - sum_weights += w - matching_score += w * _partial_list_based(obj1["malware_types"], obj2["malware_types"]) - if _check_property_present("name", obj1, obj2): - w = weigths["malware"]["name"] - sum_weights += w - matching_score += w * _partial_string_based(obj1["name"], obj2["name"]) - - elif type1 == "observed-data": - logger.warning("%s type has no semantic equivalence implementation", type1) - return 0 - - elif type1 == "report": - logger.warning("%s type has no semantic equivalence implementation", type1) - return 0 - - elif type1 == "threat-actor": - if _check_property_present("name", obj1, obj2): - w = weigths["threat-actor"]["name"] - sum_weights += w - matching_score += w * _partial_string_based(obj1["name"], obj2["name"]) - if _check_property_present("threat_actor_types", obj1, obj2): - w = weigths["threat-actor"]["threat_actor_types"] - sum_weights += w - matching_score += w * 
_partial_list_based(obj1["threat_actor_types"], obj2["threat_actor_types"]) - if _check_property_present("aliases", obj1, obj2): - w = weigths["threat-actor"]["aliases"] - sum_weights += w - matching_score += w * _partial_list_based(obj1["aliases"], obj2["aliases"]) - - elif type1 == "tool": - if _check_property_present("tool_types", obj1, obj2): - w = weigths["tool"]["tool_types"] - sum_weights += w - matching_score += w * _partial_list_based(obj1["tool_types"], obj2["tool_types"]) - if _check_property_present("name", obj1, obj2): - w = weigths["tool"]["name"] - sum_weights += w - matching_score += w * _partial_string_based(obj1["name"], obj2["name"]) - - elif type1 == "vulnerability": - if _check_property_present("name", obj1, obj2): - w = weigths["vulnerability"]["name"] - sum_weights += w - matching_score += w * _partial_string_based(obj1["name"], obj2["name"]) - if _check_property_present("external_references", obj1, obj2): - w = weigths["vulnerability"]["external_references"] - sum_weights += w - matching_score += w * _partial_external_reference_based(obj1["external_references"], obj2["external_references"]) + method = weights[type1]["method"] + matching_score, sum_weights = method(obj1, obj2, **weights[type1]) if sum_weights <= 0: return 0 @@ -413,13 +308,13 @@ class Environment(DataStoreMixin): return equivalence_score -def _check_property_present(prop, obj1, obj2): +def check_property_present(prop, obj1, obj2): if prop in obj1 and prop in obj2: return True return False -def _partial_timestamp_based(t1, t2, tdelta): +def partial_timestamp_based(t1, t2, tdelta): if not isinstance(t1, STIXdatetime): t1 = parse_into_datetime(t1) if not isinstance(t2, STIXdatetime): @@ -428,28 +323,28 @@ def _partial_timestamp_based(t1, t2, tdelta): return 1 - min(abs(t1 - t2) / (86400 * tdelta), 1) -def _partial_list_based(l1, l2): +def partial_list_based(l1, l2): l1_set, l2_set = set(l1), set(l2) return len(l1_set.intersection(l2_set)) / max(len(l1), len(l2)) -def 
_exact_match(val1, val2): +def exact_match(val1, val2): if val1 == val2: return 1.0 return 0.0 -def _partial_string_based(str1, str2): +def partial_string_based(str1, str2): from pyjarowinkler import distance return distance.get_jaro_distance(str1, str2) -def _custom_pattern_based(pattern1, pattern2): - logger.warning("Checking for Indicator pattern equivalence is currently not implemented!") - return 0 # TODO: Needs to be implemented +def custom_pattern_based(pattern1, pattern2): + logger.warning("Indicator pattern equivalence is not fully defined; will default to zero if not completely identical") + return exact_match(pattern1, pattern2) # TODO: Implement pattern based equivalence -def _partial_external_reference_based(refs1, refs2): +def partial_external_reference_based(refs1, refs2): allowed = set(("veris", "cve", "capec", "mitre-attack")) matches = 0 @@ -467,14 +362,14 @@ def _partial_external_reference_based(refs1, refs2): url_match = False source_name = None - if _check_property_present("source_name", ext_ref1, ext_ref2): + if check_property_present("source_name", ext_ref1, ext_ref2): if ext_ref1["source_name"] == ext_ref2["source_name"]: source_name = ext_ref1["source_name"] sn_match = True - if _check_property_present("external_id", ext_ref1, ext_ref2): + if check_property_present("external_id", ext_ref1, ext_ref2): if ext_ref1["external_id"] == ext_ref2["external_id"]: ei_match = True - if _check_property_present("url", ext_ref1, ext_ref2): + if check_property_present("url", ext_ref1, ext_ref2): if ext_ref1["url"] == ext_ref2["url"]: url_match = True @@ -492,6 +387,176 @@ def _partial_external_reference_based(refs1, refs2): return matches / max(len(refs1), len(refs2)) -def _partial_location_distance(lat1, long1, lat2, long2): +def partial_location_distance(lat1, long1, lat2, long2): distance = math.sqrt(((lat2 - lat1) ** 2) + ((long2 - long1) ** 2)) return 1 - (distance / 1000.0) + + +def _attack_pattern_checks(obj1, obj2, **weights): + matching_score = 
0.0 + sum_weights = 0.0 + if check_property_present("name", obj1, obj2): + w = weights["name"] + sum_weights += w + matching_score += w * partial_string_based(obj1["name"], obj2["name"]) + if check_property_present("external_references", obj1, obj2): + w = weights["external_references"] + sum_weights += w + matching_score += ( + w * + partial_external_reference_based(obj1["external_references"], obj2["external_references"]) + ) + return matching_score, sum_weights + + +def _campaign_checks(obj1, obj2, **weights): + matching_score = 0.0 + sum_weights = 0.0 + if check_property_present("name", obj1, obj2): + w = weights["name"] + sum_weights += w + matching_score += w * partial_string_based(obj1["name"], obj2["name"]) + if check_property_present("aliases", obj1, obj2): + w = weights["aliases"] + sum_weights += w + matching_score += w * partial_list_based(obj1["aliases"], obj2["aliases"]) + return matching_score, sum_weights + + +def _identity_checks(obj1, obj2, **weights): + matching_score = 0.0 + sum_weights = 0.0 + if check_property_present("name", obj1, obj2): + w = weights["name"] + sum_weights += w + matching_score += w * exact_match(obj1["name"], obj2["name"]) + if check_property_present("identity_class", obj1, obj2): + w = weights["identity_class"] + sum_weights += w + matching_score += w * exact_match(obj1["identity_class"], obj2["identity_class"]) + if check_property_present("sectors", obj1, obj2): + w = weights["sectors"] + sum_weights += w + matching_score += w * partial_list_based(obj1["sectors"], obj2["sectors"]) + return matching_score, sum_weights + + +def _indicator_checks(obj1, obj2, **weights): + matching_score = 0.0 + sum_weights = 0.0 + if check_property_present("indicator_types", obj1, obj2): + w = weights["indicator_types"] + sum_weights += w + matching_score += w * partial_list_based(obj1["indicator_types"], obj2["indicator_types"]) + if check_property_present("pattern", obj1, obj2): + w = weights["pattern"] + sum_weights += w + matching_score 
+= w * custom_pattern_based(obj1["pattern"], obj2["pattern"]) + if check_property_present("valid_from", obj1, obj2): + w = weights["valid_from"] + sum_weights += w + matching_score += ( + w * + partial_timestamp_based(obj1["valid_from"], obj2["valid_from"], weights["tdelta"]) + ) + return matching_score, sum_weights + + +def _location_checks(obj1, obj2, **weights): + matching_score = 0.0 + sum_weights = 0.0 + if check_property_present("latitude", obj1, obj2) and check_property_present("longitude", obj1, obj2): + w = weights["longitude_latitude"] + sum_weights += w + matching_score += ( + w * + partial_location_distance(obj1["latitude"], obj1["longitude"], obj2["latitude"], obj2["longitude"]) + ) + if check_property_present("region", obj1, obj2): + w = weights["region"] + sum_weights += w + matching_score += w * exact_match(obj1["region"], obj2["region"]) + if check_property_present("country", obj1, obj2): + w = weights["country"] + sum_weights += w + matching_score += w * exact_match(obj1["country"], obj2["country"]) + return matching_score, sum_weights + + +def _malware_checks(obj1, obj2, **weights): + matching_score = 0.0 + sum_weights = 0.0 + if check_property_present("malware_types", obj1, obj2): + w = weights["malware_types"] + sum_weights += w + matching_score += w * partial_list_based(obj1["malware_types"], obj2["malware_types"]) + if check_property_present("name", obj1, obj2): + w = weights["name"] + sum_weights += w + matching_score += w * partial_string_based(obj1["name"], obj2["name"]) + return matching_score, sum_weights + + +def _threat_actor_checks(obj1, obj2, **weights): + matching_score = 0.0 + sum_weights = 0.0 + if check_property_present("name", obj1, obj2): + w = weights["name"] + sum_weights += w + matching_score += w * partial_string_based(obj1["name"], obj2["name"]) + if check_property_present("threat_actor_types", obj1, obj2): + w = weights["threat_actor_types"] + sum_weights += w + matching_score += w * 
partial_list_based(obj1["threat_actor_types"], obj2["threat_actor_types"]) + if check_property_present("aliases", obj1, obj2): + w = weights["aliases"] + sum_weights += w + matching_score += w * partial_list_based(obj1["aliases"], obj2["aliases"]) + return matching_score, sum_weights + + +def _tool_checks(obj1, obj2, **weights): + matching_score = 0.0 + sum_weights = 0.0 + if check_property_present("tool_types", obj1, obj2): + w = weights["tool_types"] + sum_weights += w + matching_score += w * partial_list_based(obj1["tool_types"], obj2["tool_types"]) + if check_property_present("name", obj1, obj2): + w = weights["name"] + sum_weights += w + matching_score += w * partial_string_based(obj1["name"], obj2["name"]) + return matching_score, sum_weights + + +def _vulnerability_checks(obj1, obj2, **weights): + matching_score = 0.0 + sum_weights = 0.0 + if check_property_present("name", obj1, obj2): + w = weights["name"] + sum_weights += w + matching_score += w * partial_string_based(obj1["name"], obj2["name"]) + if check_property_present("external_references", obj1, obj2): + w = weights["external_references"] + sum_weights += w + matching_score += w * partial_external_reference_based( + obj1["external_references"], + obj2["external_references"], + ) + return matching_score, sum_weights + + +def _course_of_action_checks(obj1, obj2, **weights): + raise SemanticEquivalenceUnsupportedTypeError("course-of-action type has no semantic equivalence implementation!") + + +def _intrusion_set_checks(obj1, obj2, **weights): + raise SemanticEquivalenceUnsupportedTypeError("intrusion-set type has no semantic equivalence implementation!") + + +def _observed_data_checks(obj1, obj2, **weights): + raise SemanticEquivalenceUnsupportedTypeError("observed-data type has no semantic equivalence implementation!") + + +def _report_checks(obj1, obj2, **weights): + raise SemanticEquivalenceUnsupportedTypeError("report type has no semantic equivalence implementation!") diff --git 
a/stix2/exceptions.py b/stix2/exceptions.py index f1f1c09..3e4dcf5 100644 --- a/stix2/exceptions.py +++ b/stix2/exceptions.py @@ -216,3 +216,10 @@ class TLPMarkingDefinitionError(STIXError, AssertionError): def __str__(self): msg = "Marking {0} does not match spec marking {1}!" return msg.format(self.user_obj, self.spec_obj) + + +class SemanticEquivalenceUnsupportedTypeError(STIXError, TypeError): + """STIX object type not supported by the semantic equivalence approach.""" + + def __init__(self, msg): + super(SemanticEquivalenceUnsupportedTypeError, self).__init__(msg) diff --git a/stix2/test/v21/test_environment.py b/stix2/test/v21/test_environment.py index 7427109..f645513 100644 --- a/stix2/test/v21/test_environment.py +++ b/stix2/test/v21/test_environment.py @@ -2,6 +2,7 @@ import pytest import stix2 import stix2.environment +import stix2.exceptions from .constants import ( ATTACK_PATTERN_ID, ATTACK_PATTERN_KWARGS, CAMPAIGN_ID, CAMPAIGN_KWARGS, @@ -444,7 +445,7 @@ def test_semantic_equivalence_on_same_indicator(): ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) ind2 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) env = stix2.Environment().semantically_equivalent(ind1, ind2) - assert round(env) == 20 # No support for pattern, hence the 20 + assert round(env) == 100 def test_semantic_equivalence_on_same_location1(): @@ -556,9 +557,36 @@ def test_semantic_equivalence_on_unknown_object(): }, ], ) + + def _x_foobar_checks(obj1, obj2, **weights): + matching_score = 0.0 + sum_weights = 0.0 + if stix2.environment.check_property_present("external_references", obj1, obj2): + w = weights["external_references"] + sum_weights += w + matching_score += w * stix2.environment.partial_external_reference_based( + obj1["external_references"], + obj2["external_references"], + ) + if stix2.environment.check_property_present("name", obj1, obj2): + w = weights["name"] + sum_weights += w + matching_score += w * 
stix2.environment.partial_string_based(obj1["name"], obj2["name"]) + return matching_score, sum_weights + + weights = { + "x-foobar": { + "external_references": 40, + "name": 60, + "method": _x_foobar_checks, + }, + "_internal": { + "ignore_spec_version": False, + }, + } cust1 = stix2.parse(CUSTOM_KWARGS1, allow_custom=True) cust2 = stix2.parse(CUSTOM_KWARGS2, allow_custom=True) - env = stix2.Environment().semantically_equivalent(cust1, cust2) + env = stix2.Environment().semantically_equivalent(cust1, cust2, **weights) assert round(env) == 0 @@ -584,22 +612,35 @@ def test_semantic_equivalence_different_spec_version_raises(): assert str(excinfo.value) == "The objects to compare must be of the same spec version!" -def test_semantic_equivalence_on_unsupported_types(): - coa1 = stix2.v21.CourseOfAction(id=COURSE_OF_ACTION_ID, **COURSE_OF_ACTION_KWARGS) - ints1 = stix2.v21.IntrusionSet(id=INTRUSION_SET_ID, **INTRUSION_SET_KWARGS) - obs1 = stix2.v21.ObservedData(id=OBSERVED_DATA_ID, **OBSERVED_DATA_KWARGS) - rep1 = stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS) - - coa2 = stix2.v21.CourseOfAction(id=COURSE_OF_ACTION_ID, **COURSE_OF_ACTION_KWARGS) - ints2 = stix2.v21.IntrusionSet(id=INTRUSION_SET_ID, **INTRUSION_SET_KWARGS) - obs2 = stix2.v21.ObservedData(id=OBSERVED_DATA_ID, **OBSERVED_DATA_KWARGS) - rep2 = stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS) - - obj_list = [(coa1, coa2), (ints1, ints2), (obs1, obs2), (rep1, rep2)] - - for obj1, obj2 in obj_list: - env = stix2.Environment().semantically_equivalent(obj1, obj2) - assert round(env) == 0 +@pytest.mark.parametrize( + "obj1,obj2,ret_val", + [ + ( + stix2.v21.CourseOfAction(id=COURSE_OF_ACTION_ID, **COURSE_OF_ACTION_KWARGS), + stix2.v21.CourseOfAction(id=COURSE_OF_ACTION_ID, **COURSE_OF_ACTION_KWARGS), + "course-of-action type has no semantic equivalence implementation!", + ), + ( + stix2.v21.IntrusionSet(id=INTRUSION_SET_ID, **INTRUSION_SET_KWARGS), + stix2.v21.IntrusionSet(id=INTRUSION_SET_ID, 
**INTRUSION_SET_KWARGS), + "intrusion-set type has no semantic equivalence implementation!", + ), + ( + stix2.v21.ObservedData(id=OBSERVED_DATA_ID, **OBSERVED_DATA_KWARGS), + stix2.v21.ObservedData(id=OBSERVED_DATA_ID, **OBSERVED_DATA_KWARGS), + "observed-data type has no semantic equivalence implementation!", + ), + ( + stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS), + stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS), + "report type has no semantic equivalence implementation!", + ), + ], +) +def test_semantic_equivalence_on_unsupported_types(obj1, obj2, ret_val): + with pytest.raises(stix2.exceptions.SemanticEquivalenceUnsupportedTypeError) as excinfo: + stix2.Environment().semantically_equivalent(obj1, obj2) + assert ret_val == str(excinfo.value) def test_semantic_equivalence_zero_match(): @@ -607,54 +648,21 @@ def test_semantic_equivalence_zero_match(): indicator_types=["APTX"], pattern="[ipv4-addr:value = '192.168.1.1']", ) - weigths = { - "attack-pattern": { - "name": 30, - "external_references": 70, - }, - "campaign": { - "name": 60, - "aliases": 40, - }, - "identity": { - "name": 60, - "identity_class": 20, - "sectors": 20, - }, + weights = { "indicator": { "indicator_types": 15, - "pattern": 85, + "pattern": 80, "valid_from": 0, - }, - "location": { - "longitude_latitude": 34, - "region": 33, - "country": 33, - }, - "malware": { - "malware_types": 20, - "name": 80, - }, - "threat-actor": { - "name": 60, - "threat_actor_types": 20, - "aliases": 20, - }, - "tool": { - "tool_types": 20, - "name": 80, - }, - "vulnerability": { - "name": 30, - "external_references": 70, + "tdelta": 1, # One day interval + "method": stix2.environment._indicator_checks, }, "_internal": { - "tdelta": 1, + "ignore_spec_version": False, }, } ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) ind2 = stix2.v21.Indicator(id=INDICATOR_ID, **IND_KWARGS) - env = stix2.Environment().semantically_equivalent(ind1, ind2, **weigths) + env = 
stix2.Environment().semantically_equivalent(ind1, ind2, **weights) assert round(env) == 0 @@ -727,17 +735,17 @@ def test_semantic_equivalence_zero_match(): ], ) def test_semantic_equivalence_external_references(refs1, refs2, ret_val): - value = stix2.environment._partial_external_reference_based(refs1, refs2) + value = stix2.environment.partial_external_reference_based(refs1, refs2) assert value == ret_val def test_semantic_equivalence_timetamp(): t1 = "2018-10-17T00:14:20.652Z" t2 = "2018-10-17T12:14:20.652Z" - assert stix2.environment._partial_timestamp_based(t1, t2, 1) == 0.5 + assert stix2.environment.partial_timestamp_based(t1, t2, 1) == 0.5 def test_semantic_equivalence_exact_match(): t1 = "2018-10-17T00:14:20.652Z" t2 = "2018-10-17T12:14:20.652Z" - assert stix2.environment._exact_match(t1, t2) == 0.0 + assert stix2.environment.exact_match(t1, t2) == 0.0 From dc79a1f869e54f941e81d31a7544f4e0ebcf6e7e Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Mon, 23 Sep 2019 23:13:50 -0400 Subject: [PATCH 10/23] add docstrings for new public methods. add test with disabled spec_version check. fix calculation for distance, using incorrect algorithm. 
update package settings, tox settings --- setup.py | 2 +- stix2/environment.py | 87 ++++++++++++++++++++++++++++-- stix2/test/v21/test_environment.py | 23 ++++++++ tox.ini | 1 + 4 files changed, 107 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index 185c76c..481534b 100644 --- a/setup.py +++ b/setup.py @@ -63,6 +63,6 @@ setup( }, extras_require={ 'taxii': ['taxii2-client'], - 'semantic': ['pyjarowinkler'], + 'semantic': ['haversine', 'pyjarowinkler'], }, ) diff --git a/stix2/environment.py b/stix2/environment.py index c013ae2..d2c6d3a 100644 --- a/stix2/environment.py +++ b/stix2/environment.py @@ -2,7 +2,6 @@ import copy import logging -import math import time from .core import parse as _parse @@ -252,6 +251,7 @@ class Environment(DataStoreMixin): "longitude_latitude": 34, "region": 33, "country": 33, + "threshold": 1000.0, "method": _location_checks, }, "malware": { @@ -309,12 +309,25 @@ class Environment(DataStoreMixin): def check_property_present(prop, obj1, obj2): + """Helper method checks if a property is present on both objects.""" if prop in obj1 and prop in obj2: return True return False def partial_timestamp_based(t1, t2, tdelta): + """Performs a timestamp-based matching via checking how close one timestamp is to another. + + Args: + t1: A datetime string or STIXdatetime object. + t2: A datetime string or STIXdatetime object. + tdelta (float): A given time delta. This number is multiplied by 86400 (1 day) to + extend or shrink your time change tolerance. + + Returns: + float: Number between 0.0 and 1.0 depending on match criteria. + + """ if not isinstance(t1, STIXdatetime): t1 = parse_into_datetime(t1) if not isinstance(t2, STIXdatetime): @@ -324,27 +337,77 @@ def partial_timestamp_based(t1, t2, tdelta): def partial_list_based(l1, l2): + """Performs a partial list matching via finding the intersection between common values. + + Args: + l1: A list of values. + l2: A list of values. 
+ + Returns: + float: Number between 0.0 and 1.0 depending on the proportion of shared values. + + """ l1_set, l2_set = set(l1), set(l2) return len(l1_set.intersection(l2_set)) / max(len(l1), len(l2)) def exact_match(val1, val2): + """Performs an exact value match based on two values + + Args: + val1: A value suitable for an equality test. + val2: A value suitable for an equality test. + + Returns: + float: 1.0 if the value matches exactly, 0.0 otherwise. + + """ if val1 == val2: return 1.0 return 0.0 def partial_string_based(str1, str2): + """Performs a partial string match using the Jaro-Winkler distance algorithm. + + Args: + str1: A string value to check. + str2: A string value to check. + + Returns: + float: Number between 0.0 and 1.0 depending on match criteria. + + """ from pyjarowinkler import distance return distance.get_jaro_distance(str1, str2) def custom_pattern_based(pattern1, pattern2): + """Performs a matching on Indicator Patterns. + + Args: + pattern1: An Indicator pattern + pattern2: An Indicator pattern + + Returns: + float: Number between 0.0 and 1.0 depending on match criteria. + + """ logger.warning("Indicator pattern equivalence is not fully defined; will default to zero if not completely identical") return exact_match(pattern1, pattern2) # TODO: Implement pattern based equivalence def partial_external_reference_based(refs1, refs2): + """Performs a matching on External References. + + Args: + refs1: A list of external references. + refs2: A list of external references. + + Returns: + float: Number between 0.0 and 1.0 depending on matches. 
+ + """ allowed = set(("veris", "cve", "capec", "mitre-attack")) matches = 0 @@ -387,9 +450,23 @@ def partial_external_reference_based(refs1, refs2): return matches / max(len(refs1), len(refs2)) -def partial_location_distance(lat1, long1, lat2, long2): - distance = math.sqrt(((lat2 - lat1) ** 2) + ((long2 - long1) ** 2)) - return 1 - (distance / 1000.0) +def partial_location_distance(lat1, long1, lat2, long2, threshold): + """Given two coordinates perform a matching based on its distance using the Haversine Formula. + + Args: + lat1: Latitude value for first coordinate point. + lat2: Latitude value for second coordinate point. + long1: Longitude value for first coordinate point. + long2: Longitude value for second coordinate point. + threshold (float): A kilometer measurement for the threshold distance between these two points. + + Returns: + float: Number between 0.0 and 1.0 depending on match. + + """ + from haversine import haversine, Unit + distance = haversine((lat1, long1), (lat2, long2), unit=Unit.KILOMETERS) + return 1 - (distance / threshold) def _attack_pattern_checks(obj1, obj2, **weights): @@ -470,7 +547,7 @@ def _location_checks(obj1, obj2, **weights): sum_weights += w matching_score += ( w * - partial_location_distance(obj1["latitude"], obj1["longitude"], obj2["latitude"], obj2["longitude"]) + partial_location_distance(obj1["latitude"], obj1["longitude"], obj2["latitude"], obj2["longitude"], weights["threshold"]) ) if check_property_present("region", obj1, obj2): w = weights["region"] diff --git a/stix2/test/v21/test_environment.py b/stix2/test/v21/test_environment.py index f645513..d962147 100644 --- a/stix2/test/v21/test_environment.py +++ b/stix2/test/v21/test_environment.py @@ -666,6 +666,29 @@ def test_semantic_equivalence_zero_match(): assert round(env) == 0 +def test_semantic_equivalence_different_spec_version(): + IND_KWARGS = dict( + labels=["APTX"], + pattern="[ipv4-addr:value = '192.168.1.1']", + ) + weights = { + "indicator": { + 
"indicator_types": 15, + "pattern": 80, + "valid_from": 0, + "tdelta": 1, # One day interval + "method": stix2.environment._indicator_checks, + }, + "_internal": { + "ignore_spec_version": True, # Disables spec_version check. + }, + } + ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) + ind2 = stix2.v20.Indicator(id=INDICATOR_ID, **IND_KWARGS) + env = stix2.Environment().semantically_equivalent(ind1, ind2, **weights) + assert round(env) == 0 + + @pytest.mark.parametrize( "refs1,refs2,ret_val", [ ( diff --git a/tox.ini b/tox.ini index 2bdae15..7911fde 100644 --- a/tox.ini +++ b/tox.ini @@ -10,6 +10,7 @@ deps = coverage taxii2-client pyjarowinkler + haversine medallion commands = pytest --ignore=stix2/test/v20/test_workbench.py --ignore=stix2/test/v21/test_workbench.py --cov=stix2 stix2/test/ --cov-report term-missing From de478df68720fcdc9699be44c02f7deb4b389517 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Mon, 23 Sep 2019 23:27:43 -0400 Subject: [PATCH 11/23] update test after merge, formatting --- stix2/exceptions.py | 1 + stix2/test/v21/test_environment.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/stix2/exceptions.py b/stix2/exceptions.py index 71ef099..6405c2e 100644 --- a/stix2/exceptions.py +++ b/stix2/exceptions.py @@ -234,6 +234,7 @@ class STIXDeprecationWarning(DeprecationWarning): """ pass + class SemanticEquivalenceUnsupportedTypeError(STIXError, TypeError): """STIX object type not supported by the semantic equivalence approach.""" diff --git a/stix2/test/v21/test_environment.py b/stix2/test/v21/test_environment.py index 1157b50..62b0c53 100644 --- a/stix2/test/v21/test_environment.py +++ b/stix2/test/v21/test_environment.py @@ -650,6 +650,8 @@ def test_semantic_equivalence_zero_match(): IND_KWARGS = dict( indicator_types=["APTX"], pattern="[ipv4-addr:value = '192.168.1.1']", + pattern_type="stix", + valid_from="2019-01-01T12:34:56Z", ) weights = { "indicator": { From 75b87f50dd9688d76e945f1f345073486d49e989 
Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Mon, 23 Sep 2019 23:33:04 -0400 Subject: [PATCH 12/23] Update .isort.cfg --- .isort.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/.isort.cfg b/.isort.cfg index 0f2fca3..db580a5 100644 --- a/.isort.cfg +++ b/.isort.cfg @@ -4,6 +4,7 @@ not_skip = __init__.py known_third_party = antlr4, dateutil, + haversine, medallion, pyjarowinkler, pytest, From 47551b8cc10a1457f0c0f5df1fe7edd4c7d32d30 Mon Sep 17 00:00:00 2001 From: Chris Lenk Date: Wed, 25 Sep 2019 14:28:24 -0400 Subject: [PATCH 13/23] Add documentation for semantic equivalence --- docs/guide/environment.ipynb | 2 +- docs/guide/equivalence.ipynb | 1436 ++++++++++++++++++++++++++++++++++ 2 files changed, 1437 insertions(+), 1 deletion(-) create mode 100644 docs/guide/equivalence.ipynb diff --git a/docs/guide/environment.ipynb b/docs/guide/environment.ipynb index f7515a5..8d22e39 100644 --- a/docs/guide/environment.ipynb +++ b/docs/guide/environment.ipynb @@ -67,7 +67,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ diff --git a/docs/guide/equivalence.ipynb b/docs/guide/equivalence.ipynb new file mode 100644 index 0000000..9b49c2f --- /dev/null +++ b/docs/guide/equivalence.ipynb @@ -0,0 +1,1436 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "# Delete this cell to re-enable tracebacks\n", + "import sys\n", + "ipython = get_ipython()\n", + "\n", + "def hide_traceback(exc_tuple=None, filename=None, tb_offset=None,\n", + " exception_only=False, running_compiled_code=False):\n", + " etype, value, tb = sys.exc_info()\n", + " return ipython._showtraceback(etype, value, ipython.InteractiveTB.get_exception_only(etype, value))\n", + "\n", + "ipython.showtraceback = hide_traceback" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "nbsphinx": "hidden" + }, + 
"outputs": [], + "source": [ + "# JSON output syntax highlighting\n", + "from __future__ import print_function\n", + "from pygments import highlight\n", + "from pygments.lexers import JsonLexer, TextLexer\n", + "from pygments.formatters import HtmlFormatter\n", + "from IPython.display import display, HTML\n", + "from IPython.core.interactiveshell import InteractiveShell\n", + "\n", + "InteractiveShell.ast_node_interactivity = \"all\"\n", + "\n", + "def json_print(inpt):\n", + " string = str(inpt)\n", + " formatter = HtmlFormatter()\n", + " if string[0] == '{':\n", + " lexer = JsonLexer()\n", + " else:\n", + " lexer = TextLexer()\n", + " return HTML('{}'.format(\n", + " formatter.get_style_defs('.highlight'),\n", + " highlight(string, lexer, formatter)))\n", + "\n", + "globals()['print'] = json_print" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Checking Semantic Equivalence\n", + "\n", + "The [Environment](../api/stix2.environment.rst#stix2.environment.Environment) has a function for checking if two STIX Objects are semantically equivalent. For each supported object type, the algorithm checks if the values for a specific set of properties match. Then each matching property is weighted since every property doesn't represent the same level of importance for semantic equivalence. The result will be the sum of these weighted values, in the range of 0 to 100. A result of 0 means that the two objects are not equivalent, and a result of 100 means that they are equivalent.\n", + "\n", + "TODO: Add a link to the committee note when it is released.\n", + "\n", + "Below we will show examples of the semantic equivalence results of various objects. Unless otherwise specified, the ID of each object will be generated by the library, so the two objects will not have the same ID. 
This demonstrates that the semantic equivalence algorithm only looks at specific properties for each object type.\n", + "\n", + "### Attack Pattern Example\n", + "\n", + "For Attack Patterns, the only properties that contribute to semantic equivalence are `name` and `external_references`, with weights of 30 and 70, respectively. In this example, both attack patterns have the same external reference but the second has a slightly different yet still similar name." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
85.3\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import stix2\n", + "from stix2 import Environment, MemoryStore\n", + "from stix2.v21 import AttackPattern\n", + "\n", + "env = Environment(store=MemoryStore())\n", + "\n", + "ap1 = AttackPattern(\n", + " name=\"Phishing\",\n", + " external_references=[\n", + " {\n", + " \"url\": \"https://example2\",\n", + " \"source_name\": \"some-source2\",\n", + " },\n", + " ],\n", + ")\n", + "ap2 = AttackPattern(\n", + " name=\"Spear phishing\",\n", + " external_references=[\n", + " {\n", + " \"url\": \"https://example2\",\n", + " \"source_name\": \"some-source2\",\n", + " },\n", + " ],\n", + ")\n", + "print(env.semantically_equivalent(ap1, ap2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Campaign Example\n", + "\n", + "For Campaigns, the only properties that contribute to semantic equivalence are `name` and `aliases`, with weights of 60 and 40, respectively. In this example, the two campaigns have completely different names, but slightly similar descriptions." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
50.0\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from stix2.v21 import Campaign\n", + "\n", + "c1 = Campaign(\n", + " name=\"Someone Attacks Somebody\",\n", + " description=\"A campaign targeting....\",)\n", + "\n", + "c2 = Campaign(\n", + " name=\"Another Campaign\",\n", + " description=\"A campaign that targets....\",)\n", + "print(env.semantically_equivalent(c1, c2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Identity Example\n", + "\n", + "For Identities, the only properties that contribute to semantic equivalence are `name`, `identity_class`, and `sectors`, with weights of 60, 20, and 20, respectively. In this example, the two identities are identical, but are missing one of the contributing properties. The algorithm only compares properties that are actually present on the objects. Also note that they have completely different description properties, but because description is not one of the properties considered for semantic equivalence, this difference has no effect on the result." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
100.0\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from stix2.v21 import Identity\n", + "\n", + "id1 = Identity(\n", + " name=\"John Smith\",\n", + " identity_class=\"individual\",\n", + " description=\"Just some guy\",\n", + ")\n", + "id2 = Identity(\n", + " name=\"John Smith\",\n", + " identity_class=\"individual\",\n", + " description=\"A person\",\n", + ")\n", + "print(env.semantically_equivalent(id1, id2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Indicator Example\n", + "\n", + "For Indicators, the only properties that contribute to semantic equivalence are `indicator_types`, `pattern`, and `valid_from`, with weights of 15, 80, and 5, respectively. In this example, the two indicators have patterns with different hashes but the same indicator_type and valid_from. For patterns, the algorithm currently only checks if they are identical." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Indicator pattern equivalence is not fully defined; will default to zero if not completely identical\n" + ] + }, + { + "data": { + "text/html": [ + "
20.0\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from stix2.v21 import Indicator\n", + "\n", + "ind1 = Indicator(\n", + " indicator_types=['malicious-activity'],\n", + " pattern_type=\"stix\",\n", + " pattern=\"[file:hashes.MD5 = 'd41d8cd98f00b204e9800998ecf8427e']\",\n", + " valid_from=\"2017-01-01T12:34:56Z\",\n", + ")\n", + "ind2 = Indicator(\n", + " indicator_types=['malicious-activity'],\n", + " pattern_type=\"stix\",\n", + " pattern=\"[file:hashes.MD5 = '79054025255fb1a26e4bc422aef54eb4']\",\n", + " valid_from=\"2017-01-01T12:34:56Z\",\n", + ")\n", + "print(env.semantically_equivalent(ind1, ind2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Location Example\n", + "\n", + "For Locations, the only properties that contribute to semantic equivalence are `longitude`/`latitude`, `region`, and `country`, with weights of 34, 33, and 33, respectively. In this example, the two locations are Washington, D.C. and New York City. The algorithm computes the distance between two locations using the haversine formula and uses that to influence equivalence." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
67.20663955882583\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from stix2.v21 import Location\n", + "\n", + "loc1 = Location(\n", + " latitude=38.889,\n", + " longitude=-77.023,\n", + ")\n", + "loc2 = Location(\n", + " latitude=40.713,\n", + " longitude=-74.006,\n", + ")\n", + "print(env.semantically_equivalent(loc1, loc2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Malware Example\n", + "\n", + "For Malware, the only properties that contribute to semantic equivalence are `malware_types` and `name`, with weights of 20 and 80, respectively. In this example, the two malware objects only differ in the strings in their malware_types lists. For lists, the algorithm bases its calculations on the intersection of the two lists. An empty intersection will result in a 0, and a complete intersection will result in a 1 for that property." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
90.0\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from stix2.v21 import Malware\n", + "\n", + "MALWARE_ID = \"malware--9c4638ec-f1de-4ddb-abf4-1b760417654e\"\n", + "\n", + "mal1 = Malware(id=MALWARE_ID,\n", + " malware_types=['ransomware'],\n", + " name=\"Cryptolocker\",\n", + " is_family=False,\n", + " )\n", + "mal2 = Malware(id=MALWARE_ID,\n", + " malware_types=['ransomware', 'dropper'],\n", + " name=\"Cryptolocker\",\n", + " is_family=False,\n", + " )\n", + "print(env.semantically_equivalent(mal1, mal2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Threat Actor Example\n", + "\n", + "For Threat Actors, the only properties that contribute to semantic equivalence are `threat_actor_types`, `name`, and `aliases`, with weights of 20, 60, and 20, respectively. In this example, the two threat actors have the same id properties but everything else is different. Since the id property does not factor into semantic equivalence, the result is not very high. The result is not zero because the algorithm is using the Jaro-Winkler distance between strings in the threat_actor_types and name properties." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
33.6\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from stix2.v21 import ThreatActor\n", + "\n", + "THREAT_ACTOR_ID = \"threat-actor--8e2e2d2b-17d4-4cbf-938f-98ee46b3cd3f\"\n", + "\n", + "ta1 = ThreatActor(id=THREAT_ACTOR_ID,\n", + " threat_actor_types=[\"crime-syndicate\"],\n", + " name=\"Evil Org\",\n", + " aliases=[\"super-evil\"],\n", + ")\n", + "ta2 = ThreatActor(id=THREAT_ACTOR_ID,\n", + " threat_actor_types=[\"spy\"],\n", + " name=\"James Bond\",\n", + " aliases=[\"007\"],\n", + ")\n", + "print(env.semantically_equivalent(ta1, ta2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Tool Example\n", + "\n", + "For Tools, the only properties that contribute to semantic equivalence are `tool_types` and `name`, with weights of 20 and 80, respectively. In this example, the two tools have the same values for properties that contribute to semantic equivalence but one has an additional, non-contributing property." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
100.0\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from stix2.v21 import Tool\n", + "\n", + "t1 = Tool(\n", + " tool_types=[\"remote-access\"],\n", + " name=\"VNC\",\n", + ")\n", + "t2 = Tool(\n", + " tool_types=[\"remote-access\"],\n", + " name=\"VNC\",\n", + " description=\"This is a tool\"\n", + ")\n", + "print(env.semantically_equivalent(t1, t2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Vulnerability Example\n", + "\n", + "For Vulnerabilities, the only properties that contribute to semantic equivalence are `name` and `external_references`, with weights of 30 and 70, respectively. In this example, the two vulnerabilities have the same name but one also has an external reference. The algorithm doesn't take into account any semantic equivalence contributing properties that are not present on both objects." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
100.0\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from stix2.v21 import Vulnerability\n", + "\n", + "vuln1 = Vulnerability(\n", + " name=\"Heartbleed\",\n", + " external_references=[\n", + " {\n", + " \"url\": \"https://example\",\n", + " \"source_name\": \"some-source\",\n", + " },\n", + " ],\n", + ")\n", + "vuln2 = Vulnerability(\n", + " name=\"Heartbleed\",\n", + ")\n", + "print(env.semantically_equivalent(vuln1, vuln2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Other Examples\n", + "\n", + "Comparing objects of different types will result in an error." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "The objects to compare must be of the same type!", + "output_type": "error", + "traceback": [ + "\u001b[0;31mValueError\u001b[0m\u001b[0;31m:\u001b[0m The objects to compare must be of the same type!\n" + ] + } + ], + "source": [ + "print(env.semantically_equivalent(ind1, vuln1))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Some object types do not have a defined method for calculating semantic equivalence and by default will raise an error." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "ename": "SemanticEquivalenceUnsupportedTypeError", + "evalue": "report type has no semantic equivalence implementation!", + "output_type": "error", + "traceback": [ + "\u001b[0;31mSemanticEquivalenceUnsupportedTypeError\u001b[0m\u001b[0;31m:\u001b[0m report type has no semantic equivalence implementation!\n" + ] + } + ], + "source": [ + "from stix2.v21 import Report\n", + "\n", + "r1 = Report(\n", + " report_types=[\"campaign\"],\n", + " name=\"Bad Cybercrime\",\n", + " published=\"2016-04-06T20:03:00.000Z\",\n", + " object_refs=[\"indicator--a740531e-63ff-4e49-a9e1-a0a3eed0e3e7\"],\n", + ")\n", + "r2 = Report(\n", + " report_types=[\"campaign\"],\n", + " name=\"Bad Cybercrime\",\n", + " published=\"2016-04-06T20:03:00.000Z\",\n", + " object_refs=[\"indicator--a740531e-63ff-4e49-a9e1-a0a3eed0e3e7\"],\n", + ")\n", + "print(env.semantically_equivalent(r1, r2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By default, comparing objects of different spec versions will result in an error. You can optionally allow this by providing a configuration dictionary like in the next example:" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
100.0\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from stix2.v20 import Identity as Identity20\n", + "\n", + "id20 = Identity20(\n", + " name=\"John Smith\",\n", + " identity_class=\"individual\",\n", + ")\n", + "print(env.semantically_equivalent(id2, id20, **{\"_internal\": {\"ignore_spec_version\": True}}))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can modify the weights or provide your own functions for comparing objects of a certain type by providing them in a dictionary to the optional 3rd parameter to the semantic equivalence function. You can find functions (like `partial_string_based`) to help with this in the [Environment API docs](../api/stix2.environment.rst#stix2.environment.Environment). In this example we define semantic equivalence for our new `x-foobar` object type:" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
60.0\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def _x_foobar_checks(obj1, obj2, **weights):\n", + " matching_score = 0.0\n", + " sum_weights = 0.0\n", + " if stix2.environment.check_property_present(\"name\", obj1, obj2):\n", + " w = weights[\"name\"]\n", + " sum_weights += w\n", + " matching_score += w * stix2.environment.partial_string_based(obj1[\"name\"], obj2[\"name\"])\n", + " if stix2.environment.check_property_present(\"color\", obj1, obj2):\n", + " w = weights[\"color\"]\n", + " sum_weights += w\n", + " matching_score += w * stix2.environment.partial_string_based(obj1[\"color\"], obj2[\"color\"])\n", + " return matching_score, sum_weights\n", + "\n", + "weights = {\n", + " \"x-foobar\": {\n", + " \"name\": 60,\n", + " \"color\": 40,\n", + " \"method\": _x_foobar_checks,\n", + " },\n", + " \"_internal\": {\n", + " \"ignore_spec_version\": False,\n", + " },\n", + "}\n", + "foo1 = {\n", + " \"type\":\"x-foobar\",\n", + " \"id\":\"x-foobar--0c7b5b88-8ff7-4a4d-aa9d-feb398cd0061\",\n", + " \"name\": \"Zot\",\n", + " \"color\": \"red\",\n", + "}\n", + "foo2 = {\n", + " \"type\":\"x-foobar\",\n", + " \"id\":\"x-foobar--0c7b5b88-8ff7-4a4d-aa9d-feb398cd0061\",\n", + " \"name\": \"Zot\",\n", + " \"color\": \"blue\",\n", + "}\n", + "print(env.semantically_equivalent(foo1, foo2, **weights))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 0124dbc0dcd44cf44bb99f75cbef99ee79c7d651 Mon Sep 17 00:00:00 2001 From: Chris Lenk Date: Wed, 25 Sep 2019 16:02:26 -0400 Subject: [PATCH 14/23] Update CHANGELOG 
for v1.2.0 --- CHANGELOG | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index f4cce28..e2cb8ad 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,15 @@ CHANGELOG ========= +1.2.0 - 2019-09-25 + +* #268, #271, #273, #275, #283, #285, #290 Changes support of STIX 2.1 to WD05 (CSD02), for all object types +* #269 Updates id properties to take a spec_version parameter +* #283 Changes the exception class hierarchy +* #289 Adds functions for calculating semantic equivalence of two objects +* #286 Fixes handling of custom observable extensions +* #287 Fixes bug with timestamp precision preservation in MarkingDefinition objects + 1.1.3 - 2019-08-12 * #258 Ignores empty values for optional fields From c6936ae7a226868ca6bd47cebb4222ff240cd332 Mon Sep 17 00:00:00 2001 From: Chris Lenk Date: Wed, 25 Sep 2019 16:04:07 -0400 Subject: [PATCH 15/23] =?UTF-8?q?Bump=20version:=201.1.3=20=E2=86=92=201.2?= =?UTF-8?q?.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- setup.cfg | 2 +- stix2/version.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index b012bb9..758875d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.1.3 +current_version = 1.2.0 commit = True tag = True diff --git a/stix2/version.py b/stix2/version.py index 0b2f79d..c68196d 100644 --- a/stix2/version.py +++ b/stix2/version.py @@ -1 +1 @@ -__version__ = "1.1.3" +__version__ = "1.2.0" From 9e04481acbec649390e6ae8363d87f437584bd82 Mon Sep 17 00:00:00 2001 From: Chris Lenk Date: Thu, 26 Sep 2019 15:52:44 -0400 Subject: [PATCH 16/23] Update requirements.txt Fix failing ReadTheDocs builds. 
Related: https://github.com/readthedocs/readthedocs.org/issues/5332 --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 5ac0b37..08d22f4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ bumpversion ipython +mkdocs<=1.0.4 nbsphinx==0.3.2 pre-commit pytest From 3bc59d6898db803e1c13827bc9a3cde8e138b252 Mon Sep 17 00:00:00 2001 From: Chris Lenk Date: Thu, 26 Sep 2019 15:53:34 -0400 Subject: [PATCH 17/23] Update requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 08d22f4..ddbda18 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ bumpversion ipython -mkdocs<=1.0.4 +mkdocs>=1.0.4 nbsphinx==0.3.2 pre-commit pytest From b9927fd4a53f77641e59b27737d8497072bc5b91 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Mon, 30 Sep 2019 13:16:06 -0400 Subject: [PATCH 18/23] update .ipynb files with correct references. update package requirements --- docs/guide/custom.ipynb | 6 +++--- docs/guide/datastore.ipynb | 4 ++-- docs/guide/ts_support.ipynb | 2 +- docs/guide/workbench.ipynb | 4 ++-- requirements.txt | 5 ++--- 5 files changed, 10 insertions(+), 11 deletions(-) diff --git a/docs/guide/custom.ipynb b/docs/guide/custom.ipynb index 042f11e..7ceb33b 100644 --- a/docs/guide/custom.ipynb +++ b/docs/guide/custom.ipynb @@ -537,7 +537,7 @@ "source": [ "### Custom STIX Object Types\n", "\n", - "To create a custom STIX object type, define a class with the @[CustomObject](../api/stix2.v20.sdo.rst#stix2.v20.sdo.CustomObject) decorator. It takes the type name and a list of property tuples, each tuple consisting of the property name and a property instance. 
Any special validation of the properties can be added by supplying an ``__init__`` function.\n", + "To create a custom STIX object type, define a class with the @[CustomObject](../api/v20/stix2.v20.sdo.rst#stix2.v20.sdo.CustomObject) decorator. It takes the type name and a list of property tuples, each tuple consisting of the property name and a property instance. Any special validation of the properties can be added by supplying an ``__init__`` function.\n", "\n", "Let's say zoo animals have become a serious cyber threat and we want to model them in STIX using a custom object type. Let's use a ``species`` property to store the kind of animal, and make that property required. We also want a property to store the class of animal, such as \"mammal\" or \"bird\" but only want to allow specific values in it. We can add some logic to validate this property in ``__init__``." ] @@ -841,7 +841,7 @@ "source": [ "### Custom Cyber Observable Types\n", "\n", - "Similar to custom STIX object types, use a decorator to create [custom Cyber Observable](../api/stix2.v20.observables.rst#stix2.v20.observables.CustomObservable) types. Just as before, ``__init__()`` can hold additional validation, but it is not necessary." + "Similar to custom STIX object types, use a decorator to create [custom Cyber Observable](../api/v20/stix2.v20.observables.rst#stix2.v20.observables.CustomObservable) types. Just as before, ``__init__()`` can hold additional validation, but it is not necessary." ] }, { @@ -1163,7 +1163,7 @@ "source": [ "### Custom Cyber Observable Extensions\n", "\n", - "Finally, custom extensions to existing Cyber Observable types can also be created. Just use the @[CustomExtension](../api/stix2.v20.observables.rst#stix2.v20.observables.CustomExtension) decorator. Note that you must provide the Cyber Observable class to which the extension applies. Again, any extra validation of the properties can be implemented by providing an ``__init__()`` but it is not required. 
Let's say we want to make an extension to the ``File`` Cyber Observable Object:" + "Finally, custom extensions to existing Cyber Observable types can also be created. Just use the @[CustomExtension](../api/v20/stix2.v20.observables.rst#stix2.v20.observables.CustomExtension) decorator. Note that you must provide the Cyber Observable class to which the extension applies. Again, any extra validation of the properties can be implemented by providing an ``__init__()`` but it is not required. Let's say we want to make an extension to the ``File`` Cyber Observable Object:" ] }, { diff --git a/docs/guide/datastore.ipynb b/docs/guide/datastore.ipynb index 1ea05ee..e4aad79 100644 --- a/docs/guide/datastore.ipynb +++ b/docs/guide/datastore.ipynb @@ -454,7 +454,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "##### Note: The `defanged` property is now always included (implicitly) for STIX 2.1 Cyber Observable Objects (SCOs)\n", + "**Note: The `defanged` property is now always included (implicitly) for STIX 2.1 Cyber Observable Objects (SCOs)**\n\n", "This is important to remember if you are writing a filter that involves checking the `objects` property of a STIX 2.1 `ObservedData` object. If any of the objects associated with the `objects` property are STIX 2.1 SCOs, then your filter must include the `defanged` property. For an example, refer to `filters[14]` & `filters[15]` in stix2/test/v21/test_datastore_filters.py " ] }, @@ -492,7 +492,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "If a STIX object has a `created_by_ref` property, you can use the [creator_of()](../api/stix2.datastore.rst#stix2.datastore.DataSource.creator_of) method to retrieve the [Identity](../api/stix2.v20.sdo.rst#stix2.v20.sdo.Identity) object that created it." 
+ "If a STIX object has a `created_by_ref` property, you can use the [creator_of()](../api/stix2.datastore.rst#stix2.datastore.DataSource.creator_of) method to retrieve the [Identity](../api/v20/stix2.v20.sdo.rst#stix2.v20.sdo.Identity) object that created it." ] }, { diff --git a/docs/guide/ts_support.ipynb b/docs/guide/ts_support.ipynb index 8c89e93..2d36f5a 100644 --- a/docs/guide/ts_support.ipynb +++ b/docs/guide/ts_support.ipynb @@ -365,7 +365,7 @@ "source": [ "### How custom content works\n", "\n", - "[CustomObject](../api/stix2.v20.sdo.rst#stix2.v20.sdo.CustomObject), [CustomObservable](../api/stix2.v20.observables.rst#stix2.v20.observables.CustomObservable), [CustomMarking](../api/stix2.v20.common.rst#stix2.v20.common.CustomMarking) and [CustomExtension](../api/stix2.v20.observables.rst#stix2.v20.observables.CustomExtension) must be registered explicitly by STIX version. This is a design decision since properties or requirements may change as the STIX Technical Specification advances.\n", + "[CustomObject](../api/v20/stix2.v20.sdo.rst#stix2.v20.sdo.CustomObject), [CustomObservable](../api/v20/stix2.v20.observables.rst#stix2.v20.observables.CustomObservable), [CustomMarking](../api/v20/stix2.v20.common.rst#stix2.v20.common.CustomMarking) and [CustomExtension](../api/v20/stix2.v20.observables.rst#stix2.v20.observables.CustomExtension) must be registered explicitly by STIX version. This is a design decision since properties or requirements may change as the STIX Technical Specification advances.\n", "\n", "You can perform this by:" ] diff --git a/docs/guide/workbench.ipynb b/docs/guide/workbench.ipynb index 328cb88..de243cc 100644 --- a/docs/guide/workbench.ipynb +++ b/docs/guide/workbench.ipynb @@ -624,7 +624,7 @@ "source": [ "### Creating STIX Data\n", "\n", - "To create a STIX object, just use that object's class constructor. Once it's created, add it to the workbench with [save()](../api/datastore/stix2.workbench.rst#stix2.workbench.save)." 
+ "To create a STIX object, just use that object's class constructor. Once it's created, add it to the workbench with [save()](../api/stix2.workbench.rst#stix2.workbench.save)." ] }, { @@ -760,7 +760,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Defaults can also be set for the [created timestamp](../api/datastore/stix2.workbench.rst#stix2.workbench.set_default_created), [external references](../api/datastore/stix2.workbench.rst#stix2.workbench.set_default_external_refs) and [object marking references](../api/datastore/stix2.workbench.rst#stix2.workbench.set_default_object_marking_refs)." + "Defaults can also be set for the [created timestamp](../api/stix2.workbench.rst#stix2.workbench.set_default_created), [external references](../api/stix2.workbench.rst#stix2.workbench.set_default_external_refs) and [object marking references](../api/stix2.workbench.rst#stix2.workbench.set_default_object_marking_refs)." ] }, { diff --git a/requirements.txt b/requirements.txt index ddbda18..2fb7c5d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,10 @@ bumpversion ipython -mkdocs>=1.0.4 -nbsphinx==0.3.2 +nbsphinx==0.4.3 pre-commit pytest pytest-cov -sphinx<1.6 +sphinx<2 sphinx-prompt tox From c42f42e983005d946a79b8bb0a31a2961ab59579 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Mon, 30 Sep 2019 13:55:07 -0400 Subject: [PATCH 19/23] Update README.rst add documentation badge --- README.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 256e4d6..0613a15 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -|Build_Status| |Coverage| |Version| |Downloads_Badge| +|Build_Status| |Coverage| |Version| |Downloads_Badge| |Documentation_Status| cti-python-stix2 ================ @@ -170,3 +170,6 @@ to repository-cla@oasis-open.org. .. |Downloads_Badge| image:: https://img.shields.io/pypi/dm/stix2.svg?maxAge=3600 :target: https://pypi.python.org/pypi/stix2/ :alt: Downloads +.. 
|Documentation_Status| image:: https://readthedocs.org/projects/stix2/badge/?version=latest + :target: https://stix2.readthedocs.io/en/latest/?badge=latest + :alt: Documentation Status From 39e1ddbbf67aaef7c998678ec7df3748777efa35 Mon Sep 17 00:00:00 2001 From: Chris Lenk Date: Mon, 14 Oct 2019 14:31:44 -0400 Subject: [PATCH 20/23] Update semantic equivalence docs --- docs/guide/equivalence.ipynb | 110 +++++++++++++++++++++++------------ 1 file changed, 74 insertions(+), 36 deletions(-) diff --git a/docs/guide/equivalence.ipynb b/docs/guide/equivalence.ipynb index 9b49c2f..d73b417 100644 --- a/docs/guide/equivalence.ipynb +++ b/docs/guide/equivalence.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 14, + "execution_count": 1, "metadata": { "nbsphinx": "hidden" }, @@ -22,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 2, "metadata": { "nbsphinx": "hidden" }, @@ -58,7 +58,7 @@ "source": [ "## Checking Semantic Equivalence\n", "\n", - "The [Environment](../api/stix2.environment.rst#stix2.environment.Environment) has a function for checking if two STIX Objects are semantically equivalent. For each supported objct type, the algorithm checks if the values for a specific set of properties match. Then each matching property is weighted since every property doesn't represent the same level of importance for semantic equivalence. The result will be the sum of these weighted values, in the range of 0 to 100. A result of 0 means that the the two objects are not equivalent, and a result of 100 means that they are equivalent.\n", + "The [Environment](../api/stix2.environment.rst#stix2.environment.Environment) has a function for checking if two STIX Objects are semantically equivalent. For each supported object type, the algorithm checks if the values for a specific set of properties match. 
Then each matching property is weighted since every property doesn't represent the same level of importance for semantic equivalence. The result will be the sum of these weighted values, in the range of 0 to 100. A result of 0 means that the two objects are not equivalent, and a result of 100 means that they are equivalent.\n", "\n", "TODO: Add a link to the committee note when it is released.\n", "\n", @@ -71,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -152,7 +152,7 @@ "" ] }, - "execution_count": 16, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -191,12 +191,12 @@ "source": [ "### Campaign Example\n", "\n", - "For Campaigns, the only properties that contribute to semantic equivalence are `name` and `aliases`, with weights of 60 and 40, respectively. In this example, the two campaigns have completely different names, but slightly similar descriptions." + "For Campaigns, the only properties that contribute to semantic equivalence are `name` and `aliases`, with weights of 60 and 40, respectively. In this example, the two campaigns have completely different names, but slightly similar descriptions. The result may be higher than expected because the Jaro-Winkler algorithm used to compare string properties looks at the edit distance of the two strings rather than just the words in them." ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -270,14 +270,14 @@ ".highlight .vg { color: #19177C } /* Name.Variable.Global */\n", ".highlight .vi { color: #19177C } /* Name.Variable.Instance */\n", ".highlight .vm { color: #19177C } /* Name.Variable.Magic */\n", - ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */
50.0\n",
+       ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */
44.0\n",
        "
\n" ], "text/plain": [ "" ] }, - "execution_count": 17, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -286,12 +286,10 @@ "from stix2.v21 import Campaign\n", "\n", "c1 = Campaign(\n", - " name=\"Someone Attacks Somebody\",\n", - " description=\"A campaign targeting....\",)\n", + " name=\"there\",)\n", "\n", "c2 = Campaign(\n", - " name=\"Another Campaign\",\n", - " description=\"A campaign that targets....\",)\n", + " name=\"something\",)\n", "print(env.semantically_equivalent(c1, c2))" ] }, @@ -306,7 +304,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -387,7 +385,7 @@ "" ] }, - "execution_count": 18, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -419,8 +417,10 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": {}, + "execution_count": 6, + "metadata": { + "scrolled": true + }, "outputs": [ { "name": "stderr", @@ -507,7 +507,7 @@ "" ] }, - "execution_count": 19, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -530,6 +530,13 @@ "print(env.semantically_equivalent(ind1, ind2))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If the patterns were identical the result would have been 100." 
+ ] + }, { "cell_type": "markdown", "metadata": {}, @@ -541,7 +548,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 7, "metadata": { "scrolled": true }, @@ -624,7 +631,7 @@ "" ] }, - "execution_count": 20, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -654,7 +661,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 8, "metadata": { "scrolled": true }, @@ -737,7 +744,7 @@ "" ] }, - "execution_count": 21, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -771,7 +778,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 9, "metadata": { "scrolled": true }, @@ -854,7 +861,7 @@ "" ] }, - "execution_count": 22, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -888,7 +895,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 10, "metadata": { "scrolled": true }, @@ -971,7 +978,7 @@ "" ] }, - "execution_count": 23, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -1002,7 +1009,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -1083,7 +1090,7 @@ "" ] }, - "execution_count": 24, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -1117,7 +1124,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -1137,12 +1144,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Some object types do not have a defined method for calculating semantic equivalence and by default will raise an error." + "Some object types do not have a defined method for calculating semantic equivalence and by default will raise an [error](../api/stix2.exceptions.rst#stix2.exceptions.SemanticEquivalenceUnsupportedTypeError)." 
] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -1176,12 +1183,43 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "By default, comparing objects of different spec versions will result in an error. You can optionally allow this by providing a configuration dictionary like in the next example:" + "By default, comparing objects of different spec versions will result in a `ValueError`." ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "The objects to compare must be of the same spec version!", + "output_type": "error", + "traceback": [ + "\u001b[0;31mValueError\u001b[0m\u001b[0;31m:\u001b[0m The objects to compare must be of the same spec version!\n" + ] + } + ], + "source": [ + "from stix2.v20 import Identity as Identity20\n", + "\n", + "id20 = Identity20(\n", + " name=\"John Smith\",\n", + " identity_class=\"individual\",\n", + ")\n", + "print(env.semantically_equivalent(id2, id20))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can optionally allow comparing across spec versions by providing a configuration dictionary like in the next example:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -1262,7 +1300,7 @@ "" ] }, - "execution_count": 27, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1286,7 +1324,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -1367,7 +1405,7 @@ "" ] }, - "execution_count": 28, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } From 024e023967a587862799947dee0698df37389ad7 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Tue, 15 Oct 2019 12:54:41 -0400 Subject: [PATCH 21/23] update semantic equivalence approach to: - add more detailed output via the logging 
module - don't fail hard if an object sent to the semantically_equivalent() method - remove specific exception related to Semantic Equivalence and tests --- stix2/environment.py | 186 ++++++++++++++++++----------- stix2/exceptions.py | 7 -- stix2/test/v21/test_environment.py | 43 +------ 3 files changed, 124 insertions(+), 112 deletions(-) diff --git a/stix2/environment.py b/stix2/environment.py index d2c6d3a..34e0a04 100644 --- a/stix2/environment.py +++ b/stix2/environment.py @@ -6,7 +6,6 @@ import time from .core import parse as _parse from .datastore import CompositeDataSource, DataStoreMixin -from .exceptions import SemanticEquivalenceUnsupportedTypeError from .utils import STIXdatetime, parse_into_datetime logger = logging.getLogger(__name__) @@ -228,9 +227,6 @@ class Environment(DataStoreMixin): "aliases": 40, "method": _campaign_checks, }, - "course-of-action": { - "method": _course_of_action_checks, - }, "identity": { "name": 60, "identity_class": 20, @@ -244,9 +240,6 @@ class Environment(DataStoreMixin): "tdelta": 1, # One day interval "method": _indicator_checks, }, - "intrusion-set": { - "method": _intrusion_set_checks, - }, "location": { "longitude_latitude": 34, "region": 33, @@ -259,12 +252,6 @@ class Environment(DataStoreMixin): "name": 80, "method": _malware_checks, }, - "observed-data": { - "method": _observed_data_checks, - }, - "report": { - "method": _report_checks, - }, "threat-actor": { "name": 60, "threat_actor_types": 20, @@ -298,8 +285,14 @@ class Environment(DataStoreMixin): if ignore_spec_version is False and obj1.get("spec_version", "2.0") != obj2.get("spec_version", "2.0"): raise ValueError('The objects to compare must be of the same spec version!') - method = weights[type1]["method"] - matching_score, sum_weights = method(obj1, obj2, **weights[type1]) + try: + method = weights[type1]["method"] + except KeyError: + logger.warning("'%s' type has no semantic equivalence method to call!", type1) + sum_weights = matching_score = 0 + else: 
+ logger.debug("Starting semantic equivalence process between: '%s' and '%s'", obj1["id"], obj2["id"]) + matching_score, sum_weights = method(obj1, obj2, **weights[type1]) if sum_weights <= 0: return 0 @@ -333,7 +326,9 @@ def partial_timestamp_based(t1, t2, tdelta): if not isinstance(t2, STIXdatetime): t2 = parse_into_datetime(t2) t1, t2 = time.mktime(t1.timetuple()), time.mktime(t2.timetuple()) - return 1 - min(abs(t1 - t2) / (86400 * tdelta), 1) + result = 1 - min(abs(t1 - t2) / (86400 * tdelta), 1) + logger.debug("--\t\tpartial_timestamp_based '%s' '%s' tdelta: '%s'\tresult: '%s'", t1, t2, tdelta, result) + return result def partial_list_based(l1, l2): @@ -348,7 +343,9 @@ def partial_list_based(l1, l2): """ l1_set, l2_set = set(l1), set(l2) - return len(l1_set.intersection(l2_set)) / max(len(l1), len(l2)) + result = len(l1_set.intersection(l2_set)) / max(len(l1), len(l2)) + logger.debug("--\t\tpartial_list_based '%s' '%s'\tresult: '%s'", l1, l2, result) + return result def exact_match(val1, val2): @@ -362,9 +359,11 @@ def exact_match(val1, val2): float: 1.0 if the value matches exactly, 0.0 otherwise. """ + result = 0.0 if val1 == val2: - return 1.0 - return 0.0 + result = 1.0 + logger.debug("--\t\texact_match '%s' '%s'\tresult: '%s'", val1, val2, result) + return result def partial_string_based(str1, str2): @@ -379,7 +378,9 @@ def partial_string_based(str1, str2): """ from pyjarowinkler import distance - return distance.get_jaro_distance(str1, str2) + result = distance.get_jaro_distance(str1, str2) + logger.debug("--\t\tpartial_string_based '%s' '%s'\tresult: '%s'", str1, str2, result) + return result def custom_pattern_based(pattern1, pattern2): @@ -440,14 +441,24 @@ def partial_external_reference_based(refs1, refs2): # external_id or url match then its a perfect match and other entries # can be ignored. 
if sn_match and (ei_match or url_match) and source_name in allowed: - return 1.0 + result = 1.0 + logger.debug( + "--\t\tpartial_external_reference_based '%s' '%s'\tresult: '%s'", + refs1, refs2, result, + ) + return result # Regular check. If the source_name (not STIX-defined) or external_id or # url matches then we consider the entry a match. if (sn_match or ei_match or url_match) and source_name not in allowed: matches += 1 - return matches / max(len(refs1), len(refs2)) + result = matches / max(len(refs1), len(refs2)) + logger.debug( + "--\t\tpartial_external_reference_based '%s' '%s'\tresult: '%s'", + refs1, refs2, result, + ) + return result def partial_location_distance(lat1, long1, lat2, long2, threshold): @@ -466,7 +477,12 @@ def partial_location_distance(lat1, long1, lat2, long2, threshold): """ from haversine import haversine, Unit distance = haversine((lat1, long1), (lat2, long2), unit=Unit.KILOMETERS) - return 1 - (distance / threshold) + result = 1 - (distance / threshold) + logger.debug( + "--\t\tpartial_location_distance '%s' '%s' threshold: '%s'\tresult: '%s'", + (lat1, long1), (lat2, long2), threshold, result, + ) + return result def _attack_pattern_checks(obj1, obj2, **weights): @@ -474,15 +490,19 @@ def _attack_pattern_checks(obj1, obj2, **weights): sum_weights = 0.0 if check_property_present("name", obj1, obj2): w = weights["name"] + contributing_score = w * partial_string_based(obj1["name"], obj2["name"]) sum_weights += w - matching_score += w * partial_string_based(obj1["name"], obj2["name"]) + matching_score += contributing_score + logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("external_references", obj1, obj2): w = weights["external_references"] - sum_weights += w - matching_score += ( - w * - partial_external_reference_based(obj1["external_references"], obj2["external_references"]) + contributing_score = ( + w * 
partial_external_reference_based(obj1["external_references"], obj2["external_references"]) ) + sum_weights += w + matching_score += contributing_score + logger.debug("'external_references' check -- weight: %s, contributing score: %s", w, contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) return matching_score, sum_weights @@ -491,12 +511,17 @@ def _campaign_checks(obj1, obj2, **weights): sum_weights = 0.0 if check_property_present("name", obj1, obj2): w = weights["name"] + contributing_score = w * partial_string_based(obj1["name"], obj2["name"]) sum_weights += w - matching_score += w * partial_string_based(obj1["name"], obj2["name"]) + matching_score += contributing_score + logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("aliases", obj1, obj2): w = weights["aliases"] + contributing_score = w * partial_list_based(obj1["aliases"], obj2["aliases"]) sum_weights += w - matching_score += w * partial_list_based(obj1["aliases"], obj2["aliases"]) + matching_score += contributing_score + logger.debug("'aliases' check -- weight: %s, contributing score: %s", w, contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) return matching_score, sum_weights @@ -505,16 +530,23 @@ def _identity_checks(obj1, obj2, **weights): sum_weights = 0.0 if check_property_present("name", obj1, obj2): w = weights["name"] + contributing_score = w * exact_match(obj1["name"], obj2["name"]) sum_weights += w - matching_score += w * exact_match(obj1["name"], obj2["name"]) + matching_score += contributing_score + logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("identity_class", obj1, obj2): w = weights["identity_class"] + contributing_score = w * exact_match(obj1["identity_class"], obj2["identity_class"]) sum_weights += w - matching_score += w * 
exact_match(obj1["identity_class"], obj2["identity_class"]) + matching_score += contributing_score + logger.debug("'identity_class' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("sectors", obj1, obj2): w = weights["sectors"] + contributing_score = w * partial_list_based(obj1["sectors"], obj2["sectors"]) sum_weights += w - matching_score += w * partial_list_based(obj1["sectors"], obj2["sectors"]) + matching_score += contributing_score + logger.debug("'sectors' check -- weight: %s, contributing score: %s", w, contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) return matching_score, sum_weights @@ -523,19 +555,26 @@ def _indicator_checks(obj1, obj2, **weights): sum_weights = 0.0 if check_property_present("indicator_types", obj1, obj2): w = weights["indicator_types"] + contributing_score = w * partial_list_based(obj1["indicator_types"], obj2["indicator_types"]) sum_weights += w - matching_score += w * partial_list_based(obj1["indicator_types"], obj2["indicator_types"]) + matching_score += contributing_score + logger.debug("'indicator_types' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("pattern", obj1, obj2): w = weights["pattern"] + contributing_score = w * custom_pattern_based(obj1["pattern"], obj2["pattern"]) sum_weights += w - matching_score += w * custom_pattern_based(obj1["pattern"], obj2["pattern"]) + matching_score += contributing_score + logger.debug("'pattern' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("valid_from", obj1, obj2): w = weights["valid_from"] - sum_weights += w - matching_score += ( + contributing_score = ( w * partial_timestamp_based(obj1["valid_from"], obj2["valid_from"], weights["tdelta"]) ) + sum_weights += w + matching_score += contributing_score + logger.debug("'valid_from' check -- weight: %s, contributing score: %s", w, 
contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) return matching_score, sum_weights @@ -544,19 +583,26 @@ def _location_checks(obj1, obj2, **weights): sum_weights = 0.0 if check_property_present("latitude", obj1, obj2) and check_property_present("longitude", obj1, obj2): w = weights["longitude_latitude"] - sum_weights += w - matching_score += ( + contributing_score = ( w * partial_location_distance(obj1["latitude"], obj1["longitude"], obj2["latitude"], obj2["longitude"], weights["threshold"]) ) + sum_weights += w + matching_score += contributing_score + logger.debug("'longitude_latitude' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("region", obj1, obj2): w = weights["region"] + contributing_score = w * exact_match(obj1["region"], obj2["region"]) sum_weights += w - matching_score += w * exact_match(obj1["region"], obj2["region"]) + matching_score += contributing_score + logger.debug("'region' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("country", obj1, obj2): w = weights["country"] + contributing_score = w * exact_match(obj1["country"], obj2["country"]) sum_weights += w - matching_score += w * exact_match(obj1["country"], obj2["country"]) + matching_score += contributing_score + logger.debug("'country' check -- weight: %s, contributing score: %s", w, contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) return matching_score, sum_weights @@ -565,12 +611,17 @@ def _malware_checks(obj1, obj2, **weights): sum_weights = 0.0 if check_property_present("malware_types", obj1, obj2): w = weights["malware_types"] + contributing_score = w * partial_list_based(obj1["malware_types"], obj2["malware_types"]) sum_weights += w - matching_score += w * partial_list_based(obj1["malware_types"], obj2["malware_types"]) + matching_score += contributing_score + 
logger.debug("'malware_types' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("name", obj1, obj2): w = weights["name"] + contributing_score = w * partial_string_based(obj1["name"], obj2["name"]) sum_weights += w - matching_score += w * partial_string_based(obj1["name"], obj2["name"]) + matching_score += contributing_score + logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) return matching_score, sum_weights @@ -579,16 +630,23 @@ def _threat_actor_checks(obj1, obj2, **weights): sum_weights = 0.0 if check_property_present("name", obj1, obj2): w = weights["name"] + contributing_score = w * partial_string_based(obj1["name"], obj2["name"]) sum_weights += w - matching_score += w * partial_string_based(obj1["name"], obj2["name"]) + matching_score += contributing_score + logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("threat_actor_types", obj1, obj2): w = weights["threat_actor_types"] + contributing_score = w * partial_list_based(obj1["threat_actor_types"], obj2["threat_actor_types"]) sum_weights += w - matching_score += w * partial_list_based(obj1["threat_actor_types"], obj2["threat_actor_types"]) + matching_score += contributing_score + logger.debug("'threat_actor_types' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("aliases", obj1, obj2): w = weights["aliases"] + contributing_score = w * partial_list_based(obj1["aliases"], obj2["aliases"]) sum_weights += w - matching_score += w * partial_list_based(obj1["aliases"], obj2["aliases"]) + matching_score += contributing_score + logger.debug("'aliases' check -- weight: %s, contributing score: %s", w, contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) return matching_score, sum_weights @@ 
-597,12 +655,17 @@ def _tool_checks(obj1, obj2, **weights): sum_weights = 0.0 if check_property_present("tool_types", obj1, obj2): w = weights["tool_types"] + contributing_score = w * partial_list_based(obj1["tool_types"], obj2["tool_types"]) sum_weights += w - matching_score += w * partial_list_based(obj1["tool_types"], obj2["tool_types"]) + matching_score += contributing_score + logger.debug("'tool_types' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("name", obj1, obj2): w = weights["name"] + contributing_score = w * partial_string_based(obj1["name"], obj2["name"]) sum_weights += w - matching_score += w * partial_string_based(obj1["name"], obj2["name"]) + matching_score += contributing_score + logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) return matching_score, sum_weights @@ -611,29 +674,18 @@ def _vulnerability_checks(obj1, obj2, **weights): sum_weights = 0.0 if check_property_present("name", obj1, obj2): w = weights["name"] + contributing_score = w * partial_string_based(obj1["name"], obj2["name"]) sum_weights += w - matching_score += w * partial_string_based(obj1["name"], obj2["name"]) + matching_score += contributing_score + logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("external_references", obj1, obj2): w = weights["external_references"] - sum_weights += w - matching_score += w * partial_external_reference_based( + contributing_score = w * partial_external_reference_based( obj1["external_references"], obj2["external_references"], ) + sum_weights += w + matching_score += contributing_score + logger.debug("'external_references' check -- weight: %s, contributing score: %s", w, contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) return matching_score, sum_weights 
- - -def _course_of_action_checks(obj1, obj2, **weights): - raise SemanticEquivalenceUnsupportedTypeError("course-of-action type has no semantic equivalence implementation!") - - -def _intrusion_set_checks(obj1, obj2, **weights): - raise SemanticEquivalenceUnsupportedTypeError("intrusion-set type has no semantic equivalence implementation!") - - -def _observed_data_checks(obj1, obj2, **weights): - raise SemanticEquivalenceUnsupportedTypeError("observed-data type has no semantic equivalence implementation!") - - -def _report_checks(obj1, obj2, **weights): - raise SemanticEquivalenceUnsupportedTypeError("report type has no semantic equivalence implementation!") diff --git a/stix2/exceptions.py b/stix2/exceptions.py index 6405c2e..d2ec3fc 100644 --- a/stix2/exceptions.py +++ b/stix2/exceptions.py @@ -233,10 +233,3 @@ class STIXDeprecationWarning(DeprecationWarning): Represents usage of a deprecated component of a STIX specification. """ pass - - -class SemanticEquivalenceUnsupportedTypeError(STIXError, TypeError): - """STIX object type not supported by the semantic equivalence approach.""" - - def __init__(self, msg): - super(SemanticEquivalenceUnsupportedTypeError, self).__init__(msg) diff --git a/stix2/test/v21/test_environment.py b/stix2/test/v21/test_environment.py index 62b0c53..a049b25 100644 --- a/stix2/test/v21/test_environment.py +++ b/stix2/test/v21/test_environment.py @@ -6,12 +6,10 @@ import stix2.exceptions from .constants import ( ATTACK_PATTERN_ID, ATTACK_PATTERN_KWARGS, CAMPAIGN_ID, CAMPAIGN_KWARGS, - COURSE_OF_ACTION_ID, COURSE_OF_ACTION_KWARGS, FAKE_TIME, IDENTITY_ID, - IDENTITY_KWARGS, INDICATOR_ID, INDICATOR_KWARGS, INTRUSION_SET_ID, - INTRUSION_SET_KWARGS, LOCATION_ID, MALWARE_ID, MALWARE_KWARGS, - OBSERVED_DATA_ID, OBSERVED_DATA_KWARGS, RELATIONSHIP_IDS, REPORT_ID, - REPORT_KWARGS, THREAT_ACTOR_ID, THREAT_ACTOR_KWARGS, TOOL_ID, TOOL_KWARGS, - VULNERABILITY_ID, VULNERABILITY_KWARGS, + FAKE_TIME, IDENTITY_ID, IDENTITY_KWARGS, INDICATOR_ID, 
INDICATOR_KWARGS, + LOCATION_ID, MALWARE_ID, MALWARE_KWARGS, RELATIONSHIP_IDS, THREAT_ACTOR_ID, + THREAT_ACTOR_KWARGS, TOOL_ID, TOOL_KWARGS, VULNERABILITY_ID, + VULNERABILITY_KWARGS, ) @@ -615,37 +613,6 @@ def test_semantic_equivalence_different_spec_version_raises(): assert str(excinfo.value) == "The objects to compare must be of the same spec version!" -@pytest.mark.parametrize( - "obj1,obj2,ret_val", - [ - ( - stix2.v21.CourseOfAction(id=COURSE_OF_ACTION_ID, **COURSE_OF_ACTION_KWARGS), - stix2.v21.CourseOfAction(id=COURSE_OF_ACTION_ID, **COURSE_OF_ACTION_KWARGS), - "course-of-action type has no semantic equivalence implementation!", - ), - ( - stix2.v21.IntrusionSet(id=INTRUSION_SET_ID, **INTRUSION_SET_KWARGS), - stix2.v21.IntrusionSet(id=INTRUSION_SET_ID, **INTRUSION_SET_KWARGS), - "intrusion-set type has no semantic equivalence implementation!", - ), - ( - stix2.v21.ObservedData(id=OBSERVED_DATA_ID, **OBSERVED_DATA_KWARGS), - stix2.v21.ObservedData(id=OBSERVED_DATA_ID, **OBSERVED_DATA_KWARGS), - "observed-data type has no semantic equivalence implementation!", - ), - ( - stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS), - stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS), - "report type has no semantic equivalence implementation!", - ), - ], -) -def test_semantic_equivalence_on_unsupported_types(obj1, obj2, ret_val): - with pytest.raises(stix2.exceptions.SemanticEquivalenceUnsupportedTypeError) as excinfo: - stix2.Environment().semantically_equivalent(obj1, obj2) - assert ret_val == str(excinfo.value) - - def test_semantic_equivalence_zero_match(): IND_KWARGS = dict( indicator_types=["APTX"], @@ -767,7 +734,7 @@ def test_semantic_equivalence_external_references(refs1, refs2, ret_val): assert value == ret_val -def test_semantic_equivalence_timetamp(): +def test_semantic_equivalence_timestamp(): t1 = "2018-10-17T00:14:20.652Z" t2 = "2018-10-17T12:14:20.652Z" assert stix2.environment.partial_timestamp_based(t1, t2, 1) == 0.5 From 
13fda69079938b69fbe22a793b6b65b61eed7b41 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Tue, 15 Oct 2019 13:25:11 -0400 Subject: [PATCH 22/23] add test for object not present in configuration --- stix2/test/v21/test_environment.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/stix2/test/v21/test_environment.py b/stix2/test/v21/test_environment.py index a049b25..d057df5 100644 --- a/stix2/test/v21/test_environment.py +++ b/stix2/test/v21/test_environment.py @@ -7,9 +7,9 @@ import stix2.exceptions from .constants import ( ATTACK_PATTERN_ID, ATTACK_PATTERN_KWARGS, CAMPAIGN_ID, CAMPAIGN_KWARGS, FAKE_TIME, IDENTITY_ID, IDENTITY_KWARGS, INDICATOR_ID, INDICATOR_KWARGS, - LOCATION_ID, MALWARE_ID, MALWARE_KWARGS, RELATIONSHIP_IDS, THREAT_ACTOR_ID, - THREAT_ACTOR_KWARGS, TOOL_ID, TOOL_KWARGS, VULNERABILITY_ID, - VULNERABILITY_KWARGS, + LOCATION_ID, MALWARE_ID, MALWARE_KWARGS, RELATIONSHIP_IDS, REPORT_ID, + REPORT_KWARGS, THREAT_ACTOR_ID, THREAT_ACTOR_KWARGS, TOOL_ID, TOOL_KWARGS, + VULNERABILITY_ID, VULNERABILITY_KWARGS, ) @@ -744,3 +744,9 @@ def test_semantic_equivalence_exact_match(): t1 = "2018-10-17T00:14:20.652Z" t2 = "2018-10-17T12:14:20.652Z" assert stix2.environment.exact_match(t1, t2) == 0.0 + + +def test_non_existent_config_for_object(): + r1 = stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS) + r2 = stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS) + assert stix2.Environment().semantically_equivalent(r1, r2) == 0.0 From b5612c9dc2ca842603a3a1ffd3ce965228704fdc Mon Sep 17 00:00:00 2001 From: Chris Lenk Date: Wed, 16 Oct 2019 09:08:03 -0400 Subject: [PATCH 23/23] Update semantic equivalence docs - Comparing object type not in config dictionary now gives a warning and result of 0 instead of an error. - Adds an example of the new detailed debug output. 
--- docs/guide/equivalence.ipynb | 229 +++++++++++++++++++++++++++++++++-- 1 file changed, 220 insertions(+), 9 deletions(-) diff --git a/docs/guide/equivalence.ipynb b/docs/guide/equivalence.ipynb index d73b417..5db3464 100644 --- a/docs/guide/equivalence.ipynb +++ b/docs/guide/equivalence.ipynb @@ -270,7 +270,7 @@ ".highlight .vg { color: #19177C } /* Name.Variable.Global */\n", ".highlight .vi { color: #19177C } /* Name.Variable.Instance */\n", ".highlight .vm { color: #19177C } /* Name.Variable.Magic */\n", - ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */
44.0\n",
+       ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */
50.0\n",
        "
\n" ], "text/plain": [ @@ -286,10 +286,10 @@ "from stix2.v21 import Campaign\n", "\n", "c1 = Campaign(\n", - " name=\"there\",)\n", + " name=\"Someone Attacks Somebody\",)\n", "\n", "c2 = Campaign(\n", - " name=\"something\",)\n", + " name=\"Another Campaign\",)\n", "print(env.semantically_equivalent(c1, c2))" ] }, @@ -1144,7 +1144,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Some object types do not have a defined method for calculating semantic equivalence and by default will raise an [error](../api/stix2.exceptions.rst#stix2.exceptions.SemanticEquivalenceUnsupportedTypeError)." + "Some object types do not have a defined method for calculating semantic equivalence and by default will give a warning and a result of zero." ] }, { @@ -1153,12 +1153,93 @@ "metadata": {}, "outputs": [ { - "ename": "SemanticEquivalenceUnsupportedTypeError", - "evalue": "report type has no semantic equivalence implementation!", - "output_type": "error", - "traceback": [ - "\u001b[0;31mSemanticEquivalenceUnsupportedTypeError\u001b[0m\u001b[0;31m:\u001b[0m report type has no semantic equivalence implementation!\n" + "name": "stderr", + "output_type": "stream", + "text": [ + "'report' type has no semantic equivalence method to call!\n" ] + }, + { + "data": { + "text/html": [ + "
0\n",
+       "
\n" ], "text/plain": [ "" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ @@ -1448,6 +1529,136 @@ "}\n", "print(env.semantically_equivalent(foo1, foo2, **weights))" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Detailed Results\n", + "\n", + "If your logging level is set to `DEBUG` or lower, the function will log more detailed results. These show the semantic equivalence and weighting for each property that is checked, to show how the final result was arrived at." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Starting semantic equivalence process between: 'threat-actor--54dc2aac-6fde-4a68-ae2a-0c0bc575ed70' and 'threat-actor--c51bce3b-a067-4692-ab77-fcdefdd3f157'\n", + "--\t\tpartial_string_based 'Evil Org' 'James Bond'\tresult: '0.56'\n", + "'name' check -- weight: 60, contributing score: 33.6\n", + "--\t\tpartial_list_based '['crime-syndicate']' '['spy']'\tresult: '0.0'\n", + "'threat_actor_types' check -- weight: 20, contributing score: 0.0\n", + "--\t\tpartial_list_based '['super-evil']' '['007']'\tresult: '0.0'\n", + "'aliases' check -- weight: 20, contributing score: 0.0\n", + "Matching Score: 33.6, Sum of Weights: 100.0\n" + ] + }, + { + "data": { + "text/html": [ + "
33.6\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import logging\n", + "logging.basicConfig(format='%(message)s')\n", + "logger = logging.getLogger()\n", + "logger.setLevel(logging.DEBUG)\n", + "\n", + "ta3 = ThreatActor(\n", + " threat_actor_types=[\"crime-syndicate\"],\n", + " name=\"Evil Org\",\n", + " aliases=[\"super-evil\"],\n", + ")\n", + "ta4 = ThreatActor(\n", + " threat_actor_types=[\"spy\"],\n", + " name=\"James Bond\",\n", + " aliases=[\"007\"],\n", + ")\n", + "print(env.semantically_equivalent(ta3, ta4))" + ] } ], "metadata": {