From 216b43d49ef6eb8874b26dba53ab638fd45aa901 Mon Sep 17 00:00:00 2001 From: Michael Chisholm Date: Fri, 11 Oct 2019 17:12:44 -0400 Subject: [PATCH 1/8] Fix deterministic UUID handling when there are high-codepoint unicode characters. Make compatible with both python 2 and 3. --- stix2/base.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/stix2/base.py b/stix2/base.py index a13cb98..b2e20de 100644 --- a/stix2/base.py +++ b/stix2/base.py @@ -394,11 +394,14 @@ class _Observable(_STIXBase): if streamlined_obj_vals: data = canonicalize(streamlined_obj_vals, utf8=False) - # try/except here to enable python 2 compatibility - try: + # The situation is complicated w.r.t. python 2/3 behavior, so + # I'd rather not rely on particular exceptions being raised to + # determine what to do. Better to just check the python version + # directly. + if six.PY3: return required_prefix + six.text_type(uuid.uuid5(SCO_DET_ID_NAMESPACE, data)) - except UnicodeDecodeError: - return required_prefix + six.text_type(uuid.uuid5(SCO_DET_ID_NAMESPACE, six.binary_type(data))) + else: + return required_prefix + six.text_type(uuid.uuid5(SCO_DET_ID_NAMESPACE, data.encode("utf-8"))) # We return None if there are no values specified for any of the id-contributing-properties return None From edf465bd80b20f151064ac39ba4d0c1cd9643e1d Mon Sep 17 00:00:00 2001 From: Michael Chisholm Date: Fri, 11 Oct 2019 18:15:47 -0400 Subject: [PATCH 2/8] Add a unit test for deterministic ID, with unicode --- stix2/test/v21/test_base.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/stix2/test/v21/test_base.py b/stix2/test/v21/test_base.py index 18d3a50..d753ab1 100644 --- a/stix2/test/v21/test_base.py +++ b/stix2/test/v21/test_base.py @@ -1,9 +1,11 @@ import datetime as dt import json +import uuid import pytest import pytz +import stix2 from stix2.base import STIXJSONEncoder @@ -23,3 +25,14 @@ def test_encode_json_object(): json.dumps(test_dict, cls=STIXJSONEncoder) 
assert " is not JSON serializable" in str(excinfo.value) + + +def test_deterministic_id_unicode(): + mutex = {'name': u'D*Fl#Ed*\u00a3\u00a8', 'type': 'mutex'} + obs = stix2.parse_observable(mutex, version="2.1") + + dd_idx = obs.id.index("--") + id_uuid = uuid.UUID(obs.id[dd_idx+2:]) + + assert id_uuid.variant == uuid.RFC_4122 + assert id_uuid.version == 5 From 39e1ddbbf67aaef7c998678ec7df3748777efa35 Mon Sep 17 00:00:00 2001 From: Chris Lenk Date: Mon, 14 Oct 2019 14:31:44 -0400 Subject: [PATCH 3/8] Update semantic equivalence docs --- docs/guide/equivalence.ipynb | 110 +++++++++++++++++++++++------------ 1 file changed, 74 insertions(+), 36 deletions(-) diff --git a/docs/guide/equivalence.ipynb b/docs/guide/equivalence.ipynb index 9b49c2f..d73b417 100644 --- a/docs/guide/equivalence.ipynb +++ b/docs/guide/equivalence.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 14, + "execution_count": 1, "metadata": { "nbsphinx": "hidden" }, @@ -22,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 2, "metadata": { "nbsphinx": "hidden" }, @@ -58,7 +58,7 @@ "source": [ "## Checking Semantic Equivalence\n", "\n", - "The [Environment](../api/stix2.environment.rst#stix2.environment.Environment) has a function for checking if two STIX Objects are semantically equivalent. For each supported objct type, the algorithm checks if the values for a specific set of properties match. Then each matching property is weighted since every property doesn't represent the same level of importance for semantic equivalence. The result will be the sum of these weighted values, in the range of 0 to 100. A result of 0 means that the the two objects are not equivalent, and a result of 100 means that they are equivalent.\n", + "The [Environment](../api/stix2.environment.rst#stix2.environment.Environment) has a function for checking if two STIX Objects are semantically equivalent. 
For each supported object type, the algorithm checks if the values for a specific set of properties match. Then each matching property is weighted since every property doesn't represent the same level of importance for semantic equivalence. The result will be the sum of these weighted values, in the range of 0 to 100. A result of 0 means that the the two objects are not equivalent, and a result of 100 means that they are equivalent.\n", "\n", "TODO: Add a link to the committee note when it is released.\n", "\n", @@ -71,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -152,7 +152,7 @@ "" ] }, - "execution_count": 16, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -191,12 +191,12 @@ "source": [ "### Campaign Example\n", "\n", - "For Campaigns, the only properties that contribute to semantic equivalence are `name` and `aliases`, with weights of 60 and 40, respectively. In this example, the two campaigns have completely different names, but slightly similar descriptions." + "For Campaigns, the only properties that contribute to semantic equivalence are `name` and `aliases`, with weights of 60 and 40, respectively. In this example, the two campaigns have completely different names, but slightly similar descriptions. The result may be higher than expected because the Jaro-Winkler algorithm used to compare string properties looks at the edit distance of the two strings rather than just the words in them." ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -270,14 +270,14 @@ ".highlight .vg { color: #19177C } /* Name.Variable.Global */\n", ".highlight .vi { color: #19177C } /* Name.Variable.Instance */\n", ".highlight .vm { color: #19177C } /* Name.Variable.Magic */\n", - ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */
50.0\n",
+       ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */
44.0\n",
        "
\n" ], "text/plain": [ "" ] }, - "execution_count": 17, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -286,12 +286,10 @@ "from stix2.v21 import Campaign\n", "\n", "c1 = Campaign(\n", - " name=\"Someone Attacks Somebody\",\n", - " description=\"A campaign targeting....\",)\n", + " name=\"there\",)\n", "\n", "c2 = Campaign(\n", - " name=\"Another Campaign\",\n", - " description=\"A campaign that targets....\",)\n", + " name=\"something\",)\n", "print(env.semantically_equivalent(c1, c2))" ] }, @@ -306,7 +304,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -387,7 +385,7 @@ "" ] }, - "execution_count": 18, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -419,8 +417,10 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": {}, + "execution_count": 6, + "metadata": { + "scrolled": true + }, "outputs": [ { "name": "stderr", @@ -507,7 +507,7 @@ "" ] }, - "execution_count": 19, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -530,6 +530,13 @@ "print(env.semantically_equivalent(ind1, ind2))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If the patterns were identical the result would have been 100." 
+ ] + }, { "cell_type": "markdown", "metadata": {}, @@ -541,7 +548,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 7, "metadata": { "scrolled": true }, @@ -624,7 +631,7 @@ "" ] }, - "execution_count": 20, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -654,7 +661,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 8, "metadata": { "scrolled": true }, @@ -737,7 +744,7 @@ "" ] }, - "execution_count": 21, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -771,7 +778,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 9, "metadata": { "scrolled": true }, @@ -854,7 +861,7 @@ "" ] }, - "execution_count": 22, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -888,7 +895,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 10, "metadata": { "scrolled": true }, @@ -971,7 +978,7 @@ "" ] }, - "execution_count": 23, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -1002,7 +1009,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -1083,7 +1090,7 @@ "" ] }, - "execution_count": 24, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -1117,7 +1124,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -1137,12 +1144,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Some object types do not have a defined method for calculating semantic equivalence and by default will raise an error." + "Some object types do not have a defined method for calculating semantic equivalence and by default will raise an [error](../api/stix2.exceptions.rst#stix2.exceptions.SemanticEquivalenceUnsupportedTypeError)." 
] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -1176,12 +1183,43 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "By default, comparing objects of different spec versions will result in an error. You can optionally allow this by providing a configuration dictionary like in the next example:" + "By default, comparing objects of different spec versions will result in a `ValueError`." ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "The objects to compare must be of the same spec version!", + "output_type": "error", + "traceback": [ + "\u001b[0;31mValueError\u001b[0m\u001b[0;31m:\u001b[0m The objects to compare must be of the same spec version!\n" + ] + } + ], + "source": [ + "from stix2.v20 import Identity as Identity20\n", + "\n", + "id20 = Identity20(\n", + " name=\"John Smith\",\n", + " identity_class=\"individual\",\n", + ")\n", + "print(env.semantically_equivalent(id2, id20))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can optionally allow comparing across spec versions by providing a configuration dictionary like in the next example:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -1262,7 +1300,7 @@ "" ] }, - "execution_count": 27, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1286,7 +1324,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -1367,7 +1405,7 @@ "" ] }, - "execution_count": 28, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } From 024e023967a587862799947dee0698df37389ad7 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Tue, 15 Oct 2019 12:54:41 -0400 Subject: [PATCH 4/8] update semantic equivalence approach to: - add more detailed output via the logging 
module - don't fail hard if an object sent to the semantically_equivalent() method - remove specific exception related to Semantic Equivalence and tests --- stix2/environment.py | 186 ++++++++++++++++++----------- stix2/exceptions.py | 7 -- stix2/test/v21/test_environment.py | 43 +------ 3 files changed, 124 insertions(+), 112 deletions(-) diff --git a/stix2/environment.py b/stix2/environment.py index d2c6d3a..34e0a04 100644 --- a/stix2/environment.py +++ b/stix2/environment.py @@ -6,7 +6,6 @@ import time from .core import parse as _parse from .datastore import CompositeDataSource, DataStoreMixin -from .exceptions import SemanticEquivalenceUnsupportedTypeError from .utils import STIXdatetime, parse_into_datetime logger = logging.getLogger(__name__) @@ -228,9 +227,6 @@ class Environment(DataStoreMixin): "aliases": 40, "method": _campaign_checks, }, - "course-of-action": { - "method": _course_of_action_checks, - }, "identity": { "name": 60, "identity_class": 20, @@ -244,9 +240,6 @@ class Environment(DataStoreMixin): "tdelta": 1, # One day interval "method": _indicator_checks, }, - "intrusion-set": { - "method": _intrusion_set_checks, - }, "location": { "longitude_latitude": 34, "region": 33, @@ -259,12 +252,6 @@ class Environment(DataStoreMixin): "name": 80, "method": _malware_checks, }, - "observed-data": { - "method": _observed_data_checks, - }, - "report": { - "method": _report_checks, - }, "threat-actor": { "name": 60, "threat_actor_types": 20, @@ -298,8 +285,14 @@ class Environment(DataStoreMixin): if ignore_spec_version is False and obj1.get("spec_version", "2.0") != obj2.get("spec_version", "2.0"): raise ValueError('The objects to compare must be of the same spec version!') - method = weights[type1]["method"] - matching_score, sum_weights = method(obj1, obj2, **weights[type1]) + try: + method = weights[type1]["method"] + except KeyError: + logger.warning("'%s' type has no semantic equivalence method to call!", type1) + sum_weights = matching_score = 0 + else: 
+ logger.debug("Starting semantic equivalence process between: '%s' and '%s'", obj1["id"], obj2["id"]) + matching_score, sum_weights = method(obj1, obj2, **weights[type1]) if sum_weights <= 0: return 0 @@ -333,7 +326,9 @@ def partial_timestamp_based(t1, t2, tdelta): if not isinstance(t2, STIXdatetime): t2 = parse_into_datetime(t2) t1, t2 = time.mktime(t1.timetuple()), time.mktime(t2.timetuple()) - return 1 - min(abs(t1 - t2) / (86400 * tdelta), 1) + result = 1 - min(abs(t1 - t2) / (86400 * tdelta), 1) + logger.debug("--\t\tpartial_timestamp_based '%s' '%s' tdelta: '%s'\tresult: '%s'", t1, t2, tdelta, result) + return result def partial_list_based(l1, l2): @@ -348,7 +343,9 @@ def partial_list_based(l1, l2): """ l1_set, l2_set = set(l1), set(l2) - return len(l1_set.intersection(l2_set)) / max(len(l1), len(l2)) + result = len(l1_set.intersection(l2_set)) / max(len(l1), len(l2)) + logger.debug("--\t\tpartial_list_based '%s' '%s'\tresult: '%s'", l1, l2, result) + return result def exact_match(val1, val2): @@ -362,9 +359,11 @@ def exact_match(val1, val2): float: 1.0 if the value matches exactly, 0.0 otherwise. """ + result = 0.0 if val1 == val2: - return 1.0 - return 0.0 + result = 1.0 + logger.debug("--\t\texact_match '%s' '%s'\tresult: '%s'", val1, val2, result) + return result def partial_string_based(str1, str2): @@ -379,7 +378,9 @@ def partial_string_based(str1, str2): """ from pyjarowinkler import distance - return distance.get_jaro_distance(str1, str2) + result = distance.get_jaro_distance(str1, str2) + logger.debug("--\t\tpartial_string_based '%s' '%s'\tresult: '%s'", str1, str2, result) + return result def custom_pattern_based(pattern1, pattern2): @@ -440,14 +441,24 @@ def partial_external_reference_based(refs1, refs2): # external_id or url match then its a perfect match and other entries # can be ignored. 
if sn_match and (ei_match or url_match) and source_name in allowed: - return 1.0 + result = 1.0 + logger.debug( + "--\t\tpartial_external_reference_based '%s' '%s'\tresult: '%s'", + refs1, refs2, result, + ) + return result # Regular check. If the source_name (not STIX-defined) or external_id or # url matches then we consider the entry a match. if (sn_match or ei_match or url_match) and source_name not in allowed: matches += 1 - return matches / max(len(refs1), len(refs2)) + result = matches / max(len(refs1), len(refs2)) + logger.debug( + "--\t\tpartial_external_reference_based '%s' '%s'\tresult: '%s'", + refs1, refs2, result, + ) + return result def partial_location_distance(lat1, long1, lat2, long2, threshold): @@ -466,7 +477,12 @@ def partial_location_distance(lat1, long1, lat2, long2, threshold): """ from haversine import haversine, Unit distance = haversine((lat1, long1), (lat2, long2), unit=Unit.KILOMETERS) - return 1 - (distance / threshold) + result = 1 - (distance / threshold) + logger.debug( + "--\t\tpartial_location_distance '%s' '%s' threshold: '%s'\tresult: '%s'", + (lat1, long1), (lat2, long2), threshold, result, + ) + return result def _attack_pattern_checks(obj1, obj2, **weights): @@ -474,15 +490,19 @@ def _attack_pattern_checks(obj1, obj2, **weights): sum_weights = 0.0 if check_property_present("name", obj1, obj2): w = weights["name"] + contributing_score = w * partial_string_based(obj1["name"], obj2["name"]) sum_weights += w - matching_score += w * partial_string_based(obj1["name"], obj2["name"]) + matching_score += contributing_score + logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("external_references", obj1, obj2): w = weights["external_references"] - sum_weights += w - matching_score += ( - w * - partial_external_reference_based(obj1["external_references"], obj2["external_references"]) + contributing_score = ( + w * 
partial_external_reference_based(obj1["external_references"], obj2["external_references"]) ) + sum_weights += w + matching_score += contributing_score + logger.debug("'external_references' check -- weight: %s, contributing score: %s", w, contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) return matching_score, sum_weights @@ -491,12 +511,17 @@ def _campaign_checks(obj1, obj2, **weights): sum_weights = 0.0 if check_property_present("name", obj1, obj2): w = weights["name"] + contributing_score = w * partial_string_based(obj1["name"], obj2["name"]) sum_weights += w - matching_score += w * partial_string_based(obj1["name"], obj2["name"]) + matching_score += contributing_score + logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("aliases", obj1, obj2): w = weights["aliases"] + contributing_score = w * partial_list_based(obj1["aliases"], obj2["aliases"]) sum_weights += w - matching_score += w * partial_list_based(obj1["aliases"], obj2["aliases"]) + matching_score += contributing_score + logger.debug("'aliases' check -- weight: %s, contributing score: %s", w, contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) return matching_score, sum_weights @@ -505,16 +530,23 @@ def _identity_checks(obj1, obj2, **weights): sum_weights = 0.0 if check_property_present("name", obj1, obj2): w = weights["name"] + contributing_score = w * exact_match(obj1["name"], obj2["name"]) sum_weights += w - matching_score += w * exact_match(obj1["name"], obj2["name"]) + matching_score += contributing_score + logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("identity_class", obj1, obj2): w = weights["identity_class"] + contributing_score = w * exact_match(obj1["identity_class"], obj2["identity_class"]) sum_weights += w - matching_score += w * 
exact_match(obj1["identity_class"], obj2["identity_class"]) + matching_score += contributing_score + logger.debug("'identity_class' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("sectors", obj1, obj2): w = weights["sectors"] + contributing_score = w * partial_list_based(obj1["sectors"], obj2["sectors"]) sum_weights += w - matching_score += w * partial_list_based(obj1["sectors"], obj2["sectors"]) + matching_score += contributing_score + logger.debug("'sectors' check -- weight: %s, contributing score: %s", w, contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) return matching_score, sum_weights @@ -523,19 +555,26 @@ def _indicator_checks(obj1, obj2, **weights): sum_weights = 0.0 if check_property_present("indicator_types", obj1, obj2): w = weights["indicator_types"] + contributing_score = w * partial_list_based(obj1["indicator_types"], obj2["indicator_types"]) sum_weights += w - matching_score += w * partial_list_based(obj1["indicator_types"], obj2["indicator_types"]) + matching_score += contributing_score + logger.debug("'indicator_types' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("pattern", obj1, obj2): w = weights["pattern"] + contributing_score = w * custom_pattern_based(obj1["pattern"], obj2["pattern"]) sum_weights += w - matching_score += w * custom_pattern_based(obj1["pattern"], obj2["pattern"]) + matching_score += contributing_score + logger.debug("'pattern' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("valid_from", obj1, obj2): w = weights["valid_from"] - sum_weights += w - matching_score += ( + contributing_score = ( w * partial_timestamp_based(obj1["valid_from"], obj2["valid_from"], weights["tdelta"]) ) + sum_weights += w + matching_score += contributing_score + logger.debug("'valid_from' check -- weight: %s, contributing score: %s", w, 
contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) return matching_score, sum_weights @@ -544,19 +583,26 @@ def _location_checks(obj1, obj2, **weights): sum_weights = 0.0 if check_property_present("latitude", obj1, obj2) and check_property_present("longitude", obj1, obj2): w = weights["longitude_latitude"] - sum_weights += w - matching_score += ( + contributing_score = ( w * partial_location_distance(obj1["latitude"], obj1["longitude"], obj2["latitude"], obj2["longitude"], weights["threshold"]) ) + sum_weights += w + matching_score += contributing_score + logger.debug("'longitude_latitude' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("region", obj1, obj2): w = weights["region"] + contributing_score = w * exact_match(obj1["region"], obj2["region"]) sum_weights += w - matching_score += w * exact_match(obj1["region"], obj2["region"]) + matching_score += contributing_score + logger.debug("'region' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("country", obj1, obj2): w = weights["country"] + contributing_score = w * exact_match(obj1["country"], obj2["country"]) sum_weights += w - matching_score += w * exact_match(obj1["country"], obj2["country"]) + matching_score += contributing_score + logger.debug("'country' check -- weight: %s, contributing score: %s", w, contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) return matching_score, sum_weights @@ -565,12 +611,17 @@ def _malware_checks(obj1, obj2, **weights): sum_weights = 0.0 if check_property_present("malware_types", obj1, obj2): w = weights["malware_types"] + contributing_score = w * partial_list_based(obj1["malware_types"], obj2["malware_types"]) sum_weights += w - matching_score += w * partial_list_based(obj1["malware_types"], obj2["malware_types"]) + matching_score += contributing_score + 
logger.debug("'malware_types' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("name", obj1, obj2): w = weights["name"] + contributing_score = w * partial_string_based(obj1["name"], obj2["name"]) sum_weights += w - matching_score += w * partial_string_based(obj1["name"], obj2["name"]) + matching_score += contributing_score + logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) return matching_score, sum_weights @@ -579,16 +630,23 @@ def _threat_actor_checks(obj1, obj2, **weights): sum_weights = 0.0 if check_property_present("name", obj1, obj2): w = weights["name"] + contributing_score = w * partial_string_based(obj1["name"], obj2["name"]) sum_weights += w - matching_score += w * partial_string_based(obj1["name"], obj2["name"]) + matching_score += contributing_score + logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("threat_actor_types", obj1, obj2): w = weights["threat_actor_types"] + contributing_score = w * partial_list_based(obj1["threat_actor_types"], obj2["threat_actor_types"]) sum_weights += w - matching_score += w * partial_list_based(obj1["threat_actor_types"], obj2["threat_actor_types"]) + matching_score += contributing_score + logger.debug("'threat_actor_types' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("aliases", obj1, obj2): w = weights["aliases"] + contributing_score = w * partial_list_based(obj1["aliases"], obj2["aliases"]) sum_weights += w - matching_score += w * partial_list_based(obj1["aliases"], obj2["aliases"]) + matching_score += contributing_score + logger.debug("'aliases' check -- weight: %s, contributing score: %s", w, contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) return matching_score, sum_weights @@ 
-597,12 +655,17 @@ def _tool_checks(obj1, obj2, **weights): sum_weights = 0.0 if check_property_present("tool_types", obj1, obj2): w = weights["tool_types"] + contributing_score = w * partial_list_based(obj1["tool_types"], obj2["tool_types"]) sum_weights += w - matching_score += w * partial_list_based(obj1["tool_types"], obj2["tool_types"]) + matching_score += contributing_score + logger.debug("'tool_types' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("name", obj1, obj2): w = weights["name"] + contributing_score = w * partial_string_based(obj1["name"], obj2["name"]) sum_weights += w - matching_score += w * partial_string_based(obj1["name"], obj2["name"]) + matching_score += contributing_score + logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) return matching_score, sum_weights @@ -611,29 +674,18 @@ def _vulnerability_checks(obj1, obj2, **weights): sum_weights = 0.0 if check_property_present("name", obj1, obj2): w = weights["name"] + contributing_score = w * partial_string_based(obj1["name"], obj2["name"]) sum_weights += w - matching_score += w * partial_string_based(obj1["name"], obj2["name"]) + matching_score += contributing_score + logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("external_references", obj1, obj2): w = weights["external_references"] - sum_weights += w - matching_score += w * partial_external_reference_based( + contributing_score = w * partial_external_reference_based( obj1["external_references"], obj2["external_references"], ) + sum_weights += w + matching_score += contributing_score + logger.debug("'external_references' check -- weight: %s, contributing score: %s", w, contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) return matching_score, sum_weights 
- - -def _course_of_action_checks(obj1, obj2, **weights): - raise SemanticEquivalenceUnsupportedTypeError("course-of-action type has no semantic equivalence implementation!") - - -def _intrusion_set_checks(obj1, obj2, **weights): - raise SemanticEquivalenceUnsupportedTypeError("intrusion-set type has no semantic equivalence implementation!") - - -def _observed_data_checks(obj1, obj2, **weights): - raise SemanticEquivalenceUnsupportedTypeError("observed-data type has no semantic equivalence implementation!") - - -def _report_checks(obj1, obj2, **weights): - raise SemanticEquivalenceUnsupportedTypeError("report type has no semantic equivalence implementation!") diff --git a/stix2/exceptions.py b/stix2/exceptions.py index 6405c2e..d2ec3fc 100644 --- a/stix2/exceptions.py +++ b/stix2/exceptions.py @@ -233,10 +233,3 @@ class STIXDeprecationWarning(DeprecationWarning): Represents usage of a deprecated component of a STIX specification. """ pass - - -class SemanticEquivalenceUnsupportedTypeError(STIXError, TypeError): - """STIX object type not supported by the semantic equivalence approach.""" - - def __init__(self, msg): - super(SemanticEquivalenceUnsupportedTypeError, self).__init__(msg) diff --git a/stix2/test/v21/test_environment.py b/stix2/test/v21/test_environment.py index 62b0c53..a049b25 100644 --- a/stix2/test/v21/test_environment.py +++ b/stix2/test/v21/test_environment.py @@ -6,12 +6,10 @@ import stix2.exceptions from .constants import ( ATTACK_PATTERN_ID, ATTACK_PATTERN_KWARGS, CAMPAIGN_ID, CAMPAIGN_KWARGS, - COURSE_OF_ACTION_ID, COURSE_OF_ACTION_KWARGS, FAKE_TIME, IDENTITY_ID, - IDENTITY_KWARGS, INDICATOR_ID, INDICATOR_KWARGS, INTRUSION_SET_ID, - INTRUSION_SET_KWARGS, LOCATION_ID, MALWARE_ID, MALWARE_KWARGS, - OBSERVED_DATA_ID, OBSERVED_DATA_KWARGS, RELATIONSHIP_IDS, REPORT_ID, - REPORT_KWARGS, THREAT_ACTOR_ID, THREAT_ACTOR_KWARGS, TOOL_ID, TOOL_KWARGS, - VULNERABILITY_ID, VULNERABILITY_KWARGS, + FAKE_TIME, IDENTITY_ID, IDENTITY_KWARGS, INDICATOR_ID, 
INDICATOR_KWARGS, + LOCATION_ID, MALWARE_ID, MALWARE_KWARGS, RELATIONSHIP_IDS, THREAT_ACTOR_ID, + THREAT_ACTOR_KWARGS, TOOL_ID, TOOL_KWARGS, VULNERABILITY_ID, + VULNERABILITY_KWARGS, ) @@ -615,37 +613,6 @@ def test_semantic_equivalence_different_spec_version_raises(): assert str(excinfo.value) == "The objects to compare must be of the same spec version!" -@pytest.mark.parametrize( - "obj1,obj2,ret_val", - [ - ( - stix2.v21.CourseOfAction(id=COURSE_OF_ACTION_ID, **COURSE_OF_ACTION_KWARGS), - stix2.v21.CourseOfAction(id=COURSE_OF_ACTION_ID, **COURSE_OF_ACTION_KWARGS), - "course-of-action type has no semantic equivalence implementation!", - ), - ( - stix2.v21.IntrusionSet(id=INTRUSION_SET_ID, **INTRUSION_SET_KWARGS), - stix2.v21.IntrusionSet(id=INTRUSION_SET_ID, **INTRUSION_SET_KWARGS), - "intrusion-set type has no semantic equivalence implementation!", - ), - ( - stix2.v21.ObservedData(id=OBSERVED_DATA_ID, **OBSERVED_DATA_KWARGS), - stix2.v21.ObservedData(id=OBSERVED_DATA_ID, **OBSERVED_DATA_KWARGS), - "observed-data type has no semantic equivalence implementation!", - ), - ( - stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS), - stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS), - "report type has no semantic equivalence implementation!", - ), - ], -) -def test_semantic_equivalence_on_unsupported_types(obj1, obj2, ret_val): - with pytest.raises(stix2.exceptions.SemanticEquivalenceUnsupportedTypeError) as excinfo: - stix2.Environment().semantically_equivalent(obj1, obj2) - assert ret_val == str(excinfo.value) - - def test_semantic_equivalence_zero_match(): IND_KWARGS = dict( indicator_types=["APTX"], @@ -767,7 +734,7 @@ def test_semantic_equivalence_external_references(refs1, refs2, ret_val): assert value == ret_val -def test_semantic_equivalence_timetamp(): +def test_semantic_equivalence_timestamp(): t1 = "2018-10-17T00:14:20.652Z" t2 = "2018-10-17T12:14:20.652Z" assert stix2.environment.partial_timestamp_based(t1, t2, 1) == 0.5 From 
13fda69079938b69fbe22a793b6b65b61eed7b41 Mon Sep 17 00:00:00 2001 From: Emmanuelle Vargas-Gonzalez Date: Tue, 15 Oct 2019 13:25:11 -0400 Subject: [PATCH 5/8] add test for object not present in configuration --- stix2/test/v21/test_environment.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/stix2/test/v21/test_environment.py b/stix2/test/v21/test_environment.py index a049b25..d057df5 100644 --- a/stix2/test/v21/test_environment.py +++ b/stix2/test/v21/test_environment.py @@ -7,9 +7,9 @@ import stix2.exceptions from .constants import ( ATTACK_PATTERN_ID, ATTACK_PATTERN_KWARGS, CAMPAIGN_ID, CAMPAIGN_KWARGS, FAKE_TIME, IDENTITY_ID, IDENTITY_KWARGS, INDICATOR_ID, INDICATOR_KWARGS, - LOCATION_ID, MALWARE_ID, MALWARE_KWARGS, RELATIONSHIP_IDS, THREAT_ACTOR_ID, - THREAT_ACTOR_KWARGS, TOOL_ID, TOOL_KWARGS, VULNERABILITY_ID, - VULNERABILITY_KWARGS, + LOCATION_ID, MALWARE_ID, MALWARE_KWARGS, RELATIONSHIP_IDS, REPORT_ID, + REPORT_KWARGS, THREAT_ACTOR_ID, THREAT_ACTOR_KWARGS, TOOL_ID, TOOL_KWARGS, + VULNERABILITY_ID, VULNERABILITY_KWARGS, ) @@ -744,3 +744,9 @@ def test_semantic_equivalence_exact_match(): t1 = "2018-10-17T00:14:20.652Z" t2 = "2018-10-17T12:14:20.652Z" assert stix2.environment.exact_match(t1, t2) == 0.0 + + +def test_non_existent_config_for_object(): + r1 = stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS) + r2 = stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS) + assert stix2.Environment().semantically_equivalent(r1, r2) == 0.0 From b5612c9dc2ca842603a3a1ffd3ce965228704fdc Mon Sep 17 00:00:00 2001 From: Chris Lenk Date: Wed, 16 Oct 2019 09:08:03 -0400 Subject: [PATCH 6/8] Update semantic equivalence docs - Comparing object type not in config dictionary now gives a warning and result of 0 instead of an error. - Adds an example of the new detailed debug output. 
--- docs/guide/equivalence.ipynb | 229 +++++++++++++++++++++++++++++++++-- 1 file changed, 220 insertions(+), 9 deletions(-) diff --git a/docs/guide/equivalence.ipynb b/docs/guide/equivalence.ipynb index d73b417..5db3464 100644 --- a/docs/guide/equivalence.ipynb +++ b/docs/guide/equivalence.ipynb @@ -270,7 +270,7 @@ ".highlight .vg { color: #19177C } /* Name.Variable.Global */\n", ".highlight .vi { color: #19177C } /* Name.Variable.Instance */\n", ".highlight .vm { color: #19177C } /* Name.Variable.Magic */\n", - ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */
44.0\n",
+       ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */
50.0\n",
        "
\n" ], "text/plain": [ @@ -286,10 +286,10 @@ "from stix2.v21 import Campaign\n", "\n", "c1 = Campaign(\n", - " name=\"there\",)\n", + " name=\"Someone Attacks Somebody\",)\n", "\n", "c2 = Campaign(\n", - " name=\"something\",)\n", + " name=\"Another Campaign\",)\n", "print(env.semantically_equivalent(c1, c2))" ] }, @@ -1144,7 +1144,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Some object types do not have a defined method for calculating semantic equivalence and by default will raise an [error](../api/stix2.exceptions.rst#stix2.exceptions.SemanticEquivalenceUnsupportedTypeError)." + "Some object types do not have a defined method for calculating semantic equivalence and by default will give a warning and a result of zero." ] }, { @@ -1153,12 +1153,93 @@ "metadata": {}, "outputs": [ { - "ename": "SemanticEquivalenceUnsupportedTypeError", - "evalue": "report type has no semantic equivalence implementation!", - "output_type": "error", - "traceback": [ - "\u001b[0;31mSemanticEquivalenceUnsupportedTypeError\u001b[0m\u001b[0;31m:\u001b[0m report type has no semantic equivalence implementation!\n" + "name": "stderr", + "output_type": "stream", + "text": [ + "'report' type has no semantic equivalence method to call!\n" ] + }, + { + "data": { + "text/html": [ + "
0\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -1448,6 +1529,136 @@ "}\n", "print(env.semantically_equivalent(foo1, foo2, **weights))" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Detailed Results\n", + "\n", + "If your logging level is set to `DEBUG` or higher, the function will log more detailed results. These show the semantic equivalence and weighting for each property that is checked, to show how the final result was arrived at." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Starting semantic equivalence process between: 'threat-actor--54dc2aac-6fde-4a68-ae2a-0c0bc575ed70' and 'threat-actor--c51bce3b-a067-4692-ab77-fcdefdd3f157'\n", + "--\t\tpartial_string_based 'Evil Org' 'James Bond'\tresult: '0.56'\n", + "'name' check -- weight: 60, contributing score: 33.6\n", + "--\t\tpartial_list_based '['crime-syndicate']' '['spy']'\tresult: '0.0'\n", + "'threat_actor_types' check -- weight: 20, contributing score: 0.0\n", + "--\t\tpartial_list_based '['super-evil']' '['007']'\tresult: '0.0'\n", + "'aliases' check -- weight: 20, contributing score: 0.0\n", + "Matching Score: 33.6, Sum of Weights: 100.0\n" + ] + }, + { + "data": { + "text/html": [ + "
33.6\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import logging\n", + "logging.basicConfig(format='%(message)s')\n", + "logger = logging.getLogger()\n", + "logger.setLevel(logging.DEBUG)\n", + "\n", + "ta3 = ThreatActor(\n", + " threat_actor_types=[\"crime-syndicate\"],\n", + " name=\"Evil Org\",\n", + " aliases=[\"super-evil\"],\n", + ")\n", + "ta4 = ThreatActor(\n", + " threat_actor_types=[\"spy\"],\n", + " name=\"James Bond\",\n", + " aliases=[\"007\"],\n", + ")\n", + "print(env.semantically_equivalent(ta3, ta4))" + ] } ], "metadata": { From 4d2925c406598bd642bd1e488ae3b2844ee95854 Mon Sep 17 00:00:00 2001 From: Chris Lenk Date: Wed, 16 Oct 2019 17:23:43 -0400 Subject: [PATCH 7/8] Update CHANGELOG for v1.2.1 --- CHANGELOG | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index e2cb8ad..2eab92a 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,12 @@ CHANGELOG ========= +1.2.1 - 2019-10-16 + +* #301 Adds more detailed debugging semantic equivalence output +* #301 Updates semantic equivalence errors +* #300 Fixes bug with deterministic IDs for SCOs containing unicode + 1.2.0 - 2019-09-25 * #268, #271, #273, #275, #283, #285, #290 Changes support of STIX 2.1 to WD05 (CSD02), for all object types From d4c01157352552e2ec05bcf7b0f79ef96e34022d Mon Sep 17 00:00:00 2001 From: Chris Lenk Date: Wed, 16 Oct 2019 17:24:16 -0400 Subject: [PATCH 8/8] =?UTF-8?q?Bump=20version:=201.2.0=20=E2=86=92=201.2.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- setup.cfg | 2 +- stix2/version.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index 758875d..bcf51b8 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.2.0 +current_version = 1.2.1 commit = True tag = True diff --git a/stix2/version.py b/stix2/version.py index c68196d..a955fda 
100644 --- a/stix2/version.py +++ b/stix2/version.py @@ -1 +1 @@ -__version__ = "1.2.0" +__version__ = "1.2.1"