diff --git a/CHANGELOG b/CHANGELOG index e2cb8ad..2eab92a 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,12 @@ CHANGELOG ========= +1.2.1 - 2019-10-16 + +* #301 Adds more detailed debugging semantic equivalence output +* #301 Updates semantic equivalence errors +* #300 Fixes bug with deterministic IDs for SCOs containing unicode + 1.2.0 - 2019-09-25 * #268, #271, #273, #275, #283, #285, #290 Changes support of STIX 2.1 to WD05 (CSD02), for all object types diff --git a/docs/guide/equivalence.ipynb b/docs/guide/equivalence.ipynb index 9b49c2f..5db3464 100644 --- a/docs/guide/equivalence.ipynb +++ b/docs/guide/equivalence.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 14, + "execution_count": 1, "metadata": { "nbsphinx": "hidden" }, @@ -22,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 2, "metadata": { "nbsphinx": "hidden" }, @@ -58,7 +58,7 @@ "source": [ "## Checking Semantic Equivalence\n", "\n", - "The [Environment](../api/stix2.environment.rst#stix2.environment.Environment) has a function for checking if two STIX Objects are semantically equivalent. For each supported objct type, the algorithm checks if the values for a specific set of properties match. Then each matching property is weighted since every property doesn't represent the same level of importance for semantic equivalence. The result will be the sum of these weighted values, in the range of 0 to 100. A result of 0 means that the the two objects are not equivalent, and a result of 100 means that they are equivalent.\n", + "The [Environment](../api/stix2.environment.rst#stix2.environment.Environment) has a function for checking if two STIX Objects are semantically equivalent. For each supported object type, the algorithm checks if the values for a specific set of properties match. Then each matching property is weighted since every property doesn't represent the same level of importance for semantic equivalence. 
The result will be the sum of these weighted values, in the range of 0 to 100. A result of 0 means that the two objects are not equivalent, and a result of 100 means that they are equivalent.\n", "\n", "TODO: Add a link to the committee note when it is released.\n", "\n", @@ -71,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -152,7 +152,7 @@ "" ] }, - "execution_count": 16, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -191,12 +191,12 @@ "source": [ "### Campaign Example\n", "\n", - "For Campaigns, the only properties that contribute to semantic equivalence are `name` and `aliases`, with weights of 60 and 40, respectively. In this example, the two campaigns have completely different names, but slightly similar descriptions." + "For Campaigns, the only properties that contribute to semantic equivalence are `name` and `aliases`, with weights of 60 and 40, respectively. In this example, the two campaigns have completely different names. The result may be higher than expected because the Jaro-Winkler algorithm used to compare string properties looks at the edit distance of the two strings rather than just the words in them." 
] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -277,7 +277,7 @@ "" ] }, - "execution_count": 17, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -286,12 +286,10 @@ "from stix2.v21 import Campaign\n", "\n", "c1 = Campaign(\n", - " name=\"Someone Attacks Somebody\",\n", - " description=\"A campaign targeting....\",)\n", + " name=\"Someone Attacks Somebody\",)\n", "\n", "c2 = Campaign(\n", - " name=\"Another Campaign\",\n", - " description=\"A campaign that targets....\",)\n", + " name=\"Another Campaign\",)\n", "print(env.semantically_equivalent(c1, c2))" ] }, @@ -306,7 +304,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -387,7 +385,7 @@ "" ] }, - "execution_count": 18, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -419,8 +417,10 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": {}, + "execution_count": 6, + "metadata": { + "scrolled": true + }, "outputs": [ { "name": "stderr", @@ -507,7 +507,7 @@ "" ] }, - "execution_count": 19, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -530,6 +530,13 @@ "print(env.semantically_equivalent(ind1, ind2))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If the patterns were identical the result would have been 100." 
+ ] + }, { "cell_type": "markdown", "metadata": {}, @@ -541,7 +548,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 7, "metadata": { "scrolled": true }, @@ -624,7 +631,7 @@ "" ] }, - "execution_count": 20, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -654,7 +661,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 8, "metadata": { "scrolled": true }, @@ -737,7 +744,7 @@ "" ] }, - "execution_count": 21, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -771,7 +778,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 9, "metadata": { "scrolled": true }, @@ -854,7 +861,7 @@ "" ] }, - "execution_count": 22, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -888,7 +895,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 10, "metadata": { "scrolled": true }, @@ -971,7 +978,7 @@ "" ] }, - "execution_count": 23, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -1002,7 +1009,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -1083,7 +1090,7 @@ "" ] }, - "execution_count": 24, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -1117,7 +1124,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -1137,21 +1144,102 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Some object types do not have a defined method for calculating semantic equivalence and by default will raise an error." + "Some object types do not have a defined method for calculating semantic equivalence and by default will give a warning and a result of zero." 
] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 13, "metadata": {}, "outputs": [ { - "ename": "SemanticEquivalenceUnsupportedTypeError", - "evalue": "report type has no semantic equivalence implementation!", - "output_type": "error", - "traceback": [ - "\u001b[0;31mSemanticEquivalenceUnsupportedTypeError\u001b[0m\u001b[0;31m:\u001b[0m report type has no semantic equivalence implementation!\n" + "name": "stderr", + "output_type": "stream", + "text": [ + "'report' type has no semantic equivalence method to call!\n" ] + }, + { + "data": { + "text/html": [ + "
0\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -1176,12 +1264,43 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "By default, comparing objects of different spec versions will result in an error. You can optionally allow this by providing a configuration dictionary like in the next example:" + "By default, comparing objects of different spec versions will result in a `ValueError`." ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "The objects to compare must be of the same spec version!", + "output_type": "error", + "traceback": [ + "\u001b[0;31mValueError\u001b[0m\u001b[0;31m:\u001b[0m The objects to compare must be of the same spec version!\n" + ] + } + ], + "source": [ + "from stix2.v20 import Identity as Identity20\n", + "\n", + "id20 = Identity20(\n", + " name=\"John Smith\",\n", + " identity_class=\"individual\",\n", + ")\n", + "print(env.semantically_equivalent(id2, id20))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can optionally allow comparing across spec versions by providing a configuration dictionary like in the next example:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -1262,7 +1381,7 @@ "" ] }, - "execution_count": 27, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1286,7 +1405,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -1367,7 +1486,7 @@ "" ] }, - "execution_count": 28, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1410,6 +1529,136 @@ "}\n", "print(env.semantically_equivalent(foo1, foo2, **weights))" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Detailed Results\n", + "\n", + "If your logging level 
is set to `DEBUG` or higher, the function will log more detailed results. These show the semantic equivalence and weighting for each property that is checked, to show how the final result was arrived at." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Starting semantic equivalence process between: 'threat-actor--54dc2aac-6fde-4a68-ae2a-0c0bc575ed70' and 'threat-actor--c51bce3b-a067-4692-ab77-fcdefdd3f157'\n", + "--\t\tpartial_string_based 'Evil Org' 'James Bond'\tresult: '0.56'\n", + "'name' check -- weight: 60, contributing score: 33.6\n", + "--\t\tpartial_list_based '['crime-syndicate']' '['spy']'\tresult: '0.0'\n", + "'threat_actor_types' check -- weight: 20, contributing score: 0.0\n", + "--\t\tpartial_list_based '['super-evil']' '['007']'\tresult: '0.0'\n", + "'aliases' check -- weight: 20, contributing score: 0.0\n", + "Matching Score: 33.6, Sum of Weights: 100.0\n" + ] + }, + { + "data": { + "text/html": [ + "
33.6\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import logging\n", + "logging.basicConfig(format='%(message)s')\n", + "logger = logging.getLogger()\n", + "logger.setLevel(logging.DEBUG)\n", + "\n", + "ta3 = ThreatActor(\n", + " threat_actor_types=[\"crime-syndicate\"],\n", + " name=\"Evil Org\",\n", + " aliases=[\"super-evil\"],\n", + ")\n", + "ta4 = ThreatActor(\n", + " threat_actor_types=[\"spy\"],\n", + " name=\"James Bond\",\n", + " aliases=[\"007\"],\n", + ")\n", + "print(env.semantically_equivalent(ta3, ta4))" + ] } ], "metadata": { diff --git a/setup.cfg b/setup.cfg index 758875d..bcf51b8 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.2.0 +current_version = 1.2.1 commit = True tag = True diff --git a/stix2/base.py b/stix2/base.py index bd5995f..8beddca 100644 --- a/stix2/base.py +++ b/stix2/base.py @@ -396,11 +396,14 @@ class _Observable(_STIXBase): if streamlined_obj_vals: data = canonicalize(streamlined_obj_vals, utf8=False) - # try/except here to enable python 2 compatibility - try: + # The situation is complicated w.r.t. python 2/3 behavior, so + # I'd rather not rely on particular exceptions being raised to + # determine what to do. Better to just check the python version + # directly. 
+ if six.PY3: return required_prefix + six.text_type(uuid.uuid5(SCO_DET_ID_NAMESPACE, data)) - except UnicodeDecodeError: - return required_prefix + six.text_type(uuid.uuid5(SCO_DET_ID_NAMESPACE, six.binary_type(data))) + else: + return required_prefix + six.text_type(uuid.uuid5(SCO_DET_ID_NAMESPACE, data.encode("utf-8"))) # We return None if there are no values specified for any of the id-contributing-properties return None diff --git a/stix2/environment.py b/stix2/environment.py index d2c6d3a..34e0a04 100644 --- a/stix2/environment.py +++ b/stix2/environment.py @@ -6,7 +6,6 @@ import time from .core import parse as _parse from .datastore import CompositeDataSource, DataStoreMixin -from .exceptions import SemanticEquivalenceUnsupportedTypeError from .utils import STIXdatetime, parse_into_datetime logger = logging.getLogger(__name__) @@ -228,9 +227,6 @@ class Environment(DataStoreMixin): "aliases": 40, "method": _campaign_checks, }, - "course-of-action": { - "method": _course_of_action_checks, - }, "identity": { "name": 60, "identity_class": 20, @@ -244,9 +240,6 @@ class Environment(DataStoreMixin): "tdelta": 1, # One day interval "method": _indicator_checks, }, - "intrusion-set": { - "method": _intrusion_set_checks, - }, "location": { "longitude_latitude": 34, "region": 33, @@ -259,12 +252,6 @@ class Environment(DataStoreMixin): "name": 80, "method": _malware_checks, }, - "observed-data": { - "method": _observed_data_checks, - }, - "report": { - "method": _report_checks, - }, "threat-actor": { "name": 60, "threat_actor_types": 20, @@ -298,8 +285,14 @@ class Environment(DataStoreMixin): if ignore_spec_version is False and obj1.get("spec_version", "2.0") != obj2.get("spec_version", "2.0"): raise ValueError('The objects to compare must be of the same spec version!') - method = weights[type1]["method"] - matching_score, sum_weights = method(obj1, obj2, **weights[type1]) + try: + method = weights[type1]["method"] + except KeyError: + logger.warning("'%s' type has no 
semantic equivalence method to call!", type1) + sum_weights = matching_score = 0 + else: + logger.debug("Starting semantic equivalence process between: '%s' and '%s'", obj1["id"], obj2["id"]) + matching_score, sum_weights = method(obj1, obj2, **weights[type1]) if sum_weights <= 0: return 0 @@ -333,7 +326,9 @@ def partial_timestamp_based(t1, t2, tdelta): if not isinstance(t2, STIXdatetime): t2 = parse_into_datetime(t2) t1, t2 = time.mktime(t1.timetuple()), time.mktime(t2.timetuple()) - return 1 - min(abs(t1 - t2) / (86400 * tdelta), 1) + result = 1 - min(abs(t1 - t2) / (86400 * tdelta), 1) + logger.debug("--\t\tpartial_timestamp_based '%s' '%s' tdelta: '%s'\tresult: '%s'", t1, t2, tdelta, result) + return result def partial_list_based(l1, l2): @@ -348,7 +343,9 @@ def partial_list_based(l1, l2): """ l1_set, l2_set = set(l1), set(l2) - return len(l1_set.intersection(l2_set)) / max(len(l1), len(l2)) + result = len(l1_set.intersection(l2_set)) / max(len(l1), len(l2)) + logger.debug("--\t\tpartial_list_based '%s' '%s'\tresult: '%s'", l1, l2, result) + return result def exact_match(val1, val2): @@ -362,9 +359,11 @@ def exact_match(val1, val2): float: 1.0 if the value matches exactly, 0.0 otherwise. """ + result = 0.0 if val1 == val2: - return 1.0 - return 0.0 + result = 1.0 + logger.debug("--\t\texact_match '%s' '%s'\tresult: '%s'", val1, val2, result) + return result def partial_string_based(str1, str2): @@ -379,7 +378,9 @@ def partial_string_based(str1, str2): """ from pyjarowinkler import distance - return distance.get_jaro_distance(str1, str2) + result = distance.get_jaro_distance(str1, str2) + logger.debug("--\t\tpartial_string_based '%s' '%s'\tresult: '%s'", str1, str2, result) + return result def custom_pattern_based(pattern1, pattern2): @@ -440,14 +441,24 @@ def partial_external_reference_based(refs1, refs2): # external_id or url match then its a perfect match and other entries # can be ignored. 
if sn_match and (ei_match or url_match) and source_name in allowed: - return 1.0 + result = 1.0 + logger.debug( + "--\t\tpartial_external_reference_based '%s' '%s'\tresult: '%s'", + refs1, refs2, result, + ) + return result # Regular check. If the source_name (not STIX-defined) or external_id or # url matches then we consider the entry a match. if (sn_match or ei_match or url_match) and source_name not in allowed: matches += 1 - return matches / max(len(refs1), len(refs2)) + result = matches / max(len(refs1), len(refs2)) + logger.debug( + "--\t\tpartial_external_reference_based '%s' '%s'\tresult: '%s'", + refs1, refs2, result, + ) + return result def partial_location_distance(lat1, long1, lat2, long2, threshold): @@ -466,7 +477,12 @@ def partial_location_distance(lat1, long1, lat2, long2, threshold): """ from haversine import haversine, Unit distance = haversine((lat1, long1), (lat2, long2), unit=Unit.KILOMETERS) - return 1 - (distance / threshold) + result = 1 - (distance / threshold) + logger.debug( + "--\t\tpartial_location_distance '%s' '%s' threshold: '%s'\tresult: '%s'", + (lat1, long1), (lat2, long2), threshold, result, + ) + return result def _attack_pattern_checks(obj1, obj2, **weights): @@ -474,15 +490,19 @@ def _attack_pattern_checks(obj1, obj2, **weights): sum_weights = 0.0 if check_property_present("name", obj1, obj2): w = weights["name"] + contributing_score = w * partial_string_based(obj1["name"], obj2["name"]) sum_weights += w - matching_score += w * partial_string_based(obj1["name"], obj2["name"]) + matching_score += contributing_score + logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("external_references", obj1, obj2): w = weights["external_references"] - sum_weights += w - matching_score += ( - w * - partial_external_reference_based(obj1["external_references"], obj2["external_references"]) + contributing_score = ( + w * 
partial_external_reference_based(obj1["external_references"], obj2["external_references"]) ) + sum_weights += w + matching_score += contributing_score + logger.debug("'external_references' check -- weight: %s, contributing score: %s", w, contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) return matching_score, sum_weights @@ -491,12 +511,17 @@ def _campaign_checks(obj1, obj2, **weights): sum_weights = 0.0 if check_property_present("name", obj1, obj2): w = weights["name"] + contributing_score = w * partial_string_based(obj1["name"], obj2["name"]) sum_weights += w - matching_score += w * partial_string_based(obj1["name"], obj2["name"]) + matching_score += contributing_score + logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("aliases", obj1, obj2): w = weights["aliases"] + contributing_score = w * partial_list_based(obj1["aliases"], obj2["aliases"]) sum_weights += w - matching_score += w * partial_list_based(obj1["aliases"], obj2["aliases"]) + matching_score += contributing_score + logger.debug("'aliases' check -- weight: %s, contributing score: %s", w, contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) return matching_score, sum_weights @@ -505,16 +530,23 @@ def _identity_checks(obj1, obj2, **weights): sum_weights = 0.0 if check_property_present("name", obj1, obj2): w = weights["name"] + contributing_score = w * exact_match(obj1["name"], obj2["name"]) sum_weights += w - matching_score += w * exact_match(obj1["name"], obj2["name"]) + matching_score += contributing_score + logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("identity_class", obj1, obj2): w = weights["identity_class"] + contributing_score = w * exact_match(obj1["identity_class"], obj2["identity_class"]) sum_weights += w - matching_score += w * 
exact_match(obj1["identity_class"], obj2["identity_class"]) + matching_score += contributing_score + logger.debug("'identity_class' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("sectors", obj1, obj2): w = weights["sectors"] + contributing_score = w * partial_list_based(obj1["sectors"], obj2["sectors"]) sum_weights += w - matching_score += w * partial_list_based(obj1["sectors"], obj2["sectors"]) + matching_score += contributing_score + logger.debug("'sectors' check -- weight: %s, contributing score: %s", w, contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) return matching_score, sum_weights @@ -523,19 +555,26 @@ def _indicator_checks(obj1, obj2, **weights): sum_weights = 0.0 if check_property_present("indicator_types", obj1, obj2): w = weights["indicator_types"] + contributing_score = w * partial_list_based(obj1["indicator_types"], obj2["indicator_types"]) sum_weights += w - matching_score += w * partial_list_based(obj1["indicator_types"], obj2["indicator_types"]) + matching_score += contributing_score + logger.debug("'indicator_types' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("pattern", obj1, obj2): w = weights["pattern"] + contributing_score = w * custom_pattern_based(obj1["pattern"], obj2["pattern"]) sum_weights += w - matching_score += w * custom_pattern_based(obj1["pattern"], obj2["pattern"]) + matching_score += contributing_score + logger.debug("'pattern' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("valid_from", obj1, obj2): w = weights["valid_from"] - sum_weights += w - matching_score += ( + contributing_score = ( w * partial_timestamp_based(obj1["valid_from"], obj2["valid_from"], weights["tdelta"]) ) + sum_weights += w + matching_score += contributing_score + logger.debug("'valid_from' check -- weight: %s, contributing score: %s", w, 
contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) return matching_score, sum_weights @@ -544,19 +583,26 @@ def _location_checks(obj1, obj2, **weights): sum_weights = 0.0 if check_property_present("latitude", obj1, obj2) and check_property_present("longitude", obj1, obj2): w = weights["longitude_latitude"] - sum_weights += w - matching_score += ( + contributing_score = ( w * partial_location_distance(obj1["latitude"], obj1["longitude"], obj2["latitude"], obj2["longitude"], weights["threshold"]) ) + sum_weights += w + matching_score += contributing_score + logger.debug("'longitude_latitude' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("region", obj1, obj2): w = weights["region"] + contributing_score = w * exact_match(obj1["region"], obj2["region"]) sum_weights += w - matching_score += w * exact_match(obj1["region"], obj2["region"]) + matching_score += contributing_score + logger.debug("'region' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("country", obj1, obj2): w = weights["country"] + contributing_score = w * exact_match(obj1["country"], obj2["country"]) sum_weights += w - matching_score += w * exact_match(obj1["country"], obj2["country"]) + matching_score += contributing_score + logger.debug("'country' check -- weight: %s, contributing score: %s", w, contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) return matching_score, sum_weights @@ -565,12 +611,17 @@ def _malware_checks(obj1, obj2, **weights): sum_weights = 0.0 if check_property_present("malware_types", obj1, obj2): w = weights["malware_types"] + contributing_score = w * partial_list_based(obj1["malware_types"], obj2["malware_types"]) sum_weights += w - matching_score += w * partial_list_based(obj1["malware_types"], obj2["malware_types"]) + matching_score += contributing_score + 
logger.debug("'malware_types' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("name", obj1, obj2): w = weights["name"] + contributing_score = w * partial_string_based(obj1["name"], obj2["name"]) sum_weights += w - matching_score += w * partial_string_based(obj1["name"], obj2["name"]) + matching_score += contributing_score + logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) return matching_score, sum_weights @@ -579,16 +630,23 @@ def _threat_actor_checks(obj1, obj2, **weights): sum_weights = 0.0 if check_property_present("name", obj1, obj2): w = weights["name"] + contributing_score = w * partial_string_based(obj1["name"], obj2["name"]) sum_weights += w - matching_score += w * partial_string_based(obj1["name"], obj2["name"]) + matching_score += contributing_score + logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("threat_actor_types", obj1, obj2): w = weights["threat_actor_types"] + contributing_score = w * partial_list_based(obj1["threat_actor_types"], obj2["threat_actor_types"]) sum_weights += w - matching_score += w * partial_list_based(obj1["threat_actor_types"], obj2["threat_actor_types"]) + matching_score += contributing_score + logger.debug("'threat_actor_types' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("aliases", obj1, obj2): w = weights["aliases"] + contributing_score = w * partial_list_based(obj1["aliases"], obj2["aliases"]) sum_weights += w - matching_score += w * partial_list_based(obj1["aliases"], obj2["aliases"]) + matching_score += contributing_score + logger.debug("'aliases' check -- weight: %s, contributing score: %s", w, contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) return matching_score, sum_weights @@ 
-597,12 +655,17 @@ def _tool_checks(obj1, obj2, **weights): sum_weights = 0.0 if check_property_present("tool_types", obj1, obj2): w = weights["tool_types"] + contributing_score = w * partial_list_based(obj1["tool_types"], obj2["tool_types"]) sum_weights += w - matching_score += w * partial_list_based(obj1["tool_types"], obj2["tool_types"]) + matching_score += contributing_score + logger.debug("'tool_types' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("name", obj1, obj2): w = weights["name"] + contributing_score = w * partial_string_based(obj1["name"], obj2["name"]) sum_weights += w - matching_score += w * partial_string_based(obj1["name"], obj2["name"]) + matching_score += contributing_score + logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) return matching_score, sum_weights @@ -611,29 +674,18 @@ def _vulnerability_checks(obj1, obj2, **weights): sum_weights = 0.0 if check_property_present("name", obj1, obj2): w = weights["name"] + contributing_score = w * partial_string_based(obj1["name"], obj2["name"]) sum_weights += w - matching_score += w * partial_string_based(obj1["name"], obj2["name"]) + matching_score += contributing_score + logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score) if check_property_present("external_references", obj1, obj2): w = weights["external_references"] - sum_weights += w - matching_score += w * partial_external_reference_based( + contributing_score = w * partial_external_reference_based( obj1["external_references"], obj2["external_references"], ) + sum_weights += w + matching_score += contributing_score + logger.debug("'external_references' check -- weight: %s, contributing score: %s", w, contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) return matching_score, sum_weights 
- - -def _course_of_action_checks(obj1, obj2, **weights): - raise SemanticEquivalenceUnsupportedTypeError("course-of-action type has no semantic equivalence implementation!") - - -def _intrusion_set_checks(obj1, obj2, **weights): - raise SemanticEquivalenceUnsupportedTypeError("intrusion-set type has no semantic equivalence implementation!") - - -def _observed_data_checks(obj1, obj2, **weights): - raise SemanticEquivalenceUnsupportedTypeError("observed-data type has no semantic equivalence implementation!") - - -def _report_checks(obj1, obj2, **weights): - raise SemanticEquivalenceUnsupportedTypeError("report type has no semantic equivalence implementation!") diff --git a/stix2/exceptions.py b/stix2/exceptions.py index 6405c2e..d2ec3fc 100644 --- a/stix2/exceptions.py +++ b/stix2/exceptions.py @@ -233,10 +233,3 @@ class STIXDeprecationWarning(DeprecationWarning): Represents usage of a deprecated component of a STIX specification. """ pass - - -class SemanticEquivalenceUnsupportedTypeError(STIXError, TypeError): - """STIX object type not supported by the semantic equivalence approach.""" - - def __init__(self, msg): - super(SemanticEquivalenceUnsupportedTypeError, self).__init__(msg) diff --git a/stix2/test/v21/test_base.py b/stix2/test/v21/test_base.py index 18d3a50..d753ab1 100644 --- a/stix2/test/v21/test_base.py +++ b/stix2/test/v21/test_base.py @@ -1,9 +1,11 @@ import datetime as dt import json +import uuid import pytest import pytz +import stix2 from stix2.base import STIXJSONEncoder @@ -23,3 +25,14 @@ def test_encode_json_object(): json.dumps(test_dict, cls=STIXJSONEncoder) assert " is not JSON serializable" in str(excinfo.value) + + +def test_deterministic_id_unicode(): + mutex = {'name': u'D*Fl#Ed*\u00a3\u00a8', 'type': 'mutex'} + obs = stix2.parse_observable(mutex, version="2.1") + + dd_idx = obs.id.index("--") + id_uuid = uuid.UUID(obs.id[dd_idx+2:]) + + assert id_uuid.variant == uuid.RFC_4122 + assert id_uuid.version == 5 diff --git 
a/stix2/test/v21/test_environment.py b/stix2/test/v21/test_environment.py index 62b0c53..d057df5 100644 --- a/stix2/test/v21/test_environment.py +++ b/stix2/test/v21/test_environment.py @@ -6,10 +6,8 @@ import stix2.exceptions from .constants import ( ATTACK_PATTERN_ID, ATTACK_PATTERN_KWARGS, CAMPAIGN_ID, CAMPAIGN_KWARGS, - COURSE_OF_ACTION_ID, COURSE_OF_ACTION_KWARGS, FAKE_TIME, IDENTITY_ID, - IDENTITY_KWARGS, INDICATOR_ID, INDICATOR_KWARGS, INTRUSION_SET_ID, - INTRUSION_SET_KWARGS, LOCATION_ID, MALWARE_ID, MALWARE_KWARGS, - OBSERVED_DATA_ID, OBSERVED_DATA_KWARGS, RELATIONSHIP_IDS, REPORT_ID, + FAKE_TIME, IDENTITY_ID, IDENTITY_KWARGS, INDICATOR_ID, INDICATOR_KWARGS, + LOCATION_ID, MALWARE_ID, MALWARE_KWARGS, RELATIONSHIP_IDS, REPORT_ID, REPORT_KWARGS, THREAT_ACTOR_ID, THREAT_ACTOR_KWARGS, TOOL_ID, TOOL_KWARGS, VULNERABILITY_ID, VULNERABILITY_KWARGS, ) @@ -615,37 +613,6 @@ def test_semantic_equivalence_different_spec_version_raises(): assert str(excinfo.value) == "The objects to compare must be of the same spec version!" 
-@pytest.mark.parametrize( - "obj1,obj2,ret_val", - [ - ( - stix2.v21.CourseOfAction(id=COURSE_OF_ACTION_ID, **COURSE_OF_ACTION_KWARGS), - stix2.v21.CourseOfAction(id=COURSE_OF_ACTION_ID, **COURSE_OF_ACTION_KWARGS), - "course-of-action type has no semantic equivalence implementation!", - ), - ( - stix2.v21.IntrusionSet(id=INTRUSION_SET_ID, **INTRUSION_SET_KWARGS), - stix2.v21.IntrusionSet(id=INTRUSION_SET_ID, **INTRUSION_SET_KWARGS), - "intrusion-set type has no semantic equivalence implementation!", - ), - ( - stix2.v21.ObservedData(id=OBSERVED_DATA_ID, **OBSERVED_DATA_KWARGS), - stix2.v21.ObservedData(id=OBSERVED_DATA_ID, **OBSERVED_DATA_KWARGS), - "observed-data type has no semantic equivalence implementation!", - ), - ( - stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS), - stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS), - "report type has no semantic equivalence implementation!", - ), - ], -) -def test_semantic_equivalence_on_unsupported_types(obj1, obj2, ret_val): - with pytest.raises(stix2.exceptions.SemanticEquivalenceUnsupportedTypeError) as excinfo: - stix2.Environment().semantically_equivalent(obj1, obj2) - assert ret_val == str(excinfo.value) - - def test_semantic_equivalence_zero_match(): IND_KWARGS = dict( indicator_types=["APTX"], @@ -767,7 +734,7 @@ def test_semantic_equivalence_external_references(refs1, refs2, ret_val): assert value == ret_val -def test_semantic_equivalence_timetamp(): +def test_semantic_equivalence_timestamp(): t1 = "2018-10-17T00:14:20.652Z" t2 = "2018-10-17T12:14:20.652Z" assert stix2.environment.partial_timestamp_based(t1, t2, 1) == 0.5 @@ -777,3 +744,9 @@ def test_semantic_equivalence_exact_match(): t1 = "2018-10-17T00:14:20.652Z" t2 = "2018-10-17T12:14:20.652Z" assert stix2.environment.exact_match(t1, t2) == 0.0 + + +def test_non_existent_config_for_object(): + r1 = stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS) + r2 = stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS) + assert stix2.Environment().semantically_equivalent(r1, 
r2) == 0.0 diff --git a/stix2/version.py b/stix2/version.py index c68196d..a955fda 100644 --- a/stix2/version.py +++ b/stix2/version.py @@ -1 +1 @@ -__version__ = "1.2.0" +__version__ = "1.2.1"