From f6e75cd8f81ac11671b9c16099d95b5095fe70ef Mon Sep 17 00:00:00 2001 From: "Desai, Kartikey H" Date: Wed, 4 Dec 2019 13:21:46 -0500 Subject: [PATCH] Add debug logging messages and add documentation to equivalence.ipynb --- docs/guide/equivalence.ipynb | 905 ++++++++++++++++++++++++++++++++++- stix2/environment.py | 5 +- 2 files changed, 903 insertions(+), 7 deletions(-) diff --git a/docs/guide/equivalence.ipynb b/docs/guide/equivalence.ipynb index 5db3464..99083d0 100644 --- a/docs/guide/equivalence.ipynb +++ b/docs/guide/equivalence.ipynb @@ -778,7 +778,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 5, "metadata": { "scrolled": true }, @@ -861,7 +861,7 @@ "" ] }, - "execution_count": 9, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -1149,14 +1149,14 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "'report' type has no semantic equivalence method to call!\n" + "'report' type has no 'weights' dict specified & thus no semantic equivalence method to call!\n" ] }, { @@ -1237,7 +1237,7 @@ "" ] }, - "execution_count": 13, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -1659,6 +1659,899 @@ ")\n", "print(env.semantically_equivalent(ta3, ta4))" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Custom Comparisons\n", + "If you wish, you can run your own custom semantic equivalence comparisons. 
Specifically, you can do any of three things:\n", + " - Provide custom weights for each semantic equivalence contributing property\n", + " - Provide custom comparison functions for individual semantic equivalence contributing properties\n", + " - Provide a custom semantic equivalence method\n", + "\n", + "*Some of this has already been explained above, but we will go into more detail here.*\n", + "\n", + "#### The `weights` dictionary\n", + "In order to do any of the aforementioned (*optional*) custom comparisons, you will need to provide a `weights` dictionary to the `semantically_equivalent()` method call. At a minimum, you must provide the custom weight and custom comparison function for each property. Now, you may use the default weights, or provide your own. You may also use any of the existing comparison functions, or provide your own.\n", + "\n", + "##### Existing comparison functions\n", + "For reference, here is a list of comparison functions already in the codebase (found in stix2/environment.py):\n", + " - `partial_timestamp_based`\n", + " - `partial_list_based`\n", + " - `exact_match`\n", + " - `partial_string_based`\n", + " - `custom_pattern_based`\n", + " - `partial_external_reference_based`\n", + " - `partial_location_distance`\n", + "\n", + "For instance, if we wanted to compare two `ThreatActor`s, but use our own weights, then we could do the following:\n", + "\n", + "(**Please note that if you provide a custom weights dictionary but not a custom semantic equivalence method [shown later], then you must follow the general format shown in the `weights` dict below**)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Using standard weights: 43.6\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/html": [ + "
Using custom weights: 41.8\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "weights = {\n", + " \"threat-actor\": { # You must specify for which object type this dict is\n", + " \"name\": (30, stix2.environment.partial_string_based), # Each property's value must be a tuple\n", + " \"threat_actor_types\": (50, stix2.environment.partial_list_based), # The 1st component must be the weight\n", + " \"aliases\": (20, stix2.environment.partial_list_based) # The 2nd component must be the comparison function\n", + " }\n", + "}\n", + "\n", + "ta5 = ThreatActor(\n", + " threat_actor_types=[\"crime-syndicate\", \"spy\"],\n", + " name=\"Evil Org\",\n", + " aliases=[\"super-evil\"],\n", + ")\n", + "ta6 = ThreatActor(\n", + " threat_actor_types=[\"spy\"],\n", + " name=\"James Bond\",\n", + " aliases=[\"007\"],\n", + ")\n", + "\n", + "print(\"Using standard weights: %s\" % (env.semantically_equivalent(ta5, ta6)))\n", + "print(\"Using custom weights: %s\" % (env.semantically_equivalent(ta5, ta6, **weights)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice how there is a difference in the semantic equivalence scores, simply due to the fact that custom weights were used.\n", + "\n", + "#### Custom Semantic Equivalence Function\n", + "As said before, you can also write and use your own semantic equivalence method. To do this, you must provide a `weights` dictionary to `semantically_equivalent()`. In this dict, you will provide a key of \"method\" whose value will be your custom semantic equivalence function.\n", + "\n", + "If you provide your own custom semantic equivalence method, you **must also provide the weights for each of the properties** (unless, for some reason, your custom method is weights-agnostic). 
However, since you are writing the custom method, your weights need not necessarily follow the tuple format specified in the above code box.\n", + "\n", + "Here we use our own custom semantic equivalence function to compare two `ThreatActor`s. " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Using a custom method: 21.263333333333335\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def custom_semantic_equivalence_method(obj1, obj2, **weights):\n", + " sum_weights = 200.0\n", + " matching_score = 20.19\n", + " for prop in weights:\n", + " if prop != \"method\":\n", + " w = weights[prop][0]\n", + " comp_funct = weights[prop][1]\n", + " contributing_score = w * comp_funct(obj1[prop], obj2[prop])\n", + " sum_weights += w\n", + " matching_score += contributing_score\n", + " return matching_score, sum_weights\n", + "\n", + "\n", + "weights = {\n", + " \"threat-actor\": {\n", + " \"name\": (60, stix2.environment.partial_string_based), # We left each property's value as a tuple\n", + " \"threat_actor_types\": (20, stix2.environment.partial_list_based), # However, weights could be simply numeric\n", + " \"aliases\": (20, stix2.environment.partial_list_based), # They may also be anything else you want\n", + " \"method\": custom_semantic_equivalence_method # As long as your func is written accordingly\n", + " }\n", + "}\n", + "\n", + "print(\"Using a custom method: %s\" % (env.semantically_equivalent(ta5, ta6, **weights)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice the semantic equivalence score of ~21.26 when using a custom semantic equivalence method to compare `ta5` & `ta6`. Compare this to the semantic equivalence score of 43.6 when using the default semantic equivalence method for comparing `ta5` & `ta6`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `prop_scores`\n", + "The `semantically_equivalent()` function now takes an optional third argument, called `prop_scores`. As explained previously, the semantic equivalence functionality includes detailed debugging messages. 
This new argument is meant to be a dictionary that stores those detailed debugging messages so that the debug information can be accessed and used more programmatically.\n", + "\n", + "Using `prop_scores` is simple: pass in a dictionary to `semantically_equivalent()`, and after the function is done executing, the dict will have the various scores in it. Specifically, it will have the overall `matching_score` and `sum_weights`, along with the weight and contributing score for each of the semantic equivalence contributing properties.\n", + "\n", + "For instance:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Semantic equivalence score using standard weights: 43.6\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/html": [ + "
Prop: name | weight: 60 | contributing_score: 33.6\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/html": [ + "
Prop: threat_actor_types | weight: 20 | contributing_score: 10.0\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/html": [ + "
Prop: aliases | weight: 20 | contributing_score: 0.0\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/html": [ + "
matching_score: 43.6\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/html": [ + "
sum_weights: 100.0\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prop_scores = {}\n", + "print(\"Semantic equivalence score using standard weights: %s\" % (env.semantically_equivalent(ta5, ta6, prop_scores)))\n", + "for prop in prop_scores:\n", + " if prop not in [\"matching_score\", \"sum_weights\"]:\n", + " print (\"Prop: %s | weight: %s | contributing_score: %s\" % (prop, prop_scores[prop][0], prop_scores[prop][1]))\n", + " else:\n", + " print (\"%s: %s\" % (prop, prop_scores[prop]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we wanted, we could have also passed in a custom `weights` dict to the above `semantically_equivalent()` call. If we want to use both `prop_scores` and `weights`, then they would be the third and fourth arguments, respectively, to `semantically_equivalent()`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -1677,7 +2570,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.3" + "version": "3.6.7" } }, "nbformat": 4, diff --git a/stix2/environment.py b/stix2/environment.py index 9eae2d6..85381b2 100644 --- a/stix2/environment.py +++ b/stix2/environment.py @@ -279,12 +279,13 @@ class Environment(DataStoreMixin): try: weights[type1] except KeyError: - logger.warning("'%s' type has no 'weights' dict specified in the semantic equivalence method call!", type1) + logger.warning("'%s' type has no 'weights' dict specified & thus no semantic equivalence method to call!", type1) sum_weights = matching_score = 0 else: try: method = weights[type1]["method"] except KeyError: + logger.debug("Starting semantic equivalence process between: '%s' and '%s'", obj1["id"], obj2["id"]) matching_score = 0.0 sum_weights = 0.0 @@ -309,12 +310,14 @@ class Environment(DataStoreMixin): prop_scores["matching_score"] = 
matching_score prop_scores["sum_weights"] = sum_weights + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) else: logger.debug("Starting semantic equivalence process between: '%s' and '%s'", obj1["id"], obj2["id"]) try: matching_score, sum_weights = method(obj1, obj2, prop_scores, **weights[type1]) except TypeError: matching_score, sum_weights = method(obj1, obj2, **weights[type1]) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) if sum_weights <= 0: return 0