diff --git a/.gitignore b/.gitignore index 5534a28..9758937 100644 --- a/.gitignore +++ b/.gitignore @@ -55,6 +55,7 @@ coverage.xml # Sphinx documentation docs/_build/ .ipynb_checkpoints +default_sem_eq_weights.rst # PyBuilder target/ diff --git a/.travis.yml b/.travis.yml index c05ec72..08214fe 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,8 +8,11 @@ python: - "3.5" - "3.6" - "3.7" + - "3.8" install: - pip install -U pip setuptools + # remove pyyaml line when we drop py3.4 support + - pip install "pyyaml<5.3" - pip install tox-travis pre-commit - pip install codecov script: diff --git a/CHANGELOG b/CHANGELOG index 2eab92a..b764735 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,18 @@ CHANGELOG ========= +1.3.0 - 2020-01-04 + +* #305 Updates support of STIX 2.1 to WD06 +* #304 Updates semantic equivalence to latest draft, and allows programmatic + detailed logging +* Adds Python 3.8 support +* #297 Fixes bug with File.contains_refs +* #311 Fixes several DeprecationWarnings +* #315 Fixes parsing embedded external references with custom properties +* #316 Fix socket extension key checking +* #317 Fixes checking of Indicator's pattern property based on pattern_version + 1.2.1 - 2019-10-16 * #301 Adds more detailed debugging semantic equivalence output diff --git a/docs/conf.py b/docs/conf.py index 2a10fbd..8b372d5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,4 +1,5 @@ import datetime +import json import os import re import sys @@ -7,6 +8,7 @@ from six import class_types from sphinx.ext.autodoc import ClassDocumenter from stix2.base import _STIXBase +from stix2.environment import WEIGHTS from stix2.version import __version__ sys.path.insert(0, os.path.abspath('..')) @@ -59,6 +61,14 @@ latex_documents = [ (master_doc, 'stix2.tex', 'stix2 Documentation', 'OASIS', 'manual'), ] +# Add a formatted version of environment.WEIGHTS +default_sem_eq_weights = json.dumps(WEIGHTS, indent=4, default=lambda o: o.__name__) +default_sem_eq_weights = 
default_sem_eq_weights.replace('\n', '\n ') +default_sem_eq_weights = default_sem_eq_weights.replace(' "', ' ') +default_sem_eq_weights = default_sem_eq_weights.replace('"\n', '\n') +with open('default_sem_eq_weights.rst', 'w') as f: + f.write(".. code-block:: py\n\n {}\n\n".format(default_sem_eq_weights)) + def get_property_type(prop): """Convert property classname into pretty string name of property. diff --git a/docs/guide/equivalence.ipynb b/docs/guide/equivalence.ipynb index 5db3464..05d0d99 100644 --- a/docs/guide/equivalence.ipynb +++ b/docs/guide/equivalence.ipynb @@ -62,8 +62,16 @@ "\n", "TODO: Add a link to the committee note when it is released.\n", "\n", + "There are a number of use cases for which calculating semantic equivalence may be helpful. It can be used for echo detection, in which a STIX producer who consumes content from other producers wants to make sure they are not creating content they have already seen or consuming content they have already created.\n", + "\n", + "Another use case for this functionality is to identify identical or near-identical content, such as a vulnerability shared under three different nicknames by three different STIX producers. A third use case involves a feed that aggregates data from multiple other sources. It will want to make sure that it is not publishing duplicate data.\n", + "\n", "Below we will show examples of the semantic equivalence results of various objects. Unless otherwise specified, the ID of each object will be generated by the library, so the two objects will not have the same ID. This demonstrates that the semantic equivalence algorithm only looks at specific properties for each object type.\n", "\n", + "**Please note** that you will need to install a few extra dependencies in order to use the semantic equivalence functions. 
You can do this using:\n", + "\n", + "```pip install stix2[semantic]```\n", + "\n", "### Attack Pattern Example\n", "\n", "For Attack Patterns, the only properties that contribute to semantic equivalence are `name` and `external_references`, with weights of 30 and 70, respectively. In this example, both attack patterns have the same external reference but the second has a slightly different yet still similar name." @@ -145,7 +153,7 @@ ".highlight .vg { color: #19177C } /* Name.Variable.Global */\n", ".highlight .vi { color: #19177C } /* Name.Variable.Instance */\n", ".highlight .vm { color: #19177C } /* Name.Variable.Magic */\n", - ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */
85.3\n",
+ ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */91.9\n",
"
\n"
],
"text/plain": [
@@ -270,7 +278,7 @@
".highlight .vg { color: #19177C } /* Name.Variable.Global */\n",
".highlight .vi { color: #19177C } /* Name.Variable.Instance */\n",
".highlight .vm { color: #19177C } /* Name.Variable.Magic */\n",
- ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */50.0\n",
+ ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */30.0\n",
"
\n"
],
"text/plain": [
@@ -773,7 +781,7 @@
"source": [
"### Threat Actor Example\n",
"\n",
- "For Threat Actors, the only properties that contribute to semantic equivalence are `threat_actor_types`, `name`, and `aliases`, with weights of 20, 60, and 20, respectively. In this example, the two threat actors have the same id properties but everything else is different. Since the id property does not factor into semantic equivalence, the result is not very high. The result is not zero because the algorithm is using the Jaro-Winkler distance between strings in the threat_actor_types and name properties."
+ "For Threat Actors, the only properties that contribute to semantic equivalence are `threat_actor_types`, `name`, and `aliases`, with weights of 20, 60, and 20, respectively. In this example, the two threat actors have the same id properties but everything else is different. Since the id property does not factor into semantic equivalence, the result is not very high. The result is not zero because of the \"Token Sort Ratio\" algorithm used to compare the `name` property."
]
},
{
@@ -854,7 +862,7 @@
".highlight .vg { color: #19177C } /* Name.Variable.Global */\n",
".highlight .vi { color: #19177C } /* Name.Variable.Instance */\n",
".highlight .vm { color: #19177C } /* Name.Variable.Magic */\n",
- ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */33.6\n",
+ ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */6.6000000000000005\n",
"
\n"
],
"text/plain": [
@@ -1119,7 +1127,7 @@
"source": [
"### Other Examples\n",
"\n",
- "Comparing objects of different types will result in an error."
+ "Comparing objects of different types will result in a `ValueError`."
]
},
{
@@ -1156,7 +1164,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "'report' type has no semantic equivalence method to call!\n"
+ "'report' type has no 'weights' dict specified & thus no semantic equivalence method to call!\n"
]
},
{
@@ -1295,7 +1303,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "You can optionally allow comparing across spec versions by providing a configuration dictionary like in the next example:"
+ "You can optionally allow comparing across spec versions by providing a configuration dictionary using `ignore_spec_version` like in the next example:"
]
},
{
@@ -1400,162 +1408,28 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "You can modify the weights or provide your own functions for comparing objects of a certain type by providing them in a dictionary to the optional 3rd parameter to the semantic equivalence function. You can find functions (like `partial_string_based`) to help with this in the [Environment API docs](../api/stix2.environment.rst#stix2.environment.Environment). In this example we define semantic equivalence for our new `x-foobar` object type:"
+ "### Detailed Results\n",
+ "\n",
+ "If your logging level is set to `DEBUG` or higher, the function will log more detailed results. These show the semantic equivalence and weighting for each property that is checked, to show how the final result was arrived at."
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "60.0\n",
- "
\n"
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 16,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "def _x_foobar_checks(obj1, obj2, **weights):\n",
- " matching_score = 0.0\n",
- " sum_weights = 0.0\n",
- " if stix2.environment.check_property_present(\"name\", obj1, obj2):\n",
- " w = weights[\"name\"]\n",
- " sum_weights += w\n",
- " matching_score += w * stix2.environment.partial_string_based(obj1[\"name\"], obj2[\"name\"])\n",
- " if stix2.environment.check_property_present(\"color\", obj1, obj2):\n",
- " w = weights[\"color\"]\n",
- " sum_weights += w\n",
- " matching_score += w * stix2.environment.partial_string_based(obj1[\"color\"], obj2[\"color\"])\n",
- " return matching_score, sum_weights\n",
- "\n",
- "weights = {\n",
- " \"x-foobar\": {\n",
- " \"name\": 60,\n",
- " \"color\": 40,\n",
- " \"method\": _x_foobar_checks,\n",
- " },\n",
- " \"_internal\": {\n",
- " \"ignore_spec_version\": False,\n",
- " },\n",
- "}\n",
- "foo1 = {\n",
- " \"type\":\"x-foobar\",\n",
- " \"id\":\"x-foobar--0c7b5b88-8ff7-4a4d-aa9d-feb398cd0061\",\n",
- " \"name\": \"Zot\",\n",
- " \"color\": \"red\",\n",
- "}\n",
- "foo2 = {\n",
- " \"type\":\"x-foobar\",\n",
- " \"id\":\"x-foobar--0c7b5b88-8ff7-4a4d-aa9d-feb398cd0061\",\n",
- " \"name\": \"Zot\",\n",
- " \"color\": \"blue\",\n",
- "}\n",
- "print(env.semantically_equivalent(foo1, foo2, **weights))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Detailed Results\n",
- "\n",
- "If your logging level is set to `DEBUG` or higher, the function will log more detailed results. These show the semantic equivalence and weighting for each property that is checked, to show how the final result was arrived at."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
- "Starting semantic equivalence process between: 'threat-actor--54dc2aac-6fde-4a68-ae2a-0c0bc575ed70' and 'threat-actor--c51bce3b-a067-4692-ab77-fcdefdd3f157'\n",
- "--\t\tpartial_string_based 'Evil Org' 'James Bond'\tresult: '0.56'\n",
- "'name' check -- weight: 60, contributing score: 33.6\n",
+ "Starting semantic equivalence process between: 'threat-actor--664624c7-394e-49ad-ae2a-12f7a48a54a3' and 'threat-actor--1d67719e-6be6-4194-9226-1685986514f5'\n",
+ "--\t\tpartial_string_based 'Evil Org' 'James Bond'\tresult: '11'\n",
+ "'name' check -- weight: 60, contributing score: 6.6\n",
"--\t\tpartial_list_based '['crime-syndicate']' '['spy']'\tresult: '0.0'\n",
"'threat_actor_types' check -- weight: 20, contributing score: 0.0\n",
"--\t\tpartial_list_based '['super-evil']' '['007']'\tresult: '0.0'\n",
"'aliases' check -- weight: 20, contributing score: 0.0\n",
- "Matching Score: 33.6, Sum of Weights: 100.0\n"
+ "Matching Score: 6.6, Sum of Weights: 100.0\n"
]
},
{
@@ -1629,14 +1503,14 @@
".highlight .vg { color: #19177C } /* Name.Variable.Global */\n",
".highlight .vi { color: #19177C } /* Name.Variable.Instance */\n",
".highlight .vm { color: #19177C } /* Name.Variable.Magic */\n",
- ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */33.6\n",
+ ".highlight .il { color: #666666 } /* Literal.Number.Integer.Long */6.6000000000000005\n",
"
\n"
],
"text/plain": [
""
]
},
- "execution_count": 17,
+ "execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
@@ -1657,7 +1531,1506 @@
" name=\"James Bond\",\n",
" aliases=[\"007\"],\n",
")\n",
- "print(env.semantically_equivalent(ta3, ta4))"
+ "print(env.semantically_equivalent(ta3, ta4))\n",
+ "\n",
+ "logger.setLevel(logging.ERROR)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "You can also retrieve the detailed results in a dictionary so the detailed results information can be accessed and used more programmatically. The [semantically_equivalent()](../api/stix2.environment.rst#stix2.environment.Environment.semantically_equivalent) function takes an optional third argument, called `prop_scores`. This argument should be a dictionary into which the detailed debugging information will be stored.\n",
+ "\n",
+ "Using `prop_scores` is simple: simply pass in a dictionary to `semantically_equivalent()`, and after the function is done executing, the dictionary will have the various scores in it. Specifically, it will have the overall `matching_score` and `sum_weights`, along with the weight and contributing score for each of the semantic equivalence contributing properties.\n",
+ "\n",
+ "For example:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "Semantic equivalence score using standard weights: 16.6\n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "text/html": [
+ "{'name': {'weight': 60, 'contributing_score': 6.6}, 'threat_actor_types': {'weight': 20, 'contributing_score': 10.0}, 'aliases': {'weight': 20, 'contributing_score': 0.0}, 'matching_score': 16.6, 'sum_weights': 100.0}\n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Prop: name | weight: 60 | contributing_score: 6.6\n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Prop: threat_actor_types | weight: 20 | contributing_score: 10.0\n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Prop: aliases | weight: 20 | contributing_score: 0.0\n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "text/html": [
+ "matching_score: 16.6\n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "text/html": [
+ "sum_weights: 100.0\n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ta5 = ThreatActor(\n",
+ " threat_actor_types=[\"crime-syndicate\", \"spy\"],\n",
+ " name=\"Evil Org\",\n",
+ " aliases=[\"super-evil\"],\n",
+ ")\n",
+ "ta6 = ThreatActor(\n",
+ " threat_actor_types=[\"spy\"],\n",
+ " name=\"James Bond\",\n",
+ " aliases=[\"007\"],\n",
+ ")\n",
+ "\n",
+ "prop_scores = {}\n",
+ "print(\"Semantic equivalence score using standard weights: %s\" % (env.semantically_equivalent(ta5, ta6, prop_scores)))\n",
+ "print(prop_scores)\n",
+ "for prop in prop_scores:\n",
+ " if prop not in [\"matching_score\", \"sum_weights\"]:\n",
+ " print (\"Prop: %s | weight: %s | contributing_score: %s\" % (prop, prop_scores[prop]['weight'], prop_scores[prop]['contributing_score']))\n",
+ " else:\n",
+ " print (\"%s: %s\" % (prop, prop_scores[prop]))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Custom Comparisons\n",
+ "If you wish, you can customize semantic equivalence comparisons. Specifically, you can do any of three things:\n",
+ " - Provide custom weights for each semantic equivalence contributing property\n",
+ " - Provide custom comparison functions for individual semantic equivalence contributing properties\n",
+ " - Provide a custom semantic equivalence function for a specific object type\n",
+ "\n",
+ "#### The `weights` dictionary\n",
+ "In order to do any of the aforementioned (*optional*) custom comparisons, you will need to provide a `weights` dictionary as the last parameter to the [semantically_equivalent()](../api/stix2.environment.rst#stix2.environment.Environment.semantically_equivalent) method call. \n",
+ "\n",
+ "The weights dictionary should contain both the weight and the comparison function for each property. You may use the default weights and functions, or provide your own.\n",
+ "\n",
+ "##### Existing comparison functions\n",
+ "For reference, here is a list of the comparison functions already built in the codebase (found in [stix2/environment.py](../api/stix2.environment.rst#stix2.environment.Environment)):\n",
+ "\n",
+ " - [custom_pattern_based](../api/stix2.environment.rst#stix2.environment.custom_pattern_based)\n",
+ " - [exact_match](../api/stix2.environment.rst#stix2.environment.exact_match)\n",
+ " - [partial_external_reference_based](../api/stix2.environment.rst#stix2.environment.partial_external_reference_based)\n",
+ " - [partial_list_based](../api/stix2.environment.rst#stix2.environment.partial_list_based)\n",
+ " - [partial_location_distance](../api/stix2.environment.rst#stix2.environment.partial_location_distance)\n",
+ " - [partial_string_based](../api/stix2.environment.rst#stix2.environment.partial_string_based)\n",
+ " - [partial_timestamp_based](../api/stix2.environment.rst#stix2.environment.partial_timestamp_based)\n",
+ "\n",
+ "For instance, if we wanted to compare two of the `ThreatActor`s from before, but use our own weights, then we could do the following:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "Using standard weights: 16.6\n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Using custom weights: 28.300000000000004\n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "weights = {\n",
+ " \"threat-actor\": { # You must specify the object type\n",
+ " \"name\": (30, stix2.environment.partial_string_based), # Each property's value must be a tuple\n",
+ " \"threat_actor_types\": (50, stix2.environment.partial_list_based), # The 1st component must be the weight\n",
+ " \"aliases\": (20, stix2.environment.partial_list_based) # The 2nd component must be the comparison function\n",
+ " }\n",
+ "}\n",
+ "\n",
+ "print(\"Using standard weights: %s\" % (env.semantically_equivalent(ta5, ta6)))\n",
+ "print(\"Using custom weights: %s\" % (env.semantically_equivalent(ta5, ta6, **weights)))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Notice how there is a difference in the semantic equivalence scores, simply due to the fact that custom weights were used.\n",
+ "\n",
+ "#### Custom Weights With prop_scores\n",
+    "If we want to use both `prop_scores` and `weights`, then they would be the third and fourth arguments, respectively, to `semantically_equivalent()`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "9.95"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "text/html": [
+ "{'name': {'weight': 45, 'contributing_score': 4.95}, 'threat_actor_types': {'weight': 10, 'contributing_score': 5.0}, 'aliases': {'weight': 45, 'contributing_score': 0.0}, 'matching_score': 9.95, 'sum_weights': 100.0}\n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "prop_scores = {}\n",
+ "weights = {\n",
+ " \"threat-actor\": {\n",
+ " \"name\": (45, stix2.environment.partial_string_based),\n",
+ " \"threat_actor_types\": (10, stix2.environment.partial_list_based),\n",
+ " \"aliases\": (45, stix2.environment.partial_list_based),\n",
+ " },\n",
+ "}\n",
+ "env.semantically_equivalent(ta5, ta6, prop_scores, **weights)\n",
+ "print(prop_scores)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Custom Semantic Equivalence Functions\n",
+ "You can also write and use your own semantic equivalence functions. In the examples above, you could replace the built-in comparison functions for any or all properties. For example, here we use a custom string comparison function just for the `'name'` property:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "Using custom string comparison: 5.0\n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "def my_string_compare(p1, p2):\n",
+ " if p1 == p2:\n",
+ " return 1\n",
+ " else:\n",
+ " return 0\n",
+ " \n",
+ "weights = {\n",
+ " \"threat-actor\": {\n",
+ " \"name\": (45, my_string_compare),\n",
+ " \"threat_actor_types\": (10, stix2.environment.partial_list_based),\n",
+ " \"aliases\": (45, stix2.environment.partial_list_based),\n",
+ " },\n",
+ "}\n",
+ "print(\"Using custom string comparison: %s\" % (env.semantically_equivalent(ta5, ta6, **weights)))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You can also customize the comparison of an entire object type instead of just how each property is compared. To do this, provide a `weights` dictionary to `semantically_equivalent()` and in this dictionary include a key of `\"method\"` whose value is your custom semantic equivalence function for that object type.\n",
+ "\n",
+ "If you provide your own custom semantic equivalence method, you **must also provide the weights for each of the properties** (unless, for some reason, your custom method is weights-agnostic). However, since you are writing the custom method, your weights need not necessarily follow the tuple format specified in the above code box.\n",
+ "\n",
+ "Note also that if you want detailed results with `prop_scores` you will need to implement that in your custom function, but you are not required to do so.\n",
+ "\n",
+ "In this next example we use our own custom semantic equivalence function to compare two `ThreatActor`s, and do not support `prop_scores`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "Using standard weights: 16.6\n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Using a custom method: 6.6000000000000005\n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "def custom_semantic_equivalence_method(obj1, obj2, **weights):\n",
+ " sum_weights = 0\n",
+ " matching_score = 0\n",
+ " # Compare name\n",
+ " w = weights['name']\n",
+ " sum_weights += w\n",
+ " contributing_score = w * stix2.environment.partial_string_based(obj1['name'], obj2['name'])\n",
+ " matching_score += contributing_score\n",
+ " # Compare aliases only for spies\n",
+ " if 'spy' in obj1['threat_actor_types'] + obj2['threat_actor_types']:\n",
+ " w = weights['aliases']\n",
+ " sum_weights += w\n",
+ " contributing_score = w * stix2.environment.partial_list_based(obj1['aliases'], obj2['aliases'])\n",
+ " matching_score += contributing_score\n",
+ " \n",
+ " return matching_score, sum_weights\n",
+ "\n",
+ "weights = {\n",
+ " \"threat-actor\": {\n",
+ " \"name\": 60,\n",
+ " \"aliases\": 40,\n",
+ " \"method\": custom_semantic_equivalence_method\n",
+ " }\n",
+ "}\n",
+ "\n",
+ "print(\"Using standard weights: %s\" % (env.semantically_equivalent(ta5, ta6)))\n",
+ "print(\"Using a custom method: %s\" % (env.semantically_equivalent(ta5, ta6, **weights)))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You can also write custom functions for comparing objects of your own custom types. Like in the previous example, you can use the built-in functions listed above to help with this, or write your own. In the following example we define semantic equivalence for our new `x-foobar` object type. Notice that this time we have included support for detailed results with `prop_scores`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "71.6\n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "text/html": [
+ "{'name': (60, 60.0), 'color': (40, 11.6), 'matching_score': 71.6, 'sum_weights': 100.0}\n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "def _x_foobar_checks(obj1, obj2, prop_scores, **weights):\n",
+ " matching_score = 0.0\n",
+ " sum_weights = 0.0\n",
+ " if stix2.environment.check_property_present(\"name\", obj1, obj2):\n",
+ " w = weights[\"name\"]\n",
+ " sum_weights += w\n",
+ " contributing_score = w * stix2.environment.partial_string_based(obj1[\"name\"], obj2[\"name\"])\n",
+ " matching_score += contributing_score\n",
+ " prop_scores[\"name\"] = (w, contributing_score)\n",
+ " if stix2.environment.check_property_present(\"color\", obj1, obj2):\n",
+ " w = weights[\"color\"]\n",
+ " sum_weights += w\n",
+ " contributing_score = w * stix2.environment.partial_string_based(obj1[\"color\"], obj2[\"color\"])\n",
+ " matching_score += contributing_score\n",
+ " prop_scores[\"color\"] = (w, contributing_score)\n",
+ " \n",
+ " prop_scores[\"matching_score\"] = matching_score\n",
+ " prop_scores[\"sum_weights\"] = sum_weights\n",
+ " return matching_score, sum_weights\n",
+ "\n",
+ "prop_scores = {}\n",
+ "weights = {\n",
+ " \"x-foobar\": {\n",
+ " \"name\": 60,\n",
+ " \"color\": 40,\n",
+ " \"method\": _x_foobar_checks,\n",
+ " },\n",
+ " \"_internal\": {\n",
+ " \"ignore_spec_version\": False,\n",
+ " },\n",
+ "}\n",
+ "foo1 = {\n",
+ " \"type\":\"x-foobar\",\n",
+ " \"id\":\"x-foobar--0c7b5b88-8ff7-4a4d-aa9d-feb398cd0061\",\n",
+ " \"name\": \"Zot\",\n",
+ " \"color\": \"red\",\n",
+ "}\n",
+ "foo2 = {\n",
+ " \"type\":\"x-foobar\",\n",
+ " \"id\":\"x-foobar--0c7b5b88-8ff7-4a4d-aa9d-feb398cd0061\",\n",
+ " \"name\": \"Zot\",\n",
+ " \"color\": \"blue\",\n",
+ "}\n",
+ "print(env.semantically_equivalent(foo1, foo2, prop_scores, **weights))\n",
+ "print(prop_scores)"
]
}
],
@@ -1677,7 +3050,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.3"
+ "version": "3.8.1"
}
},
"nbformat": 4,
diff --git a/setup.cfg b/setup.cfg
index bcf51b8..659a1cd 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 1.2.1
+current_version = 1.3.0
commit = True
tag = True
diff --git a/setup.py b/setup.py
index ea20795..66bf302 100644
--- a/setup.py
+++ b/setup.py
@@ -46,6 +46,7 @@ setup(
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
+ 'Programming Language :: Python :: 3.8',
],
keywords='stix stix2 json cti cyber threat intelligence',
packages=find_packages(exclude=['*.test', '*.test.*']),
@@ -64,6 +65,6 @@ setup(
},
extras_require={
'taxii': ['taxii2-client'],
- 'semantic': ['haversine', 'pyjarowinkler'],
+ 'semantic': ['haversine', 'fuzzywuzzy'],
},
)
diff --git a/stix2/base.py b/stix2/base.py
index 8beddca..5da0fbc 100644
--- a/stix2/base.py
+++ b/stix2/base.py
@@ -1,6 +1,5 @@
"""Base classes for type definitions in the STIX2 library."""
-import collections
import copy
import datetime as dt
import uuid
@@ -20,6 +19,12 @@ from .utils import NOW, find_property_index, format_datetime, get_timestamp
from .utils import new_version as _new_version
from .utils import revoke as _revoke
+try:
+ from collections.abc import Mapping
+except ImportError:
+ from collections import Mapping
+
+
__all__ = ['STIXJSONEncoder', '_STIXBase']
DEFAULT_ERROR = "{type} must have {property}='{expected}'."
@@ -68,7 +73,7 @@ def get_required_properties(properties):
return (k for k, v in properties.items() if v.required)
-class _STIXBase(collections.Mapping):
+class _STIXBase(Mapping):
"""Base class for STIX object types"""
def object_properties(self):
@@ -143,7 +148,7 @@ class _STIXBase(collections.Mapping):
def __init__(self, allow_custom=False, interoperability=False, **kwargs):
cls = self.__class__
- self.__allow_custom = allow_custom
+ self._allow_custom = allow_custom
self.__interoperability = interoperability
# Use the same timestamp for any auto-generated datetimes
@@ -153,12 +158,12 @@ class _STIXBase(collections.Mapping):
custom_props = kwargs.pop('custom_properties', {})
if custom_props and not isinstance(custom_props, dict):
raise ValueError("'custom_properties' must be a dictionary")
- if not self.__allow_custom:
+ if not self._allow_custom:
extra_kwargs = list(set(kwargs) - set(self._properties))
if extra_kwargs:
raise ExtraPropertiesError(cls, extra_kwargs)
if custom_props:
- self.__allow_custom = True
+ self._allow_custom = True
# Remove any keyword arguments whose value is None or [] (i.e. empty list)
setting_kwargs = {}
@@ -236,7 +241,7 @@ class _STIXBase(collections.Mapping):
if isinstance(self, _Observable):
# Assume: valid references in the original object are still valid in the new version
new_inner['_valid_refs'] = {'*': '*'}
- new_inner['allow_custom'] = self.__allow_custom
+ new_inner['allow_custom'] = self._allow_custom
new_inner['interoperability'] = self.__interoperability
return cls(**new_inner)
@@ -308,7 +313,7 @@ class _Observable(_STIXBase):
# the constructor might be called independently of an observed data object
self._STIXBase__valid_refs = kwargs.pop('_valid_refs', [])
- self.__allow_custom = kwargs.get('allow_custom', False)
+ self._allow_custom = kwargs.get('allow_custom', False)
self._properties['extensions'].allow_custom = kwargs.get('allow_custom', False)
try:
diff --git a/stix2/environment.py b/stix2/environment.py
index 34e0a04..ada5f33 100644
--- a/stix2/environment.py
+++ b/stix2/environment.py
@@ -193,7 +193,7 @@ class Environment(DataStoreMixin):
return None
@staticmethod
- def semantically_equivalent(obj1, obj2, **weight_dict):
+ def semantically_equivalent(obj1, obj2, prop_scores={}, **weight_dict):
"""This method is meant to verify if two objects of the same type are
semantically equivalent.
@@ -210,68 +210,17 @@ class Environment(DataStoreMixin):
Course of Action, Intrusion-Set, Observed-Data, Report are not supported
by this implementation. Indicator pattern check is also limited.
+ Note:
+ Default weights_dict:
+
+ .. include:: ../default_sem_eq_weights.rst
+
Note:
This implementation follows the Committee Note on semantic equivalence.
see `the Committee Note `__.
"""
- # default weights used for the semantic equivalence process
- weights = {
- "attack-pattern": {
- "name": 30,
- "external_references": 70,
- "method": _attack_pattern_checks,
- },
- "campaign": {
- "name": 60,
- "aliases": 40,
- "method": _campaign_checks,
- },
- "identity": {
- "name": 60,
- "identity_class": 20,
- "sectors": 20,
- "method": _identity_checks,
- },
- "indicator": {
- "indicator_types": 15,
- "pattern": 80,
- "valid_from": 5,
- "tdelta": 1, # One day interval
- "method": _indicator_checks,
- },
- "location": {
- "longitude_latitude": 34,
- "region": 33,
- "country": 33,
- "threshold": 1000.0,
- "method": _location_checks,
- },
- "malware": {
- "malware_types": 20,
- "name": 80,
- "method": _malware_checks,
- },
- "threat-actor": {
- "name": 60,
- "threat_actor_types": 20,
- "aliases": 20,
- "method": _threat_actor_checks,
- },
- "tool": {
- "tool_types": 20,
- "name": 80,
- "method": _tool_checks,
- },
- "vulnerability": {
- "name": 30,
- "external_references": 70,
- "method": _vulnerability_checks,
- },
- "_internal": {
- "ignore_spec_version": False,
- },
- }
+ weights = WEIGHTS.copy()
if weight_dict:
weights.update(weight_dict)
@@ -286,17 +235,54 @@ class Environment(DataStoreMixin):
raise ValueError('The objects to compare must be of the same spec version!')
try:
- method = weights[type1]["method"]
+ weights[type1]
except KeyError:
- logger.warning("'%s' type has no semantic equivalence method to call!", type1)
+ logger.warning("'%s' type has no 'weights' dict specified & thus no semantic equivalence method to call!", type1)
sum_weights = matching_score = 0
else:
- logger.debug("Starting semantic equivalence process between: '%s' and '%s'", obj1["id"], obj2["id"])
- matching_score, sum_weights = method(obj1, obj2, **weights[type1])
+ try:
+ method = weights[type1]["method"]
+ except KeyError:
+ logger.debug("Starting semantic equivalence process between: '%s' and '%s'", obj1["id"], obj2["id"])
+ matching_score = 0.0
+ sum_weights = 0.0
+
+ for prop in weights[type1]:
+ if check_property_present(prop, obj1, obj2) or prop == "longitude_latitude":
+ w = weights[type1][prop][0]
+ comp_funct = weights[type1][prop][1]
+
+ if comp_funct == partial_timestamp_based:
+ contributing_score = w * comp_funct(obj1[prop], obj2[prop], weights[type1]["tdelta"])
+ elif comp_funct == partial_location_distance:
+ threshold = weights[type1]["threshold"]
+ contributing_score = w * comp_funct(obj1["latitude"], obj1["longitude"], obj2["latitude"], obj2["longitude"], threshold)
+ else:
+ contributing_score = w * comp_funct(obj1[prop], obj2[prop])
+
+ sum_weights += w
+ matching_score += contributing_score
+
+ prop_scores[prop] = {
+ "weight": w,
+ "contributing_score": contributing_score,
+ }
+ logger.debug("'%s' check -- weight: %s, contributing score: %s", prop, w, contributing_score)
+
+ prop_scores["matching_score"] = matching_score
+ prop_scores["sum_weights"] = sum_weights
+ logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights)
+ else:
+ logger.debug("Starting semantic equivalence process between: '%s' and '%s'", obj1["id"], obj2["id"])
+ try:
+ matching_score, sum_weights = method(obj1, obj2, prop_scores, **weights[type1])
+ except TypeError:
+ # method doesn't support detailed output with prop_scores
+ matching_score, sum_weights = method(obj1, obj2, **weights[type1])
+ logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights)
if sum_weights <= 0:
return 0
-
equivalence_score = (matching_score / sum_weights) * 100.0
return equivalence_score
@@ -377,10 +363,10 @@ def partial_string_based(str1, str2):
float: Number between 0.0 and 1.0 depending on match criteria.
"""
- from pyjarowinkler import distance
- result = distance.get_jaro_distance(str1, str2)
+ from fuzzywuzzy import fuzz
+ result = fuzz.token_sort_ratio(str1, str2, force_ascii=False)
logger.debug("--\t\tpartial_string_based '%s' '%s'\tresult: '%s'", str1, str2, result)
- return result
+ return result / 100.0
def custom_pattern_based(pattern1, pattern2):
@@ -485,207 +471,51 @@ def partial_location_distance(lat1, long1, lat2, long2, threshold):
return result
-def _attack_pattern_checks(obj1, obj2, **weights):
- matching_score = 0.0
- sum_weights = 0.0
- if check_property_present("name", obj1, obj2):
- w = weights["name"]
- contributing_score = w * partial_string_based(obj1["name"], obj2["name"])
- sum_weights += w
- matching_score += contributing_score
- logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score)
- if check_property_present("external_references", obj1, obj2):
- w = weights["external_references"]
- contributing_score = (
- w * partial_external_reference_based(obj1["external_references"], obj2["external_references"])
- )
- sum_weights += w
- matching_score += contributing_score
- logger.debug("'external_references' check -- weight: %s, contributing score: %s", w, contributing_score)
- logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights)
- return matching_score, sum_weights
-
-
-def _campaign_checks(obj1, obj2, **weights):
- matching_score = 0.0
- sum_weights = 0.0
- if check_property_present("name", obj1, obj2):
- w = weights["name"]
- contributing_score = w * partial_string_based(obj1["name"], obj2["name"])
- sum_weights += w
- matching_score += contributing_score
- logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score)
- if check_property_present("aliases", obj1, obj2):
- w = weights["aliases"]
- contributing_score = w * partial_list_based(obj1["aliases"], obj2["aliases"])
- sum_weights += w
- matching_score += contributing_score
- logger.debug("'aliases' check -- weight: %s, contributing score: %s", w, contributing_score)
- logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights)
- return matching_score, sum_weights
-
-
-def _identity_checks(obj1, obj2, **weights):
- matching_score = 0.0
- sum_weights = 0.0
- if check_property_present("name", obj1, obj2):
- w = weights["name"]
- contributing_score = w * exact_match(obj1["name"], obj2["name"])
- sum_weights += w
- matching_score += contributing_score
- logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score)
- if check_property_present("identity_class", obj1, obj2):
- w = weights["identity_class"]
- contributing_score = w * exact_match(obj1["identity_class"], obj2["identity_class"])
- sum_weights += w
- matching_score += contributing_score
- logger.debug("'identity_class' check -- weight: %s, contributing score: %s", w, contributing_score)
- if check_property_present("sectors", obj1, obj2):
- w = weights["sectors"]
- contributing_score = w * partial_list_based(obj1["sectors"], obj2["sectors"])
- sum_weights += w
- matching_score += contributing_score
- logger.debug("'sectors' check -- weight: %s, contributing score: %s", w, contributing_score)
- logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights)
- return matching_score, sum_weights
-
-
-def _indicator_checks(obj1, obj2, **weights):
- matching_score = 0.0
- sum_weights = 0.0
- if check_property_present("indicator_types", obj1, obj2):
- w = weights["indicator_types"]
- contributing_score = w * partial_list_based(obj1["indicator_types"], obj2["indicator_types"])
- sum_weights += w
- matching_score += contributing_score
- logger.debug("'indicator_types' check -- weight: %s, contributing score: %s", w, contributing_score)
- if check_property_present("pattern", obj1, obj2):
- w = weights["pattern"]
- contributing_score = w * custom_pattern_based(obj1["pattern"], obj2["pattern"])
- sum_weights += w
- matching_score += contributing_score
- logger.debug("'pattern' check -- weight: %s, contributing score: %s", w, contributing_score)
- if check_property_present("valid_from", obj1, obj2):
- w = weights["valid_from"]
- contributing_score = (
- w *
- partial_timestamp_based(obj1["valid_from"], obj2["valid_from"], weights["tdelta"])
- )
- sum_weights += w
- matching_score += contributing_score
- logger.debug("'valid_from' check -- weight: %s, contributing score: %s", w, contributing_score)
- logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights)
- return matching_score, sum_weights
-
-
-def _location_checks(obj1, obj2, **weights):
- matching_score = 0.0
- sum_weights = 0.0
- if check_property_present("latitude", obj1, obj2) and check_property_present("longitude", obj1, obj2):
- w = weights["longitude_latitude"]
- contributing_score = (
- w *
- partial_location_distance(obj1["latitude"], obj1["longitude"], obj2["latitude"], obj2["longitude"], weights["threshold"])
- )
- sum_weights += w
- matching_score += contributing_score
- logger.debug("'longitude_latitude' check -- weight: %s, contributing score: %s", w, contributing_score)
- if check_property_present("region", obj1, obj2):
- w = weights["region"]
- contributing_score = w * exact_match(obj1["region"], obj2["region"])
- sum_weights += w
- matching_score += contributing_score
- logger.debug("'region' check -- weight: %s, contributing score: %s", w, contributing_score)
- if check_property_present("country", obj1, obj2):
- w = weights["country"]
- contributing_score = w * exact_match(obj1["country"], obj2["country"])
- sum_weights += w
- matching_score += contributing_score
- logger.debug("'country' check -- weight: %s, contributing score: %s", w, contributing_score)
- logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights)
- return matching_score, sum_weights
-
-
-def _malware_checks(obj1, obj2, **weights):
- matching_score = 0.0
- sum_weights = 0.0
- if check_property_present("malware_types", obj1, obj2):
- w = weights["malware_types"]
- contributing_score = w * partial_list_based(obj1["malware_types"], obj2["malware_types"])
- sum_weights += w
- matching_score += contributing_score
- logger.debug("'malware_types' check -- weight: %s, contributing score: %s", w, contributing_score)
- if check_property_present("name", obj1, obj2):
- w = weights["name"]
- contributing_score = w * partial_string_based(obj1["name"], obj2["name"])
- sum_weights += w
- matching_score += contributing_score
- logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score)
- logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights)
- return matching_score, sum_weights
-
-
-def _threat_actor_checks(obj1, obj2, **weights):
- matching_score = 0.0
- sum_weights = 0.0
- if check_property_present("name", obj1, obj2):
- w = weights["name"]
- contributing_score = w * partial_string_based(obj1["name"], obj2["name"])
- sum_weights += w
- matching_score += contributing_score
- logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score)
- if check_property_present("threat_actor_types", obj1, obj2):
- w = weights["threat_actor_types"]
- contributing_score = w * partial_list_based(obj1["threat_actor_types"], obj2["threat_actor_types"])
- sum_weights += w
- matching_score += contributing_score
- logger.debug("'threat_actor_types' check -- weight: %s, contributing score: %s", w, contributing_score)
- if check_property_present("aliases", obj1, obj2):
- w = weights["aliases"]
- contributing_score = w * partial_list_based(obj1["aliases"], obj2["aliases"])
- sum_weights += w
- matching_score += contributing_score
- logger.debug("'aliases' check -- weight: %s, contributing score: %s", w, contributing_score)
- logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights)
- return matching_score, sum_weights
-
-
-def _tool_checks(obj1, obj2, **weights):
- matching_score = 0.0
- sum_weights = 0.0
- if check_property_present("tool_types", obj1, obj2):
- w = weights["tool_types"]
- contributing_score = w * partial_list_based(obj1["tool_types"], obj2["tool_types"])
- sum_weights += w
- matching_score += contributing_score
- logger.debug("'tool_types' check -- weight: %s, contributing score: %s", w, contributing_score)
- if check_property_present("name", obj1, obj2):
- w = weights["name"]
- contributing_score = w * partial_string_based(obj1["name"], obj2["name"])
- sum_weights += w
- matching_score += contributing_score
- logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score)
- logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights)
- return matching_score, sum_weights
-
-
-def _vulnerability_checks(obj1, obj2, **weights):
- matching_score = 0.0
- sum_weights = 0.0
- if check_property_present("name", obj1, obj2):
- w = weights["name"]
- contributing_score = w * partial_string_based(obj1["name"], obj2["name"])
- sum_weights += w
- matching_score += contributing_score
- logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score)
- if check_property_present("external_references", obj1, obj2):
- w = weights["external_references"]
- contributing_score = w * partial_external_reference_based(
- obj1["external_references"],
- obj2["external_references"],
- )
- sum_weights += w
- matching_score += contributing_score
- logger.debug("'external_references' check -- weight: %s, contributing score: %s", w, contributing_score)
- logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights)
- return matching_score, sum_weights
+# default weights used for the semantic equivalence process
+WEIGHTS = {
+ "attack-pattern": {
+ "name": (30, partial_string_based),
+ "external_references": (70, partial_external_reference_based),
+ },
+ "campaign": {
+ "name": (60, partial_string_based),
+ "aliases": (40, partial_list_based),
+ },
+ "identity": {
+ "name": (60, partial_string_based),
+ "identity_class": (20, exact_match),
+ "sectors": (20, partial_list_based),
+ },
+ "indicator": {
+ "indicator_types": (15, partial_list_based),
+ "pattern": (80, custom_pattern_based),
+ "valid_from": (5, partial_timestamp_based),
+ "tdelta": 1, # One day interval
+ },
+ "location": {
+ "longitude_latitude": (34, partial_location_distance),
+ "region": (33, exact_match),
+ "country": (33, exact_match),
+ "threshold": 1000.0,
+ },
+ "malware": {
+ "malware_types": (20, partial_list_based),
+ "name": (80, partial_string_based),
+ },
+ "threat-actor": {
+ "name": (60, partial_string_based),
+ "threat_actor_types": (20, partial_list_based),
+ "aliases": (20, partial_list_based),
+ },
+ "tool": {
+ "tool_types": (20, partial_list_based),
+ "name": (80, partial_string_based),
+ },
+ "vulnerability": {
+ "name": (30, partial_string_based),
+ "external_references": (70, partial_external_reference_based),
+ },
+ "_internal": {
+ "ignore_spec_version": False,
+ },
+} #: :autodoc-skip:
diff --git a/stix2/properties.py b/stix2/properties.py
index 3b4ac07..7024e50 100644
--- a/stix2/properties.py
+++ b/stix2/properties.py
@@ -2,14 +2,12 @@
import base64
import binascii
-import collections
import copy
import inspect
import re
import uuid
from six import string_types, text_type
-from stix2patterns.validator import run_validator
import stix2
@@ -27,6 +25,11 @@ ID_REGEX_interoperability = re.compile(r"[0-9a-fA-F]{8}-"
"[0-9a-fA-F]{4}-"
"[0-9a-fA-F]{12}$")
+try:
+ from collections.abc import Mapping
+except ImportError:
+ from collections import Mapping
+
ERROR_INVALID_ID = (
"not a valid STIX identifier, must match --: {}"
)
@@ -208,8 +211,13 @@ class ListProperty(Property):
else:
obj_type = self.contained
- if isinstance(valid, collections.Mapping):
- result.append(obj_type(**valid))
+ if isinstance(valid, Mapping):
+ try:
+ valid._allow_custom
+ except AttributeError:
+ result.append(obj_type(**valid))
+ else:
+ result.append(obj_type(allow_custom=True, **valid))
else:
result.append(obj_type(valid))
@@ -403,7 +411,7 @@ class HashesProperty(DictionaryProperty):
def clean(self, value):
clean_dict = super(HashesProperty, self).clean(value)
- for k, v in clean_dict.items():
+ for k, v in copy.deepcopy(clean_dict).items():
key = k.upper().replace('-', '')
if key in HASHES_REGEX:
vocab_key = HASHES_REGEX[key][1]
@@ -562,14 +570,7 @@ class EnumProperty(StringProperty):
class PatternProperty(StringProperty):
-
- def clean(self, value):
- cleaned_value = super(PatternProperty, self).clean(value)
- errors = run_validator(cleaned_value)
- if errors:
- raise ValueError(str(errors[0]))
-
- return cleaned_value
+ pass
class ObservableProperty(Property):
diff --git a/stix2/test/v20/test_indicator.py b/stix2/test/v20/test_indicator.py
index b2836e5..1ae33ec 100644
--- a/stix2/test/v20/test_indicator.py
+++ b/stix2/test/v20/test_indicator.py
@@ -192,3 +192,23 @@ def test_invalid_indicator_pattern():
assert excinfo.value.cls == stix2.v20.Indicator
assert excinfo.value.prop_name == 'pattern'
assert 'mismatched input' in excinfo.value.reason
+
+
+def test_indicator_stix21_invalid_pattern():
+ now = dt.datetime(2017, 1, 1, 0, 0, 1, tzinfo=pytz.utc)
+ epoch = dt.datetime(1970, 1, 1, 0, 0, 1, tzinfo=pytz.utc)
+ patrn = "[EXISTS windows-registry-key:values]"
+
+ with pytest.raises(stix2.exceptions.InvalidValueError) as excinfo:
+ stix2.v20.Indicator(
+ type="indicator",
+ id=INDICATOR_ID,
+ created=now,
+ modified=now,
+ pattern=patrn,
+ valid_from=epoch,
+ labels=["malicious-activity"],
+ )
+
+ assert excinfo.value.cls == stix2.v20.Indicator
+ assert "FAIL: Error found at line 1:8. no viable alternative at input 'EXISTS" in str(excinfo.value)
diff --git a/stix2/test/v21/test_environment.py b/stix2/test/v21/test_environment.py
index d057df5..fd4798f 100644
--- a/stix2/test/v21/test_environment.py
+++ b/stix2/test/v21/test_environment.py
@@ -521,7 +521,7 @@ def test_semantic_equivalence_on_same_vulnerability2():
],
)
VULN_KWARGS2 = dict(
- name="Zot",
+ name="Foo",
external_references=[
{
"url": "https://example2",
@@ -550,7 +550,7 @@ def test_semantic_equivalence_on_unknown_object():
CUSTOM_KWARGS2 = dict(
type="x-foobar",
id="x-foobar--0c7b5b88-8ff7-4a4d-aa9d-feb398cd0061",
- name="Zot",
+ name="Foo",
external_references=[
{
"url": "https://example2",
@@ -622,11 +622,10 @@ def test_semantic_equivalence_zero_match():
)
weights = {
"indicator": {
- "indicator_types": 15,
- "pattern": 80,
- "valid_from": 0,
+ "indicator_types": (15, stix2.environment.partial_list_based),
+ "pattern": (80, stix2.environment.custom_pattern_based),
+ "valid_from": (5, stix2.environment.partial_timestamp_based),
"tdelta": 1, # One day interval
- "method": stix2.environment._indicator_checks,
},
"_internal": {
"ignore_spec_version": False,
@@ -645,11 +644,10 @@ def test_semantic_equivalence_different_spec_version():
)
weights = {
"indicator": {
- "indicator_types": 15,
- "pattern": 80,
- "valid_from": 0,
+ "indicator_types": (15, stix2.environment.partial_list_based),
+ "pattern": (80, stix2.environment.custom_pattern_based),
+ "valid_from": (5, stix2.environment.partial_timestamp_based),
"tdelta": 1, # One day interval
- "method": stix2.environment._indicator_checks,
},
"_internal": {
"ignore_spec_version": True, # Disables spec_version check.
@@ -750,3 +748,75 @@ def test_non_existent_config_for_object():
r1 = stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS)
r2 = stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS)
assert stix2.Environment().semantically_equivalent(r1, r2) == 0.0
+
+
+def custom_semantic_equivalence_method(obj1, obj2, **weights):
+ return 96.0, 100.0
+
+
+def test_semantic_equivalence_method_provided():
+ # Because `method` is provided, `partial_list_based` will be ignored
+ TOOL2_KWARGS = dict(
+ name="Random Software",
+ tool_types=["information-gathering"],
+ )
+
+ weights = {
+ "tool": {
+ "tool_types": (20, stix2.environment.partial_list_based),
+ "name": (80, stix2.environment.partial_string_based),
+ "method": custom_semantic_equivalence_method,
+ },
+ }
+
+ tool1 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS)
+ tool2 = stix2.v21.Tool(id=TOOL_ID, **TOOL2_KWARGS)
+ env = stix2.Environment().semantically_equivalent(tool1, tool2, **weights)
+ assert round(env) == 96
+
+
+def test_semantic_equivalence_prop_scores():
+ TOOL2_KWARGS = dict(
+ name="Random Software",
+ tool_types=["information-gathering"],
+ )
+
+ prop_scores = {}
+
+ tool1 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS)
+ tool2 = stix2.v21.Tool(id=TOOL_ID, **TOOL2_KWARGS)
+ stix2.Environment().semantically_equivalent(tool1, tool2, prop_scores)
+ assert len(prop_scores) == 4
+ assert round(prop_scores["matching_score"], 1) == 8.8
+ assert round(prop_scores["sum_weights"], 1) == 100.0
+
+
+def custom_semantic_equivalence_method_prop_scores(obj1, obj2, prop_scores, **weights):
+ prop_scores["matching_score"] = 96.0
+ prop_scores["sum_weights"] = 100.0
+ return 96.0, 100.0
+
+
+def test_semantic_equivalence_prop_scores_method_provided():
+ TOOL2_KWARGS = dict(
+ name="Random Software",
+ tool_types=["information-gathering"],
+ )
+
+ weights = {
+ "tool": {
+ "tool_types": 20,
+ "name": 80,
+ "method": custom_semantic_equivalence_method_prop_scores,
+ },
+ }
+
+ prop_scores = {}
+
+ tool1 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS)
+ tool2 = stix2.v21.Tool(id=TOOL_ID, **TOOL2_KWARGS)
+ env = stix2.Environment().semantically_equivalent(tool1, tool2, prop_scores, **weights)
+ assert round(env) == 96
+ assert len(prop_scores) == 2
+ assert prop_scores["matching_score"] == 96.0
+ assert prop_scores["sum_weights"] == 100.0
diff --git a/stix2/test/v21/test_indicator.py b/stix2/test/v21/test_indicator.py
index 0562dfd..152f253 100644
--- a/stix2/test/v21/test_indicator.py
+++ b/stix2/test/v21/test_indicator.py
@@ -207,3 +207,86 @@ def test_invalid_indicator_pattern():
assert excinfo.value.cls == stix2.v21.Indicator
assert excinfo.value.prop_name == 'pattern'
assert 'mismatched input' in excinfo.value.reason
+
+
+def test_indicator_with_custom_embedded_objs():
+ now = dt.datetime(2017, 1, 1, 0, 0, 1, tzinfo=pytz.utc)
+ epoch = dt.datetime(1970, 1, 1, 0, 0, 1, tzinfo=pytz.utc)
+
+ ext_ref = stix2.v21.ExternalReference(
+ source_name="Test",
+ description="Example Custom Ext Ref",
+ random_custom_prop="This is a custom property",
+ allow_custom=True,
+ )
+
+ ind = stix2.v21.Indicator(
+ type="indicator",
+ id=INDICATOR_ID,
+ created=now,
+ modified=now,
+ pattern="[file:hashes.MD5 = 'd41d8cd98f00b204e9800998ecf8427e']",
+ pattern_type="stix",
+ valid_from=epoch,
+ indicator_types=['malicious-activity'],
+ external_references=[ext_ref],
+ )
+
+ assert ind.indicator_types == ['malicious-activity']
+ assert len(ind.external_references) == 1
+ assert ind.external_references[0] == ext_ref
+
+
+def test_indicator_with_custom_embed_objs_extra_props_error():
+ ext_ref = stix2.v21.ExternalReference(
+ source_name="Test",
+ description="Example Custom Ext Ref",
+ random_custom_prop="This is a custom property",
+ allow_custom=True,
+ )
+
+ with pytest.raises(stix2.exceptions.ExtraPropertiesError) as excinfo:
+ stix2.v21.Indicator(external_references=[ext_ref], bad_custom_prop="shouldn't be here", **INDICATOR_KWARGS)
+
+ assert excinfo.value.cls == stix2.v21.Indicator
+ assert excinfo.value.properties == ['bad_custom_prop']
+ assert str(excinfo.value) == "Unexpected properties for Indicator: (bad_custom_prop)."
+
+
+def test_indicator_stix20_invalid_pattern():
+ now = dt.datetime(2017, 1, 1, 0, 0, 1, tzinfo=pytz.utc)
+ epoch = dt.datetime(1970, 1, 1, 0, 0, 1, tzinfo=pytz.utc)
+ patrn = "[win-registry-key:key = 'hkey_local_machine\\\\foo\\\\bar'] WITHIN 5 SECONDS WITHIN 6 SECONDS"
+
+ with pytest.raises(stix2.exceptions.InvalidValueError) as excinfo:
+ stix2.v21.Indicator(
+ type="indicator",
+ id=INDICATOR_ID,
+ created=now,
+ modified=now,
+ pattern=patrn,
+ pattern_type="stix",
+ valid_from=epoch,
+ indicator_types=['malicious-activity'],
+ )
+
+ assert excinfo.value.cls == stix2.v21.Indicator
+ assert "FAIL: The same qualifier is used more than once" in str(excinfo.value)
+
+ ind = stix2.v21.Indicator(
+ type="indicator",
+ id=INDICATOR_ID,
+ created=now,
+ modified=now,
+ pattern=patrn,
+ pattern_type="stix",
+ pattern_version="2.0",
+ valid_from=epoch,
+ indicator_types=['malicious-activity'],
+ )
+
+ assert ind.id == INDICATOR_ID
+ assert ind.indicator_types == ['malicious-activity']
+ assert ind.pattern == patrn
+ assert ind.pattern_type == "stix"
+ assert ind.pattern_version == "2.0"
diff --git a/stix2/test/v21/test_observed_data.py b/stix2/test/v21/test_observed_data.py
index 32bd0bf..c1f15cd 100644
--- a/stix2/test/v21/test_observed_data.py
+++ b/stix2/test/v21/test_observed_data.py
@@ -1117,6 +1117,20 @@ def test_network_traffic_socket_example():
assert nt.extensions['socket-ext'].socket_type == "SOCK_STREAM"
+def test_correct_socket_options():
+ se1 = stix2.v21.SocketExt(
+ is_listening=True,
+ address_family="AF_INET",
+ protocol_family="PF_INET",
+ socket_type="SOCK_STREAM",
+ options={"ICMP6_RCVTIMEO": 100},
+ )
+
+ assert se1.address_family == "AF_INET"
+ assert se1.socket_type == "SOCK_STREAM"
+ assert se1.options == {"ICMP6_RCVTIMEO": 100}
+
+
def test_incorrect_socket_options():
with pytest.raises(ValueError) as excinfo:
stix2.v21.SocketExt(
diff --git a/stix2/v20/bundle.py b/stix2/v20/bundle.py
index d42ba8f..22cc26d 100644
--- a/stix2/v20/bundle.py
+++ b/stix2/v20/bundle.py
@@ -32,7 +32,7 @@ class Bundle(_STIXBase):
kwargs['objects'] = list(args) + kwargs.get('objects', [])
allow_custom = kwargs.get('allow_custom', False)
- self.__allow_custom = allow_custom
+ self._allow_custom = allow_custom
self._properties['objects'].contained.allow_custom = allow_custom
interoperability = kwargs.get('interoperability', False)
self.__interoperability = interoperability
diff --git a/stix2/v20/sdo.py b/stix2/v20/sdo.py
index 50f0fd0..6769873 100644
--- a/stix2/v20/sdo.py
+++ b/stix2/v20/sdo.py
@@ -3,8 +3,11 @@
from collections import OrderedDict
import itertools
+from stix2patterns.validator import run_validator
+
from ..core import STIXDomainObject
from ..custom import _custom_object_builder
+from ..exceptions import InvalidValueError
from ..properties import (
BooleanProperty, IDProperty, IntegerProperty, ListProperty,
ObservableProperty, PatternProperty, ReferenceProperty, StringProperty,
@@ -135,6 +138,11 @@ class Indicator(STIXDomainObject):
('granular_markings', ListProperty(GranularMarking)),
])
+ def _check_object_constraints(self):
+ errors = run_validator(self.get('pattern'), '2.0')
+ if errors:
+ raise InvalidValueError(self.__class__, 'pattern', str(errors[0]))
+
class IntrusionSet(STIXDomainObject):
"""For more detailed information on this object's properties, see
@@ -212,7 +220,7 @@ class ObservedData(STIXDomainObject):
])
def __init__(self, *args, **kwargs):
- self.__allow_custom = kwargs.get('allow_custom', False)
+ self._allow_custom = kwargs.get('allow_custom', False)
self._properties['objects'].allow_custom = kwargs.get('allow_custom', False)
super(ObservedData, self).__init__(*args, **kwargs)
diff --git a/stix2/v21/bundle.py b/stix2/v21/bundle.py
index de1dbcb..784608a 100644
--- a/stix2/v21/bundle.py
+++ b/stix2/v21/bundle.py
@@ -30,7 +30,7 @@ class Bundle(_STIXBase):
kwargs['objects'] = list(args) + kwargs.get('objects', [])
allow_custom = kwargs.get('allow_custom', False)
- self.__allow_custom = allow_custom
+ self._allow_custom = allow_custom
self._properties['objects'].contained.allow_custom = allow_custom
interoperability = kwargs.get('interoperability', False)
self.__interoperability = interoperability
diff --git a/stix2/v21/observables.py b/stix2/v21/observables.py
index b08869e..6f53255 100644
--- a/stix2/v21/observables.py
+++ b/stix2/v21/observables.py
@@ -598,8 +598,9 @@ class SocketExt(_Extension):
options = self.get('options')
if options is not None:
+ acceptable_prefixes = ["SO_", "ICMP_", "ICMP6_", "IP_", "IPV6_", "MCAST_", "TCP_", "IRLMP_"]
for key, val in options.items():
- if key[:3] != "SO_":
+ if key[:key.find('_') + 1] not in acceptable_prefixes:
raise ValueError("Incorrect options key")
if not isinstance(val, int):
raise ValueError("Options value must be an integer")
diff --git a/stix2/v21/sdo.py b/stix2/v21/sdo.py
index 797c952..c8aacd1 100644
--- a/stix2/v21/sdo.py
+++ b/stix2/v21/sdo.py
@@ -5,10 +5,13 @@ import itertools
import warnings
from six.moves.urllib.parse import quote_plus
+from stix2patterns.validator import run_validator
from ..core import STIXDomainObject
from ..custom import _custom_object_builder
-from ..exceptions import PropertyPresenceError, STIXDeprecationWarning
+from ..exceptions import (
+ InvalidValueError, PropertyPresenceError, STIXDeprecationWarning,
+)
from ..properties import (
BinaryProperty, BooleanProperty, EmbeddedObjectProperty, EnumProperty,
FloatProperty, IDProperty, IntegerProperty, ListProperty,
@@ -232,6 +235,16 @@ class Indicator(STIXDomainObject):
msg = "{0.id} 'valid_until' must be greater than 'valid_from'"
raise ValueError(msg.format(self))
+ if self.get('pattern_type') == "stix":
+ try:
+ pat_ver = self.get('pattern_version')
+ except AttributeError:
+ pat_ver = '2.1'
+
+ errors = run_validator(self.get('pattern'), pat_ver)
+ if errors:
+ raise InvalidValueError(self.__class__, 'pattern', str(errors[0]))
+
class Infrastructure(STIXDomainObject):
# TODO: Add link
@@ -578,7 +591,7 @@ class ObservedData(STIXDomainObject):
])
def __init__(self, *args, **kwargs):
- self.__allow_custom = kwargs.get('allow_custom', False)
+ self._allow_custom = kwargs.get('allow_custom', False)
self._properties['objects'].allow_custom = kwargs.get('allow_custom', False)
if "objects" in kwargs:
diff --git a/stix2/version.py b/stix2/version.py
index a955fda..67bc602 100644
--- a/stix2/version.py
+++ b/stix2/version.py
@@ -1 +1 @@
-__version__ = "1.2.1"
+__version__ = "1.3.0"
diff --git a/stix2/workbench.py b/stix2/workbench.py
index c5aac6e..57474a9 100644
--- a/stix2/workbench.py
+++ b/stix2/workbench.py
@@ -106,7 +106,7 @@ STIX_OBJ_DOCS = """
""".format(
_environ.creator_of.__doc__,
_environ.relationships.__doc__,
- _environ.related_to.__doc__
+ _environ.related_to.__doc__,
)
diff --git a/tox.ini b/tox.ini
index d8b840f..d34aac1 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,5 @@
[tox]
-envlist = py27,py34,py35,py36,py37,style,isort-check,packaging
+envlist = py27,py34,py35,py36,py37,py38,style,isort-check,packaging
[testenv]
deps =
@@ -9,8 +9,9 @@ deps =
pytest-cov
coverage
taxii2-client
- pyjarowinkler
+ fuzzywuzzy
haversine
+ python-Levenshtein
medallion
commands =
python -m pytest --cov=stix2 stix2/test/ --cov-report term-missing -W ignore::stix2.exceptions.STIXDeprecationWarning
@@ -42,7 +43,8 @@ commands =
[travis]
python =
2.7: py27, style
- 3.4: py34, style
- 3.5: py35, style
- 3.6: py36, style, packaging
- 3.7: py37, style
+ 3.4: py34
+ 3.5: py35
+ 3.6: py36
+ 3.7: py37
+ 3.8: py38, style, packaging