diff --git a/.isort.cfg b/.isort.cfg index d644f60..db580a5 100644 --- a/.isort.cfg +++ b/.isort.cfg @@ -4,7 +4,9 @@ not_skip = __init__.py known_third_party = antlr4, dateutil, + haversine, medallion, + pyjarowinkler, pytest, pytz, requests, diff --git a/.travis.yml b/.travis.yml index 261f125..c05ec72 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,16 +1,13 @@ sudo: false language: python cache: pip +dist: xenial python: - "2.7" - "3.4" - "3.5" - "3.6" -matrix: - include: - - python: 3.7 # https://github.com/travis-ci/travis-ci/issues/9069#issuecomment-425720905 - dist: xenial - sudo: true + - "3.7" install: - pip install -U pip setuptools - pip install tox-travis pre-commit diff --git a/CHANGELOG b/CHANGELOG index f4cce28..e2cb8ad 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,15 @@ CHANGELOG ========= +1.2.0 - 2019-09-25 + +* #268, #271, #273, #275, #283, #285, #290 Changes support of STIX 2.1 to WD05 (CSD02), for all object types +* #269 Updates id properties to take a spec_version parameter +* #283 Changes the exception class hierarchy +* #289 Adds functions for calculating semantic equivalence of two objects +* #286 Fixes handling of custom observable extensions +* #287 Fixes bug with timestamp precision preservation in MarkingDefinition objects + 1.1.3 - 2019-08-12 * #258 Ignores empty values for optional fields diff --git a/README.rst b/README.rst index 256e4d6..0613a15 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -|Build_Status| |Coverage| |Version| |Downloads_Badge| +|Build_Status| |Coverage| |Version| |Downloads_Badge| |Documentation_Status| cti-python-stix2 ================ @@ -170,3 +170,6 @@ to repository-cla@oasis-open.org. .. |Downloads_Badge| image:: https://img.shields.io/pypi/dm/stix2.svg?maxAge=3600 :target: https://pypi.python.org/pypi/stix2/ :alt: Downloads +.. 
|Documentation_Status| image:: https://readthedocs.org/projects/stix2/badge/?version=latest + :target: https://stix2.readthedocs.io/en/latest/?badge=latest + :alt: Documentation Status diff --git a/docs/guide/custom.ipynb b/docs/guide/custom.ipynb index 042f11e..7ceb33b 100644 --- a/docs/guide/custom.ipynb +++ b/docs/guide/custom.ipynb @@ -537,7 +537,7 @@ "source": [ "### Custom STIX Object Types\n", "\n", - "To create a custom STIX object type, define a class with the @[CustomObject](../api/stix2.v20.sdo.rst#stix2.v20.sdo.CustomObject) decorator. It takes the type name and a list of property tuples, each tuple consisting of the property name and a property instance. Any special validation of the properties can be added by supplying an ``__init__`` function.\n", + "To create a custom STIX object type, define a class with the @[CustomObject](../api/v20/stix2.v20.sdo.rst#stix2.v20.sdo.CustomObject) decorator. It takes the type name and a list of property tuples, each tuple consisting of the property name and a property instance. Any special validation of the properties can be added by supplying an ``__init__`` function.\n", "\n", "Let's say zoo animals have become a serious cyber threat and we want to model them in STIX using a custom object type. Let's use a ``species`` property to store the kind of animal, and make that property required. We also want a property to store the class of animal, such as \"mammal\" or \"bird\" but only want to allow specific values in it. We can add some logic to validate this property in ``__init__``." ] @@ -841,7 +841,7 @@ "source": [ "### Custom Cyber Observable Types\n", "\n", - "Similar to custom STIX object types, use a decorator to create [custom Cyber Observable](../api/stix2.v20.observables.rst#stix2.v20.observables.CustomObservable) types. Just as before, ``__init__()`` can hold additional validation, but it is not necessary." 
+ "Similar to custom STIX object types, use a decorator to create [custom Cyber Observable](../api/v20/stix2.v20.observables.rst#stix2.v20.observables.CustomObservable) types. Just as before, ``__init__()`` can hold additional validation, but it is not necessary." ] }, { @@ -1163,7 +1163,7 @@ "source": [ "### Custom Cyber Observable Extensions\n", "\n", - "Finally, custom extensions to existing Cyber Observable types can also be created. Just use the @[CustomExtension](../api/stix2.v20.observables.rst#stix2.v20.observables.CustomExtension) decorator. Note that you must provide the Cyber Observable class to which the extension applies. Again, any extra validation of the properties can be implemented by providing an ``__init__()`` but it is not required. Let's say we want to make an extension to the ``File`` Cyber Observable Object:" + "Finally, custom extensions to existing Cyber Observable types can also be created. Just use the @[CustomExtension](../api/v20/stix2.v20.observables.rst#stix2.v20.observables.CustomExtension) decorator. Note that you must provide the Cyber Observable class to which the extension applies. Again, any extra validation of the properties can be implemented by providing an ``__init__()`` but it is not required. Let's say we want to make an extension to the ``File`` Cyber Observable Object:" ] }, { diff --git a/docs/guide/datastore.ipynb b/docs/guide/datastore.ipynb index 1ea05ee..e4aad79 100644 --- a/docs/guide/datastore.ipynb +++ b/docs/guide/datastore.ipynb @@ -454,7 +454,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "##### Note: The `defanged` property is now always included (implicitly) for STIX 2.1 Cyber Observable Objects (SCOs)\n", + "**Note: The `defanged` property is now always included (implicitly) for STIX 2.1 Cyber Observable Objects (SCOs)**\n\n", "This is important to remember if you are writing a filter that involves checking the `objects` property of a STIX 2.1 `ObservedData` object. 
If any of the objects associated with the `objects` property are STIX 2.1 SCOs, then your filter must include the `defanged` property. For an example, refer to `filters[14]` & `filters[15]` in stix2/test/v21/test_datastore_filters.py " ] }, @@ -492,7 +492,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "If a STIX object has a `created_by_ref` property, you can use the [creator_of()](../api/stix2.datastore.rst#stix2.datastore.DataSource.creator_of) method to retrieve the [Identity](../api/stix2.v20.sdo.rst#stix2.v20.sdo.Identity) object that created it." + "If a STIX object has a `created_by_ref` property, you can use the [creator_of()](../api/stix2.datastore.rst#stix2.datastore.DataSource.creator_of) method to retrieve the [Identity](../api/v20/stix2.v20.sdo.rst#stix2.v20.sdo.Identity) object that created it." ] }, { diff --git a/docs/guide/environment.ipynb b/docs/guide/environment.ipynb index f7515a5..8d22e39 100644 --- a/docs/guide/environment.ipynb +++ b/docs/guide/environment.ipynb @@ -67,7 +67,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ diff --git a/docs/guide/equivalence.ipynb b/docs/guide/equivalence.ipynb new file mode 100644 index 0000000..5db3464 --- /dev/null +++ b/docs/guide/equivalence.ipynb @@ -0,0 +1,1685 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "# Delete this cell to re-enable tracebacks\n", + "import sys\n", + "ipython = get_ipython()\n", + "\n", + "def hide_traceback(exc_tuple=None, filename=None, tb_offset=None,\n", + " exception_only=False, running_compiled_code=False):\n", + " etype, value, tb = sys.exc_info()\n", + " return ipython._showtraceback(etype, value, ipython.InteractiveTB.get_exception_only(etype, value))\n", + "\n", + "ipython.showtraceback = hide_traceback" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + 
"nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "# JSON output syntax highlighting\n", + "from __future__ import print_function\n", + "from pygments import highlight\n", + "from pygments.lexers import JsonLexer, TextLexer\n", + "from pygments.formatters import HtmlFormatter\n", + "from IPython.display import display, HTML\n", + "from IPython.core.interactiveshell import InteractiveShell\n", + "\n", + "InteractiveShell.ast_node_interactivity = \"all\"\n", + "\n", + "def json_print(inpt):\n", + " string = str(inpt)\n", + " formatter = HtmlFormatter()\n", + " if string[0] == '{':\n", + " lexer = JsonLexer()\n", + " else:\n", + " lexer = TextLexer()\n", + " return HTML('{}'.format(\n", + " formatter.get_style_defs('.highlight'),\n", + " highlight(string, lexer, formatter)))\n", + "\n", + "globals()['print'] = json_print" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Checking Semantic Equivalence\n", + "\n", + "The [Environment](../api/stix2.environment.rst#stix2.environment.Environment) has a function for checking if two STIX Objects are semantically equivalent. For each supported object type, the algorithm checks if the values for a specific set of properties match. Then each matching property is weighted since every property doesn't represent the same level of importance for semantic equivalence. The result will be the sum of these weighted values, in the range of 0 to 100. A result of 0 means that the the two objects are not equivalent, and a result of 100 means that they are equivalent.\n", + "\n", + "TODO: Add a link to the committee note when it is released.\n", + "\n", + "Below we will show examples of the semantic equivalence results of various objects. Unless otherwise specified, the ID of each object will be generated by the library, so the two objects will not have the same ID. 
This demonstrates that the semantic equivalence algorithm only looks at specific properties for each object type.\n", + "\n", + "### Attack Pattern Example\n", + "\n", + "For Attack Patterns, the only properties that contribute to semantic equivalence are `name` and `external_references`, with weights of 30 and 70, respectively. In this example, both attack patterns have the same external reference but the second has a slightly different yet still similar name." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
85.3\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import stix2\n", + "from stix2 import Environment, MemoryStore\n", + "from stix2.v21 import AttackPattern\n", + "\n", + "env = Environment(store=MemoryStore())\n", + "\n", + "ap1 = AttackPattern(\n", + " name=\"Phishing\",\n", + " external_references=[\n", + " {\n", + " \"url\": \"https://example2\",\n", + " \"source_name\": \"some-source2\",\n", + " },\n", + " ],\n", + ")\n", + "ap2 = AttackPattern(\n", + " name=\"Spear phishing\",\n", + " external_references=[\n", + " {\n", + " \"url\": \"https://example2\",\n", + " \"source_name\": \"some-source2\",\n", + " },\n", + " ],\n", + ")\n", + "print(env.semantically_equivalent(ap1, ap2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Campaign Example\n", + "\n", + "For Campaigns, the only properties that contribute to semantic equivalence are `name` and `aliases`, with weights of 60 and 40, respectively. In this example, the two campaigns have completely different names, but slightly similar descriptions. The result may be higher than expected because the Jaro-Winkler algorithm used to compare string properties looks at the edit distance of the two strings rather than just the words in them." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
50.0\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from stix2.v21 import Campaign\n", + "\n", + "c1 = Campaign(\n", + " name=\"Someone Attacks Somebody\",)\n", + "\n", + "c2 = Campaign(\n", + " name=\"Another Campaign\",)\n", + "print(env.semantically_equivalent(c1, c2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Identity Example\n", + "\n", + "For Identities, the only properties that contribute to semantic equivalence are `name`, `identity_class`, and `sectors`, with weights of 60, 20, and 20, respectively. In this example, the two identities are identical, but are missing one of the contributing properties. The algorithm only compares properties that are actually present on the objects. Also note that they have completely different description properties, but because description is not one of the properties considered for semantic equivalence, this difference has no effect on the result." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
100.0\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from stix2.v21 import Identity\n", + "\n", + "id1 = Identity(\n", + " name=\"John Smith\",\n", + " identity_class=\"individual\",\n", + " description=\"Just some guy\",\n", + ")\n", + "id2 = Identity(\n", + " name=\"John Smith\",\n", + " identity_class=\"individual\",\n", + " description=\"A person\",\n", + ")\n", + "print(env.semantically_equivalent(id1, id2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Indicator Example\n", + "\n", + "For Indicators, the only properties that contribute to semantic equivalence are `indicator_types`, `pattern`, and `valid_from`, with weights of 15, 80, and 5, respectively. In this example, the two indicators have patterns with different hashes but the same indicator_type and valid_from. For patterns, the algorithm currently only checks if they are identical." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Indicator pattern equivalence is not fully defined; will default to zero if not completely identical\n" + ] + }, + { + "data": { + "text/html": [ + "
20.0\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from stix2.v21 import Indicator\n", + "\n", + "ind1 = Indicator(\n", + " indicator_types=['malicious-activity'],\n", + " pattern_type=\"stix\",\n", + " pattern=\"[file:hashes.MD5 = 'd41d8cd98f00b204e9800998ecf8427e']\",\n", + " valid_from=\"2017-01-01T12:34:56Z\",\n", + ")\n", + "ind2 = Indicator(\n", + " indicator_types=['malicious-activity'],\n", + " pattern_type=\"stix\",\n", + " pattern=\"[file:hashes.MD5 = '79054025255fb1a26e4bc422aef54eb4']\",\n", + " valid_from=\"2017-01-01T12:34:56Z\",\n", + ")\n", + "print(env.semantically_equivalent(ind1, ind2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If the patterns were identical the result would have been 100." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Location Example\n", + "\n", + "For Locations, the only properties that contribute to semantic equivalence are `longitude`/`latitude`, `region`, and `country`, with weights of 34, 33, and 33, respectively. In this example, the two locations are Washington, D.C. and New York City. The algorithm computes the distance between two locations using the haversine formula and uses that to influence equivalence." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
67.20663955882583\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from stix2.v21 import Location\n", + "\n", + "loc1 = Location(\n", + " latitude=38.889,\n", + " longitude=-77.023,\n", + ")\n", + "loc2 = Location(\n", + " latitude=40.713,\n", + " longitude=-74.006,\n", + ")\n", + "print(env.semantically_equivalent(loc1, loc2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Malware Example\n", + "\n", + "For Malware, the only properties that contribute to semantic equivalence are `malware_types` and `name`, with weights of 20 and 80, respectively. In this example, the two malware objects only differ in the strings in their malware_types lists. For lists, the algorithm bases its calculations on the intersection of the two lists. An empty intersection will result in a 0, and a complete intersection will result in a 1 for that property." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
90.0\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from stix2.v21 import Malware\n", + "\n", + "MALWARE_ID = \"malware--9c4638ec-f1de-4ddb-abf4-1b760417654e\"\n", + "\n", + "mal1 = Malware(id=MALWARE_ID,\n", + " malware_types=['ransomware'],\n", + " name=\"Cryptolocker\",\n", + " is_family=False,\n", + " )\n", + "mal2 = Malware(id=MALWARE_ID,\n", + " malware_types=['ransomware', 'dropper'],\n", + " name=\"Cryptolocker\",\n", + " is_family=False,\n", + " )\n", + "print(env.semantically_equivalent(mal1, mal2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Threat Actor Example\n", + "\n", + "For Threat Actors, the only properties that contribute to semantic equivalence are `threat_actor_types`, `name`, and `aliases`, with weights of 20, 60, and 20, respectively. In this example, the two threat actors have the same id properties but everything else is different. Since the id property does not factor into semantic equivalence, the result is not very high. The result is not zero because the algorithm is using the Jaro-Winkler distance between strings in the threat_actor_types and name properties." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
33.6\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from stix2.v21 import ThreatActor\n", + "\n", + "THREAT_ACTOR_ID = \"threat-actor--8e2e2d2b-17d4-4cbf-938f-98ee46b3cd3f\"\n", + "\n", + "ta1 = ThreatActor(id=THREAT_ACTOR_ID,\n", + " threat_actor_types=[\"crime-syndicate\"],\n", + " name=\"Evil Org\",\n", + " aliases=[\"super-evil\"],\n", + ")\n", + "ta2 = ThreatActor(id=THREAT_ACTOR_ID,\n", + " threat_actor_types=[\"spy\"],\n", + " name=\"James Bond\",\n", + " aliases=[\"007\"],\n", + ")\n", + "print(env.semantically_equivalent(ta1, ta2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Tool Example\n", + "\n", + "For Tools, the only properties that contribute to semantic equivalence are `tool_types` and `name`, with weights of 20 and 80, respectively. In this example, the two tools have the same values for properties that contribute to semantic equivalence but one has an additional, non-contributing property." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
100.0\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from stix2.v21 import Tool\n", + "\n", + "t1 = Tool(\n", + " tool_types=[\"remote-access\"],\n", + " name=\"VNC\",\n", + ")\n", + "t2 = Tool(\n", + " tool_types=[\"remote-access\"],\n", + " name=\"VNC\",\n", + " description=\"This is a tool\"\n", + ")\n", + "print(env.semantically_equivalent(t1, t2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Vulnerability Example\n", + "\n", + "For Vulnerabilities, the only properties that contribute to semantic equivalence are `name` and `external_references`, with weights of 30 and 70, respectively. In this example, the two vulnerabilities have the same name but one also has an external reference. The algorithm doesn't take into account any semantic equivalence contributing properties that are not present on both objects." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
100.0\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from stix2.v21 import Vulnerability\n", + "\n", + "vuln1 = Vulnerability(\n", + " name=\"Heartbleed\",\n", + " external_references=[\n", + " {\n", + " \"url\": \"https://example\",\n", + " \"source_name\": \"some-source\",\n", + " },\n", + " ],\n", + ")\n", + "vuln2 = Vulnerability(\n", + " name=\"Heartbleed\",\n", + ")\n", + "print(env.semantically_equivalent(vuln1, vuln2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Other Examples\n", + "\n", + "Comparing objects of different types will result in an error." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "The objects to compare must be of the same type!", + "output_type": "error", + "traceback": [ + "\u001b[0;31mValueError\u001b[0m\u001b[0;31m:\u001b[0m The objects to compare must be of the same type!\n" + ] + } + ], + "source": [ + "print(env.semantically_equivalent(ind1, vuln1))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Some object types do not have a defined method for calculating semantic equivalence and by default will give a warning and a result of zero." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "'report' type has no semantic equivalence method to call!\n" + ] + }, + { + "data": { + "text/html": [ + "
0\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from stix2.v21 import Report\n", + "\n", + "r1 = Report(\n", + " report_types=[\"campaign\"],\n", + " name=\"Bad Cybercrime\",\n", + " published=\"2016-04-06T20:03:00.000Z\",\n", + " object_refs=[\"indicator--a740531e-63ff-4e49-a9e1-a0a3eed0e3e7\"],\n", + ")\n", + "r2 = Report(\n", + " report_types=[\"campaign\"],\n", + " name=\"Bad Cybercrime\",\n", + " published=\"2016-04-06T20:03:00.000Z\",\n", + " object_refs=[\"indicator--a740531e-63ff-4e49-a9e1-a0a3eed0e3e7\"],\n", + ")\n", + "print(env.semantically_equivalent(r1, r2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By default, comparing objects of different spec versions will result in a `ValueError`." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "The objects to compare must be of the same spec version!", + "output_type": "error", + "traceback": [ + "\u001b[0;31mValueError\u001b[0m\u001b[0;31m:\u001b[0m The objects to compare must be of the same spec version!\n" + ] + } + ], + "source": [ + "from stix2.v20 import Identity as Identity20\n", + "\n", + "id20 = Identity20(\n", + " name=\"John Smith\",\n", + " identity_class=\"individual\",\n", + ")\n", + "print(env.semantically_equivalent(id2, id20))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can optionally allow comparing across spec versions by providing a configuration dictionary like in the next example:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
100.0\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from stix2.v20 import Identity as Identity20\n", + "\n", + "id20 = Identity20(\n", + " name=\"John Smith\",\n", + " identity_class=\"individual\",\n", + ")\n", + "print(env.semantically_equivalent(id2, id20, **{\"_internal\": {\"ignore_spec_version\": True}}))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can modify the weights or provide your own functions for comparing objects of a certain type by providing them in a dictionary to the optional 3rd parameter to the semantic equivalence function. You can find functions (like `partial_string_based`) to help with this in the [Environment API docs](../api/stix2.environment.rst#stix2.environment.Environment). In this example we define semantic equivalence for our new `x-foobar` object type:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
60.0\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def _x_foobar_checks(obj1, obj2, **weights):\n", + " matching_score = 0.0\n", + " sum_weights = 0.0\n", + " if stix2.environment.check_property_present(\"name\", obj1, obj2):\n", + " w = weights[\"name\"]\n", + " sum_weights += w\n", + " matching_score += w * stix2.environment.partial_string_based(obj1[\"name\"], obj2[\"name\"])\n", + " if stix2.environment.check_property_present(\"color\", obj1, obj2):\n", + " w = weights[\"color\"]\n", + " sum_weights += w\n", + " matching_score += w * stix2.environment.partial_string_based(obj1[\"color\"], obj2[\"color\"])\n", + " return matching_score, sum_weights\n", + "\n", + "weights = {\n", + " \"x-foobar\": {\n", + " \"name\": 60,\n", + " \"color\": 40,\n", + " \"method\": _x_foobar_checks,\n", + " },\n", + " \"_internal\": {\n", + " \"ignore_spec_version\": False,\n", + " },\n", + "}\n", + "foo1 = {\n", + " \"type\":\"x-foobar\",\n", + " \"id\":\"x-foobar--0c7b5b88-8ff7-4a4d-aa9d-feb398cd0061\",\n", + " \"name\": \"Zot\",\n", + " \"color\": \"red\",\n", + "}\n", + "foo2 = {\n", + " \"type\":\"x-foobar\",\n", + " \"id\":\"x-foobar--0c7b5b88-8ff7-4a4d-aa9d-feb398cd0061\",\n", + " \"name\": \"Zot\",\n", + " \"color\": \"blue\",\n", + "}\n", + "print(env.semantically_equivalent(foo1, foo2, **weights))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Detailed Results\n", + "\n", + "If your logging level is set to `DEBUG` or higher, the function will log more detailed results. These show the semantic equivalence and weighting for each property that is checked, to show how the final result was arrived at." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Starting semantic equivalence process between: 'threat-actor--54dc2aac-6fde-4a68-ae2a-0c0bc575ed70' and 'threat-actor--c51bce3b-a067-4692-ab77-fcdefdd3f157'\n", + "--\t\tpartial_string_based 'Evil Org' 'James Bond'\tresult: '0.56'\n", + "'name' check -- weight: 60, contributing score: 33.6\n", + "--\t\tpartial_list_based '['crime-syndicate']' '['spy']'\tresult: '0.0'\n", + "'threat_actor_types' check -- weight: 20, contributing score: 0.0\n", + "--\t\tpartial_list_based '['super-evil']' '['007']'\tresult: '0.0'\n", + "'aliases' check -- weight: 20, contributing score: 0.0\n", + "Matching Score: 33.6, Sum of Weights: 100.0\n" + ] + }, + { + "data": { + "text/html": [ + "
33.6\n",
+       "
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import logging\n", + "logging.basicConfig(format='%(message)s')\n", + "logger = logging.getLogger()\n", + "logger.setLevel(logging.DEBUG)\n", + "\n", + "ta3 = ThreatActor(\n", + " threat_actor_types=[\"crime-syndicate\"],\n", + " name=\"Evil Org\",\n", + " aliases=[\"super-evil\"],\n", + ")\n", + "ta4 = ThreatActor(\n", + " threat_actor_types=[\"spy\"],\n", + " name=\"James Bond\",\n", + " aliases=[\"007\"],\n", + ")\n", + "print(env.semantically_equivalent(ta3, ta4))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/guide/ts_support.ipynb b/docs/guide/ts_support.ipynb index 8c89e93..2d36f5a 100644 --- a/docs/guide/ts_support.ipynb +++ b/docs/guide/ts_support.ipynb @@ -365,7 +365,7 @@ "source": [ "### How custom content works\n", "\n", - "[CustomObject](../api/stix2.v20.sdo.rst#stix2.v20.sdo.CustomObject), [CustomObservable](../api/stix2.v20.observables.rst#stix2.v20.observables.CustomObservable), [CustomMarking](../api/stix2.v20.common.rst#stix2.v20.common.CustomMarking) and [CustomExtension](../api/stix2.v20.observables.rst#stix2.v20.observables.CustomExtension) must be registered explicitly by STIX version. 
This is a design decision since properties or requirements may change as the STIX Technical Specification advances.\n", + "[CustomObject](../api/v20/stix2.v20.sdo.rst#stix2.v20.sdo.CustomObject), [CustomObservable](../api/v20/stix2.v20.observables.rst#stix2.v20.observables.CustomObservable), [CustomMarking](../api/v20/stix2.v20.common.rst#stix2.v20.common.CustomMarking) and [CustomExtension](../api/v20/stix2.v20.observables.rst#stix2.v20.observables.CustomExtension) must be registered explicitly by STIX version. This is a design decision since properties or requirements may change as the STIX Technical Specification advances.\n", "\n", "You can perform this by:" ] diff --git a/docs/guide/workbench.ipynb b/docs/guide/workbench.ipynb index 328cb88..de243cc 100644 --- a/docs/guide/workbench.ipynb +++ b/docs/guide/workbench.ipynb @@ -624,7 +624,7 @@ "source": [ "### Creating STIX Data\n", "\n", - "To create a STIX object, just use that object's class constructor. Once it's created, add it to the workbench with [save()](../api/datastore/stix2.workbench.rst#stix2.workbench.save)." + "To create a STIX object, just use that object's class constructor. Once it's created, add it to the workbench with [save()](../api/stix2.workbench.rst#stix2.workbench.save)." ] }, { @@ -760,7 +760,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Defaults can also be set for the [created timestamp](../api/datastore/stix2.workbench.rst#stix2.workbench.set_default_created), [external references](../api/datastore/stix2.workbench.rst#stix2.workbench.set_default_external_refs) and [object marking references](../api/datastore/stix2.workbench.rst#stix2.workbench.set_default_object_marking_refs)." 
+ "Defaults can also be set for the [created timestamp](../api/stix2.workbench.rst#stix2.workbench.set_default_created), [external references](../api/stix2.workbench.rst#stix2.workbench.set_default_external_refs) and [object marking references](../api/stix2.workbench.rst#stix2.workbench.set_default_object_marking_refs)." ] }, { diff --git a/requirements.txt b/requirements.txt index 5ac0b37..2fb7c5d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,10 @@ bumpversion ipython -nbsphinx==0.3.2 +nbsphinx==0.4.3 pre-commit pytest pytest-cov -sphinx<1.6 +sphinx<2 sphinx-prompt tox diff --git a/setup.cfg b/setup.cfg index b012bb9..758875d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.1.3 +current_version = 1.2.0 commit = True tag = True diff --git a/setup.py b/setup.py index 497bf01..ea20795 100644 --- a/setup.py +++ b/setup.py @@ -64,5 +64,6 @@ setup( }, extras_require={ 'taxii': ['taxii2-client'], + 'semantic': ['haversine', 'pyjarowinkler'], }, ) diff --git a/stix2/environment.py b/stix2/environment.py index 104fdb2..34e0a04 100644 --- a/stix2/environment.py +++ b/stix2/environment.py @@ -1,9 +1,14 @@ """Python STIX2 Environment API.""" import copy +import logging +import time from .core import parse as _parse from .datastore import CompositeDataSource, DataStoreMixin +from .utils import STIXdatetime, parse_into_datetime + +logger = logging.getLogger(__name__) class ObjectFactory(object): @@ -186,3 +191,501 @@ class Environment(DataStoreMixin): return self.get(creator_id) else: return None + + @staticmethod + def semantically_equivalent(obj1, obj2, **weight_dict): + """This method is meant to verify if two objects of the same type are + semantically equivalent. 
+ + Args: + obj1: A stix2 object instance + obj2: A stix2 object instance + weight_dict: A dictionary that can be used to override settings + in the semantic equivalence process + + Returns: + float: A number between 0.0 and 100.0 as a measurement of equivalence. + + Warning: + Course of Action, Intrusion-Set, Observed-Data, Report are not supported + by this implementation. Indicator pattern check is also limited. + + Note: + This implementation follows the Committee Note on semantic equivalence. + see `the Committee Note <link here>`__. + + """ + # default weights used for the semantic equivalence process + weights = { + "attack-pattern": { + "name": 30, + "external_references": 70, + "method": _attack_pattern_checks, + }, + "campaign": { + "name": 60, + "aliases": 40, + "method": _campaign_checks, + }, + "identity": { + "name": 60, + "identity_class": 20, + "sectors": 20, + "method": _identity_checks, + }, + "indicator": { + "indicator_types": 15, + "pattern": 80, + "valid_from": 5, + "tdelta": 1, # One day interval + "method": _indicator_checks, + }, + "location": { + "longitude_latitude": 34, + "region": 33, + "country": 33, + "threshold": 1000.0, + "method": _location_checks, + }, + "malware": { + "malware_types": 20, + "name": 80, + "method": _malware_checks, + }, + "threat-actor": { + "name": 60, + "threat_actor_types": 20, + "aliases": 20, + "method": _threat_actor_checks, + }, + "tool": { + "tool_types": 20, + "name": 80, + "method": _tool_checks, + }, + "vulnerability": { + "name": 30, + "external_references": 70, + "method": _vulnerability_checks, + }, + "_internal": { + "ignore_spec_version": False, + }, + } + + if weight_dict: + weights.update(weight_dict) + + type1, type2 = obj1["type"], obj2["type"] + ignore_spec_version = weights["_internal"]["ignore_spec_version"] + + if type1 != type2: + raise ValueError('The objects to compare must be of the same type!') + + if ignore_spec_version is False and obj1.get("spec_version", "2.0") != obj2.get("spec_version", 
"2.0"): + raise ValueError('The objects to compare must be of the same spec version!') + + try: + method = weights[type1]["method"] + except KeyError: + logger.warning("'%s' type has no semantic equivalence method to call!", type1) + sum_weights = matching_score = 0 + else: + logger.debug("Starting semantic equivalence process between: '%s' and '%s'", obj1["id"], obj2["id"]) + matching_score, sum_weights = method(obj1, obj2, **weights[type1]) + + if sum_weights <= 0: + return 0 + + equivalence_score = (matching_score / sum_weights) * 100.0 + return equivalence_score + + +def check_property_present(prop, obj1, obj2): + """Helper method checks if a property is present on both objects.""" + if prop in obj1 and prop in obj2: + return True + return False + + +def partial_timestamp_based(t1, t2, tdelta): + """Performs a timestamp-based matching via checking how close one timestamp is to another. + + Args: + t1: A datetime string or STIXdatetime object. + t2: A datetime string or STIXdatetime object. + tdelta (float): A given time delta. This number is multiplied by 86400 (1 day) to + extend or shrink your time change tolerance. + + Returns: + float: Number between 0.0 and 1.0 depending on match criteria. + + """ + if not isinstance(t1, STIXdatetime): + t1 = parse_into_datetime(t1) + if not isinstance(t2, STIXdatetime): + t2 = parse_into_datetime(t2) + t1, t2 = time.mktime(t1.timetuple()), time.mktime(t2.timetuple()) + result = 1 - min(abs(t1 - t2) / (86400 * tdelta), 1) + logger.debug("--\t\tpartial_timestamp_based '%s' '%s' tdelta: '%s'\tresult: '%s'", t1, t2, tdelta, result) + return result + + +def partial_list_based(l1, l2): + """Performs a partial list matching via finding the intersection between common values. + + Args: + l1: A list of values. + l2: A list of values. + + Returns: + float: Number between 0.0 and 1.0 depending on the percentage of common values between the two lists. 
+ + """ + l1_set, l2_set = set(l1), set(l2) + result = len(l1_set.intersection(l2_set)) / max(len(l1), len(l2)) + logger.debug("--\t\tpartial_list_based '%s' '%s'\tresult: '%s'", l1, l2, result) + return result + + +def exact_match(val1, val2): + """Performs an exact value match based on two values + + Args: + val1: A value suitable for an equality test. + val2: A value suitable for an equality test. + + Returns: + float: 1.0 if the value matches exactly, 0.0 otherwise. + + """ + result = 0.0 + if val1 == val2: + result = 1.0 + logger.debug("--\t\texact_match '%s' '%s'\tresult: '%s'", val1, val2, result) + return result + + +def partial_string_based(str1, str2): + """Performs a partial string match using the Jaro-Winkler distance algorithm. + + Args: + str1: A string value to check. + str2: A string value to check. + + Returns: + float: Number between 0.0 and 1.0 depending on match criteria. + + """ + from pyjarowinkler import distance + result = distance.get_jaro_distance(str1, str2) + logger.debug("--\t\tpartial_string_based '%s' '%s'\tresult: '%s'", str1, str2, result) + return result + + +def custom_pattern_based(pattern1, pattern2): + """Performs a matching on Indicator Patterns. + + Args: + pattern1: An Indicator pattern + pattern2: An Indicator pattern + + Returns: + float: Number between 0.0 and 1.0 depending on match criteria. + + """ + logger.warning("Indicator pattern equivalence is not fully defined; will default to zero if not completely identical") + return exact_match(pattern1, pattern2) # TODO: Implement pattern based equivalence + + +def partial_external_reference_based(refs1, refs2): + """Performs a matching on External References. + + Args: + refs1: A list of external references. + refs2: A list of external references. + + Returns: + float: Number between 0.0 and 1.0 depending on matches. 
+ + """ + allowed = set(("veris", "cve", "capec", "mitre-attack")) + matches = 0 + + if len(refs1) >= len(refs2): + l1 = refs1 + l2 = refs2 + else: + l1 = refs2 + l2 = refs1 + + for ext_ref1 in l1: + for ext_ref2 in l2: + sn_match = False + ei_match = False + url_match = False + source_name = None + + if check_property_present("source_name", ext_ref1, ext_ref2): + if ext_ref1["source_name"] == ext_ref2["source_name"]: + source_name = ext_ref1["source_name"] + sn_match = True + if check_property_present("external_id", ext_ref1, ext_ref2): + if ext_ref1["external_id"] == ext_ref2["external_id"]: + ei_match = True + if check_property_present("url", ext_ref1, ext_ref2): + if ext_ref1["url"] == ext_ref2["url"]: + url_match = True + + # Special case: if source_name is a STIX defined name and either + # external_id or url match then its a perfect match and other entries + # can be ignored. + if sn_match and (ei_match or url_match) and source_name in allowed: + result = 1.0 + logger.debug( + "--\t\tpartial_external_reference_based '%s' '%s'\tresult: '%s'", + refs1, refs2, result, + ) + return result + + # Regular check. If the source_name (not STIX-defined) or external_id or + # url matches then we consider the entry a match. + if (sn_match or ei_match or url_match) and source_name not in allowed: + matches += 1 + + result = matches / max(len(refs1), len(refs2)) + logger.debug( + "--\t\tpartial_external_reference_based '%s' '%s'\tresult: '%s'", + refs1, refs2, result, + ) + return result + + +def partial_location_distance(lat1, long1, lat2, long2, threshold): + """Given two coordinates perform a matching based on its distance using the Haversine Formula. + + Args: + lat1: Latitude value for first coordinate point. + lat2: Latitude value for second coordinate point. + long1: Longitude value for first coordinate point. + long2: Longitude value for second coordinate point. + threshold (float): A kilometer measurement for the threshold distance between these two points. 
+ + Returns: + float: Number between 0.0 and 1.0 depending on match. + + """ + from haversine import haversine, Unit + distance = haversine((lat1, long1), (lat2, long2), unit=Unit.KILOMETERS) + result = 1 - (distance / threshold) + logger.debug( + "--\t\tpartial_location_distance '%s' '%s' threshold: '%s'\tresult: '%s'", + (lat1, long1), (lat2, long2), threshold, result, + ) + return result + + +def _attack_pattern_checks(obj1, obj2, **weights): + matching_score = 0.0 + sum_weights = 0.0 + if check_property_present("name", obj1, obj2): + w = weights["name"] + contributing_score = w * partial_string_based(obj1["name"], obj2["name"]) + sum_weights += w + matching_score += contributing_score + logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score) + if check_property_present("external_references", obj1, obj2): + w = weights["external_references"] + contributing_score = ( + w * partial_external_reference_based(obj1["external_references"], obj2["external_references"]) + ) + sum_weights += w + matching_score += contributing_score + logger.debug("'external_references' check -- weight: %s, contributing score: %s", w, contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) + return matching_score, sum_weights + + +def _campaign_checks(obj1, obj2, **weights): + matching_score = 0.0 + sum_weights = 0.0 + if check_property_present("name", obj1, obj2): + w = weights["name"] + contributing_score = w * partial_string_based(obj1["name"], obj2["name"]) + sum_weights += w + matching_score += contributing_score + logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score) + if check_property_present("aliases", obj1, obj2): + w = weights["aliases"] + contributing_score = w * partial_list_based(obj1["aliases"], obj2["aliases"]) + sum_weights += w + matching_score += contributing_score + logger.debug("'aliases' check -- weight: %s, contributing score: %s", w, 
contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) + return matching_score, sum_weights + + +def _identity_checks(obj1, obj2, **weights): + matching_score = 0.0 + sum_weights = 0.0 + if check_property_present("name", obj1, obj2): + w = weights["name"] + contributing_score = w * exact_match(obj1["name"], obj2["name"]) + sum_weights += w + matching_score += contributing_score + logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score) + if check_property_present("identity_class", obj1, obj2): + w = weights["identity_class"] + contributing_score = w * exact_match(obj1["identity_class"], obj2["identity_class"]) + sum_weights += w + matching_score += contributing_score + logger.debug("'identity_class' check -- weight: %s, contributing score: %s", w, contributing_score) + if check_property_present("sectors", obj1, obj2): + w = weights["sectors"] + contributing_score = w * partial_list_based(obj1["sectors"], obj2["sectors"]) + sum_weights += w + matching_score += contributing_score + logger.debug("'sectors' check -- weight: %s, contributing score: %s", w, contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) + return matching_score, sum_weights + + +def _indicator_checks(obj1, obj2, **weights): + matching_score = 0.0 + sum_weights = 0.0 + if check_property_present("indicator_types", obj1, obj2): + w = weights["indicator_types"] + contributing_score = w * partial_list_based(obj1["indicator_types"], obj2["indicator_types"]) + sum_weights += w + matching_score += contributing_score + logger.debug("'indicator_types' check -- weight: %s, contributing score: %s", w, contributing_score) + if check_property_present("pattern", obj1, obj2): + w = weights["pattern"] + contributing_score = w * custom_pattern_based(obj1["pattern"], obj2["pattern"]) + sum_weights += w + matching_score += contributing_score + logger.debug("'pattern' check -- weight: 
%s, contributing score: %s", w, contributing_score) + if check_property_present("valid_from", obj1, obj2): + w = weights["valid_from"] + contributing_score = ( + w * + partial_timestamp_based(obj1["valid_from"], obj2["valid_from"], weights["tdelta"]) + ) + sum_weights += w + matching_score += contributing_score + logger.debug("'valid_from' check -- weight: %s, contributing score: %s", w, contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) + return matching_score, sum_weights + + +def _location_checks(obj1, obj2, **weights): + matching_score = 0.0 + sum_weights = 0.0 + if check_property_present("latitude", obj1, obj2) and check_property_present("longitude", obj1, obj2): + w = weights["longitude_latitude"] + contributing_score = ( + w * + partial_location_distance(obj1["latitude"], obj1["longitude"], obj2["latitude"], obj2["longitude"], weights["threshold"]) + ) + sum_weights += w + matching_score += contributing_score + logger.debug("'longitude_latitude' check -- weight: %s, contributing score: %s", w, contributing_score) + if check_property_present("region", obj1, obj2): + w = weights["region"] + contributing_score = w * exact_match(obj1["region"], obj2["region"]) + sum_weights += w + matching_score += contributing_score + logger.debug("'region' check -- weight: %s, contributing score: %s", w, contributing_score) + if check_property_present("country", obj1, obj2): + w = weights["country"] + contributing_score = w * exact_match(obj1["country"], obj2["country"]) + sum_weights += w + matching_score += contributing_score + logger.debug("'country' check -- weight: %s, contributing score: %s", w, contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) + return matching_score, sum_weights + + +def _malware_checks(obj1, obj2, **weights): + matching_score = 0.0 + sum_weights = 0.0 + if check_property_present("malware_types", obj1, obj2): + w = weights["malware_types"] + 
contributing_score = w * partial_list_based(obj1["malware_types"], obj2["malware_types"]) + sum_weights += w + matching_score += contributing_score + logger.debug("'malware_types' check -- weight: %s, contributing score: %s", w, contributing_score) + if check_property_present("name", obj1, obj2): + w = weights["name"] + contributing_score = w * partial_string_based(obj1["name"], obj2["name"]) + sum_weights += w + matching_score += contributing_score + logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) + return matching_score, sum_weights + + +def _threat_actor_checks(obj1, obj2, **weights): + matching_score = 0.0 + sum_weights = 0.0 + if check_property_present("name", obj1, obj2): + w = weights["name"] + contributing_score = w * partial_string_based(obj1["name"], obj2["name"]) + sum_weights += w + matching_score += contributing_score + logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score) + if check_property_present("threat_actor_types", obj1, obj2): + w = weights["threat_actor_types"] + contributing_score = w * partial_list_based(obj1["threat_actor_types"], obj2["threat_actor_types"]) + sum_weights += w + matching_score += contributing_score + logger.debug("'threat_actor_types' check -- weight: %s, contributing score: %s", w, contributing_score) + if check_property_present("aliases", obj1, obj2): + w = weights["aliases"] + contributing_score = w * partial_list_based(obj1["aliases"], obj2["aliases"]) + sum_weights += w + matching_score += contributing_score + logger.debug("'aliases' check -- weight: %s, contributing score: %s", w, contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) + return matching_score, sum_weights + + +def _tool_checks(obj1, obj2, **weights): + matching_score = 0.0 + sum_weights = 0.0 + if check_property_present("tool_types", obj1, 
obj2): + w = weights["tool_types"] + contributing_score = w * partial_list_based(obj1["tool_types"], obj2["tool_types"]) + sum_weights += w + matching_score += contributing_score + logger.debug("'tool_types' check -- weight: %s, contributing score: %s", w, contributing_score) + if check_property_present("name", obj1, obj2): + w = weights["name"] + contributing_score = w * partial_string_based(obj1["name"], obj2["name"]) + sum_weights += w + matching_score += contributing_score + logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) + return matching_score, sum_weights + + +def _vulnerability_checks(obj1, obj2, **weights): + matching_score = 0.0 + sum_weights = 0.0 + if check_property_present("name", obj1, obj2): + w = weights["name"] + contributing_score = w * partial_string_based(obj1["name"], obj2["name"]) + sum_weights += w + matching_score += contributing_score + logger.debug("'name' check -- weight: %s, contributing score: %s", w, contributing_score) + if check_property_present("external_references", obj1, obj2): + w = weights["external_references"] + contributing_score = w * partial_external_reference_based( + obj1["external_references"], + obj2["external_references"], + ) + sum_weights += w + matching_score += contributing_score + logger.debug("'external_references' check -- weight: %s, contributing score: %s", w, contributing_score) + logger.debug("Matching Score: %s, Sum of Weights: %s", matching_score, sum_weights) + return matching_score, sum_weights diff --git a/stix2/test/v21/test_environment.py b/stix2/test/v21/test_environment.py index 90f31cb..d057df5 100644 --- a/stix2/test/v21/test_environment.py +++ b/stix2/test/v21/test_environment.py @@ -1,11 +1,15 @@ import pytest import stix2 +import stix2.environment +import stix2.exceptions from .constants import ( - CAMPAIGN_ID, CAMPAIGN_KWARGS, FAKE_TIME, IDENTITY_ID, IDENTITY_KWARGS, 
- INDICATOR_ID, INDICATOR_KWARGS, MALWARE_ID, MALWARE_KWARGS, - RELATIONSHIP_IDS, + ATTACK_PATTERN_ID, ATTACK_PATTERN_KWARGS, CAMPAIGN_ID, CAMPAIGN_KWARGS, + FAKE_TIME, IDENTITY_ID, IDENTITY_KWARGS, INDICATOR_ID, INDICATOR_KWARGS, + LOCATION_ID, MALWARE_ID, MALWARE_KWARGS, RELATIONSHIP_IDS, REPORT_ID, + REPORT_KWARGS, THREAT_ACTOR_ID, THREAT_ACTOR_KWARGS, TOOL_ID, TOOL_KWARGS, + VULNERABILITY_ID, VULNERABILITY_KWARGS, ) @@ -375,3 +379,374 @@ def test_related_to_by_target(ds): assert len(resp) == 2 assert any(x['id'] == CAMPAIGN_ID for x in resp) assert any(x['id'] == INDICATOR_ID for x in resp) + + +def test_semantic_equivalence_on_same_attack_pattern1(): + ap1 = stix2.v21.AttackPattern(id=ATTACK_PATTERN_ID, **ATTACK_PATTERN_KWARGS) + ap2 = stix2.v21.AttackPattern(id=ATTACK_PATTERN_ID, **ATTACK_PATTERN_KWARGS) + env = stix2.Environment().semantically_equivalent(ap1, ap2) + assert round(env) == 100 + + +def test_semantic_equivalence_on_same_attack_pattern2(): + ATTACK_KWARGS = dict( + name="Phishing", + external_references=[ + { + "url": "https://example2", + "source_name": "some-source2", + }, + ], + ) + ap1 = stix2.v21.AttackPattern(id=ATTACK_PATTERN_ID, **ATTACK_KWARGS) + ap2 = stix2.v21.AttackPattern(id=ATTACK_PATTERN_ID, **ATTACK_KWARGS) + env = stix2.Environment().semantically_equivalent(ap1, ap2) + assert round(env) == 100 + + +def test_semantic_equivalence_on_same_campaign1(): + camp1 = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMPAIGN_KWARGS) + camp2 = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMPAIGN_KWARGS) + env = stix2.Environment().semantically_equivalent(camp1, camp2) + assert round(env) == 100 + + +def test_semantic_equivalence_on_same_campaign2(): + CAMP_KWARGS = dict( + name="Green Group Attacks Against Finance", + description="Campaign by Green Group against a series of targets in the financial services sector.", + aliases=["super-green", "some-green"], + ) + camp1 = stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMP_KWARGS) + camp2 = 
stix2.v21.Campaign(id=CAMPAIGN_ID, **CAMP_KWARGS) + env = stix2.Environment().semantically_equivalent(camp1, camp2) + assert round(env) == 100 + + +def test_semantic_equivalence_on_same_identity1(): + iden1 = stix2.v21.Identity(id=IDENTITY_ID, **IDENTITY_KWARGS) + iden2 = stix2.v21.Identity(id=IDENTITY_ID, **IDENTITY_KWARGS) + env = stix2.Environment().semantically_equivalent(iden1, iden2) + assert round(env) == 100 + + +def test_semantic_equivalence_on_same_identity2(): + IDEN_KWARGS = dict( + name="John Smith", + identity_class="individual", + sectors=["government", "critical-infrastructure"], + ) + iden1 = stix2.v21.Identity(id=IDENTITY_ID, **IDEN_KWARGS) + iden2 = stix2.v21.Identity(id=IDENTITY_ID, **IDEN_KWARGS) + env = stix2.Environment().semantically_equivalent(iden1, iden2) + assert round(env) == 100 + + +def test_semantic_equivalence_on_same_indicator(): + ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) + ind2 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) + env = stix2.Environment().semantically_equivalent(ind1, ind2) + assert round(env) == 100 + + +def test_semantic_equivalence_on_same_location1(): + LOCATION_KWARGS = dict(latitude=45, longitude=179) + loc1 = stix2.v21.Location(id=LOCATION_ID, **LOCATION_KWARGS) + loc2 = stix2.v21.Location(id=LOCATION_ID, **LOCATION_KWARGS) + env = stix2.Environment().semantically_equivalent(loc1, loc2) + assert round(env) == 100 + + +def test_semantic_equivalence_on_same_location2(): + LOCATION_KWARGS = dict( + latitude=38.889, + longitude=-77.023, + region="northern-america", + country="us", + ) + loc1 = stix2.v21.Location(id=LOCATION_ID, **LOCATION_KWARGS) + loc2 = stix2.v21.Location(id=LOCATION_ID, **LOCATION_KWARGS) + env = stix2.Environment().semantically_equivalent(loc1, loc2) + assert round(env) == 100 + + +def test_semantic_equivalence_on_same_malware(): + malw1 = stix2.v21.Malware(id=MALWARE_ID, **MALWARE_KWARGS) + malw2 = stix2.v21.Malware(id=MALWARE_ID, **MALWARE_KWARGS) + env = 
stix2.Environment().semantically_equivalent(malw1, malw2) + assert round(env) == 100 + + +def test_semantic_equivalence_on_same_threat_actor1(): + ta1 = stix2.v21.ThreatActor(id=THREAT_ACTOR_ID, **THREAT_ACTOR_KWARGS) + ta2 = stix2.v21.ThreatActor(id=THREAT_ACTOR_ID, **THREAT_ACTOR_KWARGS) + env = stix2.Environment().semantically_equivalent(ta1, ta2) + assert round(env) == 100 + + +def test_semantic_equivalence_on_same_threat_actor2(): + THREAT_KWARGS = dict( + threat_actor_types=["crime-syndicate"], + aliases=["super-evil"], + name="Evil Org", + ) + ta1 = stix2.v21.ThreatActor(id=THREAT_ACTOR_ID, **THREAT_KWARGS) + ta2 = stix2.v21.ThreatActor(id=THREAT_ACTOR_ID, **THREAT_KWARGS) + env = stix2.Environment().semantically_equivalent(ta1, ta2) + assert round(env) == 100 + + +def test_semantic_equivalence_on_same_tool(): + tool1 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS) + tool2 = stix2.v21.Tool(id=TOOL_ID, **TOOL_KWARGS) + env = stix2.Environment().semantically_equivalent(tool1, tool2) + assert round(env) == 100 + + +def test_semantic_equivalence_on_same_vulnerability1(): + vul1 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULNERABILITY_KWARGS) + vul2 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULNERABILITY_KWARGS) + env = stix2.Environment().semantically_equivalent(vul1, vul2) + assert round(env) == 100 + + +def test_semantic_equivalence_on_same_vulnerability2(): + VULN_KWARGS1 = dict( + name="Heartbleed", + external_references=[ + { + "url": "https://example", + "source_name": "some-source", + }, + ], + ) + VULN_KWARGS2 = dict( + name="Zot", + external_references=[ + { + "url": "https://example2", + "source_name": "some-source2", + }, + ], + ) + vul1 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULN_KWARGS1) + vul2 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULN_KWARGS2) + env = stix2.Environment().semantically_equivalent(vul1, vul2) + assert round(env) == 0.0 + + +def test_semantic_equivalence_on_unknown_object(): + CUSTOM_KWARGS1 = dict( + 
type="x-foobar", + id="x-foobar--0c7b5b88-8ff7-4a4d-aa9d-feb398cd0061", + name="Heartbleed", + external_references=[ + { + "url": "https://example", + "source_name": "some-source", + }, + ], + ) + CUSTOM_KWARGS2 = dict( + type="x-foobar", + id="x-foobar--0c7b5b88-8ff7-4a4d-aa9d-feb398cd0061", + name="Zot", + external_references=[ + { + "url": "https://example2", + "source_name": "some-source2", + }, + ], + ) + + def _x_foobar_checks(obj1, obj2, **weights): + matching_score = 0.0 + sum_weights = 0.0 + if stix2.environment.check_property_present("external_references", obj1, obj2): + w = weights["external_references"] + sum_weights += w + matching_score += w * stix2.environment.partial_external_reference_based( + obj1["external_references"], + obj2["external_references"], + ) + if stix2.environment.check_property_present("name", obj1, obj2): + w = weights["name"] + sum_weights += w + matching_score += w * stix2.environment.partial_string_based(obj1["name"], obj2["name"]) + return matching_score, sum_weights + + weights = { + "x-foobar": { + "external_references": 40, + "name": 60, + "method": _x_foobar_checks, + }, + "_internal": { + "ignore_spec_version": False, + }, + } + cust1 = stix2.parse(CUSTOM_KWARGS1, allow_custom=True) + cust2 = stix2.parse(CUSTOM_KWARGS2, allow_custom=True) + env = stix2.Environment().semantically_equivalent(cust1, cust2, **weights) + assert round(env) == 0 + + +def test_semantic_equivalence_different_type_raises(): + with pytest.raises(ValueError) as excinfo: + vul1 = stix2.v21.Vulnerability(id=VULNERABILITY_ID, **VULNERABILITY_KWARGS) + ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) + stix2.Environment().semantically_equivalent(vul1, ind1) + + assert str(excinfo.value) == "The objects to compare must be of the same type!" 
+ + +def test_semantic_equivalence_different_spec_version_raises(): + with pytest.raises(ValueError) as excinfo: + V20_KWARGS = dict( + labels=['malicious-activity'], + pattern="[file:hashes.MD5 = 'd41d8cd98f00b204e9800998ecf8427e']", + ) + ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) + ind2 = stix2.v20.Indicator(id=INDICATOR_ID, **V20_KWARGS) + stix2.Environment().semantically_equivalent(ind1, ind2) + + assert str(excinfo.value) == "The objects to compare must be of the same spec version!" + + +def test_semantic_equivalence_zero_match(): + IND_KWARGS = dict( + indicator_types=["APTX"], + pattern="[ipv4-addr:value = '192.168.1.1']", + pattern_type="stix", + valid_from="2019-01-01T12:34:56Z", + ) + weights = { + "indicator": { + "indicator_types": 15, + "pattern": 80, + "valid_from": 0, + "tdelta": 1, # One day interval + "method": stix2.environment._indicator_checks, + }, + "_internal": { + "ignore_spec_version": False, + }, + } + ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) + ind2 = stix2.v21.Indicator(id=INDICATOR_ID, **IND_KWARGS) + env = stix2.Environment().semantically_equivalent(ind1, ind2, **weights) + assert round(env) == 0 + + +def test_semantic_equivalence_different_spec_version(): + IND_KWARGS = dict( + labels=["APTX"], + pattern="[ipv4-addr:value = '192.168.1.1']", + ) + weights = { + "indicator": { + "indicator_types": 15, + "pattern": 80, + "valid_from": 0, + "tdelta": 1, # One day interval + "method": stix2.environment._indicator_checks, + }, + "_internal": { + "ignore_spec_version": True, # Disables spec_version check. 
+ }, + } + ind1 = stix2.v21.Indicator(id=INDICATOR_ID, **INDICATOR_KWARGS) + ind2 = stix2.v20.Indicator(id=INDICATOR_ID, **IND_KWARGS) + env = stix2.Environment().semantically_equivalent(ind1, ind2, **weights) + assert round(env) == 0 + + +@pytest.mark.parametrize( + "refs1,refs2,ret_val", [ + ( + [ + { + "url": "https://attack.mitre.org/techniques/T1150", + "source_name": "mitre-attack", + "external_id": "T1150", + }, + { + "url": "https://researchcenter.paloaltonetworks.com/2016/09/unit42-sofacys-komplex-os-x-trojan/", + "source_name": "Sofacy Komplex Trojan", + "description": "Dani Creus, Tyler Halfpop, Robert Falcone. (2016, September 26). Sofacy's 'Komplex' OS X Trojan. Retrieved July 8, 2017.", + }, + ], + [ + { + "url": "https://attack.mitre.org/techniques/T1129", + "source_name": "mitre-attack", + "external_id": "T1129", + }, + { + "url": "https://en.wikipedia.org/wiki/Microsoft_Windows_library_files", + "source_name": "Wikipedia Windows Library Files", + "description": "Wikipedia. (2017, January 31). Microsoft Windows library files. Retrieved February 13, 2017.", + }, + ], + 0.0, + ), + ( + [ + { + "url": "https://attack.mitre.org/techniques/T1129", + "source_name": "mitre-attack", + "external_id": "T1129", + }, + ], + [ + { + "url": "https://attack.mitre.org/techniques/T1129", + "source_name": "mitre-attack", + "external_id": "T1129", + }, + { + "url": "https://en.wikipedia.org/wiki/Microsoft_Windows_library_files", + "source_name": "Wikipedia Windows Library Files", + "description": "Wikipedia. (2017, January 31). Microsoft Windows library files. 
Retrieved February 13, 2017.", + }, + ], + 1.0, + ), + ( + [ + { + "url": "https://example", + "source_name": "some-source", + }, + ], + [ + { + "url": "https://example", + "source_name": "some-source", + }, + ], + 1.0, + ), + ], +) +def test_semantic_equivalence_external_references(refs1, refs2, ret_val): + value = stix2.environment.partial_external_reference_based(refs1, refs2) + assert value == ret_val + + +def test_semantic_equivalence_timestamp(): + t1 = "2018-10-17T00:14:20.652Z" + t2 = "2018-10-17T12:14:20.652Z" + assert stix2.environment.partial_timestamp_based(t1, t2, 1) == 0.5 + + +def test_semantic_equivalence_exact_match(): + t1 = "2018-10-17T00:14:20.652Z" + t2 = "2018-10-17T12:14:20.652Z" + assert stix2.environment.exact_match(t1, t2) == 0.0 + + +def test_non_existent_config_for_object(): + r1 = stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS) + r2 = stix2.v21.Report(id=REPORT_ID, **REPORT_KWARGS) + assert stix2.Environment().semantically_equivalent(r1, r2) == 0.0 diff --git a/stix2/version.py b/stix2/version.py index 0b2f79d..c68196d 100644 --- a/stix2/version.py +++ b/stix2/version.py @@ -1 +1 @@ -__version__ = "1.1.3" +__version__ = "1.2.0" diff --git a/tox.ini b/tox.ini index 2225bae..d8b840f 100644 --- a/tox.ini +++ b/tox.ini @@ -9,6 +9,8 @@ deps = pytest-cov coverage taxii2-client + pyjarowinkler + haversine medallion commands = python -m pytest --cov=stix2 stix2/test/ --cov-report term-missing -W ignore::stix2.exceptions.STIXDeprecationWarning