2020-08-11 00:33:26 +02:00
|
|
|
"""
|
|
|
|
Comparison utilities for STIX pattern comparison expressions.
|
|
|
|
"""
|
|
|
|
import base64
|
|
|
|
import functools
|
2020-08-13 23:44:42 +02:00
|
|
|
|
Graph Equivalence (#449)
* new packages for graph and object-based semantic equivalence
* new method graphically_equivalent for Environment, move equivalence methods out
* object equivalence function, methods used for object-based moved here.
* new graph_equivalence methods
* add notes
* add support for versioning checks (default disabled)
* new tests to cover graph equivalence and new methods
* added more imports to environment.py to prevent breaking changes
* variable changes, new fields for checks, reset depth check per call
* flexibility when object is not available on graph.
* refactor debug logging message
* new file stix2.equivalence.graph_equivalence.rst and stix2.equivalence.object_equivalence.rst for docs
* API documentation for new modules
* additional text required to build docs
* add more test methods for list_semantic_check an graphically_equivalent/versioning
* add logging debug messages, code clean-up
* include individual scoring on results dict, fix issue on list_semantic_check not keeping highest score
* include results as summary in prop_scores, minor tweaks
* Update __init__.py
doctrings update
* apply feedback from pull request
- rename semantic_check to reference_check
- rename modules to graph and object respectively to eliminate redundancy
- remove created_by_ref and object_marking_refs from graph WEIGHTS and rebalance
* update docs/ entries
* add more checks, make max score based on actual objects checked instead of the full list, only create entry when type is present in WEIGHTS dictionary
update tests to reflect changes
* rename package patterns -> pattern
* documentation, moving weights around
* more documentation moving
* rename WEIGHTS variable for graph_equivalence
2020-10-16 17:35:26 +02:00
|
|
|
from stix2.equivalence.pattern.compare import generic_cmp, iter_lex_cmp
|
2020-08-11 00:33:26 +02:00
|
|
|
from stix2.patterns import (
|
2020-08-13 23:44:42 +02:00
|
|
|
AndBooleanExpression, BinaryConstant, BooleanConstant, FloatConstant,
|
|
|
|
HexConstant, IntegerConstant, ListConstant, ListObjectPathComponent,
|
|
|
|
OrBooleanExpression, StringConstant, TimestampConstant,
|
|
|
|
_ComparisonExpression,
|
2020-08-11 00:33:26 +02:00
|
|
|
)
|
|
|
|
|
|
|
|
_COMPARISON_OP_ORDER = (
|
|
|
|
"=", "!=", "<>", "<", "<=", ">", ">=",
|
2020-08-13 23:44:42 +02:00
|
|
|
"IN", "LIKE", "MATCHES", "ISSUBSET", "ISSUPERSET",
|
2020-08-11 00:33:26 +02:00
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
_CONSTANT_TYPE_ORDER = (
|
|
|
|
# ints/floats come first, but have special handling since the types are
|
|
|
|
# treated equally as a generic "number" type. So they aren't in this list.
|
|
|
|
# See constant_cmp().
|
|
|
|
StringConstant, BooleanConstant,
|
2020-08-13 23:44:42 +02:00
|
|
|
TimestampConstant, HexConstant, BinaryConstant, ListConstant,
|
2020-08-11 00:33:26 +02:00
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
def generic_constant_cmp(const1, const2):
|
|
|
|
"""
|
|
|
|
Generic comparator for most _Constant instances. They must have a "value"
|
|
|
|
attribute whose value supports the builtin comparison operators.
|
|
|
|
|
2020-11-20 21:59:55 +01:00
|
|
|
Args:
|
|
|
|
const1: The first _Constant instance
|
|
|
|
const2: The second _Constant instance
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
<0, 0, or >0 depending on whether the first arg is less, equal or
|
2020-08-11 00:33:26 +02:00
|
|
|
greater than the second
|
|
|
|
"""
|
|
|
|
return generic_cmp(const1.value, const2.value)
|
|
|
|
|
|
|
|
|
|
|
|
def bool_cmp(value1, value2):
|
|
|
|
"""
|
|
|
|
Compare two boolean constants.
|
|
|
|
|
2020-11-20 21:59:55 +01:00
|
|
|
Args:
|
|
|
|
value1: The first BooleanConstant instance
|
|
|
|
value2: The second BooleanConstant instance
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
<0, 0, or >0 depending on whether the first arg is less, equal or
|
2020-08-11 00:33:26 +02:00
|
|
|
greater than the second
|
|
|
|
"""
|
|
|
|
|
|
|
|
# unwrap from _Constant instances
|
|
|
|
value1 = value1.value
|
|
|
|
value2 = value2.value
|
|
|
|
|
|
|
|
if (value1 and value2) or (not value1 and not value2):
|
|
|
|
result = 0
|
|
|
|
|
|
|
|
# Let's say... True < False?
|
|
|
|
elif value1:
|
|
|
|
result = -1
|
|
|
|
|
|
|
|
else:
|
|
|
|
result = 1
|
|
|
|
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
|
|
def hex_cmp(value1, value2):
|
|
|
|
"""
|
|
|
|
Compare two STIX "hex" values. This decodes to bytes and compares that.
|
|
|
|
It does *not* do a string compare on the hex representations.
|
|
|
|
|
2020-11-20 21:59:55 +01:00
|
|
|
Args:
|
|
|
|
value1: The first HexConstant
|
|
|
|
value2: The second HexConstant
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
<0, 0, or >0 depending on whether the first arg is less, equal or
|
2020-08-11 00:33:26 +02:00
|
|
|
greater than the second
|
|
|
|
"""
|
|
|
|
bytes1 = bytes.fromhex(value1.value)
|
|
|
|
bytes2 = bytes.fromhex(value2.value)
|
|
|
|
|
|
|
|
return generic_cmp(bytes1, bytes2)
|
|
|
|
|
|
|
|
|
|
|
|
def bin_cmp(value1, value2):
|
|
|
|
"""
|
|
|
|
Compare two STIX "binary" values. This decodes to bytes and compares that.
|
|
|
|
It does *not* do a string compare on the base64 representations.
|
|
|
|
|
2020-11-20 21:59:55 +01:00
|
|
|
Args:
|
|
|
|
value1: The first BinaryConstant
|
|
|
|
value2: The second BinaryConstant
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
<0, 0, or >0 depending on whether the first arg is less, equal or
|
2020-08-11 00:33:26 +02:00
|
|
|
greater than the second
|
|
|
|
"""
|
|
|
|
bytes1 = base64.standard_b64decode(value1.value)
|
|
|
|
bytes2 = base64.standard_b64decode(value2.value)
|
|
|
|
|
|
|
|
return generic_cmp(bytes1, bytes2)
|
|
|
|
|
|
|
|
|
|
|
|
def list_cmp(value1, value2):
|
|
|
|
"""
|
|
|
|
Compare lists order-insensitively.
|
|
|
|
|
2020-11-20 21:59:55 +01:00
|
|
|
Args:
|
|
|
|
value1: The first ListConstant
|
|
|
|
value2: The second ListConstant
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
<0, 0, or >0 depending on whether the first arg is less, equal or
|
2020-08-11 00:33:26 +02:00
|
|
|
greater than the second
|
|
|
|
"""
|
|
|
|
|
|
|
|
# Achieve order-independence by sorting the lists first.
|
|
|
|
sorted_value1 = sorted(
|
2020-08-13 23:44:42 +02:00
|
|
|
value1.value, key=functools.cmp_to_key(constant_cmp),
|
2020-08-11 00:33:26 +02:00
|
|
|
)
|
|
|
|
|
|
|
|
sorted_value2 = sorted(
|
2020-08-13 23:44:42 +02:00
|
|
|
value2.value, key=functools.cmp_to_key(constant_cmp),
|
2020-08-11 00:33:26 +02:00
|
|
|
)
|
|
|
|
|
|
|
|
result = iter_lex_cmp(sorted_value1, sorted_value2, constant_cmp)
|
|
|
|
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
|
|
_CONSTANT_COMPARATORS = {
|
|
|
|
# We have special handling for ints/floats, so no entries for those AST
|
|
|
|
# classes here. See constant_cmp().
|
|
|
|
StringConstant: generic_constant_cmp,
|
|
|
|
BooleanConstant: bool_cmp,
|
|
|
|
TimestampConstant: generic_constant_cmp,
|
|
|
|
HexConstant: hex_cmp,
|
|
|
|
BinaryConstant: bin_cmp,
|
2020-08-13 23:44:42 +02:00
|
|
|
ListConstant: list_cmp,
|
2020-08-11 00:33:26 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
def object_path_component_cmp(comp1, comp2):
|
|
|
|
"""
|
|
|
|
Compare a string/int to another string/int; this induces an ordering over
|
|
|
|
all strings and ints. It is used to perform a lexicographical sort on
|
|
|
|
object paths.
|
|
|
|
|
|
|
|
Ints and strings compare as usual to each other; ints compare less than
|
|
|
|
strings.
|
|
|
|
|
2020-11-20 21:59:55 +01:00
|
|
|
Args:
|
|
|
|
comp1: An object path component (string or int)
|
|
|
|
comp2: An object path component (string or int)
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
<0, 0, or >0 depending on whether the first arg is less, equal or
|
2020-08-11 00:33:26 +02:00
|
|
|
greater than the second
|
|
|
|
"""
|
|
|
|
|
|
|
|
# both ints or both strings: use builtin comparison operators
|
|
|
|
if (isinstance(comp1, int) and isinstance(comp2, int)) \
|
|
|
|
or (isinstance(comp1, str) and isinstance(comp2, str)):
|
|
|
|
result = generic_cmp(comp1, comp2)
|
|
|
|
|
|
|
|
# one is int, one is string. Let's say ints come before strings.
|
|
|
|
elif isinstance(comp1, int):
|
|
|
|
result = -1
|
|
|
|
|
|
|
|
else:
|
|
|
|
result = 1
|
|
|
|
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
|
|
def object_path_to_raw_values(path):
|
|
|
|
"""
|
|
|
|
Converts the given ObjectPath instance to a list of strings and ints.
|
|
|
|
All property names become strings, regardless of whether they're *_ref
|
|
|
|
properties; "*" index steps become that string; and numeric index steps
|
|
|
|
become integers.
|
|
|
|
|
2020-11-20 21:59:55 +01:00
|
|
|
Args:
|
|
|
|
path: An ObjectPath instance
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
A generator iterator over the values
|
2020-08-11 00:33:26 +02:00
|
|
|
"""
|
|
|
|
|
|
|
|
for comp in path.property_path:
|
|
|
|
if isinstance(comp, ListObjectPathComponent):
|
|
|
|
yield comp.property_name
|
|
|
|
|
|
|
|
if comp.index == "*" or isinstance(comp.index, int):
|
|
|
|
yield comp.index
|
|
|
|
else:
|
|
|
|
# in case the index is a stringified int; convert to an actual
|
|
|
|
# int
|
|
|
|
yield int(comp.index)
|
|
|
|
|
|
|
|
else:
|
|
|
|
yield comp.property_name
|
|
|
|
|
|
|
|
|
|
|
|
def object_path_cmp(path1, path2):
|
|
|
|
"""
|
|
|
|
Compare two object paths.
|
|
|
|
|
2020-11-20 21:59:55 +01:00
|
|
|
Args:
|
|
|
|
path1: The first ObjectPath instance
|
|
|
|
path2: The second ObjectPath instance
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
<0, 0, or >0 depending on whether the first arg is less, equal or
|
2020-08-11 00:33:26 +02:00
|
|
|
greater than the second
|
|
|
|
"""
|
|
|
|
if path1.object_type_name < path2.object_type_name:
|
|
|
|
result = -1
|
|
|
|
|
|
|
|
elif path1.object_type_name > path2.object_type_name:
|
|
|
|
result = 1
|
|
|
|
|
|
|
|
else:
|
|
|
|
# I always thought of key and index path steps as separate. The AST
|
|
|
|
# lumps indices in with the previous key as a single path component.
|
|
|
|
# The following splits the path components into individual comparable
|
|
|
|
# values again. Maybe I should not do this...
|
|
|
|
path_vals1 = object_path_to_raw_values(path1)
|
|
|
|
path_vals2 = object_path_to_raw_values(path2)
|
|
|
|
result = iter_lex_cmp(
|
2020-08-13 23:44:42 +02:00
|
|
|
path_vals1, path_vals2, object_path_component_cmp,
|
2020-08-11 00:33:26 +02:00
|
|
|
)
|
|
|
|
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
|
|
def comparison_operator_cmp(op1, op2):
|
|
|
|
"""
|
|
|
|
Compare two comparison operators.
|
|
|
|
|
2020-11-20 21:59:55 +01:00
|
|
|
Args:
|
|
|
|
op1: The first comparison operator (a string)
|
|
|
|
op2: The second comparison operator (a string)
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
<0, 0, or >0 depending on whether the first arg is less, equal or
|
2020-08-11 00:33:26 +02:00
|
|
|
greater than the second
|
|
|
|
"""
|
|
|
|
op1_idx = _COMPARISON_OP_ORDER.index(op1)
|
|
|
|
op2_idx = _COMPARISON_OP_ORDER.index(op2)
|
|
|
|
|
|
|
|
result = generic_cmp(op1_idx, op2_idx)
|
|
|
|
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
|
|
def constant_cmp(value1, value2):
|
|
|
|
"""
|
|
|
|
Compare two constants.
|
|
|
|
|
2020-11-20 21:59:55 +01:00
|
|
|
Args:
|
|
|
|
value1: The first _Constant instance
|
|
|
|
value2: The second _Constant instance
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
<0, 0, or >0 depending on whether the first arg is less, equal or
|
2020-08-11 00:33:26 +02:00
|
|
|
greater than the second
|
|
|
|
"""
|
|
|
|
|
|
|
|
# Special handling for ints/floats: treat them generically as numbers,
|
|
|
|
# ordered before all other types.
|
|
|
|
if isinstance(value1, (IntegerConstant, FloatConstant)) \
|
|
|
|
and isinstance(value2, (IntegerConstant, FloatConstant)):
|
|
|
|
result = generic_constant_cmp(value1, value2)
|
|
|
|
|
|
|
|
elif isinstance(value1, (IntegerConstant, FloatConstant)):
|
|
|
|
result = -1
|
|
|
|
|
|
|
|
elif isinstance(value2, (IntegerConstant, FloatConstant)):
|
|
|
|
result = 1
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
type1 = type(value1)
|
|
|
|
type2 = type(value2)
|
|
|
|
|
|
|
|
type1_idx = _CONSTANT_TYPE_ORDER.index(type1)
|
|
|
|
type2_idx = _CONSTANT_TYPE_ORDER.index(type2)
|
|
|
|
|
|
|
|
result = generic_cmp(type1_idx, type2_idx)
|
|
|
|
if result == 0:
|
|
|
|
# Types are the same; must compare values
|
|
|
|
cmp_func = _CONSTANT_COMPARATORS.get(type1)
|
|
|
|
if not cmp_func:
|
|
|
|
raise TypeError("Don't know how to compare " + type1.__name__)
|
|
|
|
|
|
|
|
result = cmp_func(value1, value2)
|
|
|
|
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
|
|
def simple_comparison_expression_cmp(expr1, expr2):
|
|
|
|
"""
|
|
|
|
Compare "simple" comparison expressions: those which aren't AND/OR
|
|
|
|
combinations, just <path> <op> <value> comparisons.
|
|
|
|
|
2020-11-20 21:59:55 +01:00
|
|
|
Args:
|
|
|
|
expr1: first _ComparisonExpression instance
|
|
|
|
expr2: second _ComparisonExpression instance
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
<0, 0, or >0 depending on whether the first arg is less, equal or
|
2020-08-11 00:33:26 +02:00
|
|
|
greater than the second
|
|
|
|
"""
|
|
|
|
|
|
|
|
result = object_path_cmp(expr1.lhs, expr2.lhs)
|
|
|
|
|
|
|
|
if result == 0:
|
|
|
|
result = comparison_operator_cmp(expr1.operator, expr2.operator)
|
|
|
|
|
|
|
|
if result == 0:
|
|
|
|
# _ComparisonExpression's have a "negated" attribute. Umm...
|
|
|
|
# non-negated < negated?
|
|
|
|
if not expr1.negated and expr2.negated:
|
|
|
|
result = -1
|
|
|
|
elif expr1.negated and not expr2.negated:
|
|
|
|
result = 1
|
|
|
|
|
|
|
|
if result == 0:
|
|
|
|
result = constant_cmp(expr1.rhs, expr2.rhs)
|
|
|
|
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
|
|
def comparison_expression_cmp(expr1, expr2):
|
|
|
|
"""
|
|
|
|
Compare two comparison expressions. This is sensitive to the order of the
|
|
|
|
expressions' sub-components. To achieve an order-insensitive comparison,
|
2021-02-12 01:33:57 +01:00
|
|
|
the sub-component ASTs must be ordered first.
|
2020-08-11 00:33:26 +02:00
|
|
|
|
2020-11-20 21:59:55 +01:00
|
|
|
Args:
|
|
|
|
expr1: The first comparison expression
|
|
|
|
expr2: The second comparison expression
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
<0, 0, or >0 depending on whether the first arg is less, equal or
|
2020-08-11 00:33:26 +02:00
|
|
|
greater than the second
|
|
|
|
"""
|
|
|
|
if isinstance(expr1, _ComparisonExpression) \
|
|
|
|
and isinstance(expr2, _ComparisonExpression):
|
|
|
|
result = simple_comparison_expression_cmp(expr1, expr2)
|
|
|
|
|
|
|
|
# One is simple, one is compound. Let's say... simple ones come first?
|
|
|
|
elif isinstance(expr1, _ComparisonExpression):
|
|
|
|
result = -1
|
|
|
|
|
|
|
|
elif isinstance(expr2, _ComparisonExpression):
|
|
|
|
result = 1
|
|
|
|
|
|
|
|
# Both are compound: AND's before OR's?
|
|
|
|
elif isinstance(expr1, AndBooleanExpression) \
|
|
|
|
and isinstance(expr2, OrBooleanExpression):
|
|
|
|
result = -1
|
|
|
|
|
|
|
|
elif isinstance(expr1, OrBooleanExpression) \
|
|
|
|
and isinstance(expr2, AndBooleanExpression):
|
|
|
|
result = 1
|
|
|
|
|
|
|
|
else:
|
|
|
|
# Both compound, same boolean operator: sort according to contents.
|
|
|
|
# This will order according to recursive invocations of this comparator,
|
|
|
|
# on sub-expressions.
|
|
|
|
result = iter_lex_cmp(
|
2020-08-13 23:44:42 +02:00
|
|
|
expr1.operands, expr2.operands, comparison_expression_cmp,
|
2020-08-11 00:33:26 +02:00
|
|
|
)
|
|
|
|
|
|
|
|
return result
|