cti-python-stix2/stix2/equivalence/pattern/compare/comparison.py

388 lines
11 KiB
Python
Raw Normal View History

"""
Comparison utilities for STIX pattern comparison expressions.
"""
import base64
import functools
2020-08-13 23:44:42 +02:00
Graph Equivalence (#449) * new packages for graph and object-based semantic equivalence * new method graphically_equivalent for Environment, move equivalence methods out * object equivalence function, methods used for object-based moved here. * new graph_equivalence methods * add notes * add support for versioning checks (default disabled) * new tests to cover graph equivalence and new methods * added more imports to environment.py to prevent breaking changes * variable changes, new fields for checks, reset depth check per call * flexibility when object is not available on graph. * refactor debug logging message * new file stix2.equivalence.graph_equivalence.rst and stix2.equivalence.object_equivalence.rst for docs * API documentation for new modules * additional text required to build docs * add more test methods for list_semantic_check an graphically_equivalent/versioning * add logging debug messages, code clean-up * include individual scoring on results dict, fix issue on list_semantic_check not keeping highest score * include results as summary in prop_scores, minor tweaks * Update __init__.py doctrings update * apply feedback from pull request - rename semantic_check to reference_check - rename modules to graph and object respectively to eliminate redundancy - remove created_by_ref and object_marking_refs from graph WEIGHTS and rebalance * update docs/ entries * add more checks, make max score based on actual objects checked instead of the full list, only create entry when type is present in WEIGHTS dictionary update tests to reflect changes * rename package patterns -> pattern * documentation, moving weights around * more documentation moving * rename WEIGHTS variable for graph_equivalence
2020-10-16 17:35:26 +02:00
from stix2.equivalence.pattern.compare import generic_cmp, iter_lex_cmp
from stix2.patterns import (
2020-08-13 23:44:42 +02:00
AndBooleanExpression, BinaryConstant, BooleanConstant, FloatConstant,
HexConstant, IntegerConstant, ListConstant, ListObjectPathComponent,
OrBooleanExpression, StringConstant, TimestampConstant,
_ComparisonExpression,
)
# Sort rank of each comparison operator: an operator's position in this tuple
# is the value compared by comparison_operator_cmp().
_COMPARISON_OP_ORDER = (
    "=",
    "!=",
    "<>",
    "<",
    "<=",
    ">",
    ">=",
    "IN",
    "LIKE",
    "MATCHES",
    "ISSUBSET",
    "ISSUPERSET",
)
# Sort rank of each non-numeric constant type: a type's position in this tuple
# is the value compared by constant_cmp() when two constants differ in type.
#
# IntegerConstant and FloatConstant are deliberately absent.  constant_cmp()
# handles them specially: both are treated as one generic "number" type which
# orders before every type listed here.
_CONSTANT_TYPE_ORDER = (
    StringConstant,
    BooleanConstant,
    TimestampConstant,
    HexConstant,
    BinaryConstant,
    ListConstant,
)
def generic_constant_cmp(const1, const2):
    """
    Compare two _Constant instances by their wrapped values.

    Both arguments must expose a "value" attribute, and those values must
    support the builtin comparison operators.  The actual comparison is
    delegated to generic_cmp().

    Args:
        const1: The first _Constant instance
        const2: The second _Constant instance

    Returns:
        <0, 0, or >0 depending on whether the first arg is less, equal or
        greater than the second
    """
    # Unwrap first, then hand the raw values to the generic comparator.
    lhs, rhs = const1.value, const2.value
    return generic_cmp(lhs, rhs)
def bool_cmp(value1, value2):
    """
    Compare two boolean constants.

    The ordering is an arbitrary but fixed choice: True sorts before False.

    Args:
        value1: The first BooleanConstant instance
        value2: The second BooleanConstant instance

    Returns:
        <0, 0, or >0 depending on whether the first arg is less, equal or
        greater than the second
    """
    # Unwrap from the _Constant instances and reduce to plain truthiness.
    first = bool(value1.value)
    second = bool(value2.value)

    if first is second:
        return 0

    # The truth values differ, so exactly one is True; True < False here.
    return -1 if first else 1
def hex_cmp(value1, value2):
    """
    Compare two STIX "hex" values by the bytes they encode.

    Each constant's value is decoded with bytes.fromhex() and the resulting
    byte strings are compared; the hex text itself is *not* compared.

    Args:
        value1: The first HexConstant
        value2: The second HexConstant

    Returns:
        <0, 0, or >0 depending on whether the first arg is less, equal or
        greater than the second
    """
    # Decode in argument order (first, then second).
    decoded = [bytes.fromhex(const.value) for const in (value1, value2)]
    return generic_cmp(*decoded)
def bin_cmp(value1, value2):
    """
    Compare two STIX "binary" values by the bytes they encode.

    Each constant's value is decoded with base64.standard_b64decode() and the
    resulting byte strings are compared; the base64 text itself is *not*
    compared.

    Args:
        value1: The first BinaryConstant
        value2: The second BinaryConstant

    Returns:
        <0, 0, or >0 depending on whether the first arg is less, equal or
        greater than the second
    """
    # Decode in argument order (first, then second).
    decoded = [
        base64.standard_b64decode(const.value) for const in (value1, value2)
    ]
    return generic_cmp(*decoded)
def list_cmp(value1, value2):
    """
    Compare two list constants without regard to element order.

    Args:
        value1: The first ListConstant
        value2: The second ListConstant

    Returns:
        <0, 0, or >0 depending on whether the first arg is less, equal or
        greater than the second
    """
    # Order-independence: put both lists into the same sorted order (as
    # defined by constant_cmp) and then compare them lexicographically.
    sort_key = functools.cmp_to_key(constant_cmp)
    ordered1 = sorted(value1.value, key=sort_key)
    ordered2 = sorted(value2.value, key=sort_key)

    return iter_lex_cmp(ordered1, ordered2, constant_cmp)
# Maps a constant AST class to the comparator used by constant_cmp() when both
# constants are of that class.
#
# IntegerConstant and FloatConstant have no entries: constant_cmp() handles
# numbers separately.
_CONSTANT_COMPARATORS = {
    # Compared directly on their "value" attributes.
    StringConstant: generic_constant_cmp,
    TimestampConstant: generic_constant_cmp,

    # Need type-specific handling.
    BooleanConstant: bool_cmp,
    HexConstant: hex_cmp,
    BinaryConstant: bin_cmp,
    ListConstant: list_cmp,
}
def object_path_component_cmp(comp1, comp2):
    """
    Compare two object path components, each a string or an int.

    This defines a single ordering across all strings and ints so that object
    paths can be sorted lexicographically.  Two ints, or two strings, compare
    as they normally would; when the types differ, the int sorts before the
    string.

    Args:
        comp1: An object path component (string or int)
        comp2: An object path component (string or int)

    Returns:
        <0, 0, or >0 depending on whether the first arg is less, equal or
        greater than the second
    """
    first_is_int = isinstance(comp1, int)

    # Same kind on both sides: defer to the builtin comparison operators.
    if first_is_int and isinstance(comp2, int):
        return generic_cmp(comp1, comp2)
    if isinstance(comp1, str) and isinstance(comp2, str):
        return generic_cmp(comp1, comp2)

    # Mixed kinds: ints come before strings.
    return -1 if first_is_int else 1
def object_path_to_raw_values(path):
    """
    Flatten an ObjectPath into a stream of plain strings and ints.

    Every component contributes its property name (no special treatment for
    *_ref properties).  A ListObjectPathComponent additionally contributes its
    index step: the string "*" is passed through unchanged, an int is passed
    through unchanged, and anything else is converted with int().

    Args:
        path: An ObjectPath instance

    Returns:
        A generator iterator over the values
    """
    for step in path.property_path:
        has_index = isinstance(step, ListObjectPathComponent)

        yield step.property_name

        if has_index:
            idx = step.index
            if idx == "*" or isinstance(idx, int):
                yield idx
            else:
                # The index may be a stringified int; normalize it to a real
                # int.
                yield int(idx)
def object_path_cmp(path1, path2):
    """
    Compare two object paths.

    The object type names are compared first.  Only when they are equal are
    the property paths compared, lexicographically, step by step.

    Args:
        path1: The first ObjectPath instance
        path2: The second ObjectPath instance

    Returns:
        <0, 0, or >0 depending on whether the first arg is less, equal or
        greater than the second
    """
    type_name1 = path1.object_type_name
    type_name2 = path2.object_type_name

    if type_name1 < type_name2:
        return -1
    if type_name1 > type_name2:
        return 1

    # The AST bundles an index step together with the preceding key into one
    # path component.  For comparison purposes the two are split back apart,
    # so that keys and indices are compared as individual values.
    return iter_lex_cmp(
        object_path_to_raw_values(path1),
        object_path_to_raw_values(path2),
        object_path_component_cmp,
    )
def comparison_operator_cmp(op1, op2):
    """
    Compare two comparison operators by their position in
    _COMPARISON_OP_ORDER.

    An operator which is not present in _COMPARISON_OP_ORDER causes the
    underlying tuple.index() lookup to raise ValueError.

    Args:
        op1: The first comparison operator (a string)
        op2: The second comparison operator (a string)

    Returns:
        <0, 0, or >0 depending on whether the first arg is less, equal or
        greater than the second
    """
    rank_of = _COMPARISON_OP_ORDER.index
    return generic_cmp(rank_of(op1), rank_of(op2))
def constant_cmp(value1, value2):
    """
    Compare two constants.

    Integer and float constants are treated as a single "number" kind, which
    orders before every other kind.  Other constants are ordered first by
    their type's position in _CONSTANT_TYPE_ORDER, and then, for equal types,
    by the type-specific comparator from _CONSTANT_COMPARATORS.

    Args:
        value1: The first _Constant instance
        value2: The second _Constant instance

    Returns:
        <0, 0, or >0 depending on whether the first arg is less, equal or
        greater than the second

    Raises:
        TypeError: If the two constants have the same type and no comparator
            is registered for it.
    """
    # NOTE(review): a constant whose exact type is not in _CONSTANT_TYPE_ORDER
    # makes tuple.index() raise ValueError below, before the TypeError check
    # is reached.
    number_types = (IntegerConstant, FloatConstant)
    first_is_number = isinstance(value1, number_types)
    second_is_number = isinstance(value2, number_types)

    # Numbers: compare by value among themselves, and sort ahead of
    # everything else.
    if first_is_number and second_is_number:
        return generic_constant_cmp(value1, value2)
    if first_is_number:
        return -1
    if second_is_number:
        return 1

    kind1 = type(value1)
    kind2 = type(value2)

    order = generic_cmp(
        _CONSTANT_TYPE_ORDER.index(kind1),
        _CONSTANT_TYPE_ORDER.index(kind2),
    )
    if order != 0:
        return order

    # Types are the same; must compare values.
    comparator = _CONSTANT_COMPARATORS.get(kind1)
    if not comparator:
        raise TypeError("Don't know how to compare " + kind1.__name__)

    return comparator(value1, value2)
def simple_comparison_expression_cmp(expr1, expr2):
    """
    Compare two "simple" comparison expressions, i.e. plain
    <path> <op> <value> comparisons rather than AND/OR combinations.

    The expressions are compared field by field, and the first difference
    found decides the result: object path, then operator, then the "negated"
    flag (non-negated sorts before negated), then the right-hand constant.

    Args:
        expr1: first _ComparisonExpression instance
        expr2: second _ComparisonExpression instance

    Returns:
        <0, 0, or >0 depending on whether the first arg is less, equal or
        greater than the second
    """
    order = object_path_cmp(expr1.lhs, expr2.lhs)
    if order != 0:
        return order

    order = comparison_operator_cmp(expr1.operator, expr2.operator)
    if order != 0:
        return order

    # Arbitrary but fixed choice: non-negated < negated.
    if expr2.negated and not expr1.negated:
        return -1
    if expr1.negated and not expr2.negated:
        return 1

    return constant_cmp(expr1.rhs, expr2.rhs)
def comparison_expression_cmp(expr1, expr2):
    """
    Compare two comparison expressions, simple or compound.

    The result depends on the order of each expression's sub-components.  For
    an order-insensitive comparison, the ASTs must be put into canonical
    order beforehand.

    Args:
        expr1: The first comparison expression
        expr2: The second comparison expression

    Returns:
        <0, 0, or >0 depending on whether the first arg is less, equal or
        greater than the second
    """
    first_is_simple = isinstance(expr1, _ComparisonExpression)
    second_is_simple = isinstance(expr2, _ComparisonExpression)

    if first_is_simple and second_is_simple:
        return simple_comparison_expression_cmp(expr1, expr2)

    # One simple, one compound: the simple expression sorts first.
    if first_is_simple:
        return -1
    if second_is_simple:
        return 1

    # Both compound with different boolean operators: ANDs sort before ORs.
    if isinstance(expr1, AndBooleanExpression) \
            and isinstance(expr2, OrBooleanExpression):
        return -1
    if isinstance(expr1, OrBooleanExpression) \
            and isinstance(expr2, AndBooleanExpression):
        return 1

    # Both compound, same boolean operator: order by contents, recursing
    # into the sub-expressions with this same comparator.
    return iter_lex_cmp(
        expr1.operands, expr2.operands, comparison_expression_cmp,
    )