cti-python-stix2/stix2/equivalence/pattern/compare/comparison.py

352 lines
11 KiB
Python
Raw Normal View History

"""
Comparison utilities for STIX pattern comparison expressions.
"""
import base64
import functools
2020-08-13 23:44:42 +02:00
Graph Equivalence (#449) * new packages for graph and object-based semantic equivalence * new method graphically_equivalent for Environment, move equivalence methods out * object equivalence function, methods used for object-based moved here. * new graph_equivalence methods * add notes * add support for versioning checks (default disabled) * new tests to cover graph equivalence and new methods * added more imports to environment.py to prevent breaking changes * variable changes, new fields for checks, reset depth check per call * flexibility when object is not available on graph. * refactor debug logging message * new file stix2.equivalence.graph_equivalence.rst and stix2.equivalence.object_equivalence.rst for docs * API documentation for new modules * additional text required to build docs * add more test methods for list_semantic_check an graphically_equivalent/versioning * add logging debug messages, code clean-up * include individual scoring on results dict, fix issue on list_semantic_check not keeping highest score * include results as summary in prop_scores, minor tweaks * Update __init__.py doctrings update * apply feedback from pull request - rename semantic_check to reference_check - rename modules to graph and object respectively to eliminate redundancy - remove created_by_ref and object_marking_refs from graph WEIGHTS and rebalance * update docs/ entries * add more checks, make max score based on actual objects checked instead of the full list, only create entry when type is present in WEIGHTS dictionary update tests to reflect changes * rename package patterns -> pattern * documentation, moving weights around * more documentation moving * rename WEIGHTS variable for graph_equivalence
2020-10-16 17:35:26 +02:00
from stix2.equivalence.pattern.compare import generic_cmp, iter_lex_cmp
from stix2.patterns import (
2020-08-13 23:44:42 +02:00
AndBooleanExpression, BinaryConstant, BooleanConstant, FloatConstant,
HexConstant, IntegerConstant, ListConstant, ListObjectPathComponent,
OrBooleanExpression, StringConstant, TimestampConstant,
_ComparisonExpression,
)
_COMPARISON_OP_ORDER = (
"=", "!=", "<>", "<", "<=", ">", ">=",
2020-08-13 23:44:42 +02:00
"IN", "LIKE", "MATCHES", "ISSUBSET", "ISSUPERSET",
)
_CONSTANT_TYPE_ORDER = (
# ints/floats come first, but have special handling since the types are
# treated equally as a generic "number" type. So they aren't in this list.
# See constant_cmp().
StringConstant, BooleanConstant,
2020-08-13 23:44:42 +02:00
TimestampConstant, HexConstant, BinaryConstant, ListConstant,
)
def generic_constant_cmp(const1, const2):
"""
Generic comparator for most _Constant instances. They must have a "value"
attribute whose value supports the builtin comparison operators.
:param const1: The first _Constant instance
:param const2: The second _Constant instance
:return: <0, 0, or >0 depending on whether the first arg is less, equal or
greater than the second
"""
return generic_cmp(const1.value, const2.value)
def bool_cmp(value1, value2):
"""
Compare two boolean constants.
:param value1: The first BooleanConstant instance
:param value2: The second BooleanConstant instance
:return: <0, 0, or >0 depending on whether the first arg is less, equal or
greater than the second
"""
# unwrap from _Constant instances
value1 = value1.value
value2 = value2.value
if (value1 and value2) or (not value1 and not value2):
result = 0
# Let's say... True < False?
elif value1:
result = -1
else:
result = 1
return result
def hex_cmp(value1, value2):
"""
Compare two STIX "hex" values. This decodes to bytes and compares that.
It does *not* do a string compare on the hex representations.
:param value1: The first HexConstant
:param value2: The second HexConstant
:return: <0, 0, or >0 depending on whether the first arg is less, equal or
greater than the second
"""
bytes1 = bytes.fromhex(value1.value)
bytes2 = bytes.fromhex(value2.value)
return generic_cmp(bytes1, bytes2)
def bin_cmp(value1, value2):
"""
Compare two STIX "binary" values. This decodes to bytes and compares that.
It does *not* do a string compare on the base64 representations.
:param value1: The first BinaryConstant
:param value2: The second BinaryConstant
:return: <0, 0, or >0 depending on whether the first arg is less, equal or
greater than the second
"""
bytes1 = base64.standard_b64decode(value1.value)
bytes2 = base64.standard_b64decode(value2.value)
return generic_cmp(bytes1, bytes2)
def list_cmp(value1, value2):
"""
Compare lists order-insensitively.
:param value1: The first ListConstant
:param value2: The second ListConstant
:return: <0, 0, or >0 depending on whether the first arg is less, equal or
greater than the second
"""
# Achieve order-independence by sorting the lists first.
sorted_value1 = sorted(
2020-08-13 23:44:42 +02:00
value1.value, key=functools.cmp_to_key(constant_cmp),
)
sorted_value2 = sorted(
2020-08-13 23:44:42 +02:00
value2.value, key=functools.cmp_to_key(constant_cmp),
)
result = iter_lex_cmp(sorted_value1, sorted_value2, constant_cmp)
return result
_CONSTANT_COMPARATORS = {
# We have special handling for ints/floats, so no entries for those AST
# classes here. See constant_cmp().
StringConstant: generic_constant_cmp,
BooleanConstant: bool_cmp,
TimestampConstant: generic_constant_cmp,
HexConstant: hex_cmp,
BinaryConstant: bin_cmp,
2020-08-13 23:44:42 +02:00
ListConstant: list_cmp,
}
def object_path_component_cmp(comp1, comp2):
"""
Compare a string/int to another string/int; this induces an ordering over
all strings and ints. It is used to perform a lexicographical sort on
object paths.
Ints and strings compare as usual to each other; ints compare less than
strings.
:param comp1: An object path component (string or int)
:param comp2: An object path component (string or int)
:return: <0, 0, or >0 depending on whether the first arg is less, equal or
greater than the second
"""
# both ints or both strings: use builtin comparison operators
if (isinstance(comp1, int) and isinstance(comp2, int)) \
or (isinstance(comp1, str) and isinstance(comp2, str)):
result = generic_cmp(comp1, comp2)
# one is int, one is string. Let's say ints come before strings.
elif isinstance(comp1, int):
result = -1
else:
result = 1
return result
def object_path_to_raw_values(path):
"""
Converts the given ObjectPath instance to a list of strings and ints.
All property names become strings, regardless of whether they're *_ref
properties; "*" index steps become that string; and numeric index steps
become integers.
:param path: An ObjectPath instance
:return: A generator iterator over the values
"""
for comp in path.property_path:
if isinstance(comp, ListObjectPathComponent):
yield comp.property_name
if comp.index == "*" or isinstance(comp.index, int):
yield comp.index
else:
# in case the index is a stringified int; convert to an actual
# int
yield int(comp.index)
else:
yield comp.property_name
def object_path_cmp(path1, path2):
"""
Compare two object paths.
:param path1: The first ObjectPath instance
:param path2: The second ObjectPath instance
:return: <0, 0, or >0 depending on whether the first arg is less, equal or
greater than the second
"""
if path1.object_type_name < path2.object_type_name:
result = -1
elif path1.object_type_name > path2.object_type_name:
result = 1
else:
# I always thought of key and index path steps as separate. The AST
# lumps indices in with the previous key as a single path component.
# The following splits the path components into individual comparable
# values again. Maybe I should not do this...
path_vals1 = object_path_to_raw_values(path1)
path_vals2 = object_path_to_raw_values(path2)
result = iter_lex_cmp(
2020-08-13 23:44:42 +02:00
path_vals1, path_vals2, object_path_component_cmp,
)
return result
def comparison_operator_cmp(op1, op2):
"""
Compare two comparison operators.
:param op1: The first comparison operator (a string)
:param op2: The second comparison operator (a string)
:return: <0, 0, or >0 depending on whether the first arg is less, equal or
greater than the second
"""
op1_idx = _COMPARISON_OP_ORDER.index(op1)
op2_idx = _COMPARISON_OP_ORDER.index(op2)
result = generic_cmp(op1_idx, op2_idx)
return result
def constant_cmp(value1, value2):
"""
Compare two constants.
:param value1: The first _Constant instance
:param value2: The second _Constant instance
:return: <0, 0, or >0 depending on whether the first arg is less, equal or
greater than the second
"""
# Special handling for ints/floats: treat them generically as numbers,
# ordered before all other types.
if isinstance(value1, (IntegerConstant, FloatConstant)) \
and isinstance(value2, (IntegerConstant, FloatConstant)):
result = generic_constant_cmp(value1, value2)
elif isinstance(value1, (IntegerConstant, FloatConstant)):
result = -1
elif isinstance(value2, (IntegerConstant, FloatConstant)):
result = 1
else:
type1 = type(value1)
type2 = type(value2)
type1_idx = _CONSTANT_TYPE_ORDER.index(type1)
type2_idx = _CONSTANT_TYPE_ORDER.index(type2)
result = generic_cmp(type1_idx, type2_idx)
if result == 0:
# Types are the same; must compare values
cmp_func = _CONSTANT_COMPARATORS.get(type1)
if not cmp_func:
raise TypeError("Don't know how to compare " + type1.__name__)
result = cmp_func(value1, value2)
return result
def simple_comparison_expression_cmp(expr1, expr2):
"""
Compare "simple" comparison expressions: those which aren't AND/OR
combinations, just <path> <op> <value> comparisons.
:param expr1: first _ComparisonExpression instance
:param expr2: second _ComparisonExpression instance
:return: <0, 0, or >0 depending on whether the first arg is less, equal or
greater than the second
"""
result = object_path_cmp(expr1.lhs, expr2.lhs)
if result == 0:
result = comparison_operator_cmp(expr1.operator, expr2.operator)
if result == 0:
# _ComparisonExpression's have a "negated" attribute. Umm...
# non-negated < negated?
if not expr1.negated and expr2.negated:
result = -1
elif expr1.negated and not expr2.negated:
result = 1
if result == 0:
result = constant_cmp(expr1.rhs, expr2.rhs)
return result
def comparison_expression_cmp(expr1, expr2):
"""
Compare two comparison expressions. This is sensitive to the order of the
expressions' sub-components. To achieve an order-insensitive comparison,
the ASTs must be canonically ordered first.
:param expr1: The first comparison expression
:param expr2: The second comparison expression
:return: <0, 0, or >0 depending on whether the first arg is less, equal or
greater than the second
"""
if isinstance(expr1, _ComparisonExpression) \
and isinstance(expr2, _ComparisonExpression):
result = simple_comparison_expression_cmp(expr1, expr2)
# One is simple, one is compound. Let's say... simple ones come first?
elif isinstance(expr1, _ComparisonExpression):
result = -1
elif isinstance(expr2, _ComparisonExpression):
result = 1
# Both are compound: AND's before OR's?
elif isinstance(expr1, AndBooleanExpression) \
and isinstance(expr2, OrBooleanExpression):
result = -1
elif isinstance(expr1, OrBooleanExpression) \
and isinstance(expr2, AndBooleanExpression):
result = 1
else:
# Both compound, same boolean operator: sort according to contents.
# This will order according to recursive invocations of this comparator,
# on sub-expressions.
result = iter_lex_cmp(
2020-08-13 23:44:42 +02:00
expr1.operands, expr2.operands, comparison_expression_cmp,
)
return result