"""
Transformation utilities for STIX pattern observation expressions.
"""

import functools
import itertools

from stix2.equivalence.pattern.compare import iter_in, iter_lex_cmp
from stix2.equivalence.pattern.compare.observation import (
observation_expression_cmp,
)
from stix2.equivalence.pattern.transform import (
ChainTransformer, SettleTransformer, Transformer,
)
from stix2.equivalence.pattern.transform.comparison import (
SpecialValueCanonicalization,
)
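# The comparison-expression transformers are imported under "C"-prefixed
# aliases to distinguish them from the observation-expression transformers
# of the same names defined in this module.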
from stix2.equivalence.pattern.transform.comparison import \
AbsorptionTransformer as CAbsorptionTransformer
from stix2.equivalence.pattern.transform.comparison import \
DNFTransformer as CDNFTransformer
from stix2.equivalence.pattern.transform.comparison import \
FlattenTransformer as CFlattenTransformer
from stix2.equivalence.pattern.transform.comparison import \
OrderDedupeTransformer as COrderDedupeTransformer
from stix2.patterns import (
AndObservationExpression, FollowedByObservationExpression,
ObservationExpression, OrObservationExpression, ParentheticalExpression,
QualifiedObservationExpression, _CompoundObservationExpression,
)


def _dupe_ast(ast):
    """
    Create a duplicate of the given AST. The AST root must be an observation
    expression of some kind (AND/OR/qualified, etc).

    Note: the observation expression "leaves", i.e. simple square-bracket
    observation expressions are currently not duplicated. I don't think it's
    necessary as of this writing. But revisit this if/when necessary.

    Args:
        ast: The AST to duplicate

    Returns:
        The duplicate AST
    """
if isinstance(ast, AndObservationExpression):
result = AndObservationExpression([
_dupe_ast(child) for child in ast.operands
])
elif isinstance(ast, OrObservationExpression):
result = OrObservationExpression([
_dupe_ast(child) for child in ast.operands
])
elif isinstance(ast, FollowedByObservationExpression):
result = FollowedByObservationExpression([
_dupe_ast(child) for child in ast.operands
])
elif isinstance(ast, QualifiedObservationExpression):
# Don't need to dupe the qualifier object at this point
result = QualifiedObservationExpression(
_dupe_ast(ast.observation_expression), ast.qualifier,
)
elif isinstance(ast, ObservationExpression):
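        # Leaf observation expressions are reused as-is; see the note in the
        # docstring above.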
result = ast
else:
raise TypeError("Can't duplicate " + type(ast).__name__)
return result


class ObservationExpressionTransformer(Transformer):
    """
    Transformer base class with special support for transforming observation
    expressions. The transform method implemented here performs a bottom-up
    in-place transformation, with support for some observation
    expression-specific callbacks. It recurses down as far as the "leaf node"
    observation expressions; it does not go inside of them, to the individual
    components of a comparison expression.

    Specifically, subclasses can implement methods:

        "transform_or" for OR nodes
        "transform_and" for AND nodes
        "transform_followedby" for FOLLOWEDBY nodes
        "transform_qualified" for qualified nodes (all qualifier types)
        "transform_observation" for "leaf" observation expression nodes
        "transform_default" for all types of nodes

    "transform_default" is a fallback, used if a type-specific callback is
    not found. The default implementation does nothing to the AST. A
    type-specific callback is preferred over the default if both exist.

    In all cases, a callback is called with an AST for a subtree rooted at
    the appropriate node type, whose children have already been transformed.
    It must return the same thing as the base transform() method: a 2-tuple
    of the transformed AST and a boolean for change detection. See the doc
    for the superclass method.

    This process currently silently drops parenthetical nodes.
    """

# Determines how AST node types map to callback method names
_DISPATCH_NAME_MAP = {
ObservationExpression: "observation",
AndObservationExpression: "and",
OrObservationExpression: "or",
FollowedByObservationExpression: "followedby",
QualifiedObservationExpression: "qualified",
}

    def transform(self, ast):
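        # Post-order traversal: a node's children are transformed (and
        # replaced in place) first, then the node itself is dispatched to a
        # type-specific callback.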
changed = False
if isinstance(ast, ObservationExpression):
# A "leaf node" for observation expressions. We don't recurse into
# these.
result, this_changed = self.__dispatch_transform(ast)
if this_changed:
changed = True
elif isinstance(ast, _CompoundObservationExpression):
for i, operand in enumerate(ast.operands):
result, this_changed = self.transform(operand)
if this_changed:
ast.operands[i] = result
changed = True
result, this_changed = self.__dispatch_transform(ast)
if this_changed:
changed = True
elif isinstance(ast, QualifiedObservationExpression):
            # The qualifier itself doesn't need to be transformed; only the
            # observation expression it applies to.
result, this_changed = self.transform(ast.observation_expression)
if this_changed:
ast.observation_expression = result
changed = True
result, this_changed = self.__dispatch_transform(ast)
if this_changed:
changed = True
elif isinstance(ast, ParentheticalExpression):
result, _ = self.transform(ast.expression)
            # Dropping the parenthetical node is itself a change.
            changed = True
else:
raise TypeError(
"Not an observation expression: {}: {}".format(
type(ast).__name__, str(ast),
),
)
return result, changed

    def __dispatch_transform(self, ast):
        """
        Invoke a transformer callback method based on the given ast root node
        type.

        Args:
            ast: The AST

        Returns:
            The callback's result
        """
dispatch_name = self._DISPATCH_NAME_MAP.get(type(ast))
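        # Prefer a type-specific "transform_<name>" callback if the subclass
        # defines one; otherwise fall back to transform_default().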
if dispatch_name:
meth_name = "transform_" + dispatch_name
meth = getattr(self, meth_name, self.transform_default)
else:
meth = self.transform_default
return meth(ast)

    def transform_default(self, ast):
        return ast, False


class FlattenTransformer(ObservationExpressionTransformer):
    """
    Flatten an observation expression AST. E.g.:

        A and (B and C) => A and B and C
        A or (B or C) => A or B or C
        A followedby (B followedby C) => A followedby B followedby C
        (A) => A
    """

    def __transform(self, ast):
changed = False
if len(ast.operands) == 1:
# Replace an AND/OR/FOLLOWEDBY with one child, with the child
# itself.
result = ast.operands[0]
changed = True
else:
flat_children = []
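            # Hoist the operands of any child that uses the same operator
            # (e.g. an AND nested directly under an AND) up into this node.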
for operand in ast.operands:
if isinstance(operand, _CompoundObservationExpression) \
and ast.operator == operand.operator:
flat_children.extend(operand.operands)
changed = True
else:
flat_children.append(operand)
ast.operands = flat_children
result = ast
return result, changed

    def transform_and(self, ast):
        return self.__transform(ast)

    def transform_or(self, ast):
        return self.__transform(ast)

    def transform_followedby(self, ast):
        return self.__transform(ast)


class OrderDedupeTransformer(
    ObservationExpressionTransformer,
):
    """
    Canonically order AND/OR expressions, and dedupe ORs. E.g.:

        A or A => A
        B or A => A or B
        B and A => A and B
    """

    def __transform(self, ast):
sorted_children = sorted(
ast.operands, key=functools.cmp_to_key(observation_expression_cmp),
)
# Deduping only applies to ORs
if ast.operator == "OR":
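            # Since the operands are sorted, equal expressions are adjacent,
            # so groupby() collapses each run of duplicates into one group.
            # The group keys are cmp_to_key() wrappers; their .obj attribute
            # recovers the wrapped AST node.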
deduped_children = [
key.obj for key, _ in itertools.groupby(
sorted_children, key=functools.cmp_to_key(
observation_expression_cmp,
),
)
]
else:
deduped_children = sorted_children
changed = iter_lex_cmp(
ast.operands, deduped_children, observation_expression_cmp,
) != 0
ast.operands = deduped_children
return ast, changed

    def transform_and(self, ast):
        return self.__transform(ast)

    def transform_or(self, ast):
        return self.__transform(ast)


class AbsorptionTransformer(
    ObservationExpressionTransformer,
):
    """
    Applies boolean "absorption" rules for observation expressions, for AST
    simplification:

        A or (A and B) = A
        A or (A followedby B) = A

    Other variants do not hold for observation expressions.
    """

    def __is_contained_and(self, exprs_containee, exprs_container):
        """
        Determine whether the "containee" expressions are contained in the
        "container" expressions, with AND semantics (order-independent but
        need distinct bindings). For example (with containee on left and
        container on right):

            (A and A and B) or (A and B and C)

        In the above, all of the lhs vars have a counterpart in the rhs, but
        there are two A's on the left and only one on the right. Therefore,
        the right does not "contain" the left. You would need two A's on the
        right.

        Args:
            exprs_containee: The expressions we want to check for containment
            exprs_container: The expressions acting as the "container"

        Returns:
            True if the containee is contained in the container; False if not
        """
# make our own list we are free to manipulate without affecting the
# function args.
container = list(exprs_container)
result = True
for ee in exprs_containee:
for i, er in enumerate(container):
if observation_expression_cmp(ee, er) == 0:
# Found a match in the container; delete it so we never try
# to match a container expr to two different containee
# expressions.
del container[i]
break
else:
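                # The for-else triggers when the inner loop exhausted the
                # container without finding a match for "ee": containment
                # fails.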
result = False
break
return result

    def __is_contained_followedby(self, exprs_containee, exprs_container):
        """
        Determine whether the "containee" expressions are contained in the
        "container" expressions, with FOLLOWEDBY semantics (order-sensitive
        and need distinct bindings). For example (with containee on left and
        container on right):

            (A followedby B) or (B followedby A)

        In the above, all of the lhs vars have a counterpart in the rhs, but
        the vars on the right are not in the same order. Therefore, the right
        does not "contain" the left. The container vars don't have to be
        contiguous though. E.g. in:

            (A followedby B) or (D followedby A followedby C followedby B)

        in the container (rhs), B follows A, so it "contains" the lhs even
        though there is other stuff mixed in.

        Args:
            exprs_containee: The expressions we want to check for containment
            exprs_container: The expressions acting as the "container"

        Returns:
            True if the containee is contained in the container; False if not
        """
ee_iter = iter(exprs_containee)
er_iter = iter(exprs_container)
result = True
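        # A subsequence check: for each containee expression, in order,
        # advance through the container until a match is found. Container
        # elements may be skipped, but never reused or reordered.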
while True:
ee = next(ee_iter, None)
if not ee:
break
while True:
er = next(er_iter, None)
if er:
if observation_expression_cmp(ee, er) == 0:
break
else:
break
if not er:
result = False
break
return result

    def transform_or(self, ast):
changed = False
to_delete = set()
for i, child1 in enumerate(ast.operands):
if i in to_delete:
continue
# The simplification doesn't work across qualifiers
if isinstance(child1, QualifiedObservationExpression):
continue
for j, child2 in enumerate(ast.operands):
if i == j or j in to_delete:
continue
if isinstance(
child2, (
AndObservationExpression,
FollowedByObservationExpression,
),
):
# The simple check: is child1 contained in child2?
if iter_in(
child1, child2.operands, observation_expression_cmp,
):
to_delete.add(j)
# A more complicated check: does child1 occur in child2
# in a "flattened" form?
elif type(child1) is type(child2):
if isinstance(child1, AndObservationExpression):
can_simplify = self.__is_contained_and(
child1.operands, child2.operands,
)
else: # child1 and 2 are followedby nodes
can_simplify = self.__is_contained_followedby(
child1.operands, child2.operands,
)
if can_simplify:
to_delete.add(j)
if to_delete:
changed = True
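            # Delete from the highest index to the lowest, so earlier indices
            # remain valid as operands are removed.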
for i in reversed(sorted(to_delete)):
del ast.operands[i]
return ast, changed


class DNFTransformer(ObservationExpressionTransformer):
    """
    Transform an observation expression to DNF. This will distribute AND and
    FOLLOWEDBY over OR:

        A and (B or C) => (A and B) or (A and C)
        A followedby (B or C) => (A followedby B) or (A followedby C)
    """

    def __transform(self, ast):
root_type = type(ast) # will be AST class for AND or FOLLOWEDBY
changed = False
or_children = []
other_children = []
for child in ast.operands:
if isinstance(child, OrObservationExpression):
or_children.append(child.operands)
else:
other_children.append(child)
if or_children:
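            # itertools.product() chooses one operand from each OR child;
            # each combination, together with all the non-OR children,
            # becomes one AND/FOLLOWEDBY term of the resulting OR. Sub-ASTs
            # are duplicated so the same node instance isn't shared across
            # terms.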
distributed_children = [
root_type([
_dupe_ast(sub_ast) for sub_ast in itertools.chain(
other_children, prod_seq,
)
])
for prod_seq in itertools.product(*or_children)
]
# Need to recursively continue to distribute AND/FOLLOWEDBY over OR
# in any of our new sub-expressions which need it.
distributed_children = [
self.transform(child)[0] for child in distributed_children
]
result = OrObservationExpression(distributed_children)
changed = True
else:
result = ast
return result, changed

    def transform_and(self, ast):
        return self.__transform(ast)

    def transform_followedby(self, ast):
        return self.__transform(ast)


class CanonicalizeComparisonExpressionsTransformer(
    ObservationExpressionTransformer,
):
    """
    Canonicalize all comparison expressions.
    """

    def __init__(self):
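        # Build the comparison expression canonicalization pipeline:
        # canonicalize special values, simplify (flatten/order/dedupe/absorb)
        # to a fixed point, convert to DNF, then simplify again to clean up
        # the expressions the DNF step produced.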
comp_flatten = CFlattenTransformer()
comp_order = COrderDedupeTransformer()
comp_absorb = CAbsorptionTransformer()
simplify = ChainTransformer(comp_flatten, comp_order, comp_absorb)
settle_simplify = SettleTransformer(simplify)
comp_special = SpecialValueCanonicalization()
comp_dnf = CDNFTransformer()
self.__comp_canonicalize = ChainTransformer(
comp_special, settle_simplify, comp_dnf, settle_simplify,
)

    def transform_observation(self, ast):
comp_expr = ast.operand
canon_comp_expr, changed = self.__comp_canonicalize.transform(comp_expr)
ast.operand = canon_comp_expr
return ast, changed