""" Transformation utilities for STIX pattern comparison expressions. """ import functools import itertools from stix2.equivalence.pattern.compare import iter_in, iter_lex_cmp from stix2.equivalence.pattern.compare.comparison import ( comparison_expression_cmp, ) from stix2.equivalence.pattern.transform import Transformer from stix2.equivalence.pattern.transform.specials import ( ipv4_addr, ipv6_addr, windows_reg_key, ) from stix2.patterns import ( AndBooleanExpression, OrBooleanExpression, ParentheticalExpression, _BooleanExpression, _ComparisonExpression, ) def _dupe_ast(ast): """ Create a duplicate of the given AST. Note: The comparison expression "leaves", i.e. simple comparisons are currently not duplicated. I don't think it's necessary as of this writing; they are never changed. But revisit this if/when necessary. Args: ast: The AST to duplicate Returns: The duplicate AST """ if isinstance(ast, AndBooleanExpression): result = AndBooleanExpression([ _dupe_ast(operand) for operand in ast.operands ]) elif isinstance(ast, OrBooleanExpression): result = OrBooleanExpression([ _dupe_ast(operand) for operand in ast.operands ]) elif isinstance(ast, _ComparisonExpression): # Change this to create a dupe, if we ever need to change simple # comparison expressions as part of canonicalization. result = ast else: raise TypeError("Can't duplicate " + type(ast).__name__) return result class ComparisonExpressionTransformer(Transformer): """ Transformer base class with special support for transforming comparison expressions. The transform method implemented here performs a bottom-up in-place transformation, with support for some comparison expression-specific callbacks. Specifically, subclasses can implement methods: "transform_or" for OR nodes "transform_and" for AND nodes "transform_comparison" for plain comparison nodes ( ) "transform_default" for both types of nodes "transform_default" is a fallback, if a type-specific callback is not found. The default implementation does nothing to the AST. The type-specific callbacks are preferred over the default, if both exist. In all cases, the callbacks are called with an AST for a subtree rooted at the appropriate node type, where the subtree's children have already been transformed. They must return the same thing as the base transform() method: a 2-tuple with the transformed AST and a boolean for change detection. See doc for the superclass' method. This process currently silently drops parenthetical nodes. """ def transform(self, ast): if isinstance(ast, _BooleanExpression): changed = False for i, operand in enumerate(ast.operands): operand_result, this_changed = self.transform(operand) if this_changed: changed = True ast.operands[i] = operand_result result, this_changed = self.__dispatch_transform(ast) if this_changed: changed = True elif isinstance(ast, _ComparisonExpression): result, changed = self.__dispatch_transform(ast) elif isinstance(ast, ParentheticalExpression): # Drop these result, changed = self.transform(ast.expression) else: raise TypeError("Not a comparison expression: " + str(ast)) return result, changed def __dispatch_transform(self, ast): """ Invoke a transformer callback method based on the given ast root node type. Args: ast: The AST Returns: The callback's result """ if isinstance(ast, AndBooleanExpression): meth = getattr(self, "transform_and", self.transform_default) elif isinstance(ast, OrBooleanExpression): meth = getattr(self, "transform_or", self.transform_default) elif isinstance(ast, _ComparisonExpression): meth = getattr( self, "transform_comparison", self.transform_default, ) else: meth = self.transform_default return meth(ast) def transform_default(self, ast): """ Override to handle transforming AST nodes which don't have a more specific method implemented. """ return ast, False class OrderDedupeTransformer( ComparisonExpressionTransformer, ): """ Canonically order the children of all nodes in the AST. Because the deduping algorithm is based on sorted data, this transformation also does deduping. E.g.: A and A => A A or A => A """ def __transform(self, ast): """ Sort/dedupe children. AND and OR can be treated identically. Args: ast: The comparison expression AST Returns: The same AST node, but with sorted children """ sorted_children = sorted( ast.operands, key=functools.cmp_to_key(comparison_expression_cmp), ) deduped_children = [ # Apparently when using a key function, groupby()'s "keys" are the # key wrappers, not actual sequence values. Obviously we don't # need key wrappers in our ASTs! k.obj for k, _ in itertools.groupby( sorted_children, key=functools.cmp_to_key( comparison_expression_cmp, ), ) ] changed = iter_lex_cmp( ast.operands, deduped_children, comparison_expression_cmp, ) != 0 ast.operands = deduped_children return ast, changed def transform_or(self, ast): return self.__transform(ast) def transform_and(self, ast): return self.__transform(ast) class FlattenTransformer(ComparisonExpressionTransformer): """ Flatten all nodes of the AST. E.g.: A and (B and C) => A and B and C A or (B or C) => A or B or C (A) => A """ def __transform(self, ast): """ Flatten children. AND and OR can be treated mostly identically. The little difference is that we can absorb AND children if we're an AND ourselves; and OR for OR. Args: ast: The comparison expression AST Returns: The same AST node, but with flattened children """ changed = False if len(ast.operands) == 1: # Replace an AND/OR with one child, with the child itself. ast = ast.operands[0] changed = True else: flat_operands = [] for operand in ast.operands: if isinstance(operand, _BooleanExpression) \ and ast.operator == operand.operator: flat_operands.extend(operand.operands) changed = True else: flat_operands.append(operand) ast.operands = flat_operands return ast, changed def transform_or(self, ast): return self.__transform(ast) def transform_and(self, ast): return self.__transform(ast) class AbsorptionTransformer( ComparisonExpressionTransformer, ): """ Applies boolean "absorption" rules for AST simplification. E.g.: A and (A or B) = A A or (A and B) = A """ def __transform(self, ast): changed = False secondary_op = "AND" if ast.operator == "OR" else "OR" to_delete = set() # Check i (child1) against j to see if we can delete j. for i, child1 in enumerate(ast.operands): if i in to_delete: continue for j, child2 in enumerate(ast.operands): if i == j or j in to_delete: continue # We're checking if child1 is contained in child2, so # child2 has to be a compound object, not just a simple # comparison expression. We also require the right operator # for child2: "AND" if ast is "OR" and vice versa. if not isinstance(child2, _BooleanExpression) \ or child2.operator != secondary_op: continue # The simple check: is child1 contained in child2? if iter_in( child1, child2.operands, comparison_expression_cmp, ): to_delete.add(j) # A more complicated check: does child1 occur in child2 # in a "flattened" form? elif child1.operator == child2.operator: if all( iter_in( child1_operand, child2.operands, comparison_expression_cmp, ) for child1_operand in child1.operands ): to_delete.add(j) if to_delete: changed = True for i in reversed(sorted(to_delete)): del ast.operands[i] return ast, changed def transform_or(self, ast): return self.__transform(ast) def transform_and(self, ast): return self.__transform(ast) class DNFTransformer(ComparisonExpressionTransformer): """ Convert a comparison expression AST to DNF. E.g.: A and (B or C) => (A and B) or (A and C) """ def transform_and(self, ast): or_children = [] other_children = [] changed = False # Sort AND children into two piles: the ORs and everything else for child in ast.operands: if isinstance(child, _BooleanExpression) and child.operator == "OR": # Need a list of operand lists, so we can compute the # product below. or_children.append(child.operands) else: other_children.append(child) if or_children: distributed_children = [ AndBooleanExpression([ # Make dupes: distribution implies adding repetition, and # we should ensure each repetition is independent of the # others. _dupe_ast(sub_ast) for sub_ast in itertools.chain( other_children, prod_seq, ) ]) for prod_seq in itertools.product(*or_children) ] # Need to recursively continue to distribute AND over OR in # any of our new sub-expressions which need it. This causes # more downward recursion in the midst of this bottom-up transform. # It's not good for performance. I wonder if a top-down # transformation algorithm would make more sense in this phase? # But then we'd be using two different algorithms for the same # thing... Maybe this transform should be completely top-down # (no bottom-up component at all)? distributed_children = [ self.transform(child)[0] for child in distributed_children ] result = OrBooleanExpression(distributed_children) changed = True else: # No AND-over-OR; nothing to do result = ast return result, changed class SpecialValueCanonicalization(ComparisonExpressionTransformer): """ Try to find particular leaf-node comparison expressions whose rhs (i.e. the constant) can be canonicalized. This is an idiosyncratic transformation based on some ideas people had for context-sensitive semantic equivalence in constant values. """ def transform_comparison(self, ast): if ast.lhs.object_type_name == "windows-registry-key": windows_reg_key(ast) elif ast.lhs.object_type_name == "ipv4-addr": ipv4_addr(ast) elif ast.lhs.object_type_name == "ipv6-addr": ipv6_addr(ast) # Hard-code False here since this particular canonicalization is never # worth doing more than once. I think it's okay to pretend nothing has # changed. return ast, False