# cti-python-stix2/stix2/equivalence/graph/__init__.py

import logging

from ..object import (
    WEIGHTS, exact_match, list_reference_check, partial_string_based,
    partial_timestamp_based, reference_check, semantically_equivalent,
)

logger = logging.getLogger(__name__)


def graphically_equivalent(ds1, ds2, prop_scores=None, **weight_dict):
    """This method verifies if two graphs are semantically equivalent.
    Each DataStore can contain a connected or disconnected graph and the
    final result is weighted over the amount of objects we managed to compare.
    This approach builds on top of the object-based semantic equivalence process
    and each comparison can return a value between 0 and 100.

    Args:
        ds1: A DataStore object instance representing your graph
        ds2: A DataStore object instance representing your graph
        prop_scores: An optional dictionary that is filled in by this call.
            On return it holds ``"matching_score"`` (sum of the best score
            found for every compared object), ``"sum_weights"`` (100 times
            the number of compared objects) and ``"summary"`` (per-object
            best match, its individual property scores and its value).
            When omitted, a fresh dictionary is used for each call.
        weight_dict: A dictionary that can be used to override settings
            in the semantic equivalence process

    Returns:
        float: A number between 0.0 and 100.0 as a measurement of equivalence.

    Warning:
        Object types need to have property weights defined for the equivalence process.
        Otherwise, those objects will not influence the final score. The GRAPH_WEIGHTS
        dictionary under `stix2.equivalence.graph` can give you an idea on how to add
        new entries and pass them via the `weight_dict` argument. Similarly, the values
        or methods can be fine tuned for a particular use case.

    Note:
        Default weights_dict:

        .. include:: ../default_sem_eq_weights.rst

    Note:
        This implementation follows the Semantic Equivalence Committee Note.
        see `the Committee Note <link here>`__.

    """
    # A `{}` default would be shared (and keep growing) across calls.
    if prop_scores is None:
        prop_scores = {}

    weights = GRAPH_WEIGHTS.copy()

    if weight_dict:
        weights.update(weight_dict)

    # `dict.copy()` is shallow: without a private copy of "_internal" the
    # assignments below would write ds1/ds2/max_depth into the module-level
    # GRAPH_WEIGHTS (or into the caller-supplied dictionary) and leak the
    # DataStore references into later calls.
    weights["_internal"] = dict(weights["_internal"])

    results = {}
    depth = weights["_internal"]["max_depth"]

    # sorted() leaves the lists handed back by the DataStores untouched.
    graph1 = sorted(ds1.query([]), key=lambda x: x["type"])
    graph2 = sorted(ds2.query([]), key=lambda x: x["type"])

    # Iterate over the smaller graph and look for the best match of each of
    # its objects in the larger one. ds1/ds2 stored in "_internal" follow
    # the same orientation.
    if len(graph1) < len(graph2):
        weights["_internal"]["ds1"] = ds1
        weights["_internal"]["ds2"] = ds2
        g1 = graph1
        g2 = graph2
    else:
        weights["_internal"]["ds1"] = ds2
        weights["_internal"]["ds2"] = ds1
        g1 = graph2
        g2 = graph1

    for object1 in g1:
        object1_type = object1["type"]
        if object1_type not in weights:
            # No weights defined for this type: it cannot contribute.
            continue

        object1_id = object1["id"]
        for object2 in g2:
            if object2["type"] != object1_type:
                continue

            iprop_score = {}
            result = semantically_equivalent(object1, object2, iprop_score, **weights)
            # Restore the depth budget so every comparison starts from the
            # configured max_depth.
            weights["_internal"]["max_depth"] = depth

            # Keep only the highest scoring candidate; on ties the first
            # candidate seen wins.
            best = results.get(object1_id)
            if best is None or result > best["value"]:
                results[object1_id] = {"matched": object2["id"], "prop_score": iprop_score, "value": result}

    equivalence_score = 0
    matching_score = sum(x["value"] for x in results.values())
    sum_weights = len(results) * 100.0
    if sum_weights > 0:
        equivalence_score = (matching_score / sum_weights) * 100
    prop_scores["matching_score"] = matching_score
    prop_scores["sum_weights"] = sum_weights
    prop_scores["summary"] = results

    logger.debug(
        "DONE\t\tSUM_WEIGHT: %.2f\tMATCHING_SCORE: %.2f\t SCORE: %.2f",
        sum_weights,
        matching_score,
        equivalence_score,
    )
    return equivalence_score


# Default weights for the graph semantic equivalence process: every entry of
# the object-level WEIGHTS is kept, and the keys below are added on top
# (replacing any entry of the same name).
GRAPH_WEIGHTS = dict(
    WEIGHTS,
    grouping={
        "name": (20, partial_string_based),
        "context": (20, partial_string_based),
        "object_refs": (60, list_reference_check),
    },
    relationship={
        "relationship_type": (20, exact_match),
        "source_ref": (40, reference_check),
        "target_ref": (40, reference_check),
    },
    report={
        "name": (30, partial_string_based),
        "published": (10, partial_timestamp_based),
        "object_refs": (60, list_reference_check),
        "tdelta": 1,  # One day interval
    },
    sighting={
        "first_seen": (5, partial_timestamp_based),
        "last_seen": (5, partial_timestamp_based),
        "sighting_of_ref": (40, reference_check),
        "observed_data_refs": (20, list_reference_check),
        "where_sighted_refs": (20, list_reference_check),
        "summary": (10, exact_match),
    },
    # Run-time settings read by graphically_equivalent(); ds1/ds2 are
    # placeholders here.
    _internal={
        "ignore_spec_version": False,
        "versioning_checks": False,
        "ds1": None,
        "ds2": None,
        "max_depth": 1,
    },
)  #: :autodoc-skip:
Graph Equivalence (#449) * new packages for graph and object-based semantic equivalence * new method graphically_equivalent for Environment, move equivalence methods out * object equivalence function, methods used for object-based moved here. * new graph_equivalence methods * add notes * add support for versioning checks (default disabled) * new tests to cover graph equivalence and new methods * added more imports to environment.py to prevent breaking changes * variable changes, new fields for checks, reset depth check per call * flexibility when object is not available on graph. * refactor debug logging message * new file stix2.equivalence.graph_equivalence.rst and stix2.equivalence.object_equivalence.rst for docs * API documentation for new modules * additional text required to build docs * add more test methods for list_semantic_check and graphically_equivalent/versioning * add logging debug messages, code clean-up * include individual scoring on results dict, fix issue on list_semantic_check not keeping highest score * include results as summary in prop_scores, minor tweaks * Update __init__.py docstrings update * apply feedback from pull request - rename semantic_check to reference_check - rename modules to graph and object respectively to eliminate redundancy - remove created_by_ref and object_marking_refs from graph WEIGHTS and rebalance * update docs/ entries * add more checks, make max score based on actual objects checked instead of the full list, only create entry when type is present in WEIGHTS dictionary update tests to reflect changes * rename package patterns -> pattern * documentation, moving weights around * more documentation moving * rename WEIGHTS variable for graph_equivalence 2020-10-16 17:35:26 +02:00			`import logging`

			`from ..object import (`
			`WEIGHTS, exact_match, list_reference_check, partial_string_based,`
			`partial_timestamp_based, reference_check, semantically_equivalent,`
			`)`

			`logger = logging.getLogger(__name__)`


			`def graphically_equivalent(ds1, ds2, prop_scores={}, **weight_dict):`
			`"""This method verifies if two graphs are semantically equivalent.`
			`Each DataStore can contain a connected or disconnected graph and the`
			`final result is weighted over the amount of objects we managed to compare.`
			`This approach builds on top of the object-based semantic equivalence process`
			`and each comparison can return a value between 0 and 100.`

			`Args:`
			`ds1: A DataStore object instance representing your graph`
			`ds2: A DataStore object instance representing your graph`
			`prop_scores: A dictionary that can hold individual property scores,`
			`weights, contributing score, matching score and sum of weights.`
			`weight_dict: A dictionary that can be used to override settings`
			`in the semantic equivalence process`

			`Returns:`
			`float: A number between 0.0 and 100.0 as a measurement of equivalence.`

			`Warning:`
			`Object types need to have property weights defined for the equivalence process.`
			`Otherwise, those objects will not influence the final score. The WEIGHTS`
			dictionary under `stix2.equivalence.graph` can give you an idea on how to add
			new entries and pass them via the `weight_dict` argument. Similarly, the values
			`or methods can be fine tuned for a particular use case.`

			`Note:`
			`Default weights_dict:`

			`.. include:: ../default_sem_eq_weights.rst`

			`Note:`
			`This implementation follows the Semantic Equivalence Committee Note.`
			see `the Committee Note <link here>`__.

			`"""`
			`weights = GRAPH_WEIGHTS.copy()`

			`if weight_dict:`
			`weights.update(weight_dict)`

			`results = {}`
			`depth = weights["_internal"]["max_depth"]`

			`graph1 = ds1.query([])`
			`graph2 = ds2.query([])`

			`graph1.sort(key=lambda x: x["type"])`
			`graph2.sort(key=lambda x: x["type"])`

			`if len(graph1) < len(graph2):`
			`weights["_internal"]["ds1"] = ds1`
			`weights["_internal"]["ds2"] = ds2`
			`g1 = graph1`
			`g2 = graph2`
			`else:`
			`weights["_internal"]["ds1"] = ds2`
			`weights["_internal"]["ds2"] = ds1`
			`g1 = graph2`
			`g2 = graph1`

			`for object1 in g1:`
			`for object2 in g2:`
			`if object1["type"] == object2["type"] and object1["type"] in weights:`
			`iprop_score = {}`
			`result = semantically_equivalent(object1, object2, iprop_score, **weights)`
			`objects1_id = object1["id"]`
			`weights["_internal"]["max_depth"] = depth`

			`if objects1_id not in results:`
			`results[objects1_id] = {"matched": object2["id"], "prop_score": iprop_score, "value": result}`
			`elif result > results[objects1_id]["value"]:`
			`results[objects1_id] = {"matched": object2["id"], "prop_score": iprop_score, "value": result}`

			`equivalence_score = 0`
			`matching_score = sum(x["value"] for x in results.values())`
			`sum_weights = len(results) * 100.0`
			`if sum_weights > 0:`
			`equivalence_score = (matching_score / sum_weights) * 100`
			`prop_scores["matching_score"] = matching_score`
			`prop_scores["sum_weights"] = sum_weights`
			`prop_scores["summary"] = results`

			`logger.debug(`
			`"DONE\t\tSUM_WEIGHT: %.2f\tMATCHING_SCORE: %.2f\t SCORE: %.2f",`
			`sum_weights,`
			`matching_score,`
			`equivalence_score,`
			`)`
			`return equivalence_score`


			`# default weights used for the graph semantic equivalence process`
			`GRAPH_WEIGHTS = WEIGHTS.copy()`
			`GRAPH_WEIGHTS.update({`
			`"grouping": {`
			`"name": (20, partial_string_based),`
			`"context": (20, partial_string_based),`
			`"object_refs": (60, list_reference_check),`
			`},`
			`"relationship": {`
			`"relationship_type": (20, exact_match),`
			`"source_ref": (40, reference_check),`
			`"target_ref": (40, reference_check),`
			`},`
			`"report": {`
			`"name": (30, partial_string_based),`
			`"published": (10, partial_timestamp_based),`
			`"object_refs": (60, list_reference_check),`
			`"tdelta": 1, # One day interval`
			`},`
			`"sighting": {`
			`"first_seen": (5, partial_timestamp_based),`
			`"last_seen": (5, partial_timestamp_based),`
			`"sighting_of_ref": (40, reference_check),`
			`"observed_data_refs": (20, list_reference_check),`
			`"where_sighted_refs": (20, list_reference_check),`
			`"summary": (10, exact_match),`
			`},`
			`"_internal": {`
			`"ignore_spec_version": False,`
			`"versioning_checks": False,`
			`"ds1": None,`
			`"ds2": None,`
			`"max_depth": 1,`
			`},`
			`}) #: :autodoc-skip:`