Revamp deterministic ID generation code to fix bugs.

2020-06-01 20:24:22 -04:00 · 2020-06-01 20:24:22 -04:00 · 6c2c4781e7
parent 41525f9be0
commit 6c2c4781e7
1 changed files with 130 additions and 70 deletions
--- a/stix2/base.py
+++ b/stix2/base.py
@ -334,24 +334,21 @@ class _Observable(_STIXBase):
    def __init__(self, **kwargs):
        # the constructor might be called independently of an observed data object
        self._STIXBase__valid_refs = kwargs.pop('_valid_refs', [])
-
-        self._allow_custom = kwargs.get('allow_custom', False)
        self._properties['extensions'].allow_custom = kwargs.get('allow_custom', False)
-
-        try:
-            # Since `spec_version` is optional, this is how we check for a 2.1 SCO
-            self._id_contributing_properties
-
-            if 'id' not in kwargs:
-                possible_id = self._generate_id(kwargs)
-                if possible_id is not None:
-                    kwargs['id'] = possible_id
-        except AttributeError:
-            # End up here if handling a 2.0 SCO, and don't need to do anything further
-            pass
-
        super(_Observable, self).__init__(**kwargs)

+        if 'id' not in kwargs and not isinstance(self, stix2.v20._Observable):
+            # Specific to 2.1+ observables: generate a deterministic ID
+            id_ = self._generate_id()
+
+            # Spec says fall back to UUIDv4 if no contributing properties were
+            # given.  That's what already happened (the following is actually
+            # overwriting the default uuidv4), so nothing to do here.
+            if id_ is not None:
+                # Can't assign to self (we're immutable), so slip the ID in
+                # more sneakily.
+                self._inner["id"] = id_
+
    def _check_ref(self, ref, prop, prop_name):
        """
        Only for checking `*_ref` or `*_refs` properties in spec_version 2.0
@ -396,42 +393,50 @@ class _Observable(_STIXBase):
                for ref in kwargs[prop_name]:
                    self._check_ref(ref, prop, prop_name)

-    def _generate_id(self, kwargs):
-        required_prefix = self._type + "--"
+    def _generate_id(self):
+        """
+        Generate a UUIDv5 for this observable, using its "ID contributing
+        properties".

-        properties_to_use = self._id_contributing_properties
-        if properties_to_use:
-            streamlined_object = {}
-            if "hashes" in kwargs and "hashes" in properties_to_use:
-                possible_hash = _choose_one_hash(kwargs["hashes"])
-                if possible_hash:
-                    streamlined_object["hashes"] = possible_hash
-            for key in properties_to_use:
-                if key != "hashes" and key in kwargs:
-                    if isinstance(kwargs[key], dict) or isinstance(kwargs[key], _STIXBase):
-                        temp_deep_copy = copy.deepcopy(dict(kwargs[key]))
-                        _recursive_stix_to_dict(temp_deep_copy)
-                        streamlined_object[key] = temp_deep_copy
-                    elif isinstance(kwargs[key], list):
-                        temp_deep_copy = copy.deepcopy(kwargs[key])
-                        _recursive_stix_list_to_dict(temp_deep_copy)
-                        streamlined_object[key] = temp_deep_copy
-                    else:
-                        streamlined_object[key] = kwargs[key]
-            if streamlined_object:
-                data = canonicalize(streamlined_object, utf8=False)
+        :return: The ID, or None if no ID contributing properties are set
+        """
+
+        id_ = None
+        json_serializable_object = {}
+
+        for key in self._id_contributing_properties:
+
+            if key in self:
+                obj_value = self[key]
+
+                if key == "hashes":
+                    possible_hash = _choose_one_hash(obj_value)
+                    if possible_hash:
+                        serializable_value = possible_hash

-                # The situation is complicated w.r.t. python 2/3 behavior, so
-                # I'd rather not rely on particular exceptions being raised to
-                # determine what to do.  Better to just check the python version
-                # directly.
-                if six.PY3:
-                    return required_prefix + six.text_type(uuid.uuid5(SCO_DET_ID_NAMESPACE, data))
                else:
-                    return required_prefix + six.text_type(uuid.uuid5(SCO_DET_ID_NAMESPACE, data.encode("utf-8")))
+                    serializable_value = _make_json_serializable(obj_value)

-        # We return None if there are no values specified for any of the id-contributing-properties
-        return None
+                json_serializable_object[key] = serializable_value
+
+        if json_serializable_object:
+
+            data = canonicalize(json_serializable_object, utf8=False)
+
+            # The situation is complicated w.r.t. python 2/3 behavior, so
+            # I'd rather not rely on particular exceptions being raised to
+            # determine what to do.  Better to just check the python version
+            # directly.
+            if six.PY3:
+                uuid_ = uuid.uuid5(SCO_DET_ID_NAMESPACE, data)
+            else:
+                uuid_ = uuid.uuid5(
+                    SCO_DET_ID_NAMESPACE, data.encode("utf-8")
+                )
+
+            id_ = "{}--{}".format(self._type, six.text_type(uuid_))
+
+        return id_


 class _Extension(_STIXBase):
@ -455,35 +460,90 @@ def _choose_one_hash(hash_dict):
        if k is not None:
            return {k: hash_dict[k]}

+    return None
+

 def _cls_init(cls, obj, kwargs):
    if getattr(cls, '__init__', object.__init__) is not object.__init__:
        cls.__init__(obj, **kwargs)


-def _recursive_stix_to_dict(input_dict):
-    for key in input_dict:
-        if isinstance(input_dict[key], dict):
-            _recursive_stix_to_dict(input_dict[key])
-        elif isinstance(input_dict[key], _STIXBase):
-            input_dict[key] = dict(input_dict[key])
+def _make_json_serializable(value):
+    """
+    Make the given value JSON-serializable; required for the JSON canonicalizer
+    to work.  This recurses into lists/dicts, converts stix objects to dicts,
+    etc.  "Convenience" types this library uses as property values are
+    JSON-serialized to produce a JSON-serializable value.  (So you will always
+    get strings for those.)

-            # There may stil be nested _STIXBase objects
-            _recursive_stix_to_dict(input_dict[key])
-        elif isinstance(input_dict[key], list):
-            _recursive_stix_list_to_dict(input_dict[key])
-        else:
-            pass
+    The conversion will not affect the passed in value.
+
+    :param value: The value to make JSON-serializable.
+    :return: The JSON-serializable value.
+    :raises ValueError: If value is None (since nulls are not allowed in STIX
+        objects).
+    """
+    if value is None:
+        raise ValueError("Illegal null value found in a STIX object")
+
+    json_value = value  # default assumption
+
+    if isinstance(value, Mapping):
+        json_value = {
+            k: _make_json_serializable(v)
+            for k, v in value.items()
+        }
+
+    elif isinstance(value, list):
+        json_value = [
+            _make_json_serializable(v)
+            for v in value
+        ]
+
+    elif not isinstance(value, (int, float, six.string_types, bool)):
+        # If a "simple" value which is not already JSON-serializable,
+        # JSON-serialize to a string and use that as our JSON-serializable
+        # value.  This applies to our datetime objects currently (timestamp
+        # properties), and could apply to any other "convenience" types this
+        # library uses for property values in the future.
+        json_value = json.dumps(value, ensure_ascii=False, cls=STIXJSONEncoder)
+
+        # If it looks like a string literal was output, strip off the quotes.
+        # Otherwise, a second pair will be added when it's canonicalized.  Also
+        # to be extra safe, we need to unescape.
+        if len(json_value) >= 2 and \
+                json_value[0] == '"' and json_value[-1] == '"':
+            json_value = _un_json_escape(json_value[1:-1])
+
+    return json_value


-def _recursive_stix_list_to_dict(input_list):
-    for i in range(len(input_list)):
-        if isinstance(input_list[i], _STIXBase):
-            input_list[i] = dict(input_list[i])
-        elif isinstance(input_list[i], dict):
-            pass
-        elif isinstance(input_list[i], list):
-            _recursive_stix_list_to_dict(input_list[i])
-        else:
-            continue
-        _recursive_stix_to_dict(input_list[i])
+def _un_json_escape(json_string):
+    """
+    Removes JSON string literal escapes.  We should undo these things Python's
+    serializer does, so we can ensure they're done canonically.  The
+    canonicalizer should be in charge of everything, as much as is feasible.
+
+    :param json_string: String literal output of Python's JSON serializer,
+        minus the surrounding quotes.
+    :return: The unescaped string
+    """
+
+    # I don't think I should need to worry about the unicode escapes (\uXXXX)
+    # since I use ensure_ascii=False when generating it.  I will just fix all
+    # the other escapes, e.g. \n, \r, etc.
+    #
+    # This list is taken from RFC7159 section 7:
+    # https://tools.ietf.org/html/rfc7159.html#section-7
+
+    result = json_string\
+        .replace(r"\"", "\"")\
+        .replace(r"\/", "/")\
+        .replace(r"\b", "\b")\
+        .replace(r"\f", "\f")\
+        .replace(r"\n", "\n")\
+        .replace(r"\r", "\r")\
+        .replace(r"\t", "\t")\
+        .replace(r"\\", "\\")  # Must do this one last!
+
+    return result