Change string semantic comparison algorithm

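Use `fuzzywuzzy`'s Token Sort Ratio instead of Jaro-Winkler. Token Sort Ratio
is reported on a 0-100 scale, so the result is divided by 100 to keep
`partial_string_based` returning a value between 0.0 and 1.0.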
2019-12-23 17:00:52 -05:00 · 2019-12-23 17:00:52 -05:00 · 62cd4fd33c
parent 457564f2f9
commit 62cd4fd33c
4 changed files with 8 additions and 8 deletions
--- a/setup.py
+++ b/setup.py
@@ -64,6 +64,6 @@ setup(
    },
    extras_require={
        'taxii': ['taxii2-client'],
-        'semantic': ['haversine', 'pyjarowinkler'],
+        'semantic': ['haversine', 'fuzzywuzzy'],
    },
 )
--- a/stix2/environment.py
+++ b/stix2/environment.py
@@ -363,10 +363,10 @@ def partial_string_based(str1, str2):
        float: Number between 0.0 and 1.0 depending on match criteria.

    """
-    from pyjarowinkler import distance
-    result = distance.get_jaro_distance(str1, str2)
+    from fuzzywuzzy import fuzz
+    result = fuzz.token_sort_ratio(str1, str2, force_ascii=False)
    logger.debug("--\t\tpartial_string_based '%s' '%s'\tresult: '%s'", str1, str2, result)
-    return result
+    return result / 100.0


 def custom_pattern_based(pattern1, pattern2):
--- a/stix2/test/v21/test_environment.py
+++ b/stix2/test/v21/test_environment.py
@@ -521,7 +521,7 @@ def test_semantic_equivalence_on_same_vulnerability2():
        ],
    )
    VULN_KWARGS2 = dict(
-        name="Zot",
+        name="Foo",
        external_references=[
            {
                "url": "https://example2",
@@ -550,7 +550,7 @@ def test_semantic_equivalence_on_unknown_object():
    CUSTOM_KWARGS2 = dict(
        type="x-foobar",
        id="x-foobar--0c7b5b88-8ff7-4a4d-aa9d-feb398cd0061",
-        name="Zot",
+        name="Foo",
        external_references=[
            {
                "url": "https://example2",
@@ -787,7 +787,7 @@ def test_semantic_equivalence_prop_scores():
    tool2 = stix2.v21.Tool(id=TOOL_ID, **TOOL2_KWARGS)
    stix2.Environment().semantically_equivalent(tool1, tool2, prop_scores)
    assert len(prop_scores) == 4
-    assert round(prop_scores["matching_score"], 1) == 37.6
+    assert round(prop_scores["matching_score"], 1) == 8.8
    assert round(prop_scores["sum_weights"], 1) == 100.0


--- a/tox.ini
+++ b/tox.ini
@@ -9,7 +9,7 @@ deps =
  pytest-cov
  coverage
  taxii2-client
-  pyjarowinkler
+  fuzzywuzzy
  haversine
  medallion
 commands =