Change string semantic comparison algorithm

Use `fuzzywuzzy`'s Token Sort Ratio instead of Jaro-Winkler.
master
Chris Lenk 2019-12-23 17:00:52 -05:00
parent 457564f2f9
commit 62cd4fd33c
4 changed files with 8 additions and 8 deletions

View File

@@ -64,6 +64,6 @@ setup(
},
extras_require={
'taxii': ['taxii2-client'],
-'semantic': ['haversine', 'pyjarowinkler'],
+'semantic': ['haversine', 'fuzzywuzzy'],
},
)

View File

@@ -363,10 +363,10 @@ def partial_string_based(str1, str2):
float: Number between 0.0 and 1.0 depending on match criteria.
"""
-from pyjarowinkler import distance
-result = distance.get_jaro_distance(str1, str2)
+from fuzzywuzzy import fuzz
+result = fuzz.token_sort_ratio(str1, str2, force_ascii=False)
logger.debug("--\t\tpartial_string_based '%s' '%s'\tresult: '%s'", str1, str2, result)
-return result
+return result / 100.0
def custom_pattern_based(pattern1, pattern2):

View File

@@ -521,7 +521,7 @@ def test_semantic_equivalence_on_same_vulnerability2():
],
)
VULN_KWARGS2 = dict(
-name="Zot",
+name="Foo",
external_references=[
{
"url": "https://example2",
@@ -550,7 +550,7 @@ def test_semantic_equivalence_on_unknown_object():
CUSTOM_KWARGS2 = dict(
type="x-foobar",
id="x-foobar--0c7b5b88-8ff7-4a4d-aa9d-feb398cd0061",
-name="Zot",
+name="Foo",
external_references=[
{
"url": "https://example2",
@@ -787,7 +787,7 @@ def test_semantic_equivalence_prop_scores():
tool2 = stix2.v21.Tool(id=TOOL_ID, **TOOL2_KWARGS)
stix2.Environment().semantically_equivalent(tool1, tool2, prop_scores)
assert len(prop_scores) == 4
-assert round(prop_scores["matching_score"], 1) == 37.6
+assert round(prop_scores["matching_score"], 1) == 8.8
assert round(prop_scores["sum_weights"], 1) == 100.0

View File

@@ -9,7 +9,7 @@ deps =
pytest-cov
coverage
taxii2-client
-pyjarowinkler
+fuzzywuzzy
haversine
medallion
commands =