Change string semantic comparison algorithm

Use `fuzzywuzzy`'s Token Sort Ratio instead of Jaro-Winkler.
master
Chris Lenk 2019-12-23 17:00:52 -05:00
parent 457564f2f9
commit 62cd4fd33c
4 changed files with 8 additions and 8 deletions

View File

@@ -64,6 +64,6 @@ setup(
}, },
extras_require={ extras_require={
'taxii': ['taxii2-client'], 'taxii': ['taxii2-client'],
'semantic': ['haversine', 'pyjarowinkler'], 'semantic': ['haversine', 'fuzzywuzzy'],
}, },
) )

View File

@@ -363,10 +363,10 @@ def partial_string_based(str1, str2):
float: Number between 0.0 and 1.0 depending on match criteria. float: Number between 0.0 and 1.0 depending on match criteria.
""" """
from pyjarowinkler import distance from fuzzywuzzy import fuzz
result = distance.get_jaro_distance(str1, str2) result = fuzz.token_sort_ratio(str1, str2, force_ascii=False)
logger.debug("--\t\tpartial_string_based '%s' '%s'\tresult: '%s'", str1, str2, result) logger.debug("--\t\tpartial_string_based '%s' '%s'\tresult: '%s'", str1, str2, result)
return result return result / 100.0
def custom_pattern_based(pattern1, pattern2): def custom_pattern_based(pattern1, pattern2):

View File

@@ -521,7 +521,7 @@ def test_semantic_equivalence_on_same_vulnerability2():
], ],
) )
VULN_KWARGS2 = dict( VULN_KWARGS2 = dict(
name="Zot", name="Foo",
external_references=[ external_references=[
{ {
"url": "https://example2", "url": "https://example2",
@@ -550,7 +550,7 @@ def test_semantic_equivalence_on_unknown_object():
CUSTOM_KWARGS2 = dict( CUSTOM_KWARGS2 = dict(
type="x-foobar", type="x-foobar",
id="x-foobar--0c7b5b88-8ff7-4a4d-aa9d-feb398cd0061", id="x-foobar--0c7b5b88-8ff7-4a4d-aa9d-feb398cd0061",
name="Zot", name="Foo",
external_references=[ external_references=[
{ {
"url": "https://example2", "url": "https://example2",
@@ -787,7 +787,7 @@ def test_semantic_equivalence_prop_scores():
tool2 = stix2.v21.Tool(id=TOOL_ID, **TOOL2_KWARGS) tool2 = stix2.v21.Tool(id=TOOL_ID, **TOOL2_KWARGS)
stix2.Environment().semantically_equivalent(tool1, tool2, prop_scores) stix2.Environment().semantically_equivalent(tool1, tool2, prop_scores)
assert len(prop_scores) == 4 assert len(prop_scores) == 4
assert round(prop_scores["matching_score"], 1) == 37.6 assert round(prop_scores["matching_score"], 1) == 8.8
assert round(prop_scores["sum_weights"], 1) == 100.0 assert round(prop_scores["sum_weights"], 1) == 100.0

View File

@@ -9,7 +9,7 @@ deps =
pytest-cov pytest-cov
coverage coverage
taxii2-client taxii2-client
pyjarowinkler fuzzywuzzy
haversine haversine
medallion medallion
commands = commands =