Change string semantic comparison algorithm
Use `fuzzywuzzy`'s Token Sort Ratio instead of Jaro-Winkler.
master
parent
457564f2f9
commit
62cd4fd33c
2
setup.py
2
setup.py
|
@ -64,6 +64,6 @@ setup(
|
|||
},
|
||||
extras_require={
|
||||
'taxii': ['taxii2-client'],
|
||||
'semantic': ['haversine', 'pyjarowinkler'],
|
||||
'semantic': ['haversine', 'fuzzywuzzy'],
|
||||
},
|
||||
)
|
||||
|
|
|
@ -363,10 +363,10 @@ def partial_string_based(str1, str2):
|
|||
float: Number between 0.0 and 1.0 depending on match criteria.
|
||||
|
||||
"""
|
||||
from pyjarowinkler import distance
|
||||
result = distance.get_jaro_distance(str1, str2)
|
||||
from fuzzywuzzy import fuzz
|
||||
result = fuzz.token_sort_ratio(str1, str2, force_ascii=False)
|
||||
logger.debug("--\t\tpartial_string_based '%s' '%s'\tresult: '%s'", str1, str2, result)
|
||||
return result
|
||||
return result / 100.0
|
||||
|
||||
|
||||
def custom_pattern_based(pattern1, pattern2):
|
||||
|
|
|
@ -521,7 +521,7 @@ def test_semantic_equivalence_on_same_vulnerability2():
|
|||
],
|
||||
)
|
||||
VULN_KWARGS2 = dict(
|
||||
name="Zot",
|
||||
name="Foo",
|
||||
external_references=[
|
||||
{
|
||||
"url": "https://example2",
|
||||
|
@ -550,7 +550,7 @@ def test_semantic_equivalence_on_unknown_object():
|
|||
CUSTOM_KWARGS2 = dict(
|
||||
type="x-foobar",
|
||||
id="x-foobar--0c7b5b88-8ff7-4a4d-aa9d-feb398cd0061",
|
||||
name="Zot",
|
||||
name="Foo",
|
||||
external_references=[
|
||||
{
|
||||
"url": "https://example2",
|
||||
|
@ -787,7 +787,7 @@ def test_semantic_equivalence_prop_scores():
|
|||
tool2 = stix2.v21.Tool(id=TOOL_ID, **TOOL2_KWARGS)
|
||||
stix2.Environment().semantically_equivalent(tool1, tool2, prop_scores)
|
||||
assert len(prop_scores) == 4
|
||||
assert round(prop_scores["matching_score"], 1) == 37.6
|
||||
assert round(prop_scores["matching_score"], 1) == 8.8
|
||||
assert round(prop_scores["sum_weights"], 1) == 100.0
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue