Change string semantic comparison algorithm
Use `fuzzywuzzy`'s Token Sort Ratio instead of Jaro-Winkler.
master
parent
457564f2f9
commit
62cd4fd33c
2
setup.py
2
setup.py
|
@@ -64,6 +64,6 @@ setup(
|
||||||
},
|
},
|
||||||
extras_require={
|
extras_require={
|
||||||
'taxii': ['taxii2-client'],
|
'taxii': ['taxii2-client'],
|
||||||
'semantic': ['haversine', 'pyjarowinkler'],
|
'semantic': ['haversine', 'fuzzywuzzy'],
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
|
@@ -363,10 +363,10 @@ def partial_string_based(str1, str2):
|
||||||
float: Number between 0.0 and 1.0 depending on match criteria.
|
float: Number between 0.0 and 1.0 depending on match criteria.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
from pyjarowinkler import distance
|
from fuzzywuzzy import fuzz
|
||||||
result = distance.get_jaro_distance(str1, str2)
|
result = fuzz.token_sort_ratio(str1, str2, force_ascii=False)
|
||||||
logger.debug("--\t\tpartial_string_based '%s' '%s'\tresult: '%s'", str1, str2, result)
|
logger.debug("--\t\tpartial_string_based '%s' '%s'\tresult: '%s'", str1, str2, result)
|
||||||
return result
|
return result / 100.0
|
||||||
|
|
||||||
|
|
||||||
def custom_pattern_based(pattern1, pattern2):
|
def custom_pattern_based(pattern1, pattern2):
|
||||||
|
|
|
@@ -521,7 +521,7 @@ def test_semantic_equivalence_on_same_vulnerability2():
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
VULN_KWARGS2 = dict(
|
VULN_KWARGS2 = dict(
|
||||||
name="Zot",
|
name="Foo",
|
||||||
external_references=[
|
external_references=[
|
||||||
{
|
{
|
||||||
"url": "https://example2",
|
"url": "https://example2",
|
||||||
|
@@ -550,7 +550,7 @@ def test_semantic_equivalence_on_unknown_object():
|
||||||
CUSTOM_KWARGS2 = dict(
|
CUSTOM_KWARGS2 = dict(
|
||||||
type="x-foobar",
|
type="x-foobar",
|
||||||
id="x-foobar--0c7b5b88-8ff7-4a4d-aa9d-feb398cd0061",
|
id="x-foobar--0c7b5b88-8ff7-4a4d-aa9d-feb398cd0061",
|
||||||
name="Zot",
|
name="Foo",
|
||||||
external_references=[
|
external_references=[
|
||||||
{
|
{
|
||||||
"url": "https://example2",
|
"url": "https://example2",
|
||||||
|
@@ -787,7 +787,7 @@ def test_semantic_equivalence_prop_scores():
|
||||||
tool2 = stix2.v21.Tool(id=TOOL_ID, **TOOL2_KWARGS)
|
tool2 = stix2.v21.Tool(id=TOOL_ID, **TOOL2_KWARGS)
|
||||||
stix2.Environment().semantically_equivalent(tool1, tool2, prop_scores)
|
stix2.Environment().semantically_equivalent(tool1, tool2, prop_scores)
|
||||||
assert len(prop_scores) == 4
|
assert len(prop_scores) == 4
|
||||||
assert round(prop_scores["matching_score"], 1) == 37.6
|
assert round(prop_scores["matching_score"], 1) == 8.8
|
||||||
assert round(prop_scores["sum_weights"], 1) == 100.0
|
assert round(prop_scores["sum_weights"], 1) == 100.0
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue