chg: [command] Improved FieldSquasher closest matcher

pull/40/head
mokaddem 2021-03-01 11:17:13 +01:00
parent f8e9632f78
commit 407c63edff
1 changed files with 170 additions and 24 deletions

View File

@ -41,8 +41,13 @@ class FieldSquasherCommand extends Command
$sourceData = $this->getDataFromSource($source);
$candidateResult = $this->findCanditates($this->{$table}, $config, $sourceData);
$entitiesSample = array_slice($candidateResult['candidates'], 0, min(10, count($candidateResult['candidates'])));
$entitiesSample = array_slice($candidateResult['candidates'], 0, min(100, count($candidateResult['candidates'])));
$noCandidatesSample = array_slice($candidateResult['noCandidatesFound'], 0, min(10, count($candidateResult['noCandidatesFound'])));
$notExactCandidates = array_slice($candidateResult['notExactCandidates'], 0, min(10, count($candidateResult['notExactCandidates'])));
$totalNotFound = count($candidateResult['noCandidatesFound']);
$totalClosestFound = count($candidateResult['notExactCandidates']);
$totalFound = count($candidateResult['candidates']);
$this->io->out("Sample of no candidates found (total: {$totalNotFound}):");
$ioTable = $this->transformEntitiesIntoTable($noCandidatesSample);
$io->helper('Table')->output($ioTable);
@ -51,13 +56,30 @@ class FieldSquasherCommand extends Command
if ($selection == 'Y') {
$this->saveDataOnDisk($filename, $candidateResult['noCandidatesFound']);
}
$this->io->out('');
if (!empty($notExactCandidates)) {
$this->io->out("Sample of closest candidates found (total not strictly matching: {$totalClosestFound}):");
$ioTable = $this->transformEntitiesIntoTable($notExactCandidates);
$io->helper('Table')->output($ioTable);
$filename = 'closest_candidates_found_' . time() . '.json';
$selection = $io->askChoice("Would you like to save these entries on the disk as `{$filename}`", ['Y', 'N'], 'Y');
if ($selection == 'Y') {
$this->saveDataOnDisk($filename, $candidateResult['notExactCandidates']);
}
$this->io->out('');
}
$this->io->out("Sample of exact candidates found (total striclty matching: {$totalFound}):");
$ioTable = $this->transformEntitiesIntoTable($entitiesSample, [
'id',
$config['finder']['joinFields']['squashed'],
$config['target']['squashedField'],
"{$config['target']['squashedField']}_original_value",
'based_on_best_match',
'best_candidates_found',
'match_score'
]);
$io->helper('Table')->output($ioTable);
$filename = 'replacement_done_' . time() . '.json';
@ -102,6 +124,8 @@ class FieldSquasherCommand extends Command
$this->io->verbose('Finding candidates');
if ($config['finder']['type'] == 'exact') {
$candidateResult = $this->findCanditatesByStrictMatching($table, $config, $source);
} else if ($config['finder']['type'] == 'closest') {
$candidateResult = $this->findCanditatesByClosestMatch($table, $config, $source);
} else {
$this->io->error('Unsupported search type');
die(1);
@ -129,15 +153,15 @@ class FieldSquasherCommand extends Command
$candidates = [];
$noCandidatesFound = [];
foreach ($squashingObjects as $i => $squashingObject) {
$squashingData = Hash::get($squashingObject, $config['squashingData']['squashingField']);
if (isset($this->{$config['squashingData']['massage']})) {
$squashingData = $this->{$config['squashingData']['massage']}($squashingData);
}
foreach ($squashingObjects as $squashingObject) {
$squashingJoinField = Hash::get($squashingObject, $config['finder']['joinFields']['squashing']);
if (empty($potentialCanditates[$squashingJoinField])) {
$noCandidatesFound[] = $squashingObject;
} else {
$squashingData = Hash::get($squashingObject, $config['squashingData']['squashingField']);
if (isset($this->{$config['squashingData']['massage']})) {
$squashingData = $this->{$config['squashingData']['massage']}($squashingData);
}
$squashedTarget = $potentialCanditates[$squashingJoinField];
$squashedTarget->{"{$config['target']['squashedField']}_original_value"} = $squashedTarget->{$config['target']['squashedField']};
$squashedTarget->{$config['target']['squashedField']} = $squashingData;
@ -146,31 +170,153 @@ class FieldSquasherCommand extends Command
}
return [
'candidates' => $candidates,
'notExactCandidates' => [],
'noCandidatesFound' => $noCandidatesFound,
];
}
private function findCanditatesByClosestMatch($table, $config, $source)
{
$squashingObjects = Hash::extract($source, $config['finder']['path']);
if (empty($squashingObjects)) {
$this->io->error('finder.path returned nothing');
return [];
}
$query = $table->find();
$allCanditates = $query->toArray();
$squashingJoinField = $config['finder']['joinFields']['squashing'];
$squashedJoinField = $config['finder']['joinFields']['squashed'];
$closestMatchResults = [];
// Compute proximity score
foreach ($squashingObjects as $i => $squashingObject) {
$squashingJoinValue = Hash::get($squashingObject, $squashingJoinField);
foreach ($allCanditates as $candidate) {
$squashedJoinValue = Hash::get($candidate, $squashedJoinField);
$proximityScore = $this->getProximityScore($squashingJoinValue, $squashedJoinValue);
$closestMatchResults[$candidate['id']][$proximityScore][] = $squashingObject;
$squashingObjects[$i]['__scores'][$proximityScore][] = $candidate;
}
}
// sort by score
foreach ($squashingObjects as $i => $squashingObject) {
ksort($squashingObjects[$i]['__scores'], SORT_NUMERIC);
}
foreach ($closestMatchResults as $i => $proximityScore) {
ksort($closestMatchResults[$i], SORT_NUMERIC);
}
// remove best occurence in other matching sets
foreach ($allCanditates as $candidate) {
$bestScore = array_key_first($closestMatchResults[$candidate['id']]);
$bestMatch = $closestMatchResults[$candidate['id']][$bestScore][0];
$squashingObjects = $this->removeCandidatesFromSquashingSet($squashingObjects, $bestMatch, $candidate['id']);
}
// pick the best match
foreach ($squashingObjects as $i => $squashingObject) {
if (empty($squashingObjects[$i]['__scores'])) {
continue;
}
ksort($squashingObjects[$i]['__scores'], SORT_NUMERIC);
$squashingObjects[$i]['__scores'] = array_slice($squashingObjects[$i]['__scores'], 0, 1, true);
$bestScore = array_key_first($squashingObjects[$i]['__scores']);
$squashingObjects[$i]['__scores'][$bestScore] = array_values($squashingObjects[$i]['__scores'][$bestScore])[0];
}
$candidates = [];
$noCandidatesFound = [];
$notExactCandidates = [];
$scoreThreshold = !empty($config['finder']['levenshteinScore']) ? $config['finder']['levenshteinScore'] : 10;
foreach ($squashingObjects as $i => $squashingObject) {
if (empty($squashingObjects[$i]['__scores'])) {
$noCandidatesFound[] = $squashingObject;
continue;
}
$bestScore = array_key_first($squashingObject['__scores']);
$bestMatch = $squashingObject['__scores'][$bestScore];
$squashingData = Hash::get($squashingObject, $config['squashingData']['squashingField']);
if (isset($this->{$config['squashingData']['massage']})) {
$squashingData = $this->{$config['squashingData']['massage']}($squashingData);
}
$squashedTarget = $bestMatch;
if ($bestScore <= $scoreThreshold) {
$squashedTarget["{$config['target']['squashedField']}_original_value"] = $squashedTarget[$config['target']['squashedField']];
$squashedTarget['match_score'] = $bestScore;
$squashedTarget['based_on_best_match_joinFields'] = Hash::get($squashingObject, $squashingJoinField);
$squashedTarget['based_on_best_match'] = json_encode($squashingObject);
$squashedTarget[$config['target']['squashedField']] = $squashingData;
if ($bestScore > 0) {
$notExactCandidates[] = $squashedTarget;
} else {
$candidates[] = $squashedTarget;
}
} else {
$squashingObjectBestMatchInfo = "[{$bestMatch[$squashingJoinField]}, {$bestScore}]";
$squashingObject['__scores'] = $squashingObjectBestMatchInfo;
$noCandidatesFound[] = $squashingObject;
}
}
return [
'candidates' => $candidates,
'notExactCandidates' => $notExactCandidates,
'noCandidatesFound' => $noCandidatesFound
];
}
private function extractDataFromJSON($defaultFields, $config, $source)
private function removeCandidatesFromSquashingSet($squashingObjects, $bestMatch, $candidateID)
{
$data = [];
foreach ($config['mapping'] as $key => $fieldConfig) {
$values = null;
if (!is_array($fieldConfig)) {
$fieldConfig = ['path' => $fieldConfig];
}
if (!empty($fieldConfig['path'])) {
$values = Hash::extract($source, $fieldConfig['path']);
}
if (!empty($fieldConfig['massage'])) {
$values = array_map("self::{$fieldConfig['massage']}", $values);
}
if (isset($defaultFields[$key])) {
$data[$key] = $values;
foreach ($squashingObjects as $i => $squashingObject) {
if (Hash::remove($squashingObject, '__scores') == $bestMatch) {
continue;
} else {
$data['metaFields'][$key] = $values;
foreach ($squashingObject['__scores'] as $score => $candidates) {
foreach ($candidates as $j => $candidate) {
if ($candidate['id'] == $candidateID) {
unset($squashingObjects[$i]['__scores'][$score][$j]);
}
}
if (empty($squashingObjects[$i]['__scores'][$score])) {
unset($squashingObjects[$i]['__scores'][$score]);
}
}
}
}
return $this->invertArray($data);
return $squashingObjects;
}
private function removeMatchFromOtherMatches(&$closestMatchResults, $match, $currentIndex)
{
// remove squashingObject from all other matches
foreach ($closestMatchResults as $i => $proximityScore) {
if ($i == $currentIndex) {
continue;
}
foreach ($proximityScore as $score => $squashingObjects) {
foreach ($squashingObjects as $j => $squashingObject) {
if ($squashingObject == $match) {
unset($closestMatchResults[$i][$score][$j]);
}
}
if (empty($closestMatchResults[$i][$score])) {
unset($closestMatchResults[$i][$score]);
}
}
}
}
private function getProximityScore($value1, $value2)
{
if ($value1 == $value2) {
return -1;
} else {
return levenshtein(strtolower($value1), strtolower($value2));
}
}
private function getDataFromSource($source)
@ -251,7 +397,7 @@ class FieldSquasherCommand extends Command
private function processConfig($config)
{
$allowedModels = ['Organisations', 'Individuals'];
$allowedFinderType = ['exact', 'bestMatch'];
$allowedFinderType = ['exact', 'closest'];
if (empty($config['source']) || empty($config['finder']) || empty($config['target']) || empty($config['squashingData'])) {
$this->io->error('Error while parsing the configuration file, some of these fields are missing: `source`, `finder`, `target`, `squashingData`');
die(1);