2017-02-26 02:53:06 +01:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
# coding=utf-8
|
|
|
|
"""
|
|
|
|
Tools to find duplicates in galaxies
|
|
|
|
"""
|
|
|
|
import json
|
|
|
|
import os
|
|
|
|
import collections
|
|
|
|
|
|
|
|
|
2019-08-30 09:57:05 +02:00
|
|
|
def loadjsons(path, return_paths=False):
    """
    Find all JSON files directly inside a directory and parse them.

    Only regular files whose name ends with '.json' are loaded;
    sub-directories are not descended into.

    Parameters:
        path: string, directory to scan
        return_paths: boolean, if the path of each file should be returned
            together with its content, default: False

    Returns:
        List of parsed file contents, in os.listdir() order.
        If return_paths is True, then every list item is a tuple of the
        file path (path joined with the file name) and the file content
    """
    data = []
    for name in os.listdir(path):
        filepath = os.path.join(path, name)
        if not (name.endswith('.json') and os.path.isfile(filepath)):
            continue
        # Parse exactly once and close the handle deterministically.
        with open(filepath) as handle:
            content = json.load(handle)
        data.append((filepath, content) if return_paths else content)
    return data
|
|
|
|
|
2019-08-30 09:57:05 +02:00
|
|
|
|
2017-02-26 02:53:06 +01:00
|
|
|
if __name__ == '__main__':
    # Iterate all name + synonyms of every loaded JSON document and
    # tell what is duplicated.
    jsons = loadjsons("../clusters")

    # Normalised (stripped, lower-cased) name -> every [name, source name]
    # pair seen for it, in encounter order. Grouping once here avoids
    # rescanning the full list of names for each duplicate found.
    occurrences = collections.defaultdict(list)
    for djson in jsons:
        source = djson.get('name')
        for entry in djson.get('values'):
            name = entry.get('value').strip().lower()
            occurrences[name].append([name, source])
            # Best effort: an entry without 'meta' or 'synonyms' (or with
            # an unusable synonym) is not an error; whatever synonyms were
            # read before the problem stay recorded.
            try:
                for synonym in entry.get('meta').get('synonyms'):
                    name = synonym.strip().lower()
                    occurrences[name].append([name, source])
            except (AttributeError, TypeError):
                pass

    # Report each name seen more than once, followed by all its occurrences.
    for key, hits in occurrences.items():
        if len(hits) > 1:
            print("Warning duplicate %s" % key)
            for item in hits:
                print(item)
|