#!/usr/bin/env python3
# coding=utf-8
"""
    Tools to find duplicate in galaxies
"""
import json
import os
import collections


def loadjsons(path):
    """
    Load every ``*.json`` regular file found directly inside *path*.

    Files are read in sorted file-name order so the result (and any report
    built from it) is reproducible from one run to the next. Sub-directories
    are not descended into, and directories whose name happens to end in
    ``.json`` are skipped.

    :param path: directory to scan.
    :return: list with the decoded content of each JSON file.
    :raises OSError: if *path* cannot be listed or a file cannot be read.
    :raises json.JSONDecodeError: if a file does not contain valid JSON.
    """
    data = []
    for name in sorted(os.listdir(path)):
        full_path = os.path.join(path, name)
        if not (name.endswith('.json') and os.path.isfile(full_path)):
            continue
        # The context manager closes each handle promptly instead of leaving
        # it to the garbage collector.
        with open(full_path, encoding='utf-8') as handle:
            data.append(json.load(handle))
    return data


def find_duplicates(jsons):
    """
    Group every value and synonym by its normalised name and keep the
    names that occur more than once.

    Names are normalised with ``strip().lower()``. Each occurrence is
    recorded as ``[normalised_name, djson.get('name')]``.

    :param jsons: iterable of decoded JSON documents (dicts) as returned
                  by :func:`loadjsons`.
    :return: dict mapping each duplicated name to the list of its
             occurrences, in first-seen order.
    """
    occurrences = collections.defaultdict(list)
    for djson in jsons:
        origin = djson.get('name')
        for entry in djson.get('values') or []:
            name = entry.get('value').strip().lower()
            occurrences[name].append([name, origin])

            # 'meta' and 'synonyms' are optional: check their shape
            # explicitly rather than swallowing exceptions.
            meta = entry.get('meta')
            synonyms = meta.get('synonyms') if isinstance(meta, dict) else None
            if not isinstance(synonyms, (list, tuple)):
                continue
            for synonym in synonyms:
                if isinstance(synonym, str):
                    name = synonym.strip().lower()
                    occurrences[name].append([name, origin])

    return {key: found for key, found in occurrences.items() if len(found) > 1}


if __name__ == '__main__':
    # Iterate all name + synonyms and tell what is duplicated.
    duplicates = find_duplicates(loadjsons("../clusters"))
    for key, found in duplicates.items():
        print("Warning duplicate %s" % key)
        for item in found:
            print(item)