#!/usr/bin/env python3
# coding=utf-8
"""
Tools to find duplicate values in galaxies
"""
import json
import os
import collections


def loadjsons(path, return_paths=False):
    """
    Find all JSON files in a directory and parse them.

    Parameters:
        path: string, directory to scan for .json files
        return_paths: boolean, whether the path of each file should be
            returned alongside its content, default: False

    Returns:
        List of parsed file contents. If return_paths is True, then every
        list item is a tuple of the file path and the parsed content.
    """
    files = []
    data = []
    for name in os.listdir(path):
        if os.path.isfile(os.path.join(path, name)) and name.endswith('.json'):
            files.append(name)
    for jfile in files:
        filepath = os.path.join(path, jfile)
        with open(filepath) as fp:
            content = json.load(fp)
        if return_paths:
            data.append((filepath, content))
        else:
            data.append(content)
    return data


if __name__ == '__main__':
    # Iterate over all names and synonyms and report what is duplicated.
    jsons = loadjsons("../clusters")
    counter = collections.Counter()
    namespace = []
    for djson in jsons:
        items = djson.get('values')
        for entry in items:
            name = entry.get('value').strip().lower()
            counter[name] += 1
            namespace.append([name, djson.get('name')])
            try:
                for synonym in entry.get('meta').get('synonyms'):
                    name = synonym.strip().lower()
                    counter[name] += 1
                    namespace.append([name, djson.get('name')])
            except (AttributeError, TypeError):
                # Entry has no 'meta' key or no 'synonyms' list; skip it.
                pass
    for key, val in counter.items():
        if val > 1:
            print("Warning: duplicate %s" % key)
            for item in namespace:
                if item[0] == key:
                    print(item)
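
# Usage sketch for loadjsons with return_paths=True (an assumption for
# illustration: a "../clusters" directory containing galaxy cluster JSON
# files with a top-level 'name' key, as the __main__ block above expects):
#
#     for filepath, cluster in loadjsons("../clusters", return_paths=True):
#         print(filepath, cluster.get('name'))
#
# Returning the path alongside the content makes it possible to report
# which file a duplicate entry came from, not just which galaxy.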