diff --git a/tools/mkdocs/generator.py b/tools/mkdocs/generator.py index 0b3342c..f765353 100644 --- a/tools/mkdocs/generator.py +++ b/tools/mkdocs/generator.py @@ -1,9 +1,12 @@ from modules.universe import Universe from modules.site import IndexSite, StatisticsSite +from utils.helper import generate_relations_table import multiprocessing from multiprocessing import Pool +from concurrent.futures import ThreadPoolExecutor + import json import os import time @@ -16,6 +19,12 @@ CLUSTER_PATH = "../../clusters" SITE_PATH = "./site/docs" GALAXY_PATH = "../../galaxies" +def write_relations_table(cluster): + if cluster.relationships: + print(f"Writing {cluster.uuid}.md") + with open(os.path.join(relation_path, f"{cluster.uuid}.md"), "w") as index: + index.write(generate_relations_table(cluster.relationships)) + def get_cluster_relationships(cluster_data): galaxy, cluster = cluster_data relationships = universe.get_relationships_with_levels(universe.galaxies[galaxy].clusters[cluster]) @@ -32,36 +41,6 @@ def get_deprecated_galaxy_files(): return deprecated_galaxy_files -def cluster_transform_to_link(cluster): - placeholder = "__TMP__" - section = ( - cluster - .value.lower() - .replace(" - ", placeholder) # Replace " - " first - .replace(" ", "-") - .replace("/", "") - .replace(":", "") - .replace(placeholder, "-") - ) - galaxy_folder = cluster.galaxy.json_file_name.replace(".json", "") - return f"[{cluster.value} ({cluster.uuid})](../../{galaxy_folder}/index.md#{section})" - -def galaxy_transform_to_link(galaxy): - galaxy_folder = galaxy.json_file_name.replace(".json", "") - return f"[{galaxy.galaxy_name}](../../{galaxy_folder}/index.md)" - -def generate_relations_table(relationships): - markdown = "|Cluster A | Galaxy A | Cluster B | Galaxy B | Level { .graph } |\n" - markdown += "| --- | --- | --- | --- | --- |\n" - for from_cluster, to_cluster, level in relationships: - from_galaxy = from_cluster.galaxy - if to_cluster.value != "Private Cluster": - to_galaxy = to_cluster.galaxy - markdown += f"{cluster_transform_to_link(from_cluster)} | {galaxy_transform_to_link(from_galaxy)} | {cluster_transform_to_link(to_cluster)} | {galaxy_transform_to_link(to_galaxy)} | {level}\n" - else: - markdown += f"{cluster_transform_to_link(from_cluster)} | {galaxy_transform_to_link(from_galaxy)} | {to_cluster.value} | Unknown | {level}\n" - return markdown - if __name__ == "__main__": start_time = time.time() universe = Universe() @@ -123,9 +102,10 @@ if __name__ == "__main__": index.write_entry() statistics = StatisticsSite(SITE_PATH) + statistics.add_cluster_statistics(len([cluster for galaxy in universe.galaxies.values() for cluster in galaxy.clusters.values()]), len(universe.private_clusters)) statistics.add_galaxy_statistics(universe.galaxies.values()) - statistics.add_cluster_statistics([cluster for galaxy in universe.galaxies.values() for cluster in galaxy.clusters.values()]) statistics.add_relation_statistics([cluster for galaxy in universe.galaxies.values() for cluster in galaxy.clusters.values()]) + statistics.add_synonym_statistics([cluster for galaxy in universe.galaxies.values() for cluster in galaxy.clusters.values()]) statistics.write_entry() for galaxy in universe.galaxies.values(): @@ -141,10 +121,8 @@ if __name__ == "__main__": with open(os.path.join(relation_path, ".pages"), "w") as index: index.write(f"hide: true\n") - for cluster in galaxy.clusters.values(): - if cluster.relationships: - print(f"Writing {cluster.uuid}.md") - with open(os.path.join(relation_path, f"{cluster.uuid}.md"), "w") as index: - index.write(generate_relations_table(cluster.relationships)) + with ThreadPoolExecutor(max_workers=(multiprocessing.cpu_count() * 4)) as executor: + executor.map(write_relations_table, galaxy.clusters.values()) + print(f"Finished in {time.time() - start_time} seconds") diff --git a/tools/mkdocs/modules/site.py b/tools/mkdocs/modules/site.py index 2e651dc..2938874 100644 --- a/tools/mkdocs/modules/site.py +++ b/tools/mkdocs/modules/site.py @@ -12,6 +12,8 @@ class Site: self.content += content def write_entry(self): + if not os.path.exists(self.path): + os.makedirs(self.path) with open(os.path.join(self.path, self.name), "w") as index: index.write(self.content) @@ -30,23 +32,16 @@ class StatisticsSite(Site): super().__init__(path=path, name="statistics.md") def add_galaxy_statistics(self, galaxies): - galaxy_cluster_count = {galaxy.galaxy_name: len(galaxy.clusters) for galaxy in galaxies} + galaxy_cluster_count = {galaxy: len(galaxy.clusters) for galaxy in galaxies} top_20 = get_top_x(galaxy_cluster_count, 20) flop_20 = get_top_x(galaxy_cluster_count, 20, False) self.add_content(f"# Galaxy statistics\n") - self.add_content(f"## Galaxies with the most clusters\n") - self.add_content(create_bar_chart(x_axis="Galaxy", y_axis="Count", values=top_20)) - self.add_content(f"## Galaxies with the least clusters\n") - self.add_content(create_bar_chart(x_axis="Galaxy", y_axis="Count", values=flop_20)) + self.add_content(f"## Galaxies with the most clusters\n\n") + self.add_content(create_bar_chart(x_axis="Galaxy", y_axis="Count", values=top_20, galaxy=True)) + self.add_content(f"## Galaxies with the least clusters\n\n") + self.add_content(create_bar_chart(x_axis="Galaxy", y_axis="Count", values=flop_20, galaxy=True)) - def add_cluster_statistics(self, clusters): - public_clusters = 0 - private_clusters = 0 - for cluster in clusters: - if cluster.value == "Private Cluster": - private_clusters += 1 - else: - public_clusters += 1 + def add_cluster_statistics(self, public_clusters, private_clusters): values = {"Public clusters": public_clusters, "Private clusters": private_clusters} self.add_content(f"# Cluster statistics\n") self.add_content(f"## Number of clusters\n") @@ -58,9 +53,9 @@ class StatisticsSite(Site): private_relations = 0 public_relations = 0 for cluster in clusters: - cluster_relations[cluster.uuid] = len(cluster.relations) - for relation in cluster.relations: - if relation.to_cluster.value == "Private Cluster": + cluster_relations[cluster] = len(cluster.relationships) + for relation in cluster.relationships: + if relation[1].value == "Private Cluster": private_relations += 1 else: public_relations += 1 @@ -68,14 +63,20 @@ class StatisticsSite(Site): flop_20 = get_top_x(cluster_relations, 20, False) self.add_content(f"# Relation statistics\n") self.add_content(f"Here you can find the total number of relations including public and private relations. The number includes relations between public clusters and relations between public and private clusters. Therefore relatons between private clusters are not included in the statistics.\n\n") - self.add_content(f"## Number of relations\n") + self.add_content(f"## Number of relations\n\n") self.add_content(create_pie_chart(sector="Type", unit="Count", values={"Public relations": public_relations, "Private relations": private_relations})) self.add_content(f"**Average number of relations per cluster**: {int(sum(cluster_relations.values()) / len(cluster_relations))}\n") - self.add_content(f"## Cluster with the most relations\n") + self.add_content(f"## Cluster with the most relations\n\n") self.add_content(create_bar_chart(x_axis="Cluster", y_axis="Count", values=top_20)) - self.add_content(f"## Cluster with the least relations\n") + self.add_content(f"## Cluster with the least relations\n\n") self.add_content(create_bar_chart(x_axis="Cluster", y_axis="Count", values=flop_20)) def add_synonym_statistics(self, clusters): - pass - + synonyms = {} + for cluster in clusters: + if cluster.meta and cluster.meta.get("synonyms"): + synonyms[cluster] = len(cluster.meta["synonyms"]) + top_20 = get_top_x(synonyms, 20) + self.add_content(f"# Synonym statistics\n") + self.add_content(f"## Cluster with the most synonyms\n\n") + self.add_content(create_bar_chart(x_axis="Cluster", y_axis="Count", values=top_20)) diff --git a/tools/mkdocs/modules/statistics.py b/tools/mkdocs/modules/statistics.py deleted file mode 100644 index a1c986a..0000000 --- a/tools/mkdocs/modules/statistics.py +++ /dev/null @@ -1,118 +0,0 @@ -from utils.helper import get_top_x, name_to_section -import os - - -class Statistics: - def __init__(self, cluster_dict): - self.public_relations_count = 0 - self.private_relations_count = 0 - self.private_clusters = [] - self.public_clusters_dict = {} - self.relation_count_dict = {} - self.synonyms_count_dict = {} - self.empty_uuids_dict = {} - self.cluster_dict = cluster_dict - self.entry = "" - - def create_entry(self): - self.entry += f"# MISP Galaxy statistics\n" - self.entry += "The MISP galaxy statistics are automatically generated based on the MISP galaxy JSON files. Therefore the statistics only include detailed infomration about public clusters and relations. Some statistics about private clusters and relations is included but only as an approximation based on the information gathered from the public clusters.\n" - self.entry += "\n" - self._create_cluster_statistics() - self._create_galaxy_statistics() - self._create_relation_statistics() - self._create_synonym_statistics() - - def _create_galaxy_statistics(self): - self.entry += f"# Galaxy statistics\n" - self.entry += f"## Galaxies with the most clusters\n" - galaxy_counts = {} - for galaxy in self.public_clusters_dict.values(): - galaxy_counts[galaxy] = galaxy_counts.get(galaxy, 0) + 1 - top_galaxies, top_galaxies_values = get_top_x(galaxy_counts, 20) - self.entry += f" | No. | Galaxy | Count {{ .log-bar-chart }}|\n" - self.entry += f" |----|--------|-------|\n" - for i, galaxy in enumerate(top_galaxies, 1): - galaxy_section = name_to_section(galaxy.json_file_name) - self.entry += f" | {i} | [{galaxy.name}](../{galaxy_section}) | {top_galaxies_values[i-1]} |\n" - self.entry += f"\n" - - self.entry += f"## Galaxies with the least clusters\n" - flop_galaxies, flop_galaxies_values = get_top_x(galaxy_counts, 20, False) - self.entry += f" | No. | Galaxy | Count {{ .bar-chart }}|\n" - self.entry += f" |----|--------|-------|\n" - for i, galaxy in enumerate(flop_galaxies, 1): - galaxy_section = name_to_section(galaxy.json_file_name) - self.entry += f" | {i} | [{galaxy.name}](../{galaxy_section}) | {flop_galaxies_values[i-1]} |\n" - self.entry += f"\n" - - def _create_cluster_statistics(self): - self.entry += f"# Cluster statistics\n" - self.entry += f"## Number of clusters\n" - self.entry += f"Here you can find the total number of clusters including public and private clusters. The number of public clusters has been calculated based on the number of unique Clusters in the MISP galaxy JSON files. The number of private clusters could only be approximated based on the number of relations to non-existing clusters. Therefore the number of private clusters is not accurate and only an approximation.\n" - self.entry += f"\n" - self.entry += f"| No. | Type | Count {{ .pie-chart }}|\n" - self.entry += f"|-----|------|-----------------------|\n" - self.entry += f"| 1 | Public clusters | {len(self.public_clusters_dict)} |\n" - self.entry += f"| 2 | Private clusters | {len(self.private_clusters)} |\n" - self.entry += f"\n" - - def _create_relation_statistics(self): - self.entry += f"# Relation statistics\n" - self.entry += f"Here you can find the total number of relations including public and private relations. The number includes relations between public clusters and relations between public and private clusters. Therefore relatons between private clusters are not included in the statistics.\n" - self.entry += f"\n" - self.entry += f"## Number of relations\n" - self.entry += f"| No. | Type | Count {{ .pie-chart }}|\n" - self.entry += f"|----|------|-------|\n" - self.entry += f"| 1 | Public relations | {self.public_relations_count} |\n" - self.entry += f"| 2 | Private relations | {self.private_relations_count} |\n" - self.entry += f"\n" - - self.entry += f"**Average number of relations per cluster**: {int(sum(self.relation_count_dict.values()) / len(self.relation_count_dict))}\n" - - self.entry += f"## Cluster with the most relations\n" - relation_count_dict_names = { - self.cluster_dict[uuid].value: count - for uuid, count in self.relation_count_dict.items() - } - top_25_relation, top_25_relation_values = get_top_x( - relation_count_dict_names, 20 - ) - self.entry += f" | No. | Cluster | Count {{ .bar-chart }}|\n" - self.entry += f" |----|--------|-------|\n" - relation_count_dict_galaxies = { - self.cluster_dict[uuid].value: self.cluster_dict[uuid].galaxy.json_file_name - for uuid in self.relation_count_dict.keys() - } - for i, cluster in enumerate(top_25_relation, 1): - cluster_section = name_to_section(cluster) - self.entry += f" | {i} | [{cluster}](../{relation_count_dict_galaxies[cluster]}/#{cluster_section}) | {top_25_relation_values[i-1]} |\n" - self.entry += f"\n" - - def _create_synonym_statistics(self): - self.entry += f"# Synonym statistics\n" - self.entry += f"## Cluster with the most synonyms\n" - synonyms_count_dict_names = { - self.cluster_dict[uuid].value: count - for uuid, count in self.synonyms_count_dict.items() - } - top_synonyms, top_synonyms_values = get_top_x(synonyms_count_dict_names, 20) - self.entry += f" | No. | Cluster | Count {{ .bar-chart }}|\n" - self.entry += f" |----|--------|-------|\n" - synonyms_count_dict_galaxies = { - self.cluster_dict[uuid].value: self.cluster_dict[uuid].galaxy.json_file_name - for uuid in self.synonyms_count_dict.keys() - } - for i, cluster in enumerate(top_synonyms, 1): - cluster_section = name_to_section(cluster) - self.entry += f" | {i} | [{cluster}](../{synonyms_count_dict_galaxies[cluster]}/#{cluster_section}) | {top_synonyms_values[i-1]} |\n" - self.entry += f"\n" - - def write_entry(self, path): - self.create_entry() - with open(os.path.join(path, "statistics.md"), "w") as index: - index.write(self.entry) - - def add_cluster(self, cluster): - self.public_clusters_dict[cluster.uuid] = cluster.galaxy - cluster.statistics = self diff --git a/tools/mkdocs/modules/universe.py b/tools/mkdocs/modules/universe.py index d2e701f..a22f48c 100644 --- a/tools/mkdocs/modules/universe.py +++ b/tools/mkdocs/modules/universe.py @@ -8,6 +8,7 @@ class Universe: def __init__(self, add_inbound_relationship=False): self.galaxies = {} # Maps galaxy_name to Galaxy objects self.add_inbound_relationship = add_inbound_relationship + self.private_clusters = {} def add_galaxy(self, galaxy_name, json_file_name, authors, description): if galaxy_name not in self.galaxies: @@ -39,7 +40,9 @@ class Universe: cluster_b.add_inbound_relationship(cluster_a) else: if cluster_a: + # private_cluster = self.add_cluster(uuid=cluster_b_id, galaxy_name="Unknown", description=None, value="Private Cluster", meta=None) private_cluster = Cluster(uuid=cluster_b_id, galaxy=None, description=None, value="Private Cluster", meta=None) + self.private_clusters[cluster_b_id] = private_cluster cluster_a.add_outbound_relationship(private_cluster) else: raise ValueError(f"Cluster {cluster_a} not found in any galaxy") diff --git a/tools/mkdocs/site/docs/01_attachements/javascripts/graph.js b/tools/mkdocs/site/docs/01_attachements/javascripts/graph.js index ced8070..db5f3ff 100644 --- a/tools/mkdocs/site/docs/01_attachements/javascripts/graph.js +++ b/tools/mkdocs/site/docs/01_attachements/javascripts/graph.js @@ -82,7 +82,9 @@ document$.subscribe(function () { path: nodePaths[id] })); - const Parent_Node = nodes[0]; + let header = document.querySelector('h1').textContent; + const parentUUID = header.replace(/\s+/g, '-').charAt(0).toLowerCase() + header.replace(/\s+/g, '-').slice(1); + const Parent_Node = nodes.find(node => node.id.includes(parentUUID)); var links = data.map(d => ({ source: d.source, target: d.target })); diff --git a/tools/mkdocs/utils/helper.py b/tools/mkdocs/utils/helper.py index 38a734b..c043fe7 100644 --- a/tools/mkdocs/utils/helper.py +++ b/tools/mkdocs/utils/helper.py @@ -4,10 +4,8 @@ def get_top_x(dict, x, big_to_small=True): sorted_dict = sorted( dict.items(), key=operator.itemgetter(1), reverse=big_to_small )[:x] - top_x = [key for key, value in sorted_dict] - top_x_values = sorted(dict.values(), reverse=big_to_small)[:x] - return top_x, top_x_values - + top_x = {key: value for key, value in sorted_dict} + return top_x def name_to_section(name): placeholder = "__TMP__" @@ -20,20 +18,57 @@ def name_to_section(name): .replace(placeholder, "-") ) # Replace the placeholder with "-" - -def create_bar_chart(x_axis, y_axis, values, log=False): +def create_bar_chart(x_axis, y_axis, values, log=False, galaxy=False): if not log: chart = f"| No. | {x_axis} | {y_axis} {{ .bar-chart }}|\n" else: chart = f"| No. | {x_axis} | {y_axis} {{ .log-bar-chart }}|\n" chart += f"|----|--------|-------|\n" - for i, x, y in enumerate(values): - chart += f"| {i+1} | {x} | {y} |\n" + for i, (x, y) in enumerate(values.items()): + if galaxy: + chart += f"| {i+1} | {galaxy_transform_to_link(x)} | {y} |\n" + else: + chart += f"| {i+1} | {cluster_transform_to_link(x)} | {y} |\n" + chart += "\n" return chart def create_pie_chart(sector, unit, values): chart = f"| No. | {sector} | {unit} {{ .pie-chart }}|\n" chart += f"|----|--------|-------|\n" - for i, x, y in enumerate(values): + for i, (x, y) in enumerate(values.items()): chart += f"| {i+1} | {x} | {y} |\n" + chart += "\n" return chart + +def cluster_transform_to_link(cluster, uuid=False): + placeholder = "__TMP__" + section = ( + cluster + .value.lower() + .replace(" - ", placeholder) # Replace " - " first + .replace(" ", "-") + .replace("/", "") + .replace(":", "") + .replace(placeholder, "-") + ) + galaxy_folder = cluster.galaxy.json_file_name.replace(".json", "") + if uuid: + return f"[{cluster.value} ({cluster.uuid})](../../{galaxy_folder}/index.md#{section})" + else: + return f"[{cluster.value}](../../{galaxy_folder}/index.md#{section})" + +def galaxy_transform_to_link(galaxy): + galaxy_folder = galaxy.json_file_name.replace(".json", "") + return f"[{galaxy.galaxy_name}](../../{galaxy_folder}/index.md)" + +def generate_relations_table(relationships): + markdown = "|Cluster A | Galaxy A | Cluster B | Galaxy B | Level { .graph } |\n" + markdown += "| --- | --- | --- | --- | --- |\n" + for from_cluster, to_cluster, level in relationships: + from_galaxy = from_cluster.galaxy + if to_cluster.value != "Private Cluster": + to_galaxy = to_cluster.galaxy + markdown += f"{cluster_transform_to_link(from_cluster, uuid=True)} | {galaxy_transform_to_link(from_galaxy)} | {cluster_transform_to_link(to_cluster, uuid=True)} | {galaxy_transform_to_link(to_galaxy)} | {level}\n" + else: + markdown += f"{cluster_transform_to_link(from_cluster, uuid=True)} | {galaxy_transform_to_link(from_galaxy)} | {to_cluster.value} | Unknown | {level}\n" + return markdown \ No newline at end of file