Add [tool] multithreading

pull/941/head
niclas 2024-03-05 10:23:19 +01:00
parent 94e0b855d1
commit 9514ce7fcd
6 changed files with 86 additions and 185 deletions

View File

@ -1,9 +1,12 @@
from modules.universe import Universe
from modules.site import IndexSite, StatisticsSite
from utils.helper import generate_relations_table
import multiprocessing
from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor
import json
import os
import time
@ -16,6 +19,12 @@ CLUSTER_PATH = "../../clusters"
# Root directory of the generated documentation; handed to StatisticsSite
# further down in this script.
SITE_PATH = "./site/docs"
# NOTE(review): presumably the directory holding the galaxy JSON files --
# its use is not visible in this excerpt, confirm against the loader code.
GALAXY_PATH = "../../galaxies"
def write_relations_table(cluster):
    """Write the relations table of ``cluster`` to ``<uuid>.md``.

    Clusters without relationships are skipped. The target directory is
    taken from the module-level ``relation_path`` at call time, so that name
    must be set before this function runs.
    """
    if not cluster.relationships:
        return

    file_name = f"{cluster.uuid}.md"
    print(f"Writing {file_name}")
    destination = os.path.join(relation_path, file_name)
    with open(destination, "w") as handle:
        handle.write(generate_relations_table(cluster.relationships))
def get_cluster_relationships(cluster_data):
galaxy, cluster = cluster_data
relationships = universe.get_relationships_with_levels(universe.galaxies[galaxy].clusters[cluster])
@ -32,36 +41,6 @@ def get_deprecated_galaxy_files():
return deprecated_galaxy_files
def cluster_transform_to_link(cluster):
    """Return a Markdown link to ``cluster``'s section on its galaxy page.

    The link text is ``"<value> (<uuid>)"``; the anchor is derived from the
    lower-cased cluster value.
    """
    placeholder = "__TMP__"
    # Shield " - " behind a placeholder first so it collapses into a single
    # "-" instead of "---" once plain spaces are turned into dashes.
    anchor = cluster.value.lower().replace(" - ", placeholder)
    for old, new in ((" ", "-"), ("/", ""), (":", ""), (placeholder, "-")):
        anchor = anchor.replace(old, new)

    galaxy_folder = cluster.galaxy.json_file_name.replace(".json", "")
    label = f"{cluster.value} ({cluster.uuid})"
    return f"[{label}](../../{galaxy_folder}/index.md#{anchor})"
def galaxy_transform_to_link(galaxy):
    """Return a Markdown link to the index page of ``galaxy``.

    The folder name is the galaxy's JSON file name with ``.json`` removed.
    """
    folder = galaxy.json_file_name.replace(".json", "")
    target = f"../../{folder}/index.md"
    return f"[{galaxy.galaxy_name}]({target})"
def generate_relations_table(relationships):
    """Render ``relationships`` as a Markdown table tagged ``{ .graph }``.

    ``relationships`` is an iterable of ``(from_cluster, to_cluster, level)``
    triples. A target whose value is ``"Private Cluster"`` is printed as plain
    text with galaxy ``Unknown`` instead of being linked.
    """
    rows = [
        "|Cluster A | Galaxy A | Cluster B | Galaxy B | Level { .graph } |\n",
        "| --- | --- | --- | --- | --- |\n",
    ]
    for source, target, level in relationships:
        source_cells = (
            f"{cluster_transform_to_link(source)} | "
            f"{galaxy_transform_to_link(source.galaxy)}"
        )
        if target.value == "Private Cluster":
            target_cells = f"{target.value} | Unknown"
        else:
            target_cells = (
                f"{cluster_transform_to_link(target)} | "
                f"{galaxy_transform_to_link(target.galaxy)}"
            )
        rows.append(f"{source_cells} | {target_cells} | {level}\n")
    return "".join(rows)
if __name__ == "__main__":
start_time = time.time()
universe = Universe()
@ -123,9 +102,10 @@ if __name__ == "__main__":
index.write_entry()
statistics = StatisticsSite(SITE_PATH)
statistics.add_cluster_statistics(len([cluster for galaxy in universe.galaxies.values() for cluster in galaxy.clusters.values()]), len(universe.private_clusters))
statistics.add_galaxy_statistics(universe.galaxies.values())
statistics.add_cluster_statistics([cluster for galaxy in universe.galaxies.values() for cluster in galaxy.clusters.values()])
statistics.add_relation_statistics([cluster for galaxy in universe.galaxies.values() for cluster in galaxy.clusters.values()])
statistics.add_synonym_statistics([cluster for galaxy in universe.galaxies.values() for cluster in galaxy.clusters.values()])
statistics.write_entry()
for galaxy in universe.galaxies.values():
@ -141,10 +121,8 @@ if __name__ == "__main__":
with open(os.path.join(relation_path, ".pages"), "w") as index:
index.write(f"hide: true\n")
for cluster in galaxy.clusters.values():
if cluster.relationships:
print(f"Writing {cluster.uuid}.md")
with open(os.path.join(relation_path, f"{cluster.uuid}.md"), "w") as index:
index.write(generate_relations_table(cluster.relationships))
# Write the per-cluster relation pages of this galaxy concurrently. The work
# is file I/O, so threads overlap the waits; leaving the ``with`` block joins
# all workers before the next galaxy is processed.
with ThreadPoolExecutor(max_workers=(multiprocessing.cpu_count() * 4)) as executor:
    # Executor.map only re-raises a worker's exception when its result is
    # retrieved; drain the iterator so a failed write is reported instead of
    # being silently discarded.
    for _ in executor.map(write_relations_table, galaxy.clusters.values()):
        pass
print(f"Finished in {time.time() - start_time} seconds")

View File

@ -12,6 +12,8 @@ class Site:
self.content += content
def write_entry(self):
    """Write ``self.content`` to the file ``self.name`` inside ``self.path``.

    The target directory (including missing parents) is created on demand.
    An existing file of the same name is overwritten.
    """
    # exist_ok=True replaces the former exists()/makedirs() pair, which could
    # raise FileExistsError if the directory appeared between the two calls.
    os.makedirs(self.path, exist_ok=True)
    with open(os.path.join(self.path, self.name), "w") as index:
        index.write(self.content)
@ -30,23 +32,16 @@ class StatisticsSite(Site):
super().__init__(path=path, name="statistics.md")
def add_galaxy_statistics(self, galaxies):
    """Append the galaxy section: the 20 galaxies with the most and the 20
    with the fewest clusters, each rendered as a bar chart table.

    Args:
        galaxies: Iterable of galaxy objects exposing ``clusters``.
    """
    # Keyed by the galaxy object itself (not its name) because
    # create_bar_chart(galaxy=True) passes each key to the galaxy link helper.
    # The superseded name-keyed mapping and the chart calls without
    # galaxy=True were removed: they emitted both sections a second time and
    # sent galaxy keys down the cluster-link code path.
    galaxy_cluster_count = {galaxy: len(galaxy.clusters) for galaxy in galaxies}
    top_20 = get_top_x(galaxy_cluster_count, 20)
    flop_20 = get_top_x(galaxy_cluster_count, 20, False)
    self.add_content("# Galaxy statistics\n")
    self.add_content("## Galaxies with the most clusters\n\n")
    self.add_content(create_bar_chart(x_axis="Galaxy", y_axis="Count", values=top_20, galaxy=True))
    self.add_content("## Galaxies with the least clusters\n\n")
    self.add_content(create_bar_chart(x_axis="Galaxy", y_axis="Count", values=flop_20, galaxy=True))
def add_cluster_statistics(self, clusters):
public_clusters = 0
private_clusters = 0
for cluster in clusters:
if cluster.value == "Private Cluster":
private_clusters += 1
else:
public_clusters += 1
def add_cluster_statistics(self, public_clusters, private_clusters):
values = {"Public clusters": public_clusters, "Private clusters": private_clusters}
self.add_content(f"# Cluster statistics\n")
self.add_content(f"## Number of clusters\n")
@ -58,9 +53,9 @@ class StatisticsSite(Site):
private_relations = 0
public_relations = 0
for cluster in clusters:
cluster_relations[cluster.uuid] = len(cluster.relations)
for relation in cluster.relations:
if relation.to_cluster.value == "Private Cluster":
cluster_relations[cluster] = len(cluster.relationships)
for relation in cluster.relationships:
if relation[1].value == "Private Cluster":
private_relations += 1
else:
public_relations += 1
@ -68,14 +63,20 @@ class StatisticsSite(Site):
flop_20 = get_top_x(cluster_relations, 20, False)
self.add_content(f"# Relation statistics\n")
self.add_content(f"Here you can find the total number of relations including public and private relations. The number includes relations between public clusters and relations between public and private clusters. Therefore relatons between private clusters are not included in the statistics.\n\n")
self.add_content(f"## Number of relations\n")
self.add_content(f"## Number of relations\n\n")
self.add_content(create_pie_chart(sector="Type", unit="Count", values={"Public relations": public_relations, "Private relations": private_relations}))
self.add_content(f"**Average number of relations per cluster**: {int(sum(cluster_relations.values()) / len(cluster_relations))}\n")
self.add_content(f"## Cluster with the most relations\n")
self.add_content(f"## Cluster with the most relations\n\n")
self.add_content(create_bar_chart(x_axis="Cluster", y_axis="Count", values=top_20))
self.add_content(f"## Cluster with the least relations\n")
self.add_content(f"## Cluster with the least relations\n\n")
self.add_content(create_bar_chart(x_axis="Cluster", y_axis="Count", values=flop_20))
def add_synonym_statistics(self, clusters):
    """Append the synonym section: a bar chart of the 20 clusters that list
    the most synonyms.

    Clusters without ``meta`` or with an empty/missing ``"synonyms"`` entry
    are left out of the ranking.
    """
    synonym_counts = {
        cluster: len(cluster.meta["synonyms"])
        for cluster in clusters
        if cluster.meta and cluster.meta.get("synonyms")
    }
    ranking = get_top_x(synonym_counts, 20)
    self.add_content("# Synonym statistics\n")
    self.add_content("## Cluster with the most synonyms\n\n")
    self.add_content(create_bar_chart(x_axis="Cluster", y_axis="Count", values=ranking))

View File

@ -1,118 +0,0 @@
from utils.helper import get_top_x, name_to_section
import os
class Statistics:
    """Collect counters about galaxy clusters and render them as Markdown.

    Callers fill the public counters and dictionaries (directly or through
    ``add_cluster``) and then call ``write_entry`` to produce
    ``statistics.md``.
    """

    def __init__(self, cluster_dict):
        """Start with empty counters.

        Args:
            cluster_dict: Mapping of cluster UUID to cluster object; used to
                look up ``value`` and ``galaxy`` when tables are rendered.
        """
        self.public_relations_count = 0
        self.private_relations_count = 0
        self.private_clusters = []
        self.public_clusters_dict = {}  # cluster uuid -> galaxy (see add_cluster)
        self.relation_count_dict = {}  # cluster uuid -> relation count
        self.synonyms_count_dict = {}  # cluster uuid -> synonym count
        self.empty_uuids_dict = {}
        self.cluster_dict = cluster_dict
        self.entry = ""

    def create_entry(self):
        """Append the complete statistics page to ``self.entry``."""
        self.entry += f"# MISP Galaxy statistics\n"
        self.entry += "The MISP galaxy statistics are automatically generated based on the MISP galaxy JSON files. Therefore the statistics only include detailed infomration about public clusters and relations. Some statistics about private clusters and relations is included but only as an approximation based on the information gathered from the public clusters.\n"
        self.entry += "\n"
        self._create_cluster_statistics()
        self._create_galaxy_statistics()
        self._create_relation_statistics()
        self._create_synonym_statistics()

    def _create_galaxy_statistics(self):
        """Append the top-20 and bottom-20 galaxies by public cluster count."""
        self.entry += f"# Galaxy statistics\n"
        self.entry += f"## Galaxies with the most clusters\n"
        # Every public cluster contributes one count to its galaxy.
        galaxy_counts = {}
        for galaxy in self.public_clusters_dict.values():
            galaxy_counts[galaxy] = galaxy_counts.get(galaxy, 0) + 1
        self._append_galaxy_table(galaxy_counts, ".log-bar-chart", True)
        self.entry += f"## Galaxies with the least clusters\n"
        self._append_galaxy_table(galaxy_counts, ".bar-chart", False)

    def _append_galaxy_table(self, galaxy_counts, chart_class, big_to_small):
        """Append one ranked galaxy table tagged with ``chart_class``."""
        galaxies, counts = get_top_x(galaxy_counts, 20, big_to_small)
        self.entry += f" | No. | Galaxy | Count {{ {chart_class} }}|\n"
        self.entry += f" |----|--------|-------|\n"
        for rank, (galaxy, count) in enumerate(zip(galaxies, counts), 1):
            galaxy_section = name_to_section(galaxy.json_file_name)
            self.entry += f" | {rank} | [{galaxy.name}](../{galaxy_section}) | {count} |\n"
        self.entry += f"\n"

    def _create_cluster_statistics(self):
        """Append the public/private cluster totals as a pie chart table."""
        self.entry += f"# Cluster statistics\n"
        self.entry += f"## Number of clusters\n"
        self.entry += f"Here you can find the total number of clusters including public and private clusters. The number of public clusters has been calculated based on the number of unique Clusters in the MISP galaxy JSON files. The number of private clusters could only be approximated based on the number of relations to non-existing clusters. Therefore the number of private clusters is not accurate and only an approximation.\n"
        self.entry += f"\n"
        self.entry += f"| No. | Type | Count {{ .pie-chart }}|\n"
        self.entry += f"|-----|------|-----------------------|\n"
        self.entry += f"| 1 | Public clusters | {len(self.public_clusters_dict)} |\n"
        self.entry += f"| 2 | Private clusters | {len(self.private_clusters)} |\n"
        self.entry += f"\n"

    def _average_relations(self):
        """Return the truncated mean of ``relation_count_dict`` (0 if empty)."""
        # Guard the division: with no recorded clusters the original
        # expression raised ZeroDivisionError.
        if not self.relation_count_dict:
            return 0
        return int(sum(self.relation_count_dict.values()) / len(self.relation_count_dict))

    def _create_relation_statistics(self):
        """Append relation totals, the average, and the top-20 clusters."""
        self.entry += f"# Relation statistics\n"
        self.entry += f"Here you can find the total number of relations including public and private relations. The number includes relations between public clusters and relations between public and private clusters. Therefore relatons between private clusters are not included in the statistics.\n"
        self.entry += f"\n"
        self.entry += f"## Number of relations\n"
        self.entry += f"| No. | Type | Count {{ .pie-chart }}|\n"
        self.entry += f"|----|------|-------|\n"
        self.entry += f"| 1 | Public relations | {self.public_relations_count} |\n"
        self.entry += f"| 2 | Private relations | {self.private_relations_count} |\n"
        self.entry += f"\n"
        self.entry += f"**Average number of relations per cluster**: {self._average_relations()}\n"
        self.entry += f"## Cluster with the most relations\n"
        self._append_cluster_table(self.relation_count_dict)

    def _create_synonym_statistics(self):
        """Append the top-20 clusters by number of synonyms."""
        self.entry += f"# Synonym statistics\n"
        self.entry += f"## Cluster with the most synonyms\n"
        self._append_cluster_table(self.synonyms_count_dict)

    def _append_cluster_table(self, count_by_uuid):
        """Append a ranked top-20 cluster table built from ``count_by_uuid``."""
        # Rankings are keyed by the cluster's display value, so clusters that
        # share a value collapse into one row (unchanged from the original).
        count_by_name = {
            self.cluster_dict[uuid].value: count
            for uuid, count in count_by_uuid.items()
        }
        names, counts = get_top_x(count_by_name, 20)
        self.entry += f" | No. | Cluster | Count {{ .bar-chart }}|\n"
        self.entry += f" |----|--------|-------|\n"
        galaxy_file_by_name = {
            self.cluster_dict[uuid].value: self.cluster_dict[uuid].galaxy.json_file_name
            for uuid in count_by_uuid.keys()
        }
        for rank, (cluster, count) in enumerate(zip(names, counts), 1):
            cluster_section = name_to_section(cluster)
            self.entry += f" | {rank} | [{cluster}](../{galaxy_file_by_name[cluster]}/#{cluster_section}) | {count} |\n"
        self.entry += f"\n"

    def write_entry(self, path):
        """Render the page and write it to ``<path>/statistics.md``."""
        self.create_entry()
        with open(os.path.join(path, "statistics.md"), "w") as index:
            index.write(self.entry)

    def add_cluster(self, cluster):
        """Register ``cluster`` as public and attach this object to it."""
        self.public_clusters_dict[cluster.uuid] = cluster.galaxy
        cluster.statistics = self

View File

@ -8,6 +8,7 @@ class Universe:
def __init__(self, add_inbound_relationship=False):
    """Create an empty universe.

    Args:
        add_inbound_relationship: Stored on the instance unchanged.
            NOTE(review): presumably controls whether relationships are
            also recorded on the target cluster -- the code reading it is
            outside this excerpt, confirm there.
    """
    self.galaxies = {}  # Maps galaxy_name to Galaxy objects
    self.add_inbound_relationship = add_inbound_relationship
    # Placeholder clusters for relationship targets that are not found in any
    # galaxy; the relationship code stores them here keyed by target UUID.
    self.private_clusters = {}
def add_galaxy(self, galaxy_name, json_file_name, authors, description):
if galaxy_name not in self.galaxies:
@ -39,7 +40,9 @@ class Universe:
cluster_b.add_inbound_relationship(cluster_a)
else:
if cluster_a:
# private_cluster = self.add_cluster(uuid=cluster_b_id, galaxy_name="Unknown", description=None, value="Private Cluster", meta=None)
private_cluster = Cluster(uuid=cluster_b_id, galaxy=None, description=None, value="Private Cluster", meta=None)
self.private_clusters[cluster_b_id] = private_cluster
cluster_a.add_outbound_relationship(private_cluster)
else:
raise ValueError(f"Cluster {cluster_a} not found in any galaxy")

View File

@ -82,7 +82,9 @@ document$.subscribe(function () {
path: nodePaths[id]
}));
const Parent_Node = nodes[0];
let header = document.querySelector('h1').textContent;
const parentUUID = header.replace(/\s+/g, '-').charAt(0).toLowerCase() + header.replace(/\s+/g, '-').slice(1);
const Parent_Node = nodes.find(node => node.id.includes(parentUUID));
var links = data.map(d => ({ source: d.source, target: d.target }));

View File

@ -4,10 +4,8 @@ def get_top_x(dict, x, big_to_small=True):
sorted_dict = sorted(
dict.items(), key=operator.itemgetter(1), reverse=big_to_small
)[:x]
top_x = [key for key, value in sorted_dict]
top_x_values = sorted(dict.values(), reverse=big_to_small)[:x]
return top_x, top_x_values
top_x = {key: value for key, value in sorted_dict}
return top_x
def name_to_section(name):
placeholder = "__TMP__"
@ -20,20 +18,57 @@ def name_to_section(name):
.replace(placeholder, "-")
) # Replace the placeholder with "-"
def create_bar_chart(x_axis, y_axis, values, log=False, galaxy=False):
    """Render ``values`` as a Markdown table tagged as a bar chart.

    Args:
        x_axis: Header of the label column.
        y_axis: Header of the value column.
        values: Mapping of galaxy/cluster object to its count; rows follow
            the mapping's iteration order.
        log: Tag the table ``.log-bar-chart`` instead of ``.bar-chart``.
        galaxy: Treat the keys as galaxies (linked with
            ``galaxy_transform_to_link``) instead of clusters.

    Returns:
        The table as a string, terminated by an empty line.
    """
    chart_class = ".log-bar-chart" if log else ".bar-chart"
    chart = f"| No. | {x_axis} | {y_axis} {{ {chart_class} }}|\n"
    chart += "|----|--------|-------|\n"
    # Iterate key/value pairs of the mapping. The stale
    # ``for i, x, y in enumerate(values)`` header could never work because
    # enumerate yields 2-tuples, so it has been dropped together with the
    # superseded four-parameter ``def`` line.
    for position, (item, count) in enumerate(values.items(), start=1):
        if galaxy:
            label = galaxy_transform_to_link(item)
        else:
            label = cluster_transform_to_link(item)
        chart += f"| {position} | {label} | {count} |\n"
    chart += "\n"
    return chart
def create_pie_chart(sector, unit, values):
    """Render ``values`` as a Markdown table tagged ``{ .pie-chart }``.

    Args:
        sector: Header of the label column.
        unit: Header of the value column.
        values: Mapping of sector label to its value; rows follow the
            mapping's iteration order.

    Returns:
        The table as a string, terminated by an empty line.
    """
    chart = f"| No. | {sector} | {unit} {{ .pie-chart }}|\n"
    chart += "|----|--------|-------|\n"
    # Unpack (label, value) pairs from the mapping; the stale
    # ``for i, x, y in enumerate(values)`` header raised ValueError because
    # enumerate yields 2-tuples.
    for position, (label, amount) in enumerate(values.items(), start=1):
        chart += f"| {position} | {label} | {amount} |\n"
    chart += "\n"
    return chart
def cluster_transform_to_link(cluster, uuid=False):
    """Return a Markdown link to ``cluster``'s section on its galaxy page.

    Args:
        cluster: Object exposing ``value``, ``uuid`` and
            ``galaxy.json_file_name``.
        uuid: When true, the link text also shows the cluster UUID in
            parentheses.
    """
    placeholder = "__TMP__"
    # Spaces become dashes, "/" and ":" are dropped. " - " is shielded by the
    # placeholder beforehand so it ends up as a single dash.
    single_chars = str.maketrans({" ": "-", "/": None, ":": None})
    shielded = cluster.value.lower().replace(" - ", placeholder)
    section = shielded.translate(single_chars).replace(placeholder, "-")

    galaxy_folder = cluster.galaxy.json_file_name.replace(".json", "")
    if uuid:
        label = f"{cluster.value} ({cluster.uuid})"
    else:
        label = f"{cluster.value}"
    return f"[{label}](../../{galaxy_folder}/index.md#{section})"
def galaxy_transform_to_link(galaxy):
    """Return a Markdown link, labelled with the galaxy name, to the index
    page in the folder named after the galaxy's JSON file."""
    folder_name = galaxy.json_file_name.replace(".json", "")
    return "[{}](../../{}/index.md)".format(galaxy.galaxy_name, folder_name)
def generate_relations_table(relationships):
    """Render ``relationships`` as a Markdown table tagged ``{ .graph }``.

    Each item is a ``(from_cluster, to_cluster, level)`` triple. Cluster
    links include the UUID. A target whose value is ``"Private Cluster"`` is
    shown as plain text with galaxy ``Unknown``.
    """
    lines = [
        "|Cluster A | Galaxy A | Cluster B | Galaxy B | Level { .graph } |\n",
        "| --- | --- | --- | --- | --- |\n",
    ]
    for origin, destination, level in relationships:
        cells = [
            cluster_transform_to_link(origin, uuid=True),
            galaxy_transform_to_link(origin.galaxy),
        ]
        if destination.value == "Private Cluster":
            cells += [destination.value, "Unknown"]
        else:
            cells += [
                cluster_transform_to_link(destination, uuid=True),
                galaxy_transform_to_link(destination.galaxy),
            ]
        lines.append(" | ".join(cells) + f" | {level}\n")
    return "".join(lines)