Add [tool] multithreading

pull/941/head
niclas 2024-03-05 10:23:19 +01:00
parent 94e0b855d1
commit 9514ce7fcd
6 changed files with 86 additions and 185 deletions

View File

@ -1,9 +1,12 @@
from modules.universe import Universe from modules.universe import Universe
from modules.site import IndexSite, StatisticsSite from modules.site import IndexSite, StatisticsSite
from utils.helper import generate_relations_table
import multiprocessing import multiprocessing
from multiprocessing import Pool from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor
import json import json
import os import os
import time import time
@ -16,6 +19,12 @@ CLUSTER_PATH = "../../clusters"
SITE_PATH = "./site/docs" SITE_PATH = "./site/docs"
GALAXY_PATH = "../../galaxies" GALAXY_PATH = "../../galaxies"
def write_relations_table(cluster):
    """Write the relations page ``<uuid>.md`` for a single cluster.

    Nothing is written when ``cluster.relationships`` is empty. The target
    directory is taken from the module-level name ``relation_path``, which is
    looked up at call time and therefore has to be set before this runs.

    Args:
        cluster: Object exposing ``uuid`` and ``relationships``.
    """
    if not cluster.relationships:
        return

    file_name = f"{cluster.uuid}.md"
    print(f"Writing {file_name}")
    target = os.path.join(relation_path, file_name)
    with open(target, "w") as handle:
        handle.write(generate_relations_table(cluster.relationships))
def get_cluster_relationships(cluster_data): def get_cluster_relationships(cluster_data):
galaxy, cluster = cluster_data galaxy, cluster = cluster_data
relationships = universe.get_relationships_with_levels(universe.galaxies[galaxy].clusters[cluster]) relationships = universe.get_relationships_with_levels(universe.galaxies[galaxy].clusters[cluster])
@ -32,36 +41,6 @@ def get_deprecated_galaxy_files():
return deprecated_galaxy_files return deprecated_galaxy_files
def cluster_transform_to_link(cluster):
    """Return a markdown link to the cluster's section on its galaxy page.

    The anchor is the lower-cased cluster value with ``" - "`` and spaces
    turned into ``-`` and ``/`` and ``:`` removed. The link label contains the
    cluster value followed by its UUID in parentheses.

    Args:
        cluster: Object exposing ``value``, ``uuid`` and
            ``galaxy.json_file_name``.
    """
    marker = "__TMP__"
    # " - " is parked behind a marker first so that the generic space
    # replacement does not turn it into "---".
    substitutions = (
        (" - ", marker),
        (" ", "-"),
        ("/", ""),
        (":", ""),
        (marker, "-"),
    )
    anchor = cluster.value.lower()
    for old, new in substitutions:
        anchor = anchor.replace(old, new)

    folder = cluster.galaxy.json_file_name.replace(".json", "")
    label = f"{cluster.value} ({cluster.uuid})"
    return f"[{label}](../../{folder}/index.md#{anchor})"
def galaxy_transform_to_link(galaxy):
    """Return a markdown link to the galaxy's index page.

    Args:
        galaxy: Object exposing ``galaxy_name`` and ``json_file_name``; the
            folder name is the file name with ``.json`` removed.
    """
    folder = galaxy.json_file_name.replace(".json", "")
    target = f"../../{folder}/index.md"
    return f"[{galaxy.galaxy_name}]({target})"
def generate_relations_table(relationships):
    """Render cluster relationships as a markdown table.

    Args:
        relationships: Iterable of ``(from_cluster, to_cluster, level)``
            triples.

    Returns:
        The table as a single string: a header row, a separator row and one
        row per relationship. A target whose value is ``"Private Cluster"``
        is rendered as plain text with the galaxy column set to ``Unknown``.
    """
    # Rows are collected and joined once instead of growing a string with
    # "+=" on every iteration.
    rows = [
        "|Cluster A | Galaxy A | Cluster B | Galaxy B | Level { .graph } |\n",
        "| --- | --- | --- | --- | --- |\n",
    ]
    for from_cluster, to_cluster, level in relationships:
        source_cells = (
            f"{cluster_transform_to_link(from_cluster)} | "
            f"{galaxy_transform_to_link(from_cluster.galaxy)}"
        )
        if to_cluster.value != "Private Cluster":
            target_cells = (
                f"{cluster_transform_to_link(to_cluster)} | "
                f"{galaxy_transform_to_link(to_cluster.galaxy)}"
            )
        else:
            # Private clusters carry no galaxy, so no link can be built.
            target_cells = f"{to_cluster.value} | Unknown"
        rows.append(f"{source_cells} | {target_cells} | {level}\n")
    return "".join(rows)
if __name__ == "__main__": if __name__ == "__main__":
start_time = time.time() start_time = time.time()
universe = Universe() universe = Universe()
@ -123,9 +102,10 @@ if __name__ == "__main__":
index.write_entry() index.write_entry()
statistics = StatisticsSite(SITE_PATH) statistics = StatisticsSite(SITE_PATH)
statistics.add_cluster_statistics(len([cluster for galaxy in universe.galaxies.values() for cluster in galaxy.clusters.values()]), len(universe.private_clusters))
statistics.add_galaxy_statistics(universe.galaxies.values()) statistics.add_galaxy_statistics(universe.galaxies.values())
statistics.add_cluster_statistics([cluster for galaxy in universe.galaxies.values() for cluster in galaxy.clusters.values()])
statistics.add_relation_statistics([cluster for galaxy in universe.galaxies.values() for cluster in galaxy.clusters.values()]) statistics.add_relation_statistics([cluster for galaxy in universe.galaxies.values() for cluster in galaxy.clusters.values()])
statistics.add_synonym_statistics([cluster for galaxy in universe.galaxies.values() for cluster in galaxy.clusters.values()])
statistics.write_entry() statistics.write_entry()
for galaxy in universe.galaxies.values(): for galaxy in universe.galaxies.values():
@ -141,10 +121,8 @@ if __name__ == "__main__":
with open(os.path.join(relation_path, ".pages"), "w") as index: with open(os.path.join(relation_path, ".pages"), "w") as index:
index.write(f"hide: true\n") index.write(f"hide: true\n")
for cluster in galaxy.clusters.values(): with ThreadPoolExecutor(max_workers=(multiprocessing.cpu_count() * 4)) as executor:
if cluster.relationships: executor.map(write_relations_table, galaxy.clusters.values())
print(f"Writing {cluster.uuid}.md")
with open(os.path.join(relation_path, f"{cluster.uuid}.md"), "w") as index:
index.write(generate_relations_table(cluster.relationships))
print(f"Finished in {time.time() - start_time} seconds") print(f"Finished in {time.time() - start_time} seconds")

View File

@ -12,6 +12,8 @@ class Site:
self.content += content self.content += content
def write_entry(self):
    """Write ``self.content`` to the file ``self.name`` inside ``self.path``.

    The directory ``self.path`` is created first when it is missing. An
    existing file of the same name is overwritten.
    """
    # exist_ok avoids the check-then-create race of a separate
    # os.path.exists() test when several writers target the same directory.
    os.makedirs(self.path, exist_ok=True)
    with open(os.path.join(self.path, self.name), "w") as index:
        index.write(self.content)
@ -30,23 +32,16 @@ class StatisticsSite(Site):
super().__init__(path=path, name="statistics.md") super().__init__(path=path, name="statistics.md")
def add_galaxy_statistics(self, galaxies):
    """Append the galaxy statistics section to the page content.

    Adds a top-level heading followed by two bar charts: the 20 galaxies
    with the most clusters and the 20 with the fewest.

    Args:
        galaxies: Iterable of galaxy objects exposing ``clusters``. The
            objects themselves are used as dictionary keys and handed to
            ``create_bar_chart(..., galaxy=True)``.
    """
    cluster_count_by_galaxy = {galaxy: len(galaxy.clusters) for galaxy in galaxies}
    sections = (
        ("## Galaxies with the most clusters\n\n", get_top_x(cluster_count_by_galaxy, 20)),
        ("## Galaxies with the least clusters\n\n", get_top_x(cluster_count_by_galaxy, 20, False)),
    )

    self.add_content("# Galaxy statistics\n")
    for heading, ranking in sections:
        self.add_content(heading)
        self.add_content(create_bar_chart(x_axis="Galaxy", y_axis="Count", values=ranking, galaxy=True))
def add_cluster_statistics(self, clusters): def add_cluster_statistics(self, public_clusters, private_clusters):
public_clusters = 0
private_clusters = 0
for cluster in clusters:
if cluster.value == "Private Cluster":
private_clusters += 1
else:
public_clusters += 1
values = {"Public clusters": public_clusters, "Private clusters": private_clusters} values = {"Public clusters": public_clusters, "Private clusters": private_clusters}
self.add_content(f"# Cluster statistics\n") self.add_content(f"# Cluster statistics\n")
self.add_content(f"## Number of clusters\n") self.add_content(f"## Number of clusters\n")
@ -58,9 +53,9 @@ class StatisticsSite(Site):
private_relations = 0 private_relations = 0
public_relations = 0 public_relations = 0
for cluster in clusters: for cluster in clusters:
cluster_relations[cluster.uuid] = len(cluster.relations) cluster_relations[cluster] = len(cluster.relationships)
for relation in cluster.relations: for relation in cluster.relationships:
if relation.to_cluster.value == "Private Cluster": if relation[1].value == "Private Cluster":
private_relations += 1 private_relations += 1
else: else:
public_relations += 1 public_relations += 1
@ -68,14 +63,20 @@ class StatisticsSite(Site):
flop_20 = get_top_x(cluster_relations, 20, False) flop_20 = get_top_x(cluster_relations, 20, False)
self.add_content(f"# Relation statistics\n") self.add_content(f"# Relation statistics\n")
self.add_content(f"Here you can find the total number of relations including public and private relations. The number includes relations between public clusters and relations between public and private clusters. Therefore relatons between private clusters are not included in the statistics.\n\n") self.add_content(f"Here you can find the total number of relations including public and private relations. The number includes relations between public clusters and relations between public and private clusters. Therefore relatons between private clusters are not included in the statistics.\n\n")
self.add_content(f"## Number of relations\n") self.add_content(f"## Number of relations\n\n")
self.add_content(create_pie_chart(sector="Type", unit="Count", values={"Public relations": public_relations, "Private relations": private_relations})) self.add_content(create_pie_chart(sector="Type", unit="Count", values={"Public relations": public_relations, "Private relations": private_relations}))
self.add_content(f"**Average number of relations per cluster**: {int(sum(cluster_relations.values()) / len(cluster_relations))}\n") self.add_content(f"**Average number of relations per cluster**: {int(sum(cluster_relations.values()) / len(cluster_relations))}\n")
self.add_content(f"## Cluster with the most relations\n") self.add_content(f"## Cluster with the most relations\n\n")
self.add_content(create_bar_chart(x_axis="Cluster", y_axis="Count", values=top_20)) self.add_content(create_bar_chart(x_axis="Cluster", y_axis="Count", values=top_20))
self.add_content(f"## Cluster with the least relations\n") self.add_content(f"## Cluster with the least relations\n\n")
self.add_content(create_bar_chart(x_axis="Cluster", y_axis="Count", values=flop_20)) self.add_content(create_bar_chart(x_axis="Cluster", y_axis="Count", values=flop_20))
def add_synonym_statistics(self, clusters):
    """Append the synonym statistics section to the page content.

    Clusters without ``meta`` or without a non-empty ``"synonyms"`` entry are
    ignored; the remaining ones are ranked by their number of synonyms and
    the top 20 are rendered as a bar chart.

    Args:
        clusters: Iterable of cluster objects exposing ``meta`` (a mapping
            or a falsy value).
    """
    synonym_counts = {}
    for cluster in clusters:
        # Single lookup; a missing meta or an empty synonym list both skip.
        names = cluster.meta.get("synonyms") if cluster.meta else None
        if names:
            synonym_counts[cluster] = len(names)

    top_20 = get_top_x(synonym_counts, 20)
    self.add_content("# Synonym statistics\n")
    self.add_content("## Cluster with the most synonyms\n\n")
    self.add_content(create_bar_chart(x_axis="Cluster", y_axis="Count", values=top_20))

View File

@ -1,118 +0,0 @@
from utils.helper import get_top_x, name_to_section
import os
class Statistics:
    """Collects counters about clusters and renders them as ``statistics.md``.

    The counters are plain attributes that callers fill in; ``create_entry``
    turns them into markdown appended to ``entry`` and ``write_entry`` writes
    that markdown to disk.
    """

    def __init__(self, cluster_dict):
        """Start with empty counters.

        Args:
            cluster_dict: Mapping of cluster UUID to cluster object; used to
                resolve the UUID keys of the count dictionaries to cluster
                values and galaxy file names.
        """
        self.public_relations_count = 0
        self.private_relations_count = 0
        self.private_clusters = []
        self.public_clusters_dict = {}  # cluster UUID -> galaxy (see add_cluster)
        self.relation_count_dict = {}  # cluster UUID -> count
        self.synonyms_count_dict = {}  # cluster UUID -> count
        self.empty_uuids_dict = {}
        self.cluster_dict = cluster_dict
        self.entry = ""

    def create_entry(self):
        """Append the complete statistics page to ``self.entry``."""
        # NOTE: this appends, so calling it twice duplicates the page.
        self.entry += "".join(
            [
                "# MISP Galaxy statistics\n",
                "The MISP galaxy statistics are automatically generated based on the MISP galaxy JSON files. Therefore the statistics only include detailed infomration about public clusters and relations. Some statistics about private clusters and relations is included but only as an approximation based on the information gathered from the public clusters.\n",
                "\n",
            ]
        )
        self._create_cluster_statistics()
        self._create_galaxy_statistics()
        self._create_relation_statistics()
        self._create_synonym_statistics()

    def _create_galaxy_statistics(self):
        """Append the tables of galaxies with the most and fewest clusters."""
        galaxy_counts = {}
        for galaxy in self.public_clusters_dict.values():
            galaxy_counts[galaxy] = galaxy_counts.get(galaxy, 0) + 1

        sections = (
            ("## Galaxies with the most clusters\n", ".log-bar-chart", True),
            ("## Galaxies with the least clusters\n", ".bar-chart", False),
        )
        lines = ["# Galaxy statistics\n"]
        for heading, chart_class, big_to_small in sections:
            galaxies, counts = get_top_x(galaxy_counts, 20, big_to_small)
            lines.append(heading)
            lines.append(f" | No. | Galaxy | Count {{ {chart_class} }}|\n")
            lines.append(" |----|--------|-------|\n")
            for position, (galaxy, count) in enumerate(zip(galaxies, counts), 1):
                galaxy_section = name_to_section(galaxy.json_file_name)
                lines.append(
                    f" | {position} | [{galaxy.name}](../{galaxy_section}) | {count} |\n"
                )
            lines.append("\n")
        self.entry += "".join(lines)

    def _create_cluster_statistics(self):
        """Append the public/private cluster count table."""
        self.entry += "".join(
            [
                "# Cluster statistics\n",
                "## Number of clusters\n",
                "Here you can find the total number of clusters including public and private clusters. The number of public clusters has been calculated based on the number of unique Clusters in the MISP galaxy JSON files. The number of private clusters could only be approximated based on the number of relations to non-existing clusters. Therefore the number of private clusters is not accurate and only an approximation.\n",
                "\n",
                "| No. | Type | Count { .pie-chart }|\n",
                "|-----|------|-----------------------|\n",
                f"| 1 | Public clusters | {len(self.public_clusters_dict)} |\n",
                f"| 2 | Private clusters | {len(self.private_clusters)} |\n",
                "\n",
            ]
        )

    def _create_relation_statistics(self):
        """Append relation counts, their average and the top-20 cluster table."""
        relation_counts = self.relation_count_dict
        # Guard the average: with no recorded clusters the division would
        # raise ZeroDivisionError and abort the whole page.
        if relation_counts:
            average = int(sum(relation_counts.values()) / len(relation_counts))
        else:
            average = 0

        self.entry += "".join(
            [
                "# Relation statistics\n",
                "Here you can find the total number of relations including public and private relations. The number includes relations between public clusters and relations between public and private clusters. Therefore relatons between private clusters are not included in the statistics.\n",
                "\n",
                "## Number of relations\n",
                "| No. | Type | Count { .pie-chart }|\n",
                "|----|------|-------|\n",
                f"| 1 | Public relations | {self.public_relations_count} |\n",
                f"| 2 | Private relations | {self.private_relations_count} |\n",
                "\n",
                f"**Average number of relations per cluster**: {average}\n",
                "## Cluster with the most relations\n",
            ]
        )
        self.entry += self._top_cluster_table(relation_counts)

    def _create_synonym_statistics(self):
        """Append the table of the 20 clusters with the most synonyms."""
        self.entry += "# Synonym statistics\n"
        self.entry += "## Cluster with the most synonyms\n"
        self.entry += self._top_cluster_table(self.synonyms_count_dict)

    def _top_cluster_table(self, count_by_uuid):
        """Return the top-20 bar-chart table for a ``uuid -> count`` mapping."""
        # Counts and galaxy files are re-keyed by cluster value because the
        # table shows values, not UUIDs.
        count_by_name = {
            self.cluster_dict[uuid].value: count
            for uuid, count in count_by_uuid.items()
        }
        names, counts = get_top_x(count_by_name, 20)
        galaxy_by_name = {
            self.cluster_dict[uuid].value: self.cluster_dict[uuid].galaxy.json_file_name
            for uuid in count_by_uuid
        }

        lines = [
            " | No. | Cluster | Count { .bar-chart }|\n",
            " |----|--------|-------|\n",
        ]
        for position, (name, count) in enumerate(zip(names, counts), 1):
            cluster_section = name_to_section(name)
            lines.append(
                f" | {position} | [{name}](../{galaxy_by_name[name]}/#{cluster_section}) | {count} |\n"
            )
        lines.append("\n")
        return "".join(lines)

    def write_entry(self, path):
        """Build the page and write it to ``statistics.md`` inside ``path``."""
        self.create_entry()
        with open(os.path.join(path, "statistics.md"), "w") as index:
            index.write(self.entry)

    def add_cluster(self, cluster):
        """Register a public cluster and link it back to this object.

        Stores ``cluster.galaxy`` under ``cluster.uuid`` and sets
        ``cluster.statistics`` to this instance.
        """
        self.public_clusters_dict[cluster.uuid] = cluster.galaxy
        cluster.statistics = self

View File

@ -8,6 +8,7 @@ class Universe:
def __init__(self, add_inbound_relationship=False):
    """Create an empty universe.

    Args:
        add_inbound_relationship: Stored unchanged on the instance as
            ``add_inbound_relationship``.
    """
    self.galaxies = {}  # Maps galaxy_name to Galaxy objects
    self.add_inbound_relationship = add_inbound_relationship
    self.private_clusters = {}  # Starts empty; filled elsewhere in the class
def add_galaxy(self, galaxy_name, json_file_name, authors, description): def add_galaxy(self, galaxy_name, json_file_name, authors, description):
if galaxy_name not in self.galaxies: if galaxy_name not in self.galaxies:
@ -39,7 +40,9 @@ class Universe:
cluster_b.add_inbound_relationship(cluster_a) cluster_b.add_inbound_relationship(cluster_a)
else: else:
if cluster_a: if cluster_a:
# private_cluster = self.add_cluster(uuid=cluster_b_id, galaxy_name="Unknown", description=None, value="Private Cluster", meta=None)
private_cluster = Cluster(uuid=cluster_b_id, galaxy=None, description=None, value="Private Cluster", meta=None) private_cluster = Cluster(uuid=cluster_b_id, galaxy=None, description=None, value="Private Cluster", meta=None)
self.private_clusters[cluster_b_id] = private_cluster
cluster_a.add_outbound_relationship(private_cluster) cluster_a.add_outbound_relationship(private_cluster)
else: else:
raise ValueError(f"Cluster {cluster_a} not found in any galaxy") raise ValueError(f"Cluster {cluster_a} not found in any galaxy")

View File

@ -82,7 +82,9 @@ document$.subscribe(function () {
path: nodePaths[id] path: nodePaths[id]
})); }));
const Parent_Node = nodes[0]; let header = document.querySelector('h1').textContent;
const parentUUID = header.replace(/\s+/g, '-').charAt(0).toLowerCase() + header.replace(/\s+/g, '-').slice(1);
const Parent_Node = nodes.find(node => node.id.includes(parentUUID));
var links = data.map(d => ({ source: d.source, target: d.target })); var links = data.map(d => ({ source: d.source, target: d.target }));

View File

@ -4,10 +4,8 @@ def get_top_x(dict, x, big_to_small=True):
sorted_dict = sorted( sorted_dict = sorted(
dict.items(), key=operator.itemgetter(1), reverse=big_to_small dict.items(), key=operator.itemgetter(1), reverse=big_to_small
)[:x] )[:x]
top_x = [key for key, value in sorted_dict] top_x = {key: value for key, value in sorted_dict}
top_x_values = sorted(dict.values(), reverse=big_to_small)[:x] return top_x
return top_x, top_x_values
def name_to_section(name): def name_to_section(name):
placeholder = "__TMP__" placeholder = "__TMP__"
@ -20,20 +18,57 @@ def name_to_section(name):
.replace(placeholder, "-") .replace(placeholder, "-")
) # Replace the placeholder with "-" ) # Replace the placeholder with "-"
def create_bar_chart(x_axis, y_axis, values, log=False, galaxy=False):
    """Render a ranking as a markdown table tagged as a bar chart.

    Args:
        x_axis: Header of the label column.
        y_axis: Header of the value column.
        values: Mapping of galaxy or cluster object to its count; rows
            follow the mapping's iteration order and are numbered from 1.
        log: When true the table is tagged ``.log-bar-chart`` instead of
            ``.bar-chart``.
        galaxy: When true the keys are linked with
            ``galaxy_transform_to_link``, otherwise with
            ``cluster_transform_to_link``.

    Returns:
        The table followed by one blank line, as a single string.
    """
    chart_class = ".log-bar-chart" if log else ".bar-chart"
    rows = [
        f"| No. | {x_axis} | {y_axis} {{ {chart_class} }}|\n",
        "|----|--------|-------|\n",
    ]
    for position, (key, count) in enumerate(values.items(), 1):
        if galaxy:
            link = galaxy_transform_to_link(key)
        else:
            link = cluster_transform_to_link(key)
        rows.append(f"| {position} | {link} | {count} |\n")
    rows.append("\n")
    return "".join(rows)
def create_pie_chart(sector, unit, values):
    """Render a mapping as a markdown table tagged as a pie chart.

    Args:
        sector: Header of the label column.
        unit: Header of the value column.
        values: Mapping of label to count; rows follow the mapping's
            iteration order and are numbered from 1.

    Returns:
        The table followed by one blank line, as a single string.
    """
    rows = [
        f"| No. | {sector} | {unit} {{ .pie-chart }}|\n",
        "|----|--------|-------|\n",
    ]
    rows.extend(
        f"| {position} | {label} | {count} |\n"
        for position, (label, count) in enumerate(values.items(), 1)
    )
    rows.append("\n")
    return "".join(rows)
def cluster_transform_to_link(cluster, uuid=False):
    """Return a markdown link to the cluster's section on its galaxy page.

    The anchor is the lower-cased cluster value with ``" - "`` and spaces
    turned into ``-`` and ``/`` and ``:`` removed.

    Args:
        cluster: Object exposing ``value``, ``uuid`` and
            ``galaxy.json_file_name``.
        uuid: When true the link label also shows the cluster UUID in
            parentheses.
    """
    marker = "__TMP__"
    # " - " is parked behind a marker first so that the generic space
    # replacement does not turn it into "---".
    substitutions = (
        (" - ", marker),
        (" ", "-"),
        ("/", ""),
        (":", ""),
        (marker, "-"),
    )
    anchor = cluster.value.lower()
    for old, new in substitutions:
        anchor = anchor.replace(old, new)

    folder = cluster.galaxy.json_file_name.replace(".json", "")
    label = f"{cluster.value} ({cluster.uuid})" if uuid else f"{cluster.value}"
    return f"[{label}](../../{folder}/index.md#{anchor})"
def galaxy_transform_to_link(galaxy):
    """Return a markdown link to the galaxy's index page.

    Args:
        galaxy: Object exposing ``galaxy_name`` and ``json_file_name``; the
            folder name is the file name with ``.json`` removed.
    """
    folder = galaxy.json_file_name.replace(".json", "")
    target = f"../../{folder}/index.md"
    return f"[{galaxy.galaxy_name}]({target})"
def generate_relations_table(relationships):
    """Render cluster relationships as a markdown table.

    Args:
        relationships: Iterable of ``(from_cluster, to_cluster, level)``
            triples.

    Returns:
        The table as a single string: a header row, a separator row and one
        row per relationship. Cluster links include the UUID. A target whose
        value is ``"Private Cluster"`` is rendered as plain text with the
        galaxy column set to ``Unknown``.
    """
    # Rows are collected and joined once instead of growing a string with
    # "+=" on every iteration.
    rows = [
        "|Cluster A | Galaxy A | Cluster B | Galaxy B | Level { .graph } |\n",
        "| --- | --- | --- | --- | --- |\n",
    ]
    for from_cluster, to_cluster, level in relationships:
        source_cells = (
            f"{cluster_transform_to_link(from_cluster, uuid=True)} | "
            f"{galaxy_transform_to_link(from_cluster.galaxy)}"
        )
        if to_cluster.value != "Private Cluster":
            target_cells = (
                f"{cluster_transform_to_link(to_cluster, uuid=True)} | "
                f"{galaxy_transform_to_link(to_cluster.galaxy)}"
            )
        else:
            # Private clusters carry no galaxy, so no link can be built.
            target_cells = f"{to_cluster.value} | Unknown"
        rows.append(f"{source_cells} | {target_cells} | {level}\n")
    return "".join(rows)