diff --git a/tools/mkdocs/generator.py b/tools/mkdocs/generator.py index a1bbe6d..b32f2b0 100644 --- a/tools/mkdocs/generator.py +++ b/tools/mkdocs/generator.py @@ -1,39 +1,29 @@ #!/usr/bin/python import json +import operator import os +import re from typing import List import validators CLUSTER_PATH = '../../clusters' SITE_PATH = './site/docs' -PROJECTS_PATH = './site/projects' +# PROJECTS_PATH = './site/projects' -galaxies_fnames = [] FILES_TO_IGNORE = [] # if you want to skip a specific cluster in the generation -for f in os.listdir(CLUSTER_PATH): - if '.json' in f and f not in FILES_TO_IGNORE: - galaxies_fnames.append(f) - -galaxies_fnames.sort() -galaxy_output = {} - # Variables for statistics public_relations_count = 0 private_relations_count = 0 - private_clusters = [] -public_clusters = [] - +public_clusters_dict = {} relation_count_dict = {} synonyms_count_dict = {} - empty_uuids_dict = {} - -intro = """ +INTRO = """ # MISP Galaxy The MISP galaxy offers a streamlined approach for representing large entities, known as clusters, which can be linked to MISP events or attributes. Each cluster consists of one or more elements, represented as key-value pairs. MISP galaxy comes with a default knowledge base, encompassing areas like Threat Actors, Tools, Ransomware, and ATT&CK matrices. However, users have the flexibility to modify, update, replace, or share these elements according to their needs. @@ -51,7 +41,15 @@ Clusters serve as an open and freely accessible knowledge base, which can be uti ## Publicly available clusters """ -contributing = """ + +STATISTICS= """ +## Statistics + +You can find some statistics about MISP galaxies [here](./statistics.md). + +""" + +CONTRIBUTING = """ # Contributing @@ -95,6 +93,7 @@ class Galaxy(): self.entry += f' |{author}|\n' def _create_clusters(self): + # global public_clusters_dict clusters = [] for cluster in self.cluster_list: clusters.append(Cluster( @@ -104,21 +103,29 @@ class Galaxy(): date=cluster.get('date', None), related_list=cluster.get('related', None), meta=cluster.get('meta', None), - galaxie=self.name + galaxie=self.json_file_name )) return clusters - def _create_clusters_entry(self): + def _create_clusters_entry(self, cluster_dict): for cluster in self.clusters: - self.entry += cluster.create_entry() + self.entry += cluster.create_entry(cluster_dict) - def create_entry(self): + def create_entry(self, cluster_dict): self._create_metadata_entry() self._create_title_entry() self._create_description_entry() self._create_authors_entry() - self._create_clusters_entry() + self._create_clusters_entry(cluster_dict) return self.entry + + def write_entry(self, path, cluster_dict): + self.create_entry(cluster_dict) + galaxy_path = os.path.join(path, self.json_file_name) + if not os.path.exists(galaxy_path): + os.mkdir(galaxy_path) + with open(os.path.join(galaxy_path, 'index.md'), "w") as index: + index.write(self.entry) class Cluster(): def __init__(self, description, uuid, date, value, related_list, meta, galaxie): @@ -130,6 +137,10 @@ class Cluster(): self.meta = meta self.entry = "" self.galaxie = galaxie + + global public_clusters_dict + if self.galaxie: + public_clusters_dict[self.uuid] = self.galaxie def _create_title_entry(self): self.entry += f'## {self.value}\n' @@ -189,10 +200,9 @@ class Cluster(): self.entry += f' | {meta} | {self.meta[meta]} |\n' - def get_related_clusters(self, depth=-1, visited=None): + def get_related_clusters(self, cluster_dict, depth=-1, visited=None): global public_relations_count global private_relations_count - global public_clusters global private_clusters global empty_uuids_dict empty_uuids = 0 @@ -224,14 +234,12 @@ class Cluster(): related_cluster = cluster_dict[dest_uuid] public_relations_count += 1 - if dest_uuid not in public_clusters: - public_clusters.append(dest_uuid) related_clusters.append((self, related_cluster)) if (depth > 1 or depth == -1) and related_cluster.uuid not in visited: new_depth = depth - 1 if depth > 1 else -1 - related_clusters += related_cluster.get_related_clusters(depth=new_depth, visited=visited) + related_clusters += related_cluster.get_related_clusters(depth=new_depth, visited=visited, cluster_dict=cluster_dict) if empty_uuids > 0: empty_uuids_dict[self.value] = empty_uuids @@ -242,87 +250,174 @@ class Cluster(): return related_clusters - def _create_related_entry(self): - if self.related_list and cluster_dict: - related_clusters = self.get_related_clusters() - self.entry += f'\n' - self.entry += f'??? info "Related clusters"\n' - self.entry += f'\n' - self.entry += f' ```mermaid\n' - self.entry += f' graph TD\n' + # def _create_related_entry(self, cluster_dict): + # if self.related_list and cluster_dict: + # related_clusters = self.get_related_clusters(cluster_dict) + # self.entry += f'\n' + # self.entry += f'??? info "Related clusters"\n' + # self.entry += f'\n' + # self.entry += f' ```mermaid\n' + # self.entry += f' graph TD\n' - global relation_count_dict - relation_count = 0 + # global relation_count_dict + # relation_count = 0 - for relation in related_clusters: - relation_count += 1 - # print(self.value) - # print(relation) - # print(relation[0].value) - # print(relation[1].value) - self.entry += f' {relation[0].uuid}[{relation[0].value}] --- {relation[1].uuid}[{relation[1].value}]\n' - self.entry += f' ```\n' - relation_count_dict[self.value] = relation_count - - def create_entry(self): + # for relation in related_clusters: + # relation_count += 1 + # self.entry += f' {relation[0].uuid}[{relation[0].value}] --- {relation[1].uuid}[{relation[1].value}]\n' + # self.entry += f' ```\n' + # relation_count_dict[self.value] = relation_count + + def _get_related_entry(self, relations): + output = "" + output += f'## Related clusters for {self.value}\n' + output += f'\n' + return output + + def create_entry(self, cluster_dict): self._create_title_entry() self._create_description_entry() self._create_synonyms_entry() self._create_uuid_entry() self._create_refs_entry() self._create_associated_metadata_entry() - self._create_related_entry() + # self._create_related_entry(cluster_dict) + if self.related_list: + self._write_relations(cluster_dict, SITE_PATH) return self.entry - -galaxies = [] -for galaxy in galaxies_fnames: - with open(os.path.join(CLUSTER_PATH, galaxy)) as fr: - galaxie_json = json.load(fr) - galaxies.append(Galaxy(galaxie_json['values'], galaxie_json['authors'], galaxie_json['description'], galaxie_json['name'], galaxy.split('.')[0])) - -cluster_dict = {} -for galaxy in galaxies: - for cluster in galaxy.clusters: - cluster_dict[cluster.uuid] = cluster - -def create_index(intro, contributing, galaxies): - index_output = intro + + def _write_relations(self, cluster_dict, path): + related_clusters = self.get_related_clusters(cluster_dict) + global relation_count_dict + relation_count_dict[self.value] = len(related_clusters) + galaxy_path = os.path.join(path, self.galaxie) + if not os.path.exists(galaxy_path): + os.mkdir(galaxy_path) + with open(os.path.join(galaxy_path, f'{self.uuid}.md'), "w") as index: + index.write(self._get_related_entry(related_clusters)) + +def create_index(galaxies): + index_output = INTRO + index_output += STATISTICS for galaxie in galaxies: index_output += f'- [{galaxie.name}](./{galaxie.json_file_name}/index.md)\n' - index_output += contributing + index_output += CONTRIBUTING return index_output -def create_galaxies(galaxies): +def create_galaxies(galaxies, cluster_dict): galaxy_output = {} for galaxie in galaxies: - galaxy_output[galaxie.json_file_name] = galaxie.create_entry() + galaxy_output[galaxie.json_file_name] = galaxie.create_entry(cluster_dict) return galaxy_output -if __name__ == "__main__": - index_output = create_index(intro, contributing, galaxies) - galaxy_output = create_galaxies(galaxies) +def create_xy_chart(title, width, height, x_axis, y_axis, bar): + output = "" + output += f'```mermaid\n' + output += f'---\n' + output += f'config:\n' + output += f' xyChart:\n' + output += f' width: {width}\n' + output += f' height: {height}\n' + output += f'---\n' + output += f'xychart-beta\n' + output += f' title "{title}"\n' + output += f' x-axis [{x_axis}]\n' + output += f' y-axis "{y_axis}"\n' + output += f' bar {bar}\n' + output += f'```\n' + output += f'\n' + return output - if not os.path.exists(SITE_PATH): - os.mkdir(SITE_PATH) - with open(os.path.join(SITE_PATH, 'index.md'), "w") as index: - index.write(index_output) +def create_statistics(): + statistic_output = "" + + statistic_output += f'# Cluster statistics\n' + statistic_output += f'## Number of clusters\n' + statistic_output += f'```mermaid\n' + statistic_output += f"pie showData\n" + statistic_output += f' title Number of clusters\n' + statistic_output += f' "Public clusters" : {len(public_clusters_dict)}\n' + statistic_output += f' "Private clusters" : {len(private_clusters)}\n' + statistic_output += f'```\n' + statistic_output += f'\n' + + statistic_output += f'## Galaxies with the most clusters\n' + galaxy_counts = {} + for galaxy in public_clusters_dict.values(): + galaxy_counts[galaxy] = galaxy_counts.get(galaxy, 0) + 1 + sorted_galaxies = sorted(galaxy_counts, key=galaxy_counts.get, reverse=True)[:15] + top_15_galaxies = [re.sub(r"[^A-Za-z0-9 ]", "", galaxy) for galaxy in sorted_galaxies] + top_15_galaxies = ", ".join(top_15_galaxies) + top_15_galaxies_values = sorted(galaxy_counts.values(), reverse=True)[:15] + statistic_output += create_xy_chart("Galaxies with the most clusters", 1800, 500, top_15_galaxies, "Number of clusters", top_15_galaxies_values) + + statistic_output += f'## Galaxies with the least clusters\n' + sorted_galaxies = sorted(galaxy_counts, key=galaxy_counts.get)[:15] + top_15_galaxies = [re.sub(r"[^A-Za-z0-9 ]", "", galaxy) for galaxy in sorted_galaxies] + top_15_galaxies = ", ".join(top_15_galaxies) + top_15_galaxies_values = sorted(galaxy_counts.values())[:15] + statistic_output += create_xy_chart("Galaxies with the least clusters", 1800, 500, top_15_galaxies, "Number of clusters", top_15_galaxies_values) + + galaxy_number = 0 + for galaxy in public_clusters_dict.values(): + galaxy_number += 1 + statistic_output += f'**Average number of clusters per galaxy**: {len(public_clusters_dict) / galaxy_number}\n' + + statistic_output += f'# Relation statistics\n' + statistic_output += f'## Number of relations\n' + statistic_output += f'```mermaid\n' + statistic_output += f"pie showData\n" + statistic_output += f' title Number of relations\n' + statistic_output += f' "Public relations" : {public_relations_count}\n' + statistic_output += f' "Private relations" : {private_relations_count}\n' + statistic_output += f'```\n' + statistic_output += f'\n' + + statistic_output += f'**Average number of relations per cluster**: {sum(relation_count_dict.values()) / len(relation_count_dict)}\n' + + statistic_output += f'## Cluster with the most relations\n' + sorted_relation = sorted(relation_count_dict.items(), key=operator.itemgetter(1), reverse=True)[:15] + top_15_relation = [re.sub(r"[^A-Za-z0-9 ]", "", key) for key, value in sorted_relation] + top_15_relation = ", ".join(top_15_relation) + top_15_relation_values = sorted(relation_count_dict.values(), reverse=True)[:15] + statistic_output += create_xy_chart("Cluster with the most relations", 2000, 500, top_15_relation, "Number of relations", top_15_relation_values) + + statistic_output += f'## Cluster with the least relations\n' + sorted_relation = sorted(relation_count_dict.items(), key=operator.itemgetter(1))[:15] + top_15_relation = [re.sub(r"[^A-Za-z0-9 ]", "", key) for key, value in sorted_relation] + top_15_relation = ", ".join(top_15_relation) + top_15_relation_values = sorted(relation_count_dict.values())[:15] + statistic_output += create_xy_chart("Cluster with the least relations", 2000, 500, top_15_relation, "Number of relations", top_15_relation_values) + + statistic_output += f'# Synonyms statistics\n' + statistic_output += f'## Cluster with the most synonyms\n' + sorted_synonyms = sorted(synonyms_count_dict.items(), key=operator.itemgetter(1), reverse=True)[:15] + top_15_synonyms = [re.sub(r"[^A-Za-z0-9 ]", "", key) for key, value in sorted_synonyms] + top_15_synonyms = ", ".join(top_15_synonyms) + top_15_synonyms_values = sorted(synonyms_count_dict.values(), reverse=True)[:15] + statistic_output += create_xy_chart("Cluster with the most synonyms", 1800, 500, top_15_synonyms, "Number of synonyms", top_15_synonyms_values) + + statistic_output += f'## Cluster with the least synonyms\n' + sorted_synonyms = sorted(synonyms_count_dict.items(), key=operator.itemgetter(1))[:15] + top_15_synonyms = [re.sub(r"[^A-Za-z0-9 ]", "", key) for key, value in sorted_synonyms] + top_15_synonyms = ", ".join(top_15_synonyms) + top_15_synonyms_values = sorted(synonyms_count_dict.values())[:15] + statistic_output += create_xy_chart("Cluster with the least synonyms", 1800, 500, top_15_synonyms, "Number of synonyms", top_15_synonyms_values) + + statistic_output += f'# Empty UUIDs statistics\n' + statistic_output += f'**Number of empty UUIDs**: {sum(empty_uuids_dict.values())}\n' + statistic_output += f'\n' + statistic_output += f'**Empty UUIDs per cluster**: {empty_uuids_dict}\n' - for f in galaxies_fnames: - cluster_filename = f.split('.')[0] - pathSiteCluster = os.path.join(SITE_PATH, cluster_filename) - if not os.path.exists(pathSiteCluster): - os.mkdir(pathSiteCluster) - with open(os.path.join(pathSiteCluster, 'index.md'), "w") as index: - index.write(galaxy_output[cluster_filename]) print(f"Public relations: {public_relations_count}") print(f"Private relations: {private_relations_count}") print(f"Total relations: {public_relations_count + private_relations_count}") print(f"Percetage of private relations: {private_relations_count / (public_relations_count + private_relations_count) * 100}%") print(f"Private clusters: {len(private_clusters)}") - print(f"Public clusters: {len(public_clusters)}") - print(f"Total clusters: {len(private_clusters) + len(public_clusters)}") - print(f"Percentage of private clusters: {len(private_clusters) / (len(private_clusters) + len(public_clusters)) * 100}%") + print(f"Public clusters: {len(public_clusters_dict)}") + print(f"Total clusters: {len(private_clusters) + len(public_clusters_dict)}") + print(f"Percentage of private clusters: {len(private_clusters) / (len(private_clusters) + len(public_clusters_dict)) * 100}%") print(f"Average number of relations per cluster: {sum(relation_count_dict.values()) / len(relation_count_dict)}") print(f"Max number of relations per cluster: {max(relation_count_dict.values())} from {max(relation_count_dict, key=relation_count_dict.get)}") print(f"Min number of relations per cluster: {min(relation_count_dict.values())} from {min(relation_count_dict, key=relation_count_dict.get)}") @@ -331,6 +426,64 @@ if __name__ == "__main__": print(f"Min number of synonyms per cluster: {min(synonyms_count_dict.values())} from {min(synonyms_count_dict, key=synonyms_count_dict.get)}") print(f"Number of empty UUIDs: {sum(empty_uuids_dict.values())}") print(f"Empty UUIDs per cluster: {empty_uuids_dict}") + print(sorted(relation_count_dict.items(), key=operator.itemgetter(1), reverse=True)[:30]) + lol = [re.sub(r"[^A-Za-z0-9 ]", "", key) for key, value in sorted_relation] + print(", ".join(lol)) + + + return statistic_output + +def main(): + galaxies_fnames = [] + for f in os.listdir(CLUSTER_PATH): + if '.json' in f and f not in FILES_TO_IGNORE: + galaxies_fnames.append(f) + + galaxies_fnames.sort() + galaxy_output = {} + + galaxies = [] + for galaxy in galaxies_fnames: + with open(os.path.join(CLUSTER_PATH, galaxy)) as fr: + galaxie_json = json.load(fr) + galaxies.append(Galaxy(galaxie_json['values'], galaxie_json['authors'], galaxie_json['description'], galaxie_json['name'], galaxy.split('.')[0])) + + cluster_dict = {} + for galaxy in galaxies: + for cluster in galaxy.clusters: + cluster_dict[cluster.uuid] = cluster + + # Write files + if not os.path.exists(SITE_PATH): + os.mkdir(SITE_PATH) + + for galaxy in galaxies: + galaxy.write_entry(SITE_PATH, cluster_dict) + + index_output = create_index(galaxies) + # galaxy_output = create_galaxies(galaxies, cluster_dict) + statistic_output = create_statistics() + + with open(os.path.join(SITE_PATH, 'index.md'), "w") as index: + index.write(index_output) + + with open(os.path.join(SITE_PATH, 'statistics.md'), "w") as index: + index.write(statistic_output) + + # for f in galaxies_fnames: + # cluster_filename = f.split('.')[0] + # pathSiteCluster = os.path.join(SITE_PATH, cluster_filename) + # if not os.path.exists(pathSiteCluster): + # os.mkdir(pathSiteCluster) + # with open(os.path.join(pathSiteCluster, 'index.md'), "w") as index: + # index.write(galaxy_output[cluster_filename]) + + + # for cluster in cluster_dict.values(): + # print(cluster.uuid) + +if __name__ == "__main__": + main() # test = cluster_dict['f0ec2df5-2e38-4df3-970d-525352006f2e'] # test = cluster_dict['d7247cf9-13b6-4781-b789-a5f33521633b']