Add [generator] statistics

pull/926/head
niclas 2024-02-01 11:05:45 +01:00
parent 590554cb0f
commit 29b39f55d7
1 changed files with 235 additions and 82 deletions

View File

@ -1,39 +1,29 @@
#!/usr/bin/python
import json
import operator
import os
import re
from typing import List
import validators
CLUSTER_PATH = '../../clusters'
SITE_PATH = './site/docs'
PROJECTS_PATH = './site/projects'
# PROJECTS_PATH = './site/projects'
galaxies_fnames = []
FILES_TO_IGNORE = [] # if you want to skip a specific cluster in the generation
for f in os.listdir(CLUSTER_PATH):
if '.json' in f and f not in FILES_TO_IGNORE:
galaxies_fnames.append(f)
galaxies_fnames.sort()
galaxy_output = {}
# Variables for statistics
public_relations_count = 0
private_relations_count = 0
private_clusters = []
public_clusters = []
public_clusters_dict = {}
relation_count_dict = {}
synonyms_count_dict = {}
empty_uuids_dict = {}
intro = """
INTRO = """
# MISP Galaxy
The MISP galaxy offers a streamlined approach for representing large entities, known as clusters, which can be linked to MISP events or attributes. Each cluster consists of one or more elements, represented as key-value pairs. MISP galaxy comes with a default knowledge base, encompassing areas like Threat Actors, Tools, Ransomware, and ATT&CK matrices. However, users have the flexibility to modify, update, replace, or share these elements according to their needs.
@ -51,7 +41,15 @@ Clusters serve as an open and freely accessible knowledge base, which can be uti
## Publicly available clusters
"""
contributing = """
STATISTICS= """
## Statistics
You can find some statistics about MISP galaxies [here](./statistics.md).
"""
CONTRIBUTING = """
# Contributing
@ -95,6 +93,7 @@ class Galaxy():
self.entry += f' |{author}|\n'
def _create_clusters(self):
# global public_clusters_dict
clusters = []
for cluster in self.cluster_list:
clusters.append(Cluster(
@ -104,21 +103,29 @@ class Galaxy():
date=cluster.get('date', None),
related_list=cluster.get('related', None),
meta=cluster.get('meta', None),
galaxie=self.name
galaxie=self.json_file_name
))
return clusters
def _create_clusters_entry(self):
def _create_clusters_entry(self, cluster_dict):
for cluster in self.clusters:
self.entry += cluster.create_entry()
self.entry += cluster.create_entry(cluster_dict)
def create_entry(self):
def create_entry(self, cluster_dict):
self._create_metadata_entry()
self._create_title_entry()
self._create_description_entry()
self._create_authors_entry()
self._create_clusters_entry()
self._create_clusters_entry(cluster_dict)
return self.entry
def write_entry(self, path, cluster_dict):
self.create_entry(cluster_dict)
galaxy_path = os.path.join(path, self.json_file_name)
if not os.path.exists(galaxy_path):
os.mkdir(galaxy_path)
with open(os.path.join(galaxy_path, 'index.md'), "w") as index:
index.write(self.entry)
class Cluster():
def __init__(self, description, uuid, date, value, related_list, meta, galaxie):
@ -130,6 +137,10 @@ class Cluster():
self.meta = meta
self.entry = ""
self.galaxie = galaxie
global public_clusters_dict
if self.galaxie:
public_clusters_dict[self.uuid] = self.galaxie
def _create_title_entry(self):
self.entry += f'## {self.value}\n'
@ -189,10 +200,9 @@ class Cluster():
self.entry += f' | {meta} | {self.meta[meta]} |\n'
def get_related_clusters(self, depth=-1, visited=None):
def get_related_clusters(self, cluster_dict, depth=-1, visited=None):
global public_relations_count
global private_relations_count
global public_clusters
global private_clusters
global empty_uuids_dict
empty_uuids = 0
@ -224,14 +234,12 @@ class Cluster():
related_cluster = cluster_dict[dest_uuid]
public_relations_count += 1
if dest_uuid not in public_clusters:
public_clusters.append(dest_uuid)
related_clusters.append((self, related_cluster))
if (depth > 1 or depth == -1) and related_cluster.uuid not in visited:
new_depth = depth - 1 if depth > 1 else -1
related_clusters += related_cluster.get_related_clusters(depth=new_depth, visited=visited)
related_clusters += related_cluster.get_related_clusters(depth=new_depth, visited=visited, cluster_dict=cluster_dict)
if empty_uuids > 0:
empty_uuids_dict[self.value] = empty_uuids
@ -242,87 +250,174 @@ class Cluster():
return related_clusters
def _create_related_entry(self):
if self.related_list and cluster_dict:
related_clusters = self.get_related_clusters()
self.entry += f'\n'
self.entry += f'??? info "Related clusters"\n'
self.entry += f'\n'
self.entry += f' ```mermaid\n'
self.entry += f' graph TD\n'
# def _create_related_entry(self, cluster_dict):
# if self.related_list and cluster_dict:
# related_clusters = self.get_related_clusters(cluster_dict)
# self.entry += f'\n'
# self.entry += f'??? info "Related clusters"\n'
# self.entry += f'\n'
# self.entry += f' ```mermaid\n'
# self.entry += f' graph TD\n'
global relation_count_dict
relation_count = 0
# global relation_count_dict
# relation_count = 0
for relation in related_clusters:
relation_count += 1
# print(self.value)
# print(relation)
# print(relation[0].value)
# print(relation[1].value)
self.entry += f' {relation[0].uuid}[{relation[0].value}] --- {relation[1].uuid}[{relation[1].value}]\n'
self.entry += f' ```\n'
relation_count_dict[self.value] = relation_count
def create_entry(self):
# for relation in related_clusters:
# relation_count += 1
# self.entry += f' {relation[0].uuid}[{relation[0].value}] --- {relation[1].uuid}[{relation[1].value}]\n'
# self.entry += f' ```\n'
# relation_count_dict[self.value] = relation_count
def _get_related_entry(self, relations):
output = ""
output += f'## Related clusters for {self.value}\n'
output += f'\n'
return output
def create_entry(self, cluster_dict):
self._create_title_entry()
self._create_description_entry()
self._create_synonyms_entry()
self._create_uuid_entry()
self._create_refs_entry()
self._create_associated_metadata_entry()
self._create_related_entry()
# self._create_related_entry(cluster_dict)
if self.related_list:
self._write_relations(cluster_dict, SITE_PATH)
return self.entry
galaxies = []
for galaxy in galaxies_fnames:
with open(os.path.join(CLUSTER_PATH, galaxy)) as fr:
galaxie_json = json.load(fr)
galaxies.append(Galaxy(galaxie_json['values'], galaxie_json['authors'], galaxie_json['description'], galaxie_json['name'], galaxy.split('.')[0]))
cluster_dict = {}
for galaxy in galaxies:
for cluster in galaxy.clusters:
cluster_dict[cluster.uuid] = cluster
def create_index(intro, contributing, galaxies):
index_output = intro
def _write_relations(self, cluster_dict, path):
related_clusters = self.get_related_clusters(cluster_dict)
global relation_count_dict
relation_count_dict[self.value] = len(related_clusters)
galaxy_path = os.path.join(path, self.galaxie)
if not os.path.exists(galaxy_path):
os.mkdir(galaxy_path)
with open(os.path.join(galaxy_path, f'{self.uuid}.md'), "w") as index:
index.write(self._get_related_entry(related_clusters))
def create_index(galaxies):
index_output = INTRO
index_output += STATISTICS
for galaxie in galaxies:
index_output += f'- [{galaxie.name}](./{galaxie.json_file_name}/index.md)\n'
index_output += contributing
index_output += CONTRIBUTING
return index_output
def create_galaxies(galaxies):
def create_galaxies(galaxies, cluster_dict):
galaxy_output = {}
for galaxie in galaxies:
galaxy_output[galaxie.json_file_name] = galaxie.create_entry()
galaxy_output[galaxie.json_file_name] = galaxie.create_entry(cluster_dict)
return galaxy_output
if __name__ == "__main__":
index_output = create_index(intro, contributing, galaxies)
galaxy_output = create_galaxies(galaxies)
def create_xy_chart(title, width, height, x_axis, y_axis, bar):
output = ""
output += f'```mermaid\n'
output += f'---\n'
output += f'config:\n'
output += f' xyChart:\n'
output += f' width: {width}\n'
output += f' height: {height}\n'
output += f'---\n'
output += f'xychart-beta\n'
output += f' title "{title}"\n'
output += f' x-axis [{x_axis}]\n'
output += f' y-axis "{y_axis}"\n'
output += f' bar {bar}\n'
output += f'```\n'
output += f'\n'
return output
if not os.path.exists(SITE_PATH):
os.mkdir(SITE_PATH)
with open(os.path.join(SITE_PATH, 'index.md'), "w") as index:
index.write(index_output)
def create_statistics():
statistic_output = ""
statistic_output += f'# Cluster statistics\n'
statistic_output += f'## Number of clusters\n'
statistic_output += f'```mermaid\n'
statistic_output += f"pie showData\n"
statistic_output += f' title Number of clusters\n'
statistic_output += f' "Public clusters" : {len(public_clusters_dict)}\n'
statistic_output += f' "Private clusters" : {len(private_clusters)}\n'
statistic_output += f'```\n'
statistic_output += f'\n'
statistic_output += f'## Galaxies with the most clusters\n'
galaxy_counts = {}
for galaxy in public_clusters_dict.values():
galaxy_counts[galaxy] = galaxy_counts.get(galaxy, 0) + 1
sorted_galaxies = sorted(galaxy_counts, key=galaxy_counts.get, reverse=True)[:15]
top_15_galaxies = [re.sub(r"[^A-Za-z0-9 ]", "", galaxy) for galaxy in sorted_galaxies]
top_15_galaxies = ", ".join(top_15_galaxies)
top_15_galaxies_values = sorted(galaxy_counts.values(), reverse=True)[:15]
statistic_output += create_xy_chart("Galaxies with the most clusters", 1800, 500, top_15_galaxies, "Number of clusters", top_15_galaxies_values)
statistic_output += f'## Galaxies with the least clusters\n'
sorted_galaxies = sorted(galaxy_counts, key=galaxy_counts.get)[:15]
top_15_galaxies = [re.sub(r"[^A-Za-z0-9 ]", "", galaxy) for galaxy in sorted_galaxies]
top_15_galaxies = ", ".join(top_15_galaxies)
top_15_galaxies_values = sorted(galaxy_counts.values())[:15]
statistic_output += create_xy_chart("Galaxies with the least clusters", 1800, 500, top_15_galaxies, "Number of clusters", top_15_galaxies_values)
galaxy_number = 0
for galaxy in public_clusters_dict.values():
galaxy_number += 1
statistic_output += f'**Average number of clusters per galaxy**: {len(public_clusters_dict) / galaxy_number}\n'
statistic_output += f'# Relation statistics\n'
statistic_output += f'## Number of relations\n'
statistic_output += f'```mermaid\n'
statistic_output += f"pie showData\n"
statistic_output += f' title Number of relations\n'
statistic_output += f' "Public relations" : {public_relations_count}\n'
statistic_output += f' "Private relations" : {private_relations_count}\n'
statistic_output += f'```\n'
statistic_output += f'\n'
statistic_output += f'**Average number of relations per cluster**: {sum(relation_count_dict.values()) / len(relation_count_dict)}\n'
statistic_output += f'## Cluster with the most relations\n'
sorted_relation = sorted(relation_count_dict.items(), key=operator.itemgetter(1), reverse=True)[:15]
top_15_relation = [re.sub(r"[^A-Za-z0-9 ]", "", key) for key, value in sorted_relation]
top_15_relation = ", ".join(top_15_relation)
top_15_relation_values = sorted(relation_count_dict.values(), reverse=True)[:15]
statistic_output += create_xy_chart("Cluster with the most relations", 2000, 500, top_15_relation, "Number of relations", top_15_relation_values)
statistic_output += f'## Cluster with the least relations\n'
sorted_relation = sorted(relation_count_dict.items(), key=operator.itemgetter(1))[:15]
top_15_relation = [re.sub(r"[^A-Za-z0-9 ]", "", key) for key, value in sorted_relation]
top_15_relation = ", ".join(top_15_relation)
top_15_relation_values = sorted(relation_count_dict.values())[:15]
statistic_output += create_xy_chart("Cluster with the least relations", 2000, 500, top_15_relation, "Number of relations", top_15_relation_values)
statistic_output += f'# Synonyms statistics\n'
statistic_output += f'## Cluster with the most synonyms\n'
sorted_synonyms = sorted(synonyms_count_dict.items(), key=operator.itemgetter(1), reverse=True)[:15]
top_15_synonyms = [re.sub(r"[^A-Za-z0-9 ]", "", key) for key, value in sorted_synonyms]
top_15_synonyms = ", ".join(top_15_synonyms)
top_15_synonyms_values = sorted(synonyms_count_dict.values(), reverse=True)[:15]
statistic_output += create_xy_chart("Cluster with the most synonyms", 1800, 500, top_15_synonyms, "Number of synonyms", top_15_synonyms_values)
statistic_output += f'## Cluster with the least synonyms\n'
sorted_synonyms = sorted(synonyms_count_dict.items(), key=operator.itemgetter(1))[:15]
top_15_synonyms = [re.sub(r"[^A-Za-z0-9 ]", "", key) for key, value in sorted_synonyms]
top_15_synonyms = ", ".join(top_15_synonyms)
top_15_synonyms_values = sorted(synonyms_count_dict.values())[:15]
statistic_output += create_xy_chart("Cluster with the least synonyms", 1800, 500, top_15_synonyms, "Number of synonyms", top_15_synonyms_values)
statistic_output += f'# Empty UUIDs statistics\n'
statistic_output += f'**Number of empty UUIDs**: {sum(empty_uuids_dict.values())}\n'
statistic_output += f'\n'
statistic_output += f'**Empty UUIDs per cluster**: {empty_uuids_dict}\n'
for f in galaxies_fnames:
cluster_filename = f.split('.')[0]
pathSiteCluster = os.path.join(SITE_PATH, cluster_filename)
if not os.path.exists(pathSiteCluster):
os.mkdir(pathSiteCluster)
with open(os.path.join(pathSiteCluster, 'index.md'), "w") as index:
index.write(galaxy_output[cluster_filename])
print(f"Public relations: {public_relations_count}")
print(f"Private relations: {private_relations_count}")
print(f"Total relations: {public_relations_count + private_relations_count}")
print(f"Percetage of private relations: {private_relations_count / (public_relations_count + private_relations_count) * 100}%")
print(f"Private clusters: {len(private_clusters)}")
print(f"Public clusters: {len(public_clusters)}")
print(f"Total clusters: {len(private_clusters) + len(public_clusters)}")
print(f"Percentage of private clusters: {len(private_clusters) / (len(private_clusters) + len(public_clusters)) * 100}%")
print(f"Public clusters: {len(public_clusters_dict)}")
print(f"Total clusters: {len(private_clusters) + len(public_clusters_dict)}")
print(f"Percentage of private clusters: {len(private_clusters) / (len(private_clusters) + len(public_clusters_dict)) * 100}%")
print(f"Average number of relations per cluster: {sum(relation_count_dict.values()) / len(relation_count_dict)}")
print(f"Max number of relations per cluster: {max(relation_count_dict.values())} from {max(relation_count_dict, key=relation_count_dict.get)}")
print(f"Min number of relations per cluster: {min(relation_count_dict.values())} from {min(relation_count_dict, key=relation_count_dict.get)}")
@ -331,6 +426,64 @@ if __name__ == "__main__":
print(f"Min number of synonyms per cluster: {min(synonyms_count_dict.values())} from {min(synonyms_count_dict, key=synonyms_count_dict.get)}")
print(f"Number of empty UUIDs: {sum(empty_uuids_dict.values())}")
print(f"Empty UUIDs per cluster: {empty_uuids_dict}")
print(sorted(relation_count_dict.items(), key=operator.itemgetter(1), reverse=True)[:30])
lol = [re.sub(r"[^A-Za-z0-9 ]", "", key) for key, value in sorted_relation]
print(", ".join(lol))
return statistic_output
def main():
galaxies_fnames = []
for f in os.listdir(CLUSTER_PATH):
if '.json' in f and f not in FILES_TO_IGNORE:
galaxies_fnames.append(f)
galaxies_fnames.sort()
galaxy_output = {}
galaxies = []
for galaxy in galaxies_fnames:
with open(os.path.join(CLUSTER_PATH, galaxy)) as fr:
galaxie_json = json.load(fr)
galaxies.append(Galaxy(galaxie_json['values'], galaxie_json['authors'], galaxie_json['description'], galaxie_json['name'], galaxy.split('.')[0]))
cluster_dict = {}
for galaxy in galaxies:
for cluster in galaxy.clusters:
cluster_dict[cluster.uuid] = cluster
# Write files
if not os.path.exists(SITE_PATH):
os.mkdir(SITE_PATH)
for galaxy in galaxies:
galaxy.write_entry(SITE_PATH, cluster_dict)
index_output = create_index(galaxies)
# galaxy_output = create_galaxies(galaxies, cluster_dict)
statistic_output = create_statistics()
with open(os.path.join(SITE_PATH, 'index.md'), "w") as index:
index.write(index_output)
with open(os.path.join(SITE_PATH, 'statistics.md'), "w") as index:
index.write(statistic_output)
# for f in galaxies_fnames:
# cluster_filename = f.split('.')[0]
# pathSiteCluster = os.path.join(SITE_PATH, cluster_filename)
# if not os.path.exists(pathSiteCluster):
# os.mkdir(pathSiteCluster)
# with open(os.path.join(pathSiteCluster, 'index.md'), "w") as index:
# index.write(galaxy_output[cluster_filename])
# for cluster in cluster_dict.values():
# print(cluster.uuid)
if __name__ == "__main__":
main()
# test = cluster_dict['f0ec2df5-2e38-4df3-970d-525352006f2e']
# test = cluster_dict['d7247cf9-13b6-4781-b789-a5f33521633b']