From 0d26334448ffd8dc945e79568b54a042ae9d8c47 Mon Sep 17 00:00:00 2001
From: niclas
Date: Mon, 11 Mar 2024 16:29:36 +0100
Subject: [PATCH] Add [intel-agencies] build script

---
 tools/WikipediaAPI/main.py             | 147 +++++++++++++++++++++++++
 tools/WikipediaAPI/modules/__init__.py |   0
 tools/WikipediaAPI/modules/api.py      |  90 +++++++++++++++
 tools/WikipediaAPI/modules/intel.py    |  73 ++++++++++++
 4 files changed, 310 insertions(+)
 create mode 100644 tools/WikipediaAPI/main.py
 create mode 100644 tools/WikipediaAPI/modules/__init__.py
 create mode 100644 tools/WikipediaAPI/modules/api.py
 create mode 100644 tools/WikipediaAPI/modules/intel.py

diff --git a/tools/WikipediaAPI/main.py b/tools/WikipediaAPI/main.py
new file mode 100644
index 0000000..ac484fb
--- /dev/null
+++ b/tools/WikipediaAPI/main.py
@@ -0,0 +1,147 @@
+"""Build the `intelligence-agencies` galaxy and cluster from Wikipedia.
+
+Fetches the rendered HTML of the "List of intelligence agencies" page,
+extracts the agency names listed under each country heading and writes the
+galaxy and cluster JSON files. UUIDs of agencies found in an existing
+cluster file are reused; every other agency gets a fresh one.
+"""
+from modules.api import WikipediaAPI
+from modules.intel import IntelAgency, Meta, Galaxy, Cluster
+import os
+import uuid
+import json
+import re
+
+from bs4 import BeautifulSoup
+
+CLUSTER_PATH = '../../clusters'
+GALAXY_PATH = '../../galaxies'
+GALAXY_NAME = 'intelligence-agencies'
+# NOTE(review): generated anew on every run and shared by the galaxy and the
+# cluster below -- confirm whether this identifier is meant to stay stable.
+UUID = str(uuid.uuid4())
+
+
+def get_UUIDs():
+    """Return a ``{agency name: uuid}`` dict read from the existing cluster.
+
+    Returns ``None`` when there is no cluster file for ``GALAXY_NAME`` yet.
+    """
+    # The cluster is saved as GALAXY_NAME + '.json' at the end of this script,
+    # so that is the file to look for (the bare name never matches).
+    cluster_file = os.path.join(CLUSTER_PATH, f'{GALAXY_NAME}.json')
+    if not os.path.isfile(cluster_file):
+        return None
+    with open(cluster_file) as fr:
+        galaxy_json = json.load(fr)
+    # Tolerate a cluster file without a "values" list.
+    return {
+        cluster["value"]: cluster["uuid"]
+        for cluster in galaxy_json.get("values", [])
+    }
+
+
+def get_notes_on_lower_level(content):
+    """Collect the text of the innermost ``li`` items below ``content``.
+
+    An item containing a nested ``ul`` only acts as a grouping entry: its
+    own text is skipped and the items of its first nested list are collected
+    instead, recursively.
+    """
+    notes = []
+    for li in content.find_all('li', recursive=False):
+        nested = li.find('ul')
+        if nested is not None:
+            notes.extend(get_notes_on_lower_level(nested))
+        else:
+            notes.append(li.text)
+    return notes
+
+
+def get_agencies_from_country(heading, current_country, uuids):
+    """Build the ``IntelAgency`` entries listed under a country heading.
+
+    ``heading`` is the country's ``h2`` tag; agency names are read from the
+    first ``ul`` following it. ``uuids`` maps known agency names to their
+    UUIDs (or is ``None``); agencies missing from it get a fresh UUID.
+    """
+    content = heading.find_next('ul')
+    if content is None:
+        # No list after this heading: nothing to extract.
+        return []
+    known = uuids or {}
+    agencies = []
+    for name in get_notes_on_lower_level(content):
+        name = name.strip()
+        if not name:
+            # IntelAgency rejects empty values, so skip blank list items.
+            continue
+        agencies.append(IntelAgency(
+            value=name,
+            uuid=known.get(name) or str(uuid.uuid4()),
+            meta=Meta(country=current_country),
+        ))
+    return agencies
+
+
+def extract_info(content, uuids):
+    """Parse the page HTML and return the agencies of every country section.
+
+    Each ``h2`` with an ``mw-headline`` span whose text is not listed in
+    ``IGNORE`` is treated as a country heading.
+    """
+    IGNORE = ["See also", "References", "External links", "Further reading"]
+    soup = BeautifulSoup(content, 'html.parser')
+    agencies = []
+    for h2 in soup.find_all('h2'):
+        span = h2.find('span', {'class': 'mw-headline'})
+        if span is None:
+            continue
+        current_country = span.text.strip()
+        if current_country in IGNORE:
+            continue
+        agencies.extend(get_agencies_from_country(h2, current_country, uuids))
+    return agencies
+
+
+if __name__ == '__main__':
+    wiki = WikipediaAPI()
+    page_title = 'List of intelligence agencies'
+    content = wiki.get_page_html(page_title)
+    if not content:
+        # get_page_html() has already printed the cause. Stop before the
+        # existing galaxy/cluster files get overwritten with an empty result.
+        raise SystemExit(f'Error: could not fetch "{page_title}"')
+
+    uuids = get_UUIDs()
+    if not uuids:
+        print(f'No UUIDs found for {GALAXY_NAME}')
+    agencies = extract_info(content, uuids)
+
+    # Write to files
+    galaxy = Galaxy(
+        description="List of intelligence agencies",
+        icon="ninja",
+        name="intelligence-agencies",
+        namespace="intelligence-agency",
+        type="intelligence-agency",
+        uuid=UUID,
+        version=1,
+    )
+    galaxy.save_to_file(os.path.join(GALAXY_PATH, f'{GALAXY_NAME}.json'))
+
+    cluster = Cluster(
+        authors="Wikipedia",
+        category="Intelligence Agencies",
+        description="List of intelligence agencies",
+        name="intelligence-agencies",
+        source="https://en.wikipedia.org/wiki/List_of_intelligence_agencies",
+        type="intelligence-agency",
+        uuid=UUID,
+        version=1,
+    )
+    for agency in agencies:
+        cluster.add_value(agency)
+    print(cluster.values)
+    print(cluster.uuid)
+    cluster.save_to_file(os.path.join(CLUSTER_PATH, f'{GALAXY_NAME}.json'))
diff --git a/tools/WikipediaAPI/modules/__init__.py b/tools/WikipediaAPI/modules/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tools/WikipediaAPI/modules/api.py b/tools/WikipediaAPI/modules/api.py
new file mode 100644
index 0000000..b77b64c
--- /dev/null
+++ b/tools/WikipediaAPI/modules/api.py
@@ -0,0 +1,90 @@
+"""Small client for the English Wikipedia MediaWiki action API."""
+import requests
+
+# Failures that make a lookup return None instead of aborting the caller:
+# network/HTTP problems, undecodable JSON, API-reported errors and responses
+# that lack the expected keys.
+_LOOKUP_ERRORS = (
+    requests.RequestException,
+    ValueError,
+    LookupError,
+    TypeError,
+    StopIteration,
+)
+
+
+class WikipediaAPI():
+    """Fetch extracts, page content and rendered HTML of Wikipedia pages.
+
+    Every ``get_page_*`` method prints the error and returns ``None`` when
+    the request fails or the response does not contain the requested data.
+    """
+
+    def __init__(self, timeout=30):
+        self.base_url = 'https://en.wikipedia.org/w/api.php'
+        # requests waits indefinitely unless a timeout (seconds) is given.
+        self.timeout = timeout
+
+    def _get_json(self, params):
+        """Send ``params`` to the API and return the decoded JSON reply."""
+        response = requests.get(
+            self.base_url, params=params, timeout=self.timeout
+        )
+        # Fail on HTTP errors rather than trying to parse an error page.
+        response.raise_for_status()
+        data = response.json()
+        if 'error' in data:
+            # The API reports request-level problems inside the reply body;
+            # surface that message instead of a bare KeyError later on.
+            raise ValueError(f"API error: {data['error']}")
+        return data
+
+    def get_page_summary(self, page_title):
+        """Return the plain-text extract of ``page_title``."""
+        params = {
+            'action': 'query',
+            'format': 'json',
+            'titles': page_title,
+            'prop': 'extracts',
+            'explaintext': True,
+        }
+        try:
+            data = self._get_json(params)
+            page_id = next(iter(data['query']['pages']))
+            return data['query']['pages'][page_id]['extract']
+        except _LOOKUP_ERRORS as e:
+            print(f'Error: {e}')
+            return None
+
+    def get_page_content(self, page_title):
+        """Return the content of the first revision listed for the page."""
+        params = {
+            'action': 'query',
+            'format': 'json',
+            'titles': page_title,
+            'prop': 'revisions',
+            'rvprop': 'content',
+        }
+        try:
+            data = self._get_json(params)
+            page_id = next(iter(data['query']['pages']))
+            return data['query']['pages'][page_id]['revisions'][0]['*']
+        except _LOOKUP_ERRORS as e:
+            print(f'Error: {e}')
+            return None
+
+    def get_page_html(self, page_title):
+        """Return the rendered HTML of ``page_title``."""
+        params = {
+            'action': 'parse',
+            'format': 'json',
+            'page': page_title,
+            'prop': 'text',
+            'disableeditsection': True,
+        }
+        try:
+            data = self._get_json(params)
+            return data['parse']['text']['*']
+        except _LOOKUP_ERRORS as e:
+            print(f'Error: {e}')
+            return None
diff --git a/tools/WikipediaAPI/modules/intel.py b/tools/WikipediaAPI/modules/intel.py
new file mode 100644
index 0000000..f4db5c8
--- /dev/null
+++ b/tools/WikipediaAPI/modules/intel.py
@@ -0,0 +1,73 @@
+"""Dataclasses for the galaxy and cluster JSON documents."""
+from dataclasses import dataclass, field, asdict
+import json
+
+
+@dataclass
+class Meta:
+    """Metadata attached to a single cluster value."""
+    country: str = ""
+
+
+@dataclass
+class IntelAgency:
+    """One cluster value describing an intelligence agency.
+
+    ``value`` and ``uuid`` default to ``None`` but are effectively required:
+    ``__post_init__`` raises ``ValueError`` when either one is empty.
+    """
+    description: str = ""
+    meta: Meta = field(default_factory=Meta)
+    related: list = field(default_factory=list)
+    uuid: str = None
+    value: str = None
+
+    def __post_init__(self):
+        """Reject instances created without a ``value`` or a ``uuid``."""
+        if not self.value:
+            raise ValueError("IntelAgency 'value' cannot be empty.")
+        if not self.uuid:
+            raise ValueError("IntelAgency 'uuid' cannot be empty.")
+
+
+@dataclass
+class Galaxy:
+    """Top-level galaxy description."""
+    description: str
+    icon: str
+    name: str
+    namespace: str
+    type: str
+    uuid: str
+    version: int
+
+    def save_to_file(self, path: str):
+        """Write the galaxy to ``path`` as indented JSON."""
+        with open(path, "w") as file:
+            file.write(json.dumps(asdict(self), indent=4))
+
+
+@dataclass
+class Cluster():
+    """Cluster document: descriptive header plus the list of agencies."""
+    authors: str
+    category: str
+    description: str
+    name: str
+    source: str
+    type: str
+    uuid: str
+    version: int
+    # Declared as a dataclass field on purpose: asdict() serialises declared
+    # fields only, so an attribute merely assigned in a hand-written
+    # __init__ would be missing from the saved JSON.
+    values: list = field(default_factory=list)
+
+    def add_value(self, value: IntelAgency):
+        """Append ``value`` to the cluster's agencies."""
+        self.values.append(value)
+
+    def save_to_file(self, path: str):
+        """Write the cluster, including its values, to ``path`` as JSON."""
+        with open(path, "w") as file:
+            file.write(json.dumps(asdict(self), indent=4))