misp-taxonomies/doping-substances/gen_taxonomy.py

64 lines
2.0 KiB
Python
Raw Permalink Normal View History

import json
import requests
from bs4 import BeautifulSoup
from pathlib import Path
from pytaxonomies import Entry, Predicate, Taxonomy
CONTENT_URL = 'https://www.wada-ama.org/en/prohibited-list'
TAXONOMY_DESCRIPTION = 'This taxonomy aims to list doping substances'
TAXONOMY_EXPANDED = 'Doping substances'
TAXONOMY_NAME = 'doping-substances'
ignore = ('NON-APPROVED SUBSTANCES', )
def list_predicates(articles):
predicates = {}
for article in articles:
title = article.find('p', attrs={'class': 'h3 panel-title'}).text
if title in ignore:
continue
predicate = Predicate()
predicate.predicate = title
div = article.find('div', attrs={'class': 'layout-wysiwyg'})
description = div.find('p')
predicate.description = description.find_next_sibling().text
predicates[title] = predicate
return predicates
def generate_taxonomy():
new_taxonomy = Taxonomy()
new_taxonomy.name = TAXONOMY_NAME
new_taxonomy.expanded = TAXONOMY_EXPANDED
new_taxonomy.description = TAXONOMY_DESCRIPTION
response = requests.get(CONTENT_URL)
soup = BeautifulSoup(response.text, 'html.parser')
articles = soup.findAll('article', attrs={'class': 'panel hide-reader'})
new_taxonomy.predicates = list_predicates(articles)
for article in articles:
title = article.find('p', attrs={'class': 'h3 panel-title'}).text
if title in ignore:
continue
products = article.findAll('li')
products_list = {}
for product in products:
entry = Entry()
entry.value = product.text
products_list[entry.value] = entry
new_taxonomy.predicates[title].entries = products_list
return new_taxonomy
if __name__ == '__main__':
taxonomy = generate_taxonomy()
taxonomy.version = 2
with open(Path(__file__).resolve().parent / 'machinetag.json', 'wt', encoding='utf-8') as f:
json.dump(taxonomy.to_dict(), f, indent=2, ensure_ascii=False)