64 lines
2.0 KiB
Python
64 lines
2.0 KiB
Python
import json
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from pathlib import Path
|
|
from pytaxonomies import Entry, Predicate, Taxonomy
|
|
|
|
CONTENT_URL = 'https://www.wada-ama.org/en/prohibited-list'
|
|
|
|
TAXONOMY_DESCRIPTION = 'This taxonomy aims to list doping substances'
|
|
TAXONOMY_EXPANDED = 'Doping substances'
|
|
TAXONOMY_NAME = 'doping-substances'
|
|
|
|
ignore = ('NON-APPROVED SUBSTANCES', )
|
|
|
|
|
|
def list_predicates(articles):
|
|
predicates = {}
|
|
for article in articles:
|
|
title = article.find('p', attrs={'class': 'h3 panel-title'}).text
|
|
if title in ignore:
|
|
continue
|
|
predicate = Predicate()
|
|
predicate.predicate = title
|
|
div = article.find('div', attrs={'class': 'layout-wysiwyg'})
|
|
description = div.find('p')
|
|
predicate.description = description.find_next_sibling().text
|
|
predicates[title] = predicate
|
|
return predicates
|
|
|
|
|
|
def generate_taxonomy():
|
|
new_taxonomy = Taxonomy()
|
|
|
|
new_taxonomy.name = TAXONOMY_NAME
|
|
new_taxonomy.expanded = TAXONOMY_EXPANDED
|
|
new_taxonomy.description = TAXONOMY_DESCRIPTION
|
|
|
|
response = requests.get(CONTENT_URL)
|
|
soup = BeautifulSoup(response.text, 'html.parser')
|
|
articles = soup.findAll('article', attrs={'class': 'panel hide-reader'})
|
|
|
|
new_taxonomy.predicates = list_predicates(articles)
|
|
|
|
for article in articles:
|
|
title = article.find('p', attrs={'class': 'h3 panel-title'}).text
|
|
if title in ignore:
|
|
continue
|
|
products = article.findAll('li')
|
|
products_list = {}
|
|
for product in products:
|
|
entry = Entry()
|
|
entry.value = product.text
|
|
products_list[entry.value] = entry
|
|
new_taxonomy.predicates[title].entries = products_list
|
|
|
|
return new_taxonomy
|
|
|
|
|
|
if __name__ == '__main__':
|
|
taxonomy = generate_taxonomy()
|
|
taxonomy.version = 2
|
|
with open(Path(__file__).resolve().parent / 'machinetag.json', 'wt', encoding='utf-8') as f:
|
|
json.dump(taxonomy.to_dict(), f, indent=2, ensure_ascii=False)
|