# Mirror of https://github.com/MISP/misp-modules
# 64 lines, 1.6 KiB, Python, executable file
import json
|
|
import requests
|
|
from markdownify import markdownify
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Generic error payload. Nothing in the visible code reads it -- presumably
# kept for the usual MISP module layout (confirm before removing).
misperrors = {
    'error': 'Error',
}

# Attribute types of this module: returned as-is by introspection(), and
# the 'output' list is reused by handler() as the 'types' of its result.
mispattributes = {
    'input': ['url'],
    'output': ['text'],
}
|
|
# Static, human-readable metadata about this module; version() returns this
# dict unchanged. Long strings are wrapped with implicit concatenation only,
# so their runtime values are exactly the original ones.
moduleinfo = {
    'version': '0.1',
    'author': 'Sami Mokaddem',
    'description': (
        'Expansion module to fetch the html content from an url '
        'and convert it into markdown.'
    ),
    'module-type': ['expansion'],
    'name': 'HTML to Markdown',
    'logo': '',
    'requirements': ['The markdownify python library'],
    'features': (
        'The module take an URL as input and the HTML content is fetched '
        'from it. This content is then converted into markdown that is '
        'returned as text.'
    ),
    'references': [],
    'input': 'URL attribute.',
    'output': 'Markdown content converted from the HTML fetched from the url.',
}
|
|
|
|
|
|
def fetchHTML(url, timeout=30):
    """Download the page at *url* and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds ``requests`` waits for the server before giving up.
            The default keeps existing single-argument callers working.

    Returns:
        The response body as decoded by ``requests`` (``Response.text``).

    Raises:
        requests.exceptions.RequestException: if the request fails or the
            timeout expires.
    """
    # requests never times out unless told to, so a silent host would
    # otherwise block this module indefinitely.
    r = requests.get(url, timeout=timeout)
    return r.text
|
|
|
|
|
|
def stripUselessTags(html):
    """Return *html* with a fixed set of non-content elements removed.

    The markup is parsed with BeautifulSoup's 'html.parser' backend; every
    script, head, header, footer, meta and link element is decomposed and
    the remaining document is serialised back to a string.
    """
    document = BeautifulSoup(html, 'html.parser')
    unwanted = ['script', 'head', 'header', 'footer', 'meta', 'link']
    for element in document.find_all(unwanted):
        element.decompose()
    return str(document)
|
|
|
|
|
|
def convertHTML(html):
    """Convert *html* to markdown with markdownify.

    Headings are requested in the 'ATX' style, and the 'a' and 'img' tags
    are handed to markdownify through its ``strip`` option.
    """
    options = {
        'heading_style': 'ATX',
        'strip': ['a', 'img'],
    }
    return markdownify(html, **options)
|
|
|
|
|
|
def handler(q=False):
    """Turn the URL in a JSON request into the markdown of its page.

    Args:
        q: JSON-encoded request string; only its 'url' key is read.
            ``False`` (the default) means no request was supplied.

    Returns:
        ``False`` when ``q`` is ``False`` or the request has no non-empty
        'url'. Otherwise a dict with a single 'results' entry whose 'types'
        is ``mispattributes['output']`` and whose 'values' holds the
        markdown string.
    """
    if q is False:
        return False

    url = json.loads(q).get('url')
    if not url:
        return False

    # Pipeline: download -> drop non-content tags -> convert to markdown.
    markdown = convertHTML(stripUselessTags(fetchHTML(url)))

    return {
        'results': [
            {
                'types': mispattributes['output'],
                'values': [str(markdown)],
            },
        ],
    }
|
|
|
|
|
|
def introspection():
    """Return the module-level ``mispattributes`` dict (input/output types)."""
    return mispattributes
|
|
|
|
|
|
def version():
    """Return the module-level ``moduleinfo`` metadata dict."""
    return moduleinfo
|