diff --git a/REQUIREMENTS b/REQUIREMENTS index 9b26d1c..f6362b5 100644 --- a/REQUIREMENTS +++ b/REQUIREMENTS @@ -47,6 +47,7 @@ jsonschema==3.2.0 lief==0.10.1 lxml==4.5.2 maclookup==1.0.3 +markdownify==0.5.3 maxminddb==2.0.2; python_version >= '3.6' multidict==4.7.6; python_version >= '3.5' np==1.0.2 diff --git a/misp_modules/modules/expansion/__init__.py b/misp_modules/modules/expansion/__init__.py index 1b6d2bb..6485140 100644 --- a/misp_modules/modules/expansion/__init__.py +++ b/misp_modules/modules/expansion/__init__.py @@ -18,7 +18,7 @@ __all__ = ['cuckoo_submit', 'vmray_submit', 'bgpranking', 'circl_passivedns', 'c 'virustotal_public', 'apiosintds', 'urlscan', 'securitytrails', 'apivoid', 'assemblyline_submit', 'assemblyline_query', 'ransomcoindb', 'malwarebazaar', 'lastline_query', 'lastline_submit', 'sophoslabs_intelix', 'cytomic_orion', 'censys_enrich', - 'trustar_enrich', 'recordedfuture'] + 'trustar_enrich', 'recordedfuture', 'html_to_markdown'] minimum_required_fields = ('type', 'uuid', 'value') diff --git a/misp_modules/modules/expansion/html_to_markdown.py b/misp_modules/modules/expansion/html_to_markdown.py new file mode 100755 index 0000000..228b4bc --- /dev/null +++ b/misp_modules/modules/expansion/html_to_markdown.py @@ -0,0 +1,53 @@ +import json +import requests +from markdownify import markdownify +from bs4 import BeautifulSoup + +misperrors = {'error': 'Error'} +mispattributes = {'input': ['url'], 'output': ['text']} +moduleinfo = {'version': '0.1', 'author': 'Sami Mokaddem', + 'description': 'Simple HTML fetcher', + 'module-type': ['expansion']} + + +def fetchHTML(url): + r = requests.get(url) + return r.text + + +def stripUselessTags(html): + soup = BeautifulSoup(html, 'html.parser') + toRemove = ['script', 'head', 'header', 'footer', 'meta', 'link'] + for tag in soup.find_all(toRemove): + tag.decompose() + return str(soup) + + +def convertHTML(html): + toStrip = ['a', 'img'] + return markdownify(html, heading_style='ATX', strip=toStrip) + + +def handler(q=False): + if q is False: + return False + request = json.loads(q) + if request.get('url'): + url = request['url'] + else: + return False + html = fetchHTML(url) + html = stripUselessTags(html) + markdown = convertHTML(html) + + r = {'results': [{'types': mispattributes['output'], + 'values':[str(markdown)]}]} + return r + + +def introspection(): + return mispattributes + + +def version(): + return moduleinfo