From 2be1d7a0cde74703e15e2afd70c4205d9a4c3844 Mon Sep 17 00:00:00 2001 From: mokaddem Date: Fri, 23 Oct 2020 22:17:47 +0200 Subject: [PATCH] new: [expansion] Added html_to_markdown module It fetches the HTML from the provided URL, performs a bit of DOM clean-up then convert it into markdown --- REQUIREMENTS | 1 + misp_modules/modules/expansion/__init__.py | 2 +- .../modules/expansion/html_to_markdown.py | 53 +++++++++++++++++++ 3 files changed, 55 insertions(+), 1 deletion(-) create mode 100755 misp_modules/modules/expansion/html_to_markdown.py diff --git a/REQUIREMENTS b/REQUIREMENTS index 9b26d1c..f6362b5 100644 --- a/REQUIREMENTS +++ b/REQUIREMENTS @@ -47,6 +47,7 @@ jsonschema==3.2.0 lief==0.10.1 lxml==4.5.2 maclookup==1.0.3 +markdownify==0.5.3 maxminddb==2.0.2; python_version >= '3.6' multidict==4.7.6; python_version >= '3.5' np==1.0.2 diff --git a/misp_modules/modules/expansion/__init__.py b/misp_modules/modules/expansion/__init__.py index 1b6d2bb..6485140 100644 --- a/misp_modules/modules/expansion/__init__.py +++ b/misp_modules/modules/expansion/__init__.py @@ -18,7 +18,7 @@ __all__ = ['cuckoo_submit', 'vmray_submit', 'bgpranking', 'circl_passivedns', 'c 'virustotal_public', 'apiosintds', 'urlscan', 'securitytrails', 'apivoid', 'assemblyline_submit', 'assemblyline_query', 'ransomcoindb', 'malwarebazaar', 'lastline_query', 'lastline_submit', 'sophoslabs_intelix', 'cytomic_orion', 'censys_enrich', - 'trustar_enrich', 'recordedfuture'] + 'trustar_enrich', 'recordedfuture', 'html_to_markdown'] minimum_required_fields = ('type', 'uuid', 'value') diff --git a/misp_modules/modules/expansion/html_to_markdown.py b/misp_modules/modules/expansion/html_to_markdown.py new file mode 100755 index 0000000..228b4bc --- /dev/null +++ b/misp_modules/modules/expansion/html_to_markdown.py @@ -0,0 +1,53 @@ +import json +import requests +from markdownify import markdownify +from bs4 import BeautifulSoup + +misperrors = {'error': 'Error'} +mispattributes = {'input': ['url'], 'output': ['text']} +moduleinfo = {'version': '0.1', 'author': 'Sami Mokaddem', + 'description': 'Simple HTML fetcher', + 'module-type': ['expansion']} + + +def fetchHTML(url): + r = requests.get(url) + return r.text + + +def stripUselessTags(html): + soup = BeautifulSoup(html, 'html.parser') + toRemove = ['script', 'head', 'header', 'footer', 'meta', 'link'] + for tag in soup.find_all(toRemove): + tag.decompose() + return str(soup) + + +def convertHTML(html): + toStrip = ['a', 'img'] + return markdownify(html, heading_style='ATX', strip=toStrip) + + +def handler(q=False): + if q is False: + return False + request = json.loads(q) + if request.get('url'): + url = request['url'] + else: + return False + html = fetchHTML(url) + html = stripUselessTags(html) + markdown = convertHTML(html) + + r = {'results': [{'types': mispattributes['output'], + 'values':[str(markdown)]}]} + return r + + +def introspection(): + return mispattributes + + +def version(): + return moduleinfo