new: [expansion] Added html_to_markdown module

It fetches the HTML from the provided URL, performs a bit of DOM clean-up, then converts it into Markdown.
2020-10-23 22:17:47 +02:00 · 2020-10-23 22:17:47 +02:00 · 2be1d7a0cd
parent c00349e198
commit 2be1d7a0cd
3 changed files with 55 additions and 1 deletions
--- a/1
+++ b/1
@ -47,6 +47,7 @@ jsonschema==3.2.0
 lief==0.10.1
 lxml==4.5.2
 maclookup==1.0.3
 markdownify==0.5.3
 maxminddb==2.0.2; python_version >= '3.6'
 multidict==4.7.6; python_version >= '3.5'
 np==1.0.2
--- a/misp_modules/modules/expansion/init.py
+++ b/misp_modules/modules/expansion/init.py
@ -18,7 +18,7 @@ __all__ = ['cuckoo_submit', 'vmray_submit', 'bgpranking', 'circl_passivedns', 'c
           'virustotal_public', 'apiosintds', 'urlscan', 'securitytrails', 'apivoid',
           'assemblyline_submit', 'assemblyline_query', 'ransomcoindb', 'malwarebazaar',
           'lastline_query', 'lastline_submit', 'sophoslabs_intelix', 'cytomic_orion', 'censys_enrich',
-           'trustar_enrich', 'recordedfuture']
+           'trustar_enrich', 'recordedfuture', 'html_to_markdown']
 minimum_required_fields = ('type', 'uuid', 'value')
--- a/misp_modules/modules/expansion/html_to_markdown.py
+++ b/misp_modules/modules/expansion/html_to_markdown.py
@ -0,0 +1,53 @@
 import json
 import requests
 from markdownify import markdownify
 from bs4 import BeautifulSoup
 # Default error payload (presumably what is handed back when a query
 # fails -- confirm against the module loader).
 misperrors = {
    'error': 'Error',
 }

 # Declared input/output types; returned as-is by introspection(), and the
 # 'output' list is reused as the 'types' of every handler() result.
 mispattributes = {
    'input': ['url'],
    'output': ['text'],
 }

 # Module metadata; returned as-is by version().
 moduleinfo = {
    'version': '0.1',
    'author': 'Sami Mokaddem',
    'description': 'Simple HTML fetcher',
    'module-type': ['expansion'],
 }
 def fetchHTML(url, timeout=30):
    """Download *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely, so an unresponsive
            host would block the caller forever.

    Returns:
        The decoded response body (whatever the HTTP status code was).

    Raises:
        requests.RequestException: on connection failures and timeouts.
    """
    r = requests.get(url, timeout=timeout)
    # When the Content-Type header declares no charset, requests falls back
    # to ISO-8859-1 for text/* responses, which garbles UTF-8 pages. Use the
    # encoding detected from the body in that case.
    if 'charset' not in r.headers.get('Content-Type', '').lower():
        r.encoding = r.apparent_encoding
    return r.text
 def stripUselessTags(html):
    """Parse *html* and return it as a string without non-content elements.

    Every ``script``, ``head``, ``header``, ``footer``, ``meta`` and ``link``
    element found in the document is removed from the tree before the
    document is serialised back to a string.
    """
    unwanted = ['script', 'head', 'header', 'footer', 'meta', 'link']
    document = BeautifulSoup(html, 'html.parser')
    for element in document.find_all(unwanted):
        element.decompose()
    return str(document)
 def convertHTML(html):
    """Convert *html* with markdownify and return the result.

    Headings are requested in the ``ATX`` style, and ``a`` and ``img`` are
    passed as the tags to strip.
    """
    options = {'heading_style': 'ATX', 'strip': ['a', 'img']}
    return markdownify(html, **options)
 def handler(q=False):
    """Fetch the page named in the query and return it converted to Markdown.

    Args:
        q: JSON-encoded request string containing a ``url`` key, or
            ``False`` when no query is supplied.

    Returns:
        ``False`` if *q* is ``False`` or carries no non-empty ``url``;
        a copy of ``misperrors`` with a descriptive ``error`` message if
        the page cannot be downloaded; otherwise a dict of the form
        ``{'results': [{'types': [...], 'values': [markdown]}]}``.
    """
    if q is False:
        return False
    request = json.loads(q)
    url = request.get('url')
    if not url:
        return False
    try:
        html = fetchHTML(url)
    except requests.RequestException as e:
        # A bad or unreachable URL is user input, not a programming error:
        # report it rather than letting the exception escape the module.
        return dict(misperrors, error='Could not fetch {}: {}'.format(url, e))
    markdown = convertHTML(stripUselessTags(html))
    return {'results': [{'types': mispattributes['output'],
                         'values': [str(markdown)]}]}
 def introspection():
    """Return the module-level ``mispattributes`` dict (declared input and output types)."""
    return mispattributes
 def version():
    """Return the module-level ``moduleinfo`` dict (version, author, description, module type)."""
    return moduleinfo