mirror of https://github.com/MISP/misp-modules
new: [expansion] Added html_to_markdown module
It fetches the HTML from the provided URL, performs a bit of DOM clean-up, then converts it into markdown.
pull/436/head
parent
c00349e198
commit
2be1d7a0cd
|
@ -47,6 +47,7 @@ jsonschema==3.2.0
|
||||||
lief==0.10.1
|
lief==0.10.1
|
||||||
lxml==4.5.2
|
lxml==4.5.2
|
||||||
maclookup==1.0.3
|
maclookup==1.0.3
|
||||||
|
markdownify==0.5.3
|
||||||
maxminddb==2.0.2; python_version >= '3.6'
|
maxminddb==2.0.2; python_version >= '3.6'
|
||||||
multidict==4.7.6; python_version >= '3.5'
|
multidict==4.7.6; python_version >= '3.5'
|
||||||
np==1.0.2
|
np==1.0.2
|
||||||
|
|
|
@ -18,7 +18,7 @@ __all__ = ['cuckoo_submit', 'vmray_submit', 'bgpranking', 'circl_passivedns', 'c
|
||||||
'virustotal_public', 'apiosintds', 'urlscan', 'securitytrails', 'apivoid',
|
'virustotal_public', 'apiosintds', 'urlscan', 'securitytrails', 'apivoid',
|
||||||
'assemblyline_submit', 'assemblyline_query', 'ransomcoindb', 'malwarebazaar',
|
'assemblyline_submit', 'assemblyline_query', 'ransomcoindb', 'malwarebazaar',
|
||||||
'lastline_query', 'lastline_submit', 'sophoslabs_intelix', 'cytomic_orion', 'censys_enrich',
|
'lastline_query', 'lastline_submit', 'sophoslabs_intelix', 'cytomic_orion', 'censys_enrich',
|
||||||
'trustar_enrich', 'recordedfuture']
|
'trustar_enrich', 'recordedfuture', 'html_to_markdown']
|
||||||
|
|
||||||
|
|
||||||
minimum_required_fields = ('type', 'uuid', 'value')
|
minimum_required_fields = ('type', 'uuid', 'value')
|
||||||
|
|
|
@ -0,0 +1,53 @@
|
||||||
|
import json
|
||||||
|
import requests
|
||||||
|
from markdownify import markdownify
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
# Default error dictionary; it is not referenced by the functions shown here.
misperrors = {
    'error': 'Error',
}

# Attribute types the module takes as input and emits as output;
# returned as-is by introspection().
mispattributes = {
    'input': ['url'],
    'output': ['text'],
}

# Module metadata; returned as-is by version().
moduleinfo = {
    'version': '0.1',
    'author': 'Sami Mokaddem',
    'description': 'Simple HTML fetcher',
    'module-type': ['expansion'],
}
|
||||||
|
|
||||||
|
|
||||||
|
def fetchHTML(url, timeout=30):
    """Download the page at *url* and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely, so a single unresponsive
            host would block the module forever.

    Returns:
        The response body as decoded text (``Response.text``).

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    return response.text
|
||||||
|
|
||||||
|
|
||||||
|
def stripUselessTags(html):
    """Return *html* with non-content elements removed.

    The markup is parsed with BeautifulSoup's ``html.parser``; every
    ``script``, ``head``, ``header``, ``footer``, ``meta`` and ``link``
    element found is decomposed, and the remaining tree is serialised
    back to a string.
    """
    unwanted = ['script', 'head', 'header', 'footer', 'meta', 'link']
    document = BeautifulSoup(html, 'html.parser')
    for element in document.find_all(unwanted):
        element.decompose()
    return str(document)
|
||||||
|
|
||||||
|
|
||||||
|
def convertHTML(html):
    """Convert *html* to Markdown using markdownify.

    Headings are rendered in ATX style and the ``a`` and ``img`` tags are
    passed to markdownify's ``strip`` option.
    """
    return markdownify(html, strip=['a', 'img'], heading_style='ATX')
|
||||||
|
|
||||||
|
|
||||||
|
def handler(q=False):
    """Fetch the URL named in the query and return its content as Markdown.

    Args:
        q: JSON-encoded query string; its ``url`` key is the page to fetch.
            ``False`` means no query was supplied.

    Returns:
        ``False`` when *q* is ``False`` or carries no non-empty ``url``;
        otherwise a dict whose ``results`` list holds one entry with the
        module's output types and the converted Markdown as its only value.
    """
    if q is False:
        return False

    url = json.loads(q).get('url')
    if not url:
        return False

    # Pipeline: download -> drop non-content tags -> convert to Markdown.
    cleaned = stripUselessTags(fetchHTML(url))
    markdown = convertHTML(cleaned)

    entry = {'types': mispattributes['output'], 'values': [str(markdown)]}
    return {'results': [entry]}
|
||||||
|
|
||||||
|
|
||||||
|
def introspection():
    """Return the module's ``mispattributes`` dict (input/output types)."""
    return mispattributes
|
||||||
|
|
||||||
|
|
||||||
|
def version():
    """Return the module's ``moduleinfo`` metadata dict."""
    return moduleinfo
|
Loading…
Reference in New Issue