mirror of https://github.com/MISP/misp-modules
Merge pull request #436 from MISP/new-html-to-markdown
new: [expansion] Added html_to_markdown module pull/443/head
commit
2779ed7331
|
@ -47,6 +47,7 @@ jsonschema==3.2.0
|
|||
lief==0.10.1
|
||||
lxml==4.5.2
|
||||
maclookup==1.0.3
|
||||
markdownify==0.5.3
|
||||
maxminddb==2.0.2; python_version >= '3.6'
|
||||
multidict==4.7.6; python_version >= '3.5'
|
||||
np==1.0.2
|
||||
|
|
|
@ -18,7 +18,7 @@ __all__ = ['cuckoo_submit', 'vmray_submit', 'bgpranking', 'circl_passivedns', 'c
|
|||
'virustotal_public', 'apiosintds', 'urlscan', 'securitytrails', 'apivoid',
|
||||
'assemblyline_submit', 'assemblyline_query', 'ransomcoindb', 'malwarebazaar',
|
||||
'lastline_query', 'lastline_submit', 'sophoslabs_intelix', 'cytomic_orion', 'censys_enrich',
|
||||
'trustar_enrich', 'recordedfuture']
|
||||
'trustar_enrich', 'recordedfuture', 'html_to_markdown']
|
||||
|
||||
|
||||
minimum_required_fields = ('type', 'uuid', 'value')
|
||||
|
|
|
@ -0,0 +1,53 @@
|
|||
import json
|
||||
import requests
|
||||
from markdownify import markdownify
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Error payload template exposed by the module.
misperrors = {
    'error': 'Error',
}

# Attribute types the module consumes and produces; returned by
# introspection().
mispattributes = {
    'input': ['url'],
    'output': ['text'],
}

# Module metadata; returned by version().
moduleinfo = {
    'version': '0.1',
    'author': 'Sami Mokaddem',
    'description': 'Simple HTML fetcher',
    'module-type': ['expansion'],
}
|
||||
|
||||
|
||||
def fetchHTML(url, timeout=10):
    """Download *url* with an HTTP GET and return the body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.
            Passed straight to ``requests.get``.

    Returns:
        The response body decoded to ``str`` (``Response.text``).

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection failure, timeout, invalid URL, ...).
    """
    # requests waits indefinitely unless a timeout is given, so an
    # unresponsive host would otherwise block the module forever.
    response = requests.get(url, timeout=timeout)
    return response.text
|
||||
|
||||
|
||||
def stripUselessTags(html):
    """Return *html* with non-content elements removed.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend and
    every ``script``, ``head``, ``header``, ``footer``, ``meta`` and
    ``link`` element is dropped together with its children. The remaining
    tree is serialised back to a string.
    """
    unwanted = ['script', 'head', 'header', 'footer', 'meta', 'link']
    document = BeautifulSoup(html, 'html.parser')
    for element in document.find_all(unwanted):
        # decompose() detaches the element and destroys it with its subtree.
        element.decompose()
    return str(document)
|
||||
|
||||
|
||||
def convertHTML(html):
    """Convert an HTML string to Markdown using ``markdownify``.

    Headings are rendered in ATX style, and ``a`` and ``img`` tags are
    passed in markdownify's ``strip`` option.
    """
    return markdownify(
        html,
        heading_style='ATX',
        strip=['a', 'img'],
    )
|
||||
|
||||
|
||||
def handler(q=False):
    """Fetch the page named in the query and return it as Markdown.

    Args:
        q: JSON-encoded request string. It is expected to carry a
            ``url`` key holding the address to fetch. ``False`` means
            no query was supplied.

    Returns:
        ``False`` when no query is given or the query has no usable
        ``url``. ``{'error': <message>}`` when the page cannot be
        downloaded. Otherwise ``{'results': [{'types': [...],
        'values': [<markdown>]}]}``, where ``types`` is the module's
        declared output types.
    """
    if q is False:
        return False

    request = json.loads(q)
    url = request.get('url')
    if not url:
        return False

    try:
        html = fetchHTML(url)
    except requests.exceptions.RequestException as err:
        # Report network problems (DNS failure, refused connection,
        # timeout, ...) as an error result instead of letting the
        # exception escape from the module.
        return {'error': 'Unable to fetch {}: {}'.format(url, err)}

    markdown = convertHTML(stripUselessTags(html))

    return {'results': [{'types': mispattributes['output'],
                         'values': [str(markdown)]}]}
|
||||
|
||||
|
||||
def introspection():
    """Return the module's declared input and output attribute types."""
    return mispattributes
|
||||
|
||||
|
||||
def version():
    """Return the module metadata dictionary (``moduleinfo``)."""
    return moduleinfo
|
Loading…
Reference in New Issue