From 2be1d7a0cde74703e15e2afd70c4205d9a4c3844 Mon Sep 17 00:00:00 2001
From: mokaddem <sami.mokaddem@circl.lu>
Date: Fri, 23 Oct 2020 22:17:47 +0200
Subject: [PATCH] new: [expansion] Added html_to_markdown module

It fetches the HTML from the provided URL, performs a bit of DOM
clean-up, and then converts it into markdown.
---
 REQUIREMENTS                                  |  1 +
 misp_modules/modules/expansion/__init__.py    |  2 +-
 .../modules/expansion/html_to_markdown.py     | 53 +++++++++++++++++++
 3 files changed, 55 insertions(+), 1 deletion(-)
 create mode 100755 misp_modules/modules/expansion/html_to_markdown.py

diff --git a/REQUIREMENTS b/REQUIREMENTS
index 9b26d1c4..f6362b50 100644
--- a/REQUIREMENTS
+++ b/REQUIREMENTS
@@ -47,6 +47,7 @@ jsonschema==3.2.0
 lief==0.10.1
 lxml==4.5.2
 maclookup==1.0.3
+markdownify==0.5.3
 maxminddb==2.0.2; python_version >= '3.6'
 multidict==4.7.6; python_version >= '3.5'
 np==1.0.2
diff --git a/misp_modules/modules/expansion/__init__.py b/misp_modules/modules/expansion/__init__.py
index 1b6d2bbc..64851408 100644
--- a/misp_modules/modules/expansion/__init__.py
+++ b/misp_modules/modules/expansion/__init__.py
@@ -18,7 +18,7 @@ __all__ = ['cuckoo_submit', 'vmray_submit', 'bgpranking', 'circl_passivedns', 'c
            'virustotal_public', 'apiosintds', 'urlscan', 'securitytrails', 'apivoid',
            'assemblyline_submit', 'assemblyline_query', 'ransomcoindb', 'malwarebazaar',
            'lastline_query', 'lastline_submit', 'sophoslabs_intelix', 'cytomic_orion', 'censys_enrich',
-           'trustar_enrich', 'recordedfuture']
+           'trustar_enrich', 'recordedfuture', 'html_to_markdown']
 
 
 minimum_required_fields = ('type', 'uuid', 'value')
diff --git a/misp_modules/modules/expansion/html_to_markdown.py b/misp_modules/modules/expansion/html_to_markdown.py
new file mode 100755
index 00000000..228b4bcb
--- /dev/null
+++ b/misp_modules/modules/expansion/html_to_markdown.py
@@ -0,0 +1,53 @@
+import json
+import requests
+from markdownify import markdownify
+from bs4 import BeautifulSoup
+
+misperrors = {'error': 'Error'}  # error payload template; not referenced by the code in this module
+mispattributes = {'input': ['url'], 'output': ['text']}  # returned by introspection(); 'output' is reused by handler()
+moduleinfo = {'version': '0.1', 'author': 'Sami Mokaddem',  # returned by version()
+              'description': 'Simple HTML fetcher',
+              'module-type': ['expansion']}
+
+
+def fetchHTML(url):
+    r = requests.get(url)
+    return r.text
+
+
+def stripUselessTags(html):
+    soup = BeautifulSoup(html, 'html.parser')
+    toRemove = ['script', 'head', 'header', 'footer', 'meta', 'link']
+    for tag in soup.find_all(toRemove):
+        tag.decompose()
+    return str(soup)
+
+
+def convertHTML(html):
+    toStrip = ['a', 'img']
+    return markdownify(html, heading_style='ATX', strip=toStrip)
+
+
+def handler(q=False):
+    if q is False:
+        return False
+    request = json.loads(q)
+    if request.get('url'):
+        url = request['url']
+    else:
+        return False
+    html = fetchHTML(url)
+    html = stripUselessTags(html)
+    markdown = convertHTML(html)
+
+    r = {'results': [{'types': mispattributes['output'],
+                      'values':[str(markdown)]}]}
+    return r
+
+
+def introspection():  # report the attribute types this module takes as input and emits as output
+    return mispattributes
+
+
+def version():  # report the module metadata dict (version, author, description, module-type)
+    return moduleinfo