mirror of https://github.com/MISP/misp-modules
				
				
				
			
		
			
				
	
	
		
			64 lines
		
	
	
		
			1.6 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
			
		
		
	
	
			64 lines
		
	
	
		
			1.6 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
| import json
 | |
| import requests
 | |
| from markdownify import markdownify
 | |
| from bs4 import BeautifulSoup
 | |
| 
 | |
# Default error payload; the 'error' key carries the message.
misperrors = {"error": "Error"}

# Attribute types accepted as input and produced as output by this module.
mispattributes = {"input": ["url"], "output": ["text"]}

# Descriptive metadata about the module, returned by version().
moduleinfo = {
    "version": "0.1",
    "author": "Sami Mokaddem",
    "description": (
        "Expansion module to fetch the html content from an url "
        "and convert it into markdown."
    ),
    "module-type": ["expansion"],
    "name": "HTML to Markdown",
    "logo": "",
    "requirements": ["The markdownify python library"],
    "features": (
        "The module take an URL as input and the HTML content is fetched from it. "
        "This content is then converted into markdown that is returned as text."
    ),
    "references": [],
    "input": "URL attribute.",
    "output": "Markdown content converted from the HTML fetched from the url.",
}
 | |
| 
 | |
| 
 | |
def fetchHTML(url, timeout=30):
    """Download *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            unchanged to ``requests.get``.

    Returns:
        The response body decoded to ``str`` (``Response.text``).

    Raises:
        requests.exceptions.RequestException: If the request fails, including
            when the timeout expires.
    """
    # requests applies no timeout by default, so an unresponsive server would
    # otherwise block this call indefinitely.
    response = requests.get(url, timeout=timeout)
    return response.text
 | |
| 
 | |
| 
 | |
def stripUselessTags(html):
    """Return *html* with script, head, header, footer, meta and link tags removed.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend and
    serialised back to a string once the listed tags have been destroyed.
    """
    unwanted = ['script', 'head', 'header', 'footer', 'meta', 'link']
    document = BeautifulSoup(html, 'html.parser')
    for node in document.find_all(unwanted):
        # decompose() removes the tag and everything nested inside it.
        node.decompose()
    return str(document)
 | |
| 
 | |
| 
 | |
def convertHTML(html):
    """Convert *html* to markdown with ATX headings, stripping ``a`` and ``img`` tags."""
    return markdownify(html, strip=['a', 'img'], heading_style='ATX')
 | |
| 
 | |
| 
 | |
def handler(q=False):
    """Fetch the URL named in the request and return its content as markdown.

    Args:
        q: JSON-encoded request string expected to carry a ``url`` key, or
            ``False`` when no request was supplied.

    Returns:
        ``False`` when ``q`` is ``False`` or the request has no non-empty
        ``url``; a ``{'error': message}`` dict when the page cannot be
        fetched; otherwise a ``{'results': [...]}`` dict whose single entry
        holds the output types and the markdown text.
    """
    if q is False:
        return False
    request = json.loads(q)
    url = request.get('url')
    if not url:
        return False

    try:
        html = fetchHTML(url)
    except requests.exceptions.RequestException as err:
        # Report network failures to the caller under the same 'error' key
        # that misperrors uses, instead of letting the exception propagate.
        return {'error': 'Unable to fetch {}: {}'.format(url, err)}

    markdown = convertHTML(stripUselessTags(html))
    return {'results': [{'types': mispattributes['output'],
                         'values': [str(markdown)]}]}
 | |
| 
 | |
| 
 | |
def introspection():
    """Return the module-level ``mispattributes`` dict (input and output types)."""
    return mispattributes
 | |
| 
 | |
| 
 | |
def version():
    """Return the module-level ``moduleinfo`` metadata dict."""
    return moduleinfo
 |