2020-10-23 22:17:47 +02:00
import json
import requests
from markdownify import markdownify
from bs4 import BeautifulSoup
# Template error payload for this module (not referenced by the code below).
misperrors = {
    ' error ': ' Error ',
}

# Attribute types the module accepts and emits; returned by introspection()
# and used by handler() to label its results.
mispattributes = {
    ' input ': [' url '],
    ' output ': [' text '],
}
2024-08-12 11:23:10 +02:00
# Static module metadata; returned unchanged by version().
moduleinfo = {
    ' version ': ' 0.1 ',
    ' author ': ' Sami Mokaddem ',
    ' description ': (
        ' Expansion module to fetch the html content from an url '
        'and convert it into markdown. '
    ),
    ' module-type ': [' expansion '],
    ' name ': ' HTML to Markdown ',
    ' logo ': ' ',
    ' requirements ': [' The markdownify python library '],
    ' features ': (
        ' The module take an URL as input and the HTML content is fetched '
        'from it. This content is then converted into markdown that is '
        'returned as text. '
    ),
    ' references ': [],
    ' input ': ' URL attribute. ',
    ' output ': ' Markdown content converted from the HTML fetched from the url. ',
}
2020-10-23 22:17:47 +02:00
def fetchHTML(url, timeout=30):
    """Download *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait on the server before giving up; forwarded
            to ``requests.get``. Without a timeout a stalled server would
            block the call indefinitely.

    Returns:
        The response body decoded to ``str`` (``Response.text``). The HTTP
        status code is not checked, so error pages are returned as well.

    Raises:
        requests.exceptions.RequestException: Propagated from ``requests``
            on connection failures or when the timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    return response.text
def stripUselessTags(html):
    """Parse *html* and return it re-serialised without the unwanted elements.

    Every element matched by the tag-name list below is removed from the
    tree together with its contents before the document is turned back
    into a string.
    """
    document = BeautifulSoup(html, ' html.parser ')
    unwanted = document.find_all(
        [' script ', ' head ', ' header ', ' footer ', ' meta ', ' link ']
    )
    for element in unwanted:
        # decompose() detaches the element and destroys it along with its children.
        element.decompose()
    return str(document)
def convertHTML(html):
    """Convert an HTML string to Markdown using ``markdownify``.

    The conversion is called with a fixed heading style and a list of tag
    names handed to markdownify's ``strip`` option.
    """
    options = {
        'heading_style': ' ATX ',
        'strip': [' a ', ' img '],
    }
    return markdownify(html, **options)
def handler(q=False):
    """Fetch the URL named in a JSON request and return its content as Markdown.

    Args:
        q: JSON-encoded request string, or ``False`` when no request was
            supplied.

    Returns:
        ``False`` when the request is unusable: no request given, payload
        that is not valid JSON, JSON that is not an object, or a missing or
        empty URL entry. Otherwise a dict with a single result whose types
        come from ``mispattributes`` and whose only value is the Markdown
        text.

    Raises:
        Whatever ``fetchHTML`` raises on network failure; those errors are
        not handled here.
    """
    if q is False:
        return False

    try:
        request = json.loads(q)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers payloads
        # that are not str/bytes. Both are treated like any other unusable
        # request instead of escaping as an exception.
        return False

    # Valid JSON may still be a list, number or null, none of which has .get().
    if not isinstance(request, dict):
        return False

    url = request.get(' url ')
    if not url:
        return False

    markdown = convertHTML(stripUselessTags(fetchHTML(url)))
    return {
        ' results ': [
            {
                ' types ': mispattributes[' output '],
                ' values ': [str(markdown)],
            },
        ],
    }
def introspection():
    """Return the module-level ``mispattributes`` mapping of input/output types."""
    attributes = mispattributes
    return attributes
def version():
    """Return the module-level ``moduleinfo`` metadata dictionary."""
    info = moduleinfo
    return info