mirror of https://github.com/MISP/misp-modules
79 lines
1.8 KiB
Python
Executable File
79 lines
1.8 KiB
Python
Executable File
import json
|
|
import base64
|
|
import magic
|
|
|
|
from PIL import Image
|
|
|
|
from wand.image import Image as WImage
|
|
|
|
from pytesseract import image_to_string
|
|
from io import BytesIO
|
|
misperrors = {'error': 'Error'}
|
|
userConfig = { };
|
|
|
|
inputSource = ['file']
|
|
|
|
moduleinfo = {'version': '0.2', 'author': 'Alexandre Dulaunoy',
|
|
'description': 'Optical Character Recognition (OCR) module for MISP',
|
|
'module-type': ['import']}
|
|
|
|
moduleconfig = []
|
|
|
|
|
|
def handler(q=False):
|
|
if q is False:
|
|
return False
|
|
r = {'results': []}
|
|
request = json.loads(q)
|
|
document = base64.b64decode(request["data"])
|
|
if magic.from_buffer(document, mime=True).split("/")[1] == 'pdf':
|
|
print("PDF Detected")
|
|
with WImage(blob=document) as pdf:
|
|
pages=len(pdf.sequence)
|
|
img = WImage(width=pdf.width, height=pdf.height * pages)
|
|
for p in range(pages):
|
|
img.composite(pdf.sequence[p], top=pdf.height * i, left=0)
|
|
image = document
|
|
|
|
image_file = BytesIO(image)
|
|
image_file.seek(0)
|
|
|
|
try:
|
|
im = WImage(blob=image_file)
|
|
except IOError:
|
|
misperrors['error'] = "Corrupt or not an image file."
|
|
return misperrors
|
|
|
|
|
|
ocrized = image_to_string(im)
|
|
|
|
freetext = {}
|
|
freetext['values'] = ocrized
|
|
freetext['types'] = ['freetext']
|
|
r['results'].append(freetext)
|
|
return r
|
|
|
|
|
|
def introspection():
|
|
modulesetup = {}
|
|
try:
|
|
userConfig
|
|
modulesetup['userConfig'] = userConfig
|
|
except NameError:
|
|
pass
|
|
try:
|
|
inputSource
|
|
modulesetup['inputSource'] = inputSource
|
|
except NameError:
|
|
pass
|
|
return modulesetup
|
|
|
|
|
|
def version():
|
|
moduleinfo['config'] = moduleconfig
|
|
return moduleinfo
|
|
|
|
if __name__ == '__main__':
|
|
x = open('test.json', 'r')
|
|
handler(q=x.read())
|