2018-06-30 05:58:44 +02:00
|
|
|
import sys
|
2016-08-04 14:32:50 +02:00
|
|
|
import json
|
|
|
|
import base64
|
|
|
|
from io import BytesIO
|
2018-06-29 18:58:25 +02:00
|
|
|
|
2018-06-30 05:52:12 +02:00
|
|
|
import logging
|
|
|
|
|
|
|
|
# Logging for this module: everything from DEBUG upwards is written to
# stdout with a timestamp, the logger name and the level.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

ch = logging.StreamHandler(sys.stdout)
ch.setFormatter(formatter)
ch.setLevel(logging.DEBUG)

log = logging.getLogger('ocr')
log.addHandler(ch)
log.setLevel(logging.DEBUG)
|
|
|
|
|
2016-08-04 14:32:50 +02:00
|
|
|
# Error payload returned by handler(); its 'error' entry is overwritten
# with a specific message before it is returned.
misperrors = dict(error='Error')

# Optional settings reported by introspection().
userConfig = dict()
inputSource = ['file']

# Static module metadata returned by version(), which adds the
# 'config' entry from moduleconfig.
moduleinfo = {
    'version': '0.2',
    'author': 'Alexandre Dulaunoy',
    'description': 'Optical Character Recognition (OCR) module for MISP',
    'module-type': ['import'],
}
moduleconfig = list()
|
|
|
|
|
|
|
|
|
|
|
|
def handler(q=False):
    """Run OCR over a base64-encoded image or PDF and return the text.

    Args:
        q: JSON string whose "data" member holds the base64-encoded
            document, or ``False`` when no request is supplied.

    Returns:
        ``False`` when ``q`` is ``False``; the shared ``misperrors`` dict
        (with its 'error' entry set) when a required library is missing
        or Pillow cannot open the image; otherwise
        ``{'results': [{'values': <recognized text>, 'types': ['freetext']}]}``.

    Raises:
        ValueError: if ``q`` is not valid JSON.
        KeyError: if the request has no "data" member.
    """
    # try to import modules and return errors if module not found
    try:
        from PIL import Image
    except ImportError:
        misperrors['error'] = "Please pip(3) install pillow"
        return misperrors

    try:
        # Official ImageMagick module
        from wand.image import Image as WImage
    except ImportError:
        misperrors['error'] = "Please pip(3) install wand"
        return misperrors

    try:
        from pytesseract import image_to_string
    except ImportError:
        misperrors['error'] = "Please pip(3) install pytesseract"
        return misperrors

    if q is False:
        return False
    r = {'results': []}
    request = json.loads(q)
    # Keep the decoded bytes: they are what Pillow needs for non-PDF input.
    raw = base64.b64decode(request["data"])

    # The context manager releases the ImageMagick resources for both
    # PDF and non-PDF input.
    with WImage(blob=raw) as document:
        if document.format == 'PDF':
            # Get number of pages
            pages = len(document.sequence)
            log.debug("PDF with {} page(s) detected".format(pages))
            # Create new image object where the height will be the number of pages.
            # With huge PDFs this will overflow, break, consume silly memory etc…
            with WImage(width=document.width, height=document.height * pages) as img:
                # Cycle through pages and stitch it together to one big file
                for p in range(pages):
                    log.debug("Stitching page {}".format(p + 1))
                    img.composite(document.sequence[p], top=document.height * p, left=0)
                # Create a png blob
                image = img.make_blob('png')
            # Use ``pages`` rather than the loop variable, which is unbound
            # when the sequence is empty.
            log.debug("Final image size is {}x{}".format(document.width, document.height * pages))
        else:
            # BytesIO needs bytes-like data, not the wand Image object.
            image = raw

    image_file = BytesIO(image)

    try:
        im = Image.open(image_file)
    except IOError:
        misperrors['error'] = "Corrupt or not an image file."
        return misperrors

    ocrized = image_to_string(im)

    freetext = {}
    freetext['values'] = ocrized
    freetext['types'] = ['freetext']
    r['results'].append(freetext)
    return r
|
|
|
|
|
|
|
|
|
|
|
|
def introspection():
    """Return the optional module-level settings that are defined.

    The result maps 'userConfig' and 'inputSource' to the values of the
    module globals of the same name; a name that is not defined is
    left out.
    """
    modulesetup = {}
    module_globals = globals()
    for setting in ('userConfig', 'inputSource'):
        if setting in module_globals:
            modulesetup[setting] = module_globals[setting]
    return modulesetup
|
|
|
|
|
|
|
|
|
|
|
|
def version():
    """Return ``moduleinfo`` after storing ``moduleconfig`` under 'config'.

    The module-level ``moduleinfo`` dict is updated in place and that
    same object is returned.
    """
    moduleinfo.update(config=moduleconfig)
    return moduleinfo
|
|
|
|
|
2018-12-11 15:29:09 +01:00
|
|
|
|
2016-08-04 14:32:50 +02:00
|
|
|
if __name__ == '__main__':
    # Manual run: feed the contents of test.json to handler().
    # The with-block closes the file, which the bare open() never did.
    with open('test.json', 'r') as x:
        payload = x.read()
    handler(q=payload)
|