|
|
|
@ -1,16 +1,24 @@ |
|
|
|
|
import sys |
|
|
|
|
import json |
|
|
|
|
import base64 |
|
|
|
|
from io import BytesIO |
|
|
|
|
|
|
|
|
|
from PIL import Image |
|
|
|
|
import logging |
|
|
|
|
|
|
|
|
|
log = logging.getLogger('ocr') |
|
|
|
|
log.setLevel(logging.DEBUG) |
|
|
|
|
ch = logging.StreamHandler(sys.stdout) |
|
|
|
|
ch.setLevel(logging.DEBUG) |
|
|
|
|
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') |
|
|
|
|
ch.setFormatter(formatter) |
|
|
|
|
log.addHandler(ch) |
|
|
|
|
|
|
|
|
|
from pytesseract import image_to_string |
|
|
|
|
from io import BytesIO |
|
|
|
|
misperrors = {'error': 'Error'} |
|
|
|
|
userConfig = { }; |
|
|
|
|
userConfig = {}; |
|
|
|
|
|
|
|
|
|
inputSource = ['file'] |
|
|
|
|
|
|
|
|
|
moduleinfo = {'version': '0.1', 'author': 'Alexandre Dulaunoy', |
|
|
|
|
moduleinfo = {'version': '0.2', 'author': 'Alexandre Dulaunoy', |
|
|
|
|
'description': 'Optical Character Recognition (OCR) module for MISP', |
|
|
|
|
'module-type': ['import']} |
|
|
|
|
|
|
|
|
@ -18,14 +26,61 @@ moduleconfig = [] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def handler(q=False): |
|
|
|
|
# try to import modules and return errors if module not found |
|
|
|
|
try: |
|
|
|
|
from PIL import Image |
|
|
|
|
except ImportError: |
|
|
|
|
misperrors['error'] = "Please pip(3) install pillow" |
|
|
|
|
return misperrors |
|
|
|
|
|
|
|
|
|
try: |
|
|
|
|
# Official ImageMagick module |
|
|
|
|
from wand.image import Image as WImage |
|
|
|
|
except ImportError: |
|
|
|
|
misperrors['error'] = "Please pip(3) install wand" |
|
|
|
|
return misperrors |
|
|
|
|
|
|
|
|
|
try: |
|
|
|
|
from pytesseract import image_to_string |
|
|
|
|
except ImportError: |
|
|
|
|
misperrors['error'] = "Please pip(3) install pytesseract" |
|
|
|
|
return misperrors |
|
|
|
|
|
|
|
|
|
if q is False: |
|
|
|
|
return False |
|
|
|
|
r = {'results': []} |
|
|
|
|
request = json.loads(q) |
|
|
|
|
image = base64.b64decode(request["data"]) |
|
|
|
|
document = base64.b64decode(request["data"]) |
|
|
|
|
document = WImage(blob=document) |
|
|
|
|
if document.format == 'PDF': |
|
|
|
|
with document as pdf: |
|
|
|
|
# Get number of pages |
|
|
|
|
pages=len(pdf.sequence) |
|
|
|
|
log.debug("PDF with {} page(s) detected".format(pages)) |
|
|
|
|
# Create a new image object whose height is the page height multiplied by the number of pages. With huge PDFs this may overflow, break, or consume an excessive amount of memory, etc. |
|
|
|
|
img = WImage(width=pdf.width, height=pdf.height * pages) |
|
|
|
|
# Cycle through pages and stitch it together to one big file |
|
|
|
|
for p in range(pages): |
|
|
|
|
log.debug("Stitching page {}".format(p+1)) |
|
|
|
|
image = img.composite(pdf.sequence[p], top=pdf.height * p, left=0) |
|
|
|
|
# Create a png blob |
|
|
|
|
image = img.make_blob('png') |
|
|
|
|
log.debug("Final image size is {}x{}".format(pdf.width, pdf.height*(p+1))) |
|
|
|
|
else: |
|
|
|
|
image = document |
|
|
|
|
|
|
|
|
|
image_file = BytesIO(image) |
|
|
|
|
image_file.seek(0) |
|
|
|
|
ocrized = image_to_string(Image.open(image_file)) |
|
|
|
|
|
|
|
|
|
try: |
|
|
|
|
im = Image.open(image_file) |
|
|
|
|
except IOError: |
|
|
|
|
misperrors['error'] = "Corrupt or not an image file." |
|
|
|
|
return misperrors |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ocrized = image_to_string(im) |
|
|
|
|
|
|
|
|
|
freetext = {} |
|
|
|
|
freetext['values'] = ocrized |
|
|
|
|
freetext['types'] = ['freetext'] |
|
|
|
|