mirror of https://github.com/MISP/misp-modules
Merge pull request #199 from SteveClement/master
Added (Multipage) PDF support to OCR Module, minor refactor
pull/202/head
commit
cc91b42607
|
@ -14,6 +14,7 @@ git+https://github.com/MISP/PyMISP.git#egg=pymisp
|
|||
git+https://github.com/sebdraven/pyonyphe#egg=pyonyphe
|
||||
pillow
|
||||
pytesseract
|
||||
wand
|
||||
SPARQLWrapper
|
||||
domaintools_api
|
||||
pygeoip
|
||||
|
|
|
@ -193,7 +193,7 @@ class QueryModule(tornado.web.RequestHandler):
|
|||
if dict_payload.get('timeout'):
|
||||
timeout = datetime.timedelta(seconds=int(dict_payload.get('timeout')))
|
||||
else:
|
||||
timeout = datetime.timedelta(seconds=30)
|
||||
timeout = datetime.timedelta(seconds=300)
|
||||
response = yield tornado.gen.with_timeout(timeout, self.run_request(jsonpayload))
|
||||
self.write(response)
|
||||
except tornado.gen.TimeoutError:
|
||||
|
|
|
@ -1,16 +1,24 @@
|
|||
import sys
|
||||
import json
|
||||
import base64
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from pytesseract import image_to_string
|
||||
from io import BytesIO
|
||||
|
||||
import logging
|
||||
|
||||
log = logging.getLogger('ocr')
|
||||
log.setLevel(logging.DEBUG)
|
||||
ch = logging.StreamHandler(sys.stdout)
|
||||
ch.setLevel(logging.DEBUG)
|
||||
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
ch.setFormatter(formatter)
|
||||
log.addHandler(ch)
|
||||
|
||||
misperrors = {'error': 'Error'}
|
||||
userConfig = { };
|
||||
userConfig = {};
|
||||
|
||||
inputSource = ['file']
|
||||
|
||||
moduleinfo = {'version': '0.1', 'author': 'Alexandre Dulaunoy',
|
||||
moduleinfo = {'version': '0.2', 'author': 'Alexandre Dulaunoy',
|
||||
'description': 'Optical Character Recognition (OCR) module for MISP',
|
||||
'module-type': ['import']}
|
||||
|
||||
|
@ -18,14 +26,61 @@ moduleconfig = []
|
|||
|
||||
|
||||
def handler(q=False):
|
||||
# try to import modules and return errors if module not found
|
||||
try:
|
||||
from PIL import Image
|
||||
except ImportError:
|
||||
misperrors['error'] = "Please pip(3) install pillow"
|
||||
return misperrors
|
||||
|
||||
try:
|
||||
# Official ImageMagick module
|
||||
from wand.image import Image as WImage
|
||||
except ImportError:
|
||||
misperrors['error'] = "Please pip(3) install wand"
|
||||
return misperrors
|
||||
|
||||
try:
|
||||
from pytesseract import image_to_string
|
||||
except ImportError:
|
||||
misperrors['error'] = "Please pip(3) install pytesseract"
|
||||
return misperrors
|
||||
|
||||
if q is False:
|
||||
return False
|
||||
r = {'results': []}
|
||||
request = json.loads(q)
|
||||
image = base64.b64decode(request["data"])
|
||||
document = base64.b64decode(request["data"])
|
||||
document = WImage(blob=document)
|
||||
if document.format == 'PDF':
|
||||
with document as pdf:
|
||||
# Get number of pages
|
||||
pages=len(pdf.sequence)
|
||||
log.debug("PDF with {} page(s) detected".format(pages))
|
||||
# Create new image object where the height will be the number of pages. With huge PDFs this will overflow, break, consume silly memory etc…
|
||||
img = WImage(width=pdf.width, height=pdf.height * pages)
|
||||
# Cycle through pages and stitch it together to one big file
|
||||
for p in range(pages):
|
||||
log.debug("Stitching page {}".format(p+1))
|
||||
image = img.composite(pdf.sequence[p], top=pdf.height * p, left=0)
|
||||
# Create a png blob
|
||||
image = img.make_blob('png')
|
||||
log.debug("Final image size is {}x{}".format(pdf.width, pdf.height*(p+1)))
|
||||
else:
|
||||
image = document
|
||||
|
||||
image_file = BytesIO(image)
|
||||
image_file.seek(0)
|
||||
ocrized = image_to_string(Image.open(image_file))
|
||||
|
||||
try:
|
||||
im = Image.open(image_file)
|
||||
except IOError:
|
||||
misperrors['error'] = "Corrupt or not an image file."
|
||||
return misperrors
|
||||
|
||||
|
||||
ocrized = image_to_string(im)
|
||||
|
||||
freetext = {}
|
||||
freetext['values'] = ocrized
|
||||
freetext['types'] = ['freetext']
|
||||
|
|
Loading…
Reference in New Issue