Merge pull request #199 from SteveClement/master

Added (Multipage) PDF support to OCR Module, minor refactor
2018-07-01 16:47:41 +02:00 · 2018-07-01 16:47:41 +02:00 · cc91b42607
parent ff793bc221 549f32547d
commit cc91b42607
3 changed files with 65 additions and 9 deletions
--- a/1
+++ b/1
@@ -14,6 +14,7 @@ git+https://github.com/MISP/PyMISP.git#egg=pymisp
 git+https://github.com/sebdraven/pyonyphe#egg=pyonyphe
 pillow
 pytesseract
+wand
 SPARQLWrapper
 domaintools_api
 pygeoip
--- a/misp_modules/__init__.py
+++ b/misp_modules/__init__.py
@@ -193,7 +193,7 @@ class QueryModule(tornado.web.RequestHandler):
            if dict_payload.get('timeout'):
                timeout = datetime.timedelta(seconds=int(dict_payload.get('timeout')))
            else:
-                timeout = datetime.timedelta(seconds=30)
+                timeout = datetime.timedelta(seconds=300)
            response = yield tornado.gen.with_timeout(timeout, self.run_request(jsonpayload))
            self.write(response)
        except tornado.gen.TimeoutError:
--- a/misp_modules/modules/import_mod/ocr.py
+++ b/misp_modules/modules/import_mod/ocr.py
@@ -1,16 +1,24 @@
+import sys
 import json
 import base64
-
-from PIL import Image
-
-from pytesseract import image_to_string
 from io import BytesIO
+
+import logging
+
+log = logging.getLogger('ocr')
+log.setLevel(logging.DEBUG)
+ch = logging.StreamHandler(sys.stdout)
+ch.setLevel(logging.DEBUG)
+formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+ch.setFormatter(formatter)
+log.addHandler(ch)
+
 misperrors = {'error': 'Error'}
-userConfig = { };
+userConfig = {};

 inputSource = ['file']

-moduleinfo = {'version': '0.1', 'author': 'Alexandre Dulaunoy',
+moduleinfo = {'version': '0.2', 'author': 'Alexandre Dulaunoy',
              'description': 'Optical Character Recognition (OCR) module for MISP',
              'module-type': ['import']}

@@ -18,14 +26,61 @@ moduleconfig = []


 def handler(q=False):
+    # try to import modules and return errors if module not found
+    try:
+        from PIL import Image
+    except ImportError:
+        misperrors['error'] = "Please pip(3) install pillow"
+        return misperrors
+
+    try:
+        # Official ImageMagick module
+        from wand.image import Image as WImage
+    except ImportError:
+        misperrors['error'] = "Please pip(3) install wand"
+        return misperrors
+
+    try:
+        from pytesseract import image_to_string
+    except ImportError:
+        misperrors['error'] = "Please pip(3) install pytesseract"
+        return misperrors
+
    if q is False:
        return False
    r = {'results': []}
    request = json.loads(q)
-    image = base64.b64decode(request["data"])
+    document = base64.b64decode(request["data"])
+    document = WImage(blob=document)
+    if document.format == 'PDF':
+        with document as pdf:
+            # Get number of pages
+            pages=len(pdf.sequence)
+            log.debug("PDF with {} page(s) detected".format(pages))
+            # Create new image object where the height will be the number of pages. With huge PDFs this will overflow, break, consume silly memory etc…
+            img = WImage(width=pdf.width, height=pdf.height * pages)
+            # Cycle through pages and stitch it together to one big file
+            for p in range(pages):
+                log.debug("Stitching page {}".format(p+1))
+                image = img.composite(pdf.sequence[p], top=pdf.height * p, left=0)
+            # Create a png blob
+            image = img.make_blob('png')
+            log.debug("Final image size is {}x{}".format(pdf.width, pdf.height*(p+1)))
+    else:
+        image = document
+
    image_file = BytesIO(image)
    image_file.seek(0)
-    ocrized = image_to_string(Image.open(image_file))
+
+    try:
+        im = Image.open(image_file)
+    except IOError:
+        misperrors['error'] = "Corrupt or not an image file."
+        return misperrors
+
+
+    ocrized = image_to_string(im)
+
    freetext = {}
    freetext['values'] = ocrized
    freetext['types'] = ['freetext']