From 59b7688bdcbdc3d1d1e085ec6e8862c0c8585b33 Mon Sep 17 00:00:00 2001
From: Steve Clement <steve@localhost.lu>
Date: Thu, 28 Jun 2018 16:00:14 +0800
Subject: [PATCH 01/11] - Added initial PDF support, nothing is processed yet -
 Test to replace PIL with wand

---
 misp_modules/modules/import_mod/ocr.py | 27 +++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/misp_modules/modules/import_mod/ocr.py b/misp_modules/modules/import_mod/ocr.py
index aafe6538..17d634cd 100755
--- a/misp_modules/modules/import_mod/ocr.py
+++ b/misp_modules/modules/import_mod/ocr.py
@@ -1,8 +1,11 @@
 import json
 import base64
+import magic
 
 from PIL import Image
 
+from wand.image import Image as WImage
+
 from pytesseract import image_to_string
 from io import BytesIO
 misperrors = {'error': 'Error'}
@@ -10,7 +13,7 @@ userConfig = { };
 
 inputSource = ['file']
 
-moduleinfo = {'version': '0.1', 'author': 'Alexandre Dulaunoy',
+moduleinfo = {'version': '0.2', 'author': 'Alexandre Dulaunoy',
               'description': 'Optical Character Recognition (OCR) module for MISP',
               'module-type': ['import']}
 
@@ -22,10 +25,28 @@ def handler(q=False):
         return False
     r = {'results': []}
     request = json.loads(q)
-    image = base64.b64decode(request["data"])
+    document = base64.b64decode(request["data"])
+    if magic.from_buffer(document, mime=True).split("/")[1] == 'pdf':
+        print("PDF Detected")
+        with WImage(blob=document) as pdf:
+            pages=len(pdf.sequence)
+            img = WImage(width=pdf.width, height=pdf.height * pages)
+            for p in range(pages):
+                img.composite(pdf.sequence[p], top=pdf.height * i, left=0)
+        image = document
+
     image_file = BytesIO(image)
     image_file.seek(0)
-    ocrized = image_to_string(Image.open(image_file))
+
+    try:
+        im = WImage(blob=image_file)
+    except IOError:
+        misperrors['error'] = "Corrupt or not an image file."
+        return misperrors
+
+
+    ocrized = image_to_string(im)
+
     freetext = {}
     freetext['values'] = ocrized
     freetext['types'] = ['freetext']

From 7885017981a122761ce1613858ff904115eb10cc Mon Sep 17 00:00:00 2001
From: Steve Clement <steve@localhost.lu>
Date: Thu, 28 Jun 2018 16:59:03 +0800
Subject: [PATCH 02/11] - fixed typo, moved image back in scope

---
 misp_modules/modules/import_mod/ocr.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/misp_modules/modules/import_mod/ocr.py b/misp_modules/modules/import_mod/ocr.py
index 17d634cd..0748d356 100755
--- a/misp_modules/modules/import_mod/ocr.py
+++ b/misp_modules/modules/import_mod/ocr.py
@@ -32,8 +32,8 @@ def handler(q=False):
             pages=len(pdf.sequence)
             img = WImage(width=pdf.width, height=pdf.height * pages)
             for p in range(pages):
-                img.composite(pdf.sequence[p], top=pdf.height * i, left=0)
-        image = document
+                img.composite(pdf.sequence[p], top=pdf.height * p, left=0)
+    image = document
 
     image_file = BytesIO(image)
     image_file.seek(0)

From 60a3fbe28204c5178b2bceed2b3551a27e8c6ce4 Mon Sep 17 00:00:00 2001
From: Steve Clement <steve@localhost.lu>
Date: Thu, 28 Jun 2018 23:20:38 +0800
Subject: [PATCH 03/11] - added wand requirement - fixed missing return png
 byte-stream - move module import to handler to catch and report errors

---
 REQUIREMENTS                           |  1 +
 misp_modules/modules/import_mod/ocr.py | 41 +++++++++++++++++++-------
 2 files changed, 32 insertions(+), 10 deletions(-)

diff --git a/REQUIREMENTS b/REQUIREMENTS
index 94048554..c1167637 100644
--- a/REQUIREMENTS
+++ b/REQUIREMENTS
@@ -14,6 +14,7 @@ git+https://github.com/MISP/PyMISP.git#egg=pymisp
 git+https://github.com/sebdraven/pyonyphe#egg=pyonyphe
 pillow
 pytesseract
+wand
 SPARQLWrapper
 domaintools_api
 pygeoip
diff --git a/misp_modules/modules/import_mod/ocr.py b/misp_modules/modules/import_mod/ocr.py
index 0748d356..a30bba0e 100755
--- a/misp_modules/modules/import_mod/ocr.py
+++ b/misp_modules/modules/import_mod/ocr.py
@@ -1,12 +1,5 @@
 import json
 import base64
-import magic
-
-from PIL import Image
-
-from wand.image import Image as WImage
-
-from pytesseract import image_to_string
 from io import BytesIO
 misperrors = {'error': 'Error'}
 userConfig = { };
@@ -21,6 +14,32 @@ moduleconfig = []
 
 
 def handler(q=False):
+    # try to import modules and return errors if module not found
+    try:
+        import magic
+    except ImportError:
+        misperrors['error'] = "Please pip(3) install magic"
+        return misperrors
+
+    try:
+        from PIL import Image
+    except ImportError:
+        misperrors['error'] = "Please pip(3) install pillow"
+        return misperrors
+
+    try:
+        # Official ImageMagick module
+        from wand.image import Image as WImage
+    except ImportError:
+        misperrors['error'] = "Please pip(3) install wand"
+        return misperrors
+
+    try:
+        from pytesseract import image_to_string
+    except ImportError:
+        misperrors['error'] = "Please pip(3) install pytesseract"
+        return misperrors
+
     if q is False:
         return False
     r = {'results': []}
@@ -32,14 +51,16 @@ def handler(q=False):
             pages=len(pdf.sequence)
             img = WImage(width=pdf.width, height=pdf.height * pages)
             for p in range(pages):
-                img.composite(pdf.sequence[p], top=pdf.height * p, left=0)
-    image = document
+                image = img.composite(pdf.sequence[p], top=pdf.height * p, left=0)
+            image = img.make_blob('png')
+    else:
+        image = document
 
     image_file = BytesIO(image)
     image_file.seek(0)
 
     try:
-        im = WImage(blob=image_file)
+        im = Image.open(image_file)
     except IOError:
         misperrors['error'] = "Corrupt or not an image file."
         return misperrors

From fbb3617f256d19e272d95e7a6f2c9e745acdfe94 Mon Sep 17 00:00:00 2001
From: Steve Clement <steve@localhost.lu>
Date: Fri, 29 Jun 2018 12:01:17 +0800
Subject: [PATCH 04/11] - Quick comment ToDo: Avoid using Magic in future
 releases

---
 misp_modules/modules/import_mod/ocr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/misp_modules/modules/import_mod/ocr.py b/misp_modules/modules/import_mod/ocr.py
index a30bba0e..22483063 100755
--- a/misp_modules/modules/import_mod/ocr.py
+++ b/misp_modules/modules/import_mod/ocr.py
@@ -45,7 +45,7 @@ def handler(q=False):
     r = {'results': []}
     request = json.loads(q)
     document = base64.b64decode(request["data"])
-    if magic.from_buffer(document, mime=True).split("/")[1] == 'pdf':
+    if magic.from_buffer(document, mime=True).split("/")[1] == 'pdf': # Eventually this could be replaced with wand.obj.format
         print("PDF Detected")
         with WImage(blob=document) as pdf:
             pages=len(pdf.sequence)

From c7c93b53e8522c6318f9571d1c78d7f5a4b8b25f Mon Sep 17 00:00:00 2001
From: Steve Clement <steve@localhost.lu>
Date: Fri, 29 Jun 2018 12:02:08 +0800
Subject: [PATCH 05/11] - Set tornado timeout to 300 seconds.

---
 misp_modules/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/misp_modules/__init__.py b/misp_modules/__init__.py
index 1c1713b9..3bb72538 100644
--- a/misp_modules/__init__.py
+++ b/misp_modules/__init__.py
@@ -193,7 +193,7 @@ class QueryModule(tornado.web.RequestHandler):
             if dict_payload.get('timeout'):
                 timeout = datetime.timedelta(seconds=int(dict_payload.get('timeout')))
             else:
-                timeout = datetime.timedelta(seconds=30)
+                timeout = datetime.timedelta(seconds=300)
             response = yield tornado.gen.with_timeout(timeout, self.run_request(jsonpayload))
             self.write(response)
         except tornado.gen.TimeoutError:

From ef3837077e1e5da2d1f7bf4f40730e10be6aea1f Mon Sep 17 00:00:00 2001
From: Steve Clement <steve@localhost.lu>
Date: Sat, 30 Jun 2018 00:58:25 +0800
Subject: [PATCH 06/11] - Some more comments - Removed libmagic, wand can
 handle it better

---
 misp_modules/modules/import_mod/ocr.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/misp_modules/modules/import_mod/ocr.py b/misp_modules/modules/import_mod/ocr.py
index 22483063..441adc45 100755
--- a/misp_modules/modules/import_mod/ocr.py
+++ b/misp_modules/modules/import_mod/ocr.py
@@ -1,8 +1,9 @@
 import json
 import base64
 from io import BytesIO
+
 misperrors = {'error': 'Error'}
-userConfig = { };
+userConfig = {};
 
 inputSource = ['file']
 
@@ -15,12 +16,6 @@ moduleconfig = []
 
 def handler(q=False):
     # try to import modules and return errors if module not found
-    try:
-        import magic
-    except ImportError:
-        misperrors['error'] = "Please pip(3) install magic"
-        return misperrors
-
     try:
         from PIL import Image
     except ImportError:
@@ -45,13 +40,18 @@ def handler(q=False):
     r = {'results': []}
     request = json.loads(q)
     document = base64.b64decode(request["data"])
-    if magic.from_buffer(document, mime=True).split("/")[1] == 'pdf': # Eventually this could be replaced with wand.obj.format
-        print("PDF Detected")
+    document = WImage(blob=document)
+    if document.format == 'PDF':
         with WImage(blob=document) as pdf:
+            # Get number of pages
             pages=len(pdf.sequence)
+            print(f"PDF with {pages} page(s) detected")
+            # Create new image object where the height will be the number of pages. With huge PDFs this will overflow, break, consume silly memory etc…
             img = WImage(width=pdf.width, height=pdf.height * pages)
+            # Cycle through pages and stitch it together to one big file
             for p in range(pages):
                 image = img.composite(pdf.sequence[p], top=pdf.height * p, left=0)
+            # Create a png blob
             image = img.make_blob('png')
     else:
         image = document

From 2f5dd9928e89fb4adf9c7c1849e003f2e8b9a360 Mon Sep 17 00:00:00 2001
From: Steve Clement <steve@localhost.lu>
Date: Sat, 30 Jun 2018 11:38:26 +0800
Subject: [PATCH 07/11] - content was already a wand.obj

---
 misp_modules/modules/import_mod/ocr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/misp_modules/modules/import_mod/ocr.py b/misp_modules/modules/import_mod/ocr.py
index 441adc45..f37ba9bb 100755
--- a/misp_modules/modules/import_mod/ocr.py
+++ b/misp_modules/modules/import_mod/ocr.py
@@ -42,7 +42,7 @@ def handler(q=False):
     document = base64.b64decode(request["data"])
     document = WImage(blob=document)
     if document.format == 'PDF':
-        with WImage(blob=document) as pdf:
+        with document as pdf:
             # Get number of pages
             pages=len(pdf.sequence)
             print(f"PDF with {pages} page(s) detected")

From ffce2aa5cc465823ae558953c2b46fc1fe88cef5 Mon Sep 17 00:00:00 2001
From: Steve Clement <steve@localhost.lu>
Date: Sat, 30 Jun 2018 11:52:12 +0800
Subject: [PATCH 08/11] - Added logger functionality for debug sessions

---
 misp_modules/modules/import_mod/ocr.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/misp_modules/modules/import_mod/ocr.py b/misp_modules/modules/import_mod/ocr.py
index f37ba9bb..fc7acf72 100755
--- a/misp_modules/modules/import_mod/ocr.py
+++ b/misp_modules/modules/import_mod/ocr.py
@@ -2,6 +2,16 @@ import json
 import base64
 from io import BytesIO
 
+import logging
+
+log = logging.getLogger('ocr')
+log.setLevel(logging.DEBUG)
+ch = logging.StreamHandler(sys.stdout)
+ch.setLevel(logging.DEBUG)
+formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+ch.setFormatter(formatter)
+log.addHandler(ch)
+
 misperrors = {'error': 'Error'}
 userConfig = {};
 
@@ -45,14 +55,16 @@ def handler(q=False):
         with document as pdf:
             # Get number of pages
             pages=len(pdf.sequence)
-            print(f"PDF with {pages} page(s) detected")
+            log.debug(f"PDF with {pages} page(s) detected")
             # Create new image object where the height will be the number of pages. With huge PDFs this will overflow, break, consume silly memory etc…
             img = WImage(width=pdf.width, height=pdf.height * pages)
             # Cycle through pages and stitch it together to one big file
             for p in range(pages):
+                log.debug(f"Stitching page {p}")
                 image = img.composite(pdf.sequence[p], top=pdf.height * p, left=0)
             # Create a png blob
             image = img.make_blob('png')
+            log.debug(f"Final image size is {pdf.width}x{pdf.height*p}")
     else:
         image = document
 

From 184065cf741818d9a38a2fb885a77b7b441bb02e Mon Sep 17 00:00:00 2001
From: Steve Clement <steve@localhost.lu>
Date: Sat, 30 Jun 2018 11:58:44 +0800
Subject: [PATCH 09/11] - Forgot to import sys

---
 misp_modules/modules/import_mod/ocr.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/misp_modules/modules/import_mod/ocr.py b/misp_modules/modules/import_mod/ocr.py
index fc7acf72..b52722fe 100755
--- a/misp_modules/modules/import_mod/ocr.py
+++ b/misp_modules/modules/import_mod/ocr.py
@@ -1,3 +1,4 @@
+import sys
 import json
 import base64
 from io import BytesIO

From 9f0313a97e0cf4e13cf4af580feddfe64591709f Mon Sep 17 00:00:00 2001
From: Steve Clement <steve@localhost.lu>
Date: Sat, 30 Jun 2018 12:01:21 +0800
Subject: [PATCH 10/11] - Fixed log output

---
 misp_modules/modules/import_mod/ocr.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/misp_modules/modules/import_mod/ocr.py b/misp_modules/modules/import_mod/ocr.py
index b52722fe..15d660bd 100755
--- a/misp_modules/modules/import_mod/ocr.py
+++ b/misp_modules/modules/import_mod/ocr.py
@@ -61,11 +61,11 @@ def handler(q=False):
             img = WImage(width=pdf.width, height=pdf.height * pages)
             # Cycle through pages and stitch it together to one big file
             for p in range(pages):
-                log.debug(f"Stitching page {p}")
+                log.debug(f"Stitching page {p+1}")
                 image = img.composite(pdf.sequence[p], top=pdf.height * p, left=0)
             # Create a png blob
             image = img.make_blob('png')
-            log.debug(f"Final image size is {pdf.width}x{pdf.height*p}")
+            log.debug(f"Final image size is {pdf.width}x{pdf.height*(p+1)}")
     else:
         image = document
 

From 549f32547d474d7d3a33651cf5dc8d9ce8a8720b Mon Sep 17 00:00:00 2001
From: Steve Clement <steve@localhost.lu>
Date: Sun, 1 Jul 2018 22:08:42 +0800
Subject: [PATCH 11/11] - Reverted to <3.6 compatibility

---
 misp_modules/modules/import_mod/ocr.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/misp_modules/modules/import_mod/ocr.py b/misp_modules/modules/import_mod/ocr.py
index 15d660bd..f14212b1 100755
--- a/misp_modules/modules/import_mod/ocr.py
+++ b/misp_modules/modules/import_mod/ocr.py
@@ -56,16 +56,16 @@ def handler(q=False):
         with document as pdf:
             # Get number of pages
             pages=len(pdf.sequence)
-            log.debug(f"PDF with {pages} page(s) detected")
+            log.debug("PDF with {} page(s) detected".format(pages))
             # Create new image object where the height will be the number of pages. With huge PDFs this will overflow, break, consume silly memory etc…
             img = WImage(width=pdf.width, height=pdf.height * pages)
             # Cycle through pages and stitch it together to one big file
             for p in range(pages):
-                log.debug(f"Stitching page {p+1}")
+                log.debug("Stitching page {}".format(p+1))
                 image = img.composite(pdf.sequence[p], top=pdf.height * p, left=0)
             # Create a png blob
             image = img.make_blob('png')
-            log.debug(f"Final image size is {pdf.width}x{pdf.height*(p+1)}")
+            log.debug("Final image size is {}x{}".format(pdf.width, pdf.height*(p+1)))
     else:
         image = document