From 59b7688bdcbdc3d1d1e085ec6e8862c0c8585b33 Mon Sep 17 00:00:00 2001 From: Steve Clement Date: Thu, 28 Jun 2018 16:00:14 +0800 Subject: [PATCH 01/11] - Added initial PDF support, nothing is processed yet - Test to replace PIL with wand --- misp_modules/modules/import_mod/ocr.py | 27 +++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/misp_modules/modules/import_mod/ocr.py b/misp_modules/modules/import_mod/ocr.py index aafe653..17d634c 100755 --- a/misp_modules/modules/import_mod/ocr.py +++ b/misp_modules/modules/import_mod/ocr.py @@ -1,8 +1,11 @@ import json import base64 +import magic from PIL import Image +from wand.image import Image as WImage + from pytesseract import image_to_string from io import BytesIO misperrors = {'error': 'Error'} @@ -10,7 +13,7 @@ userConfig = { }; inputSource = ['file'] -moduleinfo = {'version': '0.1', 'author': 'Alexandre Dulaunoy', +moduleinfo = {'version': '0.2', 'author': 'Alexandre Dulaunoy', 'description': 'Optical Character Recognition (OCR) module for MISP', 'module-type': ['import']} @@ -22,10 +25,28 @@ def handler(q=False): return False r = {'results': []} request = json.loads(q) - image = base64.b64decode(request["data"]) + document = base64.b64decode(request["data"]) + if magic.from_buffer(document, mime=True).split("/")[1] == 'pdf': + print("PDF Detected") + with WImage(blob=document) as pdf: + pages=len(pdf.sequence) + img = WImage(width=pdf.width, height=pdf.height * pages) + for p in range(pages): + img.composite(pdf.sequence[p], top=pdf.height * i, left=0) + image = document + image_file = BytesIO(image) image_file.seek(0) - ocrized = image_to_string(Image.open(image_file)) + + try: + im = WImage(blob=image_file) + except IOError: + misperrors['error'] = "Corrupt or not an image file." + return misperrors + + + ocrized = image_to_string(im) + freetext = {} freetext['values'] = ocrized freetext['types'] = ['freetext'] From 7885017981a122761ce1613858ff904115eb10cc Mon Sep 17 00:00:00 2001 From: Steve Clement Date: Thu, 28 Jun 2018 16:59:03 +0800 Subject: [PATCH 02/11] - fixed typo move image back in scope --- misp_modules/modules/import_mod/ocr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/misp_modules/modules/import_mod/ocr.py b/misp_modules/modules/import_mod/ocr.py index 17d634c..0748d35 100755 --- a/misp_modules/modules/import_mod/ocr.py +++ b/misp_modules/modules/import_mod/ocr.py @@ -32,8 +32,8 @@ def handler(q=False): pages=len(pdf.sequence) img = WImage(width=pdf.width, height=pdf.height * pages) for p in range(pages): - img.composite(pdf.sequence[p], top=pdf.height * i, left=0) - image = document + img.composite(pdf.sequence[p], top=pdf.height * p, left=0) + image = document image_file = BytesIO(image) image_file.seek(0) From 60a3fbe28204c5178b2bceed2b3551a27e8c6ce4 Mon Sep 17 00:00:00 2001 From: Steve Clement Date: Thu, 28 Jun 2018 23:20:38 +0800 Subject: [PATCH 03/11] - added wand requirement - fixed missing return png byte-stream - move module import to handler to catch and report errorz --- REQUIREMENTS | 1 + misp_modules/modules/import_mod/ocr.py | 41 +++++++++++++++++++------- 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/REQUIREMENTS b/REQUIREMENTS index 9404855..c116763 100644 --- a/REQUIREMENTS +++ b/REQUIREMENTS @@ -14,6 +14,7 @@ git+https://github.com/MISP/PyMISP.git#egg=pymisp git+https://github.com/sebdraven/pyonyphe#egg=pyonyphe pillow pytesseract +wand SPARQLWrapper domaintools_api pygeoip diff --git a/misp_modules/modules/import_mod/ocr.py b/misp_modules/modules/import_mod/ocr.py index 0748d35..a30bba0 100755 --- a/misp_modules/modules/import_mod/ocr.py +++ b/misp_modules/modules/import_mod/ocr.py @@ -1,12 +1,5 @@ import json import base64 -import magic - -from PIL import Image - -from wand.image import Image as WImage - -from pytesseract import image_to_string from io import BytesIO misperrors = {'error': 'Error'} userConfig = { }; @@ -21,6 +14,32 @@ moduleconfig = [] def handler(q=False): + # try to import modules and return errors if module not found + try: + import magic + except ImportError: + misperrors['error'] = "Please pip(3) install magic" + return misperrors + + try: + from PIL import Image + except ImportError: + misperrors['error'] = "Please pip(3) install pillow" + return misperrors + + try: + # Official ImageMagick module + from wand.image import Image as WImage + except ImportError: + misperrors['error'] = "Please pip(3) install wand" + return misperrors + + try: + from pytesseract import image_to_string + except ImportError: + misperrors['error'] = "Please pip(3) install pytesseract" + return misperrors + if q is False: return False r = {'results': []} @@ -32,14 +51,16 @@ def handler(q=False): pages=len(pdf.sequence) img = WImage(width=pdf.width, height=pdf.height * pages) for p in range(pages): - img.composite(pdf.sequence[p], top=pdf.height * p, left=0) - image = document + image = img.composite(pdf.sequence[p], top=pdf.height * p, left=0) + image = img.make_blob('png') + else: + image = document image_file = BytesIO(image) image_file.seek(0) try: - im = WImage(blob=image_file) + im = Image.open(image_file) except IOError: misperrors['error'] = "Corrupt or not an image file." return misperrors From fbb3617f256d19e272d95e7a6f2c9e745acdfe94 Mon Sep 17 00:00:00 2001 From: Steve Clement Date: Fri, 29 Jun 2018 12:01:17 +0800 Subject: [PATCH 04/11] - Quick comment ToDo: Avoid using Magic in future releases --- misp_modules/modules/import_mod/ocr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/misp_modules/modules/import_mod/ocr.py b/misp_modules/modules/import_mod/ocr.py index a30bba0..2248306 100755 --- a/misp_modules/modules/import_mod/ocr.py +++ b/misp_modules/modules/import_mod/ocr.py @@ -45,7 +45,7 @@ def handler(q=False): r = {'results': []} request = json.loads(q) document = base64.b64decode(request["data"]) - if magic.from_buffer(document, mime=True).split("/")[1] == 'pdf': + if magic.from_buffer(document, mime=True).split("/")[1] == 'pdf': # Eventually this could be replaced with wand.obj.format print("PDF Detected") with WImage(blob=document) as pdf: pages=len(pdf.sequence) From c7c93b53e8522c6318f9571d1c78d7f5a4b8b25f Mon Sep 17 00:00:00 2001 From: Steve Clement Date: Fri, 29 Jun 2018 12:02:08 +0800 Subject: [PATCH 05/11] - Set tornado timeout to 300 seconds. --- misp_modules/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/misp_modules/__init__.py b/misp_modules/__init__.py index 1c1713b..3bb7253 100644 --- a/misp_modules/__init__.py +++ b/misp_modules/__init__.py @@ -193,7 +193,7 @@ class QueryModule(tornado.web.RequestHandler): if dict_payload.get('timeout'): timeout = datetime.timedelta(seconds=int(dict_payload.get('timeout'))) else: - timeout = datetime.timedelta(seconds=30) + timeout = datetime.timedelta(seconds=300) response = yield tornado.gen.with_timeout(timeout, self.run_request(jsonpayload)) self.write(response) except tornado.gen.TimeoutError: From ef3837077e1e5da2d1f7bf4f40730e10be6aea1f Mon Sep 17 00:00:00 2001 From: Steve Clement Date: Sat, 30 Jun 2018 00:58:25 +0800 Subject: [PATCH 06/11] - Some more comments - Removed libmagic, wand can handle it better --- misp_modules/modules/import_mod/ocr.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/misp_modules/modules/import_mod/ocr.py b/misp_modules/modules/import_mod/ocr.py index 2248306..441adc4 100755 --- a/misp_modules/modules/import_mod/ocr.py +++ b/misp_modules/modules/import_mod/ocr.py @@ -1,8 +1,9 @@ import json import base64 from io import BytesIO + misperrors = {'error': 'Error'} -userConfig = { }; +userConfig = {}; inputSource = ['file'] @@ -15,12 +16,6 @@ moduleconfig = [] def handler(q=False): # try to import modules and return errors if module not found - try: - import magic - except ImportError: - misperrors['error'] = "Please pip(3) install magic" - return misperrors - try: from PIL import Image except ImportError: @@ -45,13 +40,18 @@ def handler(q=False): r = {'results': []} request = json.loads(q) document = base64.b64decode(request["data"]) - if magic.from_buffer(document, mime=True).split("/")[1] == 'pdf': # Eventually this could be replaced with wand.obj.format - print("PDF Detected") + document = WImage(blob=document) + if document.format == 'PDF': with WImage(blob=document) as pdf: + # Get number of pages pages=len(pdf.sequence) + print(f"PDF with {pages} page(s) detected") + # Create new image object where the height will be the number of pages. With huge PDFs this will overflow, break, consume silly memory etc… img = WImage(width=pdf.width, height=pdf.height * pages) + # Cycle through pages and stitch it together to one big file for p in range(pages): image = img.composite(pdf.sequence[p], top=pdf.height * p, left=0) + # Create a png blob image = img.make_blob('png') else: image = document From 2f5dd9928e89fb4adf9c7c1849e003f2e8b9a360 Mon Sep 17 00:00:00 2001 From: Steve Clement Date: Sat, 30 Jun 2018 11:38:26 +0800 Subject: [PATCH 07/11] - content was already a wand.obj --- misp_modules/modules/import_mod/ocr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/misp_modules/modules/import_mod/ocr.py b/misp_modules/modules/import_mod/ocr.py index 441adc4..f37ba9b 100755 --- a/misp_modules/modules/import_mod/ocr.py +++ b/misp_modules/modules/import_mod/ocr.py @@ -42,7 +42,7 @@ def handler(q=False): document = base64.b64decode(request["data"]) document = WImage(blob=document) if document.format == 'PDF': - with WImage(blob=document) as pdf: + with document as pdf: # Get number of pages pages=len(pdf.sequence) print(f"PDF with {pages} page(s) detected") From ffce2aa5cc465823ae558953c2b46fc1fe88cef5 Mon Sep 17 00:00:00 2001 From: Steve Clement Date: Sat, 30 Jun 2018 11:52:12 +0800 Subject: [PATCH 08/11] - Added logger functionality for debug sessions --- misp_modules/modules/import_mod/ocr.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/misp_modules/modules/import_mod/ocr.py b/misp_modules/modules/import_mod/ocr.py index f37ba9b..fc7acf7 100755 --- a/misp_modules/modules/import_mod/ocr.py +++ b/misp_modules/modules/import_mod/ocr.py @@ -2,6 +2,16 @@ import json import base64 from io import BytesIO +import logging + +log = logging.getLogger('ocr') +log.setLevel(logging.DEBUG) +ch = logging.StreamHandler(sys.stdout) +ch.setLevel(logging.DEBUG) +formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') +ch.setFormatter(formatter) +log.addHandler(ch) + misperrors = {'error': 'Error'} userConfig = {}; @@ -45,14 +55,16 @@ def handler(q=False): with document as pdf: # Get number of pages pages=len(pdf.sequence) - print(f"PDF with {pages} page(s) detected") + log.debug(f"PDF with {pages} page(s) detected") # Create new image object where the height will be the number of pages. With huge PDFs this will overflow, break, consume silly memory etc… img = WImage(width=pdf.width, height=pdf.height * pages) # Cycle through pages and stitch it together to one big file for p in range(pages): + log.debug(f"Stitching page {p}") image = img.composite(pdf.sequence[p], top=pdf.height * p, left=0) # Create a png blob image = img.make_blob('png') + log.debug(f"Final image size is {pdf.width}x{pdf.height*p}") else: image = document From 184065cf741818d9a38a2fb885a77b7b441bb02e Mon Sep 17 00:00:00 2001 From: Steve Clement Date: Sat, 30 Jun 2018 11:58:44 +0800 Subject: [PATCH 09/11] - Forgot to import sys --- misp_modules/modules/import_mod/ocr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/misp_modules/modules/import_mod/ocr.py b/misp_modules/modules/import_mod/ocr.py index fc7acf7..b52722f 100755 --- a/misp_modules/modules/import_mod/ocr.py +++ b/misp_modules/modules/import_mod/ocr.py @@ -1,3 +1,4 @@ +import sys import json import base64 from io import BytesIO From 9f0313a97e0cf4e13cf4af580feddfe64591709f Mon Sep 17 00:00:00 2001 From: Steve Clement Date: Sat, 30 Jun 2018 12:01:21 +0800 Subject: [PATCH 10/11] - Fixed log output --- misp_modules/modules/import_mod/ocr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/misp_modules/modules/import_mod/ocr.py b/misp_modules/modules/import_mod/ocr.py index b52722f..15d660b 100755 --- a/misp_modules/modules/import_mod/ocr.py +++ b/misp_modules/modules/import_mod/ocr.py @@ -61,11 +61,11 @@ def handler(q=False): img = WImage(width=pdf.width, height=pdf.height * pages) # Cycle through pages and stitch it together to one big file for p in range(pages): - log.debug(f"Stitching page {p}") + log.debug(f"Stitching page {p+1}") image = img.composite(pdf.sequence[p], top=pdf.height * p, left=0) # Create a png blob image = img.make_blob('png') - log.debug(f"Final image size is {pdf.width}x{pdf.height*p}") + log.debug(f"Final image size is {pdf.width}x{pdf.height*(p+1)}") else: image = document From 549f32547d474d7d3a33651cf5dc8d9ce8a8720b Mon Sep 17 00:00:00 2001 From: Steve Clement Date: Sun, 1 Jul 2018 22:08:42 +0800 Subject: [PATCH 11/11] - Reverted to <3.6 compatibility --- misp_modules/modules/import_mod/ocr.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/misp_modules/modules/import_mod/ocr.py b/misp_modules/modules/import_mod/ocr.py index 15d660b..f14212b 100755 --- a/misp_modules/modules/import_mod/ocr.py +++ b/misp_modules/modules/import_mod/ocr.py @@ -56,16 +56,16 @@ def handler(q=False): with document as pdf: # Get number of pages pages=len(pdf.sequence) - log.debug(f"PDF with {pages} page(s) detected") + log.debug("PDF with {} page(s) detected".format(pages)) # Create new image object where the height will be the number of pages. With huge PDFs this will overflow, break, consume silly memory etc… img = WImage(width=pdf.width, height=pdf.height * pages) # Cycle through pages and stitch it together to one big file for p in range(pages): - log.debug(f"Stitching page {p+1}") + log.debug("Stitching page {}".format(p+1)) image = img.composite(pdf.sequence[p], top=pdf.height * p, left=0) # Create a png blob image = img.make_blob('png') - log.debug(f"Final image size is {pdf.width}x{pdf.height*(p+1)}") + log.debug("Final image size is {}x{}".format(pdf.width, pdf.height*(p+1))) else: image = document