diff --git a/REQUIREMENTS b/REQUIREMENTS index a8baf52..1365ec2 100644 --- a/REQUIREMENTS +++ b/REQUIREMENTS @@ -14,6 +14,7 @@ git+https://github.com/MISP/PyMISP.git#egg=pymisp git+https://github.com/sebdraven/pyonyphe#egg=pyonyphe pillow pytesseract +wand SPARQLWrapper domaintools_api pygeoip diff --git a/misp_modules/__init__.py b/misp_modules/__init__.py index 1c1713b..3bb7253 100644 --- a/misp_modules/__init__.py +++ b/misp_modules/__init__.py @@ -193,7 +193,7 @@ class QueryModule(tornado.web.RequestHandler): if dict_payload.get('timeout'): timeout = datetime.timedelta(seconds=int(dict_payload.get('timeout'))) else: - timeout = datetime.timedelta(seconds=30) + timeout = datetime.timedelta(seconds=300) response = yield tornado.gen.with_timeout(timeout, self.run_request(jsonpayload)) self.write(response) except tornado.gen.TimeoutError: diff --git a/misp_modules/modules/import_mod/ocr.py b/misp_modules/modules/import_mod/ocr.py index aafe653..f14212b 100755 --- a/misp_modules/modules/import_mod/ocr.py +++ b/misp_modules/modules/import_mod/ocr.py @@ -1,16 +1,24 @@ +import sys import json import base64 - -from PIL import Image - -from pytesseract import image_to_string from io import BytesIO + +import logging + +log = logging.getLogger('ocr') +log.setLevel(logging.DEBUG) +ch = logging.StreamHandler(sys.stdout) +ch.setLevel(logging.DEBUG) +formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') +ch.setFormatter(formatter) +log.addHandler(ch) + misperrors = {'error': 'Error'} -userConfig = { }; +userConfig = {}; inputSource = ['file'] -moduleinfo = {'version': '0.1', 'author': 'Alexandre Dulaunoy', +moduleinfo = {'version': '0.2', 'author': 'Alexandre Dulaunoy', 'description': 'Optical Character Recognition (OCR) module for MISP', 'module-type': ['import']} @@ -18,14 +26,61 @@ moduleconfig = [] def handler(q=False): + # try to import modules and return errors if module not found + try: + from PIL import Image + except ImportError: + misperrors['error'] = "Please pip(3) install pillow" + return misperrors + + try: + # Official ImageMagick module + from wand.image import Image as WImage + except ImportError: + misperrors['error'] = "Please pip(3) install wand" + return misperrors + + try: + from pytesseract import image_to_string + except ImportError: + misperrors['error'] = "Please pip(3) install pytesseract" + return misperrors + if q is False: return False r = {'results': []} request = json.loads(q) - image = base64.b64decode(request["data"]) + document = base64.b64decode(request["data"]) + document = WImage(blob=document) + if document.format == 'PDF': + with document as pdf: + # Get number of pages + pages=len(pdf.sequence) + log.debug("PDF with {} page(s) detected".format(pages)) + # Create new image object where the height will be the number of pages. With huge PDFs this will overflow, break, consume silly memory etc… + img = WImage(width=pdf.width, height=pdf.height * pages) + # Cycle through pages and stitch it together to one big file + for p in range(pages): + log.debug("Stitching page {}".format(p+1)) + image = img.composite(pdf.sequence[p], top=pdf.height * p, left=0) + # Create a png blob + image = img.make_blob('png') + log.debug("Final image size is {}x{}".format(pdf.width, pdf.height*(p+1))) + else: + image = document + image_file = BytesIO(image) image_file.seek(0) - ocrized = image_to_string(Image.open(image_file)) + + try: + im = Image.open(image_file) + except IOError: + misperrors['error'] = "Corrupt or not an image file." + return misperrors + + + ocrized = image_to_string(im) + freetext = {} freetext['values'] = ocrized freetext['types'] = ['freetext']