mirror of https://github.com/MISP/misp-modules
				
				
				
			
		
			
				
	
	
		
			113 lines
		
	
	
		
			3.1 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
			
		
		
	
	
			113 lines
		
	
	
		
			3.1 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
import sys
 | 
						|
import json
 | 
						|
import base64
 | 
						|
from io import BytesIO
 | 
						|
 | 
						|
import logging
 | 
						|
 | 
						|
log = logging.getLogger('ocr')
 | 
						|
log.setLevel(logging.DEBUG)
 | 
						|
ch = logging.StreamHandler(sys.stdout)
 | 
						|
ch.setLevel(logging.DEBUG)
 | 
						|
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 | 
						|
ch.setFormatter(formatter)
 | 
						|
log.addHandler(ch)
 | 
						|
 | 
						|
misperrors = {'error': 'Error'}
 | 
						|
userConfig = {}
 | 
						|
 | 
						|
inputSource = ['file']
 | 
						|
 | 
						|
moduleinfo = {'version': '0.2', 'author': 'Alexandre Dulaunoy',
 | 
						|
              'description': 'Optical Character Recognition (OCR) module for MISP',
 | 
						|
              'module-type': ['import']}
 | 
						|
 | 
						|
moduleconfig = []
 | 
						|
 | 
						|
 | 
						|
def handler(q=False):
 | 
						|
    # try to import modules and return errors if module not found
 | 
						|
    try:
 | 
						|
        from PIL import Image
 | 
						|
    except ImportError:
 | 
						|
        misperrors['error'] = "Please pip(3) install pillow"
 | 
						|
        return misperrors
 | 
						|
 | 
						|
    try:
 | 
						|
        # Official ImageMagick module
 | 
						|
        from wand.image import Image as WImage
 | 
						|
    except ImportError:
 | 
						|
        misperrors['error'] = "Please pip(3) install wand"
 | 
						|
        return misperrors
 | 
						|
 | 
						|
    try:
 | 
						|
        from pytesseract import image_to_string
 | 
						|
    except ImportError:
 | 
						|
        misperrors['error'] = "Please pip(3) install pytesseract"
 | 
						|
        return misperrors
 | 
						|
 | 
						|
    if q is False:
 | 
						|
        return False
 | 
						|
    r = {'results': []}
 | 
						|
    request = json.loads(q)
 | 
						|
    document = base64.b64decode(request["data"])
 | 
						|
    document = WImage(blob=document)
 | 
						|
    if document.format == 'PDF':
 | 
						|
        with document as pdf:
 | 
						|
            # Get number of pages
 | 
						|
            pages = len(pdf.sequence)
 | 
						|
            log.debug("PDF with {} page(s) detected".format(pages))
 | 
						|
            # Create new image object where the height will be the number of pages. With huge PDFs this will overflow, break, consume silly memory etc…
 | 
						|
            img = WImage(width=pdf.width, height=pdf.height * pages)
 | 
						|
            # Cycle through pages and stitch it together to one big file
 | 
						|
            for p in range(pages):
 | 
						|
                log.debug("Stitching page {}".format(p + 1))
 | 
						|
                image = img.composite(pdf.sequence[p], top=pdf.height * p, left=0)
 | 
						|
            # Create a png blob
 | 
						|
            image = img.make_blob('png')
 | 
						|
            log.debug("Final image size is {}x{}".format(pdf.width, pdf.height * (p + 1)))
 | 
						|
    else:
 | 
						|
        image = document
 | 
						|
 | 
						|
    image_file = BytesIO(image)
 | 
						|
    image_file.seek(0)
 | 
						|
 | 
						|
    try:
 | 
						|
        im = Image.open(image_file)
 | 
						|
    except IOError:
 | 
						|
        misperrors['error'] = "Corrupt or not an image file."
 | 
						|
        return misperrors
 | 
						|
 | 
						|
    ocrized = image_to_string(im)
 | 
						|
 | 
						|
    freetext = {}
 | 
						|
    freetext['values'] = ocrized
 | 
						|
    freetext['types'] = ['freetext']
 | 
						|
    r['results'].append(freetext)
 | 
						|
    return r
 | 
						|
 | 
						|
 | 
						|
def introspection():
 | 
						|
    modulesetup = {}
 | 
						|
    try:
 | 
						|
        userConfig
 | 
						|
        modulesetup['userConfig'] = userConfig
 | 
						|
    except NameError:
 | 
						|
        pass
 | 
						|
    try:
 | 
						|
        inputSource
 | 
						|
        modulesetup['inputSource'] = inputSource
 | 
						|
    except NameError:
 | 
						|
        pass
 | 
						|
    return modulesetup
 | 
						|
 | 
						|
 | 
						|
def version():
 | 
						|
    moduleinfo['config'] = moduleconfig
 | 
						|
    return moduleinfo
 | 
						|
 | 
						|
 | 
						|
if __name__ == '__main__':
 | 
						|
    x = open('test.json', 'r')
 | 
						|
    handler(q=x.read())
 |