mirror of https://github.com/MISP/misp-modules
				
				
				
			
		
			
				
	
	
		
			113 lines
		
	
	
		
			3.1 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
			
		
		
	
	
			113 lines
		
	
	
		
			3.1 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
| import sys
 | |
| import json
 | |
| import base64
 | |
| from io import BytesIO
 | |
| 
 | |
| import logging
 | |
| 
 | |
# Module-level state shared by handler(), introspection() and version().
misperrors = {'error': 'Error'}
userConfig = {}
inputSource = ['file']
moduleinfo = {
    'version': '0.2',
    'author': 'Alexandre Dulaunoy',
    'description': 'Optical Character Recognition (OCR) module for MISP',
    'module-type': ['import'],
}
moduleconfig = []

# Dedicated 'ocr' logger that writes DEBUG-and-above records to stdout.
log = logging.getLogger('ocr')
log.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)
log.addHandler(ch)
 | |
| 
 | |
| 
 | |
def handler(q=False):
    """Run OCR over a base64-encoded image or PDF taken from a JSON request.

    The request is a JSON string whose ``"data"`` member holds the
    base64-encoded document. A PDF is first rendered into a single PNG by
    stacking all of its pages vertically; any other input is handed to
    Pillow as-is. The recognised text is returned as one ``freetext`` result.

    Args:
        q: JSON request string, or ``False`` when no request is supplied.

    Returns:
        ``False`` when ``q`` is ``False``; the shared ``misperrors`` dict
        (with its ``'error'`` message set) when a required package is
        missing or Pillow cannot open the image; otherwise
        ``{'results': [{'values': <text>, 'types': ['freetext']}]}``.
    """
    # The OCR stack is optional: import lazily and report a readable error
    # instead of failing at module import time.
    try:
        from PIL import Image
    except ImportError:
        misperrors['error'] = "Please pip(3) install pillow"
        return misperrors

    try:
        # Official ImageMagick module
        from wand.image import Image as WImage
    except ImportError:
        misperrors['error'] = "Please pip(3) install wand"
        return misperrors

    try:
        from pytesseract import image_to_string
    except ImportError:
        misperrors['error'] = "Please pip(3) install pytesseract"
        return misperrors

    if q is False:
        return False

    r = {'results': []}
    request = json.loads(q)
    # Decode once; the raw bytes are reused for the non-PDF path.
    raw = base64.b64decode(request["data"])

    # Context managers release the underlying ImageMagick resources on every
    # path, including the non-PDF one and when an exception is raised.
    with WImage(blob=raw) as document:
        if document.format == 'PDF':
            pages = len(document.sequence)
            log.debug("PDF with {} page(s) detected".format(pages))
            # The canvas is as tall as all pages together. With huge PDFs
            # this will overflow, break, consume silly memory etc.
            total_height = document.height * pages
            with WImage(width=document.width, height=total_height) as img:
                # Stitch the pages together, one below the other.
                for p in range(pages):
                    log.debug("Stitching page {}".format(p + 1))
                    img.composite(document.sequence[p], top=document.height * p, left=0)
                image = img.make_blob('png')
            log.debug("Final image size is {}x{}".format(document.width, total_height))
        else:
            image = raw

    # A freshly created BytesIO is already positioned at offset 0.
    image_file = BytesIO(image)

    try:
        im = Image.open(image_file)
    except IOError:
        misperrors['error'] = "Corrupt or not an image file."
        return misperrors

    ocrized = image_to_string(im)

    r['results'].append({'values': ocrized, 'types': ['freetext']})
    return r
 | |
| 
 | |
| 
 | |
def introspection():
    """Return the optional module settings that are defined in this module.

    The result holds ``'userConfig'`` and/or ``'inputSource'`` for whichever
    of those module-level names exist; a missing name is silently skipped.
    """
    setup = {}

    # Looking the name up inside the assignment raises NameError when the
    # global is absent, in which case the key is simply left out.
    try:
        setup['userConfig'] = userConfig
    except NameError:
        pass

    try:
        setup['inputSource'] = inputSource
    except NameError:
        pass

    return setup
 | |
| 
 | |
| 
 | |
def version():
    """Return ``moduleinfo`` after storing ``moduleconfig`` under its ``'config'`` key."""
    moduleinfo.update({'config': moduleconfig})
    return moduleinfo
 | |
| 
 | |
| 
 | |
if __name__ == '__main__':
    # Manual run: feed the request stored in test.json to the handler.
    # The file is read inside a context manager so the handle is closed
    # before the (potentially long) OCR run starts.
    with open('test.json', 'r') as test_file:
        query = test_file.read()
    handler(q=query)
 |