misp-modules/misp_modules/modules/import_mod/ocr.py

import json
import base64
import magic

from PIL import Image

from wand.image import Image as WImage

from pytesseract import image_to_string
from io import BytesIO
# Error payload; handler() overwrites 'error' and returns this dict on failure.
misperrors = {'error': 'Error'}
# Reported as-is under 'userConfig' by introspection().
userConfig = {}

# Reported under 'inputSource' by introspection().
inputSource = ['file']

# Module metadata returned by version().
moduleinfo = {
    'version': '0.2',
    'author': 'Alexandre Dulaunoy',
    'description': 'Optical Character Recognition (OCR) module for MISP',
    'module-type': ['import'],
}

# Attached to moduleinfo under the 'config' key by version().
moduleconfig = []


def handler(q=False):
    """Run OCR over a base64-encoded image or PDF and return the text.

    Args:
        q: JSON string whose "data" member holds the base64-encoded
            document, or False when no request was supplied.

    Returns:
        False when q is False. The module-level misperrors dict (with
        'error' set) when the payload cannot be opened as an image.
        Otherwise {'results': [{'values': <text>, 'types': ['freetext']}]}.
    """
    if q is False:
        return False
    request = json.loads(q)
    document = base64.b64decode(request["data"])

    # endswith() instead of split("/")[1] so an unexpected MIME string
    # without a slash cannot raise IndexError.
    if magic.from_buffer(document, mime=True).endswith("/pdf"):
        print("PDF Detected")
        with WImage(blob=document) as pdf:
            pages = len(pdf.sequence)
            # Stitch all pages onto one tall canvas. Offsets are multiples of
            # pdf.height, so pages are assumed to share the same height.
            with WImage(width=pdf.width, height=pdf.height * pages) as img:
                for p in range(pages):
                    img.composite(pdf.sequence[p], top=pdf.height * p, left=0)
                # Hand the stitched result on as PNG bytes; previously the
                # canvas was discarded and the raw PDF bytes were OCR'd.
                image = img.make_blob('png')
    else:
        image = document

    try:
        # pytesseract consumes PIL images, so open the bytes with PIL rather
        # than wrapping them in a wand image.
        im = Image.open(BytesIO(image))
    except IOError:
        misperrors['error'] = "Corrupt or not an image file."
        return misperrors

    ocrized = image_to_string(im)

    return {'results': [{'values': ocrized, 'types': ['freetext']}]}


def introspection():
    """Report the optional module settings that this module defines.

    Returns:
        A dict holding 'userConfig' and/or 'inputSource' (in that order),
        each present only when a module-level name of that spelling exists.
    """
    module_scope = globals()
    return {
        name: module_scope[name]
        for name in ('userConfig', 'inputSource')
        if name in module_scope
    }


def version():
    """Return the module metadata with the module configuration attached.

    Stores moduleconfig under the 'config' key of the module-level
    moduleinfo dict (mutating it in place) and returns that same dict.
    """
    moduleinfo.update(config=moduleconfig)
    return moduleinfo

if __name__ == '__main__':
    # Manual smoke test: feed the request stored in test.json to handler().
    # The context manager guarantees the file is closed after reading.
    with open('test.json', 'r') as request_file:
        payload = request_file.read()
    handler(q=payload)
First version of an Optical Character Recognition (OCR) module for MISP 2016-08-04 14:32:50 +02:00			`import json`
			`import base64`
- Added initial PDF support, nothing is processed yet - Test to replace PIL with wand 2018-06-28 10:00:14 +02:00			`import magic`
First version of an Optical Character Recognition (OCR) module for MISP 2016-08-04 14:32:50 +02:00
Fix: types array 2016-08-04 18:54:21 +02:00			`from PIL import Image`
First version of an Optical Character Recognition (OCR) module for MISP 2016-08-04 14:32:50 +02:00
- Added initial PDF support, nothing is processed yet - Test to replace PIL with wand 2018-06-28 10:00:14 +02:00			`from wand.image import Image as WImage`

First version of an Optical Character Recognition (OCR) module for MISP 2016-08-04 14:32:50 +02:00			`from pytesseract import image_to_string`
			`from io import BytesIO`
			`misperrors = {'error': 'Error'}`
			`userConfig = { };`

			`inputSource = ['file']`

- Added initial PDF support, nothing is processed yet - Test to replace PIL with wand 2018-06-28 10:00:14 +02:00			`moduleinfo = {'version': '0.2', 'author': 'Alexandre Dulaunoy',`
First version of an Optical Character Recognition (OCR) module for MISP 2016-08-04 14:32:50 +02:00			`'description': 'Optical Character Recognition (OCR) module for MISP',`
			`'module-type': ['import']}`

			`moduleconfig = []`


			`def handler(q=False):`
			`if q is False:`
			`return False`
			`r = {'results': []}`
			`request = json.loads(q)`
- Added initial PDF support, nothing is processed yet - Test to replace PIL with wand 2018-06-28 10:00:14 +02:00			`document = base64.b64decode(request["data"])`
			`if magic.from_buffer(document, mime=True).split("/")[1] == 'pdf':`
			`print("PDF Detected")`
			`with WImage(blob=document) as pdf:`
			`pages=len(pdf.sequence)`
			`img = WImage(width=pdf.width, height=pdf.height * pages)`
			`for p in range(pages):`
- fixed typo move image back in scope 2018-06-28 10:59:03 +02:00			`img.composite(pdf.sequence[p], top=pdf.height * p, left=0)`
			`image = document`
- Added initial PDF support, nothing is processed yet - Test to replace PIL with wand 2018-06-28 10:00:14 +02:00
First version of an Optical Character Recognition (OCR) module for MISP 2016-08-04 14:32:50 +02:00			`image_file = BytesIO(image)`
Fix: types array 2016-08-04 18:54:21 +02:00			`image_file.seek(0)`
- Added initial PDF support, nothing is processed yet - Test to replace PIL with wand 2018-06-28 10:00:14 +02:00
			`try:`
			`im = WImage(blob=image_file)`
			`except IOError:`
			`misperrors['error'] = "Corrupt or not an image file."`
			`return misperrors`


			`ocrized = image_to_string(im)`

First version of an Optical Character Recognition (OCR) module for MISP 2016-08-04 14:32:50 +02:00			`freetext = {}`
			`freetext['values'] = ocrized`
Fix: types array 2016-08-04 18:54:21 +02:00			`freetext['types'] = ['freetext']`
First version of an Optical Character Recognition (OCR) module for MISP 2016-08-04 14:32:50 +02:00			`r['results'].append(freetext)`
			`return r`


			`def introspection():`
			`modulesetup = {}`
			`try:`
			`userConfig`
			`modulesetup['userConfig'] = userConfig`
			`except NameError:`
			`pass`
			`try:`
			`inputSource`
			`modulesetup['inputSource'] = inputSource`
			`except NameError:`
			`pass`
			`return modulesetup`


			`def version():`
			`moduleinfo['config'] = moduleconfig`
			`return moduleinfo`

			`if __name__ == '__main__':`
			`x = open('test.json', 'r')`
			`handler(q=x.read())`