misp-modules/misp_modules/modules/import_mod/ocr.py

import sys
import json
import base64
from io import BytesIO

import logging

# Dedicated logger for this module, named 'ocr'; it accepts records from DEBUG upwards.
log = logging.getLogger('ocr')
log.setLevel(logging.DEBUG)
# Stream handler that writes the records to stdout, also from DEBUG upwards.
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.DEBUG)
# Each record is rendered as: timestamp - logger name - level - message.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
log.addHandler(ch)

# Error payload returned by handler(); handler() overwrites the 'error' entry
# with a specific message before returning this same dict.
misperrors = {'error': 'Error'}
# Reported under the 'userConfig' key by introspection(); empty for this module.
userConfig = {}

# Reported under the 'inputSource' key by introspection().
inputSource = ['file']

# Static description of the module. version() returns this dict after storing
# moduleconfig under its 'config' key.
moduleinfo = {
    'version': '0.2',
    'author': 'Alexandre Dulaunoy',
    'description': 'Optical Character Recognition (OCR) module for MISP.',
    'module-type': ['import'],
    'name': 'OCR Import',
    'logo': '',
    'requirements': [],
    'features': 'The module tries to recognize some text from an image and import the result as a freetext attribute, there is then no special feature asked to users to make it work.',
    'references': [],
    'input': 'Image',
    'output': 'freetext MISP attribute',
}

# Configuration entries that version() publishes as moduleinfo['config'];
# this module declares none.
moduleconfig = []


def _stitch_pdf_pages(pdf, canvas_cls):
    """Stack every page of *pdf* vertically and return the result as PNG bytes.

    Args:
        pdf: An open wand image whose ``sequence`` holds the PDF pages.
        canvas_cls: The wand ``Image`` class. It is passed in because wand is
            only imported inside ``handler``.

    Returns:
        The PNG blob of a single image containing all pages, top to bottom.
    """
    pages = len(pdf.sequence)
    log.debug("PDF with {} page(s) detected".format(pages))
    # The canvas is as tall as all pages together. With huge PDFs this can
    # overflow, break or consume a silly amount of memory.
    with canvas_cls(width=pdf.width, height=pdf.height * pages) as canvas:
        for index in range(pages):
            log.debug("Stitching page {}".format(index + 1))
            # composite() draws onto the canvas in place; nothing to keep.
            canvas.composite(pdf.sequence[index], top=pdf.height * index, left=0)
        blob = canvas.make_blob('png')
    log.debug("Final image size is {}x{}".format(pdf.width, pdf.height * pages))
    return blob


def handler(q=False):
    """Run OCR on a base64-encoded image or PDF and return the text as freetext.

    Args:
        q: JSON string whose ``data`` entry holds the base64-encoded file, or
            ``False`` when no request is given.

    Returns:
        * the shared ``misperrors`` dict, with ``error`` set, when pillow,
          wand or pytesseract cannot be imported or when the payload cannot
          be read as an image;
        * ``False`` when ``q`` is ``False``;
        * otherwise ``{'results': [{'values': <text>, 'types': ['freetext']}]}``.
    """
    # Import the optional dependencies here so a missing package is reported
    # as a module error instead of breaking the import of this file.
    try:
        from PIL import Image
    except ImportError:
        misperrors['error'] = "Please pip(3) install pillow"
        return misperrors

    try:
        # Official ImageMagick module
        from wand.exceptions import WandException
        from wand.image import Image as WImage
    except ImportError:
        misperrors['error'] = "Please pip(3) install wand"
        return misperrors

    try:
        from pytesseract import image_to_string
    except ImportError:
        misperrors['error'] = "Please pip(3) install pytesseract"
        return misperrors

    if q is False:
        return False

    request = json.loads(q)
    # Decode once; the raw bytes are reused as-is for non-PDF input.
    raw = base64.b64decode(request["data"])

    try:
        # wand detects the format. The context manager releases the
        # ImageMagick resources for PDF and non-PDF input alike.
        with WImage(blob=raw) as document:
            if document.format == 'PDF':
                image = _stitch_pdf_pages(document, WImage)
            else:
                image = raw
    except WandException as err:
        # Input that ImageMagick cannot read is reported like any other
        # unreadable image instead of escaping as an exception.
        log.debug("wand could not process the input: {}".format(err))
        misperrors['error'] = "Corrupt or not an image file."
        return misperrors

    try:
        im = Image.open(BytesIO(image))
    except IOError:
        misperrors['error'] = "Corrupt or not an image file."
        return misperrors

    ocrized = image_to_string(im)

    return {'results': [{'values': ocrized, 'types': ['freetext']}]}


def introspection():
    """Collect the optional module-level settings that are defined.

    Returns:
        A dict with a ``userConfig`` and/or ``inputSource`` entry for each of
        those module-level names that exists; a missing name is left out.
    """
    # Each lookup is deferred so that a missing global raises NameError only
    # when it is actually evaluated inside the try block below.
    optional_settings = (
        ('userConfig', lambda: userConfig),
        ('inputSource', lambda: inputSource),
    )
    setup = {}
    for key, lookup in optional_settings:
        try:
            setup[key] = lookup()
        except NameError:
            # The module does not declare this setting; skip it.
            continue
    return setup


def version():
    """Return the module description with its configuration attached.

    Stores ``moduleconfig`` under the ``config`` key of the module-level
    ``moduleinfo`` dict (mutating it in place) and returns that same dict.
    """
    moduleinfo.update(config=moduleconfig)
    return moduleinfo


if __name__ == '__main__':
    # Manual smoke run: feed the request stored in test.json to the handler.
    # The context manager closes the file even if reading it fails.
    with open('test.json', 'r') as x:
        payload = x.read()
    handler(q=payload)
- Forgot to import sys 2018-06-30 05:58:44 +02:00			`import sys`
First version of an Optical Character Recognition (OCR) module for MISP 2016-08-04 14:32:50 +02:00			`import json`
			`import base64`
			`from io import BytesIO`
- Some more comments - Removed libmagic, wand can handle it better 2018-06-29 18:58:25 +02:00
- Added logger functionality for debug sessions 2018-06-30 05:52:12 +02:00			`import logging`

			`log = logging.getLogger('ocr')`
			`log.setLevel(logging.DEBUG)`
			`ch = logging.StreamHandler(sys.stdout)`
			`ch.setLevel(logging.DEBUG)`
			`formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')`
			`ch.setFormatter(formatter)`
			`log.addHandler(ch)`

First version of an Optical Character Recognition (OCR) module for MISP 2016-08-04 14:32:50 +02:00			`misperrors = {'error': 'Error'}`
fix: Make pep8 happy 2018-12-11 15:29:09 +01:00			`userConfig = {}`
First version of an Optical Character Recognition (OCR) module for MISP 2016-08-04 14:32:50 +02:00
			`inputSource = ['file']`

chg: [doc] Big doc revamp #680 2024-08-12 11:23:10 +02:00			`moduleinfo = {`
			`'version': '0.2',`
			`'author': 'Alexandre Dulaunoy',`
			`'description': 'Optical Character Recognition (OCR) module for MISP.',`
			`'module-type': ['import'],`
			`'name': 'OCR Import',`
			`'logo': '',`
			`'requirements': [],`
			`'features': 'The module tries to recognize some text from an image and import the result as a freetext attribute, there is then no special feature asked to users to make it work.',`
			`'references': [],`
			`'input': 'Image',`
			`'output': 'freetext MISP attribute',`
			`}`
First version of an Optical Character Recognition (OCR) module for MISP 2016-08-04 14:32:50 +02:00
			`moduleconfig = []`


			`def handler(q=False):`
- added wand requirement - fixed missing return png byte-stream - move module import to handler to catch and report errorz 2018-06-28 17:20:38 +02:00			`# try to import modules and return errors if module not found`
			`try:`
			`from PIL import Image`
			`except ImportError:`
			`misperrors['error'] = "Please pip(3) install pillow"`
			`return misperrors`

			`try:`
			`# Official ImageMagick module`
			`from wand.image import Image as WImage`
			`except ImportError:`
			`misperrors['error'] = "Please pip(3) install wand"`
			`return misperrors`

			`try:`
			`from pytesseract import image_to_string`
			`except ImportError:`
			`misperrors['error'] = "Please pip(3) install pytesseract"`
			`return misperrors`

First version of an Optical Character Recognition (OCR) module for MISP 2016-08-04 14:32:50 +02:00			`if q is False:`
			`return False`
			`r = {'results': []}`
			`request = json.loads(q)`
- Added initial PDF support, nothing is processed yet - Test to replace PIL with wand 2018-06-28 10:00:14 +02:00			`document = base64.b64decode(request["data"])`
- Some more comments - Removed libmagic, wand can handle it better 2018-06-29 18:58:25 +02:00			`document = WImage(blob=document)`
			`if document.format == 'PDF':`
- content was already a wand.obj 2018-06-30 05:38:26 +02:00			`with document as pdf:`
- Some more comments - Removed libmagic, wand can handle it better 2018-06-29 18:58:25 +02:00			`# Get number of pages`
fix: Make pep8 happy 2018-12-11 15:29:09 +01:00			`pages = len(pdf.sequence)`
- Reverted to <3.6 compatibility 2018-07-01 16:08:42 +02:00			`log.debug("PDF with {} page(s) detected".format(pages))`
- Some more comments - Removed libmagic, wand can handle it better 2018-06-29 18:58:25 +02:00			`# Create new image object where the height will be the number of pages. With huge PDFs this will overflow, break, consume silly memory etc…`
- Added initial PDF support, nothing is processed yet - Test to replace PIL with wand 2018-06-28 10:00:14 +02:00			`img = WImage(width=pdf.width, height=pdf.height * pages)`
- Some more comments - Removed libmagic, wand can handle it better 2018-06-29 18:58:25 +02:00			`# Cycle through pages and stitch it together to one big file`
- Added initial PDF support, nothing is processed yet - Test to replace PIL with wand 2018-06-28 10:00:14 +02:00			`for p in range(pages):`
fix: Make pep8 happy 2018-12-11 15:29:09 +01:00			`log.debug("Stitching page {}".format(p + 1))`
- added wand requirement - fixed missing return png byte-stream - move module import to handler to catch and report errorz 2018-06-28 17:20:38 +02:00			`image = img.composite(pdf.sequence[p], top=pdf.height * p, left=0)`
- Some more comments - Removed libmagic, wand can handle it better 2018-06-29 18:58:25 +02:00			`# Create a png blob`
- added wand requirement - fixed missing return png byte-stream - move module import to handler to catch and report errorz 2018-06-28 17:20:38 +02:00			`image = img.make_blob('png')`
fix: Make pep8 happy 2018-12-11 15:29:09 +01:00			`log.debug("Final image size is {}x{}".format(pdf.width, pdf.height * (p + 1)))`
- added wand requirement - fixed missing return png byte-stream - move module import to handler to catch and report errorz 2018-06-28 17:20:38 +02:00			`else:`
Fix for ocr import Currently works only for .pdf files, with this .png and .jpg should also work (fixes #512) 2022-09-16 10:12:46 +02:00			`image = base64.b64decode(request["data"])`
- Added initial PDF support, nothing is processed yet - Test to replace PIL with wand 2018-06-28 10:00:14 +02:00
First version of an Optical Character Recognition (OCR) module for MISP 2016-08-04 14:32:50 +02:00			`image_file = BytesIO(image)`
Fix: types array 2016-08-04 18:54:21 +02:00			`image_file.seek(0)`
- Added initial PDF support, nothing is processed yet - Test to replace PIL with wand 2018-06-28 10:00:14 +02:00
			`try:`
- added wand requirement - fixed missing return png byte-stream - move module import to handler to catch and report errorz 2018-06-28 17:20:38 +02:00			`im = Image.open(image_file)`
- Added initial PDF support, nothing is processed yet - Test to replace PIL with wand 2018-06-28 10:00:14 +02:00			`except IOError:`
			`misperrors['error'] = "Corrupt or not an image file."`
			`return misperrors`

			`ocrized = image_to_string(im)`

First version of an Optical Character Recognition (OCR) module for MISP 2016-08-04 14:32:50 +02:00			`freetext = {}`
			`freetext['values'] = ocrized`
Fix: types array 2016-08-04 18:54:21 +02:00			`freetext['types'] = ['freetext']`
First version of an Optical Character Recognition (OCR) module for MISP 2016-08-04 14:32:50 +02:00			`r['results'].append(freetext)`
			`return r`


			`def introspection():`
			`modulesetup = {}`
			`try:`
			`userConfig`
			`modulesetup['userConfig'] = userConfig`
			`except NameError:`
			`pass`
			`try:`
			`inputSource`
			`modulesetup['inputSource'] = inputSource`
			`except NameError:`
			`pass`
			`return modulesetup`


			`def version():`
			`moduleinfo['config'] = moduleconfig`
			`return moduleinfo`

fix: Make pep8 happy 2018-12-11 15:29:09 +01:00
First version of an Optical Character Recognition (OCR) module for MISP 2016-08-04 14:32:50 +02:00			`if __name__ == '__main__':`
			`x = open('test.json', 'r')`
			`handler(q=x.read())`