2018-06-30 05:58:44 +02:00
import sys
2016-08-04 14:32:50 +02:00
import json
import base64
from io import BytesIO
2018-06-29 18:58:25 +02:00
2018-06-30 05:52:12 +02:00
import logging

# Console output for the module: the handler writes to standard output and
# renders each record as "time - logger name - level - message".
formatter = logging.Formatter(' %(asctime)s - %(name)s - %(levelname)s - %(message)s ')
ch = logging.StreamHandler(stream=sys.stdout)
ch.setFormatter(formatter)
ch.setLevel(logging.DEBUG)

# Module logger; both the logger and its handler let DEBUG records through.
log = logging.getLogger(' ocr ')
log.addHandler(ch)
log.setLevel(logging.DEBUG)
2016-08-04 14:32:50 +02:00
# Shared error payload: handler() overwrites the message and returns this dict.
misperrors = {" error ": " Error "}

# No per-user settings are declared for this module.
userConfig = dict()

# Kind of input the module accepts.
inputSource = [" file "]

# Static descriptive metadata; version() returns this dict after attaching
# the configuration list to it.
moduleinfo = {
    " version ": " 0.2 ",
    " author ": " Alexandre Dulaunoy ",
    " description ": " Optical Character Recognition (OCR) module for MISP. ",
    " module-type ": [" import "],
    " name ": " OCR Import ",
    " logo ": " ",
    " requirements ": [],
    " features ": (
        " The module tries to recognize some text from an image and import "
        "the result as a freetext attribute, there is then no special feature "
        "asked to users to make it work. "
    ),
    " references ": [],
    " input ": " Image ",
    " output ": " freetext MISP attribute ",
}

# The module declares no configuration options.
moduleconfig = list()
def handler(q=False):
    """Run OCR on an uploaded image or PDF and return the text as freetext.

    Args:
        q: JSON document (string) whose " data " entry holds the
            base64-encoded file content, or ``False`` when no request is
            supplied.

    Returns:
        ``False`` when ``q`` is ``False``; the shared ``misperrors`` dict
        (with its message replaced) when a required package is missing or
        Pillow cannot open the image; otherwise a dict with one result entry
        carrying the recognised text and the freetext type.

    Exceptions raised by Wand while loading the blob are not caught here and
    propagate to the caller.
    """
    # NOTE(review): the string literals in this file carry leading/trailing
    # spaces (e.g. " data ", ' error '). They are kept as found because they
    # are part of the module's observable behaviour - confirm with callers.

    # Try to import the optional dependencies and report which one is missing.
    try:
        from PIL import Image
    except ImportError:
        misperrors[' error '] = " Please pip(3) install pillow "
        return misperrors
    try:
        # Official ImageMagick module
        from wand.image import Image as WImage
    except ImportError:
        misperrors[' error '] = " Please pip(3) install wand "
        return misperrors
    try:
        from pytesseract import image_to_string
    except ImportError:
        misperrors[' error '] = " Please pip(3) install pytesseract "
        return misperrors

    if q is False:
        return False

    r = {' results ': []}
    request = json.loads(q)
    # Decode once; the raw bytes are reused for the non-PDF path.
    raw = base64.b64decode(request[" data "])

    # The context manager releases the ImageMagick resources on every path,
    # not only when the input turns out to be a PDF.
    with WImage(blob=raw) as document:
        # Wand reports the format name without padding, so the comparison
        # must use 'PDF' for this branch to be reachable.
        if document.format == 'PDF':
            # Get number of pages
            pages = len(document.sequence)
            log.debug(" PDF with {} page(s) detected ".format(pages))
            # Create a new image whose height is one page height per page.
            # With huge PDFs this will overflow, break, consume silly
            # memory etc.
            with WImage(width=document.width, height=document.height * pages) as img:
                # Cycle through pages and stitch them into one tall image.
                for p in range(pages):
                    log.debug(" Stitching page {} ".format(p + 1))
                    # composite() draws in place; it has no useful result.
                    img.composite(document.sequence[p], top=document.height * p, left=0)
                # Create a png blob
                image = img.make_blob(' png ')
            # Use the page count rather than the loop variable, which is
            # unbound when the PDF has no pages.
            log.debug(" Final image size is {} x {} ".format(document.width, document.height * pages))
        else:
            image = raw

    # A fresh BytesIO already starts at offset 0, so no seek is needed.
    image_file = BytesIO(image)
    try:
        im = Image.open(image_file)
    except IOError:
        misperrors[' error '] = " Corrupt or not an image file. "
        return misperrors

    ocrized = image_to_string(im)

    freetext = {}
    freetext[' values '] = ocrized
    freetext[' types '] = [' freetext ']
    r[' results '].append(freetext)
    return r
def introspection():
    """Describe the module's optional setup values.

    Returns a dict holding ``userConfig`` and ``inputSource`` under their
    respective keys; a name that is not defined is simply left out.
    """
    modulesetup = {}

    # Looking the name up directly raises NameError when it is undefined,
    # in which case the entry is skipped.
    try:
        modulesetup[' userConfig '] = userConfig
    except NameError:
        pass

    try:
        modulesetup[' inputSource '] = inputSource
    except NameError:
        pass

    return modulesetup
def version():
    """Return the module metadata with the configuration list attached.

    Stores ``moduleconfig`` under the ' config ' key of the shared
    ``moduleinfo`` dict (updating it in place) and returns that same dict.
    """
    moduleinfo.update({' config ': moduleconfig})
    return moduleinfo
2018-12-11 15:29:09 +01:00
2016-08-04 14:32:50 +02:00
# Manual smoke test: feed a request stored on disk to the handler.
# The guard compares against '__main__' (no padding), which is the value
# Python assigns to __name__ when the file is run as a script.
if __name__ == '__main__':
    # NOTE(review): the file name literal is kept as found, including its
    # surrounding spaces - confirm the intended path.
    # 'r' is the valid text-read mode; the context manager closes the file.
    with open(' test.json ', 'r') as x:
        handler(q=x.read())