diff --git a/misp_modules/modules/expansion/ocr_enrich.py b/misp_modules/modules/expansion/ocr_enrich.py
index cd6baca..4c24cb8 100644
--- a/misp_modules/modules/expansion/ocr_enrich.py
+++ b/misp_modules/modules/expansion/ocr_enrich.py
@@ -6,14 +6,21 @@ import pytesseract
 
 misperrors = {'error': 'Error'}
 mispattributes = {'input': ['attachment'],
-                  'output': ['freetext', 'text']}
-moduleinfo = {'version': '0.1', 'author': 'Sascha Rommelfangen',
+                  'output': ['freetext']}
+moduleinfo = {'version': '0.2', 'author': 'Sascha Rommelfangen',
               'description': 'OCR decoder',
               'module-type': ['expansion']}
 
 moduleconfig = []
 
 
+def filter_decoded(decoded):
+    for line in decoded.split('\n'):
+        decoded_line = line.strip('\t\x0b\x0c\r ')
+        if decoded_line:
+            yield decoded_line
+
+
 def handler(q=False):
     if q is False:
         return False
@@ -31,9 +38,16 @@ def handler(q=False):
     image = img_array
     image = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
     try:
-        decoded = pytesseract.image_to_string(image)
-        return {'results': [{'types': ['freetext'], 'values': decoded, 'comment': "OCR from file " + filename},
-                            {'types': ['text'], 'values': decoded, 'comment': "ORC from file " + filename}]}
+        decoded = pytesseract.image_to_string(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
+        return {
+            'results':[
+                {
+                    'types': ['freetext'],
+                    'values': list(filter_decoded(decoded)),
+                    'comment': f"OCR from file {filename}"
+                }
+            ]
+        }
     except Exception as e:
         print(e)
         err = "Couldn't analyze file type. Only images are supported right now."