From 43b3556588d782f2826c20b0956190007e87e113 Mon Sep 17 00:00:00 2001 From: Alain Date: Fri, 5 Feb 2016 13:58:21 -0500 Subject: [PATCH 1/3] Starting Phone number recognition --- bin/Phone.py | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 bin/Phone.py diff --git a/bin/Phone.py b/bin/Phone.py new file mode 100644 index 00000000..87caf772 --- /dev/null +++ b/bin/Phone.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python2 +# -*-coding:UTF-8 -* +""" + module for finding phone numbers +""" + +import time +import pprint +import re +from packages import Paste +from packages import lib_refine +from pubsublogger import publisher +from Helper import Process + + +def search_phone(message): + paste = Paste.Paste(message) + content = paste.get_p_content() + # regex to find phone numbers, may raise many false positives (shalt thou seek optimization, upgrading is required) + reg_phone = re.compile(r'(\+\d{1,3}\(\d{1,2}\)\d?)?(\d{2,4}[\W\D\s]?){4,6} ') + # list of the regex results in the Paste, may be null + results = reg_phone.findall(content) + + # if the list is greater than 4, we consider the Paste may contain a list of phone numbers + if len(results) > 4 : + print results + publisher.warning('{} contains PID (phone numbers)'.format(paste.p_name)) + + if __name__ == '__main__': + # If you wish to use an other port of channel, do not forget to run a subscriber accordingly (see launch_logs.sh) + # Port of the redis instance used by pubsublogger + publisher.port = 6380 + # Script is the default channel used for the modules. 
+ publisher.channel = 'Script' + + # Section name in bin/packages/modules.cfg + config_section = 'Phone' + + # Setup the I/O queues + p = Process(config_section) + + # Sent to the logging a description of the module + publisher.info("Run Phone module") + + # Endless loop getting messages from the input queue + while True: + # Get one message from the input queue + message = p.get_from_set() + if message is None: + publisher.debug("{} queue is empty, waiting".format(config_section)) + time.sleep(1) + continue + + # Do something with the message from the queue + search_phone(message) + From fabbfd8ae9a606ce3fe13949cf066a2129880470 Mon Sep 17 00:00:00 2001 From: Alain Date: Fri, 5 Feb 2016 14:00:41 -0500 Subject: [PATCH 2/3] Update modules.cfg (adding Keys and Phone sections) --- bin/packages/modules.cfg | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/bin/packages/modules.cfg b/bin/packages/modules.cfg index e408b84f..1638f7be 100644 --- a/bin/packages/modules.cfg +++ b/bin/packages/modules.cfg @@ -46,3 +46,9 @@ subscribe = Redis_ValidOnion [Web] subscribe = Redis_Web publish = Redis_Url,ZMQ_Url + +[Keys] +subscribe = Redis_Global + +[Phone] +subscribe = Redis_Global From ea52fd106818f28a1954c6bcc2b9b5794c616ea1 Mon Sep 17 00:00:00 2001 From: Alain Date: Fri, 5 Feb 2016 20:58:02 +0100 Subject: [PATCH 3/3] Phone regex updated The regex might still need to be fixed / optimized: it can still match math expressions or random numbers starting with a 0. It no longer captures dates, times or coordinates. Examples of captured formats: +331234567890 ; 09 12 34 56 78 ; +4177/123.45.69 ; +352(0)6-23-23-23... 
--- bin/Phone.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/Phone.py b/bin/Phone.py index 87caf772..384040cf 100644 --- a/bin/Phone.py +++ b/bin/Phone.py @@ -17,7 +17,7 @@ def search_phone(message): paste = Paste.Paste(message) content = paste.get_p_content() # regex to find phone numbers, may raise many false positives (shalt thou seek optimization, upgrading is required) - reg_phone = re.compile(r'(\+\d{1,3}\(\d{1,2}\)\d?)?(\d{2,4}[\W\D\s]?){4,6} ') + reg_phone = re.compile(r'(\+\d{1,4}(\(\d\))?\d?|0\d?)(\d{6,8}|([-/\. ]{1}\d{2,3}){3,4})') # list of the regex results in the Paste, may be null results = reg_phone.findall(content)