2020-05-20 17:03:58 +02:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
# -*-coding:UTF-8 -*
|
|
|
|
|
|
|
|
"""
|
|
|
|
Regex Helper
|
|
|
|
"""
|
|
|
|
|
|
|
|
import os
|
2023-05-12 15:29:53 +02:00
|
|
|
import logging.config
|
2020-05-20 17:03:58 +02:00
|
|
|
import re
|
|
|
|
import sys
|
|
|
|
import uuid
|
|
|
|
|
|
|
|
from multiprocessing import Process as Proc
|
|
|
|
|
2022-11-28 15:01:40 +01:00
|
|
|
sys.path.append(os.environ['AIL_BIN'])
|
|
|
|
##################################
|
|
|
|
# Import Project packages
|
|
|
|
##################################
|
2023-05-12 15:29:53 +02:00
|
|
|
from lib import ail_logger
|
2022-11-28 15:01:40 +01:00
|
|
|
from lib import ConfigLoader
|
2020-05-20 17:03:58 +02:00
|
|
|
|
2023-05-12 15:29:53 +02:00
|
|
|
# Apply the logging configuration provided by ail_logger, then use the root
# logger (getLogger() without a name) for this module's messages.
logging.config.dictConfig(ail_logger.get_config())
logger = logging.getLogger()
|
|
|
|
|
2020-05-20 17:03:58 +02:00
|
|
|
## LOAD CONFIG ##
config_loader = ConfigLoader.ConfigLoader()
# Redis connection ("Redis_Cache") through which the worker processes below
# hand their results back to the calling process.
r_serv_cache = config_loader.get_redis_conn("Redis_Cache")
# The loader is only needed to obtain the connection; drop the reference.
config_loader = None
## -- ##
|
|
|
|
|
|
|
|
def generate_redis_cache_key(module_name):
    """
    Build a fresh cache key for a module.

    :param module_name: name used as the key prefix
    :return: a string of the form ``<module_name>_extracted:<random uuid4>``
    """
    return f'{module_name}_extracted:{uuid.uuid4()}'
|
2020-05-20 17:03:58 +02:00
|
|
|
|
|
|
|
def _regex_findall(redis_key, regex, item_content, r_set):
    """
    Worker body: run ``re.findall`` and store every match in the Redis cache.

    :param redis_key: cache key the matches are written to
    :param regex: pattern passed to ``re.findall``
    :param item_content: text to search
    :param r_set: if true, matches are added to a Redis set (``sadd``);
                  otherwise they are pushed onto a Redis list (``lpush``)

    Each match is stored as ``str(match)``. Nothing is written when there is
    no match; otherwise the key is given a 360 second expiry once all matches
    have been written.
    """
    found = re.findall(regex, item_content)
    if not found:
        return

    # Pick the write operation once; sets de-duplicate, lists keep every hit.
    store = r_serv_cache.sadd if r_set else r_serv_cache.lpush
    for hit in found:
        store(redis_key, str(hit))
    r_serv_cache.expire(redis_key, 360)
|
|
|
|
|
2020-05-20 17:29:51 +02:00
|
|
|
def regex_findall(module_name, redis_key, regex, item_id, item_content, max_time=30, r_set=True):
    """
    Run ``re.findall`` in a separate process, bounded by a timeout.

    The worker (``_regex_findall``) writes its matches to the Redis cache under
    ``redis_key``; this function reads them back and deletes the key.

    :param module_name: name of the calling module, used in the timeout log message
    :param redis_key: cache key used to pass the matches back from the worker
    :param regex: pattern passed to ``re.findall``
    :param item_id: identifier of the processed item, used in the timeout log message
    :param item_content: text to search
    :param max_time: seconds to wait for the worker before terminating it
    :param r_set: if true the matches are read from a Redis set (``smembers``),
                  otherwise from a Redis list (``lrange``)
    :return: the cached matches as returned by the Redis client, or ``[]`` if
             the worker did not finish within ``max_time``
    """
    proc = Proc(target=_regex_findall, args=(redis_key, regex, item_content, r_set, ))
    try:
        proc.start()
        proc.join(max_time)
        if proc.is_alive():
            proc.terminate()
            # Give the terminated worker a moment to exit so that it cannot
            # write to the key after the cleanup below.
            proc.join(5)
            # The worker sets the expiry only after its last write, so a worker
            # stopped mid-way can leave partial matches without any TTL. Drop
            # them so they cannot leak into a later call that reuses the key.
            r_serv_cache.delete(redis_key)
            # Statistics.incr_module_timeout_statistic(module_name)
            err_mess = f"{module_name}: processing timeout: {item_id}"
            logger.info(err_mess)
            return []
        else:
            if r_set:
                all_items = r_serv_cache.smembers(redis_key)
            else:
                all_items = r_serv_cache.lrange(redis_key, 0, -1)
            r_serv_cache.delete(redis_key)
            proc.terminate()
            return all_items
    except KeyboardInterrupt:
        print("Caught KeyboardInterrupt, terminating workers")
        proc.terminate()
        sys.exit(0)
|
2020-06-24 15:07:45 +02:00
|
|
|
|
2022-12-19 16:38:20 +01:00
|
|
|
def _regex_finditer(r_key, regex, content):
    """
    Worker body: run ``re.finditer`` and append every match to a Redis list.

    :param r_key: cache key of the list the matches are appended to
    :param regex: pattern passed to ``re.finditer``
    :param content: text to search

    Each match is stored as ``<start>:<end>:<matched text>`` and the key is
    given a 360 second expiry after the last match has been written.
    """
    for found in re.finditer(regex, content):
        entry = f'{found.start()}:{found.end()}:{found.group()}'
        r_serv_cache.rpush(r_key, entry)
    r_serv_cache.expire(r_key, 360)
|
2020-06-24 15:07:45 +02:00
|
|
|
|
2022-12-19 16:38:20 +01:00
|
|
|
def regex_finditer(r_key, regex, item_id, content, max_time=30):
    """
    Run ``re.finditer`` in a separate process, bounded by a timeout.

    The worker (``_regex_finditer``) appends ``<start>:<end>:<value>`` entries
    to the Redis list ``r_key``; this function reads them back, deletes the key
    and decodes the entries.

    :param r_key: cache key used to pass the matches back from the worker
    :param regex: pattern passed to ``re.finditer``
    :param item_id: identifier of the processed item, used in the timeout log message
    :param content: text to search
    :param max_time: seconds to wait for the worker before terminating it
    :return: list of ``(start, end, value)`` tuples with integer offsets, or
             ``[]`` if the worker did not finish within ``max_time``
    """
    proc = Proc(target=_regex_finditer, args=(r_key, regex, content))
    try:
        proc.start()
        proc.join(max_time)
        if proc.is_alive():
            proc.terminate()
            # Give the terminated worker a moment to exit so that it cannot
            # write to the key after the cleanup below.
            proc.join(5)
            # The worker sets the expiry only after its last write, so a worker
            # stopped mid-way can leave partial matches without any TTL. Drop
            # them so they cannot leak into a later call that reuses the key.
            r_serv_cache.delete(r_key)
            # Statistics.incr_module_timeout_statistic(r_key)
            err_mess = f"{r_key}: processing timeout: {item_id}"
            logger.info(err_mess)
            return []
        else:
            res = r_serv_cache.lrange(r_key, 0, -1)
            r_serv_cache.delete(r_key)
            proc.terminate()
            all_match = []
            for match in res:
                # maxsplit=2: the matched text itself may contain ':'
                start, end, value = match.split(':', 2)
                all_match.append((int(start), int(end), value))
            return all_match
    except KeyboardInterrupt:
        print("Caught KeyboardInterrupt, terminating regex worker")
        proc.terminate()
        sys.exit(0)
|
|
|
|
|
2023-08-21 15:49:32 +02:00
|
|
|
def _regex_match(r_key, regex, content):
    """
    Worker body: flag in the Redis cache whether ``regex`` matches ``content``.

    :param r_key: cache key that is set to ``1`` when the pattern matches
    :param regex: pattern passed to ``re.match``
    :param content: text to test

    Nothing is written when there is no match.
    """
    if re.match(regex, content):
        # Set the flag and its 360 second expiry in one command, so a worker
        # terminated right after the write can never leave a key without a TTL.
        r_serv_cache.set(r_key, 1, ex=360)
|
|
|
|
|
|
|
|
def regex_match(r_key, regex, item_id, content, max_time=30):
    """
    Run ``re.match`` in a separate process, bounded by a timeout.

    The worker (``_regex_match``) sets ``r_key`` in the Redis cache when the
    pattern matches; this function checks for the key and deletes it.

    :param r_key: cache key used to pass the result back from the worker
    :param regex: pattern passed to ``re.match``
    :param item_id: identifier of the processed item, used in the timeout log message
    :param content: text to test
    :param max_time: seconds to wait for the worker before terminating it
    :return: ``True`` if the worker flagged a match, ``False`` if it did not
             or did not finish within ``max_time``
    """
    worker = Proc(target=_regex_match, args=(r_key, regex, content))
    try:
        worker.start()
        worker.join(max_time)

        if not worker.is_alive():
            # Worker finished: the key exists only if the pattern matched.
            matched = bool(r_serv_cache.exists(r_key))
            r_serv_cache.delete(r_key)
            return matched

        # Still running after max_time: give up on this item.
        worker.terminate()
        # Statistics.incr_module_timeout_statistic(r_key)
        err_mess = f"{r_key}: processing timeout: {item_id}"
        logger.info(err_mess)
        return False
    except KeyboardInterrupt:
        print("Caught KeyboardInterrupt, terminating regex worker")
        worker.terminate()
        sys.exit(0)
|
|
|
|
|
2022-12-19 16:38:20 +01:00
|
|
|
def _regex_search(r_key, regex, content):
    """
    Worker body: flag in the Redis cache whether ``regex`` is found in ``content``.

    :param r_key: cache key that is set to ``1`` when the pattern is found
    :param regex: pattern passed to ``re.search``
    :param content: text to search

    Nothing is written when the pattern is not found.
    """
    if re.search(regex, content):
        # Set the flag and its 360 second expiry in one command, so a worker
        # terminated right after the write can never leave a key without a TTL.
        r_serv_cache.set(r_key, 1, ex=360)
|
|
|
|
|
|
|
|
def regex_search(r_key, regex, item_id, content, max_time=30):
    """
    Run ``re.search`` in a separate process, bounded by a timeout.

    The worker (``_regex_search``) sets ``r_key`` in the Redis cache when the
    pattern is found; this function checks for the key and deletes it.

    :param r_key: cache key used to pass the result back from the worker
    :param regex: pattern passed to ``re.search``
    :param item_id: identifier of the processed item, used in the timeout log message
    :param content: text to search
    :param max_time: seconds to wait for the worker before terminating it
    :return: ``True`` if the worker flagged a hit, ``False`` if it did not
             or did not finish within ``max_time``
    """
    worker = Proc(target=_regex_search, args=(r_key, regex, content))
    try:
        worker.start()
        worker.join(max_time)

        timed_out = worker.is_alive()
        if timed_out:
            worker.terminate()
            # Statistics.incr_module_timeout_statistic(r_key)
            err_mess = f"{r_key}: processing timeout: {item_id}"
            logger.info(err_mess)
            return False

        # Worker finished: the key exists only if the pattern was found.
        found = bool(r_serv_cache.exists(r_key))
        r_serv_cache.delete(r_key)
        return found
    except KeyboardInterrupt:
        print("Caught KeyboardInterrupt, terminating regex worker")
        worker.terminate()
        sys.exit(0)
|
2023-05-24 10:48:29 +02:00
|
|
|
|
|
|
|
## Phone Regexes ##
|
|
|
|
def _regex_phone_iter(r_key, country_code, content):
    """
    Worker body: find phone numbers in ``content`` and append them to a Redis list.

    :param r_key: cache key of the list the matches are appended to
    :param country_code: region passed to ``phonenumbers.PhoneNumberMatcher``
    :param content: text to scan

    Each match is stored as ``<start>:<end>:<raw matched text>`` and the key is
    given a 360 second expiry after the last match has been written.
    """
    # Imported here rather than at module level, as in the original code.
    import phonenumbers

    for found in phonenumbers.PhoneNumberMatcher(content, country_code):
        # The text is stored exactly as it appears in the content
        # (raw_string), not re-formatted (e.g. E164 / INTERNATIONAL).
        entry = f'{found.start}:{found.end}:{found.raw_string}'
        r_serv_cache.rpush(r_key, entry)
    r_serv_cache.expire(r_key, 360)
|
|
|
|
|
|
|
|
def regex_phone_iter(r_key, country_code, item_id, content, max_time=30):
    """
    Extract phone numbers in a separate process, bounded by a timeout.

    The worker (``_regex_phone_iter``) appends ``<start>:<end>:<value>`` entries
    to the Redis list ``r_key``; this function reads them back, deletes the key
    and decodes the entries.

    :param r_key: cache key used to pass the matches back from the worker
    :param country_code: region handed to the worker's phone number matcher
    :param item_id: identifier of the processed item, used in the timeout log message
    :param content: text to scan
    :param max_time: seconds to wait for the worker before terminating it
    :return: list of ``(start, end, value)`` tuples with integer offsets, or
             ``[]`` if the worker did not finish within ``max_time``
    """
    proc = Proc(target=_regex_phone_iter, args=(r_key, country_code, content))
    try:
        proc.start()
        proc.join(max_time)
        if proc.is_alive():
            proc.terminate()
            # Give the terminated worker a moment to exit so that it cannot
            # write to the key after the cleanup below.
            proc.join(5)
            # The worker sets the expiry only after its last write, so a worker
            # stopped mid-way can leave partial matches without any TTL. Drop
            # them so they cannot leak into a later call that reuses the key.
            r_serv_cache.delete(r_key)
            # Statistics.incr_module_timeout_statistic(r_key)
            err_mess = f"{r_key}: processing timeout: {item_id}"
            logger.info(err_mess)
            return []
        else:
            res = r_serv_cache.lrange(r_key, 0, -1)
            r_serv_cache.delete(r_key)
            proc.terminate()
            all_match = []
            for match in res:
                # maxsplit=2: the matched text itself may contain ':'
                start, end, value = match.split(':', 2)
                all_match.append((int(start), int(end), value))
            return all_match
    except KeyboardInterrupt:
        print("Caught KeyboardInterrupt, terminating regex worker")
        proc.terminate()
        sys.exit(0)
|