chg: [Item delete] delete father/child link + remove from domain tree + delete all child from the same domain

pull/519/head
Terrtia 2020-07-07 11:23:23 +02:00
parent 1f8650a648
commit 8a6e72f487
No known key found for this signature in database
GPG Key ID: 1E1B1F50D84613D0
4 changed files with 168 additions and 31 deletions

29
bin/lib/domain_basic.py Executable file
View File

@ -0,0 +1,29 @@
#!/usr/bin/python3
"""
``basic domain lib``
===================
"""
import os
import sys
import redis
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
import ConfigLoader
# Open the "ARDB_Onion" Redis connection once, at import time.
config_loader = ConfigLoader.ConfigLoader()
r_serv_onion = config_loader.get_redis_conn("ARDB_Onion")
# Drop the loader reference; only the connection is kept at module level.
config_loader = None
def get_domain_type(domain):
    """Classify a domain as ``'onion'`` or ``'regular'``.

    The value is converted with ``str()`` first, so non-string input
    (e.g. ``None``) is classified as ``'regular'`` instead of raising.
    """
    is_onion = str(domain).endswith('.onion')
    return 'onion' if is_onion else 'regular'
def delete_domain_item_core(item_id, domain, port):
    """Remove an item from a domain's crawler history sorted set.

    The sorted set key is built from the domain type (see
    ``get_domain_type``), the domain and the port.
    """
    history_key = 'crawler_history_{}:{}:{}'.format(get_domain_type(domain), domain, port)
    r_serv_onion.zrem(history_key, item_id)

View File

@ -3,6 +3,7 @@
import os
import sys
import gzip
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
import ConfigLoader
@ -12,6 +13,7 @@ config_loader = ConfigLoader.ConfigLoader()
PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], config_loader.get_config_str("Directories", "pastes")) + '/'
PASTES_FOLDER = os.path.join(os.path.realpath(PASTES_FOLDER), '')
r_cache = config_loader.get_redis_conn("Redis_Cache")
r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
config_loader = None
@ -43,6 +45,102 @@ def is_crawled(item_id):
def get_item_domain(item_id):
    """Extract the domain part of an item id.

    Drops the first 19 and the last 36 characters of ``item_id``.
    NOTE(review): presumably a 'crawled/YYYY/MM/DD/' prefix and a
    36-character UUID suffix -- confirm against the id format.
    """
    prefix_length = 19
    suffix_length = 36
    return item_id[prefix_length:-suffix_length]
def get_item_content(item_id):
    """Return the decoded content of an item as a string.

    The content is looked up in the Redis cache first (keyed by the item's
    full path under PASTES_FOLDER). On a cache miss or cache error the
    gzip file is read from disk, decoded and cached for 300 seconds.

    Returns an empty string when the file cannot be read or decoded.
    """
    item_full_path = os.path.join(PASTES_FOLDER, item_id)
    try:
        item_content = r_cache.get(item_full_path)
    except Exception:
        # Best effort: any cache failure (including a decode error on the
        # cached value) falls back to reading the file from disk.
        item_content = None
    if item_content is None:
        try:
            with gzip.open(item_full_path, 'r') as f:
                item_content = f.read().decode()
        except Exception:
            # Missing, unreadable or undecodable file: report empty content.
            # `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            return ''
        try:
            r_cache.set(item_full_path, item_content)
            r_cache.expire(item_full_path, 300)
        except Exception:
            # Caching is opportunistic; a failed cache write must not
            # discard content that was read successfully.
            pass
    return str(item_content)
#### TREE CHILD/FATHER ####
def is_father(item_id):
    """Tell whether the 'paste_children:<item_id>' key exists in the metadata store."""
    children_key = 'paste_children:{}'.format(item_id)
    return r_serv_metadata.exists(children_key)
def is_children(item_id):
    """Tell whether the item's metadata hash has a 'father' field."""
    metadata_key = 'paste_metadata:{}'.format(item_id)
    return r_serv_metadata.hexists(metadata_key, 'father')
def is_root_node(item_id):
    """Return True when the item has children but no father.

    The original signature took no argument while the body used
    ``item_id``, so every call raised NameError; the parameter is now
    declared, matching ``is_node`` and ``is_leaf``.
    """
    if is_father(item_id) and not is_children(item_id):
        return True
    return False
def is_node(item_id):
    """Return True when the item is a father, a child, or both."""
    # bool() keeps the strict True/False result of the if/else form.
    return bool(is_father(item_id) or is_children(item_id))
def is_leaf(item_id):
    """Return True when the item has a father but no children."""
    # is_children() is only evaluated when the item is not a father,
    # and bool() keeps the strict True/False result.
    return bool(not is_father(item_id) and is_children(item_id))
def is_domain_root(item_id):
    """Return True when a crawled item is the entry point of its domain.

    An item is a domain root when it is crawled and its father is
    missing, is not a crawled item, or belongs to a different domain.
    Items that are not crawled are never domain roots.
    """
    if not is_crawled(item_id):
        return False
    item_father = get_item_parent(item_id)
    # get_item_parent() returns None when no 'father' field is stored;
    # do not rely on is_crawled() accepting None.
    if not item_father or not is_crawled(item_father):
        return True
    # Father is crawled too: root only if it belongs to another domain.
    return get_item_domain(item_father) != get_item_domain(item_id)
def get_nb_children(item_id):
    """Return the cardinality of the item's 'paste_children:<item_id>' set."""
    children_key = 'paste_children:{}'.format(item_id)
    return r_serv_metadata.scard(children_key)
def get_item_parent(item_id):
    """Return the 'father' field of the item's metadata hash."""
    metadata_key = 'paste_metadata:{}'.format(item_id)
    return r_serv_metadata.hget(metadata_key, 'father')
def get_item_children(item_id):
    """Return the members of the item's 'paste_children:<item_id>' set as a list."""
    children = r_serv_metadata.smembers('paste_children:{}'.format(item_id))
    return list(children)
def add_item_parent(item_parent, item_id):
    """Delegate to ``item_basic.add_item_parent`` and return its result.

    NOTE(review): no ``import item_basic`` is visible among this module's
    imports, and a later ``def add_item_parent(parent_item_id, item_id)``
    appears to rebind this name -- confirm this wrapper is intended here.
    """
    return item_basic.add_item_parent(item_parent, item_id)
# # TODO: handle domain last origin in domain lib
def _delete_node(item_id):
    """Unlink an item from its children in the metadata store.

    Removes the 'father' field from each child's metadata hash, then
    deletes the item's 'paste_children:<item_id>' set.
    """
    # TODO: for crawled items that are not themselves deleted, the
    # 'real_link' metadata field could also be removed (not implemented).
    child_ids = get_item_children(item_id)
    for child_id in child_ids:
        child_metadata_key = 'paste_metadata:{}'.format(child_id)
        r_serv_metadata.hdel(child_metadata_key, 'father')
    r_serv_metadata.delete('paste_children:{}'.format(item_id))
# delete regular
# simple if leaf
# delete item node
def get_all_domain_node_by_item_id(item_id, l_nodes=None):
    """Collect every descendant of an item that belongs to the same domain.

    Walks the children recursively; a child is collected (and descended
    into) only when its domain equals the domain of its direct parent.

    :param item_id: id of the item to start from (not included in the result)
    :param l_nodes: optional list to append to; a new list is created when
        omitted. A mutable default here would be shared between calls and
        leak results from earlier, unrelated calls.
    :return: the list of collected child ids (``l_nodes`` itself when given)
    """
    if l_nodes is None:
        l_nodes = []
    domain = get_item_domain(item_id)
    for child_id in get_item_children(item_id):
        if get_item_domain(child_id) == domain:
            l_nodes.append(child_id)
            # The recursive call appends to the same list in place.
            get_all_domain_node_by_item_id(child_id, l_nodes)
    return l_nodes
##-- --##
def add_item_parent_by_parent_id(parent_type, parent_id, item_id):
parent_item_id = get_obj_id_item_id(parent_type, parent_id)
if parent_item_id:
@ -53,9 +151,9 @@ def add_item_parent(parent_item_id, item_id):
r_serv_metadata.sadd('paste_children:{}'.format(parent_item_id), item_id)
return True
def add_map_obj_id_item_id(obj_id, item_id, obj_type):
    """Record the item id associated with an external object id.

    Only ``obj_type == 'twitter_id'`` is handled; any other type is a no-op.
    """
    if obj_type != 'twitter_id':
        return
    r_serv_metadata.hset('map:twitter_id:item_id', obj_id, item_id)
# TODO:
# FIXME:
#### UNKNOWN SECTION ####
def get_obj_id_item_id(parent_type, parent_id):
all_parents_type = ['twitter_id']
@ -63,3 +161,11 @@ def get_obj_id_item_id(parent_type, parent_id):
return r_serv_metadata.hget('map:twitter_id:item_id', parent_id)
else:
return None
def add_map_obj_id_item_id(obj_id, item_id, obj_type):
    """Record the item id associated with an external object id.

    Only ``obj_type == 'twitter_id'`` is handled; any other type is a no-op.
    """
    if obj_type != 'twitter_id':
        return
    r_serv_metadata.hset('map:twitter_id:item_id', obj_id, item_id)
# delete twitter id
##-- --##

View File

@ -3,7 +3,6 @@
import os
import sys
import gzip
import redis
from io import BytesIO
@ -16,12 +15,15 @@ import Pgp
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
import item_basic
import domain_basic
import ConfigLoader
import Correlate_object
import Decoded
import Screenshot
import telegram
from item_basic import *
config_loader = ConfigLoader.ConfigLoader()
# get and sanitize PASTE DIRECTORY
PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], config_loader.get_config_str("Directories", "pastes")) + '/'
@ -30,6 +32,7 @@ PASTES_FOLDER = os.path.join(os.path.realpath(PASTES_FOLDER), '')
r_cache = config_loader.get_redis_conn("Redis_Cache")
r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
screenshot_directory = os.path.join(os.environ['AIL_HOME'], config_loader.get_config_str("Directories", "crawled_screenshot"))
config_loader = None
def exist_item(item_id):
@ -71,22 +74,7 @@ def get_lines_info(item_id, item_content=None):
def get_item_content(item_id):
    """Return the content of an item as a string.

    Delegates to ``item_basic.get_item_content`` so the cache/disk lookup
    lives in one place. The inline copy of that logic returned before the
    delegation could ever run, leaving it unreachable.
    """
    return item_basic.get_item_content(item_id)
# API
def get_item(request_dict):
@ -292,14 +280,8 @@ def get_domain(item_id):
item_id = item_id[-1]
return item_id[:-36]
def get_item_parent(item_id):
    """Return the 'father' field of the item's metadata hash."""
    metadata_key = 'paste_metadata:{}'.format(item_id)
    return r_serv_metadata.hget(metadata_key, 'father')
def get_item_children(item_id):
    """Return the members of the item's 'paste_children:<item_id>' set as a list."""
    children = r_serv_metadata.smembers('paste_children:{}'.format(item_id))
    return list(children)
def add_item_parent(item_parent, item_id):
    """Link ``item_id`` to ``item_parent`` via ``item_basic.add_item_parent``.

    Returns whatever the delegated call returns.
    """
    outcome = item_basic.add_item_parent(item_parent, item_id)
    return outcome
def get_item_domain_with_port(item_id):
    """Return the 'domain' field of the item's metadata hash."""
    metadata_key = 'paste_metadata:{}'.format(item_id)
    return r_serv_metadata.hget(metadata_key, 'domain')
def get_item_link(item_id):
    """Return the 'real_link' field of the item's metadata hash."""
    metadata_key = 'paste_metadata:{}'.format(item_id)
    return r_serv_metadata.hget(metadata_key, 'real_link')
@ -423,12 +405,32 @@ def delete_item(obj_id):
else:
for obj2_id in obj_correlations[correlation]:
Correlate_object.delete_obj_relationship(correlation, obj2_id, 'item', obj_id)
# delete father/child
delete_node(obj_id)
# delete item metadata
r_serv_metadata.delete('paste_metadata:{}'.format(obj_id))
return True
### REQUIRE MORE WORK
# delete child/son !!!
### TODO in inport V2
# delete from tracked items
# delete from queue
###
return False
#### ####
def delete_node(item_id):
    """Remove an item from the father/child tree.

    Does nothing when the item is neither a father nor a child. Crawled
    items are first handled by ``delete_domain_node``; then the links to
    the item's children are removed with ``item_basic._delete_node``.
    """
    if not is_node(item_id):
        return
    if is_crawled(item_id):
        delete_domain_node(item_id)
    item_basic._delete_node(item_id)
def delete_domain_node(item_id):
    """Delete a crawled item's same-domain descendants.

    When the item is a domain root it is first removed from the domain's
    crawler history. The stored 'domain' metadata value is split on ':'
    into exactly ``domain`` and ``port``; any other shape raises.
    Every descendant from the same domain is then passed to ``delete_item``.
    """
    if is_domain_root(item_id):
        # remove from domain history
        domain_with_port = get_item_domain_with_port(item_id)
        domain, port = domain_with_port.split(':')
        domain_basic.delete_domain_item_core(item_id, domain, port)
    same_domain_children = get_all_domain_node_by_item_id(item_id)
    for child_id in same_domain_children:
        delete_item(child_id)

View File

@ -115,7 +115,7 @@ def showDomain():
dict_domain['tags_safe'] = Tag.is_tags_safe(dict_domain['tags'])
dict_domain['history'] = domain.get_domain_history_with_status()
dict_domain['crawler_history'] = domain.get_domain_items_crawled(items_link=True, epoch=epoch, item_screenshot=True, item_tag=True) # # TODO: handle multiple port
if dict_domain['crawler_history']['items']:
if dict_domain['crawler_history'].get('items', []):
dict_domain['crawler_history']['random_item'] = random.choice(dict_domain['crawler_history']['items'])
return render_template("showDomain.html", dict_domain=dict_domain, bootstrap_label=bootstrap_label,