mirror of https://github.com/CIRCL/AIL-framework
chg: [DB] update items tags metadata
parent
e83174327a
commit
3cc614a1ad
|
@ -59,6 +59,8 @@ class HiddenServices(object):
|
|||
db=cfg.getint("ARDB_Metadata", "db"),
|
||||
decode_responses=True)
|
||||
|
||||
self.PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) + '/'
|
||||
|
||||
self.domain = domain
|
||||
self.type = type
|
||||
self.port = port
|
||||
|
@ -76,9 +78,16 @@ class HiddenServices(object):
|
|||
## TODO: # FIXME: add error
|
||||
pass
|
||||
|
||||
def remove_absolute_path_link(self, key, value):
|
||||
print(key)
|
||||
print(value)
|
||||
#def remove_absolute_path_link(self, key, value):
|
||||
# print(key)
|
||||
# print(value)
|
||||
|
||||
def update_item_path_children(self, key, children):
    """Rewrite an absolute child item path stored in set ``key`` as a relative one.

    If ``children`` starts with ``self.PASTES_FOLDER``, the absolute member is
    removed from the set ``key`` in ``self.r_serv_metadata`` and the same path
    without that leading folder is added in its place.

    :param key: name of the set holding the child paths
        (callers pass ``'paste_children:<father>'``).
    :param children: one member of that set (an item path string).
    :return: the relative path when a rewrite happened, otherwise
        ``children`` unchanged.
    """
    pastes_folder = self.PASTES_FOLDER
    # Only a *leading* pastes folder marks an absolute path. A plain substring
    # test would also match (and cut) the folder from the middle of a path.
    if not children.startswith(pastes_folder):
        return children

    relative_path = children[len(pastes_folder):]
    self.r_serv_metadata.srem(key, children)
    self.r_serv_metadata.sadd(key, relative_path)
    return relative_path
|
||||
|
||||
def get_origin_paste_name(self):
|
||||
origin_item = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'paste_parent')
|
||||
|
@ -106,7 +115,6 @@ class HiddenServices(object):
|
|||
# need to remove it
|
||||
else:
|
||||
p_tags = self.r_serv_metadata.smembers('tag:{}'.format(os.path.join(self.paste_directory, item)))
|
||||
print(p_tags)
|
||||
for tag in p_tags:
|
||||
self.tags[tag] = self.tags.get(tag, 0) + 1
|
||||
|
||||
|
@ -158,8 +166,10 @@ class HiddenServices(object):
|
|||
if father is None:
|
||||
return []
|
||||
l_crawled_pastes = []
|
||||
paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(father))
|
||||
key = 'paste_children:{}'.format(father)
|
||||
paste_childrens = self.r_serv_metadata.smembers(key)
|
||||
for children in paste_childrens:
|
||||
children = self.update_item_path_children(key, children)
|
||||
if self.domain in children:
|
||||
l_crawled_pastes.append(children)
|
||||
self.update_domain_tags(children)
|
||||
|
@ -174,8 +184,8 @@ class HiddenServices(object):
|
|||
else:
|
||||
key = os.path.join(self.paste_directory, item)
|
||||
link = self.r_serv_metadata.hget('paste_metadata:{}'.format(key), 'real_link')
|
||||
if link:
|
||||
self.remove_absolute_path_link(key, link)
|
||||
#if link:
|
||||
#self.remove_absolute_path_link(key, link)
|
||||
|
||||
return link
|
||||
|
||||
|
|
|
@ -85,19 +85,19 @@ if __name__ == '__main__':
|
|||
item_path = item_path.replace(PASTES_FOLDER, '', 1)
|
||||
new_item_metadata = 'paste_metadata:{}'.format(item_path)
|
||||
## TODO: catch error
|
||||
r_serv_metadata.rename(old_item_metadata, new_item_metadata)
|
||||
res = r_serv_metadata.renamenx(old_item_metadata, new_item_metadata)
|
||||
# update domain port
|
||||
domain = r_serv_metadata.hget('paste_metadata:{}'.format(item_path), 'domain')
|
||||
domain = r_serv_metadata.hget(new_item_metadata, 'domain')
|
||||
if domain:
|
||||
r_serv_metadata.hset('paste_metadata:{}'.format(item_path), 'domain', '{}:80'.format(domain))
|
||||
super_father = r_serv_metadata.hget('paste_metadata:{}'.format(item_path), 'super_father')
|
||||
r_serv_metadata.hset(new_item_metadata, 'domain', '{}:80'.format(domain))
|
||||
super_father = r_serv_metadata.hget(new_item_metadata, 'super_father')
|
||||
if super_father:
|
||||
if PASTES_FOLDER in super_father:
|
||||
r_serv_metadata.hset('paste_metadata:{}'.format(item_path), 'super_father', super_father.replace(PASTES_FOLDER, '', 1))
|
||||
father = r_serv_metadata.hget('paste_metadata:{}'.format(item_path), 'father')
|
||||
r_serv_metadata.hset(new_item_metadata, 'super_father', super_father.replace(PASTES_FOLDER, '', 1))
|
||||
father = r_serv_metadata.hget(new_item_metadata, 'father')
|
||||
if father:
|
||||
if PASTES_FOLDER in father:
|
||||
r_serv_metadata.hset('paste_metadata:{}'.format(item_path), 'father', father.replace(PASTES_FOLDER, '', 1))
|
||||
r_serv_metadata.hset(new_item_metadata, 'father', father.replace(PASTES_FOLDER, '', 1))
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -47,12 +47,6 @@ if __name__ == '__main__':
|
|||
db=cfg.getint("ARDB_Onion", "db"),
|
||||
decode_responses=True)
|
||||
|
||||
r_serv_onion = redis.StrictRedis(
|
||||
host=cfg.get("ARDB_Onion", "host"),
|
||||
port=cfg.getint("ARDB_Onion", "port"),
|
||||
db=cfg.getint("ARDB_Onion", "db"),
|
||||
decode_responses=True)
|
||||
|
||||
r_important_paste_2018 = redis.StrictRedis(
|
||||
host=cfg.get("ARDB_Metadata", "host"),
|
||||
port=cfg.getint("ARDB_Metadata", "port"),
|
||||
|
@ -123,24 +117,6 @@ if __name__ == '__main__':
|
|||
r_important_paste_2018.flushdb()
|
||||
r_important_paste_2019.flushdb()
|
||||
|
||||
#update item metadata tags
|
||||
tag_not_updated = True
|
||||
total_to_update = r_serv_tag.scard('maj:v1.5:absolute_path_to_rename')
|
||||
nb_updated = 0
|
||||
while tag_not_updated:
|
||||
item_path = r_serv_tag.spop('maj:v1.5:absolute_path_to_rename')
|
||||
old_tag_item_key = 'tag:{}'.format(item_path)
|
||||
new_item_path = item_path.replace(PASTES_FOLDER, '', 1)
|
||||
new_tag_item_key = 'tag:{}'.format(new_item_path)
|
||||
res = r_serv_metadata.renamenx(old_tag_item_key, new_tag_item_key)
|
||||
if res == 0:
|
||||
tags_key_fusion(old_tag_item_key, new_tag_item_key)
|
||||
nb_updated += 1
|
||||
if r_serv_tag.scard('maj:v1.5:absolute_path_to_rename') == 0:
|
||||
tag_not_updated = false
|
||||
else:
|
||||
print('{}/{} Tags updated'.format(nb_updated, total_to_update))
|
||||
|
||||
end = time.time()
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,70 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*-coding:UTF-8 -*
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import redis
|
||||
import configparser
|
||||
|
||||
def tags_key_fusion(old_item_path_key, new_item_path_key):
    """Merge the tag set stored under the old key into the set under the new key.

    Used when the new key already exists, so the old key cannot simply be
    renamed: every member of ``old_item_path_key`` is added to
    ``new_item_path_key`` and then removed from ``old_item_path_key``.

    :param old_item_path_key: set key to drain.
    :param new_item_path_key: set key that receives the members.
    """
    print('fusion:')
    print(old_item_path_key)
    print(new_item_path_key)
    tags = list(r_serv_metadata.smembers(old_item_path_key))
    # SADD/SREM reject an empty member list, so skip them when nothing is stored.
    if not tags:
        return
    # One variadic call each instead of two round-trips per tag. Only the
    # members that were actually copied are removed from the old set.
    r_serv_metadata.sadd(new_item_path_key, *tags)
    r_serv_metadata.srem(old_item_path_key, *tags)
|
||||
|
||||
if __name__ == '__main__':
    # Migration script: for every item path queued in the ARDB_Tags set
    # 'maj:v1.5:absolute_path_to_rename', move its 'tag:<path>' key in
    # ARDB_Metadata to the same path with the PASTES_FOLDER prefix removed.

    configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
    if not os.path.exists(configfile):
        # Implicit concatenation keeps source indentation out of the message.
        raise Exception('Unable to find the configuration file. '
                        'Did you set environment variables? '
                        'Or activate the virtualenv.')
    cfg = configparser.ConfigParser()
    cfg.read(configfile)

    PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) + '/'

    r_serv_metadata = redis.StrictRedis(
        host=cfg.get("ARDB_Metadata", "host"),
        port=cfg.getint("ARDB_Metadata", "port"),
        db=cfg.getint("ARDB_Metadata", "db"),
        decode_responses=True)

    r_serv_tag = redis.StrictRedis(
        host=cfg.get("ARDB_Tags", "host"),
        port=cfg.getint("ARDB_Tags", "port"),
        db=cfg.getint("ARDB_Tags", "db"),
        decode_responses=True)

    print('Updating ARDB_Tags ...')
    start = time.time()

    # update item metadata tags
    total_to_update = r_serv_tag.scard('maj:v1.5:absolute_path_to_rename')
    nb_updated = 0
    if total_to_update > 0:
        while True:
            item_path = r_serv_tag.spop('maj:v1.5:absolute_path_to_rename')
            # SPOP yields None once the set is empty; stopping here avoids
            # calling .replace() on None and a separate SCARD per item.
            if item_path is None:
                break

            old_tag_item_key = 'tag:{}'.format(item_path)
            new_item_path = item_path.replace(PASTES_FOLDER, '', 1)
            new_tag_item_key = 'tag:{}'.format(new_item_path)

            # NOTE(review): RENAMENX fails when the old key does not exist;
            # presumably every queued path has a tag set -- confirm.
            res = r_serv_metadata.renamenx(old_tag_item_key, new_tag_item_key)
            if not res:
                # The new key already exists: merge the old set into it.
                tags_key_fusion(old_tag_item_key, new_tag_item_key)
            nb_updated += 1

            # As before, no progress line is printed for the last queued item.
            if nb_updated < total_to_update:
                progress = int((nb_updated * 100) / total_to_update)
                print('{}/{} Tags updated {}%'.format(nb_updated, total_to_update, progress))

    end = time.time()

    print('Updating ARDB_Tags Done: {} s'.format(end - start))
|
|
@ -175,7 +175,6 @@ def get_crawler_splash_status(type):
|
|||
return crawler_metadata
|
||||
|
||||
def create_crawler_config(mode, service_type, crawler_config, domain):
|
||||
print(crawler_config)
|
||||
if mode == 'manual':
|
||||
r_cache.set('crawler_config:{}:{}:{}'.format(mode, service_type, domain), json.dumps(crawler_config))
|
||||
elif mode == 'auto':
|
||||
|
@ -559,8 +558,10 @@ def show_domain():
|
|||
|
||||
h = HiddenServices(domain, type, port=port)
|
||||
item_core = h.get_domain_crawled_core_item(epoch=epoch)
|
||||
epoch = item_core['epoch']
|
||||
l_pastes = h.get_last_crawled_pastes(item_root=item_core['root_item'])
|
||||
if item_core:
|
||||
l_pastes = h.get_last_crawled_pastes(item_root=item_core['root_item'])
|
||||
else:
|
||||
l_pastes = []
|
||||
dict_links = h.get_all_links(l_pastes)
|
||||
if l_pastes:
|
||||
status = True
|
||||
|
|
|
@ -63,7 +63,7 @@
|
|||
{% for domain in domains_by_day[date] %}
|
||||
<tr>
|
||||
<td>
|
||||
<a target="_blank" href="{{ url_for('hiddenServices.onion_domain') }}?onion_domain={{ domain }}">{{ domain }}</a>
|
||||
<a target="_blank" href="{{ url_for('hiddenServices.show_domain') }}?domain={{ domain }}">{{ domain }}</a>
|
||||
<div>
|
||||
{% for tag in domain_metadata[domain]['tags'] %}
|
||||
<a href="{{ url_for('Tags.get_tagged_paste') }}?ltags={{ tag }}">
|
||||
|
|
Loading…
Reference in New Issue