chg: [DB] update items tags metadata

pull/342/head
Terrtia 2019-04-11 11:58:06 +02:00
parent e83174327a
commit 3cc614a1ad
No known key found for this signature in database
GPG Key ID: 1E1B1F50D84613D0
6 changed files with 99 additions and 42 deletions

View File

@ -59,6 +59,8 @@ class HiddenServices(object):
db=cfg.getint("ARDB_Metadata", "db"),
decode_responses=True)
self.PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) + '/'
self.domain = domain
self.type = type
self.port = port
@ -76,9 +78,16 @@ class HiddenServices(object):
## TODO: # FIXME: add error
pass
def remove_absolute_path_link(self, key, value):
print(key)
print(value)
#def remove_absolute_path_link(self, key, value):
# print(key)
# print(value)
def update_item_path_children(self, key, children):
if self.PASTES_FOLDER in children:
self.r_serv_metadata.srem(key, children)
children = children.replace(self.PASTES_FOLDER, '', 1)
self.r_serv_metadata.sadd(key, children)
return children
def get_origin_paste_name(self):
origin_item = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'paste_parent')
@ -106,7 +115,6 @@ class HiddenServices(object):
# need to remove it
else:
p_tags = self.r_serv_metadata.smembers('tag:{}'.format(os.path.join(self.paste_directory, item)))
print(p_tags)
for tag in p_tags:
self.tags[tag] = self.tags.get(tag, 0) + 1
@ -158,8 +166,10 @@ class HiddenServices(object):
if father is None:
return []
l_crawled_pastes = []
paste_childrens = self.r_serv_metadata.smembers('paste_children:{}'.format(father))
key = 'paste_children:{}'.format(father)
paste_childrens = self.r_serv_metadata.smembers(key)
for children in paste_childrens:
children = self.update_item_path_children(key, children)
if self.domain in children:
l_crawled_pastes.append(children)
self.update_domain_tags(children)
@ -174,8 +184,8 @@ class HiddenServices(object):
else:
key = os.path.join(self.paste_directory, item)
link = self.r_serv_metadata.hget('paste_metadata:{}'.format(key), 'real_link')
if link:
self.remove_absolute_path_link(key, link)
#if link:
#self.remove_absolute_path_link(key, link)
return link

View File

@ -85,19 +85,19 @@ if __name__ == '__main__':
item_path = item_path.replace(PASTES_FOLDER, '', 1)
new_item_metadata = 'paste_metadata:{}'.format(item_path)
## TODO: catch error
r_serv_metadata.rename(old_item_metadata, new_item_metadata)
res = r_serv_metadata.renamenx(old_item_metadata, new_item_metadata)
# update domain port
domain = r_serv_metadata.hget('paste_metadata:{}'.format(item_path), 'domain')
domain = r_serv_metadata.hget(new_item_metadata, 'domain')
if domain:
r_serv_metadata.hset('paste_metadata:{}'.format(item_path), 'domain', '{}:80'.format(domain))
super_father = r_serv_metadata.hget('paste_metadata:{}'.format(item_path), 'super_father')
r_serv_metadata.hset(new_item_metadata, 'domain', '{}:80'.format(domain))
super_father = r_serv_metadata.hget(new_item_metadata, 'super_father')
if super_father:
if PASTES_FOLDER in super_father:
r_serv_metadata.hset('paste_metadata:{}'.format(item_path), 'super_father', super_father.replace(PASTES_FOLDER, '', 1))
father = r_serv_metadata.hget('paste_metadata:{}'.format(item_path), 'father')
r_serv_metadata.hset(new_item_metadata, 'super_father', super_father.replace(PASTES_FOLDER, '', 1))
father = r_serv_metadata.hget(new_item_metadata, 'father')
if father:
if PASTES_FOLDER in father:
r_serv_metadata.hset('paste_metadata:{}'.format(item_path), 'father', father.replace(PASTES_FOLDER, '', 1))
r_serv_metadata.hset(new_item_metadata, 'father', father.replace(PASTES_FOLDER, '', 1))

View File

@ -47,12 +47,6 @@ if __name__ == '__main__':
db=cfg.getint("ARDB_Onion", "db"),
decode_responses=True)
r_serv_onion = redis.StrictRedis(
host=cfg.get("ARDB_Onion", "host"),
port=cfg.getint("ARDB_Onion", "port"),
db=cfg.getint("ARDB_Onion", "db"),
decode_responses=True)
r_important_paste_2018 = redis.StrictRedis(
host=cfg.get("ARDB_Metadata", "host"),
port=cfg.getint("ARDB_Metadata", "port"),
@ -123,24 +117,6 @@ if __name__ == '__main__':
r_important_paste_2018.flushdb()
r_important_paste_2019.flushdb()
#update item metadata tags
tag_not_updated = True
total_to_update = r_serv_tag.scard('maj:v1.5:absolute_path_to_rename')
nb_updated = 0
while tag_not_updated:
item_path = r_serv_tag.spop('maj:v1.5:absolute_path_to_rename')
old_tag_item_key = 'tag:{}'.format(item_path)
new_item_path = item_path.replace(PASTES_FOLDER, '', 1)
new_tag_item_key = 'tag:{}'.format(new_item_path)
res = r_serv_metadata.renamenx(old_tag_item_key, new_tag_item_key)
if res == 0:
tags_key_fusion(old_tag_item_key, new_tag_item_key)
nb_updated += 1
if r_serv_tag.scard('maj:v1.5:absolute_path_to_rename') == 0:
tag_not_updated = false
else:
print('{}/{} Tags updated'.format(nb_updated, total_to_update))
end = time.time()

View File

@ -0,0 +1,70 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import os
import sys
import time
import redis
import configparser
def tags_key_fusion(old_item_path_key, new_item_path_key):
print('fusion:')
print(old_item_path_key)
print(new_item_path_key)
for tag in r_serv_metadata.smembers(old_item_path_key):
r_serv_metadata.sadd(new_item_path_key, tag)
r_serv_metadata.srem(old_item_path_key, tag)
if __name__ == '__main__':
start_deb = time.time()
configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
if not os.path.exists(configfile):
raise Exception('Unable to find the configuration file. \
Did you set environment variables? \
Or activate the virtualenv.')
cfg = configparser.ConfigParser()
cfg.read(configfile)
PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) + '/'
r_serv_metadata = redis.StrictRedis(
host=cfg.get("ARDB_Metadata", "host"),
port=cfg.getint("ARDB_Metadata", "port"),
db=cfg.getint("ARDB_Metadata", "db"),
decode_responses=True)
r_serv_tag = redis.StrictRedis(
host=cfg.get("ARDB_Tags", "host"),
port=cfg.getint("ARDB_Tags", "port"),
db=cfg.getint("ARDB_Tags", "db"),
decode_responses=True)
print('Updating ARDB_Tags ...')
start = time.time()
#update item metadata tags
tag_not_updated = True
total_to_update = r_serv_tag.scard('maj:v1.5:absolute_path_to_rename')
nb_updated = 0
if total_to_update > 0:
while tag_not_updated:
item_path = r_serv_tag.spop('maj:v1.5:absolute_path_to_rename')
old_tag_item_key = 'tag:{}'.format(item_path)
new_item_path = item_path.replace(PASTES_FOLDER, '', 1)
new_tag_item_key = 'tag:{}'.format(new_item_path)
res = r_serv_metadata.renamenx(old_tag_item_key, new_tag_item_key)
if res == 0:
tags_key_fusion(old_tag_item_key, new_tag_item_key)
nb_updated += 1
if r_serv_tag.scard('maj:v1.5:absolute_path_to_rename') == 0:
tag_not_updated = False
else:
progress = int((nb_updated * 100) /total_to_update)
print('{}/{} Tags updated {}%'.format(nb_updated, total_to_update, progress))
end = time.time()
print('Updating ARDB_Tags Done: {} s'.format(end - start))

View File

@ -175,7 +175,6 @@ def get_crawler_splash_status(type):
return crawler_metadata
def create_crawler_config(mode, service_type, crawler_config, domain):
print(crawler_config)
if mode == 'manual':
r_cache.set('crawler_config:{}:{}:{}'.format(mode, service_type, domain), json.dumps(crawler_config))
elif mode == 'auto':
@ -559,8 +558,10 @@ def show_domain():
h = HiddenServices(domain, type, port=port)
item_core = h.get_domain_crawled_core_item(epoch=epoch)
epoch = item_core['epoch']
l_pastes = h.get_last_crawled_pastes(item_root=item_core['root_item'])
if item_core:
l_pastes = h.get_last_crawled_pastes(item_root=item_core['root_item'])
else:
l_pastes = []
dict_links = h.get_all_links(l_pastes)
if l_pastes:
status = True

View File

@ -63,7 +63,7 @@
{% for domain in domains_by_day[date] %}
<tr>
<td>
<a target="_blank" href="{{ url_for('hiddenServices.onion_domain') }}?onion_domain={{ domain }}">{{ domain }}</a>
<a target="_blank" href="{{ url_for('hiddenServices.show_domain') }}?domain={{ domain }}">{{ domain }}</a>
<div>
{% for tag in domain_metadata[domain]['tags'] %}
<a href="{{ url_for('Tags.get_tagged_paste') }}?ltags={{ tag }}">