2020-06-29 11:59:01 +02:00
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
2020-07-08 02:25:15 +02:00
import argparse
2020-06-29 11:59:01 +02:00
import logging
2020-07-08 02:25:15 +02:00
2020-06-15 16:12:23 +02:00
from lookyloo . lookyloo import Lookyloo , Indexing
2020-06-29 11:59:01 +02:00
# Configure the root logger once, at import time: every record is prefixed
# with its timestamp, logger name and level. The time is rendered with a
# 12-hour clock (%I) as HH:MM:SS.
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
                    level=logging.INFO, datefmt='%I:%M:%S')
2020-10-03 21:19:43 +02:00
def main():
    """Rebuild the capture cache, then wipe and repopulate every index.

    Command line:
        --rebuild_pickles  Call ``Lookyloo.rebuild_all()`` instead of
                           ``Lookyloo.rebuild_cache()``.

    After the rebuild, all indexes are cleared and each capture UUID exposed
    by ``lookyloo.capture_uuids`` is re-indexed (cookies, body hashes, URLs
    and categories). A capture whose tree cannot be loaded is reported and
    skipped; the remaining captures are still processed.
    """
    parser = argparse.ArgumentParser(description='Rebuild the redis cache.')
    parser.add_argument('--rebuild_pickles', default=False, action='store_true',
                        help='Delete and rebuild the pickles. Count 20s/pickle, it can take a very long time.')
    args = parser.parse_args()

    logger = logging.getLogger(__name__)

    lookyloo = Lookyloo()
    if args.rebuild_pickles:
        lookyloo.rebuild_all()
    else:
        lookyloo.rebuild_cache()

    indexing = Indexing()
    # Start from a clean slate: everything is re-indexed in the loop below.
    indexing.clear_indexes()
    for capture_uuid in lookyloo.capture_uuids:
        index = True
        try:
            tree = lookyloo.get_crawled_tree(capture_uuid)
        except Exception as e:
            # Best effort: one broken capture must not abort the whole rebuild.
            logger.warning('Unable to get the crawled tree of capture %s: %s', capture_uuid, e)
            continue

        if lookyloo.is_public_instance:
            cache = lookyloo.capture_cache(capture_uuid)
            if not cache:
                # No cache entry for this capture: skip it entirely.
                continue
            # NOTE(review): this disables indexing whenever no_index is set to
            # anything but None. If no_index is a plain bool, the test is
            # always true and nothing gets indexed on a public instance --
            # confirm against the cache class and use `if cache.no_index:` if so.
            if cache.no_index is not None:
                index = False

        # NOTE: these methods do nothing if we just generated the pickle when calling lookyloo.get_crawled_tree
        if index:
            indexing.index_cookies_capture(tree)
            indexing.index_body_hashes_capture(tree)
            indexing.index_url_capture(tree)
            categories = list(lookyloo.categories_capture(capture_uuid).keys())
            indexing.index_categories_capture(capture_uuid, categories)
2020-10-03 21:19:43 +02:00
# Run the rebuild only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()