# mirror of https://github.com/CIRCL/lookyloo
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
|
|
import argparse
|
|
import logging
|
|
|
|
from lookyloo.lookyloo import Lookyloo, Indexing
|
|
|
|
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
|
|
level=logging.INFO, datefmt='%I:%M:%S')
|
|
|
|
|
|
def main():
    """Rebuild the redis capture cache and all search indexes.

    With ``--rebuild_pickles``, also deletes and regenerates every pickled
    tree first (slow: roughly 20s per capture).
    """
    parser = argparse.ArgumentParser(description='Rebuild the redis cache.')
    parser.add_argument('--rebuild_pickles', default=False, action='store_true', help='Delete and rebuild the pickles. Count 20s/pickle, it can take a very long time.')
    args = parser.parse_args()

    lookyloo = Lookyloo()
    if args.rebuild_pickles:
        lookyloo.rebuild_all()
    else:
        lookyloo.rebuild_cache()

    # Indexes are rebuilt from scratch: clear everything, then re-index each capture.
    indexing = Indexing()
    indexing.clear_indexes()
    for capture_uuid in lookyloo.capture_uuids:
        index = True
        try:
            tree = lookyloo.get_crawled_tree(capture_uuid)
        except Exception:
            # A single broken capture must not abort the whole rebuild.
            # Use the configured logger (with traceback) instead of a bare print.
            logging.exception('Unable to rebuild the tree for %s', capture_uuid)
            continue

        if lookyloo.is_public_instance:
            cache = lookyloo.capture_cache(capture_uuid)
            if not cache:
                continue
            if cache.no_index is not None:
                # The capture owner opted out of indexing on a public instance.
                index = False

        # NOTE: these methods do nothing if we just generated the pickle when calling lookyloo.get_crawled_tree
        if index:
            indexing.index_cookies_capture(tree)
            indexing.index_body_hashes_capture(tree)
            indexing.index_url_capture(tree)
            categories = list(lookyloo.categories_capture(capture_uuid).keys())
            indexing.index_categories_capture(capture_uuid, categories)
|
|
|
|
|
|
# Script entry point: run the rebuild only when executed directly, not on import.
if __name__ == '__main__':
    main()
|