lookyloo/tools/create_archive_indexes/create_indexes.py

#!/usr/bin/env python3

import csv
import json
import os
import sys

from datetime import date
from pathlib import Path
from typing import Dict, List

import s3fs  # type: ignore

# Load the generic configuration file, relative to this script's location.
configfile = Path(os.path.realpath(__file__)).parent.parent.parent / 'config' / 'generic.json'
with configfile.open() as f:
    config = json.load(f)

if not config.get('s3fs') or not config['s3fs'].get('archive_on_s3fs'):
    print('The archive is not on s3fs, nothing to do.')
    sys.exit()

s3fs_config = config['s3fs']['config']
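
# Connect to S3 with generous timeouts: listing and reading large archive
# directories on remote object storage can take a while.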
s3 = s3fs.S3FileSystem(key=s3fs_config['key'], secret=s3fs_config['secret'],
                       endpoint_url=s3fs_config['endpoint_url'],
                       config_kwargs={'connect_timeout': 10, 'read_timeout': 900})

bucket_name = s3fs_config['bucket_name']
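
# Drop any incomplete multipart uploads left over from interrupted runs
# before touching the bucket.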
s3.clear_multipart_uploads(bucket_name)


def _make_dirs_list(root_dir: str) -> List[str]:
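    """List every existing <root>/<year>/<month> directory, newest year first."""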
    directories = []
    year_now = date.today().year
    while True:
        year_dir = f'{root_dir}/{year_now}'
        if not s3.exists(year_dir):
            # If there is no directory for this year, quit the loop.
            break
        for month in range(12, 0, -1):
            month_dir = f'{year_dir}/{month:02}'
            if s3.exists(month_dir):
                directories.append(month_dir)
        year_now -= 1
    return directories


archives_directories = _make_dirs_list(bucket_name)
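
# For each month directory, (re)build its `index` file: a CSV mapping the
# UUID of every capture to the name of the directory containing it.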
for directory in archives_directories:
    print(f'Processing {directory}')
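    # Invalidate the cached listing so we see the current state of the bucket.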
    s3.invalidate_cache(directory)
    print('Cache invalidated')
    all_captures = s3.ls(directory, detail=False, refresh=True)
    if not all_captures:
        print('No captures in directory')
        continue
    print(f'{directory} contains {len(all_captures)} captures')
    index_file = f'{directory}/index'
    current_index: Dict[str, str] = {}
    print(f'Processing {index_file}')
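    # Load the existing index, if there is one, skipping malformed rows.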
    if s3.exists(index_file):
        with s3.open(index_file, 'r') as _f:
            current_index = {uuid: dirname for uuid, dirname in csv.reader(_f)
                             if uuid and dirname}
    print(f'Done with {index_file}, has {len(current_index)} entries')
    current_index_dirs = set(current_index.values())
    new_captures = 0
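    # Add every capture directory that is not in the index yet; each capture
    # directory is expected to contain a `uuid` file.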
    for existing_capture in all_captures:
        capture_dir = existing_capture.rsplit('/', 1)[-1]
        if not capture_dir or capture_dir == 'index':
            continue
        if capture_dir not in current_index_dirs:
            print(f'New: {existing_capture}')
            uuid_path = f'{existing_capture}/uuid'
            if s3.exists(uuid_path):
                uuid = s3.read_text(uuid_path)
                print(uuid)
                current_index[uuid] = capture_dir
                new_captures += 1
            else:
                print(f'Does not exist: {uuid_path}')
    if not new_captures:
        print(f'No new captures in {directory}')
        continue
    print(f'Updating {index_file} with {new_captures} new captures.')
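    # Rewrite the complete index, old and new entries alike, as uuid,dirname rows.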
    with s3.open(index_file, 'w') as _f:
        index_writer = csv.writer(_f)
        for uuid, dirname in current_index.items():
            index_writer.writerow([uuid, dirname])
    print(f'Done updating {index_file}')