new: Optimize the pickle and store it as gzip to reduce disk space.

pull/640/head
Raphaël Vinot 2023-03-15 18:02:33 +01:00
parent 9c10161866
commit 0b678e2db0
1 changed file with 30 additions and 15 deletions

View File

@ -1,10 +1,12 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import contextlib import contextlib
import gzip
import json import json
import logging import logging
import os import os
import pickle import pickle
import pickletools
import signal import signal
import sys import sys
import time import time
@ -87,21 +89,33 @@ def remove_pickle_tree(capture_dir: Path) -> None:
@lru_cache(maxsize=256) @lru_cache(maxsize=256)
def load_pickle_tree(capture_dir: Path, last_mod_time: int) -> CrawledTree: def load_pickle_tree(capture_dir: Path, last_mod_time: int) -> CrawledTree:
pickle_file = capture_dir / 'tree.pickle' pickle_file = capture_dir / 'tree.pickle'
if pickle_file.exists(): pickle_file_gz = capture_dir / 'tree.pickle.gz'
with pickle_file.open('rb') as _p: tree = None
try: try:
if pickle_file.exists():
with pickle_file.open('rb') as _p:
tree = pickle.load(_p) tree = pickle.load(_p)
if tree.root_hartree.har.path.exists(): elif pickle_file_gz.exists():
return tree with gzip.open(pickle_file_gz, 'rb') as _pg:
else: tree = pickle.load(_pg)
# The capture was moved. except pickle.UnpicklingError as e:
remove_pickle_tree(capture_dir) print(e)
except pickle.UnpicklingError: remove_pickle_tree(capture_dir)
remove_pickle_tree(capture_dir)
except EOFError: if tree:
remove_pickle_tree(capture_dir) try:
except Exception: if tree.root_hartree.har.path.exists():
return tree
else:
# The capture was moved.
remove_pickle_tree(capture_dir) remove_pickle_tree(capture_dir)
except EOFError as e:
print(e)
remove_pickle_tree(capture_dir)
except Exception as e:
print(e)
remove_pickle_tree(capture_dir)
if list(capture_dir.rglob('*.har')) or list(capture_dir.rglob('*.har.gz')): if list(capture_dir.rglob('*.har')) or list(capture_dir.rglob('*.har.gz')):
raise TreeNeedsRebuild('We have HAR files and need to rebuild the tree.') raise TreeNeedsRebuild('We have HAR files and need to rebuild the tree.')
# The tree doesn't need to be rebuilt if there are no HAR files. # The tree doesn't need to be rebuilt if there are no HAR files.
@ -265,14 +279,15 @@ class CapturesIndex(Mapping):
except RecursionError as e: except RecursionError as e:
raise NoValidHarFile(f'Tree too deep, probably a recursive refresh: {e}.\n Append /export to the URL to get the files.') raise NoValidHarFile(f'Tree too deep, probably a recursive refresh: {e}.\n Append /export to the URL to get the files.')
else: else:
with (capture_dir / 'tree.pickle').open('wb') as _p: with gzip.open(capture_dir / 'tree.pickle.gz', 'wb') as _p:
# Some pickles require a pretty high recursion limit, this kindof fixes it. # Some pickles require a pretty high recursion limit, this kindof fixes it.
# If the capture is really broken (generally a refresh to self), the capture # If the capture is really broken (generally a refresh to self), the capture
# is discarded in the RecursionError above. # is discarded in the RecursionError above.
default_recursion_limit = sys.getrecursionlimit() default_recursion_limit = sys.getrecursionlimit()
sys.setrecursionlimit(int(default_recursion_limit * 1.1)) sys.setrecursionlimit(int(default_recursion_limit * 1.1))
try: try:
pickle.dump(tree, _p, protocol=5) _p.write(pickletools.optimize(pickle.dumps(tree, protocol=5)))
# pickle.dump(tree, _p, protocol=5)
except RecursionError as e: except RecursionError as e:
raise NoValidHarFile(f'Tree too deep, probably a recursive refresh: {e}.\n Append /export to the URL to get the files.') raise NoValidHarFile(f'Tree too deep, probably a recursive refresh: {e}.\n Append /export to the URL to get the files.')
sys.setrecursionlimit(default_recursion_limit) sys.setrecursionlimit(default_recursion_limit)