diff --git a/bin/archiver.py b/bin/archiver.py index c0106c87..bb8e9b6e 100755 --- a/bin/archiver.py +++ b/bin/archiver.py @@ -38,7 +38,9 @@ class Archiver(AbstractManager): self._archive() self._update_all_capture_indexes() self._load_indexes() - self._compress_hars() + # The HARs are supposedly all compressed so this call shouldn't be required + # unless you're processing old captures for the first time. + # self._compress_hars() def _update_index(self, root_dir: Path) -> None: current_index: Dict[str, str] = {} @@ -188,12 +190,23 @@ class Archiver(AbstractManager): p.delete(str(capture_path)) (capture_path / 'tree.pickle').unlink(missing_ok=True) (capture_path / 'tree.pickle.gz').unlink(missing_ok=True) + # If the HAR isn't compressed yet, compress it before moving the capture + for har in capture_path.glob('*.har'): + with har.open('rb') as f_in: + with gzip.open(f'{har}.gz', 'wb') as f_out: + shutil.copyfileobj(f_in, f_out) + har.unlink() shutil.move(str(capture_path), str(dest_dir)) p.execute() self.logger.info('Archiving done.') def _compress_hars(self): + """This method is very slow (it checks every single capture for non-compressed HARs). + The new approach is to compress the har of every capture by default so this shouldn't be + needed anymore. Keeping it here just for reference, or to process old archives that contain + non-gzipped HARs. 
+ """ self.logger.info('Compressing archived captures') for index in self.archived_captures_dir.glob('*/*/index'): if self.shutdown_requested(): @@ -201,9 +214,7 @@ class Archiver(AbstractManager): break with index.open('r') as _f: for uuid, dirname in csv.reader(_f): - for har in (index.parent / dirname).rglob('*.har'): - if not har.exists(): - continue + for har in (index.parent / dirname).glob('*.har'): with har.open('rb') as f_in: with gzip.open(f'{har}.gz', 'wb') as f_out: shutil.copyfileobj(f_in, f_out) diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py index 2f0dcef4..10b45b0c 100644 --- a/lookyloo/lookyloo.py +++ b/lookyloo/lookyloo.py @@ -2,6 +2,7 @@ import base64 import copy +import gzip import json import logging import operator @@ -1499,8 +1500,8 @@ class Lookyloo(): json.dump(error, _error) if har: - with (dirpath / '0.har').open('w') as _har: - json.dump(har, _har) + with gzip.open(dirpath / '0.har.gz', 'wt') as f_out: + f_out.write(json.dumps(har)) if png: with (dirpath / '0.png').open('wb') as _img: diff --git a/poetry.lock b/poetry.lock index 064599ff..7ce0ccda 100644 --- a/poetry.lock +++ b/poetry.lock @@ -166,13 +166,13 @@ test = ["astroid", "pytest"] [[package]] name = "async-timeout" -version = "4.0.2" +version = "4.0.3" description = "Timeout context manager for asyncio programs" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "async-timeout-4.0.2.tar.gz", hash = "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15"}, - {file = "async_timeout-4.0.2-py3-none-any.whl", hash = "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"}, + {file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"}, + {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"}, ] [[package]] @@ -908,13 +908,13 @@ tornado = ["tornado (>=0.2)"] [[package]] 
name = "har2tree" -version = "1.21.6" +version = "1.21.7" description = "HTTP Archive (HAR) to ETE Toolkit generator" optional = false python-versions = ">=3.8,<3.12" files = [ - {file = "har2tree-1.21.6-py3-none-any.whl", hash = "sha256:22e152634d1307a3a096a46581f218b71ca8d7f0aee558eff0aa5303c2130cf2"}, - {file = "har2tree-1.21.6.tar.gz", hash = "sha256:09e73fbbee97bab0da4e209332d1a017ae844a88d4d77896927a575f83a62926"}, + {file = "har2tree-1.21.7-py3-none-any.whl", hash = "sha256:ffe14f2d21c53bd95839e67682aff4ecf4bb624bfda0aa23857df6dfe5aa70fa"}, + {file = "har2tree-1.21.7.tar.gz", hash = "sha256:b03cd8c2ee7e954060c1109c08844c376db9d7fd0e2e2499bbdc5d9fe6879b7f"}, ] [package.dependencies] @@ -926,7 +926,7 @@ numpy = [ {version = "1.23.3", markers = "python_version < \"3.10\""}, {version = ">=1.23.4,<2.0.0", markers = "python_version >= \"3.10\""}, ] -publicsuffixlist = ">=0.10.0.20230804,<0.11.0.0" +publicsuffixlist = ">=0.10.0.20230811,<0.11.0.0" w3lib = ">=2.1.2,<3.0.0" [package.extras] @@ -1243,13 +1243,13 @@ referencing = ">=0.28.0" [[package]] name = "lacuscore" -version = "1.6.5" +version = "1.6.6" description = "Core of Lacus, usable as a module" optional = false python-versions = ">=3.8,<4.0" files = [ - {file = "lacuscore-1.6.5-py3-none-any.whl", hash = "sha256:373cde9898938521514d08e3a53e6b8d5c71dac59eaad1a067c33086fddbfd4f"}, - {file = "lacuscore-1.6.5.tar.gz", hash = "sha256:bc6da53bf58819e995db8f2ed33be032bcc9bac04e06c2f2d752cb30c6965e3a"}, + {file = "lacuscore-1.6.6-py3-none-any.whl", hash = "sha256:456914de415fba8bbaeef32446ff74703015f55ad3d79691d97515e8eb14dc66"}, + {file = "lacuscore-1.6.6.tar.gz", hash = "sha256:279087601cca7730c061e4188c27cc7f72246157bde8ecf5dc345aea69e1009e"}, ] [package.dependencies] @@ -1926,13 +1926,13 @@ files = [ [[package]] name = "publicsuffixlist" -version = "0.10.0.20230806" +version = "0.10.0.20230811" description = "publicsuffixlist implement" optional = false python-versions = ">=2.6" files = [ - {file = 
"publicsuffixlist-0.10.0.20230806-py2.py3-none-any.whl", hash = "sha256:c05dbd256d049d3fb94405e7e4a5215cffb39e7329471137e04b320d22cbf141"}, - {file = "publicsuffixlist-0.10.0.20230806.tar.gz", hash = "sha256:8c30ea7a0019386d144ca3db8751f757ee46acc194ea6d9619eb175041491c96"}, + {file = "publicsuffixlist-0.10.0.20230811-py2.py3-none-any.whl", hash = "sha256:cd84fcabb7d5bbca45ac3a1ea876cd8fac4d093705a827b20afbe34f5b2e70e6"}, + {file = "publicsuffixlist-0.10.0.20230811.tar.gz", hash = "sha256:0ba20a5fa7b9fe5c6dc787d978c6be212e53c962a1a417a2a5948c9d28a4c549"}, ] [package.extras] @@ -3154,4 +3154,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = ">=3.8,<3.12" -content-hash = "056a4bf984207efa7930a8af07e93d4fd1ce7751288b0047d64df1918ad92cb1" +content-hash = "3d380da51d6c302c872b3bec918b8c3581946d43e91ba047f0984e9325293cdb" diff --git a/pyproject.toml b/pyproject.toml index a220012e..0f797e33 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,15 +60,15 @@ pyhashlookup = "^1.2.1" lief = "^0.13.2" ua-parser = "^0.18.0" Flask-Login = "^0.6.2" -har2tree = "^1.21.6" +har2tree = "^1.21.7" passivetotal = "^2.5.9" werkzeug = "^2.3.6" filetype = "^1.2.0" pypandora = "^1.5.0" -lacuscore = "^1.6.5" +lacuscore = "^1.6.6" pylacus = "^1.6.1" pyipasnhistory = "^2.1.2" -publicsuffixlist = "^0.10.0.20230806" +publicsuffixlist = "^0.10.0.20230811" pyfaup = "^1.2" chardet = "^5.2.0" pysecuritytxt = "^1.1.1"