From 855485984f53750d851005c5968969934c2a9a35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?= Date: Wed, 26 Jul 2023 22:16:00 +0200 Subject: [PATCH] fix: handle gracefully empty lists in hset, and duplicates UUIDs --- bin/background_indexer.py | 13 ++++++ lookyloo/indexing.py | 12 ++--- poetry.lock | 92 +++------------------------------------ pyproject.toml | 2 +- 4 files changed, 25 insertions(+), 94 deletions(-) diff --git a/bin/background_indexer.py b/bin/background_indexer.py index 821d0fd2..68edbf74 100755 --- a/bin/background_indexer.py +++ b/bin/background_indexer.py @@ -5,6 +5,7 @@ import logging.config import os import shutil +from pathlib import Path from typing import Optional from lookyloo.default import AbstractManager, get_config @@ -59,6 +60,18 @@ class BackgroundIndexer(AbstractManager): if not self.lookyloo.redis.hexists('lookup_dirs', uuid): # The capture with this UUID exists, but it is for some reason missing in lookup_dirs self.lookyloo.redis.hset('lookup_dirs', uuid, str(uuid_path.parent)) + else: + cached_path = Path(self.lookyloo.redis.hget('lookup_dirs', uuid)) + if cached_path != uuid_path.parent: + # we have a duplicate UUID, it is probably related to some bad copy/paste + if cached_path.exists(): + # Both paths exist, move the one that isn't in lookup_dirs + self.logger.critical(f'Duplicate UUID for {uuid} in {cached_path} and {uuid_path.parent}, discarding the latest') + shutil.move(str(uuid_path.parent), str(self.discarded_captures_dir / uuid_path.parent.name)) + continue + else: + # The path in lookup_dirs for that UUID doesn't exist, just update it.
+ self.lookyloo.redis.hset('lookup_dirs', uuid, str(uuid_path.parent)) try: self.logger.info(f'Build pickle for {uuid}: {uuid_path.parent.name}') diff --git a/lookyloo/indexing.py b/lookyloo/indexing.py index a3fef8d3..80d916c8 100644 --- a/lookyloo/indexing.py +++ b/lookyloo/indexing.py @@ -76,9 +76,9 @@ class Indexing(): if name not in already_cleaned_up: # We only run this srem once per name for a capture, # before adding it for the first time - pipeline.srem(f'cn|{name}|captures', - *[key for key in self.redis.sscan_iter(f'cn|{name}|captures', - f'{crawled_tree.uuid}|*')]) + to_remove = [key for key in self.redis.sscan_iter(f'cn|{name}|captures', f'{crawled_tree.uuid}|*')] + if to_remove: + pipeline.srem(f'cn|{name}|captures', *to_remove) already_cleaned_up.add(name) pipeline.sadd(f'cn|{name}|captures', f'{crawled_tree.uuid}|{urlnode.uuid}') pipeline.execute() @@ -255,9 +255,9 @@ class Indexing(): if urlnode.hhhash not in already_cleaned_up: # We only run this srem once per name for a capture, # before adding it for the first time - pipeline.srem(f'hhhashes|{urlnode.hhhash}|captures', - *[key for key in self.redis.sscan_iter(f'hhhashes|{urlnode.hhhash}|captures', - f'{crawled_tree.uuid}|*')]) + to_remove = [key for key in self.redis.sscan_iter(f'hhhashes|{urlnode.hhhash}|captures', f'{crawled_tree.uuid}|*')] + if to_remove: + pipeline.srem(f'hhhashes|{urlnode.hhhash}|captures', * to_remove) already_cleaned_up.add(urlnode.hhhash) pipeline.sadd(f'hhhashes|{urlnode.hhhash}|captures', f'{crawled_tree.uuid}|{urlnode.uuid}') pipeline.execute() diff --git a/poetry.lock b/poetry.lock index b0d1d596..6984b2a3 100644 --- a/poetry.lock +++ b/poetry.lock @@ -136,27 +136,6 @@ files = [ [package.extras] dev = ["black", "coverage", "isort", "pre-commit", "pyenchant", "pylint"] -[[package]] -name = "anyio" -version = "3.7.1" -description = "High level compatibility layer for multiple asynchronous event loop implementations" -optional = false -python-versions = ">=3.7" -files = 
[ - {file = "anyio-3.7.1-py3-none-any.whl", hash = "sha256:91dee416e570e92c64041bd18b900d1d6fa78dff7048769ce5ac5ddad004fbb5"}, - {file = "anyio-3.7.1.tar.gz", hash = "sha256:44a3c9aba0f5defa43261a8b3efb97891f2bd7d804e0e1f56419befa1adfc780"}, -] - -[package.dependencies] -exceptiongroup = {version = "*", markers = "python_version < \"3.11\""} -idna = ">=2.8" -sniffio = ">=1.1" - -[package.extras] -doc = ["Sphinx", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme (>=1.2.2)", "sphinxcontrib-jquery"] -test = ["anyio[trio]", "coverage[toml] (>=4.5)", "hypothesis (>=4.0)", "mock (>=4)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"] -trio = ["trio (<0.22)"] - [[package]] name = "appnope" version = "0.1.3" @@ -612,22 +591,18 @@ dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "sphinx (<2)", "tox"] [[package]] name = "dnspython" -version = "2.4.0" +version = "2.4.1" description = "DNS toolkit" optional = false python-versions = ">=3.8,<4.0" files = [ - {file = "dnspython-2.4.0-py3-none-any.whl", hash = "sha256:46b4052a55b56beea3a3bdd7b30295c292bd6827dd442348bc116f2d35b17f0a"}, - {file = "dnspython-2.4.0.tar.gz", hash = "sha256:758e691dbb454d5ccf4e1b154a19e52847f79e21a42fef17b969144af29a4e6c"}, + {file = "dnspython-2.4.1-py3-none-any.whl", hash = "sha256:5b7488477388b8c0b70a8ce93b227c5603bc7b77f1565afe8e729c36c51447d7"}, + {file = "dnspython-2.4.1.tar.gz", hash = "sha256:c33971c79af5be968bb897e95c2448e11a645ee84d93b265ce0b7aabe5dfdca8"}, ] -[package.dependencies] -httpcore = {version = ">=0.17.3", markers = "python_version >= \"3.8\""} -sniffio = ">=1.1,<2.0" - [package.extras] dnssec = ["cryptography (>=2.6,<42.0)"] -doh = ["h2 (>=4.1.0)", "httpx (>=0.24.1)"] +doh = ["h2 (>=4.1.0)", "httpcore (>=0.17.3)", "httpx (>=0.24.1)"] doq = ["aioquic (>=0.9.20)"] idna = ["idna (>=2.1,<4.0)"] trio = ["trio (>=0.14,<0.23)"] @@ -643,20 +618,6 @@ files = [ {file = "ete3-3.1.3.tar.gz", hash = 
"sha256:06a3b7fa8ed90187b076a8dbbe5b1b62acee94201d3c6e822f55f449601ef6f2"}, ] -[[package]] -name = "exceptiongroup" -version = "1.1.2" -description = "Backport of PEP 654 (exception groups)" -optional = false -python-versions = ">=3.7" -files = [ - {file = "exceptiongroup-1.1.2-py3-none-any.whl", hash = "sha256:e346e69d186172ca7cf029c8c1d16235aa0e04035e5750b4b95039e65204328f"}, - {file = "exceptiongroup-1.1.2.tar.gz", hash = "sha256:12c3e887d6485d16943a309616de20ae5582633e0a2eda17f4e10fd61c1e8af5"}, -] - -[package.extras] -test = ["pytest (>=6)"] - [[package]] name = "executing" version = "1.2.0" @@ -945,17 +906,6 @@ gevent = ["gevent (>=1.4.0)"] setproctitle = ["setproctitle"] tornado = ["tornado (>=0.2)"] -[[package]] -name = "h11" -version = "0.14.0" -description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" -optional = false -python-versions = ">=3.7" -files = [ - {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, - {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, -] - [[package]] name = "har2tree" version = "1.21.2" @@ -1080,27 +1030,6 @@ files = [ {file = "hiredis-2.2.3.tar.gz", hash = "sha256:e75163773a309e56a9b58165cf5a50e0f84b755f6ff863b2c01a38918fe92daa"}, ] -[[package]] -name = "httpcore" -version = "0.17.3" -description = "A minimal low-level HTTP client." 
-optional = false -python-versions = ">=3.7" -files = [ - {file = "httpcore-0.17.3-py3-none-any.whl", hash = "sha256:c2789b767ddddfa2a5782e3199b2b7f6894540b17b16ec26b2c4d8e103510b87"}, - {file = "httpcore-0.17.3.tar.gz", hash = "sha256:a6f30213335e34c1ade7be6ec7c47f19f50c56db36abef1a9dfa3815b1cb3888"}, -] - -[package.dependencies] -anyio = ">=3.0,<5.0" -certifi = "*" -h11 = ">=0.13,<0.15" -sniffio = "==1.*" - -[package.extras] -http2 = ["h2 (>=3,<5)"] -socks = ["socksio (==1.*)"] - [[package]] name = "idna" version = "3.4" @@ -2663,17 +2592,6 @@ files = [ {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, ] -[[package]] -name = "sniffio" -version = "1.3.0" -description = "Sniff out which async library your code is running under" -optional = false -python-versions = ">=3.7" -files = [ - {file = "sniffio-1.3.0-py3-none-any.whl", hash = "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"}, - {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, -] - [[package]] name = "soupsieve" version = "2.4.1" @@ -3228,4 +3146,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = ">=3.8,<3.12" -content-hash = "594e85de86c9b95e5492786c7990ad9fd975be9c14e1c342eec9fa259a34fbb5" +content-hash = "b84bbfd91bbf1959b14fdad2a9e7a6634bac4361278ff63fc4ecb1d1f11f1602" diff --git a/pyproject.toml b/pyproject.toml index ae28af55..d6d6f13f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,7 +48,7 @@ vt-py = "^0.17.5" pyeupi = "^1.1" pysanejs = "^2.0.1" pylookyloo = "^1.21.0" -dnspython = "^2.4.0" +dnspython = "^2.4.1" pytaxonomies = "^1.5.0" pymisp = {version = "^2.4.173", extras = ["url", "fileobjects"]} Pillow = "^10.0.0"