fix: gracefully handle empty lists in hset, and duplicate UUIDs

pull/746/head
Raphaël Vinot 2023-07-26 22:16:00 +02:00
parent fd9325bb0d
commit 855485984f
4 changed files with 25 additions and 94 deletions

bin/background_indexer.py

@@ -5,6 +5,7 @@ import logging.config
 import os
 import shutil
+from pathlib import Path
 from typing import Optional

 from lookyloo.default import AbstractManager, get_config
@@ -59,6 +60,18 @@ class BackgroundIndexer(AbstractManager):
             if not self.lookyloo.redis.hexists('lookup_dirs', uuid):
                 # The capture with this UUID exists, but it is for some reason missing in lookup_dirs
                 self.lookyloo.redis.hset('lookup_dirs', uuid, str(uuid_path.parent))
+            else:
+                cached_path = Path(self.lookyloo.redis.hget('lookup_dirs', uuid))
+                if cached_path != uuid_path.parent:
+                    # We have a duplicate UUID, probably caused by a bad copy/paste
+                    if cached_path.exists():
+                        # Both paths exist, move the one that isn't in lookup_dirs
+                        self.logger.critical(f'Duplicate UUID for {uuid} in {cached_path} and {uuid_path.parent}, discarding the latest')
+                        shutil.move(str(uuid_path.parent), str(self.discarded_captures_dir / uuid_path.parent.name))
+                        continue
+                    else:
+                        # The path in lookup_dirs for that UUID doesn't exist, just update it.
+                        self.lookyloo.redis.hset('lookup_dirs', uuid, str(uuid_path.parent))
             try:
                 self.logger.info(f'Build pickle for {uuid}: {uuid_path.parent.name}')
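The new else branch boils down to the reconciliation logic below. This is a minimal standalone sketch, not the project's code: reconcile_uuid and discarded_dir are hypothetical names, and it assumes the Redis client was created with decode_responses=True so that hget returns str (Path() rejects bytes).

import shutil
from pathlib import Path

import redis


def reconcile_uuid(r: redis.Redis, uuid: str, uuid_path: Path, discarded_dir: Path) -> bool:
    """Keep the 'lookup_dirs' hash and the on-disk capture directories consistent.

    Returns False when the capture directory was discarded as a duplicate.
    """
    if not r.hexists('lookup_dirs', uuid):
        # The capture exists on disk but is missing from the index: register it.
        r.hset('lookup_dirs', uuid, str(uuid_path.parent))
        return True
    cached_path = Path(r.hget('lookup_dirs', uuid))
    if cached_path == uuid_path.parent:
        # Index and disk agree, nothing to do.
        return True
    if cached_path.exists():
        # Two directories claim the same UUID: keep the indexed one and
        # move the newcomer out of the way.
        shutil.move(str(uuid_path.parent), str(discarded_dir / uuid_path.parent.name))
        return False
    # The indexed path no longer exists: point the index at the one that does.
    r.hset('lookup_dirs', uuid, str(uuid_path.parent))
    return True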

lookyloo/indexing.py

@@ -76,9 +76,9 @@ class Indexing():
             if name not in already_cleaned_up:
                 # We only run this srem once per name for a capture,
                 # before adding it for the first time
-                pipeline.srem(f'cn|{name}|captures',
-                              *[key for key in self.redis.sscan_iter(f'cn|{name}|captures',
-                                                                     f'{crawled_tree.uuid}|*')])
+                to_remove = [key for key in self.redis.sscan_iter(f'cn|{name}|captures', f'{crawled_tree.uuid}|*')]
+                if to_remove:
+                    pipeline.srem(f'cn|{name}|captures', *to_remove)
                 already_cleaned_up.add(name)
             pipeline.sadd(f'cn|{name}|captures', f'{crawled_tree.uuid}|{urlnode.uuid}')
         pipeline.execute()
@@ -255,9 +255,9 @@ class Indexing():
             if urlnode.hhhash not in already_cleaned_up:
                 # We only run this srem once per name for a capture,
                 # before adding it for the first time
-                pipeline.srem(f'hhhashes|{urlnode.hhhash}|captures',
-                              *[key for key in self.redis.sscan_iter(f'hhhashes|{urlnode.hhhash}|captures',
-                                                                     f'{crawled_tree.uuid}|*')])
+                to_remove = [key for key in self.redis.sscan_iter(f'hhhashes|{urlnode.hhhash}|captures', f'{crawled_tree.uuid}|*')]
+                if to_remove:
+                    pipeline.srem(f'hhhashes|{urlnode.hhhash}|captures', *to_remove)
                 already_cleaned_up.add(urlnode.hhhash)
             pipeline.sadd(f'hhhashes|{urlnode.hhhash}|captures', f'{crawled_tree.uuid}|{urlnode.uuid}')
         pipeline.execute()
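The guard matters because redis-py expands the varargs as-is: with an empty list the queued SREM carries a key but no members, which Redis rejects once the pipeline executes. A small sketch of the failure mode and of the fix; the key name is made up for illustration, and the exact exception may vary across redis-py versions.

import redis

r = redis.Redis(decode_responses=True)
pipeline = r.pipeline()

to_remove = []  # pretend sscan_iter matched nothing for this capture

# Before the patch: with an empty list, SREM is queued with no members
# and the server rejects it when the pipeline runs.
# pipeline.srem('cn|example.com|captures', *to_remove)
# pipeline.execute()  # raises redis.exceptions.ResponseError

# After the patch: only queue the command when there is something to remove.
if to_remove:
    pipeline.srem('cn|example.com|captures', *to_remove)
pipeline.execute()  # an empty pipeline executes cleanly and returns []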

poetry.lock (generated, 92 lines changed). The dnspython bump from 2.4.0 to 2.4.1 drops that package's hard dependency on httpcore (it moves into the optional doh extra), so anyio, exceptiongroup, h11, httpcore and sniffio all fall out of the lock file.

@@ -136,27 +136,6 @@ files = [
 [package.extras]
 dev = ["black", "coverage", "isort", "pre-commit", "pyenchant", "pylint"]

-[[package]]
-name = "anyio"
-version = "3.7.1"
-description = "High level compatibility layer for multiple asynchronous event loop implementations"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "anyio-3.7.1-py3-none-any.whl", hash = "sha256:91dee416e570e92c64041bd18b900d1d6fa78dff7048769ce5ac5ddad004fbb5"},
-    {file = "anyio-3.7.1.tar.gz", hash = "sha256:44a3c9aba0f5defa43261a8b3efb97891f2bd7d804e0e1f56419befa1adfc780"},
-]
-
-[package.dependencies]
-exceptiongroup = {version = "*", markers = "python_version < \"3.11\""}
-idna = ">=2.8"
-sniffio = ">=1.1"
-
-[package.extras]
-doc = ["Sphinx", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme (>=1.2.2)", "sphinxcontrib-jquery"]
-test = ["anyio[trio]", "coverage[toml] (>=4.5)", "hypothesis (>=4.0)", "mock (>=4)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"]
-trio = ["trio (<0.22)"]
-
 [[package]]
 name = "appnope"
 version = "0.1.3"
@@ -612,22 +591,18 @@ dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "sphinx (<2)", "tox"]

 [[package]]
 name = "dnspython"
-version = "2.4.0"
+version = "2.4.1"
 description = "DNS toolkit"
 optional = false
 python-versions = ">=3.8,<4.0"
 files = [
-    {file = "dnspython-2.4.0-py3-none-any.whl", hash = "sha256:46b4052a55b56beea3a3bdd7b30295c292bd6827dd442348bc116f2d35b17f0a"},
-    {file = "dnspython-2.4.0.tar.gz", hash = "sha256:758e691dbb454d5ccf4e1b154a19e52847f79e21a42fef17b969144af29a4e6c"},
+    {file = "dnspython-2.4.1-py3-none-any.whl", hash = "sha256:5b7488477388b8c0b70a8ce93b227c5603bc7b77f1565afe8e729c36c51447d7"},
+    {file = "dnspython-2.4.1.tar.gz", hash = "sha256:c33971c79af5be968bb897e95c2448e11a645ee84d93b265ce0b7aabe5dfdca8"},
 ]

-[package.dependencies]
-httpcore = {version = ">=0.17.3", markers = "python_version >= \"3.8\""}
-sniffio = ">=1.1,<2.0"
-
 [package.extras]
 dnssec = ["cryptography (>=2.6,<42.0)"]
-doh = ["h2 (>=4.1.0)", "httpx (>=0.24.1)"]
+doh = ["h2 (>=4.1.0)", "httpcore (>=0.17.3)", "httpx (>=0.24.1)"]
 doq = ["aioquic (>=0.9.20)"]
 idna = ["idna (>=2.1,<4.0)"]
 trio = ["trio (>=0.14,<0.23)"]
@@ -643,20 +618,6 @@ files = [
     {file = "ete3-3.1.3.tar.gz", hash = "sha256:06a3b7fa8ed90187b076a8dbbe5b1b62acee94201d3c6e822f55f449601ef6f2"},
 ]

-[[package]]
-name = "exceptiongroup"
-version = "1.1.2"
-description = "Backport of PEP 654 (exception groups)"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "exceptiongroup-1.1.2-py3-none-any.whl", hash = "sha256:e346e69d186172ca7cf029c8c1d16235aa0e04035e5750b4b95039e65204328f"},
-    {file = "exceptiongroup-1.1.2.tar.gz", hash = "sha256:12c3e887d6485d16943a309616de20ae5582633e0a2eda17f4e10fd61c1e8af5"},
-]
-
-[package.extras]
-test = ["pytest (>=6)"]
-
 [[package]]
 name = "executing"
 version = "1.2.0"
@@ -945,17 +906,6 @@ gevent = ["gevent (>=1.4.0)"]
 setproctitle = ["setproctitle"]
 tornado = ["tornado (>=0.2)"]

-[[package]]
-name = "h11"
-version = "0.14.0"
-description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"},
-    {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"},
-]
-
 [[package]]
 name = "har2tree"
 version = "1.21.2"
@@ -1080,27 +1030,6 @@ files = [
     {file = "hiredis-2.2.3.tar.gz", hash = "sha256:e75163773a309e56a9b58165cf5a50e0f84b755f6ff863b2c01a38918fe92daa"},
 ]

-[[package]]
-name = "httpcore"
-version = "0.17.3"
-description = "A minimal low-level HTTP client."
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "httpcore-0.17.3-py3-none-any.whl", hash = "sha256:c2789b767ddddfa2a5782e3199b2b7f6894540b17b16ec26b2c4d8e103510b87"},
-    {file = "httpcore-0.17.3.tar.gz", hash = "sha256:a6f30213335e34c1ade7be6ec7c47f19f50c56db36abef1a9dfa3815b1cb3888"},
-]
-
-[package.dependencies]
-anyio = ">=3.0,<5.0"
-certifi = "*"
-h11 = ">=0.13,<0.15"
-sniffio = "==1.*"
-
-[package.extras]
-http2 = ["h2 (>=3,<5)"]
-socks = ["socksio (==1.*)"]
-
 [[package]]
 name = "idna"
 version = "3.4"
@@ -2663,17 +2592,6 @@ files = [
     {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
 ]

-[[package]]
-name = "sniffio"
-version = "1.3.0"
-description = "Sniff out which async library your code is running under"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "sniffio-1.3.0-py3-none-any.whl", hash = "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"},
-    {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"},
-]
-
 [[package]]
 name = "soupsieve"
 version = "2.4.1"
@@ -3228,4 +3146,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8,<3.12"
-content-hash = "594e85de86c9b95e5492786c7990ad9fd975be9c14e1c342eec9fa259a34fbb5"
+content-hash = "b84bbfd91bbf1959b14fdad2a9e7a6634bac4361278ff63fc4ecb1d1f11f1602"

pyproject.toml

@@ -48,7 +48,7 @@ vt-py = "^0.17.5"
 pyeupi = "^1.1"
 pysanejs = "^2.0.1"
 pylookyloo = "^1.21.0"
-dnspython = "^2.4.0"
+dnspython = "^2.4.1"
 pytaxonomies = "^1.5.0"
 pymisp = {version = "^2.4.173", extras = ["url", "fileobjects"]}
 Pillow = "^10.0.0"