mirror of https://github.com/CIRCL/lookyloo

fix: gracefully handle empty lists in hset, and duplicate UUIDs

parent fd9325bb0d
commit 855485984f
@@ -5,6 +5,7 @@ import logging.config
 import os
 import shutil
 
+from pathlib import Path
 from typing import Optional
 
 from lookyloo.default import AbstractManager, get_config
@@ -59,6 +60,18 @@ class BackgroundIndexer(AbstractManager):
             if not self.lookyloo.redis.hexists('lookup_dirs', uuid):
                 # The capture with this UUID exists, but it is for some reason missing in lookup_dirs
                 self.lookyloo.redis.hset('lookup_dirs', uuid, str(uuid_path.parent))
+            else:
+                cached_path = Path(self.lookyloo.redis.hget('lookup_dirs', uuid))
+                if cached_path != uuid_path.parent:
+                    # we have a duplicate UUID, it is probably related to some bad copy/paste
+                    if cached_path.exists():
+                        # Both paths exist, move the one that isn't in lookup_dirs
+                        self.logger.critical(f'Duplicate UUID for {uuid} in {cached_path} and {uuid_path.parent}, discarding the latest')
+                        shutil.move(str(uuid_path.parent), str(self.discarded_captures_dir / uuid_path.parent.name))
+                        continue
+                    else:
+                        # The path in lookup_dirs for that UUID doesn't exist, just update it.
+                        self.lookyloo.redis.hset('lookup_dirs', uuid, str(uuid_path.parent))
 
             try:
                 self.logger.info(f'Build pickle for {uuid}: {uuid_path.parent.name}')

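Below is a minimal, self-contained sketch of the duplicate-UUID handling added in the hunk above. A plain dict stands in for the Redis lookup_dirs hash and register_capture() is a hypothetical helper, so this only illustrates the decision logic, not Lookyloo's actual classes.

# Sketch only: a dict stands in for the Redis 'lookup_dirs' hash; the helper is hypothetical.
import shutil
from pathlib import Path
from typing import Dict


def register_capture(lookup_dirs: Dict[str, str], uuid: str, uuid_path: Path,
                     discarded_captures_dir: Path) -> None:
    if uuid not in lookup_dirs:
        # The capture exists on disk but is missing from the index: add it.
        lookup_dirs[uuid] = str(uuid_path.parent)
        return
    cached_path = Path(lookup_dirs[uuid])
    if cached_path == uuid_path.parent:
        # Already indexed with the same directory, nothing to do.
        return
    if cached_path.exists():
        # Two directories claim the same UUID: keep the indexed one, move the other aside.
        shutil.move(str(uuid_path.parent), str(discarded_captures_dir / uuid_path.parent.name))
    else:
        # The indexed directory is gone: point the index at the surviving one.
        lookup_dirs[uuid] = str(uuid_path.parent)
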
@@ -76,9 +76,9 @@ class Indexing():
                 if name not in already_cleaned_up:
                     # We only run this srem once per name for a capture,
                     # before adding it for the first time
-                    pipeline.srem(f'cn|{name}|captures',
-                                  *[key for key in self.redis.sscan_iter(f'cn|{name}|captures',
-                                                                         f'{crawled_tree.uuid}|*')])
+                    to_remove = [key for key in self.redis.sscan_iter(f'cn|{name}|captures', f'{crawled_tree.uuid}|*')]
+                    if to_remove:
+                        pipeline.srem(f'cn|{name}|captures', *to_remove)
                     already_cleaned_up.add(name)
                 pipeline.sadd(f'cn|{name}|captures', f'{crawled_tree.uuid}|{urlnode.uuid}')
         pipeline.execute()
@@ -255,9 +255,9 @@ class Indexing():
             if urlnode.hhhash not in already_cleaned_up:
                 # We only run this srem once per name for a capture,
                 # before adding it for the first time
-                pipeline.srem(f'hhhashes|{urlnode.hhhash}|captures',
-                              *[key for key in self.redis.sscan_iter(f'hhhashes|{urlnode.hhhash}|captures',
-                                                                     f'{crawled_tree.uuid}|*')])
+                to_remove = [key for key in self.redis.sscan_iter(f'hhhashes|{urlnode.hhhash}|captures', f'{crawled_tree.uuid}|*')]
+                if to_remove:
+                    pipeline.srem(f'hhhashes|{urlnode.hhhash}|captures', *to_remove)
                 already_cleaned_up.add(urlnode.hhhash)
             pipeline.sadd(f'hhhashes|{urlnode.hhhash}|captures', f'{crawled_tree.uuid}|{urlnode.uuid}')
         pipeline.execute()

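The two hunks above apply the same guard: SREM requires at least one member, so unpacking an empty sscan_iter result into pipeline.srem() builds a command that fails once the pipeline executes. Here is a short sketch of the pattern, assuming redis-py and a Redis reachable on localhost:6379; the key and match pattern are illustrative, not the ones Lookyloo uses.

# Sketch of the guard; assumes redis-py and a local Redis at localhost:6379.
import redis

r = redis.Redis()
key = 'example|captures'            # illustrative key name
match = 'some-capture-uuid|*'       # illustrative match pattern

pipeline = r.pipeline()
to_remove = [member for member in r.sscan_iter(key, match)]
if to_remove:
    # Only issue SREM when there is something to remove: with an empty
    # argument list the command would be rejected when the pipeline runs.
    pipeline.srem(key, *to_remove)
pipeline.sadd(key, 'some-capture-uuid|some-node-uuid')
pipeline.execute()
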
@@ -136,27 +136,6 @@ files = [
 [package.extras]
 dev = ["black", "coverage", "isort", "pre-commit", "pyenchant", "pylint"]
 
-[[package]]
-name = "anyio"
-version = "3.7.1"
-description = "High level compatibility layer for multiple asynchronous event loop implementations"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "anyio-3.7.1-py3-none-any.whl", hash = "sha256:91dee416e570e92c64041bd18b900d1d6fa78dff7048769ce5ac5ddad004fbb5"},
-    {file = "anyio-3.7.1.tar.gz", hash = "sha256:44a3c9aba0f5defa43261a8b3efb97891f2bd7d804e0e1f56419befa1adfc780"},
-]
-
-[package.dependencies]
-exceptiongroup = {version = "*", markers = "python_version < \"3.11\""}
-idna = ">=2.8"
-sniffio = ">=1.1"
-
-[package.extras]
-doc = ["Sphinx", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme (>=1.2.2)", "sphinxcontrib-jquery"]
-test = ["anyio[trio]", "coverage[toml] (>=4.5)", "hypothesis (>=4.0)", "mock (>=4)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"]
-trio = ["trio (<0.22)"]
-
 [[package]]
 name = "appnope"
 version = "0.1.3"
@@ -612,22 +591,18 @@ dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "sphinx (<2)", "tox"]
 
 [[package]]
 name = "dnspython"
-version = "2.4.0"
+version = "2.4.1"
 description = "DNS toolkit"
 optional = false
 python-versions = ">=3.8,<4.0"
 files = [
-    {file = "dnspython-2.4.0-py3-none-any.whl", hash = "sha256:46b4052a55b56beea3a3bdd7b30295c292bd6827dd442348bc116f2d35b17f0a"},
-    {file = "dnspython-2.4.0.tar.gz", hash = "sha256:758e691dbb454d5ccf4e1b154a19e52847f79e21a42fef17b969144af29a4e6c"},
+    {file = "dnspython-2.4.1-py3-none-any.whl", hash = "sha256:5b7488477388b8c0b70a8ce93b227c5603bc7b77f1565afe8e729c36c51447d7"},
+    {file = "dnspython-2.4.1.tar.gz", hash = "sha256:c33971c79af5be968bb897e95c2448e11a645ee84d93b265ce0b7aabe5dfdca8"},
 ]
 
-[package.dependencies]
-httpcore = {version = ">=0.17.3", markers = "python_version >= \"3.8\""}
-sniffio = ">=1.1,<2.0"
-
 [package.extras]
 dnssec = ["cryptography (>=2.6,<42.0)"]
-doh = ["h2 (>=4.1.0)", "httpx (>=0.24.1)"]
+doh = ["h2 (>=4.1.0)", "httpcore (>=0.17.3)", "httpx (>=0.24.1)"]
 doq = ["aioquic (>=0.9.20)"]
 idna = ["idna (>=2.1,<4.0)"]
 trio = ["trio (>=0.14,<0.23)"]
@@ -643,20 +618,6 @@ files = [
     {file = "ete3-3.1.3.tar.gz", hash = "sha256:06a3b7fa8ed90187b076a8dbbe5b1b62acee94201d3c6e822f55f449601ef6f2"},
 ]
 
-[[package]]
-name = "exceptiongroup"
-version = "1.1.2"
-description = "Backport of PEP 654 (exception groups)"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "exceptiongroup-1.1.2-py3-none-any.whl", hash = "sha256:e346e69d186172ca7cf029c8c1d16235aa0e04035e5750b4b95039e65204328f"},
-    {file = "exceptiongroup-1.1.2.tar.gz", hash = "sha256:12c3e887d6485d16943a309616de20ae5582633e0a2eda17f4e10fd61c1e8af5"},
-]
-
-[package.extras]
-test = ["pytest (>=6)"]
-
 [[package]]
 name = "executing"
 version = "1.2.0"
@@ -945,17 +906,6 @@ gevent = ["gevent (>=1.4.0)"]
 setproctitle = ["setproctitle"]
 tornado = ["tornado (>=0.2)"]
 
-[[package]]
-name = "h11"
-version = "0.14.0"
-description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"},
-    {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"},
-]
-
 [[package]]
 name = "har2tree"
 version = "1.21.2"
@@ -1080,27 +1030,6 @@ files = [
     {file = "hiredis-2.2.3.tar.gz", hash = "sha256:e75163773a309e56a9b58165cf5a50e0f84b755f6ff863b2c01a38918fe92daa"},
 ]
 
-[[package]]
-name = "httpcore"
-version = "0.17.3"
-description = "A minimal low-level HTTP client."
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "httpcore-0.17.3-py3-none-any.whl", hash = "sha256:c2789b767ddddfa2a5782e3199b2b7f6894540b17b16ec26b2c4d8e103510b87"},
-    {file = "httpcore-0.17.3.tar.gz", hash = "sha256:a6f30213335e34c1ade7be6ec7c47f19f50c56db36abef1a9dfa3815b1cb3888"},
-]
-
-[package.dependencies]
-anyio = ">=3.0,<5.0"
-certifi = "*"
-h11 = ">=0.13,<0.15"
-sniffio = "==1.*"
-
-[package.extras]
-http2 = ["h2 (>=3,<5)"]
-socks = ["socksio (==1.*)"]
-
 [[package]]
 name = "idna"
 version = "3.4"
@@ -2663,17 +2592,6 @@ files = [
     {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
 ]
 
-[[package]]
-name = "sniffio"
-version = "1.3.0"
-description = "Sniff out which async library your code is running under"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "sniffio-1.3.0-py3-none-any.whl", hash = "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"},
-    {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"},
-]
-
 [[package]]
 name = "soupsieve"
 version = "2.4.1"
@@ -3228,4 +3146,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8,<3.12"
-content-hash = "594e85de86c9b95e5492786c7990ad9fd975be9c14e1c342eec9fa259a34fbb5"
+content-hash = "b84bbfd91bbf1959b14fdad2a9e7a6634bac4361278ff63fc4ecb1d1f11f1602"
@@ -48,7 +48,7 @@ vt-py = "^0.17.5"
 pyeupi = "^1.1"
 pysanejs = "^2.0.1"
 pylookyloo = "^1.21.0"
-dnspython = "^2.4.0"
+dnspython = "^2.4.1"
 pytaxonomies = "^1.5.0"
 pymisp = {version = "^2.4.173", extras = ["url", "fileobjects"]}
 Pillow = "^10.0.0"
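The poetry.lock hunks above follow from the dnspython bump: 2.4.1 drops the hard httpcore dependency (httpcore moves into the doh extra), which is presumably why anyio, exceptiongroup, h11, httpcore and sniffio fall out of the lock file. A quick way to check that an environment picked up the new release:

# Prints the installed dnspython version; "2.4.1" is expected with the lock file above.
import dns.version

print(dns.version.version)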