chg: More cleanup to use the new caching system as it should be.

pull/267/head
Raphaël Vinot 2021-09-27 11:36:27 +02:00
parent d05b1edf48
commit 681e136ef4
7 changed files with 54 additions and 91 deletions

lookyloo/capturecache.py

@@ -176,11 +176,11 @@ class CapturesIndex(Mapping):
             to_return = Path(capture_dir)
             if to_return.exists():
                 return to_return
-            self.redis.hdel('lookup_dirs_archived', uuid)
+            # The capture was removed, remove the UUID
+            self.redis.hdel('lookup_dirs_archived', uuid)
+            self.redis.delete(capture_dir)
             self.logger.warning(f'UUID ({uuid}) linked to a missing directory ({capture_dir}).')
             raise MissingCaptureDirectory(f'UUID ({uuid}) linked to a missing directory ({capture_dir}).')
         raise MissingUUID(f'Unable to find UUID {uuid}.')

     def _create_pickle(self, capture_dir: Path) -> CrawledTree:
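This change makes the archive lookup self-healing: when a UUID in the 'lookup_dirs_archived' hash points at a directory that no longer exists, both the hash entry and the per-directory key are dropped before raising. A minimal sketch of that cleanup, assuming redis-py and the key layout shown in the hunk (the socket path and exception types are illustrative):

from pathlib import Path

from redis import Redis

redis = Redis(unix_socket_path='cache.sock', decode_responses=True)  # socket path is illustrative

def resolve_archived_capture(uuid: str) -> Path:
    capture_dir = redis.hget('lookup_dirs_archived', uuid)
    if capture_dir is None:
        raise KeyError(f'Unable to find UUID {uuid}.')
    to_return = Path(capture_dir)
    if to_return.exists():
        return to_return
    # The directory vanished: drop the UUID from the archive index *and*
    # the per-directory hash so no dangling reference survives.
    redis.hdel('lookup_dirs_archived', uuid)
    redis.delete(capture_dir)
    raise FileNotFoundError(f'UUID ({uuid}) linked to a missing directory ({capture_dir}).')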

lookyloo/lookyloo.py

@@ -27,7 +27,7 @@ from werkzeug.useragents import UserAgent
 from .capturecache import CaptureCache, CapturesIndex
 from .context import Context
 from .exceptions import (LookylooException, MissingCaptureDirectory,
-                         MissingUUID, TreeNeedsRebuild)
+                         MissingUUID, TreeNeedsRebuild, NoValidHarFile)
 from .helpers import (CaptureStatus, get_captures_dir, get_config,
                       get_email_template, get_homedir, get_resources_hashes,
                       get_socket_path, get_splash_url, get_taxonomies, uniq_domains)
@@ -85,10 +85,6 @@ class Lookyloo():
     def redis(self):
         return Redis(connection_pool=self.redis_pool)

-    def _get_capture_dir(self, capture_uuid: str, /) -> Path:
-        '''Use the cache to get a capture directory from a capture UUID'''
-        return self._captures_index[capture_uuid].capture_dir
-
     def add_context(self, capture_uuid: str, /, urlnode_uuid: str, *, ressource_hash: str,
                     legitimate: bool, malicious: bool, details: Dict[str, Dict[str, str]]):
         '''Adds context information to a capture or a URL node'''
@@ -142,8 +138,7 @@ class Lookyloo():
     def get_meta(self, capture_uuid: str, /) -> Dict[str, str]:
         '''Get the meta informations from a capture (mostly, details about the User Agent used.)'''
-        capture_dir = self._get_capture_dir(capture_uuid)
-        metafile = capture_dir / 'meta'
+        metafile = self._captures_index[capture_uuid].capture_dir / 'meta'
         if metafile.exists():
             with metafile.open('r') as f:
                 return json.load(f)
@@ -171,10 +166,10 @@ class Lookyloo():
     def categories_capture(self, capture_uuid: str, /) -> Dict[str, Any]:
         '''Get all the categories related to a capture, in MISP Taxonomies format'''
-        capture_dir = self._get_capture_dir(capture_uuid)
+        categ_file = self._captures_index[capture_uuid].capture_dir / 'categories'
         # get existing categories if possible
-        if (capture_dir / 'categories').exists():
-            with (capture_dir / 'categories').open() as f:
+        if categ_file.exists():
+            with categ_file.open() as f:
                 current_categories = [line.strip() for line in f.readlines()]
             return {e: self.taxonomies.revert_machinetag(e) for e in current_categories}
         return {}
@@ -186,30 +181,30 @@ class Lookyloo():
         # Make sure the category is mappable to a taxonomy.
         self.taxonomies.revert_machinetag(category)
-        capture_dir = self._get_capture_dir(capture_uuid)
+        categ_file = self._captures_index[capture_uuid].capture_dir / 'categories'
         # get existing categories if possible
-        if (capture_dir / 'categories').exists():
-            with (capture_dir / 'categories').open() as f:
+        if categ_file.exists():
+            with categ_file.open() as f:
                 current_categories = set(line.strip() for line in f.readlines())
         else:
             current_categories = set()
         current_categories.add(category)
-        with (capture_dir / 'categories').open('w') as f:
+        with categ_file.open('w') as f:
             f.writelines(f'{t}\n' for t in current_categories)

     def uncategorize_capture(self, capture_uuid: str, /, category: str) -> None:
         '''Remove a category (MISP Taxonomy tag) from a capture.'''
         if not get_config('generic', 'enable_categorization'):
             return
-        capture_dir = self._get_capture_dir(capture_uuid)
+        categ_file = self._captures_index[capture_uuid].capture_dir / 'categories'
         # get existing categories if possible
-        if (capture_dir / 'categories').exists():
-            with (capture_dir / 'categories').open() as f:
+        if categ_file.exists():
+            with categ_file.open() as f:
                 current_categories = set(line.strip() for line in f.readlines())
         else:
             current_categories = set()
         current_categories.remove(category)
-        with (capture_dir / 'categories').open('w') as f:
+        with categ_file.open('w') as f:
             f.writelines(f'{t}\n' for t in current_categories)

     def trigger_modules(self, capture_uuid: str, /, force: bool=False, auto_trigger: bool=False) -> Dict:
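Both methods, and categories_capture above, now share one shape: derive categ_file once from the cached capture directory, then read-modify-write a plain text file with one machinetag per line. A condensed sketch of that shared pattern, assuming a pathlib.Path to an existing capture directory (the helper name is hypothetical):

from pathlib import Path
from typing import Set

def toggle_category(capture_dir: Path, category: str, *, add: bool) -> None:
    categ_file = capture_dir / 'categories'
    # Load existing categories if the file exists, one machinetag per line.
    current: Set[str] = set()
    if categ_file.exists():
        with categ_file.open() as f:
            current = {line.strip() for line in f}
    if add:
        current.add(category)
    else:
        current.discard(category)  # discard() is a no-op for absent tags
    with categ_file.open('w') as f:
        f.writelines(f'{t}\n' for t in current)

Unlike the sketch, uncategorize_capture as written uses set.remove() and so raises KeyError when asked to drop a tag the capture never had.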
@@ -283,23 +278,19 @@ class Lookyloo():
         """Add the capture in the hidden pool (not shown on the front page)
         NOTE: it won't remove the correlations until they are rebuilt.
         """
-        self.redis.hset(str(self._get_capture_dir(capture_uuid)), 'no_index', 1)
-        (self._get_capture_dir(capture_uuid) / 'no_index').touch()
+        capture_dir = self._captures_index[capture_uuid].capture_dir
+        self.redis.hset(str(capture_dir), 'no_index', 1)
+        (capture_dir / 'no_index').touch()
         self._captures_index.reload_cache(capture_uuid)

     def update_tree_cache_info(self, process_id: int, classname: str) -> None:
         self.redis.hset('tree_cache', f'{process_id}|{classname}', str(self._captures_index.lru_cache_status()))

-    @property
-    def capture_uuids(self) -> List[str]:
-        '''All the capture UUIDs present in the cache.'''
-        return self.redis.hkeys('lookup_dirs')
-
     def sorted_capture_cache(self, capture_uuids: Optional[Iterable[str]]=None) -> List[CaptureCache]:
         '''Get all the captures in the cache, sorted by timestamp (new -> old).'''
         if capture_uuids is None:
-            # Sort all captures
-            capture_uuids = self.capture_uuids
+            # Sort all recent captures
+            capture_uuids = self.redis.hkeys('lookup_dirs')
         if not capture_uuids:
             # No captures at all on the instance
             return []
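With the capture_uuids property gone, sorted_capture_cache() is the single entry point for enumerating captures; by default it reads the UUIDs straight from the 'lookup_dirs' Redis hash. A minimal sketch of the resulting flow, assuming a dict-like captures index whose entries carry a datetime timestamp (the real method also skips entries that fail to load):

from dataclasses import dataclass
from datetime import datetime
from typing import Dict, Iterable, List, Optional

@dataclass
class CaptureCache:  # stand-in for lookyloo's CaptureCache
    uuid: str
    timestamp: datetime

def sorted_capture_cache(index: Dict[str, CaptureCache], redis,
                         capture_uuids: Optional[Iterable[str]] = None) -> List[CaptureCache]:
    if capture_uuids is None:
        # Default to every recent capture known to the Redis index.
        capture_uuids = redis.hkeys('lookup_dirs')
    if not capture_uuids:
        # No captures at all on the instance.
        return []
    all_cache = [index[uuid] for uuid in capture_uuids]
    all_cache.sort(key=lambda c: c.timestamp, reverse=True)  # new -> old
    return all_cache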
@@ -309,6 +300,7 @@ class Lookyloo():
         return all_cache

     def get_capture_status(self, capture_uuid: str, /) -> CaptureStatus:
+        '''Returns the status (queued, ongoing, done, or UUID unknown)'''
         if self.redis.zrank('to_capture', capture_uuid) is not None:
             return CaptureStatus.QUEUED
         elif self.redis.hexists('lookup_dirs', capture_uuid):
@@ -318,6 +310,7 @@ class Lookyloo():
         return CaptureStatus.UNKNOWN

     def try_error_status(self, capture_uuid: str, /) -> Optional[str]:
+        '''If it is not possible to do the capture, we store the error for a short amount of time'''
         return self.redis.get(f'error_{capture_uuid}')

     def capture_cache(self, capture_uuid: str, /) -> Optional[CaptureCache]:
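These two hunks only add docstrings, but they document the resolution order: a UUID is queued if it sits in the 'to_capture' sorted set, done if it appears in the 'lookup_dirs' hash, unknown otherwise. A sketch of that walk, assuming redis-py; the enum values are stand-ins, and the branch for ongoing captures falls between the two hunks and is not shown:

from enum import IntEnum

from redis import Redis

class CaptureStatus(IntEnum):  # stand-in; the real enum lives in lookyloo's helpers
    UNKNOWN = -1
    QUEUED = 0
    DONE = 1
    ONGOING = 2

def get_capture_status(redis: Redis, capture_uuid: str) -> CaptureStatus:
    if redis.zrank('to_capture', capture_uuid) is not None:
        return CaptureStatus.QUEUED
    if redis.hexists('lookup_dirs', capture_uuid):
        return CaptureStatus.DONE
    # The ongoing-capture check is elided in the hunk above.
    return CaptureStatus.UNKNOWN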
@@ -351,7 +344,7 @@ class Lookyloo():
     def enqueue_capture(self, query: MutableMapping[str, Any], source: str, user: str, authenticated: bool) -> str:
         '''Enqueue a query in the capture queue (used by the UI and the API for asynchronous processing)'''

-        def _get_priority(source: str, user: str, authenticated: bool) -> int:
+        def get_priority(source: str, user: str, authenticated: bool) -> int:
             src_prio: int = self._priority['sources'][source] if source in self._priority['sources'] else -1
             if not authenticated:
                 usr_prio = self._priority['users']['_default_anon']
@@ -364,7 +357,7 @@ class Lookyloo():
             usr_prio = self._priority['users'][user] if self._priority['users'].get(user) else self._priority['users']['_default_auth']
             return src_prio + usr_prio

-        priority = _get_priority(source, user, authenticated)
+        priority = get_priority(source, user, authenticated)
         perma_uuid = str(uuid4())
         p = self.redis.pipeline()
         for key, value in query.items():
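The rename just drops the leading underscore, which adds nothing for a helper already scoped inside the method. The priority itself is the sum of a per-source and a per-user score. A standalone sketch of that computation, assuming a _priority mapping shaped like the keys used above:

from typing import Any, Dict

def get_priority(priority_conf: Dict[str, Dict[str, Any]],
                 source: str, user: str, authenticated: bool) -> int:
    # Unknown sources score -1, so they sink below configured ones.
    src_prio: int = priority_conf['sources'].get(source, -1)
    if not authenticated:
        usr_prio = priority_conf['users']['_default_anon']
    else:
        # Fall back to the authenticated default when the user is not listed.
        usr_prio = priority_conf['users'].get(user) or priority_conf['users']['_default_auth']
    return src_prio + usr_prio

# Example: a configured source plus an anonymous user.
conf = {'sources': {'web': 10}, 'users': {'_default_anon': -10, '_default_auth': 0}}
assert get_priority(conf, 'web', 'nobody', authenticated=False) == 0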
@@ -427,7 +420,7 @@ class Lookyloo():
     def _get_raw(self, capture_uuid: str, /, extension: str='*', all_files: bool=True) -> BytesIO:
         '''Get file(s) from the capture directory'''
         try:
-            capture_dir = self._get_capture_dir(capture_uuid)
+            capture_dir = self._captures_index[capture_uuid].capture_dir
         except MissingUUID:
             return BytesIO(f'Capture {capture_uuid} not unavailable, try again later.'.encode())
         except MissingCaptureDirectory:
@@ -607,6 +600,10 @@ class Lookyloo():
         except IndexError:
             # unable to find the uuid, the cache is probably in a weird state.
             return None
+        except NoValidHarFile as e:
+            # something went poorly when rebuilding the tree (probably a recursive error)
+            self.logger.warning(e)
+            return None
         if url.empty_response:
             return None
         if not h or h == url.body_hash:
@@ -800,7 +797,7 @@ class Lookyloo():
     def get_hostnode_investigator(self, capture_uuid: str, /, node_uuid: str) -> Tuple[HostNode, List[Dict[str, Any]]]:
         '''Gather all the informations needed to display the Hostnode investigator popup.'''

-        def _normalize_known_content(h: str, /, known_content: Dict[str, Any], url: URLNode) -> Tuple[Optional[Union[str, List[Any]]], Optional[Tuple[bool, Any]]]:
+        def normalize_known_content(h: str, /, known_content: Dict[str, Any], url: URLNode) -> Tuple[Optional[Union[str, List[Any]]], Optional[Tuple[bool, Any]]]:
             ''' There are a few different sources to figure out known vs. legitimate content,
             this method normalize it for the web interface.'''
             known: Optional[Union[str, List[Any]]] = None
@@ -861,13 +858,13 @@ class Lookyloo():
                 if freq_embedded['hash_freq'] > 1:
                     to_append['embedded_ressources'][h]['other_captures'] = self.hash_lookup(h, url.name, capture_uuid)
             for h in to_append['embedded_ressources'].keys():
-                known, legitimate = _normalize_known_content(h, known_content, url)
+                known, legitimate = normalize_known_content(h, known_content, url)
                 if known:
                     to_append['embedded_ressources'][h]['known_content'] = known
                 elif legitimate:
                     to_append['embedded_ressources'][h]['legitimacy'] = legitimate

-            known, legitimate = _normalize_known_content(url.body_hash, known_content, url)
+            known, legitimate = normalize_known_content(url.body_hash, known_content, url)
             if known:
                 to_append['known_content'] = known
             elif legitimate:

poetry.lock (generated)

@@ -277,7 +277,7 @@ python-versions = "*"

 [[package]]
 name = "filetype"
-version = "1.0.7"
+version = "1.0.8"
 description = "Infer file type and MIME type of any file/buffer. No external dependencies."
 category = "main"
 optional = false
@@ -437,7 +437,7 @@ scripts = ["click (>=6.0)", "twisted (>=16.4.0)"]

 [[package]]
 name = "ipython"
-version = "7.27.0"
+version = "7.28.0"
 description = "IPython: Productive Interactive Computing"
 category = "dev"
 optional = false
@@ -976,7 +976,7 @@ use_chardet_on_py3 = ["chardet (>=3.0.2,<5)"]

 [[package]]
 name = "rich"
-version = "10.10.0"
+version = "10.11.0"
 description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal"
 category = "main"
 optional = false
@@ -1199,7 +1199,7 @@ python-versions = "*"

 [[package]]
 name = "types-requests"
-version = "2.25.8"
+version = "2.25.9"
 description = "Typing stubs for requests"
 category = "dev"
 optional = false
@@ -1223,7 +1223,7 @@ python-versions = "*"

 [[package]]
 name = "urllib3"
-version = "1.26.6"
+version = "1.26.7"
 description = "HTTP library with thread-safe connection pooling, file post, and more."
 category = "main"
 optional = false
@@ -1317,7 +1317,7 @@ misp = ["python-magic", "pydeep"]

 [metadata]
 lock-version = "1.1"
 python-versions = "^3.8"
-content-hash = "bd09b20f35c4a361a109abcf950326519706a5938c436e9ac6f2e07ef0e782c2"
+content-hash = "2b182690d3f0bb7438ded19e043d0c6aab7870e5460f4e4942a56c626d8fed66"

 [metadata.files]
 aiohttp = [
@@ -1533,8 +1533,8 @@ ete3 = [
     {file = "ete3-3.1.2.tar.gz", hash = "sha256:4fc987b8c529889d6608fab1101f1455cb5cbd42722788de6aea9c7d0a8e59e9"},
 ]
 filetype = [
-    {file = "filetype-1.0.7-py2.py3-none-any.whl", hash = "sha256:353369948bb1c09b8b3ea3d78390b5586e9399bff9aab894a1dff954e31a66f6"},
-    {file = "filetype-1.0.7.tar.gz", hash = "sha256:da393ece8d98b47edf2dd5a85a2c8733e44b769e32c71af4cd96ed8d38d96aa7"},
+    {file = "filetype-1.0.8-py2.py3-none-any.whl", hash = "sha256:eb974519c5dbbd678a9fbfb5e2616105c6768ee9c01ce4a4ecaefc141e50d5e5"},
+    {file = "filetype-1.0.8.tar.gz", hash = "sha256:77df14879b73fd9711b8bd4f465dadf2ecdafff0eac3b22c0bdb0ccba68db316"},
 ]
 flask = [
     {file = "Flask-2.0.1-py3-none-any.whl", hash = "sha256:a6209ca15eb63fc9385f38e452704113d679511d9574d09b2cf9183ae7d20dc9"},
@@ -1623,8 +1623,8 @@ incremental = [
     {file = "incremental-21.3.0.tar.gz", hash = "sha256:02f5de5aff48f6b9f665d99d48bfc7ec03b6e3943210de7cfc88856d755d6f57"},
 ]
 ipython = [
-    {file = "ipython-7.27.0-py3-none-any.whl", hash = "sha256:75b5e060a3417cf64f138e0bb78e58512742c57dc29db5a5058a2b1f0c10df02"},
-    {file = "ipython-7.27.0.tar.gz", hash = "sha256:58b55ebfdfa260dad10d509702dc2857cb25ad82609506b070cf2d7b7df5af13"},
+    {file = "ipython-7.28.0-py3-none-any.whl", hash = "sha256:f16148f9163e1e526f1008d7c8d966d9c15600ca20d1a754287cf96d00ba6f1d"},
+    {file = "ipython-7.28.0.tar.gz", hash = "sha256:2097be5c814d1b974aea57673176a924c4c8c9583890e7a5f082f547b9975b11"},
 ]
 itemadapter = [
     {file = "itemadapter-0.4.0-py3-none-any.whl", hash = "sha256:695809a4e2f42174f0392dd66c2ceb2b2454d3ebbf65a930e5c85910d8d88d8f"},
@@ -2061,8 +2061,8 @@ requests = [
     {file = "requests-2.26.0.tar.gz", hash = "sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"},
 ]
 rich = [
-    {file = "rich-10.10.0-py3-none-any.whl", hash = "sha256:0b8cbcb0b8d476a7f002feaed9f35e51615f673c6c291d76ddf0c555574fd3c7"},
-    {file = "rich-10.10.0.tar.gz", hash = "sha256:bacf58b25fea6b920446fe4e7abdc6c7664c4530c4098e7a1bc79b16b8551dfa"},
+    {file = "rich-10.11.0-py3-none-any.whl", hash = "sha256:44bb3f9553d00b3c8938abf89828df870322b9ba43caf3b12bb7758debdc6dec"},
+    {file = "rich-10.11.0.tar.gz", hash = "sha256:016fa105f34b69c434e7f908bb5bd7fefa9616efdb218a2917117683a6394ce5"},
 ]
 scrapy = [
     {file = "Scrapy-2.5.0-py2.py3-none-any.whl", hash = "sha256:5f590fdc84b496e5a4bb5ef99836b0aa688a07cfcb4bc3bb7290f66486f27424"},
@@ -2139,8 +2139,8 @@ types-redis = [
     {file = "types_redis-3.5.8-py3-none-any.whl", hash = "sha256:85814769071721044857c34841e46064b867ccdd58fc81221c43462bd07e4892"},
 ]
 types-requests = [
-    {file = "types-requests-2.25.8.tar.gz", hash = "sha256:225ac2e86549b6ef3a8a44bf955f80b4955855704a15d2883d8445c8df637242"},
-    {file = "types_requests-2.25.8-py3-none-any.whl", hash = "sha256:26e90866bcd773d76b316de7e6bd6e24641f9e1653cf27241c533886600f6824"},
+    {file = "types-requests-2.25.9.tar.gz", hash = "sha256:4ec8b71da73e5344adb9bee725a74ec8598e7286f9bcb17500d627f259fe4fb9"},
+    {file = "types_requests-2.25.9-py3-none-any.whl", hash = "sha256:543ba8b3b23e38ac028da1d163aecbbc27d3cc8f654ae64339da539a191a2b1c"},
 ]
 types-werkzeug = [
     {file = "types-Werkzeug-1.0.5.tar.gz", hash = "sha256:f6216ab0e0211fe73ebdb4ae0e414113d4d8a2f783a15c2d8550e06d0fd8e7f9"},
@@ -2152,8 +2152,8 @@ typing-extensions = [
     {file = "typing_extensions-3.10.0.2.tar.gz", hash = "sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e"},
 ]
 urllib3 = [
-    {file = "urllib3-1.26.6-py2.py3-none-any.whl", hash = "sha256:39fb8672126159acb139a7718dd10806104dec1e2f0f6c88aab05d17df10c8d4"},
-    {file = "urllib3-1.26.6.tar.gz", hash = "sha256:f57b4c16c62fa2760b7e3d97c35b255512fb6b59a259730f36ba32ce9f8e342f"},
+    {file = "urllib3-1.26.7-py2.py3-none-any.whl", hash = "sha256:c4fdf4019605b6e5423637e01bc9fe4daef873709a7973e195ceba0a62bbc844"},
+    {file = "urllib3-1.26.7.tar.gz", hash = "sha256:4987c65554f7a2dbf30c18fd48778ef124af6fab771a377103da0585e2336ece"},
 ]
 vt-py = [
     {file = "vt-py-0.7.4.tar.gz", hash = "sha256:27af411495a115f6cc2da5d184a9f32c12a2304eca94efefaae03d2b12b66174"},

pyproject.toml

@@ -61,7 +61,7 @@ lief = "^0.11.4"
 Flask-Login = "^0.5.0"
 flask-restx = "^0.5.1"
 hiredis = "^2.0.0"
-rich = "^10.10.0"
+rich = "^10.11.0"
 pyphishtanklookup = "^1.0.1"

 [tool.poetry.extras]
@@ -69,9 +69,9 @@ misp = ['python-magic', 'pydeep']

 [tool.poetry.dev-dependencies]
 mypy = "^0.910"
-ipython = "^7.27.0"
+ipython = "^7.28.0"
 types-redis = "^3.5.8"
-types-requests = "^2.25.8"
+types-requests = "^2.25.9"
 types-Flask = "^1.1.3"
 types-pkg-resources = "^0.1.2"


@@ -1,12 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-from lookyloo.lookyloo import Lookyloo
-
-lookyloo = Lookyloo()
-
-for capture_uuid in lookyloo.capture_uuids:
-    try:
-        ct = lookyloo.get_crawled_tree(capture_uuid)
-    except Exception:
-        continue


@@ -23,28 +23,9 @@ def main():
     indexing = Indexing()
     indexing.clear_indexes()
-    for capture_uuid in lookyloo.capture_uuids:
-        index = True
-        try:
-            tree = lookyloo.get_crawled_tree(capture_uuid)
-        except Exception as e:
-            print(capture_uuid, e)
-            continue
-
-        if lookyloo.is_public_instance:
-            cache = lookyloo.capture_cache(capture_uuid)
-            if not cache:
-                continue
-            if cache.no_index:
-                index = False
-
-        # NOTE: these methods do nothing if we just generated the pickle when calling lookyloo.get_crawled_tree
-        if index:
-            indexing.index_cookies_capture(tree)
-            indexing.index_body_hashes_capture(tree)
-            indexing.index_url_capture(tree)
-            categories = list(lookyloo.categories_capture(capture_uuid).keys())
-            indexing.index_categories_capture(capture_uuid, categories)
+    # This call will rebuild all the caches as needed.
+    lookyloo.sorted_capture_cache()

 if __name__ == '__main__':
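The removed NOTE explains why the whole loop could go: the indexing methods are already no-ops right after a pickle is generated, meaning index maintenance happens while the cache for a capture is built. A hypothetical minimal version of the rebuilt script under the new system (the import path for Indexing is an assumption):

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from lookyloo.lookyloo import Lookyloo
from lookyloo.indexing import Indexing  # Indexing's home module is an assumption

def main() -> None:
    lookyloo = Lookyloo()
    indexing = Indexing()
    indexing.clear_indexes()
    # Iterating the sorted cache rebuilds every capture's cache as needed.
    lookyloo.sorted_capture_cache()

if __name__ == '__main__':
    main()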


@@ -23,10 +23,7 @@ def uniq_domains(uniq_urls):
         return domains

-for uuid in lookyloo.capture_uuids:
-    cache = lookyloo.capture_cache(uuid)
-    if not cache or not hasattr(cache, 'timestamp'):
-        continue
+for cache in lookyloo.sorted_capture_cache():
     date = cache.timestamp
     if date.year not in stats:
         stats[date.year] = {}
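The stats script no longer needs the hasattr guard: sorted_capture_cache() only hands back valid cache entries, each carrying a timestamp. A sketch of where the truncated loop is heading, assuming per-month buckets; the fields actually aggregated per month are not shown in the hunk:

from datetime import datetime
from typing import Any, Dict, List

stats: Dict[int, Dict[int, List[Any]]] = {}

def bucket_by_month(caches) -> Dict[int, Dict[int, List[Any]]]:
    for cache in caches:
        date: datetime = cache.timestamp
        if date.year not in stats:
            stats[date.year] = {}
        # One list per month with the captures seen that month.
        stats[date.year].setdefault(date.month, []).append(cache)
    return stats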