chg: cleanup in the main lookyloo class

pull/251/head
Raphaël Vinot 2021-08-24 18:32:54 +02:00
parent 8433cbcc1b
commit 81390d5ea0
6 changed files with 203 additions and 163 deletions


@@ -25,7 +25,7 @@ class Archiver(AbstractManager):
self.redis = Redis(unix_socket_path=get_socket_path('cache'))
# make sure archived captures dir exists
- self.archived_captures_dir = get_homedir / 'archived_captures'
+ self.archived_captures_dir = get_homedir() / 'archived_captures'
self.archived_captures_dir.mkdir(parents=True, exist_ok=True)
self._load_archives()
@@ -78,8 +78,8 @@ class Archiver(AbstractManager):
if archived_uuids:
p = self.redis.pipeline()
- p.redis.hdel('lookup_dirs', *archived_uuids.keys())
- p.redis.hset('lookup_dirs_archived', mapping=archived_uuids)
+ p.hdel('lookup_dirs', *archived_uuids.keys())
+ p.hmset('lookup_dirs_archived', archived_uuids) # type: ignore
p.execute()
self.logger.info('Archiving done.')
@@ -91,8 +91,8 @@ class Archiver(AbstractManager):
if not (month / 'index').exists():
continue
with (month / 'index').open('r') as _f:
- archived_uuids = {uuid: str(month / dirname) for uuid, dirname in csv.reader(_f)}
- self.redis.hset('lookup_dirs_archived', mapping=archived_uuids)
+ archived_uuids: Dict[str, str] = {uuid: str(month / dirname) for uuid, dirname in csv.reader(_f)}
+ self.redis.hmset('lookup_dirs_archived', archived_uuids) # type: ignore
def main():
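A minimal sketch of the Redis pipeline pattern the corrected hunks above rely on: commands are queued on the pipeline object itself (not on the underlying client) and sent in a single round trip on execute(). The socket path, UUID and directory below are made-up placeholders, not taken from the Lookyloo configuration.

from redis import Redis

# Hypothetical connection and mapping, for illustration only.
redis = Redis(unix_socket_path='/tmp/cache.sock', decode_responses=True)
archived_uuids = {'some-uuid': '/path/to/archived_captures/2021/08/some_capture'}

p = redis.pipeline()
# Queue both commands on the pipeline object itself.
p.hdel('lookup_dirs', *archived_uuids.keys())
# hmset() is the older redis-py call; hset(name, mapping=...) is the newer equivalent.
p.hmset('lookup_dirs_archived', archived_uuids)
# Both commands are sent to Redis in one round trip.
p.execute()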

bin/background_processing.py (new executable file, 73 additions)

@@ -0,0 +1,73 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import logging
from collections import Counter
from datetime import timedelta, date
from typing import Dict, Any
from redis import Redis
from werkzeug.useragents import UserAgent
from lookyloo.abstractmanager import AbstractManager
from lookyloo.helpers import (get_config, get_homedir, get_socket_path,
safe_create_dir)
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
level=logging.INFO, datefmt='%I:%M:%S')
class Processing(AbstractManager):
def __init__(self, loglevel: int=logging.INFO):
super().__init__(loglevel)
self.script_name = 'archiver'
self.use_own_ua = get_config('generic', 'use_user_agents_users')
def _to_run_forever(self):
if self.use_own_ua:
self._build_ua_file()
def _build_ua_file(self):
'''Build a file in a format compatible with the capture page'''
yesterday = (date.today() - timedelta(days=1))
self_generated_ua_file_path = get_homedir() / 'own_user_agents' / str(yesterday.year) / f'{yesterday.month:02}'
safe_create_dir(self_generated_ua_file_path)
self_generated_ua_file = self_generated_ua_file_path / f'{yesterday.isoformat()}.json'
if self_generated_ua_file.exists():
return
redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
entries = redis.zrevrange(f'user_agents|{yesterday.isoformat()}', 0, -1)
if not entries:
return
to_store: Dict[str, Any] = {'by_frequency': []}
uas = Counter([entry.split('|', 1)[1] for entry in entries])
for ua, _ in uas.most_common():
parsed_ua = UserAgent(ua)
if not parsed_ua.platform or not parsed_ua.browser:
continue
if parsed_ua.platform not in to_store:
to_store[parsed_ua.platform] = {}
if f'{parsed_ua.browser} {parsed_ua.version}' not in to_store[parsed_ua.platform]:
to_store[parsed_ua.platform][f'{parsed_ua.browser} {parsed_ua.version}'] = []
to_store[parsed_ua.platform][f'{parsed_ua.browser} {parsed_ua.version}'].append(parsed_ua.string)
to_store['by_frequency'].append({'os': parsed_ua.platform,
'browser': f'{parsed_ua.browser} {parsed_ua.version}',
'useragent': parsed_ua.string})
with self_generated_ua_file.open('w') as f:
json.dump(to_store, f, indent=2)
# Remove the UA / IP mapping.
redis.delete(f'user_agents|{yesterday.isoformat()}')
def main():
p = Processing()
p.run(sleep_in_sec=3600 * 24)
if __name__ == '__main__':
main()
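For illustration, the JSON document _build_ua_file writes to own_user_agents/&lt;year&gt;/&lt;month&gt;/&lt;YYYY-MM-DD&gt;.json ends up shaped roughly as below; the platform, browser and user-agent strings are made-up examples, not real captured data.

# Hypothetical example of the generated file's structure (values are invented).
example = {
    'by_frequency': [
        {'os': 'windows',
         'browser': 'firefox 91.0',
         'useragent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0'},
    ],
    'windows': {
        'firefox 91.0': [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0',
        ],
    },
}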


@@ -103,7 +103,7 @@ Run the following command (assuming you run the code from the clonned repository
@lru_cache(64)
- def get_capture_dir() -> Path:
+ def get_captures_dir() -> Path:
capture_dir = get_homedir() / 'scraped'
safe_create_dir(capture_dir)
return capture_dir
@@ -365,13 +365,15 @@ def get_useragent_for_requests():
def reload_uuids_index() -> None:
- recent_uuids = {}
- for uuid_path in sorted(get_capture_dir().glob('*/uuid'), reverse=True):
+ recent_uuids: Dict[str, str] = {}
+ for uuid_path in sorted(get_captures_dir().glob('*/uuid'), reverse=True):
with uuid_path.open() as f:
uuid = f.read()
recent_uuids[uuid] = str(uuid_path.parent)
if not recent_uuids:
return None
r = Redis(unix_socket_path=get_socket_path('cache'))
p = r.pipeline()
p.delete('lookup_dirs')
- p.hset('lookup_dirs', mapping=recent_uuids)
+ p.hset('lookup_dirs', mapping=recent_uuids) # type: ignore
p.execute()


@@ -3,8 +3,8 @@
import os
import base64
- from collections import defaultdict, Counter
- from datetime import datetime, date, timedelta
+ from collections import defaultdict
+ from datetime import datetime, date
from email.message import EmailMessage
from io import BufferedIOBase, BytesIO
import ipaddress
@@ -106,37 +106,32 @@ class Lookyloo():
today = date.today().isoformat()
self.redis.zincrby(f'user_agents|{today}', 1, f'{remote_ip}|{user_agent}')
def build_ua_file(self) -> None:
'''Build a file in a format compatible with the capture page'''
yesterday = (date.today() - timedelta(days=1))
self_generated_ua_file_path = get_homedir() / 'own_user_agents' / str(yesterday.year) / f'{yesterday.month:02}'
safe_create_dir(self_generated_ua_file_path)
self_generated_ua_file = self_generated_ua_file_path / f'{yesterday.isoformat()}.json'
if self_generated_ua_file.exists():
return
entries = self.redis.zrevrange(f'user_agents|{yesterday.isoformat()}', 0, -1)
if not entries:
return
to_store: Dict[str, Any] = {'by_frequency': []}
uas = Counter([entry.split('|', 1)[1] for entry in entries])
for ua, _ in uas.most_common():
parsed_ua = UserAgent(ua)
if not parsed_ua.platform or not parsed_ua.browser:
continue
if parsed_ua.platform not in to_store:
to_store[parsed_ua.platform] = {}
if f'{parsed_ua.browser} {parsed_ua.version}' not in to_store[parsed_ua.platform]:
to_store[parsed_ua.platform][f'{parsed_ua.browser} {parsed_ua.version}'] = []
to_store[parsed_ua.platform][f'{parsed_ua.browser} {parsed_ua.version}'].append(parsed_ua.string)
to_store['by_frequency'].append({'os': parsed_ua.platform,
'browser': f'{parsed_ua.browser} {parsed_ua.version}',
'useragent': parsed_ua.string})
with self_generated_ua_file.open('w') as f:
json.dump(to_store, f, indent=2)
# Remove the UA / IP mapping.
self.redis.delete(f'user_agents|{yesterday.isoformat()}')
def _get_capture_dir(self, capture_uuid: str, /) -> Path:
'''Use the cache to get a capture directory from a capture UUID'''
capture_dir: Optional[Union[str, Path]]
if capture_uuid in self._captures_index:
capture_dir = self._captures_index[capture_uuid].capture_dir
if capture_dir.exists():
return capture_dir
self.redis.delete(capture_dir)
self._captures_index.pop(capture_uuid)
capture_dir = self.redis.hget('lookup_dirs', capture_uuid)
if capture_dir and not Path(capture_dir).exists():
# The capture was either removed or archived, cleaning up
self.redis.hdel('lookup_dirs', capture_uuid)
capture_dir = None
if not capture_dir:
# Try in the archive
capture_dir = self.redis.hget('lookup_dirs_archived', capture_uuid)
if not capture_dir:
raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
to_return = Path(capture_dir)
if not to_return.exists():
# The capture was removed, remove the UUID
self.redis.hdel('lookup_dirs_archived', capture_uuid)
self.logger.warning(f'UUID ({capture_uuid}) linked to a missing directory ({capture_dir}). Removed now.')
raise NoValidHarFile(f'UUID ({capture_uuid}) linked to a missing directory ({capture_dir}). Removed now.')
return to_return
def _cache_capture(self, capture_uuid: str, /) -> CrawledTree:
'''Generate the pickle, set the cache, add capture in the indexes'''
@@ -197,6 +192,85 @@ class Lookyloo():
lock_file.unlink(missing_ok=True)
return ct
def _set_capture_cache(self, capture_dir: Path) -> Dict[str, Any]:
'''Populate the redis cache for a capture. Mostly used on the index page.
NOTE: Doesn't require the pickle.'''
with (capture_dir / 'uuid').open() as f:
uuid = f.read().strip()
har_files = sorted(capture_dir.glob('*.har'))
error_cache: Dict[str, str] = {}
if (capture_dir / 'error.txt').exists():
# Something went wrong
with (capture_dir / 'error.txt').open() as _error:
content = _error.read()
try:
error_to_cache = json.loads(content)
if isinstance(error_to_cache, dict) and error_to_cache.get('details'):
error_to_cache = error_to_cache.get('details')
except json.decoder.JSONDecodeError:
# old format
error_to_cache = content
error_cache['error'] = f'The capture {capture_dir.name} has an error: {error_to_cache}'
fatal_error = False
if har_files:
try:
har = HarFile(har_files[0], uuid)
except Har2TreeError as e:
error_cache['error'] = str(e)
fatal_error = True
else:
error_cache['error'] = f'No har files in {capture_dir.name}'
fatal_error = True
if (capture_dir / 'categories').exists():
with (capture_dir / 'categories').open() as _categories:
categories = [c.strip() for c in _categories.readlines()]
else:
categories = []
p = self.redis.pipeline()
p.hset('lookup_dirs', uuid, str(capture_dir))
if error_cache:
if 'HTTP Error' not in error_cache['error']:
self.logger.warning(error_cache['error'])
p.hmset(str(capture_dir), error_cache)
if not fatal_error:
redirects = har.initial_redirects
incomplete_redirects = False
if redirects and har.need_tree_redirects:
# load tree from disk, get redirects
ct = load_pickle_tree(capture_dir)
if ct:
redirects = ct.redirects
else:
# Pickle not available
incomplete_redirects = True
cache: Dict[str, Union[str, int]] = {'uuid': uuid,
'title': har.initial_title,
'timestamp': har.initial_start_time,
'url': har.root_url,
'redirects': json.dumps(redirects),
'categories': json.dumps(categories),
'capture_dir': str(capture_dir),
'incomplete_redirects': 1 if incomplete_redirects else 0}
if (capture_dir / 'no_index').exists(): # If the folders claims anonymity
cache['no_index'] = 1
if (capture_dir / 'parent').exists(): # The capture was initiated from an other one
with (capture_dir / 'parent').open() as f:
cache['parent'] = f.read().strip()
p.hmset(str(capture_dir), cache)
p.execute()
# If the cache is re-created for some reason, pop from the local cache.
self._captures_index.pop(uuid, None)
return cache
def _build_cname_chain(self, known_cnames: Dict[str, Optional[str]], hostname) -> List[str]:
'''Returns a list of CNAMEs starting from one hostname.
The CNAMEs resolutions are made in `_resolve_dns`. A hostname can have a CNAME entry
@ -259,17 +333,6 @@ class Lookyloo():
json.dump(host_ips, f)
return ct
def get_crawled_tree(self, capture_uuid: str, /) -> CrawledTree:
'''Get the generated tree in ETE Toolkit format.
Loads the pickle if it exists, creates it otherwise.'''
capture_dir = self._get_capture_dir(capture_uuid)
ct = load_pickle_tree(capture_dir)
if not ct:
ct = self._cache_capture(capture_uuid)
if not ct:
raise NoValidHarFile(f'Unable to get tree from {capture_dir}')
return ct
def add_context(self, capture_uuid: str, /, urlnode_uuid: str, *, ressource_hash: str,
legitimate: bool, malicious: bool, details: Dict[str, Dict[str, str]]):
'''Adds context information to a capture or a URL node'''
@@ -449,85 +512,6 @@ class Lookyloo():
to_return[event_id].update(values)
return to_return
def _set_capture_cache(self, capture_dir: Path) -> Dict[str, Any]:
'''Populate the redis cache for a capture. Mostly used on the index page.
NOTE: Doesn't require the pickle.'''
with (capture_dir / 'uuid').open() as f:
uuid = f.read().strip()
har_files = sorted(capture_dir.glob('*.har'))
error_cache: Dict[str, str] = {}
if (capture_dir / 'error.txt').exists():
# Something went wrong
with (capture_dir / 'error.txt').open() as _error:
content = _error.read()
try:
error_to_cache = json.loads(content)
if isinstance(error_to_cache, dict) and error_to_cache.get('details'):
error_to_cache = error_to_cache.get('details')
except json.decoder.JSONDecodeError:
# old format
error_to_cache = content
error_cache['error'] = f'The capture {capture_dir.name} has an error: {error_to_cache}'
fatal_error = False
if har_files:
try:
har = HarFile(har_files[0], uuid)
except Har2TreeError as e:
error_cache['error'] = str(e)
fatal_error = True
else:
error_cache['error'] = f'No har files in {capture_dir.name}'
fatal_error = True
if (capture_dir / 'categories').exists():
with (capture_dir / 'categories').open() as _categories:
categories = [c.strip() for c in _categories.readlines()]
else:
categories = []
p = self.redis.pipeline()
p.hset('lookup_dirs', uuid, str(capture_dir))
if error_cache:
if 'HTTP Error' not in error_cache['error']:
self.logger.warning(error_cache['error'])
p.hmset(str(capture_dir), error_cache)
if not fatal_error:
redirects = har.initial_redirects
incomplete_redirects = False
if redirects and har.need_tree_redirects:
# load tree from disk, get redirects
ct = load_pickle_tree(capture_dir)
if ct:
redirects = ct.redirects
else:
# Pickle not available
incomplete_redirects = True
cache: Dict[str, Union[str, int]] = {'uuid': uuid,
'title': har.initial_title,
'timestamp': har.initial_start_time,
'url': har.root_url,
'redirects': json.dumps(redirects),
'categories': json.dumps(categories),
'capture_dir': str(capture_dir),
'incomplete_redirects': 1 if incomplete_redirects else 0}
if (capture_dir / 'no_index').exists(): # If the folders claims anonymity
cache['no_index'] = 1
if (capture_dir / 'parent').exists(): # The capture was initiated from an other one
with (capture_dir / 'parent').open() as f:
cache['parent'] = f.read().strip()
p.hmset(str(capture_dir), cache)
p.execute()
# If the cache is re-created for some reason, pop from the local cache.
self._captures_index.pop(uuid, None)
return cache
def hide_capture(self, capture_uuid: str, /) -> None:
"""Add the capture in the hidden pool (not shown on the front page)
NOTE: it won't remove the correlations until they are rebuilt.
@@ -599,32 +583,16 @@ class Lookyloo():
self.logger.warning(f'Cache ({capture_dir}) is invalid ({e}): {json.dumps(cached, indent=2)}')
return None
def _get_capture_dir(self, capture_uuid: str, /) -> Path:
'''Use the cache to get a capture directory from a capture UUID'''
capture_dir: Optional[Union[str, Path]]
if capture_uuid in self._captures_index:
capture_dir = self._captures_index[capture_uuid].capture_dir
if capture_dir.exists():
return capture_dir
self.redis.delete(capture_dir)
self._captures_index.pop(capture_uuid)
capture_dir = self.redis.hget('lookup_dirs', capture_uuid)
if capture_dir and not Path(capture_dir).exists():
# The capture was either removed or archived, cleaning up
self.redis.hdel('lookup_dirs', capture_uuid)
capture_dir = None
if not capture_dir:
# Try in the archive
capture_dir = self.redis.hget('lookup_dirs_archived', capture_uuid)
if not capture_dir:
raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
to_return = Path(capture_dir)
if not to_return.exists():
# The capture was removed, remove the UUID
self.redis.hdel('lookup_dirs_archived', capture_uuid)
self.logger.warning(f'UUID ({capture_uuid}) linked to a missing directory ({capture_dir}). Removed now.')
raise NoValidHarFile(f'UUID ({capture_uuid}) linked to a missing directory ({capture_dir}). Removed now.')
return to_return
def get_crawled_tree(self, capture_uuid: str, /) -> CrawledTree:
'''Get the generated tree in ETE Toolkit format.
Loads the pickle if it exists, creates it otherwise.'''
capture_dir = self._get_capture_dir(capture_uuid)
ct = load_pickle_tree(capture_dir)
if not ct:
ct = self._cache_capture(capture_uuid)
if not ct:
raise NoValidHarFile(f'Unable to get tree from {capture_dir}')
return ct
def get_capture_status(self, capture_uuid: str, /) -> CaptureStatus:
redis = self.redis # use a single connection


@@ -32,6 +32,7 @@ rebuild_caches = "bin.rebuild_caches:main"
update = "bin.update:main"
background_indexer = "bin.background_indexer:main"
archiver = "bin.archiver:main"
+ processing = "bin.background_processing:main"
[tool.poetry.dependencies]


@@ -21,7 +21,7 @@ from werkzeug.security import check_password_hash
from pymisp import MISPEvent, MISPServerError
- from lookyloo.helpers import (update_user_agents, get_user_agents, get_config,
+ from lookyloo.helpers import (get_user_agents, get_config,
get_taxonomies, load_cookies, CaptureStatus)
from lookyloo.lookyloo import Lookyloo, Indexing
from lookyloo.exceptions import NoValidHarFile, MissingUUID
@@ -679,10 +679,6 @@ def index():
if request.method == 'HEAD':
# Just returns ack if the webserver is running
return 'Ack'
- if use_own_ua:
- lookyloo.build_ua_file()
- else:
- update_user_agents()
show_error, category = get_index_params(request)
return index_generic(show_error=show_error)