chg: Use capture UUID as a reference everywhere

pull/79/head
Raphaël Vinot 2020-06-29 11:59:01 +02:00
parent fa935a6773
commit 05de56022f
11 changed files with 221 additions and 192 deletions

bin/rebuild_caches.py (new executable file, 16 additions)
View File

@@ -0,0 +1,16 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import logging
+
+from lookyloo.lookyloo import Lookyloo
+
+logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
+                    level=logging.INFO, datefmt='%I:%M:%S')
+if __name__ == '__main__':
+    lookyloo = Lookyloo()
+    remove_pickles = input('Do you want to remove the pickles? Rebuilding will take a very long time. (y/N)')
+    if remove_pickles == 'y':
+        lookyloo.rebuild_all()
+    else:
+        lookyloo.rebuild_cache()

View File

@@ -13,6 +13,8 @@ from glob import glob
 import json
 import traceback
 from urllib.parse import urlparse
+import pickle
+from har2tree import CrawledTree
 from bs4 import BeautifulSoup  # type: ignore
 try:
@@ -210,3 +212,17 @@ def load_cookies(cookie_pseudofile: Optional[BufferedIOBase]=None) -> List[Dict[
     except Exception as e:
         print(f'Unable to load the cookie file: {e}')
     return to_return
+
+
+def load_pickle_tree(capture_dir: Path) -> Optional[CrawledTree]:
+    pickle_file = capture_dir / 'tree.pickle'
+    if pickle_file.exists():
+        with pickle_file.open('rb') as _p:
+            return pickle.load(_p)
+    return None
+
+
+def remove_pickle_tree(capture_dir: Path) -> None:
+    pickle_file = capture_dir / 'tree.pickle'
+    if pickle_file.exists():
+        pickle_file.unlink()
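
For orientation: these two helpers centralize the tree-pickle handling that lookyloo.py previously did inline. A minimal sketch of the intended round trip (the directory path is illustrative):

    from pathlib import Path
    from lookyloo.helpers import load_pickle_tree, remove_pickle_tree

    capture_dir = Path('scraped/2020-06-29T11:59:01')  # hypothetical capture directory
    ct = load_pickle_tree(capture_dir)  # None when no tree.pickle is cached yet
    if ct is None:
        ...  # rebuild the tree (see Lookyloo.cache_tree below), then load again
    remove_pickle_tree(capture_dir)  # safe to call even if the pickle is already gone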

View File

@@ -19,6 +19,7 @@ from urllib.parse import urlsplit
 from uuid import uuid4
 from zipfile import ZipFile

+import publicsuffix2  # type: ignore
 from defang import refang  # type: ignore
 from har2tree import CrawledTree, Har2TreeError, HarFile, HostNode, URLNode
 from redis import Redis
@@ -27,7 +28,7 @@ from scrapysplashwrapper import crawl
 from werkzeug.useragents import UserAgent

 from .exceptions import NoValidHarFile, MissingUUID
-from .helpers import get_homedir, get_socket_path, load_cookies, load_configs, safe_create_dir, get_email_template
+from .helpers import get_homedir, get_socket_path, load_cookies, load_configs, safe_create_dir, get_email_template, load_pickle_tree, remove_pickle_tree
 from .modules import VirusTotal, SaneJavaScript, PhishingInitiative
@@ -97,17 +98,60 @@ class Lookyloo():
         with self_generated_ua_file.open('w') as f:
             json.dump(to_store, f, indent=2)

+    def cache_tree(self, capture_uuid) -> None:
+        capture_dir = self.lookup_capture_dir(capture_uuid)
+        if not capture_dir:
+            raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
+        with open((capture_dir / 'uuid'), 'r') as f:
+            uuid = f.read()
+        har_files = sorted(capture_dir.glob('*.har'))
+        try:
+            ct = CrawledTree(har_files, uuid)
+        except Har2TreeError as e:
+            raise NoValidHarFile(e.message)
+        with (capture_dir / 'tree.pickle').open('wb') as _p:
+            pickle.dump(ct, _p)
+
+    def get_crawled_tree(self, capture_uuid: str) -> CrawledTree:
+        capture_dir = self.lookup_capture_dir(capture_uuid)
+        if not capture_dir:
+            raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
+        ct = load_pickle_tree(capture_dir)
+        if not ct:
+            self.cache_tree(capture_uuid)
+            ct = load_pickle_tree(capture_dir)
+        if not ct:
+            raise NoValidHarFile(f'Unable to get tree from {capture_dir}')
+        return ct
+
+    def load_tree(self, capture_uuid: str) -> Tuple[str, str, str, str, Dict[str, str]]:
+        capture_dir = self.lookup_capture_dir(capture_uuid)
+        if not capture_dir:
+            raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
+        meta = {}
+        if (capture_dir / 'meta').exists():
+            with open((capture_dir / 'meta'), 'r') as f:
+                meta = json.load(f)
+        ct = self.get_crawled_tree(capture_uuid)
+        return ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url, meta
+
+    def remove_pickle(self, capture_uuid: str) -> None:
+        capture_dir = self.lookup_capture_dir(capture_uuid)
+        if not capture_dir:
+            raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
+        remove_pickle_tree(capture_dir)
+
     def rebuild_cache(self) -> None:
         self.redis.flushdb()
         self._init_existing_dumps()

-    def remove_pickle(self, capture_dir: Path) -> None:
-        if (capture_dir / 'tree.pickle').exists():
-            (capture_dir / 'tree.pickle').unlink()
-
     def rebuild_all(self) -> None:
         for capture_dir in self.capture_dirs:
-            self.remove_pickle(capture_dir)
+            remove_pickle_tree(capture_dir)
         self.rebuild_cache()

     def get_config(self, entry: str) -> Any:
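
Each of the four new methods above repeats the same resolution step. As a sketch only (this helper is hypothetical, not part of the commit), the shared guard amounts to:

    def _resolve_capture_dir(self, capture_uuid: str) -> Path:
        # UUID -> capture directory, or fail loudly; this is the pattern
        # repeated in cache_tree / get_crawled_tree / load_tree / remove_pickle.
        capture_dir = self.lookup_capture_dir(capture_uuid)
        if not capture_dir:
            raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
        return capture_dir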
@@ -124,29 +168,39 @@ class Lookyloo():
             sample_config = json.load(_c)
         return sample_config[entry]

-    def get_urlnode_from_tree(self, capture_dir: Path, node_uuid: str) -> URLNode:
-        ct = self._load_pickle(capture_dir / 'tree.pickle')
+    def get_urlnode_from_tree(self, capture_uuid: str, node_uuid: str) -> URLNode:
+        capture_dir = self.lookup_capture_dir(capture_uuid)
+        if not capture_dir:
+            raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
+        ct = load_pickle_tree(capture_dir)
         if not ct:
             raise MissingUUID(f'Unable to find UUID {node_uuid} in {capture_dir}')
         return ct.root_hartree.get_url_node_by_uuid(node_uuid)

-    def get_hostnode_from_tree(self, capture_dir: Path, node_uuid: str) -> HostNode:
-        ct = self._load_pickle(capture_dir / 'tree.pickle')
+    def get_hostnode_from_tree(self, capture_uuid: str, node_uuid: str) -> HostNode:
+        capture_dir = self.lookup_capture_dir(capture_uuid)
+        if not capture_dir:
+            raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
+        ct = load_pickle_tree(capture_dir)
         if not ct:
             raise MissingUUID(f'Unable to find UUID {node_uuid} in {capture_dir}')
         return ct.root_hartree.get_host_node_by_uuid(node_uuid)

-    def get_statistics(self, capture_dir: Path) -> Dict[str, Any]:
-        # We need the pickle
-        ct = self._load_pickle(capture_dir / 'tree.pickle')
+    def get_statistics(self, capture_uuid: str) -> Dict[str, Any]:
+        capture_dir = self.lookup_capture_dir(capture_uuid)
+        if not capture_dir:
+            raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
+        ct = load_pickle_tree(capture_dir)
         if not ct:
             self.logger.warning(f'Unable to trigger the modules unless the tree ({capture_dir}) is cached.')
             return {}
         return ct.root_hartree.stats

-    def trigger_modules(self, capture_dir: Path, force: bool=False) -> None:
-        # We need the pickle
-        ct = self._load_pickle(capture_dir / 'tree.pickle')
+    def trigger_modules(self, capture_uuid: str, force: bool=False) -> None:
+        capture_dir = self.lookup_capture_dir(capture_uuid)
+        if not capture_dir:
+            raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
+        ct = load_pickle_tree(capture_dir)
         if not ct:
             self.logger.warning(f'Unable to trigger the modules unless the tree ({capture_dir}) is cached.')
             return
@@ -165,8 +219,11 @@ class Lookyloo():
         else:
             self.vt.url_lookup(ct.root_hartree.har.root_url, force)

-    def get_modules_responses(self, capture_dir: Path) -> Optional[Dict[str, Any]]:
-        ct = self._load_pickle(capture_dir / 'tree.pickle')
+    def get_modules_responses(self, capture_uuid: str) -> Optional[Dict[str, Any]]:
+        capture_dir = self.lookup_capture_dir(capture_uuid)
+        if not capture_dir:
+            raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
+        ct = load_pickle_tree(capture_dir)
         if not ct:
             self.logger.warning(f'Unable to get the modules responses unless the tree ({capture_dir}) is cached.')
             return None
@@ -219,7 +276,7 @@ class Lookyloo():
         incomplete_redirects = False
         if redirects and har.need_tree_redirects:
             # load tree from disk, get redirects
-            ct = self._load_pickle(capture_dir / 'tree.pickle')
+            ct = load_pickle_tree(capture_dir)
             if ct:
                 redirects = ct.redirects
             else:
@@ -231,6 +288,7 @@ class Lookyloo():
                  'timestamp': har.initial_start_time,
                  'url': har.root_url,
                  'redirects': json.dumps(redirects),
+                 'capture_dir': str(capture_dir),
                  'incomplete_redirects': 1 if incomplete_redirects else 0}
         if (capture_dir / 'no_index').exists():  # If the folders claims anonymity
             cache['no_index'] = 1
@@ -238,19 +296,27 @@ class Lookyloo():
         self.redis.hmset(str(capture_dir), cache)
         self.redis.hset('lookup_dirs', uuid, str(capture_dir))

-    def capture_cache(self, capture_dir: Path) -> Optional[Dict[str, Any]]:
+    @property
+    def capture_uuids(self):
+        return self.redis.hkeys('lookup_dirs')
+
+    def capture_cache(self, capture_uuid: str) -> Dict[str, Any]:
+        capture_dir = self.lookup_capture_dir(capture_uuid)
+        if not capture_dir:
+            raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
         if self.redis.hget(str(capture_dir), 'incomplete_redirects') == '1':
             # try to rebuild the cache
             self._set_capture_cache(capture_dir, force=True)
         cached = self.redis.hgetall(str(capture_dir))
-        if all(key in cached.keys() for key in ['uuid', 'title', 'timestamp', 'url', 'redirects']):
+        if all(key in cached.keys() for key in ['uuid', 'title', 'timestamp', 'url', 'redirects', 'capture_dir']):
             cached['redirects'] = json.loads(cached['redirects'])
+            cached['capture_dir'] = Path(cached['capture_dir'])
             return cached
         elif 'error' in cached:
             return cached
         else:
             self.logger.warning(f'Cache ({capture_dir}) is invalid: {json.dumps(cached, indent=2)}')
-            return None
+            return {}

     def _init_existing_dumps(self) -> None:
         for capture_dir in self.capture_dirs:
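
The Redis layout these methods assume, sketched with a bare client (socket path and UUID are illustrative):

    from redis import Redis

    r = Redis(unix_socket_path='cache.sock', decode_responses=True)
    # 'lookup_dirs' is a hash mapping capture UUID -> capture directory ...
    capture_dir = r.hget('lookup_dirs', '<capture-uuid>')
    # ... and each capture directory key holds the cache entry, which now
    # stores 'capture_dir' itself so callers can resolve everything by UUID.
    cache = r.hgetall(capture_dir)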
@@ -270,8 +336,8 @@ class Lookyloo():
             f.write(str(uuid4()))
         return sorted(self.scrape_dir.iterdir(), reverse=True)

-    def lookup_capture_dir(self, uuid: str) -> Union[Path, None]:
-        capture_dir = self.redis.hget('lookup_dirs', uuid)
+    def lookup_capture_dir(self, capture_uuid: str) -> Union[Path, None]:
+        capture_dir = self.redis.hget('lookup_dirs', capture_uuid)
         if capture_dir:
             return Path(capture_dir)
         return None
@@ -300,28 +366,20 @@ class Lookyloo():
             return True
         return False

-    def _load_pickle(self, pickle_file: Path) -> Optional[CrawledTree]:
-        if pickle_file.exists():
-            with pickle_file.open('rb') as _p:
-                return pickle.load(_p)
-        return None
-
     def send_mail(self, capture_uuid: str, email: str='', comment: str='') -> None:
         if not self.get_config('enable_mail_notification'):
             return

         redirects = ''
         initial_url = ''
-        capture_dir = self.lookup_capture_dir(capture_uuid)
-        if capture_dir:
-            cache = self.capture_cache(capture_dir)
-            if cache:
-                initial_url = cache['url']
-                if 'redirects' in cache and cache['redirects']:
-                    redirects = "Redirects:\n"
-                    redirects += '\n'.join(cache['redirects'])
-                else:
-                    redirects = "No redirects."
+        cache = self.capture_cache(capture_uuid)
+        if cache:
+            initial_url = cache['url']
+            if 'redirects' in cache and cache['redirects']:
+                redirects = "Redirects:\n"
+                redirects += '\n'.join(cache['redirects'])
+            else:
+                redirects = "No redirects."

         email_config = self.get_config('email')
         msg = EmailMessage()
@@ -371,31 +429,10 @@ class Lookyloo():
         with metafile.open('w') as f:
             json.dump(to_dump, f)

-    def get_crawled_tree(self, capture_dir: Path) -> CrawledTree:
-        pickle_file = capture_dir / 'tree.pickle'
-        ct = self._load_pickle(pickle_file)
-        if not ct:
-            with open((capture_dir / 'uuid'), 'r') as f:
-                uuid = f.read()
-            har_files = sorted(capture_dir.glob('*.har'))
-            try:
-                ct = CrawledTree(har_files, uuid)
-                self._ensure_meta(capture_dir, ct)
-            except Har2TreeError as e:
-                raise NoValidHarFile(e.message)
-            with pickle_file.open('wb') as _p:
-                pickle.dump(ct, _p)
-        return ct
-
-    def load_tree(self, capture_dir: Path) -> Tuple[str, str, str, str, Dict[str, str]]:
-        meta = {}
-        if (capture_dir / 'meta').exists():
-            with open((capture_dir / 'meta'), 'r') as f:
-                meta = json.load(f)
-        ct = self.get_crawled_tree(capture_dir)
-        return ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url, meta
-
-    def _get_raw(self, capture_dir: Path, extension: str='*', all_files: bool=True) -> BytesIO:
+    def _get_raw(self, capture_uuid: str, extension: str='*', all_files: bool=True) -> BytesIO:
+        capture_dir = self.lookup_capture_dir(capture_uuid)
+        if not capture_dir:
+            raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
         all_paths = sorted(list(capture_dir.glob(f'*.{extension}')))
         if not all_files:
             # Only get the first one in the list
@@ -410,17 +447,17 @@ class Lookyloo():
         to_return.seek(0)
         return to_return

-    def get_html(self, capture_dir: Path, all_html: bool=False) -> BytesIO:
-        return self._get_raw(capture_dir, 'html', all_html)
+    def get_html(self, capture_uuid: str, all_html: bool=False) -> BytesIO:
+        return self._get_raw(capture_uuid, 'html', all_html)

-    def get_cookies(self, capture_dir: Path, all_cookies: bool=False) -> BytesIO:
-        return self._get_raw(capture_dir, 'cookies.json', all_cookies)
+    def get_cookies(self, capture_uuid: str, all_cookies: bool=False) -> BytesIO:
+        return self._get_raw(capture_uuid, 'cookies.json', all_cookies)

-    def get_screenshot(self, capture_dir: Path, all_images: bool=False) -> BytesIO:
-        return self._get_raw(capture_dir, 'png', all_images)
+    def get_screenshot(self, capture_uuid: str, all_images: bool=False) -> BytesIO:
+        return self._get_raw(capture_uuid, 'png', all_images)

-    def get_capture(self, capture_dir: Path) -> BytesIO:
-        return self._get_raw(capture_dir)
+    def get_capture(self, capture_uuid: str) -> BytesIO:
+        return self._get_raw(capture_uuid)

     def scrape(self, url: str, cookies_pseudofile: Optional[BufferedIOBase]=None,
                depth: int=1, listing: bool=True, user_agent: Optional[str]=None,
@@ -505,8 +542,12 @@ class Lookyloo():
         self._set_capture_cache(dirpath)
         return perma_uuid

-    def get_hostnode_investigator(self, capture_dir: Path, node_uuid: str) -> Tuple[HostNode, List[Dict[str, Any]]]:
-        ct = self._load_pickle(capture_dir / 'tree.pickle')
+    def get_hostnode_investigator(self, capture_uuid: str, node_uuid: str) -> Tuple[HostNode, List[Dict[str, Any]]]:
+        capture_dir = self.lookup_capture_dir(capture_uuid)
+        if not capture_dir:
+            raise MissingUUID(f'Unable to find {capture_uuid}')
+        ct = load_pickle_tree(capture_dir)
         if not ct:
             raise MissingUUID(f'Unable to find {capture_dir}')
         hostnode = ct.root_hartree.get_host_node_by_uuid(node_uuid)
@@ -536,16 +577,17 @@ class Lookyloo():
             else:
                 to_append['url_path_short'] = to_append['url_path']

-            # Optional: SaneJS information
-            if hasattr(url, 'body_hash') and url.body_hash in sanejs_lookups:
-                if sanejs_lookups[url.body_hash]:
-                    if isinstance(sanejs_lookups[url.body_hash], list):
-                        libname, version, path = sanejs_lookups[url.body_hash][0].split("|")
-                        other_files = len(sanejs_lookups[url.body_hash])
-                        to_append['sane_js'] = (libname, version, path, other_files)
-                    else:
-                        # Predefined generic file
-                        to_append['sane_js'] = sanejs_lookups[url.body_hash]
+            if not url.empty_response:
+                # Optional: SaneJS information
+                if url.body_hash in sanejs_lookups:
+                    if sanejs_lookups[url.body_hash]:
+                        if isinstance(sanejs_lookups[url.body_hash], list):
+                            libname, version, path = sanejs_lookups[url.body_hash][0].split("|")
+                            other_files = len(sanejs_lookups[url.body_hash])
+                            to_append['sane_js'] = (libname, version, path, other_files)
+                        else:
+                            # Predefined generic file
+                            to_append['sane_js'] = sanejs_lookups[url.body_hash]

             # Optional: Cookies sent to server in request -> map to nodes who set the cookie in response
             if hasattr(url, 'cookies_sent'):

View File

@@ -52,10 +52,16 @@ class SaneJavaScript():
         "71db01662075fac031dea18b2c766826c77dbab01400a8642cdc7059394841d5df9020076554c3beca6f808187d42e1a1acc98fad9a0e1ad32ae869145f53746": "This is a 1*1 pixel GIF",
         "49b8daf1f5ba868bc8c6b224c787a75025ca36513ef8633d1d8f34e48ee0b578f466fcc104a7bed553404ddc5f9faff3fef5f894b31cd57f32245e550fad656a": "This is a 1*1 pixel GIF",
         "c57ebbadcf59f982ba28da35fdbd5e5369a8500a2e1edad0dc9c9174de6fd99f437953732e545b95d3de5943c61077b6b949c989f49553ff2e483f68fcc30641": "This is a 1*1 pixel GIF",
+        "c87bf81fd70cf6434ca3a6c05ad6e9bd3f1d96f77dddad8d45ee043b126b2cb07a5cf23b4137b9d8462cd8a9adf2b463ab6de2b38c93db72d2d511ca60e3b57e": "This is a 1*1 pixel GIF",
+        "fd8b021f0236e487bfee13bf8f0ae98760abc492f7ca3023e292631979e135cb4ccb0c89b6234971b060ad72c0ca4474cbb5092c6c7a3255d81a54a36277b486": "This is a 1*1 pixel GIF",
+        "235479f42cbbe0a4b0100167fece0d14c9b47d272b3ba8322bcfe8539f055bf31d500e7b2995cc968ebf73034e039f59c5f0f9410428663034bf119d74b5672c": "This is a 1*1 pixel GIF",
+        "a85e09c3b5dbb560f4e03ba880047dbc8b4999a64c1f54fbfbca17ee0bcbed3bc6708d699190b56668e464a59358d6b534c3963a1329ba01db21075ef5bedace": "This is a 1*1 pixel GIF",
+        "27656d6106a6da0c84174ba7a6307e6f1c4b3f2cc085c8466b6a25d54331035dabc7081aac208d960d8d37c5577547628c0d1c4b77bb4cf254c71859673feec1": "This is a 1*1 pixel GIF",
         # "": "This is a 1*1 pixel GIF",
         "f1c33e72643ce366fd578e3b5d393799e8c9ea27b180987826af43b4fc00b65a4eaae5e6426a23448956fee99e3108c6a86f32fb4896c156e24af0571a11c498": "This is a 1*1 pixel PNG",
         "dc7c40381b3d22919e32c1b700ccb77b1b0aea2690642d01c1ac802561e135c01d5a4d2a0ea18efc0ec3362e8c549814a10a23563f1f56bd62aee0ced7e2bd99": "This is a 1*1 pixel PNG",
         "c2c239cb5cdd0b670780ad6414ef6be9ccd4c21ce46bb93d1fa3120ac812f1679445162978c3df05cb2e1582a1844cc4c41cf74960b8fdae3123999c5d2176cc": "This is a 1*1 pixel PNG",
+        "6ad523f5b65487369d305613366b9f68dcdeee225291766e3b25faf45439ca069f614030c08ca54c714fdbf7a944fac489b1515a8bf9e0d3191e1bcbbfe6a9df": "This is a 1*1 pixel PNG",
         # "": "This is a 1*1 pixel PNG",
         "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e": "This is an empty file"
     }
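
The keys in this mapping are SHA-512 hexdigests of response bodies (the last entry is the well-known digest of empty input). Computing one for comparison is a one-liner:

    import hashlib

    body_hash = hashlib.sha512(body).hexdigest()  # 'body': raw response bytes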

View File

@@ -26,6 +26,7 @@ run_backend = "bin/run_backend.py"
 async_scrape = "bin/async_scrape.py"
 shutdown = "bin/shutdown.py"
 stop = "bin/stop.py"
+rebuild_caches = "bin/rebuild_caches.py"

 [tool.poetry.dependencies]

View File

@@ -13,7 +13,7 @@ setup(
     description='Web interface to track the trackers.',
     packages=['lookyloo'],
     scripts=['bin/start_website.py', 'bin/start.py', 'bin/run_backend.py', 'bin/async_scrape.py',
-             'bin/shutdown.py', 'bin/stop.py'],
+             'bin/shutdown.py', 'bin/stop.py', 'bin/rebuild_caches.py'],
     include_package_data=True,
     classifiers=[
         'License :: OSI Approved :: BSD License',

View File

@@ -11,7 +11,7 @@ FileSaver="v2.0.2"
 wget -q https://raw.githubusercontent.com/eligrey/FileSaver.js/${FileSaver}/src/FileSaver.js -O web/static/FileSaver.js

-datatables="1.10.20"
+datatables="1.10.21"
 wget -q https://cdn.datatables.net/v/bs4/dt-${datatables}/datatables.min.css -O web/static/datatables.min.css
 wget -q https://cdn.datatables.net/v/bs4/dt-${datatables}/datatables.min.js -O web/static/datatables.min.js

View File

@@ -14,7 +14,7 @@ from flask_httpauth import HTTPDigestAuth  # type: ignore

 from lookyloo.helpers import get_homedir, update_user_agents, get_user_agents
 from lookyloo.lookyloo import Lookyloo
-from lookyloo.exceptions import NoValidHarFile
+from lookyloo.exceptions import NoValidHarFile, MissingUUID
 from .proxied import ReverseProxied

 from typing import Optional, Dict, Any
@@ -96,11 +96,11 @@ def rebuild_cache():
 @app.route('/tree/<string:tree_uuid>/rebuild')
 @auth.login_required
 def rebuild_tree(tree_uuid: str):
-    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
-    if capture_dir:
-        lookyloo.remove_pickle(capture_dir)
+    try:
+        lookyloo.remove_pickle(tree_uuid)
         return redirect(url_for('tree', tree_uuid=tree_uuid))
-    return redirect(url_for('index'))
+    except Exception:
+        return redirect(url_for('index'))
@app.route('/submit', methods=['POST', 'GET']) @app.route('/submit', methods=['POST', 'GET'])
@@ -140,10 +140,7 @@ def scrape_web():
 @app.route('/tree/<string:tree_uuid>/hostname/<string:node_uuid>/text', methods=['GET'])
 def hostnode_details_text(tree_uuid: str, node_uuid: str):
-    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
-    if not capture_dir:
-        return
-    hostnode = lookyloo.get_hostnode_from_tree(capture_dir, node_uuid)
+    hostnode = lookyloo.get_hostnode_from_tree(tree_uuid, node_uuid)
     urls = []
     for url in hostnode.urls:
         urls.append(url.name)
@@ -159,10 +156,6 @@ def hostnode_details_text(tree_uuid: str, node_uuid: str):
 @app.route('/tree/<string:tree_uuid>/hostname_popup/<string:node_uuid>', methods=['GET'])
 def hostnode_popup(tree_uuid: str, node_uuid: str):
-    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
-    if not capture_dir:
-        return
-
     keys_response = {
         'js': "/static/javascript.png",
         'exe': "/static/exe.png",
@@ -182,7 +175,7 @@ def hostnode_popup(tree_uuid: str, node_uuid: str):
         'request_cookie': "/static/cookie_read.png",
     }

-    hostnode, urls = lookyloo.get_hostnode_investigator(capture_dir, node_uuid)
+    hostnode, urls = lookyloo.get_hostnode_investigator(tree_uuid, node_uuid)

     return render_template('hostname_popup.html',
                            tree_uuid=tree_uuid,
@@ -195,10 +188,7 @@ def hostnode_popup(tree_uuid: str, node_uuid: str):
 @app.route('/tree/<string:tree_uuid>/url/<string:node_uuid>/request_cookies', methods=['GET'])
 def urlnode_request_cookies(tree_uuid: str, node_uuid: str):
-    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
-    if not capture_dir:
-        return
-    urlnode = lookyloo.get_urlnode_from_tree(capture_dir, node_uuid)
+    urlnode = lookyloo.get_urlnode_from_tree(tree_uuid, node_uuid)
     if not urlnode.request_cookie:
         return
@@ -208,10 +198,7 @@ def urlnode_request_cookies(tree_uuid: str, node_uuid: str):
 @app.route('/tree/<string:tree_uuid>/url/<string:node_uuid>/response_cookies', methods=['GET'])
 def urlnode_response_cookies(tree_uuid: str, node_uuid: str):
-    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
-    if not capture_dir:
-        return
-    urlnode = lookyloo.get_urlnode_from_tree(capture_dir, node_uuid)
+    urlnode = lookyloo.get_urlnode_from_tree(tree_uuid, node_uuid)
     if not urlnode.response_cookie:
         return
@@ -221,10 +208,7 @@ def urlnode_response_cookies(tree_uuid: str, node_uuid: str):
 @app.route('/tree/<string:tree_uuid>/url/<string:node_uuid>/posted_data', methods=['GET'])
 def urlnode_post_request(tree_uuid: str, node_uuid: str):
-    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
-    if not capture_dir:
-        return
-    urlnode = lookyloo.get_urlnode_from_tree(capture_dir, node_uuid)
+    urlnode = lookyloo.get_urlnode_from_tree(tree_uuid, node_uuid)
     if not urlnode.posted_data:
         return
     if isinstance(urlnode.posted_data, (dict, list)):
@@ -244,10 +228,7 @@ def urlnode_post_request(tree_uuid: str, node_uuid: str):
 @app.route('/tree/<string:tree_uuid>/url/<string:node_uuid>', methods=['GET'])
 def urlnode_details(tree_uuid: str, node_uuid: str):
-    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
-    if not capture_dir:
-        return
-    urlnode = lookyloo.get_urlnode_from_tree(capture_dir, node_uuid)
+    urlnode = lookyloo.get_urlnode_from_tree(tree_uuid, node_uuid)
     to_return = BytesIO()
     got_content = False
     if hasattr(urlnode, 'body'):
@@ -267,28 +248,19 @@ def urlnode_details(tree_uuid: str, node_uuid: str):
 @app.route('/tree/<string:tree_uuid>/trigger_modules/', defaults={'force': False})
 @app.route('/tree/<string:tree_uuid>/trigger_modules/<int:force>', methods=['GET'])
 def trigger_modules(tree_uuid: str, force: int):
-    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
-    if not capture_dir:
-        return Response('Not available.', mimetype='text/text')
-    lookyloo.trigger_modules(capture_dir, True if force else False)
+    lookyloo.trigger_modules(tree_uuid, True if force else False)
     return redirect(url_for('modules', tree_uuid=tree_uuid))


 @app.route('/tree/<string:tree_uuid>/stats', methods=['GET'])
 def stats(tree_uuid: str):
-    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
-    if not capture_dir:
-        return Response('Not available.', mimetype='text/text')
-    stats = lookyloo.get_statistics(capture_dir)
+    stats = lookyloo.get_statistics(tree_uuid)
     return render_template('statistics.html', uuid=tree_uuid, stats=stats)


 @app.route('/tree/<string:tree_uuid>/modules', methods=['GET'])
 def modules(tree_uuid: str):
-    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
-    if not capture_dir:
-        return Response('Not available.', mimetype='text/text')
-    modules_responses = lookyloo.get_modules_responses(capture_dir)
+    modules_responses = lookyloo.get_modules_responses(tree_uuid)
     if not modules_responses:
         return redirect(url_for('tree', tree_uuid=tree_uuid))
@@ -319,50 +291,35 @@ def modules(tree_uuid: str):
 @app.route('/tree/<string:tree_uuid>/image', methods=['GET'])
 def image(tree_uuid: str):
-    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
-    if not capture_dir:
-        return Response('Not available.', mimetype='text/text')
-    to_return = lookyloo.get_screenshot(capture_dir)
+    to_return = lookyloo.get_screenshot(tree_uuid)
     return send_file(to_return, mimetype='image/png',
                      as_attachment=True, attachment_filename='image.png')


 @app.route('/tree/<string:tree_uuid>/html', methods=['GET'])
 def html(tree_uuid: str):
-    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
-    if not capture_dir:
-        return Response('Not available.', mimetype='text/text')
-    to_return = lookyloo.get_html(capture_dir)
+    to_return = lookyloo.get_html(tree_uuid)
     return send_file(to_return, mimetype='text/html',
                      as_attachment=True, attachment_filename='page.html')


 @app.route('/tree/<string:tree_uuid>/cookies', methods=['GET'])
 def cookies(tree_uuid: str):
-    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
-    if not capture_dir:
-        return Response('Not available.', mimetype='text/text')
-    to_return = lookyloo.get_cookies(capture_dir)
+    to_return = lookyloo.get_cookies(tree_uuid)
     return send_file(to_return, mimetype='application/json',
                      as_attachment=True, attachment_filename='cookies.json')


 @app.route('/tree/<string:tree_uuid>/export', methods=['GET'])
 def export(tree_uuid: str):
-    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
-    if not capture_dir:
-        return Response('Not available.', mimetype='text/text')
-    to_return = lookyloo.get_capture(capture_dir)
+    to_return = lookyloo.get_capture(tree_uuid)
     return send_file(to_return, mimetype='application/zip',
                      as_attachment=True, attachment_filename='capture.zip')


 @app.route('/redirects/<string:tree_uuid>', methods=['GET'])
 def redirects(tree_uuid: str):
-    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
-    if not capture_dir:
-        return Response('Not available.', mimetype='text/text')
-    cache = lookyloo.capture_cache(capture_dir)
+    cache = lookyloo.capture_cache(tree_uuid)
     if not cache:
         return Response('Not available.', mimetype='text/text')
     if not cache['redirects']:
@@ -374,9 +331,7 @@ def redirects(tree_uuid: str):
 @app.route('/cache_tree/<string:tree_uuid>', methods=['GET'])
 def cache_tree(tree_uuid: str):
-    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
-    if capture_dir:
-        lookyloo.load_tree(capture_dir)
+    lookyloo.cache_tree(tree_uuid)
     return redirect(url_for('index'))
@@ -389,16 +344,17 @@ def send_mail(tree_uuid: str):
 @app.route('/tree/<string:tree_uuid>', methods=['GET'])
-def tree(tree_uuid: str):
+@app.route('/tree/<string:tree_uuid>/<string:urlnode_uuid>', methods=['GET'])
+def tree(tree_uuid: str, urlnode_uuid: Optional[str]=None):
     if tree_uuid == 'False':
         flash("Unable to process your request. The domain may not exist, or splash isn't started", 'error')
         return redirect(url_for('index'))
-    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
-    if not capture_dir:
+    try:
+        cache = lookyloo.capture_cache(tree_uuid)
+    except MissingUUID:
         flash(f'Unable to find this UUID ({tree_uuid}). The capture may still be ongoing, try again later.', 'error')
         return redirect(url_for('index'))
-    cache = lookyloo.capture_cache(capture_dir)
     if not cache:
         flash('Invalid cache.', 'error')
         return redirect(url_for('index'))
@@ -412,10 +368,12 @@ def tree(tree_uuid: str):
             enable_mail_notification = True
         else:
             enable_mail_notification = False
-        tree_json, start_time, user_agent, root_url, meta = lookyloo.load_tree(capture_dir)
+        tree_json, start_time, user_agent, root_url, meta = lookyloo.load_tree(tree_uuid)
         return render_template('tree.html', tree_json=tree_json, start_time=start_time,
                                user_agent=user_agent, root_url=root_url, tree_uuid=tree_uuid,
-                               meta=meta, enable_mail_notification=enable_mail_notification)
+                               meta=meta, enable_mail_notification=enable_mail_notification,
+                               urlnode_uuid=urlnode_uuid)
     except NoValidHarFile as e:
         return render_template('error.html', error_message=e)
@@ -427,8 +385,8 @@ def index_generic(show_hidden: bool=False):
         cut_time = datetime.now() - timedelta(**time_delta_on_index)
     else:
         cut_time = None  # type: ignore
-    for capture_dir in lookyloo.capture_dirs:
-        cached = lookyloo.capture_cache(capture_dir)
+    for capture_uuid in lookyloo.capture_uuids:
+        cached = lookyloo.capture_cache(capture_uuid)
         if not cached or 'error' in cached:
             continue
         if show_hidden:
@@ -459,15 +417,12 @@ def index():
 def index_hidden():
     return index_generic(show_hidden=True)


 # Query API

 @app.route('/json/<string:tree_uuid>/redirects', methods=['GET'])
 def json_redirects(tree_uuid: str):
-    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
-    if not capture_dir:
-        return {'error': 'Unknown UUID, try again later.'}
-    cache = lookyloo.capture_cache(capture_dir)
+    cache = lookyloo.capture_cache(tree_uuid)
     if not cache:
         return {'error': 'UUID missing in cache, try again later.'}
@@ -477,8 +432,8 @@ def json_redirects(tree_uuid: str):
         return to_return
     if cache['incomplete_redirects']:
         # Trigger tree build, get all redirects
-        lookyloo.load_tree(capture_dir)
-        cache = lookyloo.capture_cache(capture_dir)
+        lookyloo.load_tree(tree_uuid)
+        cache = lookyloo.capture_cache(tree_uuid)
         if cache:
             to_return['response']['redirects'] = cache['redirects']
     else:
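
Since the query API now resolves purely by UUID, a client call keeps the same shape as before; for example (host and UUID illustrative):

    import requests

    r = requests.get('https://lookyloo.example/json/<capture-uuid>/redirects')
    print(r.json())  # e.g. {'response': {'redirects': [...]}}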

View File

@@ -77,30 +77,15 @@ function urlnode_click(d) {
     });
 };

-d3.selection.prototype.moveToFront = function() {
-    return this.each(function() {
-        this.parentNode.appendChild(this);
-    });
-};
-
-d3.selection.prototype.moveToBack = function() {
-    return this.each(function() {
-        var firstChild = this.parentNode.firstChild;
-        if (firstChild) {
-            this.parentNode.insertBefore(this, firstChild);
-        }
-    });
-};
-
 function hostnode_click_popup(d) {
     window.open('/tree/' + treeUUID + '/hostname_popup/' + d.data.uuid, '_blank', 'width=1024,height=768,left=200,top=100');
 };

-function ProcessChildMessage(message) {
-    var element = document.getElementById("node_" + message);
+function ProcessChildMessage(urlnode_uuid) {
+    var element = document.getElementById("node_" + urlnode_uuid);
     element.scrollIntoView({behavior: "smooth", block: "center", inline: "nearest"});

-    var to_blink = d3.select("#node_" + message).select('text');
+    var to_blink = d3.select("#node_" + urlnode_uuid).select('text');
     to_blink
       .transition().duration(500)  //Set transition
       .style('fill', 'red')

View File

@@ -105,15 +105,15 @@
 {% if url['sane_js'] %}
 <div>
   {% if url['sane_js'] is string %}
-    {{ url['sane_js'] }}
+    <b>{{ url['sane_js'] }} </b>
   {% else %}
     This file is known as part of <b>{{ url['sane_js'][0] }}</b>
     version <b>{{ url['sane_js'][1] }}</b>: <b>{{ url['sane_js'][2] }}</b>.
     {% if url['sane_js'][3] > 1%}
       It is also present in <b>{{ url['sane_js'][3] -1 }}</b> other libraries.
+    {%endif%}
   {%endif%}
-  {%endif%}
 </div>
 {% endif %}

View File

@@ -35,6 +35,14 @@
     });
 });
 </script>

+{% if urlnode_uuid %}
+<script>
+  history.scrollRestoration = "manual";
+  window.addEventListener('DOMContentLoaded', (event) => {
+    ProcessChildMessage('{{urlnode_uuid}}');
+  });
+</script>
+{% endif%}

 {% endblock %}

 {% block content %}