chg: add stats, avoid building big trees twice, bump deps

pull/209/head
Raphaël Vinot 2021-05-26 18:25:06 -07:00
parent 315a2733c3
commit 1117ab6371
8 changed files with 67 additions and 12 deletions


@@ -2,6 +2,7 @@
 # -*- coding: utf-8 -*-
 import logging
+from datetime import datetime, timedelta

 from lookyloo.abstractmanager import AbstractManager
 from lookyloo.lookyloo import Lookyloo
@@ -29,6 +30,19 @@ class BackgroundIndexer(AbstractManager):
         for uuid_path in self.lookyloo.capture_dir.glob('*/uuid'):
             if (uuid_path.parent / 'tree.pickle').exists():
                 continue
+            lock_file = uuid_path.parent / 'lock'
+            if lock_file.exists():
+                try:
+                    with lock_file.open('r') as f:
+                        lock_ts = datetime.fromisoformat(f.read())
+                    if lock_ts < datetime.now() - timedelta(minutes=5):
+                        # Clear old locks. They shouldn't be there, but it's gonna happen.
+                        self.logger.info(f'Old lock found {lock_file}, removing it.')
+                        lock_file.unlink(missing_ok=True)
+                except Exception as e:
+                    self.logger.info(f'Error while reading lock {lock_file}: {e}')
+                continue
             with uuid_path.open() as f:
                 uuid = f.read()
             try:
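The stale-lock expiry above relies on the lock holder (see the lookyloo.py hunk below) writing `datetime.now().isoformat()` into the file: `datetime.fromisoformat` turns that string back into a comparable `datetime`. A minimal round-trip illustration, not part of the commit:

from datetime import datetime, timedelta

stamp = datetime.now().isoformat()       # what the lock holder writes
lock_ts = datetime.fromisoformat(stamp)  # what the indexer reads back
# A lock is considered stale once its timestamp is more than 5 minutes old.
print(lock_ts < datetime.now() - timedelta(minutes=5))  # False for a fresh lock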


@@ -340,3 +340,11 @@ def uniq_domains(uniq_urls):
         splitted = urlparse(url)
         domains.add(splitted.hostname)
     return domains
+
+
+def try_make_file(filename: Path):
+    try:
+        filename.touch(exist_ok=False)
+        return True
+    except FileExistsError:
+        return False
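`touch(exist_ok=False)` is what makes this helper usable as a cross-process lock: it maps to an atomic create-exclusive open, so when several processes race, exactly one gets `True` and everyone else gets `False`. A quick standalone demonstration (the path is arbitrary):

from pathlib import Path

lock = Path('/tmp/demo.lock')
print(try_make_file(lock))  # True: we created the file, we hold the lock
print(try_make_file(lock))  # False: it already exists, someone else holds it
lock.unlink()               # release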


@@ -20,6 +20,7 @@ from urllib.parse import urlsplit, urljoin
 from uuid import uuid4
 from zipfile import ZipFile
 import operator
+import time

 from defang import refang  # type: ignore
 import dns.resolver
@@ -38,7 +39,7 @@ from .exceptions import NoValidHarFile, MissingUUID, LookylooException
 from .helpers import (get_homedir, get_socket_path, load_cookies, get_config,
                       safe_create_dir, get_email_template, load_pickle_tree,
                       remove_pickle_tree, get_resources_hashes, get_taxonomies, uniq_domains,
-                      CaptureStatus)
+                      CaptureStatus, try_make_file)
 from .modules import VirusTotal, SaneJavaScript, PhishingInitiative, MISP, UniversalWhois
 from .capturecache import CaptureCache
 from .context import Context
@@ -149,6 +150,25 @@ class Lookyloo():
         '''Generate the pickle, set the cache, add capture in the indexes'''
         capture_dir = self._get_capture_dir(capture_uuid)
         har_files = sorted(capture_dir.glob('*.har'))
+        lock_file = capture_dir / 'lock'
+        pickle_file = capture_dir / 'tree.pickle'
+
+        if try_make_file(lock_file):
+            # Lock created, we can process
+            with lock_file.open('w') as f:
+                f.write(datetime.now().isoformat())
+        else:
+            # The pickle is being created somewhere else, wait until it's done.
+            while lock_file.exists():
+                time.sleep(5)
+            keep_going = 5
+            while (ct := load_pickle_tree(capture_dir)) is None:
+                keep_going -= 1
+                if not keep_going:
+                    raise LookylooException(f'Unable to get tree for {capture_uuid}')
+                time.sleep(5)
+            return ct
+
         # NOTE: We only index the public captures
         index = True
         try:
@ -175,7 +195,7 @@ class Lookyloo():
except RecursionError as e: except RecursionError as e:
raise NoValidHarFile(f'Tree too deep, probably a recursive refresh: {e}.\n Append /export to the URL to get the files.') raise NoValidHarFile(f'Tree too deep, probably a recursive refresh: {e}.\n Append /export to the URL to get the files.')
with (capture_dir / 'tree.pickle').open('wb') as _p: with pickle_file.open('wb') as _p:
# Some pickles require a pretty high recursion limit, this kindof fixes it. # Some pickles require a pretty high recursion limit, this kindof fixes it.
# If the capture is really broken (generally a refresh to self), the capture # If the capture is really broken (generally a refresh to self), the capture
# is discarded in the RecursionError above. # is discarded in the RecursionError above.
@ -183,6 +203,7 @@ class Lookyloo():
sys.setrecursionlimit(int(default_recursion_limit * 1.1)) sys.setrecursionlimit(int(default_recursion_limit * 1.1))
pickle.dump(ct, _p) pickle.dump(ct, _p)
sys.setrecursionlimit(default_recursion_limit) sys.setrecursionlimit(default_recursion_limit)
lock_file.unlink(missing_ok=True)
return ct return ct
def _build_cname_chain(self, known_cnames: Dict[str, Optional[str]], hostname) -> List[str]: def _build_cname_chain(self, known_cnames: Dict[str, Optional[str]], hostname) -> List[str]:
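Both sides of the race are now covered: the winner of `try_make_file` records a timestamp in `lock` (so the background indexer can expire it if the builder dies), builds and pickles the tree, then removes the lock; losers sleep until the lock disappears, then poll `load_pickle_tree` up to five times before giving up. A condensed sketch of the same acquire-or-wait protocol, with hypothetical `build` and `load` callables standing in for the real tree construction and `load_pickle_tree` (unlike the commit, this variant also releases the lock if the build fails):

import time
from datetime import datetime
from pathlib import Path

def build_once(capture_dir: Path, build, load, retries: int = 5):
    lock_file = capture_dir / 'lock'
    if try_make_file(lock_file):
        # We won the race: record when we started, build, then release.
        lock_file.write_text(datetime.now().isoformat())
        try:
            return build()
        finally:
            lock_file.unlink(missing_ok=True)
    while lock_file.exists():  # someone else is building
        time.sleep(5)
    for _ in range(retries):   # lock is gone, the pickle should appear
        if (tree := load()) is not None:
            return tree
        time.sleep(5)
    raise RuntimeError(f'Unable to load the tree from {capture_dir}')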


@@ -166,6 +166,8 @@ class UniversalWhois():
             self.query_whois_hostnode(n)

     def whois(self, query: str) -> str:
+        if not self.available:
+            return ''
         bytes_whois = b''
         with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
             sock.connect((self.server, self.port))
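The early return keeps `whois()` from blocking when no whois backend is available. The rest of the method (partially visible above) speaks the plain whois TCP protocol: send the query, then read until the server closes the connection. A self-contained sketch of that exchange; the `server`/`port` parameters are assumptions, not values from this diff:

import socket

def raw_whois(query: str, server: str, port: int) -> str:
    bytes_whois = b''
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.connect((server, port))
        sock.sendall(f'{query}\n'.encode())
        while True:
            data = sock.recv(2048)
            if not data:  # server closed the connection: response complete
                break
            bytes_whois += data
    return bytes_whois.decode(errors='replace')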

poetry.lock (generated)

@@ -307,7 +307,7 @@ hyperframe = ">=5.2.0,<6"

 [[package]]
 name = "har2tree"
-version = "1.6.0"
+version = "1.6.1"
 description = "HTTP Archive (HAR) to ETE Toolkit generator"
 category = "main"
 optional = false
@@ -1059,16 +1059,16 @@ python-versions = "*"

 [[package]]
 name = "urllib3"
-version = "1.26.4"
+version = "1.26.5"
 description = "HTTP library with thread-safe connection pooling, file post, and more."
 category = "main"
 optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4"

 [package.extras]
+brotli = ["brotlipy (>=0.6.0)"]
 secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"]
 socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"]
-brotli = ["brotlipy (>=0.6.0)"]

 [[package]]
 name = "vt-py"
@@ -1366,8 +1366,8 @@ h2 = [
     {file = "h2-3.2.0.tar.gz", hash = "sha256:875f41ebd6f2c44781259005b157faed1a5031df3ae5aa7bcb4628a6c0782f14"},
 ]
 har2tree = [
-    {file = "har2tree-1.6.0-py3-none-any.whl", hash = "sha256:8d4469ddea36da12ec7b25fc098740cfd66e2e565b8a013415261784eaa82cf4"},
-    {file = "har2tree-1.6.0.tar.gz", hash = "sha256:572b85b5470fd544152c2b1bb41cf1257f4256e2a7a1936bcd6fb06a8f7784f2"},
+    {file = "har2tree-1.6.1-py3-none-any.whl", hash = "sha256:2db656b47986a682f46c3bdcbe928ff4048c1b8ccbdc557306de368518584f59"},
+    {file = "har2tree-1.6.1.tar.gz", hash = "sha256:f34627f80aa155e28c920d17d3390d15fe71874652e0d54b52025dede5f50030"},
 ]
 hpack = [
     {file = "hpack-3.0.0-py2.py3-none-any.whl", hash = "sha256:0edd79eda27a53ba5be2dfabf3b15780928a0dff6eb0c60a3d6767720e970c89"},
@@ -1878,8 +1878,8 @@ typing-extensions = [
     {file = "typing_extensions-3.10.0.0.tar.gz", hash = "sha256:50b6f157849174217d0656f99dc82fe932884fb250826c18350e159ec6cdf342"},
 ]
 urllib3 = [
-    {file = "urllib3-1.26.4-py2.py3-none-any.whl", hash = "sha256:2f4da4594db7e1e110a944bb1b551fdf4e6c136ad42e4234131391e21eb5b0df"},
-    {file = "urllib3-1.26.4.tar.gz", hash = "sha256:e7b021f7241115872f92f43c6508082facffbd1c048e3c6e2bb9c2a157e28937"},
+    {file = "urllib3-1.26.5-py2.py3-none-any.whl", hash = "sha256:753a0374df26658f99d826cfe40394a686d05985786d946fbe4165b5148f5a7c"},
+    {file = "urllib3-1.26.5.tar.gz", hash = "sha256:a7acd0977125325f516bda9735fa7142b909a8d01e8b2e4c8108d0984e6e0098"},
 ]
 vt-py = [
     {file = "vt-py-0.6.3.tar.gz", hash = "sha256:172916d07b54927271e62dd3ead03142189d7431e9ec0fdbb75fe09f68efa888"},


@@ -47,7 +47,7 @@ vt-py = "^0.6.2"
 pyeupi = "^1.1"
 scrapysplashwrapper = "^1.6.0"
 pysanejs = "^1.4"
-har2tree = "^1.6.0"
+har2tree = "^1.6.1"
 pylookyloo = "^1.6"
 dnspython = "^2.1.0"
 pytaxonomies = "^1.3"


@@ -110,7 +110,7 @@
 {% for url in urls %}
 {# URL Display #}
 <li class="list-group-item">
-  <div class="h3" title={{ url['url_object'].name }}>
+  <div class="h3" title="{{ url['url_object'].name }}">
   {# HTTPs or not #}
   {% if url['encrypted'] %}
     <img src="/static/secure.svg" title="Encrypted request" width="21" height="21"/>
@@ -162,7 +162,11 @@
 {# Details of the response #}
 <p class="h4">Response
   <small>(Status code:
-  <span title="{{ http_status_description(url['url_object'].response['status']) }}">{{ url['url_object'].response['status'] }})</span>
+  <span title="{{ http_status_description(url['url_object'].response['status']) }}">
+    {{ url['url_object'].response['status'] }})
+  </span>
+  -
+  <span>Load time: {{ url['url_object'].time.total_seconds() }}s</span>
   </small>
 </p>
 {{ popup_icons(keys_response, url['url_object'], tree_uuid) }}
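The quoting fix in the first hunk is not cosmetic: an unquoted HTML attribute value ends at the first space, so any URL containing one would spill into the markup as bogus attributes. A Python view of the difference (illustrative only):

url = 'http://example.com/path with spaces'
bad = f'<div title={url}>'     # title stops at ".../path"; "with" parses as a stray attribute
good = f'<div title="{url}">'  # the whole URL stays inside the attribute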


@@ -28,5 +28,11 @@
   <dt class="col-sm-2">Total Nodes</dt>
   <dd class="col-sm-10">{{ stats['total_hostnames'] }}</dd>
+
+  <dt class="col-sm-2">Total load time</dt>
+  <dd class="col-sm-10">{{ stats['total_load_time'] }}</dd>
+
+  <dt class="col-sm-2">Total size responses</dt>
+  <dd class="col-sm-10">{{ sizeof_fmt(stats['total_size_responses']) }}</dd>

 </dl>
 </div>
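The template assumes a `sizeof_fmt` helper is exposed to Jinja to render the byte count in human-readable form. The project defines such a helper on the web side; a typical implementation of the idea, shown as a sketch rather than the project's exact code:

def sizeof_fmt(num: float, suffix: str = 'B') -> str:
    # Scale through binary prefixes until the value drops below 1024.
    for unit in ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi'):
        if abs(num) < 1024.0:
            return f'{num:3.1f}{unit}{suffix}'
        num /= 1024.0
    return f'{num:.1f}Yi{suffix}'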