chg: add stats, avoid building big trees twice, bump deps

pull/209/head
Raphaël Vinot 2021-05-26 18:25:06 -07:00
parent 315a2733c3
commit 1117ab6371
8 changed files with 67 additions and 12 deletions

View File

@ -2,6 +2,7 @@
# -*- coding: utf-8 -*-
import logging
from datetime import datetime, timedelta
from lookyloo.abstractmanager import AbstractManager
from lookyloo.lookyloo import Lookyloo
@ -29,6 +30,19 @@ class BackgroundIndexer(AbstractManager):
for uuid_path in self.lookyloo.capture_dir.glob('*/uuid'):
if (uuid_path.parent / 'tree.pickle').exists():
continue
lock_file = uuid_path.parent / 'lock'
if lock_file.exists():
try:
with lock_file.open('r') as f:
lock_ts = datetime.fromisoformat(f.read())
if lock_ts < datetime.now() - timedelta(minutes=5):
# Clear old locks. They shouldn't be there, but it's gonna happen.
self.logger.info(f'Old lock found {lock_file}, removing it.')
lock_file.unlink(missing_ok=True)
except Exception as e:
self.logger.info(f'Error while reading lock {lock_file}: {e}')
continue
with uuid_path.open() as f:
uuid = f.read()
try:

View File

@ -340,3 +340,11 @@ def uniq_domains(uniq_urls):
splitted = urlparse(url)
domains.add(splitted.hostname)
return domains
def try_make_file(filename: Path):
    """Atomically create *filename* and report whether we won the race.

    Relies on ``Path.touch(exist_ok=False)`` raising ``FileExistsError``
    when the file is already present, which makes this usable as a
    cheap cross-process lock primitive.

    :param filename: path of the lock file to create.
    :return: True when this call created the file, False when it
             already existed (someone else holds the lock).
    """
    try:
        filename.touch(exist_ok=False)
    except FileExistsError:
        # Lost the race: another process created the file first.
        return False
    return True

View File

@ -20,6 +20,7 @@ from urllib.parse import urlsplit, urljoin
from uuid import uuid4
from zipfile import ZipFile
import operator
import time
from defang import refang # type: ignore
import dns.resolver
@ -38,7 +39,7 @@ from .exceptions import NoValidHarFile, MissingUUID, LookylooException
from .helpers import (get_homedir, get_socket_path, load_cookies, get_config,
safe_create_dir, get_email_template, load_pickle_tree,
remove_pickle_tree, get_resources_hashes, get_taxonomies, uniq_domains,
CaptureStatus)
CaptureStatus, try_make_file)
from .modules import VirusTotal, SaneJavaScript, PhishingInitiative, MISP, UniversalWhois
from .capturecache import CaptureCache
from .context import Context
@ -149,6 +150,25 @@ class Lookyloo():
'''Generate the pickle, set the cache, add capture in the indexes'''
capture_dir = self._get_capture_dir(capture_uuid)
har_files = sorted(capture_dir.glob('*.har'))
lock_file = capture_dir / 'lock'
pickle_file = capture_dir / 'tree.pickle'
if try_make_file(lock_file):
# Lock created, we can process
with lock_file.open('w') as f:
f.write(datetime.now().isoformat())
else:
# The pickle is being created somewhere else, wait until it's done.
while lock_file.exists():
time.sleep(5)
keep_going = 5
while (ct := load_pickle_tree(capture_dir)) is None:
keep_going -= 1
if not keep_going:
raise LookylooException(f'Unable to get tree for {capture_uuid}')
time.sleep(5)
return ct
# NOTE: We only index the public captures
index = True
try:
@ -175,7 +195,7 @@ class Lookyloo():
except RecursionError as e:
raise NoValidHarFile(f'Tree too deep, probably a recursive refresh: {e}.\n Append /export to the URL to get the files.')
with (capture_dir / 'tree.pickle').open('wb') as _p:
with pickle_file.open('wb') as _p:
# Some pickles require a pretty high recursion limit, this kindof fixes it.
# If the capture is really broken (generally a refresh to self), the capture
# is discarded in the RecursionError above.
@ -183,6 +203,7 @@ class Lookyloo():
sys.setrecursionlimit(int(default_recursion_limit * 1.1))
pickle.dump(ct, _p)
sys.setrecursionlimit(default_recursion_limit)
lock_file.unlink(missing_ok=True)
return ct
def _build_cname_chain(self, known_cnames: Dict[str, Optional[str]], hostname) -> List[str]:

View File

@ -166,6 +166,8 @@ class UniversalWhois():
self.query_whois_hostnode(n)
def whois(self, query: str) -> str:
if not self.available:
return ''
bytes_whois = b''
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
sock.connect((self.server, self.port))

14
poetry.lock generated
View File

@ -307,7 +307,7 @@ hyperframe = ">=5.2.0,<6"
[[package]]
name = "har2tree"
version = "1.6.0"
version = "1.6.1"
description = "HTTP Archive (HAR) to ETE Toolkit generator"
category = "main"
optional = false
@ -1059,16 +1059,16 @@ python-versions = "*"
[[package]]
name = "urllib3"
version = "1.26.4"
version = "1.26.5"
description = "HTTP library with thread-safe connection pooling, file post, and more."
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4"
[package.extras]
brotli = ["brotlipy (>=0.6.0)"]
secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"]
socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"]
brotli = ["brotlipy (>=0.6.0)"]
[[package]]
name = "vt-py"
@ -1366,8 +1366,8 @@ h2 = [
{file = "h2-3.2.0.tar.gz", hash = "sha256:875f41ebd6f2c44781259005b157faed1a5031df3ae5aa7bcb4628a6c0782f14"},
]
har2tree = [
{file = "har2tree-1.6.0-py3-none-any.whl", hash = "sha256:8d4469ddea36da12ec7b25fc098740cfd66e2e565b8a013415261784eaa82cf4"},
{file = "har2tree-1.6.0.tar.gz", hash = "sha256:572b85b5470fd544152c2b1bb41cf1257f4256e2a7a1936bcd6fb06a8f7784f2"},
{file = "har2tree-1.6.1-py3-none-any.whl", hash = "sha256:2db656b47986a682f46c3bdcbe928ff4048c1b8ccbdc557306de368518584f59"},
{file = "har2tree-1.6.1.tar.gz", hash = "sha256:f34627f80aa155e28c920d17d3390d15fe71874652e0d54b52025dede5f50030"},
]
hpack = [
{file = "hpack-3.0.0-py2.py3-none-any.whl", hash = "sha256:0edd79eda27a53ba5be2dfabf3b15780928a0dff6eb0c60a3d6767720e970c89"},
@ -1878,8 +1878,8 @@ typing-extensions = [
{file = "typing_extensions-3.10.0.0.tar.gz", hash = "sha256:50b6f157849174217d0656f99dc82fe932884fb250826c18350e159ec6cdf342"},
]
urllib3 = [
{file = "urllib3-1.26.4-py2.py3-none-any.whl", hash = "sha256:2f4da4594db7e1e110a944bb1b551fdf4e6c136ad42e4234131391e21eb5b0df"},
{file = "urllib3-1.26.4.tar.gz", hash = "sha256:e7b021f7241115872f92f43c6508082facffbd1c048e3c6e2bb9c2a157e28937"},
{file = "urllib3-1.26.5-py2.py3-none-any.whl", hash = "sha256:753a0374df26658f99d826cfe40394a686d05985786d946fbe4165b5148f5a7c"},
{file = "urllib3-1.26.5.tar.gz", hash = "sha256:a7acd0977125325f516bda9735fa7142b909a8d01e8b2e4c8108d0984e6e0098"},
]
vt-py = [
{file = "vt-py-0.6.3.tar.gz", hash = "sha256:172916d07b54927271e62dd3ead03142189d7431e9ec0fdbb75fe09f68efa888"},

View File

@ -47,7 +47,7 @@ vt-py = "^0.6.2"
pyeupi = "^1.1"
scrapysplashwrapper = "^1.6.0"
pysanejs = "^1.4"
har2tree = "^1.6.0"
har2tree = "^1.6.1"
pylookyloo = "^1.6"
dnspython = "^2.1.0"
pytaxonomies = "^1.3"

View File

@ -110,7 +110,7 @@
{% for url in urls %}
{# URL Display #}
<li class="list-group-item">
<div class="h3" title={{ url['url_object'].name }}>
<div class="h3" title="{{ url['url_object'].name }}">
{# HTTPs or not #}
{% if url['encrypted'] %}
<img src="/static/secure.svg" title="Encrypted request" width="21" height="21"/>
@ -162,7 +162,11 @@
{# Details of the response #}
<p class="h4">Response
<small>(Status code:
<span title="{{ http_status_description(url['url_object'].response['status']) }}">{{ url['url_object'].response['status'] }})</span>
<span title="{{ http_status_description(url['url_object'].response['status']) }}">
{{ url['url_object'].response['status'] }})
</span>
-
<span>Load time: {{ url['url_object'].time.total_seconds() }}s</span>
</small>
</p>
{{ popup_icons(keys_response, url['url_object'], tree_uuid) }}

View File

@ -28,5 +28,11 @@
<dt class="col-sm-2">Total Nodes</dt>
<dd class="col-sm-10">{{ stats['total_hostnames'] }}</dd>
<dt class="col-sm-2">Total load time</dt>
<dd class="col-sm-10">{{ stats['total_load_time'] }}</dd>
<dt class="col-sm-2">Total size responses</dt>
<dd class="col-sm-10">{{ sizeof_fmt(stats['total_size_responses']) }}</dd>
</dl>
</div>