diff --git a/lookyloo/capturecache.py b/lookyloo/capturecache.py index caecc552..e1daadd4 100644 --- a/lookyloo/capturecache.py +++ b/lookyloo/capturecache.py @@ -32,7 +32,7 @@ from pyipasnhistory import IPASNHistory # type: ignore[attr-defined] from redis import Redis from .context import Context -from .helpers import get_captures_dir, is_locked, load_pickle_tree, get_pickle_path, remove_pickle_tree, get_indexing +from .helpers import get_captures_dir, is_locked, load_pickle_tree, get_pickle_path, remove_pickle_tree, get_indexing, mimetype_to_generic from .default import LookylooException, try_make_file, get_config from .exceptions import MissingCaptureDirectory, NoValidHarFile, MissingUUID, TreeNeedsRebuild from .modules import Cloudflare @@ -291,6 +291,14 @@ class CapturesIndex(Mapping): # type: ignore[type-arg] default_recursion_limit = sys.getrecursionlimit() with self._timeout_context(): tree = CrawledTree(har_files, uuid) + for node in tree.root_hartree.hostname_tree.traverse(): + for url in node.urls: + if 'mimetype' in url.features: + generic_type = mimetype_to_generic(url.mimetype) + if generic_type not in node.features: + node.add_feature(generic_type, 1) + else: + node.add_feature(generic_type, getattr(node, generic_type) + 1) await self.__resolve_dns(tree, logger) if self.contextualizer: self.contextualizer.contextualize_tree(tree) diff --git a/lookyloo/helpers.py b/lookyloo/helpers.py index 2aa2efb7..216e84c9 100644 --- a/lookyloo/helpers.py +++ b/lookyloo/helpers.py @@ -511,3 +511,61 @@ def load_pickle_tree(capture_dir: Path, last_mod_time: int, logger: Logger) -> C raise TreeNeedsRebuild('We have HAR files and need to rebuild the tree.') # The tree doesn't need to be rebuilt if there are no HAR files. 
# Substrings that classify a response as opaque binary content.
_BINARY_MIMETYPE_MARKERS = (
    'octet-stream',
    'application/x-protobuf',
    'application/pkix-cert',
    'application/x-123',
    'application/x-binary',
    'application/x-msdownload',
    'application/x-thrift',
    'application/x-troff-man',
    'application/x-typekit-augmentation',
    'application/grpc-web',
    'model/gltf-binary',
    'model/obj',
    'application/wasm',
)

# Substrings that classify a response as generic textual content.
_TEXT_MIMETYPE_MARKERS = (
    'text',
    'xml',
    'application/x-www-form-urlencoded',
    'application/vnd.oasis.opendocument.formula-template',
)


def mimetype_to_generic(mimetype: str | None) -> str:
    """Map a raw MIME type onto a coarse, generic content category.

    The checks are ordered: the first matching rule wins, so more specific
    categories (``js``, ``css``, ``json``, ``html``) are tested before the
    catch-all ``text`` rule, and ``video`` before ``audio`` (``video/ogg``
    is a video).

    Media type and subtype tokens are case-insensitive, and values taken from
    HTTP headers may carry surrounding whitespace, so the input is normalized
    (stripped and lower-cased) before matching.

    :param mimetype: The MIME type to classify, optionally with parameters
                     (e.g. ``text/html; charset=utf-8``). ``None``, an empty
                     or blank string, and the literal ``none`` are treated as
                     "not set".
    :return: One of ``unset_mimetype``, ``js``, ``image``, ``css``, ``json``,
             ``html``, ``font``, ``octet-stream``, ``text``, ``video``,
             ``audio``, ``livestream``, ``flash``, ``pdf``, ``archive`` or
             ``unknown_mimetype``.
    """
    if not mimetype:
        return 'unset_mimetype'

    # Normalize once so every rule below can rely on lower-case input.
    mimetype = mimetype.strip().lower()
    if not mimetype or mimetype == 'none':
        return 'unset_mimetype'

    if 'javascript' in mimetype or 'ecmascript' in mimetype or mimetype.startswith('js'):
        return 'js'
    if mimetype.startswith(('image', 'img')) or 'webp' in mimetype:
        return 'image'
    if mimetype.startswith('text/css'):
        return 'css'
    if 'json' in mimetype:
        return 'json'
    if 'html' in mimetype:
        return 'html'
    if 'font' in mimetype or 'woff' in mimetype or 'opentype' in mimetype:
        return 'font'
    if any(marker in mimetype for marker in _BINARY_MIMETYPE_MARKERS):
        return 'octet-stream'
    if (any(marker in mimetype for marker in _TEXT_MIMETYPE_MARKERS)
            or mimetype.startswith(('multipart', 'message'))):
        return 'text'
    if 'video' in mimetype:
        return 'video'
    if 'audio' in mimetype or 'ogg' in mimetype:
        return 'audio'
    if 'mpegurl' in mimetype or 'application/vnd.yt-ump' in mimetype:
        return 'livestream'
    if ('application/x-shockwave-flash' in mimetype
            or 'application/x-shockware-flash' in mimetype):  # Yes, shockwaRe
        return 'flash'
    if 'application/pdf' in mimetype:
        return 'pdf'
    if 'application/gzip' in mimetype or 'application/zip' in mimetype:
        return 'archive'
    return 'unknown_mimetype'
a/website/web/__init__.py b/website/web/__init__.py index c2ce531a..75d0061a 100644 --- a/website/web/__init__.py +++ b/website/web/__init__.py @@ -45,7 +45,8 @@ from lookyloo.default import get_config from lookyloo.exceptions import MissingUUID, NoValidHarFile, LacusUnreachable from lookyloo.helpers import (UserAgents, load_cookies, load_user_config, - get_taxonomies + get_taxonomies, + mimetype_to_generic ) from zoneinfo import available_timezones @@ -53,8 +54,7 @@ from zoneinfo import available_timezones from .genericapi import api as generic_api from .helpers import (User, build_users_table, get_secret_key, load_user_from_request, src_request_ip, sri_load, - get_lookyloo_instance, get_indexing, build_keys_table, - mimetype_to_generic) + get_lookyloo_instance, get_indexing, build_keys_table) from .proxied import ReverseProxied logging.config.dictConfig(get_config('logging')) diff --git a/website/web/genericapi.py b/website/web/genericapi.py index dce9d052..bfcf81f7 100644 --- a/website/web/genericapi.py +++ b/website/web/genericapi.py @@ -855,7 +855,7 @@ class TLDCaptures(Resource): # type: ignore[misc] @api.param('tld', 'Get captures with a specific TLD and their capture timestamp.') # type: ignore[misc] @api.param('urls_only', 'Returns recent URLs with that TLD, regardless the capture.') # type: ignore[misc] @api.param('most_recent_capture', 'Timestamp of the most recent capture to check for a TLD (fallback to now)') # type: ignore[misc] - @api.param('oldest_capture', 'Timestamp of the oldest capture to check for a TLD (fallback to 5 days ago)') # type: ignore[misc] + @api.param('oldest_capture', 'Timestamp of the oldest capture to check for a TLD (fallback to 1 day ago)') # type: ignore[misc] def get(self) -> list[tuple[str, float]] | list[str]: tld: str | None = request.args['tld'] if request.args.get('tld') else None if not tld: diff --git a/website/web/helpers.py b/website/web/helpers.py index f2b9bd36..7e0c6f24 100644 --- a/website/web/helpers.py +++ 
b/website/web/helpers.py @@ -126,61 +126,3 @@ def get_indexing(user: User | None) -> Indexing: It is only accessible to the admin user. ''' return get_indexing_cache(full=bool(user and user.is_authenticated)) - - -def mimetype_to_generic(mimetype: str | None) -> str: - if not mimetype or mimetype == 'none': - return 'unset_mimetype' - elif 'javascript' in mimetype or 'ecmascript' in mimetype or mimetype.startswith('js'): - return 'js' - elif (mimetype.startswith('image') - or mimetype.startswith('img') - or 'webp' in mimetype): - return 'image' - elif mimetype.startswith('text/css'): - return 'css' - elif 'json' in mimetype: - return 'json' - elif 'html' in mimetype: - return 'html' - elif ('font' in mimetype - or 'woff' in mimetype - or 'opentype' in mimetype): - return 'font' - elif ('octet-stream' in mimetype - or 'application/x-protobuf' in mimetype - or 'application/pkix-cert' in mimetype - or 'application/x-123' in mimetype - or 'application/x-binary' in mimetype - or 'application/x-msdownload' in mimetype - or 'application/x-thrift' in mimetype - or 'application/x-troff-man' in mimetype - or 'application/x-typekit-augmentation' in mimetype - or 'application/grpc-web' in mimetype - or 'model/gltf-binary' in mimetype - or 'model/obj' in mimetype - or 'application/wasm' in mimetype): - return 'octet-stream' - elif ('text' in mimetype or 'xml' in mimetype - or mimetype.startswith('multipart') - or mimetype.startswith('message') - or 'application/x-www-form-urlencoded' in mimetype - or 'application/vnd.oasis.opendocument.formula-template' in mimetype): - return 'text' - elif 'video' in mimetype: - return 'video' - elif ('audio' in mimetype or 'ogg' in mimetype): - return 'audio' - elif ('mpegurl' in mimetype - or 'application/vnd.yt-ump' in mimetype): - return 'livestream' - elif ('application/x-shockwave-flash' in mimetype - or 'application/x-shockware-flash' in mimetype): # Yes, shockwaRe - return 'flash' - elif 'application/pdf' in mimetype: - return 'pdf' - 
elif ('application/gzip' in mimetype - or 'application/zip' in mimetype): - return 'archive' - else: - return 'unknown_mimetype' diff --git a/website/web/templates/hostname_popup.html b/website/web/templates/hostname_popup.html index f4fcd13f..8b4e6bdb 100644 --- a/website/web/templates/hostname_popup.html +++ b/website/web/templates/hostname_popup.html @@ -289,7 +289,7 @@ {% for hash, details in url['embedded_ressources'].items() %}
{{hash_info(tree_uuid, url['url_object'].uuid, details['type'], hash, - details['body_size'], details['hash_freq'], + details['body_size'], details.get('hash_freq', 0), has_pandora, details.get('legitimacy'), details.get('known_content')) }}