lookyloo/lookyloo/helpers.py

189 lines
5.8 KiB
Python
Raw Normal View History

2019-01-23 15:13:29 +01:00
#!/usr/bin/env python3
import hashlib
2020-10-09 18:05:04 +02:00
import json
2021-09-07 12:59:31 +02:00
import logging
2022-03-29 21:13:02 +02:00
import pkg_resources
from datetime import datetime, timedelta
2021-03-31 19:25:57 +02:00
from enum import IntEnum, unique
2021-09-07 12:59:31 +02:00
from functools import lru_cache
from io import BufferedIOBase
from pathlib import Path
2022-04-25 14:43:02 +02:00
from typing import Any, Dict, List, Optional, Set, Union
from urllib.parse import urlparse
2020-01-06 15:32:38 +01:00
2022-03-29 21:13:02 +02:00
2021-09-07 12:59:31 +02:00
from har2tree import CrawledTree, HostNode, URLNode
2020-10-09 18:05:04 +02:00
from publicsuffix2 import PublicSuffixList, fetch # type: ignore
2020-10-28 18:49:15 +01:00
from pytaxonomies import Taxonomies
2022-03-29 21:13:02 +02:00
from ua_parser import user_agent_parser # type: ignore
from werkzeug.user_agent import UserAgent
from werkzeug.utils import cached_property
2020-10-28 18:49:15 +01:00
2022-04-25 14:43:02 +02:00
from .default import get_homedir, safe_create_dir
2020-10-09 18:05:04 +02:00
logger = logging.getLogger('Lookyloo - Helpers')
2019-01-23 15:13:29 +01:00
2021-03-31 19:25:57 +02:00
@unique
class CaptureStatus(IntEnum):
    """Lifecycle state of a capture, as exposed to clients polling for status."""
    UNKNOWN = -1
    QUEUED = 0
    DONE = 1
    ONGOING = 2
2020-10-09 18:05:04 +02:00
# This method is used in json.dump or json.dumps calls as the default parameter:
# json.dumps(..., default=serialize_to_json)
def serialize_to_json(obj: Any) -> Any:
    """Make otherwise non-serializable objects (currently only sets) JSON serializable.

    :param obj: an object ``json`` does not know how to encode.
    :return: a JSON-encodable equivalent (a list, for a set).
    :raises TypeError: for any unsupported type, as the ``json`` ``default``
        protocol requires — previously the function implicitly returned None,
        which json silently encoded as ``null``, hiding the problem.
    """
    if isinstance(obj, set):
        return list(obj)
    raise TypeError(f'Object of type {type(obj).__name__} is not JSON serializable')
def get_resources_hashes(har2tree_container: Union[CrawledTree, HostNode, URLNode]) -> Set[str]:
    """Gather the hashes of every resource reachable from the given har2tree container.

    :param har2tree_container: a full tree, a host node, or a single URL node.
    :return: the union of the ``resources_hashes`` of all URL nodes found.
    """
    # Normalize the three accepted container types to a flat iterable of URL nodes.
    if isinstance(har2tree_container, CrawledTree):
        nodes = har2tree_container.root_hartree.url_tree.traverse()
    elif isinstance(har2tree_container, HostNode):
        nodes = har2tree_container.urls
    elif isinstance(har2tree_container, URLNode):
        nodes = [har2tree_container]
    else:
        raise Exception(f'har2tree_container cannot be {type(har2tree_container)}')
    # Not every node carries resources_hashes, hence the hasattr guard.
    return {h for node in nodes
            if hasattr(node, 'resources_hashes')
            for h in node.resources_hashes}
2020-10-28 18:49:15 +01:00
@lru_cache(64)
def get_taxonomies():
    """Return the (cached) MISP taxonomies collection."""
    taxonomies = Taxonomies()
    return taxonomies
2020-10-13 13:03:57 +02:00
@lru_cache(64)
def get_public_suffix_list():
    """Initialize Public Suffix List"""
    try:
        # Prefer a freshly fetched list; any failure (network, parsing)
        # falls back to the copy bundled with the package.
        return PublicSuffixList(psl_file=fetch())
    except Exception:
        return PublicSuffixList()
@lru_cache(64)
def get_captures_dir() -> Path:
    """Return the directory the captures are stored in, creating it if needed."""
    captures = get_homedir() / 'scraped'
    safe_create_dir(captures)
    return captures
2020-10-13 13:03:57 +02:00
@lru_cache(64)
def get_email_template() -> str:
    """Read the (cached) email template from the config directory."""
    template_path = get_homedir() / 'config' / 'email.tmpl'
    return template_path.read_text()
def get_user_agents(directory: str='user_agents') -> Dict[str, Any]:
    """Load the most recent user-agents JSON file found under *directory*.

    File names sort chronologically, so the lexicographically greatest one
    is the newest.
    """
    ua_files = sorted((get_homedir() / directory).glob('**/*.json'), reverse=True)
    newest = ua_files[0]
    with newest.open() as f:
        return json.load(f)
2020-03-23 12:45:57 +01:00
def load_known_content(directory: str='known_content') -> Dict[str, Dict[str, Any]]:
    """Load every known-content JSON file from *directory*, keyed by file stem."""
    return {path.stem: json.loads(path.read_text())
            for path in (get_homedir() / directory).glob('*.json')}
2020-10-12 12:15:07 +02:00
def load_cookies(cookie_pseudofile: Optional[Union[BufferedIOBase, str]]=None) -> List[Dict[str, Union[str, bool]]]:
    """Load cookies to inject into a capture.

    :param cookie_pseudofile: either a JSON string or a file-like object
        containing a list of cookies. When missing, falls back to
        ``<homedir>/cookies.json`` (and returns ``[]`` if that file does
        not exist).
    :return: a list of cookie dicts in the lookyloo/playwright format;
        cookies in the "Cookie Quick Manager" export format are converted.
    """
    cookies: List[Dict[str, Union[str, bool]]]
    if cookie_pseudofile:
        if isinstance(cookie_pseudofile, str):
            try:
                cookies = json.loads(cookie_pseudofile)
            except json.decoder.JSONDecodeError:
                logger.warning(f'Unable to load json content: {cookie_pseudofile}')
                return []
        else:
            cookies = json.load(cookie_pseudofile)
    else:
        if not (get_homedir() / 'cookies.json').exists():
            return []

        with (get_homedir() / 'cookies.json').open() as f:
            cookies = json.load(f)
    to_return: List[Dict[str, Union[str, bool]]] = []
    try:
        for cookie in cookies:
            to_add: Dict[str, Union[str, bool]]
            if 'Host raw' in cookie and isinstance(cookie['Host raw'], str):
                # Cookie export format for Cookie Quick Manager
                u = urlparse(cookie['Host raw']).netloc.split(':', 1)[0]
                to_add = {'path': cookie['Path raw'],
                          'name': cookie['Name raw'],
                          'httpOnly': cookie['HTTP only raw'] == 'true',
                          'secure': cookie['Send for'] == 'Encrypted connections only',
                          # The export carries no usable expiry; give the cookie 10 days.
                          'expires': (datetime.now() + timedelta(days=10)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z',
                          'domain': u,
                          'value': cookie['Content raw']
                          }
            else:
                # Cookie from lookyloo/playwright
                to_add = cookie
            to_return.append(to_add)
    except Exception as e:
        # Best-effort: a malformed entry aborts the loop but keeps what was
        # already converted. Report through the module logger (was print(),
        # inconsistent with the warning above and invisible in the logs).
        logger.warning(f'Unable to load the cookie file: {e}')
    return to_return
2020-11-24 16:44:43 +01:00
def uniq_domains(uniq_urls):
    """Return the set of distinct hostnames appearing in *uniq_urls*."""
    return {urlparse(url).hostname for url in uniq_urls}
2021-08-10 17:38:47 +02:00
@lru_cache(64)
def get_useragent_for_requests():
    """Build the User-Agent header value ('Lookyloo / <version>') for outgoing requests."""
    return 'Lookyloo / ' + pkg_resources.get_distribution('lookyloo').version
def get_cache_directory(root: Path, identifier: str, namespace: Optional[str] = None) -> Path:
    """Build a sharded cache path for *identifier* under *root*.

    The path is ``root[/namespace]/<d0>/<d1>/<d2>/<digest>`` where ``digest``
    is the hex md5 of the identifier — the three single-character levels keep
    directory fan-out manageable.
    """
    digest = hashlib.md5(identifier.encode()).hexdigest()
    base = root / namespace if namespace else root
    return base / digest[0] / digest[1] / digest[2] / digest
2022-03-29 21:13:02 +02:00
class ParsedUserAgent(UserAgent):
    """UserAgent subclass exposing ua-parser results as properties."""
    # from https://python.tutorialink.com/how-do-i-get-the-user-agent-with-flask/

    @cached_property
    def _details(self):
        # Parse the raw UA string once; cached_property memoizes per instance.
        return user_agent_parser.Parse(self.string)

    @property
    def platform(self):
        return self._details['os'].get('family')

    @property
    def browser(self):
        return self._details['user_agent'].get('family')

    @property
    def version(self):
        # Join major.minor.patch, skipping components ua-parser left unset.
        parts = (self._details['user_agent'][key] for key in ('major', 'minor', 'patch'))
        return '.'.join(part for part in parts if part is not None)