From ebbe6e3ce921c2eaa0f79f87e0c5f3faa5cc18a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?= Date: Mon, 22 Aug 2022 17:34:00 +0200 Subject: [PATCH] new: Pick mobile devices on capture page --- bin/async_capture.py | 2 +- bin/background_processing.py | 20 ++++--- lookyloo/helpers.py | 56 ++++++++++++++---- poetry.lock | 50 ++++++++-------- pyproject.toml | 8 +-- tools/manual_parse_ua_list.py | 26 +++++--- website/web/__init__.py | 5 +- website/web/templates/capture.html | 95 +++++++++++++++++++++++++----- 8 files changed, 190 insertions(+), 72 deletions(-) diff --git a/bin/async_capture.py b/bin/async_capture.py index 128d926..bc1f48f 100755 --- a/bin/async_capture.py +++ b/bin/async_capture.py @@ -87,7 +87,7 @@ class AsyncCapture(AbstractManager): url = to_capture[b'url'].decode() self.thirdparty_submit(url) else: - self.logger.warning(f'Invalid capture {to_capture}.') + self.logger.warning(f'Invalid capture (no URL provided): {to_capture}.') url = '' if url: diff --git a/bin/background_processing.py b/bin/background_processing.py index 3c0b275..a773769 100755 --- a/bin/background_processing.py +++ b/bin/background_processing.py @@ -49,13 +49,19 @@ class Processing(AbstractManager): parsed_ua = ParsedUserAgent(ua) if not parsed_ua.platform or not parsed_ua.browser: continue - if parsed_ua.platform not in to_store: - to_store[parsed_ua.platform] = {} - if f'{parsed_ua.browser} {parsed_ua.version}' not in to_store[parsed_ua.platform]: - to_store[parsed_ua.platform][f'{parsed_ua.browser} {parsed_ua.version}'] = [] - to_store[parsed_ua.platform][f'{parsed_ua.browser} {parsed_ua.version}'].append(parsed_ua.string) - to_store['by_frequency'].append({'os': parsed_ua.platform, - 'browser': f'{parsed_ua.browser} {parsed_ua.version}', + platform_key = parsed_ua.platform + if parsed_ua.platform_version: + platform_key = f'{platform_key} {parsed_ua.platform_version}' + browser_key = parsed_ua.browser + if parsed_ua.version: + browser_key = f'{browser_key} {parsed_ua.version}' + if platform_key not in to_store: + to_store[platform_key] = {} + if browser_key not in to_store[platform_key]: + to_store[platform_key][browser_key] = [] + to_store[platform_key][browser_key].append(parsed_ua.string) + to_store['by_frequency'].append({'os': platform_key, + 'browser': browser_key, 'useragent': parsed_ua.string}) with self_generated_ua_file.open('w') as f: json.dump(to_store, f, indent=2) diff --git a/lookyloo/helpers.py b/lookyloo/helpers.py index 49294c5..842c19e 100644 --- a/lookyloo/helpers.py +++ b/lookyloo/helpers.py @@ -14,6 +14,7 @@ from urllib.parse import urlparse from har2tree import CrawledTree, HostNode, URLNode +from playwrightcapture import get_devices from publicsuffix2 import PublicSuffixList, fetch # type: ignore from pytaxonomies import Taxonomies from ua_parser import user_agent_parser # type: ignore @@ -95,6 +96,7 @@ class UserAgents: ua_files_path = sorted(self.path.glob('**/*.json'), reverse=True) self._load_newest_ua_file(ua_files_path[0]) + self._load_playwright_devices() def _load_newest_ua_file(self, path: Path): self.most_recent_ua_path = path @@ -102,6 +104,25 @@ class UserAgents: self.most_recent_uas = json.load(f) self.by_freq = self.most_recent_uas.pop('by_frequency') + def _load_playwright_devices(self): + self.playwright_devices = get_devices() + # Only get default and desktop for now. + for device_name, details in self.playwright_devices['desktop']['default'].items(): + parsed_ua = ParsedUserAgent(details['user_agent']) + if not parsed_ua.platform or not parsed_ua.browser: + continue + platform_key = parsed_ua.platform + if parsed_ua.platform_version: + platform_key = f'{platform_key} {parsed_ua.platform_version}' + browser_key = parsed_ua.browser + if parsed_ua.version: + browser_key = f'{browser_key} {parsed_ua.version}' + if platform_key not in self.most_recent_uas: + self.most_recent_uas[platform_key] = {} + if browser_key not in self.most_recent_uas[platform_key]: + self.most_recent_uas[platform_key][browser_key] = [] + self.most_recent_uas[platform_key][browser_key].append(parsed_ua.string) + @property def user_agents(self) -> Dict[str, Dict[str, List[str]]]: ua_files_path = sorted(self.path.glob('**/*.json'), reverse=True) @@ -111,14 +132,19 @@ class UserAgents: @property def default(self) -> Dict[str, str]: - blocked_words = ['bot', 'bing'] - for ua in self.by_freq: - if ua["os"] == "Other": - continue - if any(blockedword in ua['useragent'].lower() for blockedword in blocked_words): - continue - return ua - raise Exception('Erros with the User agents.') + '''The default useragent for desktop chrome from playwright''' + parsed_ua = ParsedUserAgent(self.playwright_devices['desktop']['default']['Desktop Chrome']['user_agent']) + platform_key = parsed_ua.platform + if parsed_ua.platform_version: + platform_key = f'{platform_key} {parsed_ua.platform_version}' + browser_key = parsed_ua.browser + if parsed_ua.version: + browser_key = f'{browser_key} {parsed_ua.version}' + if not platform_key or not browser_key: + raise Exception(f'Unable to get valid default user agent from playwright: {parsed_ua}') + return {'os': platform_key, + 'browser': browser_key, + 'useragent': parsed_ua.string} def load_known_content(directory: str='known_content') -> Dict[str, Dict[str, Any]]: @@ -210,14 +236,24 @@ class ParsedUserAgent(UserAgent): def platform(self): return self._details['os'].get('family') + @property + def platform_version(self) -> Optional[str]: + return self._aggregate_version(self._details['os']) + @property def browser(self): return self._details['user_agent'].get('family') @property def version(self): + return self._aggregate_version(self._details['user_agent']) + + def _aggregate_version(self, details: Dict[str, str]) -> Optional[str]: return '.'.join( part - for key in ('major', 'minor', 'patch') - if (part := self._details['user_agent'][key]) is not None + for key in ('major', 'minor', 'patch', 'patch_minor') + if (part := details.get(key)) is not None ) + + def __str__(self): + return f'OS: {self.platform} - Browser: {self.browser} {self.version} - UA: {self.string}' diff --git a/poetry.lock b/poetry.lock index 29c3d8d..50de5fe 100644 --- a/poetry.lock +++ b/poetry.lock @@ -120,7 +120,7 @@ lxml = ["lxml"] [[package]] name = "bootstrap-flask" -version = "2.0.2" +version = "2.1.0" description = "Bootstrap 4 & 5 helper for your Flask projects." category = "main" optional = false @@ -156,7 +156,7 @@ python-versions = ">=3.6" [[package]] name = "charset-normalizer" -version = "2.1.0" +version = "2.1.1" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." category = "main" optional = false @@ -544,7 +544,7 @@ i18n = ["Babel (>=2.7)"] [[package]] name = "jsonschema" -version = "4.12.1" +version = "4.14.0" description = "An implementation of JSON Schema validation for Python" category = "main" optional = false @@ -557,8 +557,8 @@ pkgutil-resolve-name = {version = ">=1.3.10", markers = "python_version < \"3.9\ pyrsistent = ">=0.14.0,<0.17.0 || >0.17.0,<0.17.1 || >0.17.1,<0.17.2 || >0.17.2" [package.extras] -format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3987", "uri-template", "webcolors (>=1.11)"] -format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "uri-template", "webcolors (>=1.11)"] +format-nongpl = ["webcolors (>=1.11)", "uri-template", "rfc3986-validator (>0.1.0)", "rfc3339-validator", "jsonpointer (>1.13)", "isoduration", "idna", "fqdn"] +format = ["webcolors (>=1.11)", "uri-template", "rfc3987", "rfc3339-validator", "jsonpointer (>1.13)", "isoduration", "idna", "fqdn"] [[package]] name = "lief" @@ -738,7 +738,7 @@ websockets = "10.1" [[package]] name = "playwrightcapture" -version = "1.14.2" +version = "1.14.3" description = "A simple library to capture websites using playwright" category = "main" optional = false @@ -1217,7 +1217,7 @@ python-versions = "*" [[package]] name = "types-redis" -version = "4.3.14" +version = "4.3.16" description = "Typing stubs for redis" category = "dev" optional = false @@ -1293,7 +1293,7 @@ test = ["pytest-mock (>=3.3)", "pytest (>=4.3)"] [[package]] name = "ua-parser" -version = "0.15.0" +version = "0.16.0" description = "Python port of Browserscope's user agent parser" category = "main" optional = false @@ -1301,7 +1301,7 @@ python-versions = "*" [[package]] name = "urllib3" -version = "1.26.11" +version = "1.26.12" description = "HTTP library with thread-safe connection pooling, file post, and more." category = "main" optional = false @@ -1309,7 +1309,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, [package.extras] brotli = ["brotlicffi (>=0.8.0)", "brotli (>=1.0.9)", "brotlipy (>=0.6.0)"] -secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"] +secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "urllib3-secure-extra", "ipaddress"] socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [[package]] @@ -1413,7 +1413,7 @@ misp = ["python-magic", "pydeep2"] [metadata] lock-version = "1.1" python-versions = ">=3.8,<3.11" -content-hash = "50d92ec200e527034c7286b1c2529f6c278446bbbbed9abccd4702e95c829a18" +content-hash = "b813c16a36c8bfd612488d2fdfce36b34234e03fae997dc56a6613672c31d9b4" [metadata.files] aiohttp = [ @@ -1541,8 +1541,8 @@ beautifulsoup4 = [ {file = "beautifulsoup4-4.11.1.tar.gz", hash = "sha256:ad9aa55b65ef2808eb405f46cf74df7fcb7044d5cbc26487f96eb2ef2e436693"}, ] bootstrap-flask = [ - {file = "Bootstrap-Flask-2.0.2.tar.gz", hash = "sha256:4ddd910c5a821d92d20aeda2476203814ca2e6764b4cf31768f134b3a07f5691"}, - {file = "Bootstrap_Flask-2.0.2-py2.py3-none-any.whl", hash = "sha256:32867bb785dd2be84a045367401849c6afb1debba25b966b0c48e1fd62bb97c8"}, + {file = "Bootstrap-Flask-2.1.0.tar.gz", hash = "sha256:dc4f9c463727f3a59a6bfb17b7f9d13fd07646ba852f94285542c6a1e4e457e3"}, + {file = "Bootstrap_Flask-2.1.0-py2.py3-none-any.whl", hash = "sha256:52e360421aafbf117f59ed8237391b2a4c77592e35ab7b4566d4fde3b277825a"}, ] cchardet = [ {file = "cchardet-2.1.7-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:c6f70139aaf47ffb94d89db603af849b82efdf756f187cdd3e566e30976c519f"}, @@ -1584,8 +1584,8 @@ chardet = [ {file = "chardet-5.0.0.tar.gz", hash = "sha256:0368df2bfd78b5fc20572bb4e9bb7fb53e2c094f60ae9993339e8671d0afb8aa"}, ] charset-normalizer = [ - {file = "charset-normalizer-2.1.0.tar.gz", hash = "sha256:575e708016ff3a5e3681541cb9d79312c416835686d054a23accb873b254f413"}, - {file = "charset_normalizer-2.1.0-py3-none-any.whl", hash = "sha256:5189b6f22b01957427f35b6a08d9a0bc45b46d3788ef5a92e978433c7a35f8a5"}, + {file = "charset-normalizer-2.1.1.tar.gz", hash = "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845"}, + {file = "charset_normalizer-2.1.1-py3-none-any.whl", hash = "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"}, ] click = [ {file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"}, @@ -1850,8 +1850,8 @@ jinja2 = [ {file = "Jinja2-3.1.2.tar.gz", hash = "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852"}, ] jsonschema = [ - {file = "jsonschema-4.12.1-py3-none-any.whl", hash = "sha256:05f975aee3f1244a1ea0e018e8ad2672f6ca5fd1a28bc46ffc7d4b3e9896cac4"}, - {file = "jsonschema-4.12.1.tar.gz", hash = "sha256:c7dd96a88c4ea60bdc8478589ee2d4ea5d73ab235e24d17641ad733dde4e3eb1"}, + {file = "jsonschema-4.14.0-py3-none-any.whl", hash = "sha256:9892b8d630a82990521a9ca630d3446bd316b5ad54dbe981338802787f3e0d2d"}, + {file = "jsonschema-4.14.0.tar.gz", hash = "sha256:15062f4cc6f591400cd528d2c355f2cfa6a57e44c820dc783aee5e23d36a831f"}, ] lief = [ {file = "lief-0.12.1-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:4fbbc9d520de87ac22210c62d22a9b088e5460f9a028741311e6f68ef8877ddd"}, @@ -2212,8 +2212,8 @@ playwright = [ {file = "playwright-1.25.1-py3-none-win_amd64.whl", hash = "sha256:d5c64d4b6f4ab56ea0acf5446f3aa3834beea8d871c58a49eff189aa3cf85d13"}, ] playwrightcapture = [ - {file = "PlaywrightCapture-1.14.2-py3-none-any.whl", hash = "sha256:22bc32ab969e99846cfd2dddc1ea3e210fbca0f35d692c0330ec619e1ce60287"}, - {file = "PlaywrightCapture-1.14.2.tar.gz", hash = "sha256:9efcfc05a1e3327e63bb138ece22d257c2274828cc466f0a04f1059628a42859"}, + {file = "PlaywrightCapture-1.14.3-py3-none-any.whl", hash = "sha256:5068f58726ebff1f7928e9793f3c91ead97ba1b096f05021261d126c9ce7fb9e"}, + {file = "PlaywrightCapture-1.14.3.tar.gz", hash = "sha256:7243b908caa16b9e50c662eb01f558a578fd4acb0541fbde7103714ef51219b5"}, ] prompt-toolkit = [ {file = "prompt_toolkit-3.0.30-py3-none-any.whl", hash = "sha256:d8916d3f62a7b67ab353a952ce4ced6a1d2587dfe9ef8ebc30dd7c386751f289"}, @@ -2484,8 +2484,8 @@ types-python-dateutil = [ {file = "types_python_dateutil-2.8.19-py3-none-any.whl", hash = "sha256:6284df1e4783d8fc6e587f0317a81333856b872a6669a282f8a325342bce7fa8"}, ] types-redis = [ - {file = "types-redis-4.3.14.tar.gz", hash = "sha256:aa5955ff5f10b87a9021c031eb002a544a41a0a6280084ec0758a8a812a375c4"}, - {file = "types_redis-4.3.14-py3-none-any.whl", hash = "sha256:c5518f14df81f2070bb075b292309331b1be283f1927adf3f04983eeb05e2f98"}, + {file = "types-redis-4.3.16.tar.gz", hash = "sha256:ed53e35eac4303ef70cf7f5c205e210f2238b7fdd9306b7f8676669c1070943e"}, + {file = "types_redis-4.3.16-py3-none-any.whl", hash = "sha256:41da87dc80f39573a71873e4265f181d9628b0a2a862e850ecc896a0cd6cacd2"}, ] types-requests = [ {file = "types-requests-2.28.9.tar.gz", hash = "sha256:feaf581bd580497a47fe845d506fa3b91b484cf706ff27774e87659837de9962"}, @@ -2516,12 +2516,12 @@ tzlocal = [ {file = "tzlocal-4.2.tar.gz", hash = "sha256:ee5842fa3a795f023514ac2d801c4a81d1743bbe642e3940143326b3a00addd7"}, ] ua-parser = [ - {file = "ua-parser-0.15.0.tar.gz", hash = "sha256:e441c982ffe81aa7e31af40ac6bf1d39f8ad24f1d34a2d91baae415470b26e9b"}, - {file = "ua_parser-0.15.0-py2.py3-none-any.whl", hash = "sha256:a93592ee96922b5f969bde9ae79662bdd41d041760280b099a6700264a1b7291"}, + {file = "ua-parser-0.16.0.tar.gz", hash = "sha256:ace314a7c04f89d37282d5c837d1189960314a336270b41f6d6171097c8dfe9a"}, + {file = "ua_parser-0.16.0-py2.py3-none-any.whl", hash = "sha256:f11dcec514d211be24671da296c491adbe43d76e6866d99972fda966c77b6349"}, ] urllib3 = [ - {file = "urllib3-1.26.11-py2.py3-none-any.whl", hash = "sha256:c33ccba33c819596124764c23a97d25f32b28433ba0dedeb77d873a38722c9bc"}, - {file = "urllib3-1.26.11.tar.gz", hash = "sha256:ea6e8fb210b19d950fab93b60c9009226c63a28808bc8386e05301e25883ac0a"}, + {file = "urllib3-1.26.12-py2.py3-none-any.whl", hash = "sha256:b930dd878d5a8afb066a637fbb35144fe7901e3b209d1cd4f524bd0e9deee997"}, + {file = "urllib3-1.26.12.tar.gz", hash = "sha256:3fa96cf423e6987997fc326ae8df396db2a8b7c667747d47ddd8ecba91f4a74e"}, ] vt-py = [ {file = "vt-py-0.15.0.tar.gz", hash = "sha256:f6b681314866acc0bfc1e862ca97ba9f522702cd7d1acfd41bd203969fdc3bc3"}, diff --git a/pyproject.toml b/pyproject.toml index 1f443e9..df4fd08 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,7 +42,7 @@ gunicorn = "^20.1.0" cchardet = "^2.1.7" redis = {version = "^4.3.4", extras = ["hiredis"]} beautifulsoup4 = "^4.11.1" -bootstrap-flask = "^2.0.2" +bootstrap-flask = "^2.1.0" defang = "^0.5.3" vt-py = "^0.15.0" pyeupi = "^1.1" @@ -60,10 +60,10 @@ pyphishtanklookup = "^1.1" Flask-Cors = "^3.0.10" pyhashlookup = "^1.2.0" lief = "^0.12.1" -ua-parser = "^0.15.0" +ua-parser = "^0.16.0" Flask-Login = "^0.6.2" har2tree = "^1.14.2" -playwrightcapture = "^1.14.2" +playwrightcapture = "^1.14.3" passivetotal = "^2.5.9" werkzeug = "2.1.2" filetype = "^1.1.0" @@ -74,7 +74,7 @@ misp = ['python-magic', 'pydeep2'] [tool.poetry.dev-dependencies] mypy = "^0.971" ipython = "^8.4.0" -types-redis = "^4.3.14" +types-redis = "^4.3.16" types-requests = "^2.28.9" types-Flask = "^1.1.6" types-pkg-resources = "^0.1.3" diff --git a/tools/manual_parse_ua_list.py b/tools/manual_parse_ua_list.py index e3688f5..97010fb 100644 --- a/tools/manual_parse_ua_list.py +++ b/tools/manual_parse_ua_list.py @@ -14,6 +14,7 @@ except ImportError: HAS_CF = False from lookyloo.default import get_homedir, safe_create_dir +from lookyloo.helpers import ParsedUserAgent def update_user_agents() -> None: @@ -52,14 +53,23 @@ def ua_parser(html_content: str) -> Dict[str, Any]: to_store: Dict[str, Any] = {'by_frequency': []} for ua in json.loads(uas.replace('\n', '')): - os = ua['system'].split(' ')[-1] - if os not in to_store: - to_store[os] = {} - browser = ' '.join(ua['system'].split(' ')[:-1]) - if browser not in to_store[os]: - to_store[os][browser] = [] - to_store[os][browser].append(ua['useragent']) - to_store['by_frequency'].append({'os': os, 'browser': browser, 'useragent': ua['useragent']}) + parsed_ua = ParsedUserAgent(ua['useragent']) + if not parsed_ua.platform or not parsed_ua.browser: + continue + platform_key = parsed_ua.platform + if parsed_ua.platform_version: + platform_key = f'{platform_key} {parsed_ua.platform_version}' + browser_key = parsed_ua.browser + if parsed_ua.version: + browser_key = f'{browser_key} {parsed_ua.version}' + if platform_key not in to_store: + to_store[platform_key] = {} + if browser_key not in to_store[platform_key]: + to_store[platform_key][browser_key] = [] + to_store[platform_key][browser_key].append(parsed_ua.string) + to_store['by_frequency'].append({'os': platform_key, + 'browser': browser_key, + 'useragent': parsed_ua.string}) return to_store diff --git a/website/web/__init__.py b/website/web/__init__.py index 774be17..0f98e04 100644 --- a/website/web/__init__.py +++ b/website/web/__init__.py @@ -842,6 +842,7 @@ def _prepare_capture_template(user_ua: Optional[str], predefined_url: Optional[s default=user_agents.default, personal_ua=user_ua, default_public=get_config('generic', 'default_public'), + devices=lookyloo.get_playwright_devices(), predefined_url_to_capture=predefined_url if predefined_url else '') @@ -872,7 +873,9 @@ def capture_web(): if 'cookies' in request.files and request.files['cookies'].filename: capture_query['cookies'] = request.files['cookies'].stream.read() - if request.form.get('freetext_ua'): + if request.form.get('device_name'): + capture_query['device_name'] = request.form['device_name'] + elif request.form.get('freetext_ua'): capture_query['user_agent'] = request.form['freetext_ua'] elif request.form.get('personal_ua') and request.headers.get('User-Agent'): capture_query['user_agent'] = request.headers['User-Agent'] diff --git a/website/web/templates/capture.html b/website/web/templates/capture.html index 2e73cbf..d57bdd1 100644 --- a/website/web/templates/capture.html +++ b/website/web/templates/capture.html @@ -39,6 +39,7 @@ +