#!/usr/bin/env python3
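"""Update Lookyloo's list of known user agents.

Fetches the "most common user agents" page from techblog.willshouse.com
(behind Cloudflare, hence the optional cloudscraper dependency) and stores
a daily JSON snapshot under user_agents/<year>/<month>/. Run as a script,
it instead parses a manually saved copy of the page (see main()).
"""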

import json
import traceback

from datetime import datetime
from pathlib import Path
from typing import Dict, Any

from bs4 import BeautifulSoup

try:
    import cloudscraper  # type: ignore
    HAS_CF = True
except ImportError:
    HAS_CF = False

from lookyloo.default import get_homedir, safe_create_dir
from lookyloo.helpers import ParsedUserAgent, serialize_to_json


def update_user_agents() -> None:
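    """Fetch the current most-common-UA page and store today's snapshot as JSON."""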
    # NOTE: this URL is behind Cloudflare and there is no easy, reliable way around it.
    # The manual way is to open the page in a browser, save it, and run this script.
    if not HAS_CF:
        # The website with the UAs is behind Cloudflare's anti-bot page; we need cloudscraper.
        return

    today = datetime.now()
    # One snapshot per day: user_agents/<year>/<month>/<ISO date>.json
    ua_path = get_homedir() / 'user_agents' / str(today.year) / f'{today.month:02}'
    safe_create_dir(ua_path)
    ua_file_name: Path = ua_path / f'{today.date().isoformat()}.json'
    if ua_file_name.exists():
        # Already have a UA file for that day.
        return
    try:
        s = cloudscraper.create_scraper()
        r = s.get('https://techblog.willshouse.com/2012/01/03/most-common-user-agents/')
    except Exception:
        traceback.print_exc()
        return
    to_store = ua_parser(r.text)
    with open(ua_file_name, 'w') as f:
        # ua_parser builds sets, which json cannot serialize without a default,
        # so pass serialize_to_json here as main() already does.
        json.dump(to_store, f, indent=2, default=serialize_to_json)


def ua_parser(html_content: str) -> Dict[str, Any]:
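    """Extract the UA list from the page and group it by platform and browser.

    Returns a dict mapping each platform (e.g. 'Windows 10') to browsers
    (e.g. 'Chrome 104') to a set of UA strings, plus a 'by_frequency' list
    that keeps the ordering of the source page.
    """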
    soup = BeautifulSoup(html_content, 'html.parser')

    try:
        # The UA list is a JSON blob in the second <textarea> of the page.
        uas = soup.find_all('textarea')[1].text
    except Exception:
        traceback.print_exc()
        return {}

    to_store: Dict[str, Any] = {'by_frequency': []}
    for ua in json.loads(uas.replace('\n', '')):
        parsed_ua = ParsedUserAgent(ua['useragent'])
        if not parsed_ua.platform or not parsed_ua.browser:
            # Skip entries the parser cannot attribute to a platform and browser.
            continue
        # Build 'platform[ version]' and 'browser[ version]' keys.
        platform_key = parsed_ua.platform
        if parsed_ua.platform_version:
            platform_key = f'{platform_key} {parsed_ua.platform_version}'
        browser_key = parsed_ua.browser
        if parsed_ua.version:
            browser_key = f'{browser_key} {parsed_ua.version}'
        # Group by platform, then browser; a set deduplicates the UA strings.
        if platform_key not in to_store:
            to_store[platform_key] = {}
        if browser_key not in to_store[platform_key]:
            to_store[platform_key][browser_key] = set()
        to_store[platform_key][browser_key].add(parsed_ua.string)
        # Also keep a flat list in the order the page gives them.
        to_store['by_frequency'].append({'os': platform_key,
                                         'browser': browser_key,
                                         'useragent': parsed_ua.string})
    return to_store


def main():
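    """Parse a manually saved copy of the page from the current working directory."""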
    # Fallback path: the page, saved by hand from a browser (see the NOTE in
    # update_user_agents), is expected in the current working directory.
    to_parse = Path('Most Common User Agents - Tech Blog (wh).html')

    today = datetime.now()
    ua_path = get_homedir() / 'user_agents' / str(today.year) / f'{today.month:02}'
    safe_create_dir(ua_path)
    ua_file_name: Path = ua_path / f'{today.date().isoformat()}.json'

    with to_parse.open() as f:
        to_store = ua_parser(f.read())

    with open(ua_file_name, 'w') as f:
        # serialize_to_json turns the sets built by ua_parser into something JSON can store.
        json.dump(to_store, f, indent=2, default=serialize_to_json)


if __name__ == '__main__':
    main()