chg: The Tech Blog UAs can only be fetched manually, moving it to tools

pull/251/head
Raphaël Vinot 2021-08-27 15:22:39 +02:00
parent 9ee6771158
commit 8918b11a7f
2 changed files with 71 additions and 67 deletions

View File

@@ -4,7 +4,6 @@ import os
import logging
import time
import json
import traceback
import pickle
import pkg_resources
from typing import List, Optional, Dict, Union, Any, Set, Tuple
@@ -23,15 +22,8 @@ from redis.exceptions import ConnectionError
import requests
from requests.exceptions import HTTPError
from publicsuffix2 import PublicSuffixList, fetch # type: ignore
from bs4 import BeautifulSoup # type: ignore
from pytaxonomies import Taxonomies
try:
    import cloudscraper  # type: ignore
    HAS_CF = True
except ImportError:
    HAS_CF = False
from .exceptions import MissingEnv, CreateDirectoryException, ConfigError
configs: Dict[str, Dict[str, Any]] = {}
@@ -225,57 +217,9 @@ def long_sleep(sleep_in_sec: int, shutdown_check: int=10) -> bool:
    return True
def update_user_agents() -> None:
    if not HAS_CF:
        # The website with the UAs is behind Cloudflare's anti-bot page, we need cloudscraper
        return
    today = datetime.now()
    ua_path = get_homedir() / 'user_agents' / str(today.year) / f'{today.month:02}'
    safe_create_dir(ua_path)
    ua_file_name: Path = ua_path / f'{today.date().isoformat()}.json'
    if ua_file_name.exists():
        # Already have a UA for that day.
        return
    try:
        s = cloudscraper.create_scraper()
        r = s.get('https://techblog.willshouse.com/2012/01/03/most-common-user-agents/')
    except Exception:
        traceback.print_exc()
        return
    to_store = ua_parser(r.text)
    with open(ua_file_name, 'w') as f:
        json.dump(to_store, f, indent=2)
def ua_parser(html_content: str) -> Dict[str, Any]:
    soup = BeautifulSoup(html_content, 'html.parser')
    try:
        uas = soup.find_all('textarea')[1].text
    except Exception:
        traceback.print_exc()
        return {}
    to_store: Dict[str, Any] = {'by_frequency': []}
    for ua in json.loads(uas.replace('\n', '')):
        os = ua['system'].split(' ')[-1]
        if os not in to_store:
            to_store[os] = {}
        browser = ' '.join(ua['system'].split(' ')[:-1])
        if browser not in to_store[os]:
            to_store[os][browser] = []
        to_store[os][browser].append(ua['useragent'])
        to_store['by_frequency'].append({'os': os, 'browser': browser, 'useragent': ua['useragent']})
    return to_store
def get_user_agents(directory: str='user_agents') -> Dict[str, Any]:
    ua_files_path = str(get_homedir() / directory / '*' / '*' / '*.json')
    paths = sorted(glob(ua_files_path), reverse=True)
    if not paths:
        update_user_agents()
        paths = sorted(glob(ua_files_path), reverse=True)
    with open(paths[0]) as f:
        return json.load(f)
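For reference, the dictionary that ua_parser builds (and that get_user_agents loads back from the newest JSON file on disk) has a flat 'by_frequency' list plus one nested mapping per OS and browser. A minimal sketch of that shape, with purely illustrative values since the real entries come from the Tech Blog page:

# Illustrative sketch of the structure produced by ua_parser; the OS, browser
# and user-agent strings below are made-up placeholders, not real data.
example = {
    'by_frequency': [
        {'os': 'Win10', 'browser': 'Chrome 92.0', 'useragent': 'Mozilla/5.0 (...)'}
    ],
    'Win10': {
        'Chrome 92.0': ['Mozilla/5.0 (...)']
    }
}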

View File

@@ -1,21 +1,81 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import traceback
from datetime import datetime
from pathlib import Path
import json
from typing import Dict, Any
from lookyloo.helpers import ua_parser, get_homedir, safe_create_dir
from bs4 import BeautifulSoup # type: ignore
try:
    import cloudscraper  # type: ignore
    HAS_CF = True
except ImportError:
    HAS_CF = False
to_parse = Path('Most Common User Agents - Tech Blog (wh).html')
from lookyloo.helpers import get_homedir, safe_create_dir
today = datetime.now()
ua_path = get_homedir() / 'user_agents' / str(today.year) / f'{today.month:02}'
safe_create_dir(ua_path)
ua_file_name: Path = ua_path / f'{today.date().isoformat()}.json'
with to_parse.open() as f:
    to_store = ua_parser(f.read())
with open(ua_file_name, 'w') as f:
    json.dump(to_store, f, indent=2)
def update_user_agents() -> None:
    # NOTE: this URL is behind Cloudflare and there is no easy, reliable way around it.
    # The manual way is to open the page in a browser, save it, and run this script.
    if not HAS_CF:
        # The website with the UAs is behind Cloudflare's anti-bot page, we need cloudscraper
        return
    today = datetime.now()
    ua_path = get_homedir() / 'user_agents' / str(today.year) / f'{today.month:02}'
    safe_create_dir(ua_path)
    ua_file_name: Path = ua_path / f'{today.date().isoformat()}.json'
    if ua_file_name.exists():
        # Already have a UA for that day.
        return
    try:
        s = cloudscraper.create_scraper()
        r = s.get('https://techblog.willshouse.com/2012/01/03/most-common-user-agents/')
    except Exception:
        traceback.print_exc()
        return
    to_store = ua_parser(r.text)
    with open(ua_file_name, 'w') as f:
        json.dump(to_store, f, indent=2)
def ua_parser(html_content: str) -> Dict[str, Any]:
    soup = BeautifulSoup(html_content, 'html.parser')
    try:
        uas = soup.find_all('textarea')[1].text
    except Exception:
        traceback.print_exc()
        return {}
    to_store: Dict[str, Any] = {'by_frequency': []}
    for ua in json.loads(uas.replace('\n', '')):
        os = ua['system'].split(' ')[-1]
        if os not in to_store:
            to_store[os] = {}
        browser = ' '.join(ua['system'].split(' ')[:-1])
        if browser not in to_store[os]:
            to_store[os][browser] = []
        to_store[os][browser].append(ua['useragent'])
        to_store['by_frequency'].append({'os': os, 'browser': browser, 'useragent': ua['useragent']})
    return to_store
def main():
    to_parse = Path('Most Common User Agents - Tech Blog (wh).html')
    today = datetime.now()
    ua_path = get_homedir() / 'user_agents' / str(today.year) / f'{today.month:02}'
    safe_create_dir(ua_path)
    ua_file_name: Path = ua_path / f'{today.date().isoformat()}.json'
    with to_parse.open() as f:
        to_store = ua_parser(f.read())
    with open(ua_file_name, 'w') as f:
        json.dump(to_store, f, indent=2)

if __name__ == '__main__':
    main()
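As the NOTE in update_user_agents says, the reliable path is the manual one: save the willshouse page as 'Most Common User Agents - Tech Blog (wh).html' in the directory the script is run from (main() opens that exact filename), then run the script. A small sanity-check sketch one could run afterwards; the path layout is taken from the code above, the rest is illustrative:

#!/usr/bin/env python3
# Sketch: confirm that today's user agents file was written where Lookyloo expects it.
import json
from datetime import datetime

from lookyloo.helpers import get_homedir

today = datetime.now()
ua_file = get_homedir() / 'user_agents' / str(today.year) / f'{today.month:02}' / f'{today.date().isoformat()}.json'
if ua_file.exists():
    with ua_file.open() as f:
        uas = json.load(f)
    print(f'{len(uas["by_frequency"])} user agents stored in {ua_file}')
else:
    print(f'{ua_file} is missing; run the tool above first.')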