diff --git a/Pipfile b/Pipfile
index c324e22..7b90590 100644
--- a/Pipfile
+++ b/Pipfile
@@ -17,6 +17,7 @@ lookyloo = {editable = true,path = "."}
 cchardet = "*"
 redis = "*"
 pylookyloo = {editable = true,path = "./client"}
+beautifulsoup4 = "*"
 
 [requires]
 python_version = "3.6"
diff --git a/Pipfile.lock b/Pipfile.lock
index 60d7c42..17b1153 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "766c81240a1f3be46b1f7e845ee38ff7306eeb9a91ffa6aa6be204f51c582b7c"
+            "sha256": "8b6fd0b35686dc0c0cac746c878bc3d34dc04e340b2df3c321dfb13b56234f2d"
         },
         "pipfile-spec": 6,
         "requires": {
@@ -43,6 +43,7 @@
                 "sha256:945065979fb8529dd2f37dbb58f00b661bdbcbebf954f93b32fdf5263ef35348",
                 "sha256:ba6d5c59906a85ac23dadfe5c88deaf3e179ef565f4898671253e50a78680718"
             ],
+            "index": "pypi",
             "version": "==4.7.1"
         },
         "cchardet": {
@@ -289,34 +290,34 @@
         },
         "lxml": {
             "hashes": [
-                "sha256:0358b9e9642bc7d39aac5cffe9884a99a5ca68e5e2c1b89e570ed60da9139908",
-                "sha256:091a359c4dafebbecd3959d9013f1b896b5371859165e4e50b01607a98d9e3e2",
-                "sha256:1998e4e60603c64bcc35af61b4331ab3af087457900d3980e18d190e17c3a697",
-                "sha256:2000b4088dee9a41f459fddaf6609bba48a435ce6374bb254c5ccdaa8928c5ba",
-                "sha256:2afb0064780d8aaf165875be5898c1866766e56175714fa5f9d055433e92d41d",
-                "sha256:2d8f1d9334a4e3ff176d096c14ded3100547d73440683567d85b8842a53180bb",
-                "sha256:2e38db22f6a3199fd63675e1b4bd795d676d906869047398f29f38ca55cb453a",
-                "sha256:3181f84649c1a1ca62b19ddf28436b1b2cb05ae6c7d2628f33872e713994c364",
-                "sha256:37462170dfd88af8431d04de6b236e6e9c06cda71e2ca26d88ef2332fd2a5237",
-                "sha256:3a9d8521c89bf6f2a929c3d12ad3ad7392c774c327ea809fd08a13be6b3bc05f",
-                "sha256:3d0bbd2e1a28b4429f24fd63a122a450ce9edb7a8063d070790092d7343a1aa4",
-                "sha256:483d60585ce3ee71929cea70949059f83850fa5e12deb9c094ed1c8c2ec73cbd",
-                "sha256:4888be27d5cba55ce94209baef5bcd7bbd7314a3d17021a5fc10000b3a5f737d",
-                "sha256:64b0d62e4209170a2a0c404c446ab83b941a0003e96604d2e4f4cb735f8a2254",
-                "sha256:68010900898fdf139ac08549c4dba8206c584070a960ffc530aebf0c6f2794ef",
-                "sha256:872ecb066de602a0099db98bd9e57f4cfc1d62f6093d94460c787737aa08f39e",
-                "sha256:88a32b03f2e4cd0e63f154cac76724709f40b3fc2f30139eb5d6f900521b44ed",
-                "sha256:b1dc7683da4e67ab2bebf266afa68098d681ae02ce570f0d1117312273d2b2ac",
-                "sha256:b29e27ce9371810250cb1528a771d047a9c7b0f79630dc7dc5815ff828f4273b",
-                "sha256:ce197559596370d985f1ce6b7051b52126849d8159040293bf8b98cb2b3e1f78",
-                "sha256:d45cf6daaf22584eff2175f48f82c4aa24d8e72a44913c5aff801819bb73d11f",
-                "sha256:e2ff9496322b2ce947ba4a7a5eb048158de9d6f3fe9efce29f1e8dd6878561e6",
-                "sha256:f7b979518ec1f294a41a707c007d54d0f3b3e1fd15d5b26b7e99b62b10d9a72e",
-                "sha256:f9c7268e9d16e34e50f8246c4f24cf7353764affd2bc971f0379514c246e3f6b",
-                "sha256:f9c839806089d79de588ee1dde2dae05dc1156d3355dfeb2b51fde84d9c960ad",
-                "sha256:ff962953e2389226adc4d355e34a98b0b800984399153c6678f2367b11b4d4b8"
+                "sha256:03984196d00670b2ab14ae0ea83d5cc0cfa4f5a42558afa9ab5fa745995328f5",
+                "sha256:0815b0c9f897468de6a386dc15917a0becf48cc92425613aa8bbfc7f0f82951f",
+                "sha256:175f3825f075cf02d15099eb52658457cf0ff103dcf11512b5d2583e1d40f58b",
+                "sha256:30e14c62d88d1e01a26936ecd1c6e784d4afc9aa002bba4321c5897937112616",
+                "sha256:3210da6f36cf4b835ff1be853962b22cc354d506f493b67a4303c88bbb40d57b",
+                "sha256:40f60819fbd5bad6e191ba1329bfafa09ab7f3f174b3d034d413ef5266963294",
+                "sha256:43b26a865a61549919f8a42e094dfdb62847113cf776d84bd6b60e4e3fc20ea3",
+                "sha256:4a03dd682f8e35a10234904e0b9508d705ff98cf962c5851ed052e9340df3d90",
"sha256:62f382cddf3d2e52cf266e161aa522d54fd624b8cc567bc18f573d9d50d40e8e", + "sha256:7b98f0325be8450da70aa4a796c4f06852949fe031878b4aa1d6c417a412f314", + "sha256:846a0739e595871041385d86d12af4b6999f921359b38affb99cdd6b54219a8f", + "sha256:a3080470559938a09a5d0ec558c005282e99ac77bf8211fb7b9a5c66390acd8d", + "sha256:ad841b78a476623955da270ab8d207c3c694aa5eba71f4792f65926dc46c6ee8", + "sha256:afdd75d9735e44c639ffd6258ce04a2de3b208f148072c02478162d0944d9da3", + "sha256:b4fbf9b552faff54742bcd0791ab1da5863363fb19047e68f6592be1ac2dab33", + "sha256:b90c4e32d6ec089d3fa3518436bdf5ce4d902a0787dbd9bb09f37afe8b994317", + "sha256:b91cfe4438c741aeff662d413fd2808ac901cc6229c838236840d11de4586d63", + "sha256:bdb0593a42070b0a5f138b79b872289ee73c8e25b3f0bea6564e795b55b6bcdd", + "sha256:c4e4bca2bb68ce22320297dfa1a7bf070a5b20bcbaec4ee023f83d2f6e76496f", + "sha256:cec4ab14af9eae8501be3266ff50c3c2aecc017ba1e86c160209bb4f0423df6a", + "sha256:e83b4b2bf029f5104bc1227dbb7bf5ace6fd8fabaebffcd4f8106fafc69fc45f", + "sha256:e995b3734a46d41ae60b6097f7c51ba9958648c6d1e0935b7e0ee446ee4abe22", + "sha256:f679d93dec7f7210575c85379a31322df4c46496f184ef650d3aba1484b38a2d", + "sha256:fd213bb5166e46974f113c8228daaef1732abc47cb561ce9c4c8eaed4bd3b09b", + "sha256:fdcb57b906dbc1f80666e6290e794ab8fb959a2e17aa5aee1758a85d1da4533f", + "sha256:ff424b01d090ffe1947ec7432b07f536912e0300458f9a7f48ea217dd8362b86" ], - "version": "==4.3.2" + "version": "==4.3.3" }, "markupsafe": { "hashes": [ @@ -473,10 +474,10 @@ }, "soupsieve": { "hashes": [ - "sha256:afa56bf14907bb09403e5d15fbed6275caa4174d36b975226e3b67a3bb6e2c4b", - "sha256:eaed742b48b1f3e2d45ba6f79401b2ed5dc33b2123dfe216adb90d4bfa0ade26" + "sha256:3aef141566afd07201b525c17bfaadd07580a8066f82b57f7c9417f26adbd0a3", + "sha256:e41a65e99bd125972d84221022beb1e4b5cfc68fa12c170c39834ce32d1b294c" ], - "version": "==1.8" + "version": "==1.9" }, "twisted": { "hashes": [ @@ -506,10 +507,10 @@ }, "werkzeug": { "hashes": [ - "sha256:590abe38f8be026d78457fe3b5200895b3543e58ac3fc1dd792c6333ea11af64", - "sha256:ee11b0f0640c56fb491b43b38356c4b588b3202b415a1e03eacf1c5561c961cf" + "sha256:96da23fa8ccecbc3ae832a83df5c722c11547d021637faacb0bec4dd2f4666c8", + "sha256:ca5c2dcd367d6c0df87185b9082929d255358f5391923269335782b213d52655" ], - "version": "==0.15.0" + "version": "==0.15.1" }, "zope.interface": { "hashes": [ diff --git a/README.md b/README.md index 9ae413f..932237e 100644 --- a/README.md +++ b/README.md @@ -43,9 +43,9 @@ You need a running splash instance, preferably on [docker](https://splash.readth ```bash sudo apt install docker.io sudo docker pull scrapinghub/splash -sudo docker run -p 8050:8050 -p 5023:5023 scrapinghub/splash --disable-ui --disable-lua +sudo docker run -p 8050:8050 -p 5023:5023 scrapinghub/splash --disable-ui --disable-lua --disable-browser-caches # On a server with a decent abount of RAM, you may want to run it this way: -# sudo docker run -p 8050:8050 -p 5023:5023 scrapinghub/splash --disable-ui -s 100 --disable-lua -m 50000 +# sudo docker run -p 8050:8050 -p 5023:5023 scrapinghub/splash --disable-ui -s 100 --disable-lua -m 50000 --disable-browser-caches ``` ## Install redis diff --git a/lookyloo/exceptions.py b/lookyloo/exceptions.py index f0f29b2..9a4ec30 100644 --- a/lookyloo/exceptions.py +++ b/lookyloo/exceptions.py @@ -12,3 +12,7 @@ class MissingEnv(LookylooException): class NoValidHarFile(LookylooException): pass + + +class CreateDirectoryException(LookylooException): + pass diff --git a/lookyloo/helpers.py b/lookyloo/helpers.py index 37fd106..db473e0 100644 --- 
diff --git a/lookyloo/helpers.py b/lookyloo/helpers.py
index 37fd106..db473e0 100644
--- a/lookyloo/helpers.py
+++ b/lookyloo/helpers.py
@@ -2,11 +2,15 @@
 # -*- coding: utf-8 -*-
 import os
 from pathlib import Path
-from .exceptions import MissingEnv
+from .exceptions import MissingEnv, CreateDirectoryException
 from redis import Redis
 from redis.exceptions import ConnectionError
 from datetime import datetime, timedelta
 import time
+from bs4 import BeautifulSoup
+import json
+import requests
+from glob import glob
 
 
 def get_homedir():
@@ -18,6 +22,12 @@ Run the following command (assuming you run the code from the cloned repository
     return Path(os.environ['LOOKYLOO_HOME'])
 
 
+def safe_create_dir(to_create: Path):
+    if to_create.exists() and not to_create.is_dir():
+        raise CreateDirectoryException(f'The path {to_create} already exists and is not a directory')
+    os.makedirs(to_create, exist_ok=True)
+
+
 def set_running(name: str) -> None:
     r = Redis(unix_socket_path=get_socket_path('cache'), db=1, decode_responses=True)
     r.hset('running', name, 1)
@@ -70,3 +80,38 @@ def long_sleep(sleep_in_sec: int, shutdown_check: int=10) -> bool:
         if shutdown_requested():
             return False
     return True
+
+
+def update_user_agents():
+    today = datetime.now()
+    ua_path = get_homedir() / 'user_agents' / str(today.year) / f'{today.month:02}'
+    safe_create_dir(ua_path)
+    ua_file_name = ua_path / f'{today.date().isoformat()}.json'
+    if ua_file_name.exists():
+        # Already have a UA for that day.
+        return
+    r = requests.get('https://techblog.willshouse.com/2012/01/03/most-common-user-agents/')
+    soup = BeautifulSoup(r.text, 'html.parser')
+    uas = soup.find_all('textarea')[1].text
+    to_store = {'by_frequency': []}
+    for ua in json.loads(uas):
+        os = ua['system'].split(' ')[-1]
+        if os not in to_store:
+            to_store[os] = {}
+        browser = ' '.join(ua['system'].split(' ')[:-1])
+        if browser not in to_store[os]:
+            to_store[os][browser] = []
+        to_store[os][browser].append(ua['useragent'])
+        to_store['by_frequency'].append({'os': os, 'browser': browser, 'useragent': ua['useragent']})
+    with open(ua_file_name, 'w') as f:
+        json.dump(to_store, f, indent=2)
+
+
+def get_user_agents():
+    ua_files_path = str(get_homedir() / 'user_agents' / '*' / '*' / '*.json')
+    paths = sorted(glob(ua_files_path), reverse=True)
+    if not paths:
+        update_user_agents()
+        paths = sorted(glob(ua_files_path), reverse=True)
+    with open(paths[0]) as f:
+        return json.load(f)
diff --git a/website/web/__init__.py b/website/web/__init__.py
index cedc33f..4318b8f 100644
--- a/website/web/__init__.py
+++ b/website/web/__init__.py
@@ -10,7 +10,7 @@ import os
 
 from flask import Flask, render_template, request, session, send_file, redirect, url_for, Response
 from flask_bootstrap import Bootstrap
-from lookyloo.helpers import get_homedir
+from lookyloo.helpers import get_homedir, update_user_agents, get_user_agents
 from lookyloo.lookyloo import Lookyloo
 from lookyloo.exceptions import NoValidHarFile
 
@@ -54,7 +54,9 @@
     if request.form.get('url'):
         perma_uuid = lookyloo.scrape(request.form.get('url'), request.form.get('depth'), request.form.get('listing'))
         return redirect(url_for('tree', tree_uuid=perma_uuid))
-    return render_template('scrape.html')
+    user_agents = get_user_agents()
+    user_agents.pop('by_frequency')
+    return render_template('scrape.html', user_agents=user_agents)
 
 
 @app.route('/tree/hostname/<node_uuid>/text', methods=['GET'])
@@ -142,6 +144,7 @@ def index():
         # Just returns ack if the webserver is running
         return 'Ack'
     lookyloo.cleanup_old_tmpfiles()
+    update_user_agents()
     session.clear()
     titles = []
     for report_dir in lookyloo.report_dirs:
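
Taken together, the two helpers implement a daily cache: `update_user_agents` snapshots the most-common-user-agents list published by techblog.willshouse.com into `user_agents/<year>/<month>/<iso-date>.json` (skipping the fetch if today's file already exists), and `get_user_agents` returns the newest snapshot, fetching one on first use. A sketch of the round trip, assuming `LOOKYLOO_HOME` is set and the source page is reachable; the concrete OS and browser keys depend on whatever the page currently lists:

```python
from lookyloo.helpers import get_user_agents

uas = get_user_agents()  # on a cold start this triggers update_user_agents()

# Flat view, ordered by observed frequency. scrape_web() pops this key,
# so the template only ever sees the nested mapping.
for entry in uas['by_frequency'][:3]:
    print(entry['os'], entry['browser'], entry['useragent'])

# Nested view, exactly what scrape.html iterates: OS -> browser -> UA list.
for os_name, browsers in uas.items():
    if os_name == 'by_frequency':
        continue
    for browser, agents in browsers.items():
        print(f'{os_name} / {browser}: {len(agents)} user agents')
```
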
diff --git a/website/web/templates/scrape.html b/website/web/templates/scrape.html
index 3a9551f..4f60ac6 100644
--- a/website/web/templates/scrape.html
+++ b/website/web/templates/scrape.html
@@ -24,7 +24,53 @@
 [the HTML tags in this hunk were lost during extraction; only the Jinja logic of the added user-agent pickers survives]
+    {% for os, browsers in user_agents.items() %}
+      {% set outer_loop = loop %}
+      {% for browser, user_agents in browsers.items()%}
+      {% endfor%}
+    {% endfor%}
 {% endblock %}
+
+{% block scripts %}
+  {{ super() }}
+{% endblock %}
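
The stripped markup evidently built cascading pickers over the nested mapping passed in by `scrape_web()`: the `outer_loop` alias lets the inner browser loop refer back to the enclosing OS loop (typically to build per-OS element ids), and the stripped `{% block scripts %}` content presumably toggled the matching browser and user-agent selectors when an OS is chosen. The selection itself reduces to a dictionary walk; a hedged Python equivalent of what the client side narrows down to (the sample keys are illustrative, derived from how `update_user_agents` splits the `system` field):

```python
def candidate_user_agents(user_agents: dict, os_name: str, browser: str) -> list:
    """Cascade as in scrape.html: pick an OS, then a browser, get its UA strings."""
    return user_agents.get(os_name, {}).get(browser, [])

# e.g. candidate_user_agents(uas, 'Win10', 'Chrome 72.0')
# -> ['Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...', ...]
```
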