new: Add initial support for user-agent

pull/42/head
Raphaël Vinot 2019-03-29 20:11:44 +01:00
parent 49654b6b11
commit 36c856ef9b
7 changed files with 139 additions and 39 deletions

View File

@ -17,6 +17,7 @@ lookyloo = {editable = true,path = "."}
cchardet = "*" cchardet = "*"
redis = "*" redis = "*"
pylookyloo = {editable = true,path = "./client"} pylookyloo = {editable = true,path = "./client"}
beautifulsoup4 = "*"
[requires] [requires]
python_version = "3.6" python_version = "3.6"

69
Pipfile.lock generated
View File

@ -1,7 +1,7 @@
{ {
"_meta": { "_meta": {
"hash": { "hash": {
"sha256": "766c81240a1f3be46b1f7e845ee38ff7306eeb9a91ffa6aa6be204f51c582b7c" "sha256": "8b6fd0b35686dc0c0cac746c878bc3d34dc04e340b2df3c321dfb13b56234f2d"
}, },
"pipfile-spec": 6, "pipfile-spec": 6,
"requires": { "requires": {
@ -43,6 +43,7 @@
"sha256:945065979fb8529dd2f37dbb58f00b661bdbcbebf954f93b32fdf5263ef35348", "sha256:945065979fb8529dd2f37dbb58f00b661bdbcbebf954f93b32fdf5263ef35348",
"sha256:ba6d5c59906a85ac23dadfe5c88deaf3e179ef565f4898671253e50a78680718" "sha256:ba6d5c59906a85ac23dadfe5c88deaf3e179ef565f4898671253e50a78680718"
], ],
"index": "pypi",
"version": "==4.7.1" "version": "==4.7.1"
}, },
"cchardet": { "cchardet": {
@ -289,34 +290,34 @@
}, },
"lxml": { "lxml": {
"hashes": [ "hashes": [
"sha256:0358b9e9642bc7d39aac5cffe9884a99a5ca68e5e2c1b89e570ed60da9139908", "sha256:03984196d00670b2ab14ae0ea83d5cc0cfa4f5a42558afa9ab5fa745995328f5",
"sha256:091a359c4dafebbecd3959d9013f1b896b5371859165e4e50b01607a98d9e3e2", "sha256:0815b0c9f897468de6a386dc15917a0becf48cc92425613aa8bbfc7f0f82951f",
"sha256:1998e4e60603c64bcc35af61b4331ab3af087457900d3980e18d190e17c3a697", "sha256:175f3825f075cf02d15099eb52658457cf0ff103dcf11512b5d2583e1d40f58b",
"sha256:2000b4088dee9a41f459fddaf6609bba48a435ce6374bb254c5ccdaa8928c5ba", "sha256:30e14c62d88d1e01a26936ecd1c6e784d4afc9aa002bba4321c5897937112616",
"sha256:2afb0064780d8aaf165875be5898c1866766e56175714fa5f9d055433e92d41d", "sha256:3210da6f36cf4b835ff1be853962b22cc354d506f493b67a4303c88bbb40d57b",
"sha256:2d8f1d9334a4e3ff176d096c14ded3100547d73440683567d85b8842a53180bb", "sha256:40f60819fbd5bad6e191ba1329bfafa09ab7f3f174b3d034d413ef5266963294",
"sha256:2e38db22f6a3199fd63675e1b4bd795d676d906869047398f29f38ca55cb453a", "sha256:43b26a865a61549919f8a42e094dfdb62847113cf776d84bd6b60e4e3fc20ea3",
"sha256:3181f84649c1a1ca62b19ddf28436b1b2cb05ae6c7d2628f33872e713994c364", "sha256:4a03dd682f8e35a10234904e0b9508d705ff98cf962c5851ed052e9340df3d90",
"sha256:37462170dfd88af8431d04de6b236e6e9c06cda71e2ca26d88ef2332fd2a5237", "sha256:62f382cddf3d2e52cf266e161aa522d54fd624b8cc567bc18f573d9d50d40e8e",
"sha256:3a9d8521c89bf6f2a929c3d12ad3ad7392c774c327ea809fd08a13be6b3bc05f", "sha256:7b98f0325be8450da70aa4a796c4f06852949fe031878b4aa1d6c417a412f314",
"sha256:3d0bbd2e1a28b4429f24fd63a122a450ce9edb7a8063d070790092d7343a1aa4", "sha256:846a0739e595871041385d86d12af4b6999f921359b38affb99cdd6b54219a8f",
"sha256:483d60585ce3ee71929cea70949059f83850fa5e12deb9c094ed1c8c2ec73cbd", "sha256:a3080470559938a09a5d0ec558c005282e99ac77bf8211fb7b9a5c66390acd8d",
"sha256:4888be27d5cba55ce94209baef5bcd7bbd7314a3d17021a5fc10000b3a5f737d", "sha256:ad841b78a476623955da270ab8d207c3c694aa5eba71f4792f65926dc46c6ee8",
"sha256:64b0d62e4209170a2a0c404c446ab83b941a0003e96604d2e4f4cb735f8a2254", "sha256:afdd75d9735e44c639ffd6258ce04a2de3b208f148072c02478162d0944d9da3",
"sha256:68010900898fdf139ac08549c4dba8206c584070a960ffc530aebf0c6f2794ef", "sha256:b4fbf9b552faff54742bcd0791ab1da5863363fb19047e68f6592be1ac2dab33",
"sha256:872ecb066de602a0099db98bd9e57f4cfc1d62f6093d94460c787737aa08f39e", "sha256:b90c4e32d6ec089d3fa3518436bdf5ce4d902a0787dbd9bb09f37afe8b994317",
"sha256:88a32b03f2e4cd0e63f154cac76724709f40b3fc2f30139eb5d6f900521b44ed", "sha256:b91cfe4438c741aeff662d413fd2808ac901cc6229c838236840d11de4586d63",
"sha256:b1dc7683da4e67ab2bebf266afa68098d681ae02ce570f0d1117312273d2b2ac", "sha256:bdb0593a42070b0a5f138b79b872289ee73c8e25b3f0bea6564e795b55b6bcdd",
"sha256:b29e27ce9371810250cb1528a771d047a9c7b0f79630dc7dc5815ff828f4273b", "sha256:c4e4bca2bb68ce22320297dfa1a7bf070a5b20bcbaec4ee023f83d2f6e76496f",
"sha256:ce197559596370d985f1ce6b7051b52126849d8159040293bf8b98cb2b3e1f78", "sha256:cec4ab14af9eae8501be3266ff50c3c2aecc017ba1e86c160209bb4f0423df6a",
"sha256:d45cf6daaf22584eff2175f48f82c4aa24d8e72a44913c5aff801819bb73d11f", "sha256:e83b4b2bf029f5104bc1227dbb7bf5ace6fd8fabaebffcd4f8106fafc69fc45f",
"sha256:e2ff9496322b2ce947ba4a7a5eb048158de9d6f3fe9efce29f1e8dd6878561e6", "sha256:e995b3734a46d41ae60b6097f7c51ba9958648c6d1e0935b7e0ee446ee4abe22",
"sha256:f7b979518ec1f294a41a707c007d54d0f3b3e1fd15d5b26b7e99b62b10d9a72e", "sha256:f679d93dec7f7210575c85379a31322df4c46496f184ef650d3aba1484b38a2d",
"sha256:f9c7268e9d16e34e50f8246c4f24cf7353764affd2bc971f0379514c246e3f6b", "sha256:fd213bb5166e46974f113c8228daaef1732abc47cb561ce9c4c8eaed4bd3b09b",
"sha256:f9c839806089d79de588ee1dde2dae05dc1156d3355dfeb2b51fde84d9c960ad", "sha256:fdcb57b906dbc1f80666e6290e794ab8fb959a2e17aa5aee1758a85d1da4533f",
"sha256:ff962953e2389226adc4d355e34a98b0b800984399153c6678f2367b11b4d4b8" "sha256:ff424b01d090ffe1947ec7432b07f536912e0300458f9a7f48ea217dd8362b86"
], ],
"version": "==4.3.2" "version": "==4.3.3"
}, },
"markupsafe": { "markupsafe": {
"hashes": [ "hashes": [
@ -473,10 +474,10 @@
}, },
"soupsieve": { "soupsieve": {
"hashes": [ "hashes": [
"sha256:afa56bf14907bb09403e5d15fbed6275caa4174d36b975226e3b67a3bb6e2c4b", "sha256:3aef141566afd07201b525c17bfaadd07580a8066f82b57f7c9417f26adbd0a3",
"sha256:eaed742b48b1f3e2d45ba6f79401b2ed5dc33b2123dfe216adb90d4bfa0ade26" "sha256:e41a65e99bd125972d84221022beb1e4b5cfc68fa12c170c39834ce32d1b294c"
], ],
"version": "==1.8" "version": "==1.9"
}, },
"twisted": { "twisted": {
"hashes": [ "hashes": [
@ -506,10 +507,10 @@
}, },
"werkzeug": { "werkzeug": {
"hashes": [ "hashes": [
"sha256:590abe38f8be026d78457fe3b5200895b3543e58ac3fc1dd792c6333ea11af64", "sha256:96da23fa8ccecbc3ae832a83df5c722c11547d021637faacb0bec4dd2f4666c8",
"sha256:ee11b0f0640c56fb491b43b38356c4b588b3202b415a1e03eacf1c5561c961cf" "sha256:ca5c2dcd367d6c0df87185b9082929d255358f5391923269335782b213d52655"
], ],
"version": "==0.15.0" "version": "==0.15.1"
}, },
"zope.interface": { "zope.interface": {
"hashes": [ "hashes": [

View File

@ -43,9 +43,9 @@ You need a running splash instance, preferably on [docker](https://splash.readth
```bash ```bash
sudo apt install docker.io sudo apt install docker.io
sudo docker pull scrapinghub/splash sudo docker pull scrapinghub/splash
sudo docker run -p 8050:8050 -p 5023:5023 scrapinghub/splash --disable-ui --disable-lua sudo docker run -p 8050:8050 -p 5023:5023 scrapinghub/splash --disable-ui --disable-lua --disable-browser-caches
# On a server with a decent amount of RAM, you may want to run it this way: # On a server with a decent amount of RAM, you may want to run it this way:
# sudo docker run -p 8050:8050 -p 5023:5023 scrapinghub/splash --disable-ui -s 100 --disable-lua -m 50000 # sudo docker run -p 8050:8050 -p 5023:5023 scrapinghub/splash --disable-ui -s 100 --disable-lua -m 50000 --disable-browser-caches
``` ```
## Install redis ## Install redis

View File

@ -12,3 +12,7 @@ class MissingEnv(LookylooException):
class NoValidHarFile(LookylooException): class NoValidHarFile(LookylooException):
pass pass
class CreateDirectoryException(LookylooException):
pass

View File

@ -2,11 +2,15 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import os import os
from pathlib import Path from pathlib import Path
from .exceptions import MissingEnv from .exceptions import MissingEnv, CreateDirectoryException
from redis import Redis from redis import Redis
from redis.exceptions import ConnectionError from redis.exceptions import ConnectionError
from datetime import datetime, timedelta from datetime import datetime, timedelta
import time import time
from bs4 import BeautifulSoup
import json
import requests
from glob import glob
def get_homedir(): def get_homedir():
@ -18,6 +22,12 @@ Run the following command (assuming you run the code from the clonned repository
return Path(os.environ['LOOKYLOO_HOME']) return Path(os.environ['LOOKYLOO_HOME'])
def safe_create_dir(to_create: Path):
    """Make sure the directory `to_create` exists, creating parents as needed.

    Raises:
        CreateDirectoryException: something that is not a directory already
            occupies the path.
    """
    occupied_by_non_directory = to_create.exists() and not to_create.is_dir()
    if occupied_by_non_directory:
        raise CreateDirectoryException(f'The path {to_create} already exists and is not a directory')
    # Already-existing directories are accepted silently.
    to_create.mkdir(parents=True, exist_ok=True)
def set_running(name: str) -> None: def set_running(name: str) -> None:
r = Redis(unix_socket_path=get_socket_path('cache'), db=1, decode_responses=True) r = Redis(unix_socket_path=get_socket_path('cache'), db=1, decode_responses=True)
r.hset('running', name, 1) r.hset('running', name, 1)
@ -70,3 +80,38 @@ def long_sleep(sleep_in_sec: int, shutdown_check: int=10) -> bool:
if shutdown_requested(): if shutdown_requested():
return False return False
return True return True
def update_user_agents():
    """Download the list of most common user agents and store it for today.

    The result is written to
    ``<homedir>/user_agents/<year>/<month>/<iso-date>.json``. If that file
    already exists, nothing is fetched and the function returns immediately.

    The stored JSON object contains:
      * one key per operating system, mapping each browser name to the list
        of its user-agent strings;
      * ``by_frequency``: every entry as ``{'os', 'browser', 'useragent'}``,
        in the order the remote page lists them (presumably most frequent
        first -- the order is not checked here).

    Raises:
        CreateDirectoryException: the storage path exists but is not a directory.
        requests.RequestException: the page could not be fetched (network
            failure, timeout, or HTTP error status).
        IndexError: the page does not contain the expected second <textarea>.
    """
    today = datetime.now()
    ua_path = get_homedir() / 'user_agents' / str(today.year) / f'{today.month:02}'
    safe_create_dir(ua_path)
    ua_file_name = ua_path / f'{today.date().isoformat()}.json'
    if ua_file_name.exists():
        # Already have a UA for that day.
        return

    # This function is called while serving web requests: never wait forever
    # on the remote host, and do not try to parse an HTTP error page.
    r = requests.get('https://techblog.willshouse.com/2012/01/03/most-common-user-agents/',
                     timeout=30)
    r.raise_for_status()

    soup = BeautifulSoup(r.text, 'html.parser')
    textareas = soup.find_all('textarea')
    if len(textareas) < 2:
        raise IndexError('Unable to find the JSON list of user agents in the page (layout changed?)')
    # The second textarea of the page holds the JSON export.
    uas = textareas[1].text

    to_store = {'by_frequency': []}
    for ua in json.loads(uas):
        # 'system' looks like "<browser name and version> <os>": the last
        # word is the operating system, everything before it is the browser.
        *browser_words, os_name = ua['system'].split(' ')
        browser = ' '.join(browser_words)
        to_store.setdefault(os_name, {}).setdefault(browser, []).append(ua['useragent'])
        to_store['by_frequency'].append({'os': os_name, 'browser': browser, 'useragent': ua['useragent']})

    with open(ua_file_name, 'w') as f:
        json.dump(to_store, f, indent=2)
def get_user_agents():
    """Load and return the content of the latest stored user-agents file.

    Files are looked up under ``<homedir>/user_agents/*/*/*.json`` and the
    one whose path sorts last is used. When no file exists yet,
    update_user_agents() is called first to create one.
    """
    pattern = str(get_homedir() / 'user_agents' / '*' / '*' / '*.json')

    def newest_first():
        found = glob(pattern)
        found.sort(reverse=True)
        return found

    candidates = newest_first()
    if len(candidates) == 0:
        # Nothing stored so far: fetch a list, then look again.
        update_user_agents()
        candidates = newest_first()

    latest = candidates[0]
    with open(latest) as ua_file:
        content = json.load(ua_file)
    return content

View File

@ -10,7 +10,7 @@ import os
from flask import Flask, render_template, request, session, send_file, redirect, url_for, Response from flask import Flask, render_template, request, session, send_file, redirect, url_for, Response
from flask_bootstrap import Bootstrap from flask_bootstrap import Bootstrap
from lookyloo.helpers import get_homedir from lookyloo.helpers import get_homedir, update_user_agents, get_user_agents
from lookyloo.lookyloo import Lookyloo from lookyloo.lookyloo import Lookyloo
from lookyloo.exceptions import NoValidHarFile from lookyloo.exceptions import NoValidHarFile
@ -54,7 +54,9 @@ def scrape_web():
if request.form.get('url'): if request.form.get('url'):
perma_uuid = lookyloo.scrape(request.form.get('url'), request.form.get('depth'), request.form.get('listing')) perma_uuid = lookyloo.scrape(request.form.get('url'), request.form.get('depth'), request.form.get('listing'))
return redirect(url_for('tree', tree_uuid=perma_uuid)) return redirect(url_for('tree', tree_uuid=perma_uuid))
return render_template('scrape.html') user_agents = get_user_agents()
user_agents.pop('by_frequency')
return render_template('scrape.html', user_agents=user_agents)
@app.route('/tree/hostname/<node_uuid>/text', methods=['GET']) @app.route('/tree/hostname/<node_uuid>/text', methods=['GET'])
@ -142,6 +144,7 @@ def index():
# Just returns ack if the webserver is running # Just returns ack if the webserver is running
return 'Ack' return 'Ack'
lookyloo.cleanup_old_tmpfiles() lookyloo.cleanup_old_tmpfiles()
update_user_agents()
session.clear() session.clear()
titles = [] titles = []
for report_dir in lookyloo.report_dirs: for report_dir in lookyloo.report_dirs:

View File

@ -24,7 +24,53 @@
<label for="listing">Public</label> <label for="listing">Public</label>
<input type="checkbox" name="listing" checked="true"></input> <input type="checkbox" name="listing" checked="true"></input>
</div> </div>
<label for="os">Choose an operating system</label>
<select class="form-control" name="os" id="os">
{% for os in user_agents.keys() %}
<option value="{{ os }}">{{ os }}</option>
{% endfor%}
</select>
{% for os, browsers in user_agents.items() %}
<div id="{{os.replace(' ', '_')}}" class="style-sub-1" {% if loop.index0 != 0 %}style="display: none;"{%endif%}>
<select class="form-control" name="browser">
{% for browser in browsers.keys()%}
<option value="{{ browser }}">{{ browser }}</option>
{% endfor%}
</select>
</div>
{% set outer_loop = loop %}
{% for browser, user_agents in browsers.items()%}
<div id="{{os.replace(' ', '_')}}_{{browser.replace(' ', '_')}}" class="style-sub-2" {% if not loop.first or not outer_loop.first %} style="display: none;"{%endif%}>
{% set ua_displayed = True %}
<select class="form-control" name="user_agent">
{% for user_agent in user_agents %}
<option value="{{ user_agent }}">{{ user_agent }}</option>
{% endfor%}
</select>
</div>
{% endfor%}
{% endfor%}
<button type="submit" class="btn btn-default">Scrape</button> <button type="submit" class="btn btn-default">Scrape</button>
</form> </form>
</div> </div>
{% endblock %} {% endblock %}
{% block scripts %}
{{ super() }}
<script>
// Keeps the three dropdowns in sync: only the browser list of the selected OS
// and the user-agent list of the selected browser are visible at any time.

// OS changed: show that OS's browser list and the user-agent list of its first browser.
$("#os").change(function(){
// Build the element id from the selected value: spaces become underscores (the
// template builds ids the same way) and : . [ ] , = are backslash-escaped for the selector.
var os_name = $(this).find(":selected").val().replace(/(:|\.|\[|\]|,|=)/g, "\\$1").replace(/ /g,"_");
// First option of the browser <select> inside the container of that OS.
var first_browser_name = $("[id='" + os_name + "']").find('select option:first-child').val().replace(/(:|\.|\[|\]|,|=)/g, "\\$1").replace(/ /g,"_");
// Hide every browser list (style-sub-1) and user-agent list (style-sub-2) ...
$(".style-sub-1").hide();
$(".style-sub-2").hide();
// ... then reveal the two that belong to the current selection.
$("[id='" + os_name + "']").show();
$("[id='" + os_name + '_' + first_browser_name + "']").show();
});
// Browser changed: swap the visible user-agent list, the browser list stays as it is.
$('select[name="browser"]').change(function(){
var browser_name = $(this).find(":selected").val().replace(/(:|\.|\[|\]|,|=)/g, "\\$1").replace(/ /g,"_");
// The enclosing div's id is the OS part of the user-agent container id.
var os_name = $(this).parent().attr("id").replace(/(:|\.|\[|\]|,|=)/g, "\\$1").replace(/ /g,"_");
$(".style-sub-2").hide();
$("[id='" + os_name + '_' + browser_name + "']").show();
});
</script>
{% endblock %}