new: Add initial support for user-agent

pull/42/head
Raphaël Vinot 2019-03-29 20:11:44 +01:00
parent 49654b6b11
commit 36c856ef9b
7 changed files with 139 additions and 39 deletions

View File

@ -17,6 +17,7 @@ lookyloo = {editable = true,path = "."}
cchardet = "*"
redis = "*"
pylookyloo = {editable = true,path = "./client"}
beautifulsoup4 = "*"
[requires]
python_version = "3.6"

69
Pipfile.lock generated
View File

@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "766c81240a1f3be46b1f7e845ee38ff7306eeb9a91ffa6aa6be204f51c582b7c"
"sha256": "8b6fd0b35686dc0c0cac746c878bc3d34dc04e340b2df3c321dfb13b56234f2d"
},
"pipfile-spec": 6,
"requires": {
@ -43,6 +43,7 @@
"sha256:945065979fb8529dd2f37dbb58f00b661bdbcbebf954f93b32fdf5263ef35348",
"sha256:ba6d5c59906a85ac23dadfe5c88deaf3e179ef565f4898671253e50a78680718"
],
"index": "pypi",
"version": "==4.7.1"
},
"cchardet": {
@ -289,34 +290,34 @@
},
"lxml": {
"hashes": [
"sha256:0358b9e9642bc7d39aac5cffe9884a99a5ca68e5e2c1b89e570ed60da9139908",
"sha256:091a359c4dafebbecd3959d9013f1b896b5371859165e4e50b01607a98d9e3e2",
"sha256:1998e4e60603c64bcc35af61b4331ab3af087457900d3980e18d190e17c3a697",
"sha256:2000b4088dee9a41f459fddaf6609bba48a435ce6374bb254c5ccdaa8928c5ba",
"sha256:2afb0064780d8aaf165875be5898c1866766e56175714fa5f9d055433e92d41d",
"sha256:2d8f1d9334a4e3ff176d096c14ded3100547d73440683567d85b8842a53180bb",
"sha256:2e38db22f6a3199fd63675e1b4bd795d676d906869047398f29f38ca55cb453a",
"sha256:3181f84649c1a1ca62b19ddf28436b1b2cb05ae6c7d2628f33872e713994c364",
"sha256:37462170dfd88af8431d04de6b236e6e9c06cda71e2ca26d88ef2332fd2a5237",
"sha256:3a9d8521c89bf6f2a929c3d12ad3ad7392c774c327ea809fd08a13be6b3bc05f",
"sha256:3d0bbd2e1a28b4429f24fd63a122a450ce9edb7a8063d070790092d7343a1aa4",
"sha256:483d60585ce3ee71929cea70949059f83850fa5e12deb9c094ed1c8c2ec73cbd",
"sha256:4888be27d5cba55ce94209baef5bcd7bbd7314a3d17021a5fc10000b3a5f737d",
"sha256:64b0d62e4209170a2a0c404c446ab83b941a0003e96604d2e4f4cb735f8a2254",
"sha256:68010900898fdf139ac08549c4dba8206c584070a960ffc530aebf0c6f2794ef",
"sha256:872ecb066de602a0099db98bd9e57f4cfc1d62f6093d94460c787737aa08f39e",
"sha256:88a32b03f2e4cd0e63f154cac76724709f40b3fc2f30139eb5d6f900521b44ed",
"sha256:b1dc7683da4e67ab2bebf266afa68098d681ae02ce570f0d1117312273d2b2ac",
"sha256:b29e27ce9371810250cb1528a771d047a9c7b0f79630dc7dc5815ff828f4273b",
"sha256:ce197559596370d985f1ce6b7051b52126849d8159040293bf8b98cb2b3e1f78",
"sha256:d45cf6daaf22584eff2175f48f82c4aa24d8e72a44913c5aff801819bb73d11f",
"sha256:e2ff9496322b2ce947ba4a7a5eb048158de9d6f3fe9efce29f1e8dd6878561e6",
"sha256:f7b979518ec1f294a41a707c007d54d0f3b3e1fd15d5b26b7e99b62b10d9a72e",
"sha256:f9c7268e9d16e34e50f8246c4f24cf7353764affd2bc971f0379514c246e3f6b",
"sha256:f9c839806089d79de588ee1dde2dae05dc1156d3355dfeb2b51fde84d9c960ad",
"sha256:ff962953e2389226adc4d355e34a98b0b800984399153c6678f2367b11b4d4b8"
"sha256:03984196d00670b2ab14ae0ea83d5cc0cfa4f5a42558afa9ab5fa745995328f5",
"sha256:0815b0c9f897468de6a386dc15917a0becf48cc92425613aa8bbfc7f0f82951f",
"sha256:175f3825f075cf02d15099eb52658457cf0ff103dcf11512b5d2583e1d40f58b",
"sha256:30e14c62d88d1e01a26936ecd1c6e784d4afc9aa002bba4321c5897937112616",
"sha256:3210da6f36cf4b835ff1be853962b22cc354d506f493b67a4303c88bbb40d57b",
"sha256:40f60819fbd5bad6e191ba1329bfafa09ab7f3f174b3d034d413ef5266963294",
"sha256:43b26a865a61549919f8a42e094dfdb62847113cf776d84bd6b60e4e3fc20ea3",
"sha256:4a03dd682f8e35a10234904e0b9508d705ff98cf962c5851ed052e9340df3d90",
"sha256:62f382cddf3d2e52cf266e161aa522d54fd624b8cc567bc18f573d9d50d40e8e",
"sha256:7b98f0325be8450da70aa4a796c4f06852949fe031878b4aa1d6c417a412f314",
"sha256:846a0739e595871041385d86d12af4b6999f921359b38affb99cdd6b54219a8f",
"sha256:a3080470559938a09a5d0ec558c005282e99ac77bf8211fb7b9a5c66390acd8d",
"sha256:ad841b78a476623955da270ab8d207c3c694aa5eba71f4792f65926dc46c6ee8",
"sha256:afdd75d9735e44c639ffd6258ce04a2de3b208f148072c02478162d0944d9da3",
"sha256:b4fbf9b552faff54742bcd0791ab1da5863363fb19047e68f6592be1ac2dab33",
"sha256:b90c4e32d6ec089d3fa3518436bdf5ce4d902a0787dbd9bb09f37afe8b994317",
"sha256:b91cfe4438c741aeff662d413fd2808ac901cc6229c838236840d11de4586d63",
"sha256:bdb0593a42070b0a5f138b79b872289ee73c8e25b3f0bea6564e795b55b6bcdd",
"sha256:c4e4bca2bb68ce22320297dfa1a7bf070a5b20bcbaec4ee023f83d2f6e76496f",
"sha256:cec4ab14af9eae8501be3266ff50c3c2aecc017ba1e86c160209bb4f0423df6a",
"sha256:e83b4b2bf029f5104bc1227dbb7bf5ace6fd8fabaebffcd4f8106fafc69fc45f",
"sha256:e995b3734a46d41ae60b6097f7c51ba9958648c6d1e0935b7e0ee446ee4abe22",
"sha256:f679d93dec7f7210575c85379a31322df4c46496f184ef650d3aba1484b38a2d",
"sha256:fd213bb5166e46974f113c8228daaef1732abc47cb561ce9c4c8eaed4bd3b09b",
"sha256:fdcb57b906dbc1f80666e6290e794ab8fb959a2e17aa5aee1758a85d1da4533f",
"sha256:ff424b01d090ffe1947ec7432b07f536912e0300458f9a7f48ea217dd8362b86"
],
"version": "==4.3.2"
"version": "==4.3.3"
},
"markupsafe": {
"hashes": [
@ -473,10 +474,10 @@
},
"soupsieve": {
"hashes": [
"sha256:afa56bf14907bb09403e5d15fbed6275caa4174d36b975226e3b67a3bb6e2c4b",
"sha256:eaed742b48b1f3e2d45ba6f79401b2ed5dc33b2123dfe216adb90d4bfa0ade26"
"sha256:3aef141566afd07201b525c17bfaadd07580a8066f82b57f7c9417f26adbd0a3",
"sha256:e41a65e99bd125972d84221022beb1e4b5cfc68fa12c170c39834ce32d1b294c"
],
"version": "==1.8"
"version": "==1.9"
},
"twisted": {
"hashes": [
@ -506,10 +507,10 @@
},
"werkzeug": {
"hashes": [
"sha256:590abe38f8be026d78457fe3b5200895b3543e58ac3fc1dd792c6333ea11af64",
"sha256:ee11b0f0640c56fb491b43b38356c4b588b3202b415a1e03eacf1c5561c961cf"
"sha256:96da23fa8ccecbc3ae832a83df5c722c11547d021637faacb0bec4dd2f4666c8",
"sha256:ca5c2dcd367d6c0df87185b9082929d255358f5391923269335782b213d52655"
],
"version": "==0.15.0"
"version": "==0.15.1"
},
"zope.interface": {
"hashes": [

View File

@ -43,9 +43,9 @@ You need a running splash instance, preferably on [docker](https://splash.readth
```bash
sudo apt install docker.io
sudo docker pull scrapinghub/splash
sudo docker run -p 8050:8050 -p 5023:5023 scrapinghub/splash --disable-ui --disable-lua
sudo docker run -p 8050:8050 -p 5023:5023 scrapinghub/splash --disable-ui --disable-lua --disable-browser-caches
# On a server with a decent abount of RAM, you may want to run it this way:
# sudo docker run -p 8050:8050 -p 5023:5023 scrapinghub/splash --disable-ui -s 100 --disable-lua -m 50000
# sudo docker run -p 8050:8050 -p 5023:5023 scrapinghub/splash --disable-ui -s 100 --disable-lua -m 50000 --disable-browser-caches
```
## Install redis

View File

@ -12,3 +12,7 @@ class MissingEnv(LookylooException):
class NoValidHarFile(LookylooException):
pass
class CreateDirectoryException(LookylooException):
pass

View File

@ -2,11 +2,15 @@
# -*- coding: utf-8 -*-
import os
from pathlib import Path
from .exceptions import MissingEnv
from .exceptions import MissingEnv, CreateDirectoryException
from redis import Redis
from redis.exceptions import ConnectionError
from datetime import datetime, timedelta
import time
from bs4 import BeautifulSoup
import json
import requests
from glob import glob
def get_homedir():
@ -18,6 +22,12 @@ Run the following command (assuming you run the code from the clonned repository
return Path(os.environ['LOOKYLOO_HOME'])
def safe_create_dir(to_create: Path):
if to_create.exists() and not to_create.is_dir():
raise CreateDirectoryException(f'The path {to_create} already exists and is not a directory')
os.makedirs(to_create, exist_ok=True)
def set_running(name: str) -> None:
r = Redis(unix_socket_path=get_socket_path('cache'), db=1, decode_responses=True)
r.hset('running', name, 1)
@ -70,3 +80,38 @@ def long_sleep(sleep_in_sec: int, shutdown_check: int=10) -> bool:
if shutdown_requested():
return False
return True
def update_user_agents():
today = datetime.now()
ua_path = get_homedir() / 'user_agents' / str(today.year) / f'{today.month:02}'
safe_create_dir(ua_path)
ua_file_name = ua_path / f'{today.date().isoformat()}.json'
if ua_file_name.exists():
# Already have a UA for that day.
return
r = requests.get('https://techblog.willshouse.com/2012/01/03/most-common-user-agents/')
soup = BeautifulSoup(r.text, 'html.parser')
uas = soup.find_all('textarea')[1].text
to_store = {'by_frequency': []}
for ua in json.loads(uas):
os = ua['system'].split(' ')[-1]
if os not in to_store:
to_store[os] = {}
browser = ' '.join(ua['system'].split(' ')[:-1])
if browser not in to_store[os]:
to_store[os][browser] = []
to_store[os][browser].append(ua['useragent'])
to_store['by_frequency'].append({'os': os, 'browser': browser, 'useragent': ua['useragent']})
with open(ua_file_name, 'w') as f:
json.dump(to_store, f, indent=2)
def get_user_agents():
ua_files_path = str(get_homedir() / 'user_agents' / '*' / '*' / '*.json')
paths = sorted(glob(ua_files_path), reverse=True)
if not paths:
update_user_agents()
paths = sorted(glob(ua_files_path), reverse=True)
with open(paths[0]) as f:
return json.load(f)

View File

@ -10,7 +10,7 @@ import os
from flask import Flask, render_template, request, session, send_file, redirect, url_for, Response
from flask_bootstrap import Bootstrap
from lookyloo.helpers import get_homedir
from lookyloo.helpers import get_homedir, update_user_agents, get_user_agents
from lookyloo.lookyloo import Lookyloo
from lookyloo.exceptions import NoValidHarFile
@ -54,7 +54,9 @@ def scrape_web():
if request.form.get('url'):
perma_uuid = lookyloo.scrape(request.form.get('url'), request.form.get('depth'), request.form.get('listing'))
return redirect(url_for('tree', tree_uuid=perma_uuid))
return render_template('scrape.html')
user_agents = get_user_agents()
user_agents.pop('by_frequency')
return render_template('scrape.html', user_agents=user_agents)
@app.route('/tree/hostname/<node_uuid>/text', methods=['GET'])
@ -142,6 +144,7 @@ def index():
# Just returns ack if the webserver is running
return 'Ack'
lookyloo.cleanup_old_tmpfiles()
update_user_agents()
session.clear()
titles = []
for report_dir in lookyloo.report_dirs:

View File

@ -24,7 +24,53 @@
<label for="listing">Public</label>
<input type="checkbox" name="listing" checked="true"></input>
</div>
<label for="os">Choose an operating system</label>
<select class="form-control" name="os" id="os">
{% for os in user_agents.keys() %}
<option value="{{ os }}">{{ os }}</option>
{% endfor%}
</select>
{% for os, browsers in user_agents.items() %}
<div id="{{os.replace(' ', '_')}}" class="style-sub-1" {% if loop.index0 != 0 %}style="display: none;"{%endif%}>
<select class="form-control" name="browser">
{% for browser in browsers.keys()%}
<option value="{{ browser }}">{{ browser }}</option>
{% endfor%}
</select>
</div>
{% set outer_loop = loop %}
{% for browser, user_agents in browsers.items()%}
<div id="{{os.replace(' ', '_')}}_{{browser.replace(' ', '_')}}" class="style-sub-2" {% if not loop.first or not outer_loop.first %} style="display: none;"{%endif%}>
{% set ua_displayed = True %}
<select class="form-control" name="user_agent">
{% for user_agent in user_agents %}
<option value="{{ user_agent }}">{{ user_agent }}</option>
{% endfor%}
</select>
</div>
{% endfor%}
{% endfor%}
<button type="submit" class="btn btn-default">Scrape</button>
</form>
</div>
{% endblock %}
{% block scripts %}
{{ super() }}
<script>
$("#os").change(function(){
var os_name = $(this).find(":selected").val().replace(/(:|\.|\[|\]|,|=)/g, "\\$1").replace(/ /g,"_");
var first_browser_name = $("[id='" + os_name + "']").find('select option:first-child').val().replace(/(:|\.|\[|\]|,|=)/g, "\\$1").replace(/ /g,"_");
$(".style-sub-1").hide();
$(".style-sub-2").hide();
$("[id='" + os_name + "']").show();
$("[id='" + os_name + '_' + first_browser_name + "']").show();
});
$('select[name="browser"]').change(function(){
var browser_name = $(this).find(":selected").val().replace(/(:|\.|\[|\]|,|=)/g, "\\$1").replace(/ /g,"_");
var os_name = $(this).parent().attr("id").replace(/(:|\.|\[|\]|,|=)/g, "\\$1").replace(/ /g,"_");
$(".style-sub-2").hide();
$("[id='" + os_name + '_' + browser_name + "']").show();
});
</script>
{% endblock %}