mirror of https://github.com/CIRCL/lookyloo
new: Add initial support for user-agent
parent
49654b6b11
commit
36c856ef9b
1
Pipfile
1
Pipfile
|
@ -17,6 +17,7 @@ lookyloo = {editable = true,path = "."}
|
|||
cchardet = "*"
|
||||
redis = "*"
|
||||
pylookyloo = {editable = true,path = "./client"}
|
||||
beautifulsoup4 = "*"
|
||||
|
||||
[requires]
|
||||
python_version = "3.6"
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "766c81240a1f3be46b1f7e845ee38ff7306eeb9a91ffa6aa6be204f51c582b7c"
|
||||
"sha256": "8b6fd0b35686dc0c0cac746c878bc3d34dc04e340b2df3c321dfb13b56234f2d"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
|
@ -43,6 +43,7 @@
|
|||
"sha256:945065979fb8529dd2f37dbb58f00b661bdbcbebf954f93b32fdf5263ef35348",
|
||||
"sha256:ba6d5c59906a85ac23dadfe5c88deaf3e179ef565f4898671253e50a78680718"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==4.7.1"
|
||||
},
|
||||
"cchardet": {
|
||||
|
@ -289,34 +290,34 @@
|
|||
},
|
||||
"lxml": {
|
||||
"hashes": [
|
||||
"sha256:0358b9e9642bc7d39aac5cffe9884a99a5ca68e5e2c1b89e570ed60da9139908",
|
||||
"sha256:091a359c4dafebbecd3959d9013f1b896b5371859165e4e50b01607a98d9e3e2",
|
||||
"sha256:1998e4e60603c64bcc35af61b4331ab3af087457900d3980e18d190e17c3a697",
|
||||
"sha256:2000b4088dee9a41f459fddaf6609bba48a435ce6374bb254c5ccdaa8928c5ba",
|
||||
"sha256:2afb0064780d8aaf165875be5898c1866766e56175714fa5f9d055433e92d41d",
|
||||
"sha256:2d8f1d9334a4e3ff176d096c14ded3100547d73440683567d85b8842a53180bb",
|
||||
"sha256:2e38db22f6a3199fd63675e1b4bd795d676d906869047398f29f38ca55cb453a",
|
||||
"sha256:3181f84649c1a1ca62b19ddf28436b1b2cb05ae6c7d2628f33872e713994c364",
|
||||
"sha256:37462170dfd88af8431d04de6b236e6e9c06cda71e2ca26d88ef2332fd2a5237",
|
||||
"sha256:3a9d8521c89bf6f2a929c3d12ad3ad7392c774c327ea809fd08a13be6b3bc05f",
|
||||
"sha256:3d0bbd2e1a28b4429f24fd63a122a450ce9edb7a8063d070790092d7343a1aa4",
|
||||
"sha256:483d60585ce3ee71929cea70949059f83850fa5e12deb9c094ed1c8c2ec73cbd",
|
||||
"sha256:4888be27d5cba55ce94209baef5bcd7bbd7314a3d17021a5fc10000b3a5f737d",
|
||||
"sha256:64b0d62e4209170a2a0c404c446ab83b941a0003e96604d2e4f4cb735f8a2254",
|
||||
"sha256:68010900898fdf139ac08549c4dba8206c584070a960ffc530aebf0c6f2794ef",
|
||||
"sha256:872ecb066de602a0099db98bd9e57f4cfc1d62f6093d94460c787737aa08f39e",
|
||||
"sha256:88a32b03f2e4cd0e63f154cac76724709f40b3fc2f30139eb5d6f900521b44ed",
|
||||
"sha256:b1dc7683da4e67ab2bebf266afa68098d681ae02ce570f0d1117312273d2b2ac",
|
||||
"sha256:b29e27ce9371810250cb1528a771d047a9c7b0f79630dc7dc5815ff828f4273b",
|
||||
"sha256:ce197559596370d985f1ce6b7051b52126849d8159040293bf8b98cb2b3e1f78",
|
||||
"sha256:d45cf6daaf22584eff2175f48f82c4aa24d8e72a44913c5aff801819bb73d11f",
|
||||
"sha256:e2ff9496322b2ce947ba4a7a5eb048158de9d6f3fe9efce29f1e8dd6878561e6",
|
||||
"sha256:f7b979518ec1f294a41a707c007d54d0f3b3e1fd15d5b26b7e99b62b10d9a72e",
|
||||
"sha256:f9c7268e9d16e34e50f8246c4f24cf7353764affd2bc971f0379514c246e3f6b",
|
||||
"sha256:f9c839806089d79de588ee1dde2dae05dc1156d3355dfeb2b51fde84d9c960ad",
|
||||
"sha256:ff962953e2389226adc4d355e34a98b0b800984399153c6678f2367b11b4d4b8"
|
||||
"sha256:03984196d00670b2ab14ae0ea83d5cc0cfa4f5a42558afa9ab5fa745995328f5",
|
||||
"sha256:0815b0c9f897468de6a386dc15917a0becf48cc92425613aa8bbfc7f0f82951f",
|
||||
"sha256:175f3825f075cf02d15099eb52658457cf0ff103dcf11512b5d2583e1d40f58b",
|
||||
"sha256:30e14c62d88d1e01a26936ecd1c6e784d4afc9aa002bba4321c5897937112616",
|
||||
"sha256:3210da6f36cf4b835ff1be853962b22cc354d506f493b67a4303c88bbb40d57b",
|
||||
"sha256:40f60819fbd5bad6e191ba1329bfafa09ab7f3f174b3d034d413ef5266963294",
|
||||
"sha256:43b26a865a61549919f8a42e094dfdb62847113cf776d84bd6b60e4e3fc20ea3",
|
||||
"sha256:4a03dd682f8e35a10234904e0b9508d705ff98cf962c5851ed052e9340df3d90",
|
||||
"sha256:62f382cddf3d2e52cf266e161aa522d54fd624b8cc567bc18f573d9d50d40e8e",
|
||||
"sha256:7b98f0325be8450da70aa4a796c4f06852949fe031878b4aa1d6c417a412f314",
|
||||
"sha256:846a0739e595871041385d86d12af4b6999f921359b38affb99cdd6b54219a8f",
|
||||
"sha256:a3080470559938a09a5d0ec558c005282e99ac77bf8211fb7b9a5c66390acd8d",
|
||||
"sha256:ad841b78a476623955da270ab8d207c3c694aa5eba71f4792f65926dc46c6ee8",
|
||||
"sha256:afdd75d9735e44c639ffd6258ce04a2de3b208f148072c02478162d0944d9da3",
|
||||
"sha256:b4fbf9b552faff54742bcd0791ab1da5863363fb19047e68f6592be1ac2dab33",
|
||||
"sha256:b90c4e32d6ec089d3fa3518436bdf5ce4d902a0787dbd9bb09f37afe8b994317",
|
||||
"sha256:b91cfe4438c741aeff662d413fd2808ac901cc6229c838236840d11de4586d63",
|
||||
"sha256:bdb0593a42070b0a5f138b79b872289ee73c8e25b3f0bea6564e795b55b6bcdd",
|
||||
"sha256:c4e4bca2bb68ce22320297dfa1a7bf070a5b20bcbaec4ee023f83d2f6e76496f",
|
||||
"sha256:cec4ab14af9eae8501be3266ff50c3c2aecc017ba1e86c160209bb4f0423df6a",
|
||||
"sha256:e83b4b2bf029f5104bc1227dbb7bf5ace6fd8fabaebffcd4f8106fafc69fc45f",
|
||||
"sha256:e995b3734a46d41ae60b6097f7c51ba9958648c6d1e0935b7e0ee446ee4abe22",
|
||||
"sha256:f679d93dec7f7210575c85379a31322df4c46496f184ef650d3aba1484b38a2d",
|
||||
"sha256:fd213bb5166e46974f113c8228daaef1732abc47cb561ce9c4c8eaed4bd3b09b",
|
||||
"sha256:fdcb57b906dbc1f80666e6290e794ab8fb959a2e17aa5aee1758a85d1da4533f",
|
||||
"sha256:ff424b01d090ffe1947ec7432b07f536912e0300458f9a7f48ea217dd8362b86"
|
||||
],
|
||||
"version": "==4.3.2"
|
||||
"version": "==4.3.3"
|
||||
},
|
||||
"markupsafe": {
|
||||
"hashes": [
|
||||
|
@ -473,10 +474,10 @@
|
|||
},
|
||||
"soupsieve": {
|
||||
"hashes": [
|
||||
"sha256:afa56bf14907bb09403e5d15fbed6275caa4174d36b975226e3b67a3bb6e2c4b",
|
||||
"sha256:eaed742b48b1f3e2d45ba6f79401b2ed5dc33b2123dfe216adb90d4bfa0ade26"
|
||||
"sha256:3aef141566afd07201b525c17bfaadd07580a8066f82b57f7c9417f26adbd0a3",
|
||||
"sha256:e41a65e99bd125972d84221022beb1e4b5cfc68fa12c170c39834ce32d1b294c"
|
||||
],
|
||||
"version": "==1.8"
|
||||
"version": "==1.9"
|
||||
},
|
||||
"twisted": {
|
||||
"hashes": [
|
||||
|
@ -506,10 +507,10 @@
|
|||
},
|
||||
"werkzeug": {
|
||||
"hashes": [
|
||||
"sha256:590abe38f8be026d78457fe3b5200895b3543e58ac3fc1dd792c6333ea11af64",
|
||||
"sha256:ee11b0f0640c56fb491b43b38356c4b588b3202b415a1e03eacf1c5561c961cf"
|
||||
"sha256:96da23fa8ccecbc3ae832a83df5c722c11547d021637faacb0bec4dd2f4666c8",
|
||||
"sha256:ca5c2dcd367d6c0df87185b9082929d255358f5391923269335782b213d52655"
|
||||
],
|
||||
"version": "==0.15.0"
|
||||
"version": "==0.15.1"
|
||||
},
|
||||
"zope.interface": {
|
||||
"hashes": [
|
||||
|
|
|
@ -43,9 +43,9 @@ You need a running splash instance, preferably on [docker](https://splash.readth
|
|||
```bash
|
||||
sudo apt install docker.io
|
||||
sudo docker pull scrapinghub/splash
|
||||
sudo docker run -p 8050:8050 -p 5023:5023 scrapinghub/splash --disable-ui --disable-lua
|
||||
sudo docker run -p 8050:8050 -p 5023:5023 scrapinghub/splash --disable-ui --disable-lua --disable-browser-caches
|
||||
# On a server with a decent amount of RAM, you may want to run it this way:
|
||||
# sudo docker run -p 8050:8050 -p 5023:5023 scrapinghub/splash --disable-ui -s 100 --disable-lua -m 50000
|
||||
# sudo docker run -p 8050:8050 -p 5023:5023 scrapinghub/splash --disable-ui -s 100 --disable-lua -m 50000 --disable-browser-caches
|
||||
```
|
||||
|
||||
## Install redis
|
||||
|
|
|
@ -12,3 +12,7 @@ class MissingEnv(LookylooException):
|
|||
|
||||
class NoValidHarFile(LookylooException):
    """Lookyloo error for a missing or unusable HAR file.

    NOTE(review): meaning inferred from the class name only; the code that
    raises it is not visible here -- confirm against the caller.
    """
|
||||
|
||||
|
||||
class CreateDirectoryException(LookylooException):
    """Raised when a directory cannot be created because the path is taken.

    ``safe_create_dir`` raises it when the requested path already exists and
    is not a directory.
    """
|
||||
|
|
|
@ -2,11 +2,15 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
import os
|
||||
from pathlib import Path
|
||||
from .exceptions import MissingEnv
|
||||
from .exceptions import MissingEnv, CreateDirectoryException
|
||||
from redis import Redis
|
||||
from redis.exceptions import ConnectionError
|
||||
from datetime import datetime, timedelta
|
||||
import time
|
||||
from bs4 import BeautifulSoup
|
||||
import json
|
||||
import requests
|
||||
from glob import glob
|
||||
|
||||
|
||||
def get_homedir():
|
||||
|
@ -18,6 +22,12 @@ Run the following command (assuming you run the code from the clonned repository
|
|||
return Path(os.environ['LOOKYLOO_HOME'])
|
||||
|
||||
|
||||
def safe_create_dir(to_create: Path):
    """Create the directory ``to_create``, including missing parents.

    Calling it on a directory that already exists is a no-op.

    Raises:
        CreateDirectoryException: if ``to_create`` exists but is not a
            directory (for example a regular file).
    """
    occupied_by_non_directory = to_create.exists() and not to_create.is_dir()
    if occupied_by_non_directory:
        raise CreateDirectoryException(f'The path {to_create} already exists and is not a directory')
    # exist_ok=True keeps repeated calls on the same path harmless.
    os.makedirs(to_create, exist_ok=True)
|
||||
|
||||
|
||||
def set_running(name: str) -> None:
|
||||
r = Redis(unix_socket_path=get_socket_path('cache'), db=1, decode_responses=True)
|
||||
r.hset('running', name, 1)
|
||||
|
@ -70,3 +80,38 @@ def long_sleep(sleep_in_sec: int, shutdown_check: int=10) -> bool:
|
|||
if shutdown_requested():
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def update_user_agents():
    """Download the current list of most common user agents and store it as JSON.

    The file is written to
    ``<homedir>/user_agents/<year>/<month>/<YYYY-MM-DD>.json`` where
    ``<homedir>`` is the value returned by ``get_homedir()``. If the file for
    the current day already exists, nothing is downloaded.

    The stored document contains:

    * ``by_frequency``: a list of ``{'os', 'browser', 'useragent'}`` entries,
      in the order they appear in the downloaded list;
    * one key per operating system, mapping each browser name to the list of
      its user-agent strings.

    Raises:
        CreateDirectoryException: from ``safe_create_dir`` if the target path
            exists and is not a directory.
        requests.RequestException: if the download fails, times out, or the
            server answers with an HTTP error status.
    """
    today = datetime.now()
    ua_path = get_homedir() / 'user_agents' / str(today.year) / f'{today.month:02}'
    safe_create_dir(ua_path)
    ua_file_name = ua_path / f'{today.date().isoformat()}.json'
    if ua_file_name.exists():
        # Already have a UA for that day.
        return
    # requests has no default timeout: without one, a stalled server would
    # block the web request handlers that call this function indefinitely.
    r = requests.get('https://techblog.willshouse.com/2012/01/03/most-common-user-agents/', timeout=10)
    # Fail on an HTTP error instead of trying to parse an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')
    # The second <textarea> of the page is expected to hold the JSON listing.
    uas = soup.find_all('textarea')[1].text
    to_store = {'by_frequency': []}
    for ua in json.loads(uas):
        # 'system' is split on spaces: the last word is used as the OS, the
        # remaining words as the browser name.
        system_parts = ua['system'].split(' ')
        # Named os_name (not os) so the imported os module is not shadowed.
        os_name = system_parts[-1]
        browser = ' '.join(system_parts[:-1])
        to_store.setdefault(os_name, {}).setdefault(browser, []).append(ua['useragent'])
        to_store['by_frequency'].append({'os': os_name, 'browser': browser, 'useragent': ua['useragent']})
    with open(ua_file_name, 'w') as f:
        json.dump(to_store, f, indent=2)
|
||||
|
||||
|
||||
def get_user_agents():
    """Return the content of the most recent stored user-agents file.

    Files are looked up under ``<homedir>/user_agents/*/*/*.json``. When none
    exists yet, ``update_user_agents()`` is called once to fetch one. The
    newest file is the one whose path sorts last.
    """
    pattern = str(get_homedir() / 'user_agents' / '*' / '*' / '*.json')
    candidates = glob(pattern)
    if not candidates:
        # Nothing stored yet: fetch today's list, then look again.
        update_user_agents()
        candidates = glob(pattern)
    newest = sorted(candidates)[-1]
    with open(newest) as handle:
        return json.load(handle)
|
||||
|
|
|
@ -10,7 +10,7 @@ import os
|
|||
from flask import Flask, render_template, request, session, send_file, redirect, url_for, Response
|
||||
from flask_bootstrap import Bootstrap
|
||||
|
||||
from lookyloo.helpers import get_homedir
|
||||
from lookyloo.helpers import get_homedir, update_user_agents, get_user_agents
|
||||
from lookyloo.lookyloo import Lookyloo
|
||||
from lookyloo.exceptions import NoValidHarFile
|
||||
|
||||
|
@ -54,7 +54,9 @@ def scrape_web():
|
|||
if request.form.get('url'):
|
||||
perma_uuid = lookyloo.scrape(request.form.get('url'), request.form.get('depth'), request.form.get('listing'))
|
||||
return redirect(url_for('tree', tree_uuid=perma_uuid))
|
||||
return render_template('scrape.html')
|
||||
user_agents = get_user_agents()
|
||||
user_agents.pop('by_frequency')
|
||||
return render_template('scrape.html', user_agents=user_agents)
|
||||
|
||||
|
||||
@app.route('/tree/hostname/<node_uuid>/text', methods=['GET'])
|
||||
|
@ -142,6 +144,7 @@ def index():
|
|||
# Just returns ack if the webserver is running
|
||||
return 'Ack'
|
||||
lookyloo.cleanup_old_tmpfiles()
|
||||
update_user_agents()
|
||||
session.clear()
|
||||
titles = []
|
||||
for report_dir in lookyloo.report_dirs:
|
||||
|
|
|
@ -24,7 +24,53 @@
|
|||
<label for="listing">Public</label>
|
||||
<input type="checkbox" name="listing" checked="true"></input>
|
||||
</div>
|
||||
<label for="os">Choose an operating system</label>
|
||||
<select class="form-control" name="os" id="os">
|
||||
{% for os in user_agents.keys() %}
|
||||
<option value="{{ os }}">{{ os }}</option>
|
||||
{% endfor%}
|
||||
</select>
|
||||
{% for os, browsers in user_agents.items() %}
|
||||
<div id="{{os.replace(' ', '_')}}" class="style-sub-1" {% if loop.index0 != 0 %}style="display: none;"{%endif%}>
|
||||
<select class="form-control" name="browser">
|
||||
{% for browser in browsers.keys()%}
|
||||
<option value="{{ browser }}">{{ browser }}</option>
|
||||
{% endfor%}
|
||||
</select>
|
||||
</div>
|
||||
{% set outer_loop = loop %}
|
||||
{% for browser, user_agents in browsers.items()%}
|
||||
<div id="{{os.replace(' ', '_')}}_{{browser.replace(' ', '_')}}" class="style-sub-2" {% if not loop.first or not outer_loop.first %} style="display: none;"{%endif%}>
|
||||
{% set ua_displayed = True %}
|
||||
<select class="form-control" name="user_agent">
|
||||
{% for user_agent in user_agents %}
|
||||
<option value="{{ user_agent }}">{{ user_agent }}</option>
|
||||
{% endfor%}
|
||||
</select>
|
||||
</div>
|
||||
{% endfor%}
|
||||
{% endfor%}
|
||||
<button type="submit" class="btn btn-default">Scrape</button>
|
||||
</form>
|
||||
</div>
|
||||
{% endblock %}
|
||||
|
||||
{% block scripts %}
|
||||
{{ super() }}
|
||||
<script>
|
||||
(function () {
  // Convert a displayed name into the form used inside element ids:
  // backslash-escape the characters : . [ ] , = and replace spaces with
  // underscores (the template builds its ids with spaces replaced by "_").
  function toIdPart(rawName) {
    var escaped = rawName.replace(/(:|\.|\[|\]|,|=)/g, "\\$1");
    return escaped.replace(/ /g,"_");
  }

  // Look an element up through an attribute selector on its id.
  function byId(elementId) {
    return $("[id='" + elementId + "']");
  }

  // Changing the OS shows that OS's browser list and the user-agent list of
  // its first browser; every other sub-list is hidden.
  $("#os").change(function () {
    var osId = toIdPart($(this).find(":selected").val());
    var firstBrowserId = toIdPart(byId(osId).find('select option:first-child').val());
    $(".style-sub-1").hide();
    $(".style-sub-2").hide();
    byId(osId).show();
    byId(osId + '_' + firstBrowserId).show();
  });

  // Changing the browser shows only the user-agent list for that OS/browser
  // pair; the OS is read from the id of the select's parent element.
  $('select[name="browser"]').change(function () {
    var browserId = toIdPart($(this).find(":selected").val());
    var osId = toIdPart($(this).parent().attr("id"));
    $(".style-sub-2").hide();
    byId(osId + '_' + browserId).show();
  });
})();
|
||||
</script>
|
||||
{% endblock %}
|
||||
|
|
Loading…
Reference in New Issue