new: priority for captures

pull/205/head
Raphaël Vinot 2021-05-18 14:58:56 -07:00
parent 15b46e4b71
commit 925bb9d48e
3 changed files with 77 additions and 25 deletions

View File

@ -24,11 +24,22 @@
"auto_trigger_modules": false,
"enable_mail_notification": false,
"email": {
"from": "Lookyloo <lookyloo@myorg.local>",
"to": "Investigation Team <investigation_unit@myorg.local>",
"subject": "Capture from Lookyloo to review",
"smtp_host": "localhost",
"smtp_port": "25"
"from": "Lookyloo <lookyloo@myorg.local>",
"to": "Investigation Team <investigation_unit@myorg.local>",
"subject": "Capture from Lookyloo to review",
"smtp_host": "localhost",
"smtp_port": "25"
},
"priority": {
"sources": {
"web": 10,
"api": 0
},
"users": {
"_default_auth": 5,
"_default_anon": 0,
"admin": 10
}
},
"_notes": {
"loglevel": "(lookyloo) Can be one of the value listed here: https://docs.python.org/3/library/logging.html#levels",
@ -51,6 +62,7 @@
"enable_bookmark": "Allow to bookmark nodes on tree",
"auto_trigger_modules": "Automatically trigger the modules when the tree is loaded",
"enable_mail_notification": "Enable email notification or not",
"email": "Configuration for sending email notifications."
"email": "Configuration for sending email notifications.",
"priority": "Define the priority of a new capture. A capture from the web interface has priority over a capture from the API, same for authenticated user vs. anonymous."
}
}

View File

@ -64,6 +64,8 @@ class Lookyloo():
self.splash_url = get_config('generic', 'splash_url')
self.only_global_lookups: bool = get_config('generic', 'only_global_lookups')
self._priority = get_config('generic', 'priority')
safe_create_dir(self.capture_dir)
# Initialize 3rd party components
@ -93,6 +95,19 @@ class Lookyloo():
if not self.redis.exists('cache_loaded'):
self._init_existing_dumps()
def _get_priority(self, source: str, user: str, authenticated: bool) -> int:
    '''Compute the priority of a new capture, the sum of a source and a user priority.

    :param source: Origin of the capture, looked up in the 'sources' section of the
                   priority config. A source that is not configured counts as -1.
    :param user: Identifier of the submitter, looked up in the 'users' section of the
                 priority config when authenticated.
    :param authenticated: Whether the submitter is logged in.
    :return: Source priority plus user priority.
    '''
    sources = self._priority['sources']
    users = self._priority['users']

    src_prio: int = sources.get(source, -1)

    if not authenticated:
        usr_prio: int = users['_default_anon']
        # reduce priority for anonymous users making lots of captures
        queue_size = self.redis.zscore('queues', f'{source}|{authenticated}|{user}')
        if queue_size is None:
            queue_size = 0
        usr_prio -= int(queue_size / 10)
    else:
        # Compare against None rather than relying on truthiness: a user explicitly
        # configured with priority 0 must keep that value, not get the default.
        configured = users.get(user)
        usr_prio = users['_default_auth'] if configured is None else configured
    return src_prio + usr_prio
def cache_user_agents(self, user_agent: str, remote_ip: str) -> None:
'''Cache the useragents of the visitors'''
today = date.today().isoformat()
@ -579,7 +594,7 @@ class Lookyloo():
return CaptureStatus.ONGOING
return CaptureStatus.UNKNOWN
def enqueue_capture(self, query: MutableMapping[str, Any]) -> str:
def enqueue_capture(self, query: MutableMapping[str, Any], source: str, user: str, authenticated: bool) -> str:
'''Enqueue a query in the capture queue (used by the UI and the API for asynchronous processing)'''
perma_uuid = str(uuid4())
p = self.redis.pipeline()
@ -590,7 +605,10 @@ class Lookyloo():
if isinstance(value, list):
query[key] = json.dumps(value)
p.hmset(perma_uuid, query) # type: ignore
p.sadd('to_capture', perma_uuid)
priority = self._get_priority(source, user, authenticated)
p.zadd('to_capture', {perma_uuid: priority})
p.zincrby('queues', 1, f'{source}|{authenticated}|{user}')
p.set(f'{perma_uuid}_mgmt', f'{source}|{authenticated}|{user}')
p.execute()
return perma_uuid
@ -604,19 +622,28 @@ class Lookyloo():
self.logger.critical(f'Splash is not running, unable to process the capture queue: {message}')
return None
uuid = self.redis.spop('to_capture')
if not uuid:
value = self.redis.zpopmax('to_capture')
if not value or not value[0]:
return None
uuid, score = value[0]
queue = self.redis.get(f'{uuid}_mgmt')
self.redis.sadd('ongoing', uuid)
lazy_cleanup = self.redis.pipeline()
lazy_cleanup.delete(f'{uuid}_mgmt')
lazy_cleanup.zincrby('queues', -1, queue)
to_capture: Dict[str, Union[str, int, float]] = self.redis.hgetall(uuid)
to_capture['perma_uuid'] = uuid
if 'cookies' in to_capture:
to_capture['cookies_pseudofile'] = to_capture.pop('cookies')
status = self._capture(**to_capture) # type: ignore
self.redis.srem('ongoing', uuid)
self.redis.delete(uuid)
lazy_cleanup.srem('ongoing', uuid)
lazy_cleanup.delete(uuid)
# make sure to expire the key if nothing was processed for a while (= queues empty)
lazy_cleanup.expire('queues', 600)
lazy_cleanup.execute()
if status:
self.logger.info(f'Processed {to_capture["url"]}')
return True

View File

@ -189,6 +189,14 @@ app.jinja_env.globals.update(month_name=month_name)
# ##### Generic/configuration methods #####
def src_request_ip(request) -> str:
    '''Return the X-Real-IP header of the request if set, its remote address otherwise.'''
    # NOTE: X-Real-IP is the IP passed by the reverse proxy in the headers.
    return request.headers.get('X-Real-IP') or request.remote_addr
@app.after_request
def after_request(response):
# We keep a list of user agents in order to build a list to use in the capture
@ -200,16 +208,9 @@ def after_request(response):
# The cache of IPs is deleted after the UA file is generated (see lookyloo.build_ua_file),
# once a day.
ua = request.headers.get('User-Agent')
real_ip = request.headers.get('X-Real-IP')
real_ip = src_request_ip(request)
if ua:
if real_ip:
lookyloo.cache_user_agents(ua, real_ip)
else:
if request.remote_addr:
lookyloo.cache_user_agents(ua, request.remote_addr)
else:
# FIXME: That shouldn't happen, I guess, but mypy requires it.
pass
lookyloo.cache_user_agents(ua, real_ip)
# Opt out of FLoC
response.headers.set('Permissions-Policy', 'interest-cohort=()')
return response
@ -433,6 +434,10 @@ def urls_rendered_page(tree_uuid: str):
@app.route('/bulk_captures/<string:base_tree_uuid>', methods=['POST'])
def bulk_captures(base_tree_uuid: str):
if flask_login.current_user.is_authenticated:
user = flask_login.current_user.get_id()
else:
user = src_request_ip(request)
selected_urls = request.form.getlist('url')
urls = lookyloo.get_urls_rendered_page(base_tree_uuid)
ct = lookyloo.get_crawled_tree(base_tree_uuid)
@ -445,7 +450,7 @@ def bulk_captures(base_tree_uuid: str):
'user_agent': ct.user_agent,
'parent': base_tree_uuid
}
new_capture_uuid = lookyloo.enqueue_capture(capture)
new_capture_uuid = lookyloo.enqueue_capture(capture, source='web', user=user, authenticated=flask_login.current_user.is_authenticated)
bulk_captures.append((new_capture_uuid, url))
return render_template('bulk_captures.html', uuid=base_tree_uuid, bulk_captures=bulk_captures)
@ -655,10 +660,14 @@ def rebuild_cache():
return redirect(url_for('index'))
@app.route('/submit', methods=['POST', 'GET'])
@app.route('/submit', methods=['POST'])
def submit():
if flask_login.current_user.is_authenticated:
user = flask_login.current_user.get_id()
else:
user = src_request_ip(request)
to_query: Dict = request.get_json(force=True) # type: ignore
perma_uuid = lookyloo.enqueue_capture(to_query)
perma_uuid = lookyloo.enqueue_capture(to_query, source='api', user=user, authenticated=flask_login.current_user.is_authenticated)
return Response(perma_uuid, mimetype='text/text')
@ -679,6 +688,10 @@ def search():
@app.route('/capture', methods=['GET', 'POST'])
def capture_web():
if request.form.get('url'):
if flask_login.current_user.is_authenticated:
user = flask_login.current_user.get_id()
else:
user = src_request_ip(request)
capture_query: Dict[str, Union[str, bytes, int, bool]] = {'url': request.form['url']}
# check if the post request has the file part
if 'cookies' in request.files and request.files['cookies'].filename:
@ -698,7 +711,7 @@ def capture_web():
if request.form.get('referer'):
capture_query['referer'] = request.form['referer']
perma_uuid = lookyloo.enqueue_capture(capture_query)
perma_uuid = lookyloo.enqueue_capture(capture_query, source='web', user=user, authenticated=flask_login.current_user.is_authenticated)
time.sleep(30)
return redirect(url_for('tree', tree_uuid=perma_uuid))
user_agents: Dict[str, Any] = {}