mirror of https://github.com/CIRCL/lookyloo
new: priority for captures
parent
15b46e4b71
commit
925bb9d48e
|
@ -24,11 +24,22 @@
|
|||
"auto_trigger_modules": false,
|
||||
"enable_mail_notification": false,
|
||||
"email": {
|
||||
"from": "Lookyloo <lookyloo@myorg.local>",
|
||||
"to": "Investigation Team <investigation_unit@myorg.local>",
|
||||
"subject": "Capture from Lookyloo to review",
|
||||
"smtp_host": "localhost",
|
||||
"smtp_port": "25"
|
||||
"from": "Lookyloo <lookyloo@myorg.local>",
|
||||
"to": "Investigation Team <investigation_unit@myorg.local>",
|
||||
"subject": "Capture from Lookyloo to review",
|
||||
"smtp_host": "localhost",
|
||||
"smtp_port": "25"
|
||||
},
|
||||
"priority": {
|
||||
"sources": {
|
||||
"web": 10,
|
||||
"api": 0
|
||||
},
|
||||
"users": {
|
||||
"_default_auth": 5,
|
||||
"_default_anon": 0,
|
||||
"admin": 10
|
||||
}
|
||||
},
|
||||
"_notes": {
|
||||
"loglevel": "(lookyloo) Can be one of the values listed here: https://docs.python.org/3/library/logging.html#levels",
|
||||
|
@ -51,6 +62,7 @@
|
|||
"enable_bookmark": "Allow bookmarking nodes on the tree",
|
||||
"auto_trigger_modules": "Automatically trigger the modules when the tree is loaded",
|
||||
"enable_mail_notification": "Enable email notification or not",
|
||||
"email": "Configuration for sending email notifications."
|
||||
"email": "Configuration for sending email notifications.",
|
||||
"priority": "Define the priority of a new capture. A capture from the web interface has priority over a capture from the API, same for authenticated user vs. anonymous."
|
||||
}
|
||||
}
|
||||
|
|
|
@ -64,6 +64,8 @@ class Lookyloo():
|
|||
self.splash_url = get_config('generic', 'splash_url')
|
||||
self.only_global_lookups: bool = get_config('generic', 'only_global_lookups')
|
||||
|
||||
self._priority = get_config('generic', 'priority')
|
||||
|
||||
safe_create_dir(self.capture_dir)
|
||||
|
||||
# Initialize 3rd party components
|
||||
|
@ -93,6 +95,19 @@ class Lookyloo():
|
|||
if not self.redis.exists('cache_loaded'):
|
||||
self._init_existing_dumps()
|
||||
|
||||
def _get_priority(self, source: str, user: str, authenticated: bool) -> int:
    '''Compute the priority of a new capture as source priority + user priority.

    :param source: Origin of the capture; looked up in the 'sources' section of
                   the priority config. A source missing from the config counts as -1.
    :param user: Identifier of the submitter; looked up in the 'users' section of
                 the priority config when the submitter is authenticated.
    :param authenticated: Whether the submitter is logged in.
    :return: The sum of the source priority and the user priority.
    '''
    src_prio: int = self._priority['sources'].get(source, -1)
    users = self._priority['users']
    if not authenticated:
        usr_prio = users['_default_anon']
        # reduce priority for anonymous users making lots of captures:
        # one point less for every 10 units of the score stored in the
        # 'queues' sorted set for this source/authenticated/user key.
        queue_size = self.redis.zscore('queues', f'{source}|{authenticated}|{user}')
        if queue_size is None:
            # No entry for this key in 'queues'.
            queue_size = 0
        usr_prio -= int(queue_size / 10)
    else:
        # Compare against None rather than relying on truthiness, so that a
        # user explicitly configured with priority 0 keeps that priority
        # instead of silently falling back to '_default_auth'.
        usr_prio = users.get(user)
        if usr_prio is None:
            usr_prio = users['_default_auth']
    return src_prio + usr_prio
|
||||
|
||||
def cache_user_agents(self, user_agent: str, remote_ip: str) -> None:
|
||||
'''Cache the useragents of the visitors'''
|
||||
today = date.today().isoformat()
|
||||
|
@ -579,7 +594,7 @@ class Lookyloo():
|
|||
return CaptureStatus.ONGOING
|
||||
return CaptureStatus.UNKNOWN
|
||||
|
||||
def enqueue_capture(self, query: MutableMapping[str, Any]) -> str:
|
||||
def enqueue_capture(self, query: MutableMapping[str, Any], source: str, user: str, authenticated: bool) -> str:
|
||||
'''Enqueue a query in the capture queue (used by the UI and the API for asynchronous processing)'''
|
||||
perma_uuid = str(uuid4())
|
||||
p = self.redis.pipeline()
|
||||
|
@ -590,7 +605,10 @@ class Lookyloo():
|
|||
if isinstance(value, list):
|
||||
query[key] = json.dumps(value)
|
||||
p.hmset(perma_uuid, query) # type: ignore
|
||||
p.sadd('to_capture', perma_uuid)
|
||||
priority = self._get_priority(source, user, authenticated)
|
||||
p.zadd('to_capture', {perma_uuid: priority})
|
||||
p.zincrby('queues', 1, f'{source}|{authenticated}|{user}')
|
||||
p.set(f'{perma_uuid}_mgmt', f'{source}|{authenticated}|{user}')
|
||||
p.execute()
|
||||
return perma_uuid
|
||||
|
||||
|
@ -604,19 +622,28 @@ class Lookyloo():
|
|||
self.logger.critical(f'Splash is not running, unable to process the capture queue: {message}')
|
||||
return None
|
||||
|
||||
uuid = self.redis.spop('to_capture')
|
||||
if not uuid:
|
||||
value = self.redis.zpopmax('to_capture')
|
||||
if not value or not value[0]:
|
||||
return None
|
||||
uuid, score = value[0]
|
||||
queue = self.redis.get(f'{uuid}_mgmt')
|
||||
self.redis.sadd('ongoing', uuid)
|
||||
|
||||
lazy_cleanup = self.redis.pipeline()
|
||||
lazy_cleanup.delete(f'{uuid}_mgmt')
|
||||
lazy_cleanup.zincrby('queues', -1, queue)
|
||||
|
||||
to_capture: Dict[str, Union[str, int, float]] = self.redis.hgetall(uuid)
|
||||
to_capture['perma_uuid'] = uuid
|
||||
if 'cookies' in to_capture:
|
||||
to_capture['cookies_pseudofile'] = to_capture.pop('cookies')
|
||||
|
||||
status = self._capture(**to_capture) # type: ignore
|
||||
self.redis.srem('ongoing', uuid)
|
||||
self.redis.delete(uuid)
|
||||
lazy_cleanup.srem('ongoing', uuid)
|
||||
lazy_cleanup.delete(uuid)
|
||||
# make sure to expire the key if nothing was processed for a while (= queues empty)
|
||||
lazy_cleanup.expire('queues', 600)
|
||||
lazy_cleanup.execute()
|
||||
if status:
|
||||
self.logger.info(f'Processed {to_capture["url"]}')
|
||||
return True
|
||||
|
|
|
@ -189,6 +189,14 @@ app.jinja_env.globals.update(month_name=month_name)
|
|||
|
||||
# ##### Generic/configuration methods #####
|
||||
|
||||
def src_request_ip(request) -> str:
    '''Return the source IP of *request*.

    The value of the 'X-Real-IP' header is used when it is present and
    non-empty; otherwise the function falls back to `request.remote_addr`.
    '''
    # NOTE: X-Real-IP is the IP passed by the reverse proxy in the headers.
    forwarded_ip = request.headers.get('X-Real-IP')
    return forwarded_ip if forwarded_ip else request.remote_addr
|
||||
|
||||
|
||||
@app.after_request
|
||||
def after_request(response):
|
||||
# We keep a list of user agents in order to build a list to use in the capture
|
||||
|
@ -200,16 +208,9 @@ def after_request(response):
|
|||
# The cache of IPs is deleted after the UA file is generated (see lookyloo.build_ua_file),
|
||||
# once a day.
|
||||
ua = request.headers.get('User-Agent')
|
||||
real_ip = request.headers.get('X-Real-IP')
|
||||
real_ip = src_request_ip(request)
|
||||
if ua:
|
||||
if real_ip:
|
||||
lookyloo.cache_user_agents(ua, real_ip)
|
||||
else:
|
||||
if request.remote_addr:
|
||||
lookyloo.cache_user_agents(ua, request.remote_addr)
|
||||
else:
|
||||
# FIXME: That shouldn't happen, I guess, but mypy requires it.
|
||||
pass
|
||||
lookyloo.cache_user_agents(ua, real_ip)
|
||||
# Opt out of FLoC
|
||||
response.headers.set('Permissions-Policy', 'interest-cohort=()')
|
||||
return response
|
||||
|
@ -433,6 +434,10 @@ def urls_rendered_page(tree_uuid: str):
|
|||
|
||||
@app.route('/bulk_captures/<string:base_tree_uuid>', methods=['POST'])
|
||||
def bulk_captures(base_tree_uuid: str):
|
||||
if flask_login.current_user.is_authenticated:
|
||||
user = flask_login.current_user.get_id()
|
||||
else:
|
||||
user = src_request_ip(request)
|
||||
selected_urls = request.form.getlist('url')
|
||||
urls = lookyloo.get_urls_rendered_page(base_tree_uuid)
|
||||
ct = lookyloo.get_crawled_tree(base_tree_uuid)
|
||||
|
@ -445,7 +450,7 @@ def bulk_captures(base_tree_uuid: str):
|
|||
'user_agent': ct.user_agent,
|
||||
'parent': base_tree_uuid
|
||||
}
|
||||
new_capture_uuid = lookyloo.enqueue_capture(capture)
|
||||
new_capture_uuid = lookyloo.enqueue_capture(capture, source='web', user=user, authenticated=flask_login.current_user.is_authenticated)
|
||||
bulk_captures.append((new_capture_uuid, url))
|
||||
|
||||
return render_template('bulk_captures.html', uuid=base_tree_uuid, bulk_captures=bulk_captures)
|
||||
|
@ -655,10 +660,14 @@ def rebuild_cache():
|
|||
return redirect(url_for('index'))
|
||||
|
||||
|
||||
@app.route('/submit', methods=['POST', 'GET'])
|
||||
@app.route('/submit', methods=['POST'])
|
||||
def submit():
|
||||
if flask_login.current_user.is_authenticated:
|
||||
user = flask_login.current_user.get_id()
|
||||
else:
|
||||
user = src_request_ip(request)
|
||||
to_query: Dict = request.get_json(force=True) # type: ignore
|
||||
perma_uuid = lookyloo.enqueue_capture(to_query)
|
||||
perma_uuid = lookyloo.enqueue_capture(to_query, source='api', user=user, authenticated=flask_login.current_user.is_authenticated)
|
||||
return Response(perma_uuid, mimetype='text/text')
|
||||
|
||||
|
||||
|
@ -679,6 +688,10 @@ def search():
|
|||
@app.route('/capture', methods=['GET', 'POST'])
|
||||
def capture_web():
|
||||
if request.form.get('url'):
|
||||
if flask_login.current_user.is_authenticated:
|
||||
user = flask_login.current_user.get_id()
|
||||
else:
|
||||
user = src_request_ip(request)
|
||||
capture_query: Dict[str, Union[str, bytes, int, bool]] = {'url': request.form['url']}
|
||||
# check if the post request has the file part
|
||||
if 'cookies' in request.files and request.files['cookies'].filename:
|
||||
|
@ -698,7 +711,7 @@ def capture_web():
|
|||
if request.form.get('referer'):
|
||||
capture_query['referer'] = request.form['referer']
|
||||
|
||||
perma_uuid = lookyloo.enqueue_capture(capture_query)
|
||||
perma_uuid = lookyloo.enqueue_capture(capture_query, source='web', user=user, authenticated=flask_login.current_user.is_authenticated)
|
||||
time.sleep(30)
|
||||
return redirect(url_for('tree', tree_uuid=perma_uuid))
|
||||
user_agents: Dict[str, Any] = {}
|
||||
|
|
Loading…
Reference in New Issue