mirror of https://github.com/CIRCL/lookyloo
new: priority for captures
parent
15b46e4b71
commit
925bb9d48e
|
@ -24,11 +24,22 @@
|
||||||
"auto_trigger_modules": false,
|
"auto_trigger_modules": false,
|
||||||
"enable_mail_notification": false,
|
"enable_mail_notification": false,
|
||||||
"email": {
|
"email": {
|
||||||
"from": "Lookyloo <lookyloo@myorg.local>",
|
"from": "Lookyloo <lookyloo@myorg.local>",
|
||||||
"to": "Investigation Team <investigation_unit@myorg.local>",
|
"to": "Investigation Team <investigation_unit@myorg.local>",
|
||||||
"subject": "Capture from Lookyloo to review",
|
"subject": "Capture from Lookyloo to review",
|
||||||
"smtp_host": "localhost",
|
"smtp_host": "localhost",
|
||||||
"smtp_port": "25"
|
"smtp_port": "25"
|
||||||
|
},
|
||||||
|
"priority": {
|
||||||
|
"sources": {
|
||||||
|
"web": 10,
|
||||||
|
"api": 0
|
||||||
|
},
|
||||||
|
"users": {
|
||||||
|
"_default_auth": 5,
|
||||||
|
"_default_anon": 0,
|
||||||
|
"admin": 10
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"_notes": {
|
"_notes": {
|
||||||
"loglevel": "(lookyloo) Can be one of the value listed here: https://docs.python.org/3/library/logging.html#levels",
|
"loglevel": "(lookyloo) Can be one of the value listed here: https://docs.python.org/3/library/logging.html#levels",
|
||||||
|
@ -51,6 +62,7 @@
|
||||||
"enable_bookmark": "Allow to bookmark nodes on tree",
|
"enable_bookmark": "Allow to bookmark nodes on tree",
|
||||||
"auto_trigger_modules": "Automatically trigger the modules when the tree is loaded",
|
"auto_trigger_modules": "Automatically trigger the modules when the tree is loaded",
|
||||||
"enable_mail_notification": "Enable email notification or not",
|
"enable_mail_notification": "Enable email notification or not",
|
||||||
"email": "Configuration for sending email notifications."
|
"email": "Configuration for sending email notifications.",
|
||||||
|
"priority": "Define the priority of a new capture. A capture from the web interface has priority over a capture from the API, same for authenticated user vs. anonymous."
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -64,6 +64,8 @@ class Lookyloo():
|
||||||
self.splash_url = get_config('generic', 'splash_url')
|
self.splash_url = get_config('generic', 'splash_url')
|
||||||
self.only_global_lookups: bool = get_config('generic', 'only_global_lookups')
|
self.only_global_lookups: bool = get_config('generic', 'only_global_lookups')
|
||||||
|
|
||||||
|
self._priority = get_config('generic', 'priority')
|
||||||
|
|
||||||
safe_create_dir(self.capture_dir)
|
safe_create_dir(self.capture_dir)
|
||||||
|
|
||||||
# Initialize 3rd party components
|
# Initialize 3rd party components
|
||||||
|
@ -93,6 +95,19 @@ class Lookyloo():
|
||||||
if not self.redis.exists('cache_loaded'):
|
if not self.redis.exists('cache_loaded'):
|
||||||
self._init_existing_dumps()
|
self._init_existing_dumps()
|
||||||
|
|
||||||
|
def _get_priority(self, source: str, user: str, authenticated: bool) -> int:
    '''Compute the priority of a new capture.

    The result is the sum of the priority of the source and the priority
    of the user, both read from the 'priority' configuration entry.

    :param source: Origin of the capture; looked up in the 'sources' table.
        A source missing from the table gets a priority of -1.
    :param user: Identifier of the submitter; looked up in the 'users'
        table when the submitter is authenticated.
    :param authenticated: Whether the submitter is authenticated.
    :return: Source priority + user priority.
    '''
    src_prio: int = self._priority['sources'].get(source, -1)
    users_prio = self._priority['users']
    if authenticated:
        # Test against None instead of truthiness: a user explicitly
        # configured with a priority of 0 must keep that 0 and not be
        # silently bumped to the default for authenticated users.
        usr_prio = users_prio.get(user)
        if usr_prio is None:
            usr_prio = users_prio['_default_auth']
    else:
        usr_prio = users_prio['_default_anon']
        # Reduce priority for anonymous users making lots of captures:
        # one point less for every 10 captures counted in 'queues'.
        queue_size = self.redis.zscore('queues', f'{source}|{authenticated}|{user}')
        if queue_size is not None:
            usr_prio -= int(queue_size / 10)
    return src_prio + usr_prio
|
||||||
|
|
||||||
def cache_user_agents(self, user_agent: str, remote_ip: str) -> None:
|
def cache_user_agents(self, user_agent: str, remote_ip: str) -> None:
|
||||||
'''Cache the useragents of the visitors'''
|
'''Cache the useragents of the visitors'''
|
||||||
today = date.today().isoformat()
|
today = date.today().isoformat()
|
||||||
|
@ -579,7 +594,7 @@ class Lookyloo():
|
||||||
return CaptureStatus.ONGOING
|
return CaptureStatus.ONGOING
|
||||||
return CaptureStatus.UNKNOWN
|
return CaptureStatus.UNKNOWN
|
||||||
|
|
||||||
def enqueue_capture(self, query: MutableMapping[str, Any]) -> str:
|
def enqueue_capture(self, query: MutableMapping[str, Any], source: str, user: str, authenticated: bool) -> str:
|
||||||
'''Enqueue a query in the capture queue (used by the UI and the API for asynchronous processing)'''
|
'''Enqueue a query in the capture queue (used by the UI and the API for asynchronous processing)'''
|
||||||
perma_uuid = str(uuid4())
|
perma_uuid = str(uuid4())
|
||||||
p = self.redis.pipeline()
|
p = self.redis.pipeline()
|
||||||
|
@ -590,7 +605,10 @@ class Lookyloo():
|
||||||
if isinstance(value, list):
|
if isinstance(value, list):
|
||||||
query[key] = json.dumps(value)
|
query[key] = json.dumps(value)
|
||||||
p.hmset(perma_uuid, query) # type: ignore
|
p.hmset(perma_uuid, query) # type: ignore
|
||||||
p.sadd('to_capture', perma_uuid)
|
priority = self._get_priority(source, user, authenticated)
|
||||||
|
p.zadd('to_capture', {perma_uuid: priority})
|
||||||
|
p.zincrby('queues', 1, f'{source}|{authenticated}|{user}')
|
||||||
|
p.set(f'{perma_uuid}_mgmt', f'{source}|{authenticated}|{user}')
|
||||||
p.execute()
|
p.execute()
|
||||||
return perma_uuid
|
return perma_uuid
|
||||||
|
|
||||||
|
@ -604,19 +622,28 @@ class Lookyloo():
|
||||||
self.logger.critical(f'Splash is not running, unable to process the capture queue: {message}')
|
self.logger.critical(f'Splash is not running, unable to process the capture queue: {message}')
|
||||||
return None
|
return None
|
||||||
|
|
||||||
uuid = self.redis.spop('to_capture')
|
value = self.redis.zpopmax('to_capture')
|
||||||
if not uuid:
|
if not value or not value[0]:
|
||||||
return None
|
return None
|
||||||
|
uuid, score = value[0]
|
||||||
|
queue = self.redis.get(f'{uuid}_mgmt')
|
||||||
self.redis.sadd('ongoing', uuid)
|
self.redis.sadd('ongoing', uuid)
|
||||||
|
|
||||||
|
lazy_cleanup = self.redis.pipeline()
|
||||||
|
lazy_cleanup.delete(f'{uuid}_mgmt')
|
||||||
|
lazy_cleanup.zincrby('queues', -1, queue)
|
||||||
|
|
||||||
to_capture: Dict[str, Union[str, int, float]] = self.redis.hgetall(uuid)
|
to_capture: Dict[str, Union[str, int, float]] = self.redis.hgetall(uuid)
|
||||||
to_capture['perma_uuid'] = uuid
|
to_capture['perma_uuid'] = uuid
|
||||||
if 'cookies' in to_capture:
|
if 'cookies' in to_capture:
|
||||||
to_capture['cookies_pseudofile'] = to_capture.pop('cookies')
|
to_capture['cookies_pseudofile'] = to_capture.pop('cookies')
|
||||||
|
|
||||||
status = self._capture(**to_capture) # type: ignore
|
status = self._capture(**to_capture) # type: ignore
|
||||||
self.redis.srem('ongoing', uuid)
|
lazy_cleanup.srem('ongoing', uuid)
|
||||||
self.redis.delete(uuid)
|
lazy_cleanup.delete(uuid)
|
||||||
|
# Make sure the key expires if nothing was processed for a while (= queues empty)
|
||||||
|
lazy_cleanup.expire('queues', 600)
|
||||||
|
lazy_cleanup.execute()
|
||||||
if status:
|
if status:
|
||||||
self.logger.info(f'Processed {to_capture["url"]}')
|
self.logger.info(f'Processed {to_capture["url"]}')
|
||||||
return True
|
return True
|
||||||
|
|
|
@ -189,6 +189,14 @@ app.jinja_env.globals.update(month_name=month_name)
|
||||||
|
|
||||||
# ##### Generic/configuration methods #####
|
# ##### Generic/configuration methods #####
|
||||||
|
|
||||||
|
def src_request_ip(request) -> str:
    '''Return the IP address the request originates from.

    :param request: Request object exposing `headers` and `remote_addr`.
    :return: The value of the X-Real-IP header when it is set and not
        blank, `request.remote_addr` otherwise.
    '''
    # NOTE: X-Real-IP is the IP passed by the reverse proxy in the headers.
    real_ip = request.headers.get('X-Real-IP')
    if real_ip:
        # A whitespace-only header must not be used as an address:
        # strip it so a blank value falls back to remote_addr below.
        real_ip = real_ip.strip()
    if not real_ip:
        # NOTE(review): remote_addr may presumably be None in some setups,
        # which the declared return type does not cover - confirm.
        real_ip = request.remote_addr
    return real_ip
|
||||||
|
|
||||||
|
|
||||||
@app.after_request
|
@app.after_request
|
||||||
def after_request(response):
|
def after_request(response):
|
||||||
# We keep a list user agents in order to build a list to use in the capture
|
# We keep a list user agents in order to build a list to use in the capture
|
||||||
|
@ -200,16 +208,9 @@ def after_request(response):
|
||||||
# The cache of IPs is deleted after the UA file is generated (see lookyloo.build_ua_file),
|
# The cache of IPs is deleted after the UA file is generated (see lookyloo.build_ua_file),
|
||||||
# once a day.
|
# once a day.
|
||||||
ua = request.headers.get('User-Agent')
|
ua = request.headers.get('User-Agent')
|
||||||
real_ip = request.headers.get('X-Real-IP')
|
real_ip = src_request_ip(request)
|
||||||
if ua:
|
if ua:
|
||||||
if real_ip:
|
lookyloo.cache_user_agents(ua, real_ip)
|
||||||
lookyloo.cache_user_agents(ua, real_ip)
|
|
||||||
else:
|
|
||||||
if request.remote_addr:
|
|
||||||
lookyloo.cache_user_agents(ua, request.remote_addr)
|
|
||||||
else:
|
|
||||||
# FIXME: That shouldn't happen, I guess, but mypy requires it.
|
|
||||||
pass
|
|
||||||
# Opt out of FLoC
|
# Opt out of FLoC
|
||||||
response.headers.set('Permissions-Policy', 'interest-cohort=()')
|
response.headers.set('Permissions-Policy', 'interest-cohort=()')
|
||||||
return response
|
return response
|
||||||
|
@ -433,6 +434,10 @@ def urls_rendered_page(tree_uuid: str):
|
||||||
|
|
||||||
@app.route('/bulk_captures/<string:base_tree_uuid>', methods=['POST'])
|
@app.route('/bulk_captures/<string:base_tree_uuid>', methods=['POST'])
|
||||||
def bulk_captures(base_tree_uuid: str):
|
def bulk_captures(base_tree_uuid: str):
|
||||||
|
if flask_login.current_user.is_authenticated:
|
||||||
|
user = flask_login.current_user.get_id()
|
||||||
|
else:
|
||||||
|
user = src_request_ip(request)
|
||||||
selected_urls = request.form.getlist('url')
|
selected_urls = request.form.getlist('url')
|
||||||
urls = lookyloo.get_urls_rendered_page(base_tree_uuid)
|
urls = lookyloo.get_urls_rendered_page(base_tree_uuid)
|
||||||
ct = lookyloo.get_crawled_tree(base_tree_uuid)
|
ct = lookyloo.get_crawled_tree(base_tree_uuid)
|
||||||
|
@ -445,7 +450,7 @@ def bulk_captures(base_tree_uuid: str):
|
||||||
'user_agent': ct.user_agent,
|
'user_agent': ct.user_agent,
|
||||||
'parent': base_tree_uuid
|
'parent': base_tree_uuid
|
||||||
}
|
}
|
||||||
new_capture_uuid = lookyloo.enqueue_capture(capture)
|
new_capture_uuid = lookyloo.enqueue_capture(capture, source='web', user=user, authenticated=flask_login.current_user.is_authenticated)
|
||||||
bulk_captures.append((new_capture_uuid, url))
|
bulk_captures.append((new_capture_uuid, url))
|
||||||
|
|
||||||
return render_template('bulk_captures.html', uuid=base_tree_uuid, bulk_captures=bulk_captures)
|
return render_template('bulk_captures.html', uuid=base_tree_uuid, bulk_captures=bulk_captures)
|
||||||
|
@ -655,10 +660,14 @@ def rebuild_cache():
|
||||||
return redirect(url_for('index'))
|
return redirect(url_for('index'))
|
||||||
|
|
||||||
|
|
||||||
@app.route('/submit', methods=['POST', 'GET'])
|
@app.route('/submit', methods=['POST'])
|
||||||
def submit():
|
def submit():
|
||||||
|
if flask_login.current_user.is_authenticated:
|
||||||
|
user = flask_login.current_user.get_id()
|
||||||
|
else:
|
||||||
|
user = src_request_ip(request)
|
||||||
to_query: Dict = request.get_json(force=True) # type: ignore
|
to_query: Dict = request.get_json(force=True) # type: ignore
|
||||||
perma_uuid = lookyloo.enqueue_capture(to_query)
|
perma_uuid = lookyloo.enqueue_capture(to_query, source='api', user=user, authenticated=flask_login.current_user.is_authenticated)
|
||||||
return Response(perma_uuid, mimetype='text/text')
|
return Response(perma_uuid, mimetype='text/text')
|
||||||
|
|
||||||
|
|
||||||
|
@ -679,6 +688,10 @@ def search():
|
||||||
@app.route('/capture', methods=['GET', 'POST'])
|
@app.route('/capture', methods=['GET', 'POST'])
|
||||||
def capture_web():
|
def capture_web():
|
||||||
if request.form.get('url'):
|
if request.form.get('url'):
|
||||||
|
if flask_login.current_user.is_authenticated:
|
||||||
|
user = flask_login.current_user.get_id()
|
||||||
|
else:
|
||||||
|
user = src_request_ip(request)
|
||||||
capture_query: Dict[str, Union[str, bytes, int, bool]] = {'url': request.form['url']}
|
capture_query: Dict[str, Union[str, bytes, int, bool]] = {'url': request.form['url']}
|
||||||
# check if the post request has the file part
|
# check if the post request has the file part
|
||||||
if 'cookies' in request.files and request.files['cookies'].filename:
|
if 'cookies' in request.files and request.files['cookies'].filename:
|
||||||
|
@ -698,7 +711,7 @@ def capture_web():
|
||||||
if request.form.get('referer'):
|
if request.form.get('referer'):
|
||||||
capture_query['referer'] = request.form['referer']
|
capture_query['referer'] = request.form['referer']
|
||||||
|
|
||||||
perma_uuid = lookyloo.enqueue_capture(capture_query)
|
perma_uuid = lookyloo.enqueue_capture(capture_query, source='web', user=user, authenticated=flask_login.current_user.is_authenticated)
|
||||||
time.sleep(30)
|
time.sleep(30)
|
||||||
return redirect(url_for('tree', tree_uuid=perma_uuid))
|
return redirect(url_for('tree', tree_uuid=perma_uuid))
|
||||||
user_agents: Dict[str, Any] = {}
|
user_agents: Dict[str, Any] = {}
|
||||||
|
|
Loading…
Reference in New Issue