mirror of https://github.com/CIRCL/lookyloo
parent 2f1a0f5da8
commit 99c939fd7f
@@ -47,7 +47,7 @@ class Lookyloo():
         self.taxonomies = get_taxonomies()
 
         self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
-        self.scrape_dir: Path = get_homedir() / 'scraped'
+        self.capture_dir: Path = get_homedir() / 'scraped'
         if os.environ.get('SPLASH_URL_DOCKER'):
             # In order to have a working default for the docker image, it is easier to use an environment variable
             self.splash_url: str = os.environ['SPLASH_URL_DOCKER']
@@ -55,7 +55,7 @@ class Lookyloo():
             self.splash_url = get_config('generic', 'splash_url')
         self.only_global_lookups: bool = get_config('generic', 'only_global_lookups')
 
-        safe_create_dir(self.scrape_dir)
+        safe_create_dir(self.capture_dir)
 
         # Initialize 3rd party components
         self.pi = PhishingInitiative(get_config('modules', 'PhishingInitiative'))
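These two hunks keep the Splash URL configurable in both deployment modes: the `SPLASH_URL_DOCKER` environment variable wins when it is set (the docker image case), otherwise the value comes from the generic config file. A minimal sketch of that fallback pattern, assuming a hypothetical default Splash endpoint:

```python
import os

def resolve_splash_url(config_value: str = 'http://127.0.0.1:8050') -> str:
    # The environment variable set for the docker image takes precedence;
    # otherwise fall back to the value from the generic config file.
    # 'http://127.0.0.1:8050' is only an illustrative default here.
    return os.environ.get('SPLASH_URL_DOCKER', config_value)
```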
@@ -504,15 +504,15 @@ class Lookyloo():
 
     @property
     def capture_dirs(self) -> List[Path]:
-        for capture_dir in self.scrape_dir.iterdir():
+        for capture_dir in self.capture_dir.iterdir():
             if capture_dir.is_dir() and not capture_dir.iterdir():
-                # Cleanup self.scrape_dir of failed runs.
+                # Cleanup self.capture_dir of failed runs.
                 capture_dir.rmdir()
             if not (capture_dir / 'uuid').exists():
                 # Create uuid if missing
                 with (capture_dir / 'uuid').open('w') as f:
                     f.write(str(uuid4()))
-        return sorted(self.scrape_dir.iterdir(), reverse=True)
+        return sorted(self.capture_dir.iterdir(), reverse=True)
 
     def lookup_capture_dir(self, capture_uuid: str) -> Union[Path, None]:
         capture_dir: str = self.redis.hget('lookup_dirs', capture_uuid)  # type: ignore
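Renaming `self.scrape_dir` to `self.capture_dir` ripples through `capture_dirs` above. Since each capture directory is named with `datetime.now().isoformat()`, the ISO 8601 names sort lexicographically in chronological order, so `sorted(..., reverse=True)` returns the newest capture first without parsing any dates. A quick illustration with hypothetical paths:

```python
from pathlib import Path

dirs = [Path('scraped/2020-01-06T14:21:09.331166'),
        Path('scraped/2019-12-30T09:02:11.004218'),
        Path('scraped/2020-01-06T08:15:43.902534')]
# ISO 8601 timestamps sort lexicographically == chronologically,
# so reverse=True puts the most recent capture first.
print(sorted(dirs, reverse=True)[0])  # scraped/2020-01-06T14:21:09.331166
```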
@@ -520,7 +520,7 @@ class Lookyloo():
             return Path(capture_dir)
         return None
 
-    def enqueue_scrape(self, query: MutableMapping[str, Any]) -> str:
+    def enqueue_capture(self, query: MutableMapping[str, Any]) -> str:
         perma_uuid = str(uuid4())
         p = self.redis.pipeline()
         for key, value in query.items():
@@ -528,19 +528,19 @@ class Lookyloo():
                 # Yes, empty string because that's False.
                 query[key] = 1 if value else ''
         p.hmset(perma_uuid, query)
-        p.sadd('to_scrape', perma_uuid)
+        p.sadd('to_capture', perma_uuid)
         p.execute()
         return perma_uuid
 
-    def process_scrape_queue(self) -> Union[bool, None]:
-        uuid = self.redis.spop('to_scrape')
+    def process_capture_queue(self) -> Union[bool, None]:
+        uuid = self.redis.spop('to_capture')
         if not uuid:
             return None
-        to_scrape: Dict[str, Union[str, int, float]] = self.redis.hgetall(uuid)  # type: ignore
+        to_capture: Dict[str, Union[str, int, float]] = self.redis.hgetall(uuid)  # type: ignore
         self.redis.delete(uuid)
-        to_scrape['perma_uuid'] = uuid
-        if self.scrape(**to_scrape):  # type: ignore
-            self.logger.info(f'Processed {to_scrape["url"]}')
+        to_capture['perma_uuid'] = uuid
+        if self.capture(**to_capture):  # type: ignore
+            self.logger.info(f'Processed {to_capture["url"]}')
             return True
         return False
 
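The queue round-trip is plain Redis: `enqueue_capture` stores the request as a hash keyed by a fresh UUID and adds that UUID to the `to_capture` set; `process_capture_queue` pops one UUID, rehydrates the hash, and hands it to `capture()`. A self-contained sketch of the same flow, assuming a local Redis server and `redis-py` (the `print` stands in for the real `capture()` call):

```python
from typing import Dict
from uuid import uuid4
from redis import Redis

r = Redis(decode_responses=True)

def enqueue(query: Dict[str, str]) -> str:
    perma_uuid = str(uuid4())
    p = r.pipeline()
    p.hmset(perma_uuid, query)        # store the capture request as a hash
    p.sadd('to_capture', perma_uuid)  # register the UUID in the work queue
    p.execute()
    return perma_uuid

def process_one() -> bool:
    uuid = r.spop('to_capture')  # pop any pending capture UUID
    if not uuid:
        return False
    query = r.hgetall(uuid)
    r.delete(uuid)
    print(f"would capture {query['url']} as {uuid}")  # capture() in the real code
    return True
```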
@@ -638,7 +638,7 @@ class Lookyloo():
     def get_capture(self, capture_uuid: str) -> BytesIO:
         return self._get_raw(capture_uuid)
 
-    def scrape(self, url: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
+    def capture(self, url: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
                depth: int=1, listing: bool=True, user_agent: Optional[str]=None,
                referer: str='', perma_uuid: Optional[str]=None, os: Optional[str]=None,
                browser: Optional[str]=None) -> Union[bool, str]:
@@ -668,7 +668,7 @@ class Lookyloo():
             ua = user_agent
 
         if int(depth) > int(get_config('generic', 'max_depth')):
-            self.logger.warning(f'Not allowed to scrape on a depth higher than {get_config("generic", "max_depth")}: {depth}')
+            self.logger.warning(f'Not allowed to capture on a depth higher than {get_config("generic", "max_depth")}: {depth}')
             depth = int(get_config('generic', 'max_depth'))
         items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=ua,
                       referer=referer, log_enabled=True, log_level=get_config('generic', 'splash_loglevel'))
@@ -678,7 +678,7 @@ class Lookyloo():
         if not perma_uuid:
             perma_uuid = str(uuid4())
         width = len(str(len(items)))
-        dirpath = self.scrape_dir / datetime.now().isoformat()
+        dirpath = self.capture_dir / datetime.now().isoformat()
         safe_create_dir(dirpath)
         for i, item in enumerate(items):
             if not listing:  # Write no_index marker
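Inside `capture()`, `width = len(str(len(items)))` computes how many digits are needed to zero-pad the per-item names so they sort correctly on disk, and every run lands in a fresh timestamped directory under `self.capture_dir`. A sketch of that naming scheme (the item values are illustrative, not the real on-disk layout):

```python
from datetime import datetime
from pathlib import Path

items = list(range(12))       # placeholder for the crawled items
width = len(str(len(items)))  # 12 items -> 2 digits of padding
dirpath = Path('scraped') / datetime.now().isoformat()
for i, _item in enumerate(items):
    # e.g. scraped/2020-01-06T14:21:09.331166/00 ... /11
    print(dirpath / f'{i:0{width}}')
```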
@@ -448,12 +448,12 @@ def rebuild_cache():
 @app.route('/submit', methods=['POST', 'GET'])
 def submit():
     to_query = request.get_json(force=True)
-    perma_uuid = lookyloo.enqueue_scrape(to_query)
+    perma_uuid = lookyloo.enqueue_capture(to_query)
     return Response(perma_uuid, mimetype='text/text')
 
 
-@app.route('/scrape', methods=['GET', 'POST'])
-def scrape_web():
+@app.route('/capture', methods=['GET', 'POST'])
+def capture_web():
     if request.form.get('url'):
         # check if the post request has the file part
         if 'cookies' in request.files and request.files['cookies'].filename:
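After the rename, `/submit` still accepts a JSON body and answers with the permanent UUID of the enqueued capture as plain text. A hedged client-side example; the instance URL and the accepted fields are assumptions based on the handler above:

```python
import requests

# Hypothetical local instance; /submit forwards the JSON body to enqueue_capture().
r = requests.post('http://127.0.0.1:5100/submit',
                  json={'url': 'https://www.example.com', 'depth': 1, 'listing': True})
print('capture uuid:', r.text)
```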
@@ -464,11 +464,11 @@ def scrape_web():
         if url:
             depth: int = request.form.get('depth') if request.form.get('depth') else 1  # type: ignore
             listing: bool = request.form.get('listing') if request.form.get('listing') else False  # type: ignore
-            perma_uuid = lookyloo.scrape(url=url, cookies_pseudofile=cookie_file,
+            perma_uuid = lookyloo.capture(url=url, cookies_pseudofile=cookie_file,
                                          depth=depth, listing=listing,
                                          user_agent=request.form.get('user_agent'),
                                          referer=request.form.get('referer'),  # type: ignore
                                          os=request.form.get('os'), browser=request.form.get('browser'))
             return redirect(url_for('tree', tree_uuid=perma_uuid))
     user_agents: Dict[str, Any] = {}
     if get_config('generic', 'use_user_agents_users'):
@@ -478,7 +478,7 @@ def scrape_web():
     if not user_agents:
         user_agents = get_user_agents()
     user_agents.pop('by_frequency')
-    return render_template('scrape.html', user_agents=user_agents)
+    return render_template('capture.html', user_agents=user_agents)
 
 
 @app.route('/cookies/<string:cookie_name>', methods=['GET'])
@@ -1,5 +1,5 @@
 {% extends "main.html" %}
-{% block title %}Scrape{% endblock %}
+{% block title %}Capture{% endblock %}
 
 {% block content %}
 <div class="container">
@@ -8,7 +8,7 @@
          alt="Lookyloo" width="400">
   </center>
   </br>
-  <form role="form" action="{{ url_for('scrape_web') }}" method=post enctype=multipart/form-data>
+  <form role="form" action="{{ url_for('capture_web') }}" method=post enctype=multipart/form-data>
     <div class="form-group row">
       <div class="col-sm-10">
         <div class="form-check">
@@ -108,5 +108,5 @@
 
 {% block scripts %}
   {{ super() }}
-  <script src='{{ url_for('static', filename='scrape.js') }}'></script>
+  <script src='{{ url_for('static', filename='capture.js') }}'></script>
 {% endblock %}
@@ -39,13 +39,13 @@ $(document).ready(function () {
 
 {% block content %}
 <center>
-    <a href="{{ url_for('scrape_web') }}">
+    <a href="{{ url_for('capture_web') }}">
       <img src="{{ url_for('static', filename='lookyloo.jpeg') }}"
           alt="Lookyloo" width="200">
     </a>
 </center>
 <center>
-    <h2><a href="{{ url_for('scrape_web') }}">Start a new capture</a></h2>
+    <h2><a href="{{ url_for('capture_web') }}">Start a new capture</a></h2>
     <br><br>
     {{ render_messages(container=True, dismissible=True) }}
 </center>