mirror of https://github.com/CIRCL/lookyloo
parent 2f1a0f5da8
commit 99c939fd7f
@@ -47,7 +47,7 @@ class Lookyloo():
         self.taxonomies = get_taxonomies()

         self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
-        self.scrape_dir: Path = get_homedir() / 'scraped'
+        self.capture_dir: Path = get_homedir() / 'scraped'
         if os.environ.get('SPLASH_URL_DOCKER'):
             # In order to have a working default for the docker image, it is easier to use an environment variable
             self.splash_url: str = os.environ['SPLASH_URL_DOCKER']
@@ -55,7 +55,7 @@ class Lookyloo():
         self.splash_url = get_config('generic', 'splash_url')
         self.only_global_lookups: bool = get_config('generic', 'only_global_lookups')

-        safe_create_dir(self.scrape_dir)
+        safe_create_dir(self.capture_dir)

         # Initialize 3rd party components
         self.pi = PhishingInitiative(get_config('modules', 'PhishingInitiative'))
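The two constructor hunks above only rename self.scrape_dir to self.capture_dir; the directory on disk is still get_homedir() / 'scraped', and the Splash URL still comes from the SPLASH_URL_DOCKER environment variable when set, with the config file as fallback. A minimal standalone sketch of that resolution order (the helper names and the default URL below are placeholders, not Lookyloo's own get_config / get_homedir):

import os
from pathlib import Path


def resolve_splash_url(config_value: str = 'http://127.0.0.1:8050') -> str:
    # The docker image exports SPLASH_URL_DOCKER; when present it overrides the config value.
    return os.environ.get('SPLASH_URL_DOCKER') or config_value


def resolve_capture_dir(homedir: Path) -> Path:
    # Renamed attribute, same on-disk location: captures still live under <homedir>/scraped.
    capture_dir = homedir / 'scraped'
    capture_dir.mkdir(parents=True, exist_ok=True)
    return capture_dir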
@@ -504,15 +504,15 @@ class Lookyloo():

     @property
     def capture_dirs(self) -> List[Path]:
-        for capture_dir in self.scrape_dir.iterdir():
+        for capture_dir in self.capture_dir.iterdir():
             if capture_dir.is_dir() and not capture_dir.iterdir():
-                # Cleanup self.scrape_dir of failed runs.
+                # Cleanup self.capture_dir of failed runs.
                 capture_dir.rmdir()
             if not (capture_dir / 'uuid').exists():
                 # Create uuid if missing
                 with (capture_dir / 'uuid').open('w') as f:
                     f.write(str(uuid4()))
-        return sorted(self.scrape_dir.iterdir(), reverse=True)
+        return sorted(self.capture_dir.iterdir(), reverse=True)

     def lookup_capture_dir(self, capture_uuid: str) -> Union[Path, None]:
         capture_dir: str = self.redis.hget('lookup_dirs', capture_uuid)  # type: ignore
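capture_dirs keeps its maintenance logic and only switches to the renamed attribute: empty directories left behind by failed runs are removed, and a uuid file is created for any capture directory that lacks one before the sorted listing is returned. A hedged standalone sketch of that pattern over an arbitrary capture directory:

from pathlib import Path
from typing import List
from uuid import uuid4


def list_capture_dirs(capture_dir: Path) -> List[Path]:
    for entry in capture_dir.iterdir():
        if not entry.is_dir():
            continue
        if not any(entry.iterdir()):
            # A failed run left an empty directory behind: clean it up.
            entry.rmdir()
        elif not (entry / 'uuid').exists():
            # Backfill the uuid file for captures that predate it.
            (entry / 'uuid').write_text(str(uuid4()))
    # Newest first, since the directory names are ISO timestamps.
    return sorted(capture_dir.iterdir(), reverse=True)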
@@ -520,7 +520,7 @@ class Lookyloo():
             return Path(capture_dir)
         return None

-    def enqueue_scrape(self, query: MutableMapping[str, Any]) -> str:
+    def enqueue_capture(self, query: MutableMapping[str, Any]) -> str:
         perma_uuid = str(uuid4())
         p = self.redis.pipeline()
         for key, value in query.items():
@@ -528,19 +528,19 @@ class Lookyloo():
                 # Yes, empty string because that's False.
                 query[key] = 1 if value else ''
         p.hmset(perma_uuid, query)
-        p.sadd('to_scrape', perma_uuid)
+        p.sadd('to_capture', perma_uuid)
         p.execute()
         return perma_uuid

-    def process_scrape_queue(self) -> Union[bool, None]:
-        uuid = self.redis.spop('to_scrape')
+    def process_capture_queue(self) -> Union[bool, None]:
+        uuid = self.redis.spop('to_capture')
         if not uuid:
             return None
-        to_scrape: Dict[str, Union[str, int, float]] = self.redis.hgetall(uuid)  # type: ignore
+        to_capture: Dict[str, Union[str, int, float]] = self.redis.hgetall(uuid)  # type: ignore
         self.redis.delete(uuid)
-        to_scrape['perma_uuid'] = uuid
-        if self.scrape(**to_scrape):  # type: ignore
-            self.logger.info(f'Processed {to_scrape["url"]}')
+        to_capture['perma_uuid'] = uuid
+        if self.capture(**to_capture):  # type: ignore
+            self.logger.info(f'Processed {to_capture["url"]}')
             return True
         return False

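The queue itself is also renamed: enqueue_capture stores the request as a Redis hash keyed by the permanent UUID (booleans are converted to 1 or '' first, since Redis hashes only hold strings and numbers) and adds that UUID to the to_capture set, previously to_scrape; process_capture_queue pops one UUID, reads the hash back and deletes it before launching the capture. A minimal sketch of that round trip against a local Redis, using hypothetical helpers rather than the class methods themselves:

from uuid import uuid4

from redis import Redis

r = Redis(decode_responses=True)


def enqueue(query: dict) -> str:
    perma_uuid = str(uuid4())
    p = r.pipeline()
    p.hmset(perma_uuid, query)        # one hash per queued request (newer redis-py prefers hset(..., mapping=query))
    p.sadd('to_capture', perma_uuid)  # set of UUIDs waiting to be captured
    p.execute()
    return perma_uuid


def process_one() -> bool:
    uuid = r.spop('to_capture')
    if not uuid:
        return False                  # nothing queued
    query = r.hgetall(uuid)
    r.delete(uuid)                    # the hash is only needed until the capture starts
    print(f"would capture {query.get('url')}")
    return True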
@@ -638,7 +638,7 @@ class Lookyloo():
     def get_capture(self, capture_uuid: str) -> BytesIO:
         return self._get_raw(capture_uuid)

-    def scrape(self, url: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
+    def capture(self, url: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
                depth: int=1, listing: bool=True, user_agent: Optional[str]=None,
                referer: str='', perma_uuid: Optional[str]=None, os: Optional[str]=None,
                browser: Optional[str]=None) -> Union[bool, str]:
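Only the public method name changes; capture() keeps the exact signature the old scrape() had. A hypothetical call, assuming a configured install with Redis and Splash running (the import path and constructor arguments may differ between versions):

# Hypothetical usage sketch; assumes a configured Lookyloo instance,
# a running Redis and a reachable Splash.
from lookyloo.lookyloo import Lookyloo

lookyloo = Lookyloo()
perma_uuid = lookyloo.capture(url='https://www.circl.lu', depth=1, listing=True,
                              user_agent='Mozilla/5.0', referer='')
print(perma_uuid)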
@@ -668,7 +668,7 @@ class Lookyloo():
             ua = user_agent

         if int(depth) > int(get_config('generic', 'max_depth')):
-            self.logger.warning(f'Not allowed to scrape on a depth higher than {get_config("generic", "max_depth")}: {depth}')
+            self.logger.warning(f'Not allowed to capture on a depth higher than {get_config("generic", "max_depth")}: {depth}')
             depth = int(get_config('generic', 'max_depth'))
         items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=ua,
                       referer=referer, log_enabled=True, log_level=get_config('generic', 'splash_loglevel'))
@@ -678,7 +678,7 @@ class Lookyloo():
         if not perma_uuid:
             perma_uuid = str(uuid4())
         width = len(str(len(items)))
-        dirpath = self.scrape_dir / datetime.now().isoformat()
+        dirpath = self.capture_dir / datetime.now().isoformat()
         safe_create_dir(dirpath)
         for i, item in enumerate(items):
             if not listing:  # Write no_index marker
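Each capture still gets its own directory named after the submission time (datetime.now().isoformat()) under the renamed capture_dir, and width is the zero-padding needed so per-item subdirectories sort in crawl order. A short sketch of both conventions, with the write details omitted:

from datetime import datetime
from pathlib import Path


def new_capture_dir(capture_dir: Path) -> Path:
    # One directory per capture, named after the submission timestamp.
    dirpath = capture_dir / datetime.now().isoformat()
    dirpath.mkdir(parents=True, exist_ok=True)
    return dirpath


def item_prefix(index: int, total: int) -> str:
    # Zero-pad to len(str(total)) digits so listings sort in crawl order,
    # e.g. '03' when there are 12 items in the crawl.
    return f'{index:0{len(str(total))}d}'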
@@ -448,12 +448,12 @@ def rebuild_cache():
 @app.route('/submit', methods=['POST', 'GET'])
 def submit():
     to_query = request.get_json(force=True)
-    perma_uuid = lookyloo.enqueue_scrape(to_query)
+    perma_uuid = lookyloo.enqueue_capture(to_query)
     return Response(perma_uuid, mimetype='text/text')


-@app.route('/scrape', methods=['GET', 'POST'])
-def scrape_web():
+@app.route('/capture', methods=['GET', 'POST'])
+def capture_web():
     if request.form.get('url'):
         # check if the post request has the file part
         if 'cookies' in request.files and request.files['cookies'].filename:
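The /submit endpoint keeps its JSON contract and simply calls the renamed enqueue_capture; the response body is the permanent UUID as plain text. A hypothetical client call, assuming a local web instance (host and port are assumptions, not part of the diff):

import requests  # third-party; pip install requests

# Assumed local instance; adjust host/port for a real deployment.
resp = requests.post('http://127.0.0.1:5100/submit',
                     json={'url': 'https://www.circl.lu', 'depth': 1, 'listing': True})
perma_uuid = resp.text  # plain-text UUID, usable to build the /tree/<uuid> URL
print(perma_uuid)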
@@ -464,11 +464,11 @@ def scrape_web():
         if url:
             depth: int = request.form.get('depth') if request.form.get('depth') else 1  # type: ignore
             listing: bool = request.form.get('listing') if request.form.get('listing') else False  # type: ignore
-            perma_uuid = lookyloo.scrape(url=url, cookies_pseudofile=cookie_file,
-                                         depth=depth, listing=listing,
-                                         user_agent=request.form.get('user_agent'),
-                                         referer=request.form.get('referer'),  # type: ignore
-                                         os=request.form.get('os'), browser=request.form.get('browser'))
+            perma_uuid = lookyloo.capture(url=url, cookies_pseudofile=cookie_file,
+                                          depth=depth, listing=listing,
+                                          user_agent=request.form.get('user_agent'),
+                                          referer=request.form.get('referer'),  # type: ignore
+                                          os=request.form.get('os'), browser=request.form.get('browser'))
             return redirect(url_for('tree', tree_uuid=perma_uuid))
     user_agents: Dict[str, Any] = {}
     if get_config('generic', 'use_user_agents_users'):
@@ -478,7 +478,7 @@ def scrape_web():
     if not user_agents:
         user_agents = get_user_agents()
         user_agents.pop('by_frequency')
-    return render_template('scrape.html', user_agents=user_agents)
+    return render_template('capture.html', user_agents=user_agents)


 @app.route('/cookies/<string:cookie_name>', methods=['GET'])
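On the web side, the form handler moves from /scrape to /capture but keeps the same form fields; on success it redirects to the tree view for the new permanent UUID, and otherwise it renders the renamed capture.html template. The same submission can be scripted with form-encoded data; a hedged sketch against an assumed local instance:

import requests  # third-party; pip install requests

# Field names taken from the handler above; host and port are assumptions.
resp = requests.post('http://127.0.0.1:5100/capture',
                     data={'url': 'https://www.circl.lu', 'depth': 1, 'listing': 'true'},
                     allow_redirects=False)
# On success the handler answers with a redirect to /tree/<perma_uuid>.
print(resp.status_code, resp.headers.get('Location'))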
@@ -1,5 +1,5 @@
 {% extends "main.html" %}
-{% block title %}Scrape{% endblock %}
+{% block title %}Capture{% endblock %}

 {% block content %}
 <div class="container">
@@ -8,7 +8,7 @@
          alt="Lookyloo" width="400">
   </center>
   </br>
-  <form role="form" action="{{ url_for('scrape_web') }}" method=post enctype=multipart/form-data>
+  <form role="form" action="{{ url_for('capture_web') }}" method=post enctype=multipart/form-data>
     <div class="form-group row">
       <div class="col-sm-10">
         <div class="form-check">
@@ -108,5 +108,5 @@

 {% block scripts %}
   {{ super() }}
-  <script src='{{ url_for('static', filename='scrape.js') }}'></script>
+  <script src='{{ url_for('static', filename='capture.js') }}'></script>
 {% endblock %}
@@ -39,13 +39,13 @@ $(document).ready(function () {

 {% block content %}
 <center>
-  <a href="{{ url_for('scrape_web') }}">
+  <a href="{{ url_for('capture_web') }}">
     <img src="{{ url_for('static', filename='lookyloo.jpeg') }}"
          alt="Lookyloo" width="200">
   </a>
 </center>
 <center>
-  <h2><a href="{{ url_for('scrape_web') }}">Start a new capture</a></h2>
+  <h2><a href="{{ url_for('capture_web') }}">Start a new capture</a></h2>
   <br><br>
   {{ render_messages(container=True, dismissible=True) }}
 </center>