chg: Rename scrape -> capture everywhere

Related to #118
pull/122/head
Raphaël Vinot 2020-11-03 16:32:04 +01:00
parent 2f1a0f5da8
commit 99c939fd7f
5 changed files with 30 additions and 30 deletions


@@ -47,7 +47,7 @@ class Lookyloo():
self.taxonomies = get_taxonomies()
self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
-self.scrape_dir: Path = get_homedir() / 'scraped'
+self.capture_dir: Path = get_homedir() / 'scraped'
if os.environ.get('SPLASH_URL_DOCKER'):
# In order to have a working default for the docker image, it is easier to use an environment variable
self.splash_url: str = os.environ['SPLASH_URL_DOCKER']
@@ -55,7 +55,7 @@ class Lookyloo():
self.splash_url = get_config('generic', 'splash_url')
self.only_global_lookups: bool = get_config('generic', 'only_global_lookups')
-safe_create_dir(self.scrape_dir)
+safe_create_dir(self.capture_dir)
# Initialize 3rd party components
self.pi = PhishingInitiative(get_config('modules', 'PhishingInitiative'))
@@ -504,15 +504,15 @@ class Lookyloo():
@property
def capture_dirs(self) -> List[Path]:
-for capture_dir in self.scrape_dir.iterdir():
+for capture_dir in self.capture_dir.iterdir():
if capture_dir.is_dir() and not capture_dir.iterdir():
-# Cleanup self.scrape_dir of failed runs.
+# Cleanup self.capture_dir of failed runs.
capture_dir.rmdir()
if not (capture_dir / 'uuid').exists():
# Create uuid if missing
with (capture_dir / 'uuid').open('w') as f:
f.write(str(uuid4()))
-return sorted(self.scrape_dir.iterdir(), reverse=True)
+return sorted(self.capture_dir.iterdir(), reverse=True)
def lookup_capture_dir(self, capture_uuid: str) -> Union[Path, None]:
capture_dir: str = self.redis.hget('lookup_dirs', capture_uuid) # type: ignore
@@ -520,7 +520,7 @@ class Lookyloo():
return Path(capture_dir)
return None
-def enqueue_scrape(self, query: MutableMapping[str, Any]) -> str:
+def enqueue_capture(self, query: MutableMapping[str, Any]) -> str:
perma_uuid = str(uuid4())
p = self.redis.pipeline()
for key, value in query.items():
@@ -528,19 +528,19 @@ class Lookyloo():
# Yes, empty string because that's False.
query[key] = 1 if value else ''
p.hmset(perma_uuid, query)
-p.sadd('to_scrape', perma_uuid)
+p.sadd('to_capture', perma_uuid)
p.execute()
return perma_uuid
-def process_scrape_queue(self) -> Union[bool, None]:
-uuid = self.redis.spop('to_scrape')
+def process_capture_queue(self) -> Union[bool, None]:
+uuid = self.redis.spop('to_capture')
if not uuid:
return None
-to_scrape: Dict[str, Union[str, int, float]] = self.redis.hgetall(uuid) # type: ignore
+to_capture: Dict[str, Union[str, int, float]] = self.redis.hgetall(uuid) # type: ignore
self.redis.delete(uuid)
-to_scrape['perma_uuid'] = uuid
-if self.scrape(**to_scrape): # type: ignore
-self.logger.info(f'Processed {to_scrape["url"]}')
+to_capture['perma_uuid'] = uuid
+if self.capture(**to_capture): # type: ignore
+self.logger.info(f'Processed {to_capture["url"]}')
return True
return False
@@ -638,7 +638,7 @@ class Lookyloo():
def get_capture(self, capture_uuid: str) -> BytesIO:
return self._get_raw(capture_uuid)
-def scrape(self, url: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
+def capture(self, url: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
depth: int=1, listing: bool=True, user_agent: Optional[str]=None,
referer: str='', perma_uuid: Optional[str]=None, os: Optional[str]=None,
browser: Optional[str]=None) -> Union[bool, str]:
@@ -668,7 +668,7 @@ class Lookyloo():
ua = user_agent
if int(depth) > int(get_config('generic', 'max_depth')):
-self.logger.warning(f'Not allowed to scrape on a depth higher than {get_config("generic", "max_depth")}: {depth}')
+self.logger.warning(f'Not allowed to capture on a depth higher than {get_config("generic", "max_depth")}: {depth}')
depth = int(get_config('generic', 'max_depth'))
items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=ua,
referer=referer, log_enabled=True, log_level=get_config('generic', 'splash_loglevel'))
@@ -678,7 +678,7 @@ class Lookyloo():
if not perma_uuid:
perma_uuid = str(uuid4())
width = len(str(len(items)))
-dirpath = self.scrape_dir / datetime.now().isoformat()
+dirpath = self.capture_dir / datetime.now().isoformat()
safe_create_dir(dirpath)
for i, item in enumerate(items):
if not listing: # Write no_index marker
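
After this rename, a caller enqueues work with enqueue_capture() and a background worker drains the queue with process_capture_queue(). A minimal sketch of that flow, assuming the class is importable from lookyloo/lookyloo.py, that the constructor takes no arguments, and that Redis is reachable on the configured socket (the query keys shown are illustrative):

from lookyloo.lookyloo import Lookyloo  # import path is an assumption

lookyloo = Lookyloo()

# Keys mirror the parameters of capture(): url, depth, listing, user_agent, referer, os, browser.
perma_uuid = lookyloo.enqueue_capture({'url': 'https://example.com', 'depth': 1, 'listing': True})

# A worker loop pops UUIDs from the 'to_capture' set and runs capture().
# process_capture_queue() returns None when the queue is empty and False when a
# capture fails, so the loop stops in either case.
while lookyloo.process_capture_queue():
    pass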


@@ -448,12 +448,12 @@ def rebuild_cache():
@app.route('/submit', methods=['POST', 'GET'])
def submit():
to_query = request.get_json(force=True)
-perma_uuid = lookyloo.enqueue_scrape(to_query)
+perma_uuid = lookyloo.enqueue_capture(to_query)
return Response(perma_uuid, mimetype='text/text')
-@app.route('/scrape', methods=['GET', 'POST'])
-def scrape_web():
+@app.route('/capture', methods=['GET', 'POST'])
+def capture_web():
if request.form.get('url'):
# check if the post request has the file part
if 'cookies' in request.files and request.files['cookies'].filename:
@@ -464,11 +464,11 @@ def scrape_web():
if url:
depth: int = request.form.get('depth') if request.form.get('depth') else 1 # type: ignore
listing: bool = request.form.get('listing') if request.form.get('listing') else False # type: ignore
-perma_uuid = lookyloo.scrape(url=url, cookies_pseudofile=cookie_file,
-depth=depth, listing=listing,
-user_agent=request.form.get('user_agent'),
-referer=request.form.get('referer'), # type: ignore
-os=request.form.get('os'), browser=request.form.get('browser'))
+perma_uuid = lookyloo.capture(url=url, cookies_pseudofile=cookie_file,
+depth=depth, listing=listing,
+user_agent=request.form.get('user_agent'),
+referer=request.form.get('referer'), # type: ignore
+os=request.form.get('os'), browser=request.form.get('browser'))
return redirect(url_for('tree', tree_uuid=perma_uuid))
user_agents: Dict[str, Any] = {}
if get_config('generic', 'use_user_agents_users'):
@@ -478,7 +478,7 @@ def scrape_web():
if not user_agents:
user_agents = get_user_agents()
user_agents.pop('by_frequency')
-return render_template('scrape.html', user_agents=user_agents)
+return render_template('capture.html', user_agents=user_agents)
@app.route('/cookies/<string:cookie_name>', methods=['GET'])
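
For API clients nothing changes in the /submit route itself; the JSON body is still handed straight to the queue, now via enqueue_capture(). A quick client-side sketch, assuming a Lookyloo instance at the hypothetical https://lookyloo.example:

import requests

# The JSON keys are forwarded as-is to enqueue_capture(), so they should match
# the parameters of Lookyloo.capture(); 'url' is the only one strictly required.
r = requests.post('https://lookyloo.example/submit',
                  json={'url': 'https://example.com', 'listing': True})
perma_uuid = r.text  # the endpoint returns the permanent UUID as plain text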


@@ -1,5 +1,5 @@
{% extends "main.html" %}
-{% block title %}Scrape{% endblock %}
+{% block title %}Capture{% endblock %}
{% block content %}
<div class="container">
@@ -8,7 +8,7 @@
alt="Lookyloo" width="400">
</center>
</br>
<form role="form" action="{{ url_for('scrape_web') }}" method=post enctype=multipart/form-data>
<form role="form" action="{{ url_for('capture_web') }}" method=post enctype=multipart/form-data>
<div class="form-group row">
<div class="col-sm-10">
<div class="form-check">
@@ -108,5 +108,5 @@
{% block scripts %}
{{ super() }}
-<script src='{{ url_for('static', filename='scrape.js') }}'></script>
+<script src='{{ url_for('static', filename='capture.js') }}'></script>
{% endblock %}


@@ -39,13 +39,13 @@ $(document).ready(function () {
{% block content %}
<center>
<a href="{{ url_for('scrape_web') }}">
<a href="{{ url_for('capture_web') }}">
<img src="{{ url_for('static', filename='lookyloo.jpeg') }}"
alt="Lookyloo" width="200">
</a>
</center>
<center>
-<h2><a href="{{ url_for('scrape_web') }}">Start a new capture</a></h2>
+<h2><a href="{{ url_for('capture_web') }}">Start a new capture</a></h2>
<br><br>
{{ render_messages(container=True, dismissible=True) }}
</center>