chg: Rename scrape -> capture everywhere

Related to #118
pull/122/head
Raphaël Vinot 2020-11-03 16:32:04 +01:00
parent 2f1a0f5da8
commit 99c939fd7f
5 changed files with 30 additions and 30 deletions

View File

@@ -47,7 +47,7 @@ class Lookyloo():
         self.taxonomies = get_taxonomies()
         self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
-        self.scrape_dir: Path = get_homedir() / 'scraped'
+        self.capture_dir: Path = get_homedir() / 'scraped'
         if os.environ.get('SPLASH_URL_DOCKER'):
             # In order to have a working default for the docker image, it is easier to use an environment variable
             self.splash_url: str = os.environ['SPLASH_URL_DOCKER']
@@ -55,7 +55,7 @@ class Lookyloo():
             self.splash_url = get_config('generic', 'splash_url')
         self.only_global_lookups: bool = get_config('generic', 'only_global_lookups')
-        safe_create_dir(self.scrape_dir)
+        safe_create_dir(self.capture_dir)
         # Initialize 3rd party components
         self.pi = PhishingInitiative(get_config('modules', 'PhishingInitiative'))
@@ -504,15 +504,15 @@ class Lookyloo():
     @property
     def capture_dirs(self) -> List[Path]:
-        for capture_dir in self.scrape_dir.iterdir():
+        for capture_dir in self.capture_dir.iterdir():
             if capture_dir.is_dir() and not capture_dir.iterdir():
-                # Cleanup self.scrape_dir of failed runs.
+                # Cleanup self.capture_dir of failed runs.
                 capture_dir.rmdir()
             if not (capture_dir / 'uuid').exists():
                 # Create uuid if missing
                 with (capture_dir / 'uuid').open('w') as f:
                     f.write(str(uuid4()))
-        return sorted(self.scrape_dir.iterdir(), reverse=True)
+        return sorted(self.capture_dir.iterdir(), reverse=True)

     def lookup_capture_dir(self, capture_uuid: str) -> Union[Path, None]:
         capture_dir: str = self.redis.hget('lookup_dirs', capture_uuid)  # type: ignore
@@ -520,7 +520,7 @@ class Lookyloo():
             return Path(capture_dir)
         return None

-    def enqueue_scrape(self, query: MutableMapping[str, Any]) -> str:
+    def enqueue_capture(self, query: MutableMapping[str, Any]) -> str:
         perma_uuid = str(uuid4())
         p = self.redis.pipeline()
         for key, value in query.items():
@@ -528,19 +528,19 @@ class Lookyloo():
                 # Yes, empty string because that's False.
                 query[key] = 1 if value else ''
         p.hmset(perma_uuid, query)
-        p.sadd('to_scrape', perma_uuid)
+        p.sadd('to_capture', perma_uuid)
         p.execute()
         return perma_uuid

-    def process_scrape_queue(self) -> Union[bool, None]:
-        uuid = self.redis.spop('to_scrape')
+    def process_capture_queue(self) -> Union[bool, None]:
+        uuid = self.redis.spop('to_capture')
         if not uuid:
             return None
-        to_scrape: Dict[str, Union[str, int, float]] = self.redis.hgetall(uuid)  # type: ignore
+        to_capture: Dict[str, Union[str, int, float]] = self.redis.hgetall(uuid)  # type: ignore
         self.redis.delete(uuid)
-        to_scrape['perma_uuid'] = uuid
-        if self.scrape(**to_scrape):  # type: ignore
-            self.logger.info(f'Processed {to_scrape["url"]}')
+        to_capture['perma_uuid'] = uuid
+        if self.capture(**to_capture):  # type: ignore
+            self.logger.info(f'Processed {to_capture["url"]}')
             return True
         return False
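The two renamed methods above form a small producer/consumer pair around the Redis `to_capture` set: `enqueue_capture()` stores the query hash and adds the UUID to the set, and `process_capture_queue()` pops one UUID and runs `capture()` on it. A minimal sketch of a worker that drains the queue, assuming the module layout and a simple polling loop (both illustrative, not part of this diff):

```python
# Minimal sketch of a capture-queue worker, assuming the renamed methods above.
# The import path and the polling loop are assumptions for illustration.
import time

from lookyloo.lookyloo import Lookyloo  # assumed module layout

lookyloo = Lookyloo()

while True:
    # Pops one UUID from the 'to_capture' set, rebuilds the query from the
    # Redis hash, and runs capture() on it.
    result = lookyloo.process_capture_queue()
    if result is None:
        # Queue is empty: back off briefly before polling again.
        time.sleep(1)
```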
@@ -638,7 +638,7 @@ class Lookyloo():
     def get_capture(self, capture_uuid: str) -> BytesIO:
         return self._get_raw(capture_uuid)

-    def scrape(self, url: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
+    def capture(self, url: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
               depth: int=1, listing: bool=True, user_agent: Optional[str]=None,
               referer: str='', perma_uuid: Optional[str]=None, os: Optional[str]=None,
               browser: Optional[str]=None) -> Union[bool, str]:
@@ -668,7 +668,7 @@ class Lookyloo():
             ua = user_agent
         if int(depth) > int(get_config('generic', 'max_depth')):
-            self.logger.warning(f'Not allowed to scrape on a depth higher than {get_config("generic", "max_depth")}: {depth}')
+            self.logger.warning(f'Not allowed to capture on a depth higher than {get_config("generic", "max_depth")}: {depth}')
             depth = int(get_config('generic', 'max_depth'))
         items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=ua,
                       referer=referer, log_enabled=True, log_level=get_config('generic', 'splash_loglevel'))
@@ -678,7 +678,7 @@ class Lookyloo():
         if not perma_uuid:
             perma_uuid = str(uuid4())
         width = len(str(len(items)))
-        dirpath = self.scrape_dir / datetime.now().isoformat()
+        dirpath = self.capture_dir / datetime.now().isoformat()
         safe_create_dir(dirpath)
         for i, item in enumerate(items):
             if not listing:  # Write no_index marker
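For reference, a direct call to the renamed `capture()` method could look roughly like this sketch. Only keyword arguments visible in the signature above are used; the import path, URL and argument values are assumptions, not part of this diff.

```python
# Illustrative direct call to the renamed capture() method.
from lookyloo.lookyloo import Lookyloo  # assumed module layout

lookyloo = Lookyloo()

perma_uuid = lookyloo.capture(
    url='https://www.circl.lu',  # example URL
    depth=1,                     # values above 'max_depth' are clamped, per the warning above
    listing=True,                # False writes the no_index marker
)
# Per the Union[bool, str] annotation, this is the permanent UUID on success
# or a boolean on failure.
print(perma_uuid)
```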

View File

@@ -448,12 +448,12 @@ def rebuild_cache():

 @app.route('/submit', methods=['POST', 'GET'])
 def submit():
     to_query = request.get_json(force=True)
-    perma_uuid = lookyloo.enqueue_scrape(to_query)
+    perma_uuid = lookyloo.enqueue_capture(to_query)
     return Response(perma_uuid, mimetype='text/text')

-@app.route('/scrape', methods=['GET', 'POST'])
-def scrape_web():
+@app.route('/capture', methods=['GET', 'POST'])
+def capture_web():
     if request.form.get('url'):
         # check if the post request has the file part
         if 'cookies' in request.files and request.files['cookies'].filename:
@@ -464,11 +464,11 @@ def scrape_web():
         if url:
             depth: int = request.form.get('depth') if request.form.get('depth') else 1  # type: ignore
             listing: bool = request.form.get('listing') if request.form.get('listing') else False  # type: ignore
-            perma_uuid = lookyloo.scrape(url=url, cookies_pseudofile=cookie_file,
+            perma_uuid = lookyloo.capture(url=url, cookies_pseudofile=cookie_file,
                                          depth=depth, listing=listing,
                                          user_agent=request.form.get('user_agent'),
                                          referer=request.form.get('referer'),  # type: ignore
                                          os=request.form.get('os'), browser=request.form.get('browser'))
             return redirect(url_for('tree', tree_uuid=perma_uuid))
     user_agents: Dict[str, Any] = {}
     if get_config('generic', 'use_user_agents_users'):
@@ -478,7 +478,7 @@ def scrape_web():
     if not user_agents:
         user_agents = get_user_agents()
     user_agents.pop('by_frequency')
-    return render_template('scrape.html', user_agents=user_agents)
+    return render_template('capture.html', user_agents=user_agents)

 @app.route('/cookies/<string:cookie_name>', methods=['GET'])
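Since the `/submit` route above feeds `enqueue_capture()` with whatever JSON it receives, queuing a capture over HTTP could look like the sketch below. The base URL, port and the exact set of accepted JSON keys are assumptions; the keys simply mirror the keyword arguments of the renamed `capture()`.

```python
# Sketch of queuing a capture through the /submit route shown above.
import requests

capture_settings = {
    'url': 'https://www.circl.lu',  # example URL
    'listing': True,                # stored as 1/'' in Redis by enqueue_capture()
    'depth': 1,
}

# Host and port are assumptions for a local instance.
r = requests.post('http://127.0.0.1:5100/submit', json=capture_settings)
print(r.text)  # the route returns the permanent UUID as plain text
```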

View File

@@ -1,5 +1,5 @@
 {% extends "main.html" %}
-{% block title %}Scrape{% endblock %}
+{% block title %}Capture{% endblock %}
 {% block content %}
 <div class="container">
@@ -8,7 +8,7 @@
          alt="Lookyloo" width="400">
   </center>
   </br>
-  <form role="form" action="{{ url_for('scrape_web') }}" method=post enctype=multipart/form-data>
+  <form role="form" action="{{ url_for('capture_web') }}" method=post enctype=multipart/form-data>
     <div class="form-group row">
       <div class="col-sm-10">
         <div class="form-check">
@@ -108,5 +108,5 @@
 {% block scripts %}
   {{ super() }}
-  <script src='{{ url_for('static', filename='scrape.js') }}'></script>
+  <script src='{{ url_for('static', filename='capture.js') }}'></script>
 {% endblock %}

View File

@@ -39,13 +39,13 @@ $(document).ready(function () {
 {% block content %}
 <center>
-  <a href="{{ url_for('scrape_web') }}">
+  <a href="{{ url_for('capture_web') }}">
     <img src="{{ url_for('static', filename='lookyloo.jpeg') }}"
          alt="Lookyloo" width="200">
   </a>
 </center>
 <center>
-  <h2><a href="{{ url_for('scrape_web') }}">Start a new capture</a></h2>
+  <h2><a href="{{ url_for('capture_web') }}">Start a new capture</a></h2>
   <br><br>
   {{ render_messages(container=True, dismissible=True) }}
 </center>