From 99c939fd7f25a2bbb1972884b446ae999bb92932 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?=
Date: Tue, 3 Nov 2020 16:32:04 +0100
Subject: [PATCH] chg: Rename scrape -> capture everywhere

Related to #118
---
 lookyloo/lookyloo.py                          | 32 +++++++++----------
 website/web/__init__.py                       | 18 +++++------
 website/web/static/{scrape.js => capture.js}  |  0
 .../templates/{scrape.html => capture.html}   |  6 ++--
 website/web/templates/index.html              |  4 +--
 5 files changed, 30 insertions(+), 30 deletions(-)
 rename website/web/static/{scrape.js => capture.js} (100%)
 rename website/web/templates/{scrape.html => capture.html} (95%)

diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py
index 09187967..4925122d 100644
--- a/lookyloo/lookyloo.py
+++ b/lookyloo/lookyloo.py
@@ -47,7 +47,7 @@ class Lookyloo():
         self.taxonomies = get_taxonomies()

         self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
-        self.scrape_dir: Path = get_homedir() / 'scraped'
+        self.capture_dir: Path = get_homedir() / 'scraped'
         if os.environ.get('SPLASH_URL_DOCKER'):
             # In order to have a working default for the docker image, it is easier to use an environment variable
             self.splash_url: str = os.environ['SPLASH_URL_DOCKER']
@@ -55,7 +55,7 @@ class Lookyloo():
             self.splash_url = get_config('generic', 'splash_url')
         self.only_global_lookups: bool = get_config('generic', 'only_global_lookups')

-        safe_create_dir(self.scrape_dir)
+        safe_create_dir(self.capture_dir)

         # Initialize 3rd party components
         self.pi = PhishingInitiative(get_config('modules', 'PhishingInitiative'))
@@ -504,15 +504,15 @@ class Lookyloo():

     @property
     def capture_dirs(self) -> List[Path]:
-        for capture_dir in self.scrape_dir.iterdir():
+        for capture_dir in self.capture_dir.iterdir():
             if capture_dir.is_dir() and not capture_dir.iterdir():
-                # Cleanup self.scrape_dir of failed runs.
+                # Cleanup self.capture_dir of failed runs.
                 capture_dir.rmdir()
             if not (capture_dir / 'uuid').exists():
                 # Create uuid if missing
                 with (capture_dir / 'uuid').open('w') as f:
                     f.write(str(uuid4()))
-        return sorted(self.scrape_dir.iterdir(), reverse=True)
+        return sorted(self.capture_dir.iterdir(), reverse=True)

     def lookup_capture_dir(self, capture_uuid: str) -> Union[Path, None]:
         capture_dir: str = self.redis.hget('lookup_dirs', capture_uuid)  # type: ignore
@@ -520,7 +520,7 @@ class Lookyloo():
             return Path(capture_dir)
         return None

-    def enqueue_scrape(self, query: MutableMapping[str, Any]) -> str:
+    def enqueue_capture(self, query: MutableMapping[str, Any]) -> str:
         perma_uuid = str(uuid4())
         p = self.redis.pipeline()
         for key, value in query.items():
@@ -528,19 +528,19 @@ class Lookyloo():
                 # Yes, empty string because that's False.
                 query[key] = 1 if value else ''
         p.hmset(perma_uuid, query)
-        p.sadd('to_scrape', perma_uuid)
+        p.sadd('to_capture', perma_uuid)
         p.execute()
         return perma_uuid

-    def process_scrape_queue(self) -> Union[bool, None]:
-        uuid = self.redis.spop('to_scrape')
+    def process_capture_queue(self) -> Union[bool, None]:
+        uuid = self.redis.spop('to_capture')
         if not uuid:
             return None
-        to_scrape: Dict[str, Union[str, int, float]] = self.redis.hgetall(uuid)  # type: ignore
+        to_capture: Dict[str, Union[str, int, float]] = self.redis.hgetall(uuid)  # type: ignore
         self.redis.delete(uuid)
-        to_scrape['perma_uuid'] = uuid
-        if self.scrape(**to_scrape):  # type: ignore
-            self.logger.info(f'Processed {to_scrape["url"]}')
+        to_capture['perma_uuid'] = uuid
+        if self.capture(**to_capture):  # type: ignore
+            self.logger.info(f'Processed {to_capture["url"]}')
             return True
         return False
@@ -638,7 +638,7 @@ class Lookyloo():
     def get_capture(self, capture_uuid: str) -> BytesIO:
         return self._get_raw(capture_uuid)

-    def scrape(self, url: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
+    def capture(self, url: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
                depth: int=1, listing: bool=True, user_agent: Optional[str]=None, referer: str='',
                perma_uuid: Optional[str]=None, os: Optional[str]=None, browser: Optional[str]=None) -> Union[bool, str]:
@@ -668,7 +668,7 @@ class Lookyloo():
             ua = user_agent

         if int(depth) > int(get_config('generic', 'max_depth')):
-            self.logger.warning(f'Not allowed to scrape on a depth higher than {get_config("generic", "max_depth")}: {depth}')
+            self.logger.warning(f'Not allowed to capture on a depth higher than {get_config("generic", "max_depth")}: {depth}')
             depth = int(get_config('generic', 'max_depth'))
         items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=ua, referer=referer,
                       log_enabled=True, log_level=get_config('generic', 'splash_loglevel'))
@@ -678,7 +678,7 @@ class Lookyloo():
         if not perma_uuid:
             perma_uuid = str(uuid4())
         width = len(str(len(items)))
-        dirpath = self.scrape_dir / datetime.now().isoformat()
+        dirpath = self.capture_dir / datetime.now().isoformat()
         safe_create_dir(dirpath)
         for i, item in enumerate(items):
             if not listing:  # Write no_index marker
diff --git a/website/web/__init__.py b/website/web/__init__.py
index 790dc807..f0e398be 100644
--- a/website/web/__init__.py
+++ b/website/web/__init__.py
@@ -448,12 +448,12 @@ def rebuild_cache():
 @app.route('/submit', methods=['POST', 'GET'])
 def submit():
     to_query = request.get_json(force=True)
-    perma_uuid = lookyloo.enqueue_scrape(to_query)
+    perma_uuid = lookyloo.enqueue_capture(to_query)
     return Response(perma_uuid, mimetype='text/text')


-@app.route('/scrape', methods=['GET', 'POST'])
-def scrape_web():
+@app.route('/capture', methods=['GET', 'POST'])
+def capture_web():
     if request.form.get('url'):
         # check if the post request has the file part
         if 'cookies' in request.files and request.files['cookies'].filename:
@@ -464,11 +464,11 @@ def scrape_web():
         if url:
             depth: int = request.form.get('depth') if request.form.get('depth') else 1  # type: ignore
             listing: bool = request.form.get('listing') if request.form.get('listing') else False  # type: ignore
-            perma_uuid = lookyloo.scrape(url=url, cookies_pseudofile=cookie_file,
-                                         depth=depth, listing=listing,
-                                         user_agent=request.form.get('user_agent'),
-                                         referer=request.form.get('referer'),  # type: ignore
-                                         os=request.form.get('os'), browser=request.form.get('browser'))
+            perma_uuid = lookyloo.capture(url=url, cookies_pseudofile=cookie_file,
+                                          depth=depth, listing=listing,
+                                          user_agent=request.form.get('user_agent'),
+                                          referer=request.form.get('referer'),  # type: ignore
+                                          os=request.form.get('os'), browser=request.form.get('browser'))
             return redirect(url_for('tree', tree_uuid=perma_uuid))
     user_agents: Dict[str, Any] = {}
     if get_config('generic', 'use_user_agents_users'):
@@ -478,7 +478,7 @@ def scrape_web():
     if not user_agents:
         user_agents = get_user_agents()
     user_agents.pop('by_frequency')
-    return render_template('scrape.html', user_agents=user_agents)
+    return render_template('capture.html', user_agents=user_agents)


 @app.route('/cookies/<string:cookie_name>', methods=['GET'])
diff --git a/website/web/static/scrape.js b/website/web/static/capture.js
similarity index 100%
rename from website/web/static/scrape.js
rename to website/web/static/capture.js
diff --git a/website/web/templates/scrape.html b/website/web/templates/capture.html
similarity index 95%
rename from website/web/templates/scrape.html
rename to website/web/templates/capture.html
index ad93cef3..b701a97b 100644
--- a/website/web/templates/scrape.html
+++ b/website/web/templates/capture.html
@@ -1,5 +1,5 @@
 {% extends "main.html" %}

-{% block title %}Scrape{% endblock %}
+{% block title %}Capture{% endblock %}

 {% block content %}
@@ -8,7 +8,7 @@
            alt="Lookyloo" width="400">
     </center>
     </br>
-    <form role="form" action="{{ url_for('scrape_web') }}" method=post enctype=multipart/form-data>
+    <form role="form" action="{{ url_for('capture_web') }}" method=post enctype=multipart/form-data>
@@ -108,5 +108,5 @@

 {% block scripts %}
   {{ super() }}
-  <script src="{{ url_for('static', filename='scrape.js') }}"></script>
+  <script src="{{ url_for('static', filename='capture.js') }}"></script>
 {% endblock %}
diff --git a/website/web/templates/index.html b/website/web/templates/index.html
index 72026a83..1f355043 100644
--- a/website/web/templates/index.html
+++ b/website/web/templates/index.html
@@ -39,13 +39,13 @@ $(document).ready(function () {
 {% block content %}
   <div class="container">
     <center>
-      <a href="{{ url_for('scrape_web') }}">
+      <a href="{{ url_for('capture_web') }}">
         <img src="{{ url_for('static', filename='lookyloo.jpeg') }}"
              alt="Lookyloo">
       </a>
       </br>
-      <a href="{{ url_for('scrape_web') }}" class="btn btn-primary" role="button">Start a new capture</a>
+      <a href="{{ url_for('capture_web') }}" class="btn btn-primary" role="button">Start a new capture</a>
     </center>
     </br>
     {{ render_messages(container=True, dismissible=True) }}
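
For downstream users, the Python entry points, the Redis queue key and the web route
are all renamed by this patch. A minimal usage sketch of the new names follows
(hypothetical URL and values; it assumes a configured Lookyloo instance with Redis and
Splash reachable, which this patch does not set up):

    from lookyloo.lookyloo import Lookyloo

    lookyloo = Lookyloo()
    # enqueue_scrape() is now enqueue_capture(); the pending-captures set moves
    # from 'to_scrape' to 'to_capture'.
    perma_uuid = lookyloo.enqueue_capture({'url': 'https://www.example.com', 'listing': True})
    # process_scrape_queue() is now process_capture_queue(); it pops one entry
    # from 'to_capture' and runs capture() (formerly scrape()) on it.
    lookyloo.process_capture_queue()

On the web side, the capture form moves from /scrape (scrape_web) to /capture
(capture_web) and capture.html now loads capture.js; the /submit JSON endpoint keeps
its URL and simply calls enqueue_capture() internally.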