mirror of https://github.com/CIRCL/lookyloo

chg: Refactoring of the redirects, rename report -> capture

parent f26a02ef86
commit 949ad58667
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 import os
-from typing import List, Optional
+from typing import List, Optional, Dict, Union, Any
 from io import BufferedIOBase
 from pathlib import Path
 from .exceptions import MissingEnv, CreateDirectoryException

@@ -116,7 +116,7 @@ def update_user_agents():
     try:
         s = cloudscraper.create_scraper()
         r = s.get('https://techblog.willshouse.com/2012/01/03/most-common-user-agents/')
-    except Exception as e:
+    except Exception:
         traceback.print_exc()
         return
     soup = BeautifulSoup(r.text, 'html.parser')

@@ -165,7 +165,7 @@ def load_cookies(cookie_pseudofile: Optional[BufferedIOBase]=None) -> List[dict]
                      'expires': (datetime.now() + timedelta(days=10)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z',
                      'domain': u,
                      'value': cookie['Content raw']
-                      }
+                     }
             to_return.append(to_add)
     except Exception as e:
         print(f'Unable to load the cookie file: {e}')


@@ -24,7 +24,7 @@ from .helpers import get_homedir, get_socket_path, load_cookies
 from .exceptions import NoValidHarFile
 from redis import Redis

-from typing import Union, Dict, List, Tuple, Optional
+from typing import Union, Dict, List, Tuple, Optional, Any

 import logging

@@ -60,76 +60,93 @@ class Lookyloo():
         self.logger = logging.getLogger(f'{self.__class__.__name__}')
         self.logger.setLevel(loglevel)

-    def _set_report_cache(self, report_dir: Path) -> None:
-        if self.redis.exists(str(report_dir)):
+    def _set_capture_cache(self, capture_dir: Path, force: bool=False) -> None:
+        if force or not self.redis.exists(str(capture_dir)):
+            # (re)build cache
+            pass
+        else:
             return
-        with (report_dir / 'uuid').open() as f:
+
+        with (capture_dir / 'uuid').open() as f:
             uuid = f.read().strip()
-        har_files = sorted(report_dir.glob('*.har'))
+
+        har_files = sorted(capture_dir.glob('*.har'))
+
         error_cache: Dict[str, str] = {}
-        if (report_dir / 'error.txt').exists():
+        if (capture_dir / 'error.txt').exists():
             # Something went wrong
-            with (Path(report_dir) / 'error.txt').open() as _error:
-                error_cache['error'] = f'Capture in {report_dir} has an error: {_error.read()}, see https://splash.readthedocs.io/en/stable/scripting-ref.html#splash-go and https://doc.qt.io/qt-5/qnetworkreply.html#NetworkError-enum'
+            with (Path(capture_dir) / 'error.txt').open() as _error:
+                error_cache['error'] = f'Capture in {capture_dir} has an error: {_error.read()}, see https://splash.readthedocs.io/en/stable/scripting-ref.html#splash-go and https://doc.qt.io/qt-5/qnetworkreply.html#NetworkError-enum'
         elif not har_files:
-            error_cache['error'] = f'No har files in {report_dir}'
+            error_cache['error'] = f'No har files in {capture_dir}'

         if error_cache:
             self.logger.warning(error_cache['error'])
-            self.redis.hmset(str(report_dir), error_cache)
-            self.redis.hset('lookup_dirs', uuid, str(report_dir))
+            self.redis.hmset(str(capture_dir), error_cache)
+            self.redis.hset('lookup_dirs', uuid, str(capture_dir))
             return

         har = HarFile(har_files[0])
+
+        redirects = har.initial_redirects
+        incomplete_redirects = False
+        if redirects and har.need_tree_redirects:
+            # load tree from disk, get redirects
+            ct = self._load_pickle(capture_dir / 'tree.pickle')
+            if ct:
+                redirects = ct.redirects
+            else:
+                # Pickle not available
+                incomplete_redirects = True
+
         cache: Dict[str, Union[str, int]] = {'uuid': uuid,
                                              'title': har.initial_title,
                                              'timestamp': har.initial_start_time,
                                              'url': har.first_url,
-                                             'redirects': json.dumps(har.initial_redirects)}
-        if (report_dir / 'no_index').exists():  # If the folders claims anonymity
+                                             'redirects': json.dumps(redirects),
+                                             'incomplete_redirects': 1 if incomplete_redirects else 0}
+        if (capture_dir / 'no_index').exists():  # If the folders claims anonymity
             cache['no_index'] = 1
-        if uuid and not self.redis.exists(str(report_dir)):
-            self.redis.hmset(str(report_dir), cache)
-            self.redis.hset('lookup_dirs', uuid, str(report_dir))
-
-    def report_cache(self, report_dir: Union[str, Path]) -> Optional[Dict[str, Union[str, int]]]:
-        if isinstance(report_dir, Path):
-            report_dir = str(report_dir)
-        cached = self.redis.hgetall(report_dir)
+        self.redis.hmset(str(capture_dir), cache)
+        self.redis.hset('lookup_dirs', uuid, str(capture_dir))
+
+    def capture_cache(self, capture_dir: Path) -> Optional[Dict[str, Union[str, int]]]:
+        if self.redis.hget(str(capture_dir), 'incomplete_redirects') == '1':
+            # try to rebuild the cache
+            self._set_capture_cache(capture_dir, force=True)
+        cached = self.redis.hgetall(str(capture_dir))
         if all(key in cached.keys() for key in ['uuid', 'title', 'timestamp', 'url', 'redirects']):
             cached['redirects'] = json.loads(cached['redirects'])
             return cached
         elif 'error' in cached:
             return cached
         else:
-            self.logger.warning(f'Cache ({report_dir}) is invalid: {json.dumps(cached, indent=2)}')
+            self.logger.warning(f'Cache ({capture_dir}) is invalid: {json.dumps(cached, indent=2)}')
             return None

     def _init_existing_dumps(self) -> None:
-        for report_dir in self.report_dirs:
-            if report_dir.exists():
-                self._set_report_cache(report_dir)
+        for capture_dir in self.capture_dirs:
+            if capture_dir.exists():
+                self._set_capture_cache(capture_dir)
         self.redis.set('cache_loaded', 1)

     @property
-    def report_dirs(self) -> List[Path]:
-        for report_dir in self.scrape_dir.iterdir():
-            if report_dir.is_dir() and not report_dir.iterdir():
+    def capture_dirs(self) -> List[Path]:
+        for capture_dir in self.scrape_dir.iterdir():
+            if capture_dir.is_dir() and not capture_dir.iterdir():
                 # Cleanup self.scrape_dir of failed runs.
-                report_dir.rmdir()
-            if not (report_dir / 'uuid').exists():
+                capture_dir.rmdir()
+            if not (capture_dir / 'uuid').exists():
                 # Create uuid if missing
-                with (report_dir / 'uuid').open('w') as f:
+                with (capture_dir / 'uuid').open('w') as f:
                     f.write(str(uuid4()))
         return sorted(self.scrape_dir.iterdir(), reverse=True)

-    def lookup_report_dir(self, uuid) -> Union[Path, None]:
-        report_dir = self.redis.hget('lookup_dirs', uuid)
-        if report_dir:
-            return Path(report_dir)
+    def lookup_capture_dir(self, uuid) -> Union[Path, None]:
+        capture_dir = self.redis.hget('lookup_dirs', uuid)
+        if capture_dir:
+            return Path(capture_dir)
         return None

     def enqueue_scrape(self, query: dict) -> str:
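The hunk above is the core of the refactoring: the per-capture Redis hash now stores the redirect chain as JSON together with an incomplete_redirects flag, and capture_cache() forces a cache rebuild when that flag is set. A minimal sketch of that force-rebuild pattern, with a plain dict standing in for Redis (set_capture_cache, capture_cache and rebuild below are illustrative stand-ins, not the project's real API):

import json

cache_store: dict = {}  # stands in for the per-capture Redis hash

def set_capture_cache(capture_dir: str, redirects: list, complete: bool, force: bool = False) -> None:
    # same shape as the diff's logic: skip the work unless forced or not cached yet
    if not force and capture_dir in cache_store:
        return
    cache_store[capture_dir] = {'redirects': json.dumps(redirects),
                                'incomplete_redirects': 0 if complete else 1}

def capture_cache(capture_dir: str, rebuild) -> dict:
    entry = cache_store.get(capture_dir)
    if entry and entry['incomplete_redirects'] == 1:
        # the full redirect chain was unknown when the cache was built: rebuild it
        redirects, complete = rebuild(capture_dir)
        set_capture_cache(capture_dir, redirects, complete, force=True)
        entry = cache_store[capture_dir]
    if entry:
        entry = dict(entry, redirects=json.loads(entry['redirects']))
    return entry or {}

For example, capture_cache('some_dir', rebuild=lambda d: (['http://a', 'http://b'], True)) after an incomplete first pass would refresh the entry and return the decoded redirect list.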
@@ -152,18 +169,27 @@ class Lookyloo():
             return True
         return False

-    def load_tree(self, report_dir: Path) -> Tuple[str, dict, str, str, str, dict]:
-        har_files = sorted(report_dir.glob('*.har'))
+    def _load_pickle(self, pickle_file: Path) -> Optional[CrawledTree]:
+        if pickle_file.exists():
+            with pickle_file.open('rb') as _p:
+                return pickle.load(_p)
+        return None
+
+    def load_tree(self, capture_dir: Path) -> Tuple[str, dict, str, str, str, dict]:
+        har_files = sorted(capture_dir.glob('*.har'))
+        pickle_file = capture_dir / 'tree.pickle'
         try:
             meta = {}
-            if (report_dir / 'meta').exists():
-                with open((report_dir / 'meta'), 'r') as f:
+            if (capture_dir / 'meta').exists():
+                # NOTE: Legacy, the meta file should be present
+                with open((capture_dir / 'meta'), 'r') as f:
                     meta = json.load(f)
-            ct = CrawledTree(har_files)
-            temp = tempfile.NamedTemporaryFile(prefix='lookyloo', delete=False)
-            pickle.dump(ct, temp)
-            temp.close()
-            return temp.name, ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url, meta
+            ct = self._load_pickle(pickle_file)
+            if not ct:
+                ct = CrawledTree(har_files)
+                with pickle_file.open('wb') as _p:
+                    pickle.dump(ct, _p)
+            return pickle_file.name, ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url, meta
         except Har2TreeError as e:
             raise NoValidHarFile(e.message)

@@ -172,8 +198,8 @@ class Lookyloo():
             if time.time() - tmpfile.stat().st_atime > 36000:
                 tmpfile.unlink()

-    def load_image(self, report_dir: Path) -> BytesIO:
-        with open(list(report_dir.glob('*.png'))[0], 'rb') as f:
+    def load_image(self, capture_dir: Path) -> BytesIO:
+        with open(list(capture_dir.glob('*.png'))[0], 'rb') as f:
             return BytesIO(f.read())

     def sane_js_query(self, sha512: str) -> Dict:

@@ -254,5 +280,5 @@ class Lookyloo():
             with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
                 json.dump(cookies, _cookies)

-        self._set_report_cache(dirpath)
+        self._set_capture_cache(dirpath)
         return perma_uuid

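The load_tree() changes above drop the NamedTemporaryFile round-trip and keep a single tree.pickle next to the HAR files, so the tree is built once and reused by later requests (including the redirect extraction in _set_capture_cache). A self-contained sketch of that pickle-backed caching, where build_tree is a caller-supplied stand-in for the CrawledTree construction (not Lookyloo's API):

import pickle
from pathlib import Path
from typing import Callable, List

def load_or_build_tree(capture_dir: Path, build_tree: Callable[[List[Path]], object]) -> object:
    # return the cached tree if tree.pickle exists, otherwise build it once and cache it
    pickle_file = capture_dir / 'tree.pickle'
    if pickle_file.exists():
        with pickle_file.open('rb') as _p:
            return pickle.load(_p)
    tree = build_tree(sorted(capture_dir.glob('*.har')))  # the expensive step, done once
    with pickle_file.open('wb') as _p:
        pickle.dump(tree, _p)
    return tree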
@@ -251,7 +251,7 @@ publicsuffix2 = "^2.20191221"
 six = "^1.14.0"

 [package.source]
-reference = "7656d60b3a200285205f991f6646d9a1e366c7cf"
+reference = "02c7e19229c33a62bfabbc3a1981f0401d7b3a71"
 type = "git"
 url = "https://github.com/viper-framework/har2tree.git"
 [[package]]

@@ -701,7 +701,7 @@ scrapy = "^1.8.0"
 scrapy-splash = "^0.7.2"

 [package.source]
-reference = "383fafad20a111e02fa53dc639b4cc53c7b8456c"
+reference = "bf18e5e1c88c9263b90a69348e0020ceccf2aa12"
 type = "git"
 url = "https://github.com/viper-framework/ScrapySplashWrapper.git"
 [[package]]

@@ -795,7 +795,7 @@ python-versions = "*"
 version = "1.4.1"

 [[package]]
-category = "dev"
+category = "main"
 description = "Backported and Experimental Type Hints for Python 3.5+"
 name = "typing-extensions"
 optional = false


@@ -47,9 +47,9 @@ lookyloo: Lookyloo = Lookyloo(splash_url=splash_url, loglevel=loglevel, only_glo


 # keep
-def load_tree(report_dir: Path) -> Tuple[dict, str, str, str, dict]:
+def load_tree(capture_dir: Path) -> Tuple[dict, str, str, str, dict]:
     session.clear()
-    temp_file_name, tree_json, tree_time, tree_ua, tree_root_url, meta = lookyloo.load_tree(report_dir)
+    temp_file_name, tree_json, tree_time, tree_ua, tree_root_url, meta = lookyloo.load_tree(capture_dir)
     session["tree"] = temp_file_name
     return tree_json, tree_time, tree_ua, tree_root_url, meta

@@ -137,20 +137,20 @@ def urlnode_details(node_uuid):

 @app.route('/tree/<string:tree_uuid>/image', methods=['GET'])
 def image(tree_uuid):
-    report_dir = lookyloo.lookup_report_dir(tree_uuid)
-    if not report_dir:
+    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
+    if not capture_dir:
         return Response('Not available.', mimetype='text/text')
-    to_return = lookyloo.load_image(report_dir)
+    to_return = lookyloo.load_image(capture_dir)
     return send_file(to_return, mimetype='image/png',
                      as_attachment=True, attachment_filename='image.png')


 @app.route('/redirects/<string:tree_uuid>', methods=['GET'])
 def redirects(tree_uuid):
-    report_dir = lookyloo.lookup_report_dir(tree_uuid)
-    if not report_dir:
+    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
+    if not capture_dir:
         return Response('Not available.', mimetype='text/text')
-    cache = lookyloo.report_cache(report_dir)
+    cache = lookyloo.capture_cache(capture_dir)
     if not cache['redirects']:
         return Response('No redirects.', mimetype='text/text')
     to_return = BytesIO('\n'.join(cache['redirects']).encode())

@@ -158,23 +158,31 @@ def redirects(tree_uuid):
                      as_attachment=True, attachment_filename='redirects.txt')


+@app.route('/cache_tree/<string:tree_uuid>', methods=['GET'])
+def cache_tree(tree_uuid):
+    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
+    if capture_dir:
+        lookyloo.load_tree(capture_dir)
+    return redirect(url_for('index'))
+
+
 @app.route('/tree/<string:tree_uuid>', methods=['GET'])
 def tree(tree_uuid):
     if tree_uuid == 'False':
         flash("Unable to process your request. The domain may not exist, or splash isn't started", 'error')
         return redirect(url_for('index'))
-    report_dir = lookyloo.lookup_report_dir(tree_uuid)
-    if not report_dir:
+    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
+    if not capture_dir:
         flash(f'Unable to find this UUID ({tree_uuid}). The capture may still be ongoing, try again later.', 'error')
         return redirect(url_for('index'))

-    cache = lookyloo.report_cache(report_dir)
+    cache = lookyloo.capture_cache(capture_dir)
     if 'error' in cache:
         flash(cache['error'], 'error')
         return redirect(url_for('index'))

     try:
-        tree_json, start_time, user_agent, root_url, meta = load_tree(report_dir)
+        tree_json, start_time, user_agent, root_url, meta = load_tree(capture_dir)
         return render_template('tree.html', tree_json=tree_json, start_time=start_time,
                                user_agent=user_agent, root_url=root_url, tree_uuid=tree_uuid,
                                meta=meta)

@@ -190,12 +198,13 @@ def index():
     lookyloo.cleanup_old_tmpfiles()
     update_user_agents()
     titles = []
-    for report_dir in lookyloo.report_dirs:
-        cached = lookyloo.report_cache(report_dir)
+    for capture_dir in lookyloo.capture_dirs:
+        cached = lookyloo.capture_cache(capture_dir)
         if not cached or 'no_index' in cached or 'error' in cached:
             continue
         date, time = cached['timestamp'].split('T')
         time, _ = time.split('.', 1)
-        titles.append((cached['uuid'], cached['title'], date, time, cached['url'], cached['redirects']))
+        titles.append((cached['uuid'], cached['title'], date, time, cached['url'],
+                       cached['redirects'], True if cached['incomplete_redirects'] == '1' else False))
     titles = sorted(titles, key=lambda x: (x[2], x[3]), reverse=True)
     return render_template('index.html', titles=titles)

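The web changes above add a /cache_tree/<uuid> endpoint: when the index flags a capture as having incomplete redirects, the link triggers a one-off tree build (which completes the cache) and sends the user back to the index. A minimal Flask sketch of that flow, with an in-memory store instead of Lookyloo's Redis cache (CAPTURES and build_tree are illustrative placeholders, not the project's code):

from flask import Flask, redirect, url_for

app = Flask(__name__)
CAPTURES = {'example-uuid': {'incomplete_redirects': True}}  # placeholder store

def build_tree(tree_uuid: str) -> None:
    # stand-in for the expensive tree build that also completes the redirect cache
    CAPTURES[tree_uuid]['incomplete_redirects'] = False

@app.route('/cache_tree/<string:tree_uuid>', methods=['GET'])
def cache_tree(tree_uuid):
    if tree_uuid in CAPTURES:
        build_tree(tree_uuid)
    return redirect(url_for('index'))

@app.route('/', methods=['GET'])
def index():
    return 'index'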
@@ -28,7 +28,7 @@
       </tr>
     </thead>
     <tbody>
-    {% for uuid, page_title, date, time, url, redirects in titles %}
+    {% for uuid, page_title, date, time, url, redirects, incomplete_redirects in titles %}
     <tr>
       <td>
         <p title="{{ page_title }}"><a href="{{ url_for('tree', tree_uuid=uuid) }}">{{ page_title }}</a></p>

@@ -46,7 +46,11 @@
           {%endif%}
           </p>
         {% endfor %}
-        <a style="float: right;" href="{{ url_for('redirects', tree_uuid=uuid) }}">Download redirects</a>
+        {% if incomplete_redirects %}
+        <a style="float: right;" href="{{ url_for('cache_tree', tree_uuid=uuid) }}">Unable to find the redirects, click here to build the tree</a>
+        {%else%}
+        <a style="float: right;" href="{{ url_for('redirects', tree_uuid=uuid) }}">Download redirects</a>
+        {%endif%}
       {% else%}
         No redirect
       {%endif%}