chg: Bump dependencies, improve HTTP error handling

Fix https://github.com/CIRCL/lookyloo/issues/66
pull/67/head
Raphaël Vinot 2020-02-16 23:38:42 +01:00
parent 61e5609687
commit 7d2f50174d
2 changed files with 46 additions and 33 deletions


@@ -61,14 +61,17 @@ class Lookyloo():
     def _set_report_cache(self, report_dir: Path) -> None:
         if self.redis.exists(str(report_dir)):
             return
+        if (report_dir / 'error.txt').exists():
+            # Something went wrong
+            return
         har_files = sorted(report_dir.glob('*.har'))
         if not har_files:
             self.logger.warning(f'No har files in {report_dir}')
-            if (report_dir / 'uuid').exists():
-                (report_dir / 'uuid').unlink()
-            if (report_dir / 'no_index').exists():
-                (report_dir / 'no_index').unlink()
-            report_dir.rmdir()
+            # if (report_dir / 'uuid').exists():
+            #     (report_dir / 'uuid').unlink()
+            # if (report_dir / 'no_index').exists():
+            #     (report_dir / 'no_index').unlink()
+            # report_dir.rmdir()
             return
         with (report_dir / 'uuid').open() as f:
             uuid = f.read().strip()
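The early return above means a capture directory containing an error.txt marker is now kept on disk (its uuid and no_index files are no longer deleted) and is simply skipped by the cache. A minimal sketch of that marker convention, using a hypothetical helper name that is not part of Lookyloo itself:

from pathlib import Path

def is_failed_capture(report_dir: Path) -> bool:
    # Hypothetical helper: a capture directory is considered failed when
    # the scraper left an error.txt marker behind instead of .har files.
    return (report_dir / 'error.txt').exists()

def cacheable_captures(scrape_dir: Path) -> list:
    # Keep failed captures on disk for debugging, but skip them when caching.
    return [d for d in scrape_dir.iterdir() if d.is_dir() and not is_failed_capture(d)]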
@@ -88,6 +91,9 @@ class Lookyloo():
     def report_cache(self, report_dir: Union[str, Path]) -> Optional[Dict[str, Union[str, int]]]:
         if isinstance(report_dir, Path):
             report_dir = str(report_dir)
+        if (Path(report_dir) / 'error.txt').exists():
+            with (Path(report_dir) / 'error.txt').open() as _error:
+                self.logger.warning(f'Capture in ({report_dir}) has an error: {_error.read()}, see https://splash.readthedocs.io/en/stable/scripting-ref.html#splash-go')
         cached = self.redis.hgetall(report_dir)
         if all(key in cached.keys() for key in ['uuid', 'title', 'timestamp', 'url', 'redirects']):
             cached['redirects'] = json.loads(cached['redirects'])
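For context, the cache lookup above only trusts a Redis hash when every expected field is present. A simplified standalone sketch of that validation, assuming a redis-py client created with decode_responses=True (the helper name is illustrative, not Lookyloo's API):

import json
from typing import Optional

import redis

REQUIRED_KEYS = ('uuid', 'title', 'timestamp', 'url', 'redirects')

def load_cached_report(client: redis.Redis, report_dir: str) -> Optional[dict]:
    # Treat an incomplete hash as a cache miss rather than returning partial data.
    cached = client.hgetall(report_dir)
    if not all(key in cached for key in REQUIRED_KEYS):
        return None
    cached['redirects'] = json.loads(cached['redirects'])  # stored as a JSON string
    return cached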
@@ -194,8 +200,34 @@ class Lookyloo():
         dirpath = self.scrape_dir / datetime.now().isoformat()
         dirpath.mkdir()
         for i, item in enumerate(items):
+            if not listing:  # Write no_index marker
+                (dirpath / 'no_index').touch()
+            with (dirpath / 'uuid').open('w') as _uuid:
+                _uuid.write(perma_uuid)
+            if os or browser:
+                meta = {}
+                if os:
+                    meta['os'] = os
+                if browser:
+                    meta['browser'] = browser
+                with (dirpath / 'meta').open('w') as _meta:
+                    json.dump(meta, _meta)
+            if 'error' in item:
+                with (dirpath / 'error.txt').open('w') as _error:
+                    _error.write(item['error'])
+                continue
+            # The capture went fine
+            harfile = item['har']
+            png = base64.b64decode(item['png'])
+            html = item['html']
+            with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as _har:
+                json.dump(harfile, _har)
+            with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as _img:
+                _img.write(png)
+            with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as _html:
+                _html.write(html)
             if 'childFrames' in item:
                 child_frames = item['childFrames']
@@ -207,24 +239,5 @@ class Lookyloo():
             with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
                 json.dump(cookies, _cookies)
-            html = item['html']
-            with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as _har:
-                json.dump(harfile, _har)
-            with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as _img:
-                _img.write(png)
-            with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as _html:
-                _html.write(html)
-        with (dirpath / 'uuid').open('w') as _uuid:
-            _uuid.write(perma_uuid)
-        if not listing:  # Write no_index marker
-            (dirpath / 'no_index').touch()
-        if os or browser:
-            meta = {}
-            if os:
-                meta['os'] = os
-            if browser:
-                meta['browser'] = browser
-            with (dirpath / 'meta').open('w') as _meta:
-                json.dump(meta, _meta)
         self._set_report_cache(dirpath)
         return perma_uuid
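The reordering in the last two hunks writes the uuid, no_index, and meta markers before touching the Splash payload, so even a failed capture leaves a directory the cache layer can recognize. A condensed sketch of the per-item control flow (store_capture is a hypothetical helper, not Lookyloo's actual API):

import base64
import json
from pathlib import Path

def store_capture(dirpath: Path, i: int, item: dict, width: int = 1) -> None:
    # Markers (uuid, no_index, meta) are assumed already written by the caller.
    if 'error' in item:
        # Keep the directory, record why the capture failed, and move on.
        (dirpath / 'error.txt').write_text(item['error'])
        return
    # The capture went fine: persist the HAR, screenshot, and rendered HTML.
    with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as _har:
        json.dump(item['har'], _har)
    (dirpath / '{0:0{width}}.png'.format(i, width=width)).write_bytes(base64.b64decode(item['png']))
    (dirpath / '{0:0{width}}.html'.format(i, width=width)).write_text(item['html'])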

poetry.lock (generated)

@@ -19,10 +19,10 @@ marker = "python_version >= \"3.5\""
 name = "automat"
 optional = false
 python-versions = "*"
-version = "0.8.0"
+version = "20.2.0"

 [package.dependencies]
-attrs = ">=16.1.0"
+attrs = ">=19.2.0"
 six = "*"

 [package.extras]
@@ -107,7 +107,7 @@ description = "A Python module to bypass Cloudflare's anti-bot page."
 name = "cloudscraper"
 optional = false
 python-versions = "*"
-version = "1.2.20"
+version = "1.2.23"

 [package.dependencies]
 requests = ">=2.9.2"
@@ -208,7 +208,7 @@ lxml = "^4.4.2"
 six = "^1.14.0"

 [package.source]
-reference = "7df6b07111fc039e813f9a43968ad430ff8eb73b"
+reference = "4d16f46b6ad9f86e34422cea7d48652012092e53"
 type = "git"
 url = "https://github.com/viper-framework/har2tree.git"

 [[package]]
@@ -536,7 +536,7 @@ scrapy = "^1.8.0"
 scrapy-splash = "^0.7.2"

 [package.source]
-reference = "d781ff5867504f50ce9411fc7cad1a653dd2a02d"
+reference = "f327de3dbbdab0b4eaf85928a637df33d28538a1"
 type = "git"
 url = "https://github.com/viper-framework/ScrapySplashWrapper.git"

 [[package]]
@@ -683,8 +683,8 @@ attrs = [
     {file = "attrs-19.3.0.tar.gz", hash = "sha256:f7b7ce16570fe9965acd6d30101a28f62fb4a7f9e926b3bbc9b61f8b04247e72"},
 ]
 automat = [
-    {file = "Automat-0.8.0-py2.py3-none-any.whl", hash = "sha256:81c93c55d2742c55e74e6497a48e048a859fa01d7aa0b91a032be432229837e2"},
-    {file = "Automat-0.8.0.tar.gz", hash = "sha256:269a09dfb063a3b078983f4976d83f0a0d3e6e7aaf8e27d8df1095e09dc4a484"},
+    {file = "Automat-20.2.0-py2.py3-none-any.whl", hash = "sha256:b6feb6455337df834f6c9962d6ccf771515b7d939bca142b29c20c2376bc6111"},
+    {file = "Automat-20.2.0.tar.gz", hash = "sha256:7979803c74610e11ef0c0d68a2942b152df52da55336e0c9d58daf1831cbdf33"},
 ]
 beautifulsoup4 = [
     {file = "beautifulsoup4-4.8.2-py2-none-any.whl", hash = "sha256:e1505eeed31b0f4ce2dbb3bc8eb256c04cc2b3b72af7d551a4ab6efd5cbe5dae"},
@@ -763,8 +763,8 @@ click = [
     {file = "Click-7.0.tar.gz", hash = "sha256:5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7"},
 ]
 cloudscraper = [
-    {file = "cloudscraper-1.2.20-py2.py3-none-any.whl", hash = "sha256:63a93877552b6e8d5b6b020f6893aa275c1d3a0b586984be6da5343978985166"},
-    {file = "cloudscraper-1.2.20.tar.gz", hash = "sha256:3168535f56e33a4db66e754e5a968e3e12c2f891082214f51269bee7a57de8ef"},
+    {file = "cloudscraper-1.2.23-py2.py3-none-any.whl", hash = "sha256:a50e366d6d5ae299ce1554fc025c1e756b2598bc0b2232efa76e36c3e8f79e63"},
+    {file = "cloudscraper-1.2.23.tar.gz", hash = "sha256:b01bd3ab916d30624643d7c14569218024b3f81b20da8045af4a301129f3bc11"},
 ]
 constantly = [
     {file = "constantly-15.1.0-py2.py3-none-any.whl", hash = "sha256:dd2fa9d6b1a51a83f0d7dd76293d734046aa176e384bf6e33b7e44880eb37c5d"},