chg: Bump dependencies, improve HTTP error handling

Fix https://github.com/CIRCL/lookyloo/issues/66
pull/67/head
Raphaël Vinot 2020-02-16 23:38:42 +01:00
parent 61e5609687
commit 7d2f50174d
2 changed files with 46 additions and 33 deletions


@@ -61,14 +61,17 @@ class Lookyloo():
     def _set_report_cache(self, report_dir: Path) -> None:
         if self.redis.exists(str(report_dir)):
             return
+        if (report_dir / 'error.txt').exists():
+            # Something went wrong
+            return
         har_files = sorted(report_dir.glob('*.har'))
         if not har_files:
             self.logger.warning(f'No har files in {report_dir}')
-            if (report_dir / 'uuid').exists():
-                (report_dir / 'uuid').unlink()
-            if (report_dir / 'no_index').exists():
-                (report_dir / 'no_index').unlink()
-            report_dir.rmdir()
+            # if (report_dir / 'uuid').exists():
+            #     (report_dir / 'uuid').unlink()
+            # if (report_dir / 'no_index').exists():
+            #     (report_dir / 'no_index').unlink()
+            # report_dir.rmdir()
             return
         with (report_dir / 'uuid').open() as f:
             uuid = f.read().strip()
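The early return above means a capture directory containing an error.txt marker is now kept on disk (its uuid and no_index files are no longer deleted) and is simply skipped by the cache. A minimal sketch of that marker convention, using a hypothetical helper name that is not part of Lookyloo itself:

from pathlib import Path

def is_failed_capture(report_dir: Path) -> bool:
    # Hypothetical helper: a capture directory is considered failed when
    # the scraper left an error.txt marker behind instead of .har files.
    return (report_dir / 'error.txt').exists()

def cacheable_captures(scrape_dir: Path) -> list:
    # Keep failed captures on disk for debugging, but skip them when caching.
    return [d for d in scrape_dir.iterdir() if d.is_dir() and not is_failed_capture(d)]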
@@ -88,6 +91,9 @@ class Lookyloo():
     def report_cache(self, report_dir: Union[str, Path]) -> Optional[Dict[str, Union[str, int]]]:
         if isinstance(report_dir, Path):
             report_dir = str(report_dir)
+        if (Path(report_dir) / 'error.txt').exists():
+            with (Path(report_dir) / 'error.txt').open() as _error:
+                self.logger.warning(f'Capture in ({report_dir}) has an error: {_error.read()}, see https://splash.readthedocs.io/en/stable/scripting-ref.html#splash-go')
         cached = self.redis.hgetall(report_dir)
         if all(key in cached.keys() for key in ['uuid', 'title', 'timestamp', 'url', 'redirects']):
             cached['redirects'] = json.loads(cached['redirects'])
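For context, the cache lookup above only trusts a Redis hash when every expected field is present. A simplified standalone sketch of that validation, assuming a redis-py client created with decode_responses=True (the helper name is illustrative, not Lookyloo's API):

import json
from typing import Optional

import redis

REQUIRED_KEYS = ('uuid', 'title', 'timestamp', 'url', 'redirects')

def load_cached_report(client: redis.Redis, report_dir: str) -> Optional[dict]:
    # Treat an incomplete hash as a cache miss rather than returning partial data.
    cached = client.hgetall(report_dir)
    if not all(key in cached for key in REQUIRED_KEYS):
        return None
    cached['redirects'] = json.loads(cached['redirects'])  # stored as a JSON string
    return cached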
@@ -194,8 +200,34 @@ class Lookyloo():
         dirpath = self.scrape_dir / datetime.now().isoformat()
         dirpath.mkdir()
         for i, item in enumerate(items):
+            if not listing:  # Write no_index marker
+                (dirpath / 'no_index').touch()
+            with (dirpath / 'uuid').open('w') as _uuid:
+                _uuid.write(perma_uuid)
+            if os or browser:
+                meta = {}
+                if os:
+                    meta['os'] = os
+                if browser:
+                    meta['browser'] = browser
+                with (dirpath / 'meta').open('w') as _meta:
+                    json.dump(meta, _meta)
+            if 'error' in item:
+                with (dirpath / 'error.txt').open('w') as _error:
+                    _error.write(item['error'])
+                continue
+            # The capture went fine
+            harfile = item['har']
+            png = base64.b64decode(item['png'])
+            html = item['html']
+            with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as _har:
+                json.dump(harfile, _har)
+            with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as _img:
+                _img.write(png)
+            with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as _html:
+                _html.write(html)
             if 'childFrames' in item:
                 child_frames = item['childFrames']
@@ -207,24 +239,5 @@ class Lookyloo():
             with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
                 json.dump(cookies, _cookies)
-            html = item['html']
-            with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as _har:
-                json.dump(harfile, _har)
-            with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as _img:
-                _img.write(png)
-            with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as _html:
-                _html.write(html)
-        with (dirpath / 'uuid').open('w') as _uuid:
-            _uuid.write(perma_uuid)
-        if not listing:  # Write no_index marker
-            (dirpath / 'no_index').touch()
-        if os or browser:
-            meta = {}
-            if os:
-                meta['os'] = os
-            if browser:
-                meta['browser'] = browser
-            with (dirpath / 'meta').open('w') as _meta:
-                json.dump(meta, _meta)
         self._set_report_cache(dirpath)
         return perma_uuid
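The reordering in the last two hunks writes the uuid, no_index, and meta markers before touching the Splash payload, so even a failed capture leaves a directory the cache layer can recognize. A condensed sketch of the per-item control flow (store_capture is a hypothetical helper, not Lookyloo's actual API):

import base64
import json
from pathlib import Path

def store_capture(dirpath: Path, i: int, item: dict, width: int = 1) -> None:
    # Markers (uuid, no_index, meta) are assumed already written by the caller.
    if 'error' in item:
        # Keep the directory, record why the capture failed, and move on.
        (dirpath / 'error.txt').write_text(item['error'])
        return
    # The capture went fine: persist the HAR, screenshot, and rendered HTML.
    with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as _har:
        json.dump(item['har'], _har)
    (dirpath / '{0:0{width}}.png'.format(i, width=width)).write_bytes(base64.b64decode(item['png']))
    (dirpath / '{0:0{width}}.html'.format(i, width=width)).write_text(item['html'])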

poetry.lock (generated)

@@ -19,10 +19,10 @@ marker = "python_version >= \"3.5\""
 name = "automat"
 optional = false
 python-versions = "*"
-version = "0.8.0"
+version = "20.2.0"

 [package.dependencies]
-attrs = ">=16.1.0"
+attrs = ">=19.2.0"
 six = "*"

 [package.extras]
@@ -107,7 +107,7 @@ description = "A Python module to bypass Cloudflare's anti-bot page."
 name = "cloudscraper"
 optional = false
 python-versions = "*"
-version = "1.2.20"
+version = "1.2.23"

 [package.dependencies]
 requests = ">=2.9.2"
@@ -208,7 +208,7 @@ lxml = "^4.4.2"
 six = "^1.14.0"

 [package.source]
-reference = "7df6b07111fc039e813f9a43968ad430ff8eb73b"
+reference = "4d16f46b6ad9f86e34422cea7d48652012092e53"
 type = "git"
 url = "https://github.com/viper-framework/har2tree.git"

 [[package]]
@@ -536,7 +536,7 @@ scrapy = "^1.8.0"
 scrapy-splash = "^0.7.2"

 [package.source]
-reference = "d781ff5867504f50ce9411fc7cad1a653dd2a02d"
+reference = "f327de3dbbdab0b4eaf85928a637df33d28538a1"
 type = "git"
 url = "https://github.com/viper-framework/ScrapySplashWrapper.git"

 [[package]]
@@ -683,8 +683,8 @@ attrs = [
     {file = "attrs-19.3.0.tar.gz", hash = "sha256:f7b7ce16570fe9965acd6d30101a28f62fb4a7f9e926b3bbc9b61f8b04247e72"},
 ]
 automat = [
-    {file = "Automat-0.8.0-py2.py3-none-any.whl", hash = "sha256:81c93c55d2742c55e74e6497a48e048a859fa01d7aa0b91a032be432229837e2"},
-    {file = "Automat-0.8.0.tar.gz", hash = "sha256:269a09dfb063a3b078983f4976d83f0a0d3e6e7aaf8e27d8df1095e09dc4a484"},
+    {file = "Automat-20.2.0-py2.py3-none-any.whl", hash = "sha256:b6feb6455337df834f6c9962d6ccf771515b7d939bca142b29c20c2376bc6111"},
+    {file = "Automat-20.2.0.tar.gz", hash = "sha256:7979803c74610e11ef0c0d68a2942b152df52da55336e0c9d58daf1831cbdf33"},
 ]
 beautifulsoup4 = [
     {file = "beautifulsoup4-4.8.2-py2-none-any.whl", hash = "sha256:e1505eeed31b0f4ce2dbb3bc8eb256c04cc2b3b72af7d551a4ab6efd5cbe5dae"},
@@ -763,8 +763,8 @@ click = [
     {file = "Click-7.0.tar.gz", hash = "sha256:5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7"},
 ]
 cloudscraper = [
-    {file = "cloudscraper-1.2.20-py2.py3-none-any.whl", hash = "sha256:63a93877552b6e8d5b6b020f6893aa275c1d3a0b586984be6da5343978985166"},
-    {file = "cloudscraper-1.2.20.tar.gz", hash = "sha256:3168535f56e33a4db66e754e5a968e3e12c2f891082214f51269bee7a57de8ef"},
+    {file = "cloudscraper-1.2.23-py2.py3-none-any.whl", hash = "sha256:a50e366d6d5ae299ce1554fc025c1e756b2598bc0b2232efa76e36c3e8f79e63"},
+    {file = "cloudscraper-1.2.23.tar.gz", hash = "sha256:b01bd3ab916d30624643d7c14569218024b3f81b20da8045af4a301129f3bc11"},
 ]
 constantly = [
     {file = "constantly-15.1.0-py2.py3-none-any.whl", hash = "sha256:dd2fa9d6b1a51a83f0d7dd76293d734046aa176e384bf6e33b7e44880eb37c5d"},