mirror of https://github.com/CIRCL/lookyloo
chg: Bump dependencies, Improve HTTP errors handling
Fix https://github.com/CIRCL/lookyloo/issues/66
pull/67/head
parent
61e5609687
commit
7d2f50174d
|
@ -61,14 +61,17 @@ class Lookyloo():
|
|||
def _set_report_cache(self, report_dir: Path) -> None:
|
||||
if self.redis.exists(str(report_dir)):
|
||||
return
|
||||
if (report_dir / 'error.txt').exists():
|
||||
# Something went wrong
|
||||
return
|
||||
har_files = sorted(report_dir.glob('*.har'))
|
||||
if not har_files:
|
||||
self.logger.warning(f'No har files in {report_dir}')
|
||||
if (report_dir / 'uuid').exists():
|
||||
(report_dir / 'uuid').unlink()
|
||||
if (report_dir / 'no_index').exists():
|
||||
(report_dir / 'no_index').unlink()
|
||||
report_dir.rmdir()
|
||||
# if (report_dir / 'uuid').exists():
|
||||
# (report_dir / 'uuid').unlink()
|
||||
# if (report_dir / 'no_index').exists():
|
||||
# (report_dir / 'no_index').unlink()
|
||||
# report_dir.rmdir()
|
||||
return
|
||||
with (report_dir / 'uuid').open() as f:
|
||||
uuid = f.read().strip()
|
||||
|
@ -88,6 +91,9 @@ class Lookyloo():
|
|||
def report_cache(self, report_dir: Union[str, Path]) -> Optional[Dict[str, Union[str, int]]]:
|
||||
if isinstance(report_dir, Path):
|
||||
report_dir = str(report_dir)
|
||||
if (Path(report_dir) / 'error.txt').exists():
|
||||
with (report_dir / 'error.txt').open() as _error:
|
||||
self.logger.warning(f'Capture in ({report_dir}) has an error: {_error.read()}, see https://splash.readthedocs.io/en/stable/scripting-ref.html#splash-go')
|
||||
cached = self.redis.hgetall(report_dir)
|
||||
if all(key in cached.keys() for key in ['uuid', 'title', 'timestamp', 'url', 'redirects']):
|
||||
cached['redirects'] = json.loads(cached['redirects'])
|
||||
|
@ -194,8 +200,34 @@ class Lookyloo():
|
|||
dirpath = self.scrape_dir / datetime.now().isoformat()
|
||||
dirpath.mkdir()
|
||||
for i, item in enumerate(items):
|
||||
if not listing: # Write no_index marker
|
||||
(dirpath / 'no_index').touch()
|
||||
with (dirpath / 'uuid').open('w') as _uuid:
|
||||
_uuid.write(perma_uuid)
|
||||
if os or browser:
|
||||
meta = {}
|
||||
if os:
|
||||
meta['os'] = os
|
||||
if browser:
|
||||
meta['browser'] = browser
|
||||
with (dirpath / 'meta').open('w') as _meta:
|
||||
json.dump(meta, _meta)
|
||||
if 'error' in item:
|
||||
with (dirpath / 'error.txt').open('w') as _error:
|
||||
_error.write(item['error'])
|
||||
continue
|
||||
|
||||
# The capture went fine
|
||||
harfile = item['har']
|
||||
png = base64.b64decode(item['png'])
|
||||
html = item['html']
|
||||
|
||||
with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as _har:
|
||||
json.dump(harfile, _har)
|
||||
with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as _img:
|
||||
_img.write(png)
|
||||
with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as _html:
|
||||
_html.write(html)
|
||||
|
||||
if 'childFrames' in item:
|
||||
child_frames = item['childFrames']
|
||||
|
@ -207,24 +239,5 @@ class Lookyloo():
|
|||
with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
|
||||
json.dump(cookies, _cookies)
|
||||
|
||||
html = item['html']
|
||||
with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as _har:
|
||||
json.dump(harfile, _har)
|
||||
with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as _img:
|
||||
_img.write(png)
|
||||
with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as _html:
|
||||
_html.write(html)
|
||||
with (dirpath / 'uuid').open('w') as _uuid:
|
||||
_uuid.write(perma_uuid)
|
||||
if not listing: # Write no_index marker
|
||||
(dirpath / 'no_index').touch()
|
||||
if os or browser:
|
||||
meta = {}
|
||||
if os:
|
||||
meta['os'] = os
|
||||
if browser:
|
||||
meta['browser'] = browser
|
||||
with (dirpath / 'meta').open('w') as _meta:
|
||||
json.dump(meta, _meta)
|
||||
self._set_report_cache(dirpath)
|
||||
return perma_uuid
|
||||
|
|
|
@ -19,10 +19,10 @@ marker = "python_version >= \"3.5\""
|
|||
name = "automat"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
version = "0.8.0"
|
||||
version = "20.2.0"
|
||||
|
||||
[package.dependencies]
|
||||
attrs = ">=16.1.0"
|
||||
attrs = ">=19.2.0"
|
||||
six = "*"
|
||||
|
||||
[package.extras]
|
||||
|
@ -107,7 +107,7 @@ description = "A Python module to bypass Cloudflare's anti-bot page."
|
|||
name = "cloudscraper"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
version = "1.2.20"
|
||||
version = "1.2.23"
|
||||
|
||||
[package.dependencies]
|
||||
requests = ">=2.9.2"
|
||||
|
@ -208,7 +208,7 @@ lxml = "^4.4.2"
|
|||
six = "^1.14.0"
|
||||
|
||||
[package.source]
|
||||
reference = "7df6b07111fc039e813f9a43968ad430ff8eb73b"
|
||||
reference = "4d16f46b6ad9f86e34422cea7d48652012092e53"
|
||||
type = "git"
|
||||
url = "https://github.com/viper-framework/har2tree.git"
|
||||
[[package]]
|
||||
|
@ -536,7 +536,7 @@ scrapy = "^1.8.0"
|
|||
scrapy-splash = "^0.7.2"
|
||||
|
||||
[package.source]
|
||||
reference = "d781ff5867504f50ce9411fc7cad1a653dd2a02d"
|
||||
reference = "f327de3dbbdab0b4eaf85928a637df33d28538a1"
|
||||
type = "git"
|
||||
url = "https://github.com/viper-framework/ScrapySplashWrapper.git"
|
||||
[[package]]
|
||||
|
@ -683,8 +683,8 @@ attrs = [
|
|||
{file = "attrs-19.3.0.tar.gz", hash = "sha256:f7b7ce16570fe9965acd6d30101a28f62fb4a7f9e926b3bbc9b61f8b04247e72"},
|
||||
]
|
||||
automat = [
|
||||
{file = "Automat-0.8.0-py2.py3-none-any.whl", hash = "sha256:81c93c55d2742c55e74e6497a48e048a859fa01d7aa0b91a032be432229837e2"},
|
||||
{file = "Automat-0.8.0.tar.gz", hash = "sha256:269a09dfb063a3b078983f4976d83f0a0d3e6e7aaf8e27d8df1095e09dc4a484"},
|
||||
{file = "Automat-20.2.0-py2.py3-none-any.whl", hash = "sha256:b6feb6455337df834f6c9962d6ccf771515b7d939bca142b29c20c2376bc6111"},
|
||||
{file = "Automat-20.2.0.tar.gz", hash = "sha256:7979803c74610e11ef0c0d68a2942b152df52da55336e0c9d58daf1831cbdf33"},
|
||||
]
|
||||
beautifulsoup4 = [
|
||||
{file = "beautifulsoup4-4.8.2-py2-none-any.whl", hash = "sha256:e1505eeed31b0f4ce2dbb3bc8eb256c04cc2b3b72af7d551a4ab6efd5cbe5dae"},
|
||||
|
@ -763,8 +763,8 @@ click = [
|
|||
{file = "Click-7.0.tar.gz", hash = "sha256:5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7"},
|
||||
]
|
||||
cloudscraper = [
|
||||
{file = "cloudscraper-1.2.20-py2.py3-none-any.whl", hash = "sha256:63a93877552b6e8d5b6b020f6893aa275c1d3a0b586984be6da5343978985166"},
|
||||
{file = "cloudscraper-1.2.20.tar.gz", hash = "sha256:3168535f56e33a4db66e754e5a968e3e12c2f891082214f51269bee7a57de8ef"},
|
||||
{file = "cloudscraper-1.2.23-py2.py3-none-any.whl", hash = "sha256:a50e366d6d5ae299ce1554fc025c1e756b2598bc0b2232efa76e36c3e8f79e63"},
|
||||
{file = "cloudscraper-1.2.23.tar.gz", hash = "sha256:b01bd3ab916d30624643d7c14569218024b3f81b20da8045af4a301129f3bc11"},
|
||||
]
|
||||
constantly = [
|
||||
{file = "constantly-15.1.0-py2.py3-none-any.whl", hash = "sha256:dd2fa9d6b1a51a83f0d7dd76293d734046aa176e384bf6e33b7e44880eb37c5d"},
|
||||
|
|
Loading…
Reference in New Issue