new: keep track of metadata about OS and Browser when scraping

pull/42/head
Raphaël Vinot 2019-04-07 23:54:16 +02:00
parent d951d55367
commit 66545c26a5
3 changed files with 29 additions and 9 deletions

View File

@ -125,13 +125,17 @@ class Lookyloo():
def load_tree(self, report_dir: Path):
    """Build the crawled tree for one report directory and pickle it to disk.

    Every ``*.har`` file found in ``report_dir`` (in sorted order) is fed to
    ``CrawledTree``. If the directory also contains a ``meta`` file, it is
    parsed as JSON and returned alongside the tree details; otherwise an
    empty dict is returned in its place.

    Returns a 6-tuple:
        (path of the temporary pickle file,
         ``ct.to_json()``,
         ``ct.start_time.isoformat()``,
         ``ct.user_agent``,
         ``ct.root_url``,
         meta dict)

    Raises:
        NoValidHarFile: when building the tree raises ``Har2TreeError``.
    """
    har_files = sorted(report_dir.glob('*.har'))
    try:
        meta = {}
        meta_file = report_dir / 'meta'
        if meta_file.exists():
            with meta_file.open('r') as f:
                meta = json.load(f)
        ct = CrawledTree(har_files)
        ct.find_parents()
        ct.join_trees()
        # delete=False: the pickle has to outlive this call, only its path is
        # handed back. The context manager guarantees the handle is closed
        # even when pickling fails.
        with tempfile.NamedTemporaryFile(prefix='lookyloo', delete=False) as temp:
            pickle.dump(ct, temp)
        return temp.name, ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url, meta
    except Har2TreeError as e:
        # Chain the original error so the root cause stays in the traceback.
        raise NoValidHarFile(e.message) from e
@ -149,7 +153,8 @@ class Lookyloo():
return self.sanejs.sha512(sha512) return self.sanejs.sha512(sha512)
return {'response': []} return {'response': []}
def scrape(self, url, depth: int=1, listing: bool=True, user_agent: str=None, perma_uuid: str=None): def scrape(self, url, depth: int=1, listing: bool=True, user_agent: str=None, perma_uuid: str=None,
os: str=None, browser: str=None):
if not url.startswith('http'): if not url.startswith('http'):
url = f'http://{url}' url = f'http://{url}'
items = crawl(self.splash_url, url, depth, user_agent=user_agent, log_enabled=True, log_level='INFO') items = crawl(self.splash_url, url, depth, user_agent=user_agent, log_enabled=True, log_level='INFO')
@ -161,8 +166,6 @@ class Lookyloo():
width = len(str(len(items))) width = len(str(len(items)))
dirpath = self.scrape_dir / datetime.now().isoformat() dirpath = self.scrape_dir / datetime.now().isoformat()
dirpath.mkdir() dirpath.mkdir()
if not listing: # Write no_index marker
(dirpath / 'no_index').touch()
for i, item in enumerate(items): for i, item in enumerate(items):
harfile = item['har'] harfile = item['har']
png = base64.b64decode(item['png']) png = base64.b64decode(item['png'])
@ -178,5 +181,15 @@ class Lookyloo():
json.dump(child_frames, f) json.dump(child_frames, f)
with (dirpath / 'uuid').open('w') as f: with (dirpath / 'uuid').open('w') as f:
f.write(perma_uuid) f.write(perma_uuid)
if not listing: # Write no_index marker
(dirpath / 'no_index').touch()
if os or browser:
meta = {}
if os:
meta['os'] = os
if browser:
meta['browser'] = browser
with (dirpath / 'meta').open('w') as f:
json.dump(meta, f)
self._set_report_cache(dirpath) self._set_report_cache(dirpath)
return perma_uuid return perma_uuid

View File

@ -37,9 +37,9 @@ lookyloo = Lookyloo()
# keep # keep
def load_tree(report_dir):
    """Load the tree for ``report_dir`` and remember its pickle in the session.

    The session is wiped first, then the path of the pickled tree returned by
    ``lookyloo.load_tree`` is stored under the ``"tree"`` session key.

    Returns the remaining values unchanged, as a 5-tuple:
    (tree JSON, start time, user agent, root URL, meta).
    """
    session.clear()
    (pickled_tree_path, json_tree, started_at,
     ua_string, root, capture_meta) = lookyloo.load_tree(report_dir)
    session["tree"] = pickled_tree_path
    return json_tree, started_at, ua_string, root, capture_meta
@app.route('/submit', methods=['POST', 'GET']) @app.route('/submit', methods=['POST', 'GET'])
@ -53,7 +53,8 @@ def submit():
def scrape_web(): def scrape_web():
if request.form.get('url'): if request.form.get('url'):
perma_uuid = lookyloo.scrape(url=request.form.get('url'), depth=request.form.get('depth'), perma_uuid = lookyloo.scrape(url=request.form.get('url'), depth=request.form.get('depth'),
listing=request.form.get('listing'), user_agent=request.form.get('user_agent')) listing=request.form.get('listing'), user_agent=request.form.get('user_agent'),
os=request.form.get('os'), browser=request.form.get('browser'))
return redirect(url_for('tree', tree_uuid=perma_uuid)) return redirect(url_for('tree', tree_uuid=perma_uuid))
user_agents = get_user_agents() user_agents = get_user_agents()
user_agents.pop('by_frequency') user_agents.pop('by_frequency')
@ -132,9 +133,10 @@ def tree(tree_uuid):
return redirect(url_for('index')) return redirect(url_for('index'))
try: try:
tree_json, start_time, user_agent, root_url = load_tree(report_dir) tree_json, start_time, user_agent, root_url, meta = load_tree(report_dir)
return render_template('tree.html', tree_json=tree_json, start_time=start_time, return render_template('tree.html', tree_json=tree_json, start_time=start_time,
user_agent=user_agent, root_url=root_url, tree_uuid=tree_uuid) user_agent=user_agent, root_url=root_url, tree_uuid=tree_uuid,
meta=meta)
except NoValidHarFile as e: except NoValidHarFile as e:
return render_template('error.html', error_message=e) return render_template('error.html', error_message=e)

View File

@ -62,6 +62,11 @@
<b>Root URL</b>: {{ root_url }}</br> <b>Root URL</b>: {{ root_url }}</br>
<b>Start time</b>: {{ start_time }}</br> <b>Start time</b>: {{ start_time }}</br>
<b>User Agent</b>: {{ user_agent }}</br> <b>User Agent</b>: {{ user_agent }}</br>
{% if meta %}
{%for k, v in meta.items()%}
<b>{{k.title()}}</b>: {{ v }}</br>
{%endfor%}
{%endif%}
<center><a href="{{ url_for('image', tree_uuid=tree_uuid) }}">Download Image</a></center> <center><a href="{{ url_for('image', tree_uuid=tree_uuid) }}">Download Image</a></center>
</div> </div>