new: keep track of metadata about OS and Browser when scraping

pull/42/head
Raphaël Vinot 2019-04-07 23:54:16 +02:00
parent d951d55367
commit 66545c26a5
3 changed files with 29 additions and 9 deletions

View File

@ -125,13 +125,17 @@ class Lookyloo():
def load_tree(self, report_dir: Path):
    """Build the crawled tree for a report directory and pickle it to disk.

    Collects every ``*.har`` file in *report_dir* (sorted by path), builds a
    ``CrawledTree`` from them and dumps it with ``pickle`` into a temporary
    file prefixed ``lookyloo``. If *report_dir* contains a ``meta`` file it
    is parsed as JSON and returned alongside the tree details.

    Args:
        report_dir: Directory holding the HAR files (and optional ``meta``
            file) of one report.

    Returns:
        A 6-tuple ``(temp_file_name, tree_json, start_time_iso, user_agent,
        root_url, meta)``. ``meta`` is an empty dict when no ``meta`` file
        exists.

    Raises:
        NoValidHarFile: If building the tree raises ``Har2TreeError``.
    """
    har_files = sorted(report_dir.glob('*.har'))
    try:
        meta = {}
        meta_file = report_dir / 'meta'
        if meta_file.exists():
            with meta_file.open('r') as f:
                meta = json.load(f)
        ct = CrawledTree(har_files)
        ct.find_parents()
        ct.join_trees()
        # delete=False: the pickle must outlive this call, only its name is
        # handed back. The context manager guarantees the handle is closed
        # even if pickling fails.
        with tempfile.NamedTemporaryFile(prefix='lookyloo', delete=False) as temp:
            pickle.dump(ct, temp)
        return (temp.name, ct.to_json(), ct.start_time.isoformat(),
                ct.user_agent, ct.root_url, meta)
    except Har2TreeError as e:
        # Keep the original error attached as the cause for debugging.
        raise NoValidHarFile(e.message) from e
@ -149,7 +153,8 @@ class Lookyloo():
return self.sanejs.sha512(sha512)
return {'response': []}
def scrape(self, url, depth: int=1, listing: bool=True, user_agent: str=None, perma_uuid: str=None):
def scrape(self, url, depth: int=1, listing: bool=True, user_agent: str=None, perma_uuid: str=None,
os: str=None, browser: str=None):
if not url.startswith('http'):
url = f'http://{url}'
items = crawl(self.splash_url, url, depth, user_agent=user_agent, log_enabled=True, log_level='INFO')
@ -161,8 +166,6 @@ class Lookyloo():
width = len(str(len(items)))
dirpath = self.scrape_dir / datetime.now().isoformat()
dirpath.mkdir()
if not listing: # Write no_index marker
(dirpath / 'no_index').touch()
for i, item in enumerate(items):
harfile = item['har']
png = base64.b64decode(item['png'])
@ -178,5 +181,15 @@ class Lookyloo():
json.dump(child_frames, f)
with (dirpath / 'uuid').open('w') as f:
f.write(perma_uuid)
if not listing: # Write no_index marker
(dirpath / 'no_index').touch()
if os or browser:
meta = {}
if os:
meta['os'] = os
if browser:
meta['browser'] = browser
with (dirpath / 'meta').open('w') as f:
json.dump(meta, f)
self._set_report_cache(dirpath)
return perma_uuid

View File

@ -37,9 +37,9 @@ lookyloo = Lookyloo()
# keep
def load_tree(report_dir):
    """Load the tree for *report_dir* and remember its pickle in the session.

    Clears the current ``session``, asks ``lookyloo.load_tree`` to build the
    tree, and stores the returned temporary file name under
    ``session["tree"]``.

    Args:
        report_dir: Report directory passed straight to ``lookyloo.load_tree``.

    Returns:
        A 5-tuple ``(tree_json, tree_time, tree_ua, tree_root_url, meta)``:
        everything ``lookyloo.load_tree`` returned except the temp file name.
    """
    session.clear()
    (temp_file_name, tree_json, tree_time,
     tree_ua, tree_root_url, meta) = lookyloo.load_tree(report_dir)
    # Only the file name is kept in the session, not the tree itself.
    session["tree"] = temp_file_name
    return tree_json, tree_time, tree_ua, tree_root_url, meta
@app.route('/submit', methods=['POST', 'GET'])
@ -53,7 +53,8 @@ def submit():
def scrape_web():
if request.form.get('url'):
perma_uuid = lookyloo.scrape(url=request.form.get('url'), depth=request.form.get('depth'),
listing=request.form.get('listing'), user_agent=request.form.get('user_agent'))
listing=request.form.get('listing'), user_agent=request.form.get('user_agent'),
os=request.form.get('os'), browser=request.form.get('browser'))
return redirect(url_for('tree', tree_uuid=perma_uuid))
user_agents = get_user_agents()
user_agents.pop('by_frequency')
@ -132,9 +133,10 @@ def tree(tree_uuid):
return redirect(url_for('index'))
try:
tree_json, start_time, user_agent, root_url = load_tree(report_dir)
tree_json, start_time, user_agent, root_url, meta = load_tree(report_dir)
return render_template('tree.html', tree_json=tree_json, start_time=start_time,
user_agent=user_agent, root_url=root_url, tree_uuid=tree_uuid)
user_agent=user_agent, root_url=root_url, tree_uuid=tree_uuid,
meta=meta)
except NoValidHarFile as e:
return render_template('error.html', error_message=e)

View File

@ -62,6 +62,11 @@
<b>Root URL</b>: {{ root_url }}</br>
<b>Start time</b>: {{ start_time }}</br>
<b>User Agent</b>: {{ user_agent }}</br>
{% if meta %}
{%for k, v in meta.items()%}
<b>{{k.title()}}</b>: {{ v }}</br>
{%endfor%}
{%endif%}
<center><a href="{{ url_for('image', tree_uuid=tree_uuid) }}">Download Image</a></center>
</div>