mirror of https://github.com/CIRCL/lookyloo
new: keep track of metadata about OS and Browser when scraping
parent
d951d55367
commit
66545c26a5
|
@ -125,13 +125,17 @@ class Lookyloo():
|
|||
def load_tree(self, report_dir: Path):
|
||||
har_files = sorted(report_dir.glob('*.har'))
|
||||
try:
|
||||
meta = {}
|
||||
if (report_dir / 'meta').exists():
|
||||
with open((report_dir / 'meta'), 'r') as f:
|
||||
meta = json.load(f)
|
||||
ct = CrawledTree(har_files)
|
||||
ct.find_parents()
|
||||
ct.join_trees()
|
||||
temp = tempfile.NamedTemporaryFile(prefix='lookyloo', delete=False)
|
||||
pickle.dump(ct, temp)
|
||||
temp.close()
|
||||
return temp.name, ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url
|
||||
return temp.name, ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url, meta
|
||||
except Har2TreeError as e:
|
||||
raise NoValidHarFile(e.message)
|
||||
|
||||
|
@ -149,7 +153,8 @@ class Lookyloo():
|
|||
return self.sanejs.sha512(sha512)
|
||||
return {'response': []}
|
||||
|
||||
def scrape(self, url, depth: int=1, listing: bool=True, user_agent: str=None, perma_uuid: str=None):
|
||||
def scrape(self, url, depth: int=1, listing: bool=True, user_agent: str=None, perma_uuid: str=None,
|
||||
os: str=None, browser: str=None):
|
||||
if not url.startswith('http'):
|
||||
url = f'http://{url}'
|
||||
items = crawl(self.splash_url, url, depth, user_agent=user_agent, log_enabled=True, log_level='INFO')
|
||||
|
@ -161,8 +166,6 @@ class Lookyloo():
|
|||
width = len(str(len(items)))
|
||||
dirpath = self.scrape_dir / datetime.now().isoformat()
|
||||
dirpath.mkdir()
|
||||
if not listing: # Write no_index marker
|
||||
(dirpath / 'no_index').touch()
|
||||
for i, item in enumerate(items):
|
||||
harfile = item['har']
|
||||
png = base64.b64decode(item['png'])
|
||||
|
@ -178,5 +181,15 @@ class Lookyloo():
|
|||
json.dump(child_frames, f)
|
||||
with (dirpath / 'uuid').open('w') as f:
|
||||
f.write(perma_uuid)
|
||||
if not listing: # Write no_index marker
|
||||
(dirpath / 'no_index').touch()
|
||||
if os or browser:
|
||||
meta = {}
|
||||
if os:
|
||||
meta['os'] = os
|
||||
if browser:
|
||||
meta['browser'] = browser
|
||||
with (dirpath / 'meta').open('w') as f:
|
||||
json.dump(meta, f)
|
||||
self._set_report_cache(dirpath)
|
||||
return perma_uuid
|
||||
|
|
|
@ -37,9 +37,9 @@ lookyloo = Lookyloo()
|
|||
# keep
|
||||
def load_tree(report_dir):
|
||||
session.clear()
|
||||
temp_file_name, tree_json, tree_time, tree_ua, tree_root_url = lookyloo.load_tree(report_dir)
|
||||
temp_file_name, tree_json, tree_time, tree_ua, tree_root_url, meta = lookyloo.load_tree(report_dir)
|
||||
session["tree"] = temp_file_name
|
||||
return tree_json, tree_time, tree_ua, tree_root_url
|
||||
return tree_json, tree_time, tree_ua, tree_root_url, meta
|
||||
|
||||
|
||||
@app.route('/submit', methods=['POST', 'GET'])
|
||||
|
@ -53,7 +53,8 @@ def submit():
|
|||
def scrape_web():
|
||||
if request.form.get('url'):
|
||||
perma_uuid = lookyloo.scrape(url=request.form.get('url'), depth=request.form.get('depth'),
|
||||
listing=request.form.get('listing'), user_agent=request.form.get('user_agent'))
|
||||
listing=request.form.get('listing'), user_agent=request.form.get('user_agent'),
|
||||
os=request.form.get('os'), browser=request.form.get('browser'))
|
||||
return redirect(url_for('tree', tree_uuid=perma_uuid))
|
||||
user_agents = get_user_agents()
|
||||
user_agents.pop('by_frequency')
|
||||
|
@ -132,9 +133,10 @@ def tree(tree_uuid):
|
|||
return redirect(url_for('index'))
|
||||
|
||||
try:
|
||||
tree_json, start_time, user_agent, root_url = load_tree(report_dir)
|
||||
tree_json, start_time, user_agent, root_url, meta = load_tree(report_dir)
|
||||
return render_template('tree.html', tree_json=tree_json, start_time=start_time,
|
||||
user_agent=user_agent, root_url=root_url, tree_uuid=tree_uuid)
|
||||
user_agent=user_agent, root_url=root_url, tree_uuid=tree_uuid,
|
||||
meta=meta)
|
||||
except NoValidHarFile as e:
|
||||
return render_template('error.html', error_message=e)
|
||||
|
||||
|
|
|
@ -62,6 +62,11 @@
|
|||
<b>Root URL</b>: {{ root_url }}</br>
|
||||
<b>Start time</b>: {{ start_time }}</br>
|
||||
<b>User Agent</b>: {{ user_agent }}</br>
|
||||
{% if meta %}
|
||||
{%for k, v in meta.items()%}
|
||||
<b>{{k.title()}}</b>: {{ v }}</br>
|
||||
{%endfor%}
|
||||
{%endif%}
|
||||
<center><a href="{{ url_for('image', tree_uuid=tree_uuid) }}">Download Image</a></center>
|
||||
</div>
|
||||
|
||||
|
|
Loading…
Reference in New Issue