mirror of https://github.com/CIRCL/lookyloo
new: keep track of metadata about OS and Browser when scraping
parent
d951d55367
commit
66545c26a5
|
@ -125,13 +125,17 @@ class Lookyloo():
|
||||||
def load_tree(self, report_dir: Path):
|
def load_tree(self, report_dir: Path):
|
||||||
har_files = sorted(report_dir.glob('*.har'))
|
har_files = sorted(report_dir.glob('*.har'))
|
||||||
try:
|
try:
|
||||||
|
meta = {}
|
||||||
|
if (report_dir / 'meta').exists():
|
||||||
|
with open((report_dir / 'meta'), 'r') as f:
|
||||||
|
meta = json.load(f)
|
||||||
ct = CrawledTree(har_files)
|
ct = CrawledTree(har_files)
|
||||||
ct.find_parents()
|
ct.find_parents()
|
||||||
ct.join_trees()
|
ct.join_trees()
|
||||||
temp = tempfile.NamedTemporaryFile(prefix='lookyloo', delete=False)
|
temp = tempfile.NamedTemporaryFile(prefix='lookyloo', delete=False)
|
||||||
pickle.dump(ct, temp)
|
pickle.dump(ct, temp)
|
||||||
temp.close()
|
temp.close()
|
||||||
return temp.name, ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url
|
return temp.name, ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url, meta
|
||||||
except Har2TreeError as e:
|
except Har2TreeError as e:
|
||||||
raise NoValidHarFile(e.message)
|
raise NoValidHarFile(e.message)
|
||||||
|
|
||||||
|
@ -149,7 +153,8 @@ class Lookyloo():
|
||||||
return self.sanejs.sha512(sha512)
|
return self.sanejs.sha512(sha512)
|
||||||
return {'response': []}
|
return {'response': []}
|
||||||
|
|
||||||
def scrape(self, url, depth: int=1, listing: bool=True, user_agent: str=None, perma_uuid: str=None):
|
def scrape(self, url, depth: int=1, listing: bool=True, user_agent: str=None, perma_uuid: str=None,
|
||||||
|
os: str=None, browser: str=None):
|
||||||
if not url.startswith('http'):
|
if not url.startswith('http'):
|
||||||
url = f'http://{url}'
|
url = f'http://{url}'
|
||||||
items = crawl(self.splash_url, url, depth, user_agent=user_agent, log_enabled=True, log_level='INFO')
|
items = crawl(self.splash_url, url, depth, user_agent=user_agent, log_enabled=True, log_level='INFO')
|
||||||
|
@ -161,8 +166,6 @@ class Lookyloo():
|
||||||
width = len(str(len(items)))
|
width = len(str(len(items)))
|
||||||
dirpath = self.scrape_dir / datetime.now().isoformat()
|
dirpath = self.scrape_dir / datetime.now().isoformat()
|
||||||
dirpath.mkdir()
|
dirpath.mkdir()
|
||||||
if not listing: # Write no_index marker
|
|
||||||
(dirpath / 'no_index').touch()
|
|
||||||
for i, item in enumerate(items):
|
for i, item in enumerate(items):
|
||||||
harfile = item['har']
|
harfile = item['har']
|
||||||
png = base64.b64decode(item['png'])
|
png = base64.b64decode(item['png'])
|
||||||
|
@ -178,5 +181,15 @@ class Lookyloo():
|
||||||
json.dump(child_frames, f)
|
json.dump(child_frames, f)
|
||||||
with (dirpath / 'uuid').open('w') as f:
|
with (dirpath / 'uuid').open('w') as f:
|
||||||
f.write(perma_uuid)
|
f.write(perma_uuid)
|
||||||
|
if not listing: # Write no_index marker
|
||||||
|
(dirpath / 'no_index').touch()
|
||||||
|
if os or browser:
|
||||||
|
meta = {}
|
||||||
|
if os:
|
||||||
|
meta['os'] = os
|
||||||
|
if browser:
|
||||||
|
meta['browser'] = browser
|
||||||
|
with (dirpath / 'meta').open('w') as f:
|
||||||
|
json.dump(meta, f)
|
||||||
self._set_report_cache(dirpath)
|
self._set_report_cache(dirpath)
|
||||||
return perma_uuid
|
return perma_uuid
|
||||||
|
|
|
@ -37,9 +37,9 @@ lookyloo = Lookyloo()
|
||||||
# keep
|
# keep
|
||||||
def load_tree(report_dir):
|
def load_tree(report_dir):
|
||||||
session.clear()
|
session.clear()
|
||||||
temp_file_name, tree_json, tree_time, tree_ua, tree_root_url = lookyloo.load_tree(report_dir)
|
temp_file_name, tree_json, tree_time, tree_ua, tree_root_url, meta = lookyloo.load_tree(report_dir)
|
||||||
session["tree"] = temp_file_name
|
session["tree"] = temp_file_name
|
||||||
return tree_json, tree_time, tree_ua, tree_root_url
|
return tree_json, tree_time, tree_ua, tree_root_url, meta
|
||||||
|
|
||||||
|
|
||||||
@app.route('/submit', methods=['POST', 'GET'])
|
@app.route('/submit', methods=['POST', 'GET'])
|
||||||
|
@ -53,7 +53,8 @@ def submit():
|
||||||
def scrape_web():
|
def scrape_web():
|
||||||
if request.form.get('url'):
|
if request.form.get('url'):
|
||||||
perma_uuid = lookyloo.scrape(url=request.form.get('url'), depth=request.form.get('depth'),
|
perma_uuid = lookyloo.scrape(url=request.form.get('url'), depth=request.form.get('depth'),
|
||||||
listing=request.form.get('listing'), user_agent=request.form.get('user_agent'))
|
listing=request.form.get('listing'), user_agent=request.form.get('user_agent'),
|
||||||
|
os=request.form.get('os'), browser=request.form.get('browser'))
|
||||||
return redirect(url_for('tree', tree_uuid=perma_uuid))
|
return redirect(url_for('tree', tree_uuid=perma_uuid))
|
||||||
user_agents = get_user_agents()
|
user_agents = get_user_agents()
|
||||||
user_agents.pop('by_frequency')
|
user_agents.pop('by_frequency')
|
||||||
|
@ -132,9 +133,10 @@ def tree(tree_uuid):
|
||||||
return redirect(url_for('index'))
|
return redirect(url_for('index'))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
tree_json, start_time, user_agent, root_url = load_tree(report_dir)
|
tree_json, start_time, user_agent, root_url, meta = load_tree(report_dir)
|
||||||
return render_template('tree.html', tree_json=tree_json, start_time=start_time,
|
return render_template('tree.html', tree_json=tree_json, start_time=start_time,
|
||||||
user_agent=user_agent, root_url=root_url, tree_uuid=tree_uuid)
|
user_agent=user_agent, root_url=root_url, tree_uuid=tree_uuid,
|
||||||
|
meta=meta)
|
||||||
except NoValidHarFile as e:
|
except NoValidHarFile as e:
|
||||||
return render_template('error.html', error_message=e)
|
return render_template('error.html', error_message=e)
|
||||||
|
|
||||||
|
|
|
@ -62,6 +62,11 @@
|
||||||
<b>Root URL</b>: {{ root_url }}</br>
|
<b>Root URL</b>: {{ root_url }}</br>
|
||||||
<b>Start time</b>: {{ start_time }}</br>
|
<b>Start time</b>: {{ start_time }}</br>
|
||||||
<b>User Agent</b>: {{ user_agent }}</br>
|
<b>User Agent</b>: {{ user_agent }}</br>
|
||||||
|
{% if meta %}
|
||||||
|
{%for k, v in meta.items()%}
|
||||||
|
<b>{{k.title()}}</b>: {{ v }}</br>
|
||||||
|
{%endfor%}
|
||||||
|
{%endif%}
|
||||||
<center><a href="{{ url_for('image', tree_uuid=tree_uuid) }}">Download Image</a></center>
|
<center><a href="{{ url_for('image', tree_uuid=tree_uuid) }}">Download Image</a></center>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue