mirror of https://github.com/CIRCL/lookyloo
				
				
				
			new: keep track of metadata about OS and Browser when scraping
							parent
							
								
									d951d55367
								
							
						
					
					
						commit
						66545c26a5
					
				|  | @ -125,13 +125,17 @@ class Lookyloo(): | |||
|     def load_tree(self, report_dir: Path): | ||||
|         har_files = sorted(report_dir.glob('*.har')) | ||||
|         try: | ||||
|+            meta = {} | ||||
|+            if (report_dir / 'meta').exists(): | ||||
|+                with open((report_dir / 'meta'), 'r') as f: | ||||
|+                    meta = json.load(f) | ||||
|             ct = CrawledTree(har_files) | ||||
|             ct.find_parents() | ||||
|             ct.join_trees() | ||||
|             temp = tempfile.NamedTemporaryFile(prefix='lookyloo', delete=False) | ||||
|             pickle.dump(ct, temp) | ||||
|             temp.close() | ||||
|-            return temp.name, ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url | ||||
|+            return temp.name, ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url, meta | ||||
|         except Har2TreeError as e: | ||||
|             raise NoValidHarFile(e.message) | ||||
| 
 | ||||
|  | @ -149,7 +153,8 @@ class Lookyloo(): | |||
|             return self.sanejs.sha512(sha512) | ||||
|         return {'response': []} | ||||
| 
 | ||||
|-    def scrape(self, url, depth: int=1, listing: bool=True, user_agent: str=None, perma_uuid: str=None): | ||||
|+    def scrape(self, url, depth: int=1, listing: bool=True, user_agent: str=None, perma_uuid: str=None, | ||||
|+               os: str=None, browser: str=None): | ||||
|         if not url.startswith('http'): | ||||
|             url = f'http://{url}' | ||||
|         items = crawl(self.splash_url, url, depth, user_agent=user_agent, log_enabled=True, log_level='INFO') | ||||
|  | @ -161,8 +166,6 @@ class Lookyloo(): | |||
|         width = len(str(len(items))) | ||||
|         dirpath = self.scrape_dir / datetime.now().isoformat() | ||||
|         dirpath.mkdir() | ||||
|-        if not listing:  # Write no_index marker | ||||
|-            (dirpath / 'no_index').touch() | ||||
|         for i, item in enumerate(items): | ||||
|             harfile = item['har'] | ||||
|             png = base64.b64decode(item['png']) | ||||
|  | @ -178,5 +181,15 @@ class Lookyloo(): | |||
|                 json.dump(child_frames, f) | ||||
|             with (dirpath / 'uuid').open('w') as f: | ||||
|                 f.write(perma_uuid) | ||||
|+            if not listing:  # Write no_index marker | ||||
|+                (dirpath / 'no_index').touch() | ||||
|+            if os or browser: | ||||
|+                meta = {} | ||||
|+                if os: | ||||
|+                    meta['os'] = os | ||||
|+                if browser: | ||||
|+                    meta['browser'] = browser | ||||
|+                with (dirpath / 'meta').open('w') as f: | ||||
|+                    json.dump(meta, f) | ||||
|         self._set_report_cache(dirpath) | ||||
|         return perma_uuid | ||||
|  |  | |||
|  | @ -37,9 +37,9 @@ lookyloo = Lookyloo() | |||
| # keep | ||||
| def load_tree(report_dir): | ||||
|     session.clear() | ||||
|-    temp_file_name, tree_json, tree_time, tree_ua, tree_root_url = lookyloo.load_tree(report_dir) | ||||
|+    temp_file_name, tree_json, tree_time, tree_ua, tree_root_url, meta = lookyloo.load_tree(report_dir) | ||||
|     session["tree"] = temp_file_name | ||||
|-    return tree_json, tree_time, tree_ua, tree_root_url | ||||
|+    return tree_json, tree_time, tree_ua, tree_root_url, meta | ||||
| 
 | ||||
| 
 | ||||
| @app.route('/submit', methods=['POST', 'GET']) | ||||
|  | @ -53,7 +53,8 @@ def submit(): | |||
| def scrape_web(): | ||||
|     if request.form.get('url'): | ||||
|         perma_uuid = lookyloo.scrape(url=request.form.get('url'), depth=request.form.get('depth'), | ||||
|-                                     listing=request.form.get('listing'), user_agent=request.form.get('user_agent')) | ||||
|+                                     listing=request.form.get('listing'), user_agent=request.form.get('user_agent'), | ||||
|+                                     os=request.form.get('os'), browser=request.form.get('browser')) | ||||
|         return redirect(url_for('tree', tree_uuid=perma_uuid)) | ||||
|     user_agents = get_user_agents() | ||||
|     user_agents.pop('by_frequency') | ||||
|  | @ -132,9 +133,10 @@ def tree(tree_uuid): | |||
|         return redirect(url_for('index')) | ||||
| 
 | ||||
|     try: | ||||
|-        tree_json, start_time, user_agent, root_url = load_tree(report_dir) | ||||
|+        tree_json, start_time, user_agent, root_url, meta = load_tree(report_dir) | ||||
|         return render_template('tree.html', tree_json=tree_json, start_time=start_time, | ||||
|-                               user_agent=user_agent, root_url=root_url, tree_uuid=tree_uuid) | ||||
|+                               user_agent=user_agent, root_url=root_url, tree_uuid=tree_uuid, | ||||
|+                               meta=meta) | ||||
|     except NoValidHarFile as e: | ||||
|         return render_template('error.html', error_message=e) | ||||
| 
 | ||||
|  |  | |||
|  | @ -62,6 +62,11 @@ | |||
|     <b>Root URL</b>: {{ root_url }}</br> | ||||
|     <b>Start time</b>: {{ start_time }}</br> | ||||
|     <b>User Agent</b>: {{ user_agent }}</br> | ||||
|+    {% if meta %} | ||||
|+        {%for k, v in meta.items()%} | ||||
|+        <b>{{k.title()}}</b>: {{ v }}</br> | ||||
|+        {%endfor%} | ||||
|+    {%endif%} | ||||
|     <center><a href="{{ url_for('image', tree_uuid=tree_uuid) }}">Download Image</a></center> | ||||
| </div> | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	 Raphaël Vinot
						Raphaël Vinot