mirror of https://github.com/CIRCL/AIL-framework

chg: [crawler] submit free text of urls to crawl

commit 1505bf0157 (parent 9d26a47c17)
@@ -39,6 +39,7 @@ from packages import git_status
 from packages import Date
 from lib import ail_orgs
 from lib.ConfigLoader import ConfigLoader
+from lib.regex_helper import regex_findall
 from lib.objects.Domains import Domain
 from lib.objects.Titles import Title
 from lib.objects import HHHashs
@@ -183,6 +184,19 @@ def unpack_url(url):
     url_decoded['url'] = url.replace(url_decoded['host'], url_decoded['host'].lower(), 1)
     return url_decoded
 
+# TODO options to only extract domains
+# TODO extract onions
+def extract_url_from_text(content):
+    urls = []
+    r_url = r"(?:(?:https?|ftp):\/\/)?(?:\S+(?::\S*)?@)?(?:\[(?:(?:[A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4}|(?:[A-Fa-f0-9]{1,4}:){1,7}:|(?:[A-Fa-f0-9]{1,4}:){1,6}:[A-Fa-f0-9]{1,4}|::(?:[A-Fa-f0-9]{1,4}:){0,5}[A-Fa-f0-9]{1,4}|(?:[A-Fa-f0-9]{1,4}:){1,5}::(?:[A-Fa-f0-9]{1,4})?|(?:[A-Fa-f0-9]{1,4}:){1,4}::(?:[A-Fa-f0-9]{1,4}:){0,1}[A-Fa-f0-9]{1,4}|(?:[A-Fa-f0-9]{1,3}:){1}::(?:[A-Fa-f0-9]{1,4}:){0,2}[A-Fa-f0-9]{1,4}|(?:[A-Fa-f0-9]{1,2}:){1}::(?:[A-Fa-f0-9]{1,4}:){0,3}[A-Fa-f0-9]{1,4}|[A-Fa-f0-9]{1,4}::(?:[A-Fa-f0-9]{1,4}:){0,4}[A-Fa-f0-9]{1,4}|::(?:[A-Fa-f0-9]{1,4}:){0,5}[A-Fa-f0-9]{1,4}|fe80:(?:[A-Fa-f0-9]{0,4}:){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9])?[0-9])\.){3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9])?[0-9]))\]|(?:(?:25[0-5]|2[0-4]\d|1\d\d|\d{1,2})\.){3}(?:25[0-5]|2[0-4]\d|1\d\d|\d{1,2})|(?:(?:[a-zA-Z0-9\-]+\.)+[a-zA-Z]{2,}))(?::\d{2,5})?(?:\/[^\s]*)?"
+    for url in regex_findall('extract_url_from_text', gen_uuid(), r_url, 'user_id', content, max_time=10):
+        urls.append(url)
+        # check if onions
+    return urls
+    # extract onions
+    # extract IP
+
+
 # # # # # # # #
 #             #
 #   FAVICON   # TODO REWRITE ME
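
For reference, a standalone sketch of what `extract_url_from_text` matches. AIL's `regex_findall` runs the pattern in a helper with a `max_time` cap; the sketch below is a simplification that drops that wrapper plus the IPv4/IPv6/credentials branches of `r_url`, keeping only the scheme + hostname + port + path branch:

```python
# Simplified sketch, not the committed code: plain re.findall instead of
# AIL's timeout-guarded regex_findall, and only the domain branch of r_url.
import re

R_URL = r"(?:(?:https?|ftp)://)?(?:(?:[a-zA-Z0-9\-]+\.)+[a-zA-Z]{2,})(?::\d{2,5})?(?:/[^\s]*)?"

text = "ping http://example.onion/login and https://www.circl.lu, mirror at ftp://files.example.com/pub"
print(re.findall(R_URL, text))
# ['http://example.onion/login', 'https://www.circl.lu', 'ftp://files.example.com/pub']
```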
@@ -1828,8 +1842,9 @@ def create_task(url, depth=1, har=True, screenshot=True, header=None, cookiejar=
 
 def api_parse_task_dict_basic(data, user_id):
     url = data.get('url', None)
-    if not url or url == '\n':
-        return {'status': 'error', 'reason': 'No url supplied'}, 400
+    urls = data.get('urls', None)
+    if (not url or url == '\n') and not urls:
+        return {'status': 'error', 'reason': 'No url(s) supplied'}, 400
 
     screenshot = data.get('screenshot', False)
     if screenshot:
@@ -1863,14 +1878,20 @@ def api_parse_task_dict_basic(data, user_id):
 
     tags = data.get('tags', [])
 
-    return {'url': url, 'depth_limit': depth_limit, 'har': har, 'screenshot': screenshot, 'proxy': proxy, 'tags': tags}, 200
+    data = {'depth_limit': depth_limit, 'har': har, 'screenshot': screenshot, 'proxy': proxy, 'tags': tags}
+    if url:
+        data['url'] = url
+    elif urls:
+        data['urls'] = urls
+    return data, 200
 
 def api_add_crawler_task(data, user_org, user_id=None):
     task, resp = api_parse_task_dict_basic(data, user_id)
     if resp != 200:
         return task, resp
 
-    url = task['url']
+    url = task.get('url')
+    urls = task.get('urls')
     screenshot = task['screenshot']
     har = task['har']
     depth_limit = task['depth_limit']
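
The parser now accepts either a single `url` or a `urls` list. Note the precedence: when both keys are present, only `url` survives, because the `elif urls:` branch is skipped. Illustrative payloads (values are made up, not from the commit):

```python
# Hypothetical inputs to api_parse_task_dict_basic after this change:
single = {'url': 'http://example.onion', 'depth_limit': 1, 'screenshot': True}
multi = {'urls': ['http://a.example', 'http://b.example'], 'har': True, 'tags': []}
both = {'url': 'http://a.example', 'urls': ['http://b.example']}  # 'urls' is dropped
```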
@@ -1920,17 +1941,22 @@
                 if max(months, weeks, days, hours, minutes) <= 0:
                     return {'error': 'Invalid frequency'}, 400
                 frequency = f'{months}:{weeks}:{days}:{hours}:{minutes}'
 
-    if frequency:
-        # TODO verify user
-        task_uuid = create_schedule(frequency, user_id, url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
-                                    cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None, tags=tags)
-    else:
-        # TODO HEADERS
-        # TODO USER AGENT
-        task_uuid = create_task(url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
-                                cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None, tags=tags,
-                                parent='manual', priority=90)
+    if url:
+        if frequency:
+            # TODO verify user
+            task_uuid = create_schedule(frequency, user_id, url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
+                                        cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None, tags=tags)
+        else:
+            # TODO HEADERS
+            # TODO USER AGENT
+            task_uuid = create_task(url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
+                                    cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None, tags=tags,
+                                    parent='manual', priority=90)
+    elif urls:
+        for url in urls:
+            task_uuid = create_task(url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
+                                    cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None, tags=tags,
+                                    parent='manual', priority=90)
 
     return {'uuid': task_uuid}, 200
 
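Reviewer note: in the `elif urls:` branch each iteration overwrites `task_uuid`, so the response carries only the uuid of the last task created, and the `frequency` schedule path is never taken for multi-url submissions. If every uuid were wanted, one hypothetical variant (not part of this commit) could collect them:

```python
# Hypothetical alternative: return one uuid per submitted url.
task_uuids = []
for url in urls:
    task_uuids.append(create_task(url, depth=depth_limit, har=har, screenshot=screenshot,
                                  header=None, cookiejar=cookiejar_uuid, proxy=proxy,
                                  user_agent=None, tags=tags, parent='manual', priority=90))
return {'uuids': task_uuids}, 200
```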
@@ -7,7 +7,6 @@ Regex Helper
 
 import os
 import logging.config
-import phonenumbers
 import re
 import sys
 import uuid
@@ -20,7 +19,6 @@ sys.path.append(os.environ['AIL_BIN'])
 ##################################
 from lib import ail_logger
 from lib import ConfigLoader
-# from lib import Statistics
 
 logging.config.dictConfig(ail_logger.get_config())
 logger = logging.getLogger()
@@ -171,6 +169,7 @@ def regex_search(r_key, regex, item_id, content, max_time=30):
 
 ## Phone Regexs ##
 def _regex_phone_iter(r_key, country_code, content):
+    import phonenumbers
     iterator = phonenumbers.PhoneNumberMatcher(content, country_code)
     for match in iterator:
         value = match.raw_string
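
Moving `import phonenumbers` from module scope into `_regex_phone_iter` defers loading the fairly heavy dependency until a phone lookup actually runs, so every other consumer of `regex_helper` imports faster. A minimal standalone sketch of the same deferred-import pattern (the function name and `'FR'` region default are illustrative, not from the commit):

```python
def find_phone_numbers(content, country_code='FR'):
    import phonenumbers  # paid once on first call, cached in sys.modules afterwards
    return [m.raw_string for m in phonenumbers.PhoneNumberMatcher(content, country_code)]

print(find_phone_numbers('call +33 1 23 45 67 89'))  # ['+33 1 23 45 67 89']
```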
@@ -122,6 +122,20 @@ def send_to_spider():
 
     # POST val
     url = request.form.get('url_to_crawl')
+    urls = request.form.get('urls_to_crawl')
+    if urls:
+        urls = crawlers.extract_url_from_text(urls)
+        l_cookiejar = crawlers.api_get_cookiejars_selector(user_org, user_id)
+        crawlers_types = crawlers.get_crawler_all_types()
+        proxies = []  # TODO HANDLE PROXIES
+        return render_template("crawler_manual.html", urls=urls,
+                               is_manager_connected=crawlers.get_lacus_connection_metadata(),
+                               crawlers_types=crawlers_types,
+                               proxies=proxies,
+                               l_cookiejar=l_cookiejar,
+                               tags_selector_data=Tag.get_tags_selector_data())
+
+    urls = request.form.getlist('urls')
     crawler_type = request.form.get('crawler_queue_type')
     screenshot = request.form.get('screenshot')
     har = request.form.get('har')
@@ -185,7 +199,11 @@ def send_to_spider():
             cookiejar_uuid = cookiejar_uuid.rsplit(':')
             cookiejar_uuid = cookiejar_uuid[-1].replace(' ', '')
 
-    data = {'url': url, 'depth': depth_limit, 'har': har, 'screenshot': screenshot, 'frequency': frequency}
+    data = {'depth': depth_limit, 'har': har, 'screenshot': screenshot, 'frequency': frequency}
+    if url:
+        data['url'] = url
+    if urls:
+        data['urls'] = urls
     if proxy:
         data['proxy'] = proxy
     if cookiejar_uuid:
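
Taken together with the template change below, the route now supports a two-step flow: free text posted as `urls_to_crawl` is expanded by `extract_url_from_text` and rendered back as an editable list, and the confirmed list is resubmitted as repeated `urls` fields read via `getlist`. A hedged sketch of that round trip (base URL, route path, and the extra form fields are assumptions, not taken from the diff):

```python
import requests

s = requests.Session()            # assumes an already-authenticated AIL session
BASE = 'https://127.0.0.1:7000'   # assumed default AIL web endpoint

# Step 1: free text -> server extracts URLs and re-renders the form for review.
s.post(f'{BASE}/crawlers/send_to_spider',
       data={'urls_to_crawl': 'see http://a.example and http://b.example'}, verify=False)

# Step 2: the reviewed list comes back as repeated "urls" form fields.
s.post(f'{BASE}/crawlers/send_to_spider',
       data=[('urls', 'http://a.example'), ('urls', 'http://b.example'),
             ('crawler_queue_type', 'web'), ('depth_limit', '1')], verify=False)
```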
@@ -43,9 +43,28 @@
 								<form action="{{ url_for('crawler_splash.send_to_spider') }}" method='post'>
 									<div class="row">
 										<div class="col-12 col-lg-6">
-											<div class="input-group" id="date-range-from">
-												<input type="text" class="form-control" id="url_to_crawl" name="url_to_crawl" placeholder="Address or Domain">
-											</div>
+                                            {% if urls %}
+                                                {% for url in urls %}
+                                                    <div class="input-group mb-1">
+                                                        <input type="text" class="form-control col-10" name="urls" value="{{ url }}">
+                                                        <span class="btn btn-danger col-1" onclick="$(this).parent().remove();"><i class="fas fa-trash-alt"></i></span>
+                                                    </div>
+                                                {% endfor %}
+
+                                            {% else %}
+                                                <div class="input-group" id="single_urls">
+                                                    <input type="text" class="form-control" id="url_to_crawl" name="url_to_crawl" placeholder="Address or Domain">
+                                                    <div class="input-group-append">
+                                                        <button class="btn btn-secondary" type="button" onclick="btn_multiple_urls()"><i class="fa fa-plus"></i> Multiple Urls</button>
+                                                    </div>
+                                                </div>
+                                                <div class="input-group" id="multiple_urls">
+                                                    <textarea class="form-control" id="urls_to_crawl" name="urls_to_crawl" rows="3" placeholder="List Of Urls or Free Text"></textarea>
+                                                    <div class="input-group-append">
+                                                        <button class="btn btn-secondary" type="button" onclick="btn_single_url()"><i class="fa fa-minus"></i> One Url</button>
+                                                    </div>
+                                                </div>
+                                            {% endif %}
 											<div class="d-flex mt-2">
 												<i class="fas fa-spider mt-1"></i>  Crawler Type  
 												<div class="custom-control custom-switch">
@@ -221,6 +240,7 @@ $(document).ready(function(){
 	queue_type_selector_input_controler()
 	manual_crawler_input_controler();
     $("#custom_frequency").hide();
+    $("#multiple_urls").hide();
 
 	$('#crawler_scheduler').on("change", function () {
 		manual_crawler_input_controler();
@@ -245,6 +265,16 @@ function toggle_sidebar(){
 	}
 }
 
+function btn_single_url() {
+    $("#multiple_urls").hide();
+    $("#single_urls").show();
+}
+
+function btn_multiple_urls() {
+    $("#single_urls").hide();
+    $("#multiple_urls").show();
+}
+
 function manual_crawler_input_controler() {
 	if($('#crawler_scheduler').is(':checked')){
 		$("#frequency_inputs").show();