From 68dffcd26b53229525ffe649526e3cef669dbcef Mon Sep 17 00:00:00 2001 From: Terrtia Date: Tue, 25 Jul 2023 15:57:11 +0200 Subject: [PATCH] chg: [api crawler] fix response + add cookiejar, proxy and frequency parameters --- bin/lib/crawlers.py | 10 +- bin/modules/Onion.py | 8 +- tools/crawler_add_task.py | 120 +++++++++++++++++++++++ var/www/modules/restApi/Flask_restApi.py | 5 +- 4 files changed, 134 insertions(+), 9 deletions(-) create mode 100755 tools/crawler_add_task.py diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py index 571af921..3e61ed88 100755 --- a/bin/lib/crawlers.py +++ b/bin/lib/crawlers.py @@ -1723,14 +1723,16 @@ def api_add_crawler_task(data, user_id=None): if frequency: # TODO verify user - return create_schedule(frequency, user_id, url, depth=depth_limit, har=har, screenshot=screenshot, header=None, - cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None, tags=tags), 200 + task_uuid = create_schedule(frequency, user_id, url, depth=depth_limit, har=har, screenshot=screenshot, header=None, + cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None, tags=tags) else: # TODO HEADERS # TODO USER AGENT - return create_task(url, depth=depth_limit, har=har, screenshot=screenshot, header=None, + task_uuid = create_task(url, depth=depth_limit, har=har, screenshot=screenshot, header=None, cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None, tags=tags, - parent='manual', priority=90), 200 + parent='manual', priority=90) + + return {'uuid': task_uuid}, 200 #### #### diff --git a/bin/modules/Onion.py b/bin/modules/Onion.py index 5e76d0fa..303b5bdc 100755 --- a/bin/modules/Onion.py +++ b/bin/modules/Onion.py @@ -42,7 +42,8 @@ class Onion(AbstractModule): self.faup = crawlers.get_faup() # activate_crawler = p.config.get("Crawler", "activate_crawler") - + self.har = config_loader.get_config_boolean('Crawler', 'default_har') + self.screenshot = config_loader.get_config_boolean('Crawler', 'default_screenshot') self.onion_regex = r"((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)" # self.i2p_regex = r"((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)" @@ -90,8 +91,9 @@ class Onion(AbstractModule): if onion_urls: if crawlers.is_crawler_activated(): - for domain in domains: # TODO LOAD DEFAULT SCREENSHOT + HAR - task_uuid = crawlers.create_task(domain, parent=item.get_id(), priority=0) + for domain in domains: + task_uuid = crawlers.create_task(domain, parent=item.get_id(), priority=0, + har=self.har, screenshot=self.screenshot) if task_uuid: print(f'{domain} added to crawler queue: {task_uuid}') else: diff --git a/tools/crawler_add_task.py b/tools/crawler_add_task.py new file mode 100755 index 00000000..f785f167 --- /dev/null +++ b/tools/crawler_add_task.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" 
+Crawler Task Helper
+===================
+
+Add a crawler task (one-shot or scheduled) via the AIL API
+
+"""
+
+import argparse
+import os
+from pyail import PyAIL
+import sys
+
+sys.path.append(os.environ['AIL_BIN'])
+##################################
+# Import Project packages
+##################################
+from lib.ConfigLoader import ConfigLoader
+
+def check_frequency(value):
+    value = int(value)
+    if value <= 0:
+        raise argparse.ArgumentTypeError(f'Error: Invalid frequency {value}')
+
+
+if __name__ == "__main__":
+
+    # TODO add c argument for config file
+    parser = argparse.ArgumentParser(description='Add a crawler task to an AIL instance')
+    parser.add_argument('-u', '--url', type=str, help='URL to crawl', required=True)
+    parser.add_argument('-k', '--key', type=str, help='AIL API Key', required=True)
+    parser.add_argument('-a', '--ail', type=str, help='AIL URL')
+    parser.add_argument('-d', '--depth', type=int, default=1, help='Depth limit') # TODO improve me
+    parser.add_argument('--cookiejar', type=str, help='Cookiejar UUID')
+    parser.add_argument('-p', '--proxy', type=str, help='Proxy address to use, "web" and "tor" can be used as shortcuts (web is used by default)')
+
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument('--har', dest='har', action='store_true', help='Save HAR')
+    group.add_argument('--no-har', dest='har', action='store_false', help='Don\'t save HAR')
+    parser.set_defaults(har=None)
+
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument('--screenshot', dest='screenshot', action='store_true', help='Save screenshot')
+    group.add_argument('--no-screenshot', dest='screenshot', action='store_false', help='Don\'t save screenshot')
+    parser.set_defaults(screenshot=None)
+
+    group = parser.add_argument_group('Frequency, create a regular crawler/scheduler')
+    group.add_argument('-f', '--frequency', type=str, choices=['monthly', 'weekly', 'daily', 'hourly'],
+                       help='monthly, weekly, daily or hourly frequency, or specify a custom one with the other arguments')
+    group.add_argument('--minutes', type=int, help='frequency in minutes')
+    group.add_argument('--hours', type=int, help='frequency in hours')
+    group.add_argument('--days', type=int, help='frequency in days')
+    group.add_argument('--weeks', type=int, help='frequency in weeks')
+    group.add_argument('--months', type=int, help='frequency in months')
+
+    args = parser.parse_args()
+
+    if not args.url or not args.key:
+        parser.print_help()
+        sys.exit(0)
+
+    # Load crawler default config
+    config_loader = ConfigLoader()
+    har = args.har
+    if har is None:
+        har = config_loader.get_config_boolean('Crawler', 'default_har')
+    screenshot = args.screenshot
+    if screenshot is None:
+        screenshot = config_loader.get_config_boolean('Crawler', 'default_screenshot')
+
+    if args.depth:
+        depth = args.depth
+        if depth < 0:
+            raise argparse.ArgumentTypeError(f'Error: Invalid depth {depth}')
+    else:
+        depth = 1
+
+    # frequency
+    frequency = {}
+    if args.frequency:
+        if args.frequency in ['monthly', 'weekly', 'daily', 'hourly']:
+            frequency = args.frequency
+        else:
+            raise argparse.ArgumentTypeError('Invalid frequency')
+    elif args.minutes or args.hours or args.days or args.weeks or args.months:
+        if args.minutes:
+            check_frequency(args.minutes)
+            frequency['minutes'] = args.minutes
+        if args.hours:
+            check_frequency(args.hours)
+            frequency['hours'] = args.hours
+        if args.days:
+            check_frequency(args.days)
+            frequency['days'] = args.days
+        if args.weeks:
+            check_frequency(args.weeks)
+            frequency['weeks'] = args.weeks
+        if args.months:
+            check_frequency(args.months)
+            frequency['months'] = args.months
+    if not frequency:
+        frequency = None
+
+    proxy = args.proxy
+
+    if args.cookiejar:
+        cookiejar = args.cookiejar
+    else:
+        cookiejar = None
+
+    ail = args.ail
+    if not ail:
+        ail = 'https://localhost:7000/'
+
+    client = PyAIL(ail, args.key, ssl=False)
+    r = client.crawl_url(args.url, har=har, screenshot=screenshot, depth_limit=depth, frequency=frequency,
+                         cookiejar=cookiejar, proxy=proxy)
+    print(r)
diff --git a/var/www/modules/restApi/Flask_restApi.py b/var/www/modules/restApi/Flask_restApi.py
index e233bb00..0c95d0a0 100644
--- a/var/www/modules/restApi/Flask_restApi.py
+++ b/var/www/modules/restApi/Flask_restApi.py
@@ -1,9 +1,9 @@
 #!/usr/bin/env python3
 # -*-coding:UTF-8 -*
 
-'''
+"""
     Flask functions and routes for the rest api
-'''
+"""
 
 import os
 import re
@@ -508,6 +508,7 @@ def get_item_cryptocurrency_bitcoin():
 
 # # # # # # # # # # # # # # CRAWLER # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 # TODO: ADD RESULT JSON Response
+# @restApi.route("api/v1/crawler/task/add", methods=['POST'])
 @restApi.route("api/v1/add/crawler/task", methods=['POST'])
 @token_required('analyst')
 def add_crawler_task():
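
A minimal usage sketch of the updated endpoint through PyAIL, mirroring tools/crawler_add_task.py above. The instance address, API key and target URL are placeholders; frequency accepts one of the presets ('monthly', 'weekly', 'daily', 'hourly') or a dict of custom intervals, and the response now carries the created task or schedule UUID:

    from pyail import PyAIL

    # placeholder instance address and API key
    client = PyAIL('https://localhost:7000/', '<api_key>', ssl=False)

    # one-shot crawl task
    r = client.crawl_url('http://example.onion', har=True, screenshot=True, depth_limit=1,
                         frequency=None, cookiejar=None, proxy='tor')
    print(r)  # e.g. {'uuid': '<task_uuid>'}

    # recurring crawl, every 12 hours
    r = client.crawl_url('http://example.onion', har=True, screenshot=True, depth_limit=1,
                         frequency={'hours': 12}, cookiejar=None, proxy='tor')
    print(r)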