diff --git a/config/generic.json.sample b/config/generic.json.sample
index 50abe424..065864bd 100644
--- a/config/generic.json.sample
+++ b/config/generic.json.sample
@@ -10,6 +10,7 @@
         "days": 0,
         "hours": 0
     },
+    "max_depth": 1,
     "use_user_agents_users": false,
     "enable_mail_notification": false,
     "email": {
@@ -28,6 +29,7 @@
         "default_user_agent": "Ultimate fallback if the capture form, or the asynchronous submission, don't provide a UA",
         "cache_clean_user": "Format: {username: password}",
         "time_delta_on_index": "Time interval of the capture displayed on the index",
+        "max_depth": "Maximum depth for scraping. Anything above 1 will exponentially increase the size of the capture.",
         "use_user_agents_users": "Only usable for medium/high use instances: use the user agents of the users of the platform",
         "enable_mail_notification": "Enable email notification or not",
         "email": "Configuration for sending email notifications."
diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py
index ba8855e2..43e68602 100644
--- a/lookyloo/lookyloo.py
+++ b/lookyloo/lookyloo.py
@@ -19,7 +19,6 @@ from urllib.parse import urlsplit
 from uuid import uuid4
 from zipfile import ZipFile
 
-import publicsuffix2  # type: ignore
 from defang import refang  # type: ignore
 from har2tree import CrawledTree, Har2TreeError, HarFile, HostNode, URLNode
 from redis import Redis
@@ -404,7 +403,7 @@ class Lookyloo():
             s.send_message(msg)
             s.quit()
         except Exception as e:
-            logging.exception(e)
+            self.logger.exception(e)
 
     def _ensure_meta(self, capture_dir: Path, tree: CrawledTree) -> None:
         metafile = capture_dir / 'meta'
@@ -486,6 +485,10 @@ class Lookyloo():
             ua: str = self.get_config('default_user_agent')  # type: ignore
         else:
             ua = user_agent
+
+        if depth > self.get_config('max_depth'):  # type: ignore
+            self.logger.warning(f'Not allowed to scrape at a depth higher than {self.get_config("max_depth")}: {depth}')
+            depth = self.get_config('max_depth')  # type: ignore
         items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=ua, log_enabled=True,
                       log_level=self.get_config('splash_loglevel'))
         if not items:
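
Note: the guard added to `scrape()` silently lowers an over-limit request and logs a warning rather than rejecting the capture. Below is a minimal, self-contained sketch of that clamping behaviour; `clamp_depth` and its parameter names are illustrative only, not part of Lookyloo's API:

```python
import logging

logger = logging.getLogger(__name__)


def clamp_depth(requested_depth: int, max_depth: int) -> int:
    """Cap a requested crawl depth at the configured maximum."""
    if requested_depth > max_depth:
        # Mirror the patch: warn, then fall back to the configured limit.
        logger.warning('Not allowed to scrape at a depth higher than %s: %s',
                       max_depth, requested_depth)
        return max_depth
    return requested_depth


# With the shipped default of max_depth = 1, deeper requests are capped:
assert clamp_depth(5, 1) == 1
assert clamp_depth(1, 1) == 1
```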
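On the "exponentially increase" wording in the config comment: if each scraped page links to roughly b other pages, a crawl of depth d fetches on the order of b**d pages, which is why the sample config ships with `max_depth: 1`. A quick back-of-the-envelope, assuming a hypothetical branching factor of 20 links per page:

```python
# Hypothetical branching factor: links followed from each scraped page.
branching_factor = 20

# Pages fetched grow geometrically with depth: b + b**2 + ... + b**d.
for depth in range(1, 4):
    total = sum(branching_factor ** d for d in range(1, depth + 1))
    print(f'depth={depth}: ~{total} pages')
# depth=1: ~20 pages
# depth=2: ~420 pages
# depth=3: ~8420 pages
```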