new: Add max depth for scraping.

pull/79/head
Raphaël Vinot 2020-06-29 18:00:53 +02:00
parent bf9fdcb0ab
commit afe95c28f6
2 changed files with 7 additions and 2 deletions


@@ -10,6 +10,7 @@
"days": 0,
"hours": 0
},
"max_depth": 1,
"use_user_agents_users": false,
"enable_mail_notification": false,
"email": {
@@ -28,6 +29,7 @@
"default_user_agent": "Ultimate fallback if the capture form, or the asynchronous submission, don't provide a UA",
"cache_clean_user": "Format: {username: password}",
"time_delta_on_index": "Time interval of the capture displayed on the index",
"max_depth": "Maximum depth for scraping. Anything > 1 will be exponentially bigger.",
"use_user_agents_users": "Only usable for medium/high use instances: use the user agents of the users of the platform",
"enable_mail_notification": "Enable email notification or not",
"email": "Configuration for sending email notifications."


@@ -19,7 +19,6 @@ from urllib.parse import urlsplit
from uuid import uuid4
from zipfile import ZipFile
import publicsuffix2 # type: ignore
from defang import refang # type: ignore
from har2tree import CrawledTree, Har2TreeError, HarFile, HostNode, URLNode
from redis import Redis
@@ -404,7 +403,7 @@ class Lookyloo():
            s.send_message(msg)
            s.quit()
        except Exception as e:
            logging.exception(e)
            self.logger.exception(e)

    def _ensure_meta(self, capture_dir: Path, tree: CrawledTree) -> None:
        metafile = capture_dir / 'meta'
@@ -486,6 +485,10 @@ class Lookyloo():
            ua: str = self.get_config('default_user_agent')  # type: ignore
        else:
            ua = user_agent

        if depth > self.get_config('max_depth'):  # type: ignore
            self.logger.warning(f'Not allowed to scrape on a depth higher than {self.get_config("max_depth")}: {depth}')
            depth = self.get_config('max_depth')  # type: ignore
        items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=ua,
                      log_enabled=True, log_level=self.get_config('splash_loglevel'))
        if not items:
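A minimal standalone sketch of the clamping behaviour introduced above, assuming max_depth is the integer read from the config file; the clamp_depth helper name is illustrative and not part of Lookyloo's API:

    import logging

    logger = logging.getLogger('lookyloo')

    def clamp_depth(requested_depth: int, max_depth: int) -> int:
        # Mirror the new check: never hand the crawler a depth above the
        # configured maximum, and log a warning when the request is reduced.
        if requested_depth > max_depth:
            logger.warning(f'Not allowed to scrape on a depth higher than {max_depth}: {requested_depth}')
            return max_depth
        return requested_depth

    # With "max_depth": 1 from the config, a request for depth 3 is scraped at depth 1.
    print(clamp_depth(3, 1))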