new: Add max depth for scraping.

pull/79/head
Raphaël Vinot 2020-06-29 18:00:53 +02:00
parent bf9fdcb0ab
commit afe95c28f6
2 changed files with 7 additions and 2 deletions


@@ -10,6 +10,7 @@
"days": 0,
"hours": 0
},
"max_depth": 1,
"use_user_agents_users": false,
"enable_mail_notification": false,
"email": {
@@ -28,6 +29,7 @@
"default_user_agent": "Ultimate fallback if the capture form, or the asynchronous submission, don't provide a UA",
"cache_clean_user": "Format: {username: password}",
"time_delta_on_index": "Time interval of the capture displayed on the index",
"max_depth": "Maximum depth for scraping. Anything > 1 will be exponentially bigger.",
"use_user_agents_users": "Only usable for medium/high use instances: use the user agents of the users of the platform",
"enable_mail_notification": "Enable email notification or not",
"email": "Configuration for sending email notifications."


@@ -19,7 +19,6 @@ from urllib.parse import urlsplit
from uuid import uuid4
from zipfile import ZipFile
import publicsuffix2 # type: ignore
from defang import refang # type: ignore
from har2tree import CrawledTree, Har2TreeError, HarFile, HostNode, URLNode
from redis import Redis
@@ -404,7 +403,7 @@ class Lookyloo():
            s.send_message(msg)
            s.quit()
        except Exception as e:
            logging.exception(e)
            self.logger.exception(e)

    def _ensure_meta(self, capture_dir: Path, tree: CrawledTree) -> None:
        metafile = capture_dir / 'meta'
@@ -486,6 +485,10 @@ class Lookyloo():
            ua: str = self.get_config('default_user_agent')  # type: ignore
        else:
            ua = user_agent

        if depth > self.get_config('max_depth'):  # type: ignore
            self.logger.warning(f'Not allowed to scrape on a depth higher than {self.get_config("max_depth")}: {depth}')
            depth = self.get_config('max_depth')  # type: ignore
        items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=ua,
                      log_enabled=True, log_level=self.get_config('splash_loglevel'))
        if not items:
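A minimal standalone sketch of the clamping behaviour introduced above, assuming max_depth is the integer read from the config file; the clamp_depth helper name is illustrative and not part of Lookyloo's API:

    import logging

    logger = logging.getLogger('lookyloo')

    def clamp_depth(requested_depth: int, max_depth: int) -> int:
        # Mirror the new check: never hand the crawler a depth above the
        # configured maximum, and log a warning when the request is reduced.
        if requested_depth > max_depth:
            logger.warning(f'Not allowed to scrape on a depth higher than {max_depth}: {requested_depth}')
            return max_depth
        return requested_depth

    # With "max_depth": 1 from the config, a request for depth 3 is scraped at depth 1.
    print(clamp_depth(3, 1))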