mirror of https://github.com/CIRCL/lookyloo
new: Add max depth for scraping.
parent
bf9fdcb0ab
commit
afe95c28f6
|
@ -10,6 +10,7 @@
|
||||||
"days": 0,
|
"days": 0,
|
||||||
"hours": 0
|
"hours": 0
|
||||||
},
|
},
|
||||||
|
"max_depth": 1,
|
||||||
"use_user_agents_users": false,
|
"use_user_agents_users": false,
|
||||||
"enable_mail_notification": false,
|
"enable_mail_notification": false,
|
||||||
"email": {
|
"email": {
|
||||||
|
@ -28,6 +29,7 @@
|
||||||
"default_user_agent": "Ultimate fallback if the capture form, or the asynchronous submission, don't provide a UA",
|
"default_user_agent": "Ultimate fallback if the capture form, or the asynchronous submission, don't provide a UA",
|
||||||
"cache_clean_user": "Format: {username: password}",
|
"cache_clean_user": "Format: {username: password}",
|
||||||
"time_delta_on_index": "Time interval of the capture displayed on the index",
|
"time_delta_on_index": "Time interval of the capture displayed on the index",
|
||||||
|
"max_depth": "Maximum depth allowed for scraping. Any value > 1 makes the capture exponentially bigger. A request for a higher depth is logged and capped to this value.",
|
||||||
"use_user_agents_users": "Only usable for medium/high use instances: use the user agents of the users of the platform",
|
"use_user_agents_users": "Only usable for medium/high use instances: use the user agents of the users of the platform",
|
||||||
"enable_mail_notification": "Enable email notification or not",
|
"enable_mail_notification": "Enable email notification or not",
|
||||||
"email": "Configuration for sending email notifications."
|
"email": "Configuration for sending email notifications."
|
||||||
|
|
|
@ -19,7 +19,6 @@ from urllib.parse import urlsplit
|
||||||
from uuid import uuid4
|
from uuid import uuid4
|
||||||
from zipfile import ZipFile
|
from zipfile import ZipFile
|
||||||
|
|
||||||
import publicsuffix2 # type: ignore
|
|
||||||
from defang import refang # type: ignore
|
from defang import refang # type: ignore
|
||||||
from har2tree import CrawledTree, Har2TreeError, HarFile, HostNode, URLNode
|
from har2tree import CrawledTree, Har2TreeError, HarFile, HostNode, URLNode
|
||||||
from redis import Redis
|
from redis import Redis
|
||||||
|
@ -404,7 +403,7 @@ class Lookyloo():
|
||||||
s.send_message(msg)
|
s.send_message(msg)
|
||||||
s.quit()
|
s.quit()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.exception(e)
|
self.logger.exception(e)
|
||||||
|
|
||||||
def _ensure_meta(self, capture_dir: Path, tree: CrawledTree) -> None:
|
def _ensure_meta(self, capture_dir: Path, tree: CrawledTree) -> None:
|
||||||
metafile = capture_dir / 'meta'
|
metafile = capture_dir / 'meta'
|
||||||
|
@ -486,6 +485,10 @@ class Lookyloo():
|
||||||
ua: str = self.get_config('default_user_agent') # type: ignore
|
ua: str = self.get_config('default_user_agent') # type: ignore
|
||||||
else:
|
else:
|
||||||
ua = user_agent
|
ua = user_agent
|
||||||
|
|
||||||
|
if depth > self.get_config('max_depth'): # type: ignore
|
||||||
|
self.logger.warning(f'Not allowed to scrape on a depth higher than {self.get_config("max_depth")}: {depth}')
|
||||||
|
depth = self.get_config('max_depth') # type: ignore
|
||||||
items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=ua,
|
items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=ua,
|
||||||
log_enabled=True, log_level=self.get_config('splash_loglevel'))
|
log_enabled=True, log_level=self.get_config('splash_loglevel'))
|
||||||
if not items:
|
if not items:
|
||||||
|
|
Loading…
Reference in New Issue