From 33fb74ae8335796de496ad07af1289c56311b602 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?= Date: Wed, 8 Jul 2020 00:37:29 +0200 Subject: [PATCH] new: Add referer to initial URL --- lookyloo/lookyloo.py | 5 +++-- poetry.lock | 6 +++--- website/web/__init__.py | 1 + website/web/templates/scrape.html | 9 ++++++++- 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py index ea6c8fc..0f8bf7c 100644 --- a/lookyloo/lookyloo.py +++ b/lookyloo/lookyloo.py @@ -482,7 +482,8 @@ class Lookyloo(): def scrape(self, url: str, cookies_pseudofile: Optional[BufferedIOBase]=None, depth: int=1, listing: bool=True, user_agent: Optional[str]=None, - perma_uuid: str=None, os: str=None, browser: str=None) -> Union[bool, str]: + referer: Optional[str]=None, perma_uuid: str=None, os: str=None, + browser: str=None) -> Union[bool, str]: url = url.strip() url = refang(url) if not url.startswith('http'): @@ -512,7 +513,7 @@ class Lookyloo(): self.logger.warning(f'Not allowed to scrape on a depth higher than {self.get_config("max_depth")}: {depth}') depth = int(self.get_config('max_depth')) # type: ignore items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=ua, - log_enabled=True, log_level=self.get_config('splash_loglevel')) + referer=referer, log_enabled=True, log_level=self.get_config('splash_loglevel')) if not items: # broken return False diff --git a/poetry.lock b/poetry.lock index b81fd36..9ea6c03 100644 --- a/poetry.lock +++ b/poetry.lock @@ -866,7 +866,7 @@ description = "Scrapy splash wrapper as a standalone library." name = "scrapysplashwrapper" optional = false python-versions = ">=3.6,<4.0" -version = "1.1.1" +version = "1.1.2" [package.dependencies] scrapy = ">=1.8.0,<2.0.0" @@ -1543,8 +1543,8 @@ scrapy-splash = [ {file = "scrapy_splash-0.7.2-py2.py3-none-any.whl", hash = "sha256:71ac958370f8732fec746a25a8235b03a4d3c4c93a59be51aa8e910a08cfe511"}, ] scrapysplashwrapper = [ - {file = "scrapysplashwrapper-1.1.1-py3-none-any.whl", hash = "sha256:660275a5a6f899e09abf8b732e0724a280cab6b44cb3405c85a92e25b87dac6b"}, - {file = "scrapysplashwrapper-1.1.1.tar.gz", hash = "sha256:1ac854f4c4e5a7a594d2e1a39d94330b67359420e16c7f1adc2a016579fcc16c"}, + {file = "scrapysplashwrapper-1.1.2-py3-none-any.whl", hash = "sha256:02460ec8714206045aa64c5586765fb8b5ba57e4aecfb3c08bd36b2c10b6b546"}, + {file = "scrapysplashwrapper-1.1.2.tar.gz", hash = "sha256:3ecb4455d8949c4fb7e2ba868efd71fcd4ae746be28c5eaa1f8a884312c544ec"}, ] service-identity = [ {file = "service_identity-18.1.0-py2.py3-none-any.whl", hash = "sha256:001c0707759cb3de7e49c078a7c0c9cd12594161d3bf06b9c254fdcb1a60dc36"}, diff --git a/website/web/__init__.py b/website/web/__init__.py index 88ab34c..2241252 100644 --- a/website/web/__init__.py +++ b/website/web/__init__.py @@ -125,6 +125,7 @@ def scrape_web(): perma_uuid = lookyloo.scrape(url=url, cookies_pseudofile=cookie_file, depth=depth, listing=listing, user_agent=request.form.get('user_agent'), + referer=request.form.get('referer'), os=request.form.get('os'), browser=request.form.get('browser')) return redirect(url_for('tree', tree_uuid=perma_uuid)) user_agents: Dict[str, Any] = {} diff --git a/website/web/templates/scrape.html b/website/web/templates/scrape.html index cef6f5f..b9d8b09 100644 --- a/website/web/templates/scrape.html +++ b/website/web/templates/scrape.html @@ -20,7 +20,7 @@
- +
@@ -41,6 +41,13 @@ +
+ +
+ +
+
+