mirror of https://github.com/CIRCL/lookyloo
new: Add referer to initial URL
parent
c70eed9a94
commit
33fb74ae83
|
@ -482,7 +482,8 @@ class Lookyloo():
|
||||||
|
|
||||||
def scrape(self, url: str, cookies_pseudofile: Optional[BufferedIOBase]=None,
|
def scrape(self, url: str, cookies_pseudofile: Optional[BufferedIOBase]=None,
|
||||||
depth: int=1, listing: bool=True, user_agent: Optional[str]=None,
|
depth: int=1, listing: bool=True, user_agent: Optional[str]=None,
|
||||||
perma_uuid: str=None, os: str=None, browser: str=None) -> Union[bool, str]:
|
referer: Optional[str]=None, perma_uuid: str=None, os: str=None,
|
||||||
|
browser: str=None) -> Union[bool, str]:
|
||||||
url = url.strip()
|
url = url.strip()
|
||||||
url = refang(url)
|
url = refang(url)
|
||||||
if not url.startswith('http'):
|
if not url.startswith('http'):
|
||||||
|
@ -512,7 +513,7 @@ class Lookyloo():
|
||||||
self.logger.warning(f'Not allowed to scrape on a depth higher than {self.get_config("max_depth")}: {depth}')
|
self.logger.warning(f'Not allowed to scrape on a depth higher than {self.get_config("max_depth")}: {depth}')
|
||||||
depth = int(self.get_config('max_depth')) # type: ignore
|
depth = int(self.get_config('max_depth')) # type: ignore
|
||||||
items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=ua,
|
items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=ua,
|
||||||
log_enabled=True, log_level=self.get_config('splash_loglevel'))
|
referer=referer, log_enabled=True, log_level=self.get_config('splash_loglevel'))
|
||||||
if not items:
|
if not items:
|
||||||
# broken
|
# broken
|
||||||
return False
|
return False
|
||||||
|
|
|
@ -866,7 +866,7 @@ description = "Scrapy splash wrapper as a standalone library."
|
||||||
name = "scrapysplashwrapper"
|
name = "scrapysplashwrapper"
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.6,<4.0"
|
python-versions = ">=3.6,<4.0"
|
||||||
version = "1.1.1"
|
version = "1.1.2"
|
||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
scrapy = ">=1.8.0,<2.0.0"
|
scrapy = ">=1.8.0,<2.0.0"
|
||||||
|
@ -1543,8 +1543,8 @@ scrapy-splash = [
|
||||||
{file = "scrapy_splash-0.7.2-py2.py3-none-any.whl", hash = "sha256:71ac958370f8732fec746a25a8235b03a4d3c4c93a59be51aa8e910a08cfe511"},
|
{file = "scrapy_splash-0.7.2-py2.py3-none-any.whl", hash = "sha256:71ac958370f8732fec746a25a8235b03a4d3c4c93a59be51aa8e910a08cfe511"},
|
||||||
]
|
]
|
||||||
scrapysplashwrapper = [
|
scrapysplashwrapper = [
|
||||||
{file = "scrapysplashwrapper-1.1.1-py3-none-any.whl", hash = "sha256:660275a5a6f899e09abf8b732e0724a280cab6b44cb3405c85a92e25b87dac6b"},
|
{file = "scrapysplashwrapper-1.1.2-py3-none-any.whl", hash = "sha256:02460ec8714206045aa64c5586765fb8b5ba57e4aecfb3c08bd36b2c10b6b546"},
|
||||||
{file = "scrapysplashwrapper-1.1.1.tar.gz", hash = "sha256:1ac854f4c4e5a7a594d2e1a39d94330b67359420e16c7f1adc2a016579fcc16c"},
|
{file = "scrapysplashwrapper-1.1.2.tar.gz", hash = "sha256:3ecb4455d8949c4fb7e2ba868efd71fcd4ae746be28c5eaa1f8a884312c544ec"},
|
||||||
]
|
]
|
||||||
service-identity = [
|
service-identity = [
|
||||||
{file = "service_identity-18.1.0-py2.py3-none-any.whl", hash = "sha256:001c0707759cb3de7e49c078a7c0c9cd12594161d3bf06b9c254fdcb1a60dc36"},
|
{file = "service_identity-18.1.0-py2.py3-none-any.whl", hash = "sha256:001c0707759cb3de7e49c078a7c0c9cd12594161d3bf06b9c254fdcb1a60dc36"},
|
||||||
|
|
|
@ -125,6 +125,7 @@ def scrape_web():
|
||||||
perma_uuid = lookyloo.scrape(url=url, cookies_pseudofile=cookie_file,
|
perma_uuid = lookyloo.scrape(url=url, cookies_pseudofile=cookie_file,
|
||||||
depth=depth, listing=listing,
|
depth=depth, listing=listing,
|
||||||
user_agent=request.form.get('user_agent'),
|
user_agent=request.form.get('user_agent'),
|
||||||
|
referer=request.form.get('referer'),
|
||||||
os=request.form.get('os'), browser=request.form.get('browser'))
|
os=request.form.get('os'), browser=request.form.get('browser'))
|
||||||
return redirect(url_for('tree', tree_uuid=perma_uuid))
|
return redirect(url_for('tree', tree_uuid=perma_uuid))
|
||||||
user_agents: Dict[str, Any] = {}
|
user_agents: Dict[str, Any] = {}
|
||||||
|
|
|
@ -20,7 +20,7 @@
|
||||||
<div class="form-group row">
|
<div class="form-group row">
|
||||||
<label for="url" class="col-sm-2 col-form-label">URL:</label>
|
<label for="url" class="col-sm-2 col-form-label">URL:</label>
|
||||||
<div class="col-sm-10">
|
<div class="col-sm-10">
|
||||||
<input type="text" class="form-control" name="url" id=url placeholder="URL to scrape" required>
|
<input type="text" class="form-control" name="url" id=url placeholder="URL to capture" required>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -41,6 +41,13 @@
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<div class="form-group row">
|
||||||
|
<label for="referer" class="col-sm-2 col-form-label">Referer:</label>
|
||||||
|
<div class="col-sm-10">
|
||||||
|
<input type="text" class="form-control" name="referer" id=referer placeholder="Referer of the URL to capture" required>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
<div class="form-group row">
|
<div class="form-group row">
|
||||||
<label for="cookies" class="col-sm-6">Cookies (JSON export from the Firefox plugin Cookie Quick Manager)</label>
|
<label for="cookies" class="col-sm-6">Cookies (JSON export from the Firefox plugin Cookie Quick Manager)</label>
|
||||||
<div class="col-sm-4">
|
<div class="col-sm-4">
|
||||||
|
|
Loading…
Reference in New Issue