mirror of https://github.com/CIRCL/lookyloo

new: Add support for user defined cookies, bump deps

parent 32a07fc355
commit f1d83d29cf
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 import os
+from typing import List
 from pathlib import Path
 from .exceptions import MissingEnv, CreateDirectoryException
 from redis import Redis
@@ -10,6 +11,8 @@ import time
 from glob import glob
 import json
 import traceback
+from urllib.parse import urlparse
+from datetime import datetime, timedelta
 
 from bs4 import BeautifulSoup  # type: ignore
 try:
@@ -139,3 +142,23 @@ def get_user_agents() -> dict:
     paths = sorted(glob(ua_files_path), reverse=True)
     with open(paths[0]) as f:
         return json.load(f)
+
+def load_cookies() -> List[dict]:
+    if not (get_homedir() / 'cookies.json').exists():
+        return []
+
+    with (get_homedir() / 'cookies.json').open() as f:
+        cookies = json.load(f)
+    to_return = []
+    for cookie in cookies:
+        u = urlparse(cookie['Host raw']).netloc.split(':', 1)[0]
+        to_add = {'path': cookie['Path raw'],
+                  'name': cookie['Name raw'],
+                  'httpOnly': cookie['HTTP only raw'] == 'true',
+                  'secure': cookie['Send for'] == 'Encrypted connections only',
+                  'expires': (datetime.now() + timedelta(days=10)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z',
+                  'domain': u,
+                  'value': cookie['Content raw']
+                  }
+        to_return.append(to_add)
+    return to_return
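
Note on the input format: the keys read above ('Host raw', 'Path raw', 'Name raw', 'Content raw', 'HTTP only raw', 'Send for') appear to match the JSON export of a browser cookie manager such as the Cookie Quick Manager Firefox extension. A minimal sketch of a cookies.json that load_cookies() would accept, written to the Lookyloo home directory; all values are made up for illustration and the script itself is not part of the commit:

    # illustrative helper, not part of the commit
    import json
    from pathlib import Path

    sample_cookies = [{
        'Host raw': 'https://www.example.com/',    # urlparse() extracts the domain from this
        'Path raw': '/',
        'Name raw': 'session',
        'Content raw': 'opaque-session-value',
        'HTTP only raw': 'true',                   # the string 'true', not a JSON boolean
        'Send for': 'Encrypted connections only',  # mapped to secure=True
    }]

    homedir = Path('.')  # stand-in for get_homedir(), which resolves LOOKYLOO_HOME
    with (homedir / 'cookies.json').open('w') as f:
        json.dump(sample_cookies, f, indent=2)

Each entry is converted into a browser-style cookie dict (path, name, httpOnly, secure, domain, value) with a hardcoded expiry ten days from load time.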
@@ -20,17 +20,17 @@ import base64
 from uuid import uuid4
 
 from pathlib import Path
-from .helpers import get_homedir, get_socket_path
+from .helpers import get_homedir, get_socket_path, load_cookies
 from .exceptions import NoValidHarFile
 from redis import Redis
 
-from typing import Union, Dict, List, Tuple
+from typing import Union, Dict, List, Tuple, Optional
 
 import logging
 
-from pysanejs import SaneJS  # type: ignore
-from scrapysplashwrapper import crawl  # type: ignore
-from har2tree import CrawledTree, Har2TreeError  # type: ignore
+from pysanejs import SaneJS
+from scrapysplashwrapper import crawl
+from har2tree import CrawledTree, Har2TreeError
 
 
 class Lookyloo():
@@ -50,7 +50,9 @@ class Lookyloo():
         # Try to reach sanejs
         self.sanejs = SaneJS()
         if not self.sanejs.is_up:
-            self.sanejs = None
+            self.use_sane_js = False
+        else:
+            self.use_sane_js = True
 
     def __init_logger(self, loglevel: int) -> None:
         self.logger = logging.getLogger(f'{self.__class__.__name__}')
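
Design note: instead of overwriting self.sanejs with None when the service is unreachable, the instance now keeps the SaneJS object and records availability in a separate use_sane_js boolean, which sane_js_query() checks in the next hunk. A minimal sketch of the pattern with hypothetical names:

    # availability-flag pattern, illustrative only
    class Backend:
        is_up = True  # stand-in for SaneJS.is_up

        def search(self, q: str) -> dict:
            return {'response': [q]}

    class Client:
        def __init__(self) -> None:
            self.backend = Backend()  # attribute keeps a single, non-Optional type
            self.use_backend: bool = self.backend.is_up

        def query(self, q: str) -> dict:
            if self.use_backend:
                return self.backend.search(q)
            return {'response': []}   # same fallback shape as sane_js_query()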
@@ -156,11 +158,11 @@
         return BytesIO(f.read())
 
     def sane_js_query(self, sha512: str) -> Dict:
-        if self.sanejs:
+        if self.use_sane_js:
             return self.sanejs.sha512(sha512)
         return {'response': []}
 
-    def scrape(self, url: str, cookies: List[dict]=[], depth: int=1, listing: bool=True, user_agent: str=None, perma_uuid: str=None,
+    def scrape(self, url: str, cookies: List[dict]=[], depth: int=1, listing: bool=True, user_agent: Optional[str]=None, perma_uuid: str=None,
                os: str=None, browser: str=None) -> Union[bool, str]:
         if not url.startswith('http'):
             url = f'http://{url}'
@@ -173,7 +175,7 @@
                        return False
            else:
                return False
 
+        cookies = load_cookies()
         items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=user_agent, log_enabled=True, log_level='INFO')
         if not items:
             # broken
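
Worth noting in the hunk above: the cookies loaded from cookies.json are handed to crawl(), and because load_cookies() is called unconditionally, it overwrites whatever was passed via scrape()'s cookies parameter. A hedged usage sketch, assuming a running local setup (Redis sockets, Splash) and this version's constructor defaults:

    # illustrative only: submit a capture that carries the cookies.json cookies
    from lookyloo.lookyloo import Lookyloo

    lookyloo = Lookyloo()  # splash_url default assumed to point at a running Splash
    perma_uuid = lookyloo.scrape('https://www.example.com', depth=1, listing=False)
    print(perma_uuid)      # UUID string of the capture, or False on failure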
@@ -239,7 +239,7 @@ lxml = "^4.4.2"
 six = "^1.14.0"
 
 [package.source]
-reference = "8b986b8b0db9736ca5e4ffae1cb2ba5ab7ae6589"
+reference = "3f7771a1467c27a45ebcc3ff00e3e6e834d54e72"
 type = "git"
 url = "https://github.com/viper-framework/har2tree.git"
 [[package]]
@@ -459,17 +459,17 @@ version = "2.1.2"
 
 [[package]]
 category = "main"
-description = "Python client for SaneJS"
+description = ""
 name = "pysanejs"
 optional = false
-python-versions = "*"
-version = "0.1"
+python-versions = "^3.6"
+version = "0.1.0"
 
 [package.dependencies]
-requests = "*"
+requests = "^2.22.0"
 
 [package.source]
-reference = "3dfe0530a24e4caff04f27830ad4a38c52c74a2c"
+reference = "bdc091fbae7019c39b47a149b12f8ac032eda2a3"
 type = "git"
 url = "https://github.com/CIRCL/PySaneJS.git"
 [[package]]
@@ -568,7 +568,7 @@ scrapy = "^1.8.0"
 scrapy-splash = "^0.7.2"
 
 [package.source]
-reference = "b1e96b1d3c871c6e621d5463a3515cfb652ea5db"
+reference = "6165ff2d4f95618d6be99c5bc44fc707685364e1"
 type = "git"
 url = "https://github.com/viper-framework/ScrapySplashWrapper.git"
 [[package]]
@@ -658,8 +658,8 @@ category = "main"
 description = "HTTP library with thread-safe connection pooling, file post, and more."
 name = "urllib3"
 optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4"
-version = "1.25.7"
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4"
+version = "1.25.8"
 
 [package.extras]
 brotli = ["brotlipy (>=0.6.0)"]
@@ -1135,8 +1135,8 @@ typing-extensions = [
     {file = "typing_extensions-3.7.4.1.tar.gz", hash = "sha256:091ecc894d5e908ac75209f10d5b4f118fbdb2eb1ede6a63544054bb1edb41f2"},
 ]
 urllib3 = [
-    {file = "urllib3-1.25.7-py2.py3-none-any.whl", hash = "sha256:a8a318824cc77d1fd4b2bec2ded92646630d7fe8619497b142c84a9e6f5a7293"},
-    {file = "urllib3-1.25.7.tar.gz", hash = "sha256:f3c5fd51747d450d4dcf6f923c81f78f811aab8205fda64b0aba34a4e48b0745"},
+    {file = "urllib3-1.25.8-py2.py3-none-any.whl", hash = "sha256:2f3db8b19923a873b3e5256dc9c2dedfa883e33d87c690d9c7913e1f40673cdc"},
+    {file = "urllib3-1.25.8.tar.gz", hash = "sha256:87716c2d2a7121198ebcb7ce7cccf6ce5e9ba539041cfbaeecfb641dc0bf6acc"},
 ]
 w3lib = [
     {file = "w3lib-1.21.0-py2.py3-none-any.whl", hash = "sha256:847704b837b2b973cddef6938325d466628e6078266bc2e1f7ac49ba85c34823"},