new: Add support for user defined cookies, bump deps

pull/67/head
Raphaël Vinot 2020-01-23 10:52:50 +01:00
parent 32a07fc355
commit f1d83d29cf
3 changed files with 45 additions and 20 deletions

@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 import os
+from typing import List
 from pathlib import Path
 from .exceptions import MissingEnv, CreateDirectoryException
 from redis import Redis
@@ -10,6 +11,8 @@ import time
 from glob import glob
 import json
 import traceback
+from urllib.parse import urlparse
+from datetime import datetime, timedelta
 from bs4 import BeautifulSoup  # type: ignore
 try:
@@ -139,3 +142,23 @@ def get_user_agents() -> dict:
     paths = sorted(glob(ua_files_path), reverse=True)
     with open(paths[0]) as f:
         return json.load(f)
+
+
+def load_cookies() -> List[dict]:
+    if not (get_homedir() / 'cookies.json').exists():
+        return []
+    with (get_homedir() / 'cookies.json').open() as f:
+        cookies = json.load(f)
+    to_return = []
+    for cookie in cookies:
+        u = urlparse(cookie['Host raw']).netloc.split(':', 1)[0]
+        to_add = {'path': cookie['Path raw'],
+                  'name': cookie['Name raw'],
+                  'httpOnly': cookie['HTTP only raw'] == 'true',
+                  'secure': cookie['Send for'] == 'Encrypted connections only',
+                  'expires': (datetime.now() + timedelta(days=10)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z',
+                  'domain': u,
+                  'value': cookie['Content raw']
+                  }
+        to_return.append(to_add)
+    return to_return
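For reference, here is a minimal sketch of what a cookies.json entry has to look like for load_cookies() to parse it. The key names ('Host raw', 'Path raw', 'Name raw', 'Content raw', 'HTTP only raw', 'Send for') come straight from the code above; the concrete values and the write location are only illustrative assumptions.

```python
# Sketch: build a cookies.json in the shape load_cookies() reads.
# Key names are taken from the diff above; values are made up for illustration.
import json
from pathlib import Path

sample_cookies = [{
    'Host raw': 'https://www.example.com/',    # urlparse().netloc becomes the 'domain' field
    'Path raw': '/',
    'Name raw': 'session',
    'Content raw': 'opaque-session-token',
    'HTTP only raw': 'true',                   # string-compared against 'true' -> httpOnly
    'Send for': 'Encrypted connections only',  # -> secure=True
}]

# load_cookies() expects the file at get_homedir() / 'cookies.json';
# writing to the current directory here is just for the example.
Path('cookies.json').write_text(json.dumps(sample_cookies, indent=2))
```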

@@ -20,17 +20,17 @@ import base64
 from uuid import uuid4
 from pathlib import Path
-from .helpers import get_homedir, get_socket_path
+from .helpers import get_homedir, get_socket_path, load_cookies
 from .exceptions import NoValidHarFile
 from redis import Redis
-from typing import Union, Dict, List, Tuple
+from typing import Union, Dict, List, Tuple, Optional
 import logging
-from pysanejs import SaneJS  # type: ignore
-from scrapysplashwrapper import crawl  # type: ignore
-from har2tree import CrawledTree, Har2TreeError  # type: ignore
+from pysanejs import SaneJS
+from scrapysplashwrapper import crawl
+from har2tree import CrawledTree, Har2TreeError
 
 class Lookyloo():
@@ -50,7 +50,9 @@ class Lookyloo():
         # Try to reach sanejs
         self.sanejs = SaneJS()
         if not self.sanejs.is_up:
-            self.sanejs = None
+            self.use_sane_js = False
+        else:
+            self.use_sane_js = True
 
     def __init_logger(self, loglevel: int) -> None:
         self.logger = logging.getLogger(f'{self.__class__.__name__}')
@@ -156,11 +158,11 @@ class Lookyloo():
             return BytesIO(f.read())
 
     def sane_js_query(self, sha512: str) -> Dict:
-        if self.sanejs:
+        if self.use_sane_js:
             return self.sanejs.sha512(sha512)
         return {'response': []}
 
-    def scrape(self, url: str, cookies: List[dict]=[], depth: int=1, listing: bool=True, user_agent: str=None, perma_uuid: str=None,
+    def scrape(self, url: str, cookies: List[dict]=[], depth: int=1, listing: bool=True, user_agent: Optional[str]=None, perma_uuid: str=None,
                os: str=None, browser: str=None) -> Union[bool, str]:
         if not url.startswith('http'):
             url = f'http://{url}'
@@ -173,7 +175,7 @@ class Lookyloo():
                 return False
         else:
             return False
+        cookies = load_cookies()
         items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=user_agent, log_enabled=True, log_level='INFO')
         if not items:
             # broken
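A short usage sketch of the new flow, assuming the class is importable as lookyloo.lookyloo.Lookyloo and that the Lookyloo environment (home directory, Redis socket, Splash) is already configured. Note that with this change, scrape() replaces whatever cookies argument it receives with the contents of cookies.json via load_cookies().

```python
# Sketch only: assumes a configured Lookyloo instance (home dir, Redis, Splash URL).
from lookyloo.lookyloo import Lookyloo

lookyloo = Lookyloo()

# Any cookies passed here are overridden by load_cookies() right before crawl(),
# so cookies.json in the Lookyloo home directory is the effective source.
perma_uuid = lookyloo.scrape('https://www.example.com', depth=1, listing=False)
print(perma_uuid)  # capture UUID on success, False if the URL was rejected
```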

poetry.lock generated

@@ -239,7 +239,7 @@ lxml = "^4.4.2"
 six = "^1.14.0"
 
 [package.source]
-reference = "8b986b8b0db9736ca5e4ffae1cb2ba5ab7ae6589"
+reference = "3f7771a1467c27a45ebcc3ff00e3e6e834d54e72"
 type = "git"
 url = "https://github.com/viper-framework/har2tree.git"
 
 [[package]]
@@ -459,17 +459,17 @@ version = "2.1.2"
 [[package]]
 category = "main"
-description = "Python client for SaneJS"
+description = ""
 name = "pysanejs"
 optional = false
-python-versions = "*"
-version = "0.1"
+python-versions = "^3.6"
+version = "0.1.0"
 
 [package.dependencies]
-requests = "*"
+requests = "^2.22.0"
 
 [package.source]
-reference = "3dfe0530a24e4caff04f27830ad4a38c52c74a2c"
+reference = "bdc091fbae7019c39b47a149b12f8ac032eda2a3"
 type = "git"
 url = "https://github.com/CIRCL/PySaneJS.git"
 
 [[package]]
@@ -568,7 +568,7 @@ scrapy = "^1.8.0"
 scrapy-splash = "^0.7.2"
 
 [package.source]
-reference = "b1e96b1d3c871c6e621d5463a3515cfb652ea5db"
+reference = "6165ff2d4f95618d6be99c5bc44fc707685364e1"
 type = "git"
 url = "https://github.com/viper-framework/ScrapySplashWrapper.git"
 
 [[package]]
@@ -658,8 +658,8 @@ category = "main"
 description = "HTTP library with thread-safe connection pooling, file post, and more."
 name = "urllib3"
 optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4"
-version = "1.25.7"
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4"
+version = "1.25.8"
 
 [package.extras]
 brotli = ["brotlipy (>=0.6.0)"]
@@ -1135,8 +1135,8 @@ typing-extensions = [
     {file = "typing_extensions-3.7.4.1.tar.gz", hash = "sha256:091ecc894d5e908ac75209f10d5b4f118fbdb2eb1ede6a63544054bb1edb41f2"},
 ]
 urllib3 = [
-    {file = "urllib3-1.25.7-py2.py3-none-any.whl", hash = "sha256:a8a318824cc77d1fd4b2bec2ded92646630d7fe8619497b142c84a9e6f5a7293"},
-    {file = "urllib3-1.25.7.tar.gz", hash = "sha256:f3c5fd51747d450d4dcf6f923c81f78f811aab8205fda64b0aba34a4e48b0745"},
+    {file = "urllib3-1.25.8-py2.py3-none-any.whl", hash = "sha256:2f3db8b19923a873b3e5256dc9c2dedfa883e33d87c690d9c7913e1f40673cdc"},
+    {file = "urllib3-1.25.8.tar.gz", hash = "sha256:87716c2d2a7121198ebcb7ce7cccf6ce5e9ba539041cfbaeecfb641dc0bf6acc"},
 ]
 w3lib = [
     {file = "w3lib-1.21.0-py2.py3-none-any.whl", hash = "sha256:847704b837b2b973cddef6938325d466628e6078266bc2e1f7ac49ba85c34823"},