From f1d83d29cf51dd0f9daa74553257e6f4eff1e662 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?=
Date: Thu, 23 Jan 2020 10:52:50 +0100
Subject: [PATCH] new: Add support for user defined cookies, bump deps

---
 lookyloo/helpers.py  | 23 +++++++++++++++++++++++
 lookyloo/lookyloo.py | 20 +++++++++++---------
 poetry.lock          | 22 +++++++++++-----------
 3 files changed, 45 insertions(+), 20 deletions(-)

diff --git a/lookyloo/helpers.py b/lookyloo/helpers.py
index cca2b768..bff4bf0e 100644
--- a/lookyloo/helpers.py
+++ b/lookyloo/helpers.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 import os
+from typing import List
 from pathlib import Path
 from .exceptions import MissingEnv, CreateDirectoryException
 from redis import Redis
@@ -10,6 +11,8 @@ import time
 from glob import glob
 import json
 import traceback
+from urllib.parse import urlparse
+from datetime import datetime, timedelta
 
 from bs4 import BeautifulSoup  # type: ignore
 try:
@@ -139,3 +142,23 @@ def get_user_agents() -> dict:
     paths = sorted(glob(ua_files_path), reverse=True)
     with open(paths[0]) as f:
         return json.load(f)
+
+def load_cookies() -> List[dict]:
+    if not (get_homedir() / 'cookies.json').exists():
+        return []
+
+    with (get_homedir() / 'cookies.json').open() as f:
+        cookies = json.load(f)
+    to_return = []
+    for cookie in cookies:
+        u = urlparse(cookie['Host raw']).netloc.split(':', 1)[0]
+        to_add = {'path': cookie['Path raw'],
+                  'name': cookie['Name raw'],
+                  'httpOnly': cookie['HTTP only raw'] == 'true',
+                  'secure': cookie['Send for'] == 'Encrypted connections only',
+                  'expires': (datetime.now() + timedelta(days=10)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z',
+                  'domain': u,
+                  'value': cookie['Content raw']
+                  }
+        to_return.append(to_add)
+    return to_return
diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py
index c71a4d5a..dcfad476 100644
--- a/lookyloo/lookyloo.py
+++ b/lookyloo/lookyloo.py
@@ -20,17 +20,17 @@
 import base64
 from uuid import uuid4
 from pathlib import Path
-from .helpers import get_homedir, get_socket_path
+from .helpers import get_homedir, get_socket_path, load_cookies
 from .exceptions import NoValidHarFile
 from redis import Redis
 
-from typing import Union, Dict, List, Tuple
+from typing import Union, Dict, List, Tuple, Optional
 
 import logging
 
-from pysanejs import SaneJS  # type: ignore
-from scrapysplashwrapper import crawl  # type: ignore
-from har2tree import CrawledTree, Har2TreeError  # type: ignore
+from pysanejs import SaneJS
+from scrapysplashwrapper import crawl
+from har2tree import CrawledTree, Har2TreeError
 
 
 class Lookyloo():
@@ -50,7 +50,9 @@
         # Try to reach sanejs
         self.sanejs = SaneJS()
         if not self.sanejs.is_up:
-            self.sanejs = None
+            self.use_sane_js = False
+        else:
+            self.use_sane_js = True
 
     def __init_logger(self, loglevel: int) -> None:
         self.logger = logging.getLogger(f'{self.__class__.__name__}')
@@ -156,11 +158,11 @@
             return BytesIO(f.read())
 
     def sane_js_query(self, sha512: str) -> Dict:
-        if self.sanejs:
+        if self.use_sane_js:
             return self.sanejs.sha512(sha512)
         return {'response': []}
 
-    def scrape(self, url: str, cookies: List[dict]=[], depth: int=1, listing: bool=True, user_agent: str=None, perma_uuid: str=None,
+    def scrape(self, url: str, cookies: List[dict]=[], depth: int=1, listing: bool=True, user_agent: Optional[str]=None, perma_uuid: str=None,
                os: str=None, browser: str=None) -> Union[bool, str]:
         if not url.startswith('http'):
             url = f'http://{url}'
@@ -173,7 +175,7 @@
             return False
         else:
             return False
-
+        cookies = load_cookies()
         items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=user_agent, log_enabled=True, log_level='INFO')
         if not items:
             # broken
diff --git a/poetry.lock b/poetry.lock
index 090d36a5..1c91a108 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -239,7 +239,7 @@ lxml = "^4.4.2"
 six = "^1.14.0"
 
 [package.source]
-reference = "8b986b8b0db9736ca5e4ffae1cb2ba5ab7ae6589"
+reference = "3f7771a1467c27a45ebcc3ff00e3e6e834d54e72"
 type = "git"
 url = "https://github.com/viper-framework/har2tree.git"
 [[package]]
@@ -459,17 +459,17 @@
 version = "2.1.2"
 [[package]]
 category = "main"
-description = "Python client for SaneJS"
+description = ""
 name = "pysanejs"
 optional = false
-python-versions = "*"
-version = "0.1"
+python-versions = "^3.6"
+version = "0.1.0"
 
 [package.dependencies]
-requests = "*"
+requests = "^2.22.0"
 
 [package.source]
-reference = "3dfe0530a24e4caff04f27830ad4a38c52c74a2c"
+reference = "bdc091fbae7019c39b47a149b12f8ac032eda2a3"
 type = "git"
 url = "https://github.com/CIRCL/PySaneJS.git"
 [[package]]
@@ -568,7 +568,7 @@ scrapy = "^1.8.0"
 scrapy-splash = "^0.7.2"
 
 [package.source]
-reference = "b1e96b1d3c871c6e621d5463a3515cfb652ea5db"
+reference = "6165ff2d4f95618d6be99c5bc44fc707685364e1"
 type = "git"
 url = "https://github.com/viper-framework/ScrapySplashWrapper.git"
 [[package]]
@@ -658,8 +658,8 @@ category = "main"
 description = "HTTP library with thread-safe connection pooling, file post, and more."
 name = "urllib3"
 optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4"
-version = "1.25.7"
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4"
+version = "1.25.8"
 
 [package.extras]
 brotli = ["brotlipy (>=0.6.0)"]
@@ -1135,8 +1135,8 @@ typing-extensions = [
     {file = "typing_extensions-3.7.4.1.tar.gz", hash = "sha256:091ecc894d5e908ac75209f10d5b4f118fbdb2eb1ede6a63544054bb1edb41f2"},
 ]
 urllib3 = [
-    {file = "urllib3-1.25.7-py2.py3-none-any.whl", hash = "sha256:a8a318824cc77d1fd4b2bec2ded92646630d7fe8619497b142c84a9e6f5a7293"},
-    {file = "urllib3-1.25.7.tar.gz", hash = "sha256:f3c5fd51747d450d4dcf6f923c81f78f811aab8205fda64b0aba34a4e48b0745"},
+    {file = "urllib3-1.25.8-py2.py3-none-any.whl", hash = "sha256:2f3db8b19923a873b3e5256dc9c2dedfa883e33d87c690d9c7913e1f40673cdc"},
+    {file = "urllib3-1.25.8.tar.gz", hash = "sha256:87716c2d2a7121198ebcb7ce7cccf6ce5e9ba539041cfbaeecfb641dc0bf6acc"},
 ]
 w3lib = [
     {file = "w3lib-1.21.0-py2.py3-none-any.whl", hash = "sha256:847704b837b2b973cddef6938325d466628e6078266bc2e1f7ac49ba85c34823"},
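
Reviewer note (not part of the patch): load_cookies() expects <LOOKYLOO_HOME>/cookies.json to hold a list of browser-export style entries keyed by 'Host raw', 'Path raw', 'Name raw', 'HTTP only raw', 'Send for' and 'Content raw', and turns each one into a Splash-style cookie dict. The standalone sketch below replays that conversion on a single hypothetical entry; the host, cookie name and value are made up, while the field names and output keys come straight from the diff above.

#!/usr/bin/env python3
# Standalone sketch of the conversion done by load_cookies() in lookyloo/helpers.py.
# The sample entry is hypothetical; only the key names mirror the patch.
import json
from datetime import datetime, timedelta
from urllib.parse import urlparse

sample_entry = {                             # one item as it would sit in cookies.json
    'Host raw': 'https://www.example.com/',  # hypothetical host
    'Path raw': '/',
    'Name raw': 'session',                   # hypothetical cookie name
    'HTTP only raw': 'true',
    'Send for': 'Encrypted connections only',
    'Content raw': 'deadbeef',               # hypothetical value
}

# Same steps as in the patch: strip scheme and port from the host, map the export
# fields onto the keys Splash understands, and push the expiry ten days into the
# future so the cookie is always treated as still valid.
domain = urlparse(sample_entry['Host raw']).netloc.split(':', 1)[0]
splash_cookie = {'path': sample_entry['Path raw'],
                 'name': sample_entry['Name raw'],
                 'httpOnly': sample_entry['HTTP only raw'] == 'true',
                 'secure': sample_entry['Send for'] == 'Encrypted connections only',
                 'expires': (datetime.now() + timedelta(days=10)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z',
                 'domain': domain,
                 'value': sample_entry['Content raw']}

print(json.dumps(splash_cookie, indent=2))

One observable consequence of the lookyloo.py hunk: scrape() now calls cookies = load_cookies() unconditionally right before crawl(), so the contents of cookies.json replace whatever cookies the caller passed in.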