lookyloo/lookyloo/lookyloo.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import json

import pickle

from datetime import datetime

import tempfile
import pathlib
import time

import ipaddress
import socket
from urllib.parse import urlsplit

from io import BufferedIOBase, BytesIO
import base64
from uuid import uuid4

from pathlib import Path
from .helpers import get_homedir, get_socket_path, load_cookies, load_configs, safe_create_dir
from .exceptions import NoValidHarFile
from redis import Redis

from typing import Union, Dict, List, Tuple, Optional, Any

import logging

from pysanejs import SaneJS
from scrapysplashwrapper import crawl
from har2tree import CrawledTree, Har2TreeError, HarFile

from defang import refang  # type: ignore

from .modules import VirusTotal


class Lookyloo():

    def __init__(self) -> None:
        self.logger = logging.getLogger(f'{self.__class__.__name__}')
        self.configs: Dict[str, Dict[str, Any]] = load_configs()
        self.logger.setLevel(self.get_config('loglevel'))

        self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
        self.scrape_dir: Path = get_homedir() / 'scraped'
        self.splash_url: str = self.get_config('splash_url')
        self.only_global_lookups: bool = self.get_config('only_global_lookups')

        safe_create_dir(self.scrape_dir)

        # Initialize 3rd party components
        if 'modules' not in self.configs:
            self.logger.info('No third party components available in the config directory')
        else:
            if 'VirusTotal' in self.configs['modules']:
                self.vt = VirusTotal(self.configs['modules']['VirusTotal'])
                if not self.vt.available:
                    self.logger.warning('Unable to setup the VirusTotal module')

        if not self.redis.exists('cache_loaded'):
            self._init_existing_dumps()

        # Try to reach sanejs
        self.sanejs = SaneJS()
        if not self.sanejs.is_up:
            self.use_sane_js = False
        else:
            self.use_sane_js = True

    def rebuild_cache(self):
        self.redis.flushdb()
        self._init_existing_dumps()

    def remove_pickle(self, capture_dir: Path):
        if (capture_dir / 'tree.pickle').exists():
            (capture_dir / 'tree.pickle').unlink()

    def rebuild_all(self):
        for capture_dir in self.capture_dirs:
            self.remove_pickle(capture_dir)
        self.rebuild_cache()

    def get_config(self, entry: str) -> Any:
        """Get an entry from the generic config file. Automatic fallback to the sample file"""
        if 'generic' in self.configs:
            if entry in self.configs['generic']:
                return self.configs['generic'][entry]
            else:
                self.logger.warning(f'Unable to fing {entry} in config file.')
        else:
            self.logger.warning('No generic config file available.')
        self.logger.warning('Falling back on sample config, please initialize the generic config file.')
        with (get_homedir() / 'config' / 'generic.json.sample').open() as _c:
            sample_config = json.load(_c)
        return sample_config[entry]

    def _set_capture_cache(self, capture_dir: Path, force: bool=False) -> None:
        if force or not self.redis.exists(str(capture_dir)):
            # (re)build cache
            pass
        else:
            return

        with (capture_dir / 'uuid').open() as f:
            uuid = f.read().strip()

        har_files = sorted(capture_dir.glob('*.har'))

        error_cache: Dict[str, str] = {}
        if (capture_dir / 'error.txt').exists():
            # Something went wrong
            with (Path(capture_dir) / 'error.txt').open() as _error:
                error_cache['error'] = f'Capture in {capture_dir} has an error: {_error.read()}, see https://splash.readthedocs.io/en/stable/scripting-ref.html#splash-go and https://doc.qt.io/qt-5/qnetworkreply.html#NetworkError-enum'
        elif not har_files:
            error_cache['error'] = f'No har files in {capture_dir}'

        if error_cache:
            self.logger.warning(error_cache['error'])
            self.redis.hmset(str(capture_dir), error_cache)
            self.redis.hset('lookup_dirs', uuid, str(capture_dir))
            return

        har = HarFile(har_files[0])

        redirects = har.initial_redirects
        incomplete_redirects = False
        if redirects and har.need_tree_redirects:
            # load tree from disk, get redirects
            ct = self._load_pickle(capture_dir / 'tree.pickle')
            if ct:
                redirects = ct.redirects
            else:
                # Pickle not available
                incomplete_redirects = True

        cache: Dict[str, Union[str, int]] = {'uuid': uuid,
                                             'title': har.initial_title,
                                             'timestamp': har.initial_start_time,
                                             'url': har.first_url,
                                             'redirects': json.dumps(redirects),
                                             'incomplete_redirects': 1 if incomplete_redirects else 0}
        if (capture_dir / 'no_index').exists():  # If the folders claims anonymity
            cache['no_index'] = 1

        self.redis.hmset(str(capture_dir), cache)
        self.redis.hset('lookup_dirs', uuid, str(capture_dir))

    def capture_cache(self, capture_dir: Path) -> Optional[Dict[str, Union[str, int]]]:
        if self.redis.hget(str(capture_dir), 'incomplete_redirects') == '1':
            # try to rebuild the cache
            self._set_capture_cache(capture_dir, force=True)
        cached = self.redis.hgetall(str(capture_dir))
        if all(key in cached.keys() for key in ['uuid', 'title', 'timestamp', 'url', 'redirects']):
            cached['redirects'] = json.loads(cached['redirects'])
            return cached
        elif 'error' in cached:
            return cached
        else:
            self.logger.warning(f'Cache ({capture_dir}) is invalid: {json.dumps(cached, indent=2)}')
            return None

    def _init_existing_dumps(self) -> None:
        for capture_dir in self.capture_dirs:
            if capture_dir.exists():
                self._set_capture_cache(capture_dir)
        self.redis.set('cache_loaded', 1)

    @property
    def capture_dirs(self) -> List[Path]:
        for capture_dir in self.scrape_dir.iterdir():
            if capture_dir.is_dir() and not capture_dir.iterdir():
                # Cleanup self.scrape_dir of failed runs.
                capture_dir.rmdir()
            if not (capture_dir / 'uuid').exists():
                # Create uuid if missing
                with (capture_dir / 'uuid').open('w') as f:
                    f.write(str(uuid4()))
        return sorted(self.scrape_dir.iterdir(), reverse=True)

    def lookup_capture_dir(self, uuid) -> Union[Path, None]:
        capture_dir = self.redis.hget('lookup_dirs', uuid)
        if capture_dir:
            return Path(capture_dir)
        return None

    def enqueue_scrape(self, query: dict) -> str:
        perma_uuid = str(uuid4())
        p = self.redis.pipeline()
        p.hmset(perma_uuid, query)
        p.sadd('to_scrape', perma_uuid)
        p.execute()
        return perma_uuid

    def process_scrape_queue(self) -> Union[bool, None]:
        uuid = self.redis.spop('to_scrape')
        if not uuid:
            return None
        to_scrape = self.redis.hgetall(uuid)
        self.redis.delete(uuid)
        to_scrape['perma_uuid'] = uuid
        if self.scrape(**to_scrape):
            self.logger.info(f'Processed {to_scrape["url"]}')
            return True
        return False

    def _load_pickle(self, pickle_file: Path) -> Optional[CrawledTree]:
        if pickle_file.exists():
            with pickle_file.open('rb') as _p:
                return pickle.load(_p)
        return None

    def load_tree(self, capture_dir: Path) -> Tuple[str, dict, str, str, str, dict]:
        har_files = sorted(capture_dir.glob('*.har'))
        pickle_file = capture_dir / 'tree.pickle'
        try:
            meta = {}
            if (capture_dir / 'meta').exists():
                # NOTE: Legacy, the meta file should be present
                with open((capture_dir / 'meta'), 'r') as f:
                    meta = json.load(f)
            ct = self._load_pickle(pickle_file)
            if not ct:
                ct = CrawledTree(har_files)
                with pickle_file.open('wb') as _p:
                    pickle.dump(ct, _p)
            return str(pickle_file), ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url, meta
        except Har2TreeError as e:
            raise NoValidHarFile(e.message)

    def cleanup_old_tmpfiles(self):
        for tmpfile in pathlib.Path(tempfile.gettempdir()).glob('lookyloo*'):
            if time.time() - tmpfile.stat().st_atime > 36000:
                tmpfile.unlink()

    def load_image(self, capture_dir: Path) -> BytesIO:
        with open(list(capture_dir.glob('*.png'))[0], 'rb') as f:
            return BytesIO(f.read())

    def sane_js_query(self, sha512: str) -> Dict:
        if self.use_sane_js:
            return self.sanejs.sha512(sha512)
        return {'response': []}

    def scrape(self, url: str, cookies_pseudofile: Optional[BufferedIOBase]=None, depth: int=1, listing: bool=True, user_agent: Optional[str]=None, perma_uuid: str=None,
               os: str=None, browser: str=None) -> Union[bool, str]:
        url = url.strip()
        url = refang(url)
        if not url.startswith('http'):
            url = f'http://{url}'
        if self.only_global_lookups:
            splitted_url = urlsplit(url)
            if splitted_url.netloc:
                if splitted_url.hostname:
                    try:
                        ip = socket.gethostbyname(splitted_url.hostname)
                    except socket.gaierror:
                        self.logger.info(f'Name or service not known')
                        return False
                    if not ipaddress.ip_address(ip).is_global:
                        return False
            else:
                return False

        cookies = load_cookies(cookies_pseudofile)
        items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=user_agent, log_enabled=True, log_level='INFO')
        if not items:
            # broken
            return False
        if not perma_uuid:
            perma_uuid = str(uuid4())
        width = len(str(len(items)))
        dirpath = self.scrape_dir / datetime.now().isoformat()
        safe_create_dir(dirpath)
        for i, item in enumerate(items):
            if not listing:  # Write no_index marker
                (dirpath / 'no_index').touch()
            with (dirpath / 'uuid').open('w') as _uuid:
                _uuid.write(perma_uuid)
            if os or browser:
                meta = {}
                if os:
                    meta['os'] = os
                if browser:
                    meta['browser'] = browser
                with (dirpath / 'meta').open('w') as _meta:
                    json.dump(meta, _meta)
            if 'error' in item:
                with (dirpath / 'error.txt').open('w') as _error:
                    _error.write(item['error'])
                continue

            # The capture went fine
            harfile = item['har']
            png = base64.b64decode(item['png'])
            html = item['html']
            last_redirect = item['last_redirected_url']

            with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as _har:
                json.dump(harfile, _har)
            with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as _img:
                _img.write(png)
            with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as _html:
                _html.write(html)
            with (dirpath / '{0:0{width}}.last_redirect.txt'.format(i, width=width)).open('w') as _redir:
                _redir.write(last_redirect)

            if 'childFrames' in item:
                child_frames = item['childFrames']
                with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as _iframes:
                    json.dump(child_frames, _iframes)

            if 'cookies' in item:
                cookies = item['cookies']
                with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
                    json.dump(cookies, _cookies)

        self._set_capture_cache(dirpath)
        return perma_uuid
chg: Refactor code organisation 2019-01-30 14:30:01 +01:00			`#!/usr/bin/env python3`
			`# -- coding: utf-8 --`

			`import json`

			`import pickle`

			`from datetime import datetime`

			`import tempfile`
			`import pathlib`
			`import time`

fix: Allow to disable scraping private IPs. 2019-07-05 16:27:23 +02:00			`import ipaddress`
			`import socket`
			`from urllib.parse import urlsplit`

chg: Fix typing 2020-01-24 11:25:53 +01:00			`from io import BufferedIOBase, BytesIO`
chg: Refactor code organisation 2019-01-30 14:30:01 +01:00			`import base64`
			`from uuid import uuid4`

			`from pathlib import Path`
chg: Bump har2tree, cleanup 2020-04-01 14:33:35 +02:00			`from .helpers import get_homedir, get_socket_path, load_cookies, load_configs, safe_create_dir`
new: Add error page 2019-02-18 13:52:48 +01:00			`from .exceptions import NoValidHarFile`
chg: Refactor code organisation 2019-01-30 14:30:01 +01:00			`from redis import Redis`

chg: Refactoring of the redirects, rename report -> capture 2020-03-26 01:56:24 +01:00			`from typing import Union, Dict, List, Tuple, Optional, Any`
chg: Add typing 2020-01-06 15:32:38 +01:00
chg: Refactor code organisation 2019-01-30 14:30:01 +01:00			`import logging`

new: Add support for user defined cookies, bump deps 2020-01-23 10:52:50 +01:00			`from pysanejs import SaneJS`
			`from scrapysplashwrapper import crawl`
new: Add initial redirects columns 2020-02-03 22:25:48 +01:00			`from har2tree import CrawledTree, Har2TreeError, HarFile`
chg: Add typing 2020-01-06 15:32:38 +01:00
new: refang urls when needed 2020-03-16 13:51:21 +01:00			`from defang import refang # type: ignore`

new: Add config files, initial support for 3rd party modules 2020-03-31 14:12:49 +02:00			`from .modules import VirusTotal`

chg: Refactor code organisation 2019-01-30 14:30:01 +01:00
			`class Lookyloo():`

new: Add config files, initial support for 3rd party modules 2020-03-31 14:12:49 +02:00			`def __init__(self) -> None:`
			`self.logger = logging.getLogger(f'{self.__class__.__name__}')`
			`self.configs: Dict[str, Dict[str, Any]] = load_configs()`
			`self.logger.setLevel(self.get_config('loglevel'))`

chg: Add typing 2020-01-06 15:32:38 +01:00			`self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)`
			`self.scrape_dir: Path = get_homedir() / 'scraped'`
new: Add config files, initial support for 3rd party modules 2020-03-31 14:12:49 +02:00			`self.splash_url: str = self.get_config('splash_url')`
			`self.only_global_lookups: bool = self.get_config('only_global_lookups')`
chg: Bump har2tree, cleanup 2020-04-01 14:33:35 +02:00
			`safe_create_dir(self.scrape_dir)`
chg: Refactor code organisation 2019-01-30 14:30:01 +01:00
new: Add config files, initial support for 3rd party modules 2020-03-31 14:12:49 +02:00			`# Initialize 3rd party components`
			`if 'modules' not in self.configs:`
			`self.logger.info('No third party components available in the config directory')`
			`else:`
			`if 'VirusTotal' in self.configs['modules']:`
			`self.vt = VirusTotal(self.configs['modules']['VirusTotal'])`
			`if not self.vt.available:`
			`self.logger.warning('Unable to setup the VirusTotal module')`

fix: Avoid loading the cache multiple times 2019-04-05 15:07:22 +02:00			`if not self.redis.exists('cache_loaded'):`
			`self._init_existing_dumps()`
chg: pre-load the existing reports 2019-02-01 16:11:16 +01:00
chg: Refactor code organisation 2019-01-30 14:30:01 +01:00			`# Try to reach sanejs`
			`self.sanejs = SaneJS()`
			`if not self.sanejs.is_up:`
new: Add support for user defined cookies, bump deps 2020-01-23 10:52:50 +01:00			`self.use_sane_js = False`
			`else:`
			`self.use_sane_js = True`
chg: Refactor code organisation 2019-01-30 14:30:01 +01:00
new: Allow admin to rebuild cache 2020-04-01 17:44:06 +02:00			`def rebuild_cache(self):`
			`self.redis.flushdb()`
			`self._init_existing_dumps()`

			`def remove_pickle(self, capture_dir: Path):`
			`if (capture_dir / 'tree.pickle').exists():`
			`(capture_dir / 'tree.pickle').unlink()`

			`def rebuild_all(self):`
			`for capture_dir in self.capture_dirs:`
			`self.remove_pickle(capture_dir)`
			`self.rebuild_cache()`

chg: Bump har2tree, cleanup 2020-04-01 14:33:35 +02:00			`def get_config(self, entry: str) -> Any:`
new: Add config files, initial support for 3rd party modules 2020-03-31 14:12:49 +02:00			`"""Get an entry from the generic config file. Automatic fallback to the sample file"""`
			`if 'generic' in self.configs:`
			`if entry in self.configs['generic']:`
			`return self.configs['generic'][entry]`
			`else:`
			`self.logger.warning(f'Unable to fing {entry} in config file.')`
			`else:`
			`self.logger.warning('No generic config file available.')`
			`self.logger.warning('Falling back on sample config, please initialize the generic config file.')`
			`with (get_homedir() / 'config' / 'generic.json.sample').open() as _c:`
			`sample_config = json.load(_c)`
			`return sample_config[entry]`
chg: Refactor code organisation 2019-01-30 14:30:01 +01:00
chg: Refactoring of the redirects, rename report -> capture 2020-03-26 01:56:24 +01:00			`def _set_capture_cache(self, capture_dir: Path, force: bool=False) -> None:`
			`if force or not self.redis.exists(str(capture_dir)):`
			`# (re)build cache`
			`pass`
			`else:`
fix: Initialize cache only once 2019-06-25 18:08:52 +02:00			`return`
chg: Refactoring of the redirects, rename report -> capture 2020-03-26 01:56:24 +01:00
			`with (capture_dir / 'uuid').open() as f:`
chg: Better handling of error messages, display them. 2020-03-17 14:17:18 +01:00			`uuid = f.read().strip()`

chg: Refactoring of the redirects, rename report -> capture 2020-03-26 01:56:24 +01:00			`har_files = sorted(capture_dir.glob('*.har'))`
chg: Better handling of error messages, display them. 2020-03-17 14:17:18 +01:00
chg: Improve error notification 2020-03-17 15:27:04 +01:00			`error_cache: Dict[str, str] = {}`
chg: Refactoring of the redirects, rename report -> capture 2020-03-26 01:56:24 +01:00			`if (capture_dir / 'error.txt').exists():`
chg: Bump dependencies, Improve HTTP errors handling Fix https://github.com/CIRCL/lookyloo/issues/66 2020-02-16 23:38:42 +01:00			`# Something went wrong`
chg: Refactoring of the redirects, rename report -> capture 2020-03-26 01:56:24 +01:00			`with (Path(capture_dir) / 'error.txt').open() as _error:`
			`error_cache['error'] = f'Capture in {capture_dir} has an error: {_error.read()}, see https://splash.readthedocs.io/en/stable/scripting-ref.html#splash-go and https://doc.qt.io/qt-5/qnetworkreply.html#NetworkError-enum'`
chg: Improve error notification 2020-03-17 15:27:04 +01:00			`elif not har_files:`
chg: Refactoring of the redirects, rename report -> capture 2020-03-26 01:56:24 +01:00			`error_cache['error'] = f'No har files in {capture_dir}'`
chg: Better handling of error messages, display them. 2020-03-17 14:17:18 +01:00
			`if error_cache:`
			`self.logger.warning(error_cache['error'])`
chg: Refactoring of the redirects, rename report -> capture 2020-03-26 01:56:24 +01:00			`self.redis.hmset(str(capture_dir), error_cache)`
			`self.redis.hset('lookup_dirs', uuid, str(capture_dir))`
chg: pre-load the existing reports 2019-02-01 16:11:16 +01:00			`return`
chg: Better handling of error messages, display them. 2020-03-17 14:17:18 +01:00
new: Add initial redirects columns 2020-02-03 22:25:48 +01:00			`har = HarFile(har_files[0])`

chg: Refactoring of the redirects, rename report -> capture 2020-03-26 01:56:24 +01:00			`redirects = har.initial_redirects`
			`incomplete_redirects = False`
			`if redirects and har.need_tree_redirects:`
			`# load tree from disk, get redirects`
			`ct = self._load_pickle(capture_dir / 'tree.pickle')`
			`if ct:`
			`redirects = ct.redirects`
			`else:`
			`# Pickle not available`
			`incomplete_redirects = True`

new: Add initial redirects columns 2020-02-03 22:25:48 +01:00			`cache: Dict[str, Union[str, int]] = {'uuid': uuid,`
			`'title': har.initial_title,`
			`'timestamp': har.initial_start_time,`
			`'url': har.first_url,`
chg: Refactoring of the redirects, rename report -> capture 2020-03-26 01:56:24 +01:00			`'redirects': json.dumps(redirects),`
			`'incomplete_redirects': 1 if incomplete_redirects else 0}`
			`if (capture_dir / 'no_index').exists(): # If the folders claims anonymity`
chg: Slight cleanup 2019-02-18 14:29:15 +01:00			`cache['no_index'] = 1`
chg: Refactoring of the redirects, rename report -> capture 2020-03-26 01:56:24 +01:00
			`self.redis.hmset(str(capture_dir), cache)`
			`self.redis.hset('lookup_dirs', uuid, str(capture_dir))`

			`def capture_cache(self, capture_dir: Path) -> Optional[Dict[str, Union[str, int]]]:`
			`if self.redis.hget(str(capture_dir), 'incomplete_redirects') == '1':`
			`# try to rebuild the cache`
			`self._set_capture_cache(capture_dir, force=True)`
			`cached = self.redis.hgetall(str(capture_dir))`
fix: Properly check keys in cache 2020-02-11 17:26:01 +01:00			`if all(key in cached.keys() for key in ['uuid', 'title', 'timestamp', 'url', 'redirects']):`
fix: Better handling of cache issues 2020-02-11 17:03:25 +01:00			`cached['redirects'] = json.loads(cached['redirects'])`
			`return cached`
chg: Better handling of error messages, display them. 2020-03-17 14:17:18 +01:00			`elif 'error' in cached:`
			`return cached`
			`else:`
chg: Refactoring of the redirects, rename report -> capture 2020-03-26 01:56:24 +01:00			`self.logger.warning(f'Cache ({capture_dir}) is invalid: {json.dumps(cached, indent=2)}')`
chg: Better handling of error messages, display them. 2020-03-17 14:17:18 +01:00			`return None`
chg: pre-load the existing reports 2019-02-01 16:11:16 +01:00
chg: Add typing 2020-01-06 15:32:38 +01:00			`def _init_existing_dumps(self) -> None:`
chg: Refactoring of the redirects, rename report -> capture 2020-03-26 01:56:24 +01:00			`for capture_dir in self.capture_dirs:`
			`if capture_dir.exists():`
			`self._set_capture_cache(capture_dir)`
fix: Avoid loading the cache multiple times 2019-04-05 15:07:22 +02:00			`self.redis.set('cache_loaded', 1)`
chg: pre-load the existing reports 2019-02-01 16:11:16 +01:00
chg: Refactor code organisation 2019-01-30 14:30:01 +01:00			`@property`
chg: Refactoring of the redirects, rename report -> capture 2020-03-26 01:56:24 +01:00			`def capture_dirs(self) -> List[Path]:`
			`for capture_dir in self.scrape_dir.iterdir():`
			`if capture_dir.is_dir() and not capture_dir.iterdir():`
chg: Refactor code organisation 2019-01-30 14:30:01 +01:00			`# Cleanup self.scrape_dir of failed runs.`
chg: Refactoring of the redirects, rename report -> capture 2020-03-26 01:56:24 +01:00			`capture_dir.rmdir()`
			`if not (capture_dir / 'uuid').exists():`
chg: Refactor code organisation 2019-01-30 14:30:01 +01:00			`# Create uuid if missing`
chg: Refactoring of the redirects, rename report -> capture 2020-03-26 01:56:24 +01:00			`with (capture_dir / 'uuid').open('w') as f:`
chg: Refactor code organisation 2019-01-30 14:30:01 +01:00			`f.write(str(uuid4()))`
			`return sorted(self.scrape_dir.iterdir(), reverse=True)`

chg: Refactoring of the redirects, rename report -> capture 2020-03-26 01:56:24 +01:00			`def lookup_capture_dir(self, uuid) -> Union[Path, None]:`
			`capture_dir = self.redis.hget('lookup_dirs', uuid)`
			`if capture_dir:`
			`return Path(capture_dir)`
chg: pre-load the existing reports 2019-02-01 16:11:16 +01:00			`return None`
chg: Refactor code organisation 2019-01-30 14:30:01 +01:00
chg: Add typing 2020-01-06 15:32:38 +01:00			`def enqueue_scrape(self, query: dict) -> str:`
chg: Refactor code organisation 2019-01-30 14:30:01 +01:00			`perma_uuid = str(uuid4())`
			`p = self.redis.pipeline()`
			`p.hmset(perma_uuid, query)`
			`p.sadd('to_scrape', perma_uuid)`
			`p.execute()`
			`return perma_uuid`

chg: Add typing 2020-01-06 15:32:38 +01:00			`def process_scrape_queue(self) -> Union[bool, None]:`
chg: Refactor code organisation 2019-01-30 14:30:01 +01:00			`uuid = self.redis.spop('to_scrape')`
			`if not uuid:`
chg: Improve async processing 2019-04-05 16:12:54 +02:00			`return None`
chg: Refactor code organisation 2019-01-30 14:30:01 +01:00			`to_scrape = self.redis.hgetall(uuid)`
			`self.redis.delete(uuid)`
			`to_scrape['perma_uuid'] = uuid`
chg: Improve async processing 2019-04-05 16:12:54 +02:00			`if self.scrape(**to_scrape):`
			`self.logger.info(f'Processed {to_scrape["url"]}')`
			`return True`
			`return False`
chg: Refactor code organisation 2019-01-30 14:30:01 +01:00
chg: Refactoring of the redirects, rename report -> capture 2020-03-26 01:56:24 +01:00			`def _load_pickle(self, pickle_file: Path) -> Optional[CrawledTree]:`
			`if pickle_file.exists():`
			`with pickle_file.open('rb') as _p:`
			`return pickle.load(_p)`
			`return None`

			`def load_tree(self, capture_dir: Path) -> Tuple[str, dict, str, str, str, dict]:`
			`har_files = sorted(capture_dir.glob('*.har'))`
			`pickle_file = capture_dir / 'tree.pickle'`
new: Add error page 2019-02-18 13:52:48 +01:00			`try:`
new: keep track of metadata about OS and Browser when scraping 2019-04-07 23:54:16 +02:00			`meta = {}`
chg: Refactoring of the redirects, rename report -> capture 2020-03-26 01:56:24 +01:00			`if (capture_dir / 'meta').exists():`
			`# NOTE: Legacy, the meta file should be present`
			`with open((capture_dir / 'meta'), 'r') as f:`
new: keep track of metadata about OS and Browser when scraping 2019-04-07 23:54:16 +02:00			`meta = json.load(f)`
chg: Refactoring of the redirects, rename report -> capture 2020-03-26 01:56:24 +01:00			`ct = self._load_pickle(pickle_file)`
			`if not ct:`
			`ct = CrawledTree(har_files)`
			`with pickle_file.open('wb') as _p:`
			`pickle.dump(ct, _p)`
fix: Pass proper path to the pickled dump 2020-03-26 14:55:01 +01:00			`return str(pickle_file), ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url, meta`
new: Add error page 2019-02-18 13:52:48 +01:00			`except Har2TreeError as e:`
			`raise NoValidHarFile(e.message)`
chg: Refactor code organisation 2019-01-30 14:30:01 +01:00
			`def cleanup_old_tmpfiles(self):`
			`for tmpfile in pathlib.Path(tempfile.gettempdir()).glob('lookyloo*'):`
			`if time.time() - tmpfile.stat().st_atime > 36000:`
			`tmpfile.unlink()`

chg: Refactoring of the redirects, rename report -> capture 2020-03-26 01:56:24 +01:00			`def load_image(self, capture_dir: Path) -> BytesIO:`
			`with open(list(capture_dir.glob('*.png'))[0], 'rb') as f:`
chg: Refactor code organisation 2019-01-30 14:30:01 +01:00			`return BytesIO(f.read())`

chg: Add typing 2020-01-06 15:32:38 +01:00			`def sane_js_query(self, sha512: str) -> Dict:`
new: Add support for user defined cookies, bump deps 2020-01-23 10:52:50 +01:00			`if self.use_sane_js:`
chg: Refactor code organisation 2019-01-30 14:30:01 +01:00			`return self.sanejs.sha512(sha512)`
			`return {'response': []}`

chg: Fix typing 2020-01-24 11:25:53 +01:00			`def scrape(self, url: str, cookies_pseudofile: Optional[BufferedIOBase]=None, depth: int=1, listing: bool=True, user_agent: Optional[str]=None, perma_uuid: str=None,`
chg: Add typing 2020-01-06 15:32:38 +01:00			`os: str=None, browser: str=None) -> Union[bool, str]:`
fix: Strip tabs/spaces around URL 2020-03-19 14:05:19 +01:00			`url = url.strip()`
fix: Refang the URL at the right place 2020-03-19 11:05:29 +01:00			`url = refang(url)`
chg: Refactor code organisation 2019-01-30 14:30:01 +01:00			`if not url.startswith('http'):`
			`url = f'http://{url}'`
fix: Allow to disable scraping private IPs. 2019-07-05 16:27:23 +02:00			`if self.only_global_lookups:`
			`splitted_url = urlsplit(url)`
			`if splitted_url.netloc:`
chg: Add typing 2020-01-06 15:32:38 +01:00			`if splitted_url.hostname:`
Fixing Unknown domain if only_global_lookups is set 2020-03-17 16:26:31 +01:00			`try:`
			`ip = socket.gethostbyname(splitted_url.hostname)`
Fixing Unknown domain if only_global_lookups is set 2020-03-17 16:32:39 +01:00			`except socket.gaierror:`
Fixing Unknown domain if only_global_lookups is set 2020-03-17 16:26:31 +01:00			`self.logger.info(f'Name or service not known')`
			`return False`
chg: Add typing 2020-01-06 15:32:38 +01:00			`if not ipaddress.ip_address(ip).is_global:`
			`return False`
fix: Avoid bypass with "http://user:pwd@host.tld" 2019-07-05 18:41:23 +02:00			`else:`
			`return False`
new: Upload optional cookie file 2020-01-24 10:17:41 +01:00
			`cookies = load_cookies(cookies_pseudofile)`
chg: Support cookies from/to splash (no UI yet) 2020-01-21 11:47:36 +01:00			`items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=user_agent, log_enabled=True, log_level='INFO')`
chg: Refactor code organisation 2019-01-30 14:30:01 +01:00			`if not items:`
			`# broken`
chg: Improve async processing 2019-04-05 16:12:54 +02:00			`return False`
chg: Refactor code organisation 2019-01-30 14:30:01 +01:00			`if not perma_uuid:`
			`perma_uuid = str(uuid4())`
			`width = len(str(len(items)))`
			`dirpath = self.scrape_dir / datetime.now().isoformat()`
chg: Bump har2tree, cleanup 2020-04-01 14:33:35 +02:00			`safe_create_dir(dirpath)`
chg: Refactor code organisation 2019-01-30 14:30:01 +01:00			`for i, item in enumerate(items):`
chg: Bump dependencies, Improve HTTP errors handling Fix https://github.com/CIRCL/lookyloo/issues/66 2020-02-16 23:38:42 +01:00			`if not listing: # Write no_index marker`
			`(dirpath / 'no_index').touch()`
			`with (dirpath / 'uuid').open('w') as _uuid:`
			`_uuid.write(perma_uuid)`
			`if os or browser:`
			`meta = {}`
			`if os:`
			`meta['os'] = os`
			`if browser:`
			`meta['browser'] = browser`
			`with (dirpath / 'meta').open('w') as _meta:`
			`json.dump(meta, _meta)`
			`if 'error' in item:`
			`with (dirpath / 'error.txt').open('w') as _error:`
			`_error.write(item['error'])`
			`continue`

			`# The capture went fine`
chg: Refactor code organisation 2019-01-30 14:30:01 +01:00			`harfile = item['har']`
			`png = base64.b64decode(item['png'])`
chg: Bump dependencies, Improve HTTP errors handling Fix https://github.com/CIRCL/lookyloo/issues/66 2020-02-16 23:38:42 +01:00			`html = item['html']`
new: Add support for last redirect 2020-03-18 21:14:48 +01:00			`last_redirect = item['last_redirected_url']`
chg: Bump dependencies, Improve HTTP errors handling Fix https://github.com/CIRCL/lookyloo/issues/66 2020-02-16 23:38:42 +01:00
			`with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as _har:`
			`json.dump(harfile, _har)`
			`with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as _img:`
			`_img.write(png)`
			`with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as _html:`
			`_html.write(html)`
new: Add support for last redirect 2020-03-18 21:14:48 +01:00			`with (dirpath / '{0:0{width}}.last_redirect.txt'.format(i, width=width)).open('w') as _redir:`
			`_redir.write(last_redirect)`
chg: Support cookies from/to splash (no UI yet) 2020-01-21 11:47:36 +01:00
			`if 'childFrames' in item:`
			`child_frames = item['childFrames']`
			`with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as _iframes:`
			`json.dump(child_frames, _iframes)`

			`if 'cookies' in item:`
			`cookies = item['cookies']`
			`with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:`
			`json.dump(cookies, _cookies)`

chg: Refactoring of the redirects, rename report -> capture 2020-03-26 01:56:24 +01:00			`self._set_capture_cache(dirpath)`
chg: Refactor code organisation 2019-01-30 14:30:01 +01:00			`return perma_uuid`