lookyloo/lookyloo/lookyloo.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import pickle
from datetime import datetime
import tempfile
import pathlib
import time
import ipaddress
import socket
from urllib.parse import urlsplit
from io import BytesIO
import base64
from uuid import uuid4
from pathlib import Path
from .helpers import get_homedir, get_socket_path
from .exceptions import NoValidHarFile
from redis import Redis
from typing import Union, Dict, List, Tuple, Optional
import logging
from pysanejs import SaneJS # type: ignore
from scrapysplashwrapper import crawl # type: ignore
from har2tree import CrawledTree, Har2TreeError # type: ignore


class Lookyloo():

    def __init__(self, splash_url: str='http://127.0.0.1:8050', loglevel: int=logging.DEBUG, only_global_lookups: bool=False) -> None:
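        """Set up the Redis cache, the scraping directory, the Splash URL and, if it is reachable, a SaneJS client."""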
        self.__init_logger(loglevel)
        self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
        self.scrape_dir: Path = get_homedir() / 'scraped'
        self.splash_url: str = splash_url
        self.only_global_lookups: bool = only_global_lookups
        if not self.scrape_dir.exists():
            self.scrape_dir.mkdir(parents=True, exist_ok=True)
        if not self.redis.exists('cache_loaded'):
            self._init_existing_dumps()

        # Try to reach sanejs
        self.sanejs = SaneJS()
        if not self.sanejs.is_up:
            self.sanejs = None

    def __init_logger(self, loglevel: int) -> None:
        self.logger = logging.getLogger(f'{self.__class__.__name__}')
        self.logger.setLevel(loglevel)

    def _set_report_cache(self, report_dir: Path) -> None:
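        """Cache the report's UUID and page title in Redis so it can be listed and looked up; remove report directories without HAR files."""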
        if self.redis.exists(str(report_dir)):
            return
        har_files = sorted(report_dir.glob('*.har'))
        if not har_files:
            self.logger.warning(f'No har files in {report_dir}')
            if (report_dir / 'uuid').exists():
                (report_dir / 'uuid').unlink()
            if (report_dir / 'no_index').exists():
                (report_dir / 'no_index').unlink()
            report_dir.rmdir()
            return
        with (report_dir / 'uuid').open() as f:
            uuid = f.read().strip()
        with har_files[0].open() as f:
            j = json.load(f)
            title = j['log']['pages'][0]['title']
            if not title:
                title = '!! No title found !! '
        cache = {'uuid': uuid, 'title': title}
        if (report_dir / 'no_index').exists():  # If the folder claims anonymity
            cache['no_index'] = 1
        if uuid and not self.redis.exists(str(report_dir)):
            self.redis.hmset(str(report_dir), cache)
            self.redis.hset('lookup_dirs', uuid, str(report_dir))

    def report_cache(self, report_dir: Union[str, Path]) -> Dict:
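        """Return the cached metadata (uuid, title, no_index flag) of a report directory."""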
        if isinstance(report_dir, Path):
            report_dir = str(report_dir)
        return self.redis.hgetall(report_dir)

    def _init_existing_dumps(self) -> None:
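        """Populate the Redis cache from the report directories already present on disk."""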
        for report_dir in self.report_dirs:
            if report_dir.exists():
                self._set_report_cache(report_dir)
        self.redis.set('cache_loaded', 1)

    @property
    def report_dirs(self) -> List[Path]:
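        """All report directories, newest first; empty directories left by failed runs are removed and missing uuid files are created."""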
        for report_dir in self.scrape_dir.iterdir():
            if report_dir.is_dir() and not any(report_dir.iterdir()):
                # Cleanup self.scrape_dir of failed runs.
                report_dir.rmdir()
                continue
            if not (report_dir / 'uuid').exists():
                # Create uuid if missing
                with (report_dir / 'uuid').open('w') as f:
                    f.write(str(uuid4()))
        return sorted(self.scrape_dir.iterdir(), reverse=True)

    def lookup_report_dir(self, uuid: str) -> Union[Path, None]:
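        """Return the report directory matching a permanent UUID, or None if it is unknown."""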
        report_dir = self.redis.hget('lookup_dirs', uuid)
        if report_dir:
            return Path(report_dir)
        return None

    def enqueue_scrape(self, query: dict) -> str:
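        """Store a scrape request in Redis, add it to the queue and return its permanent UUID."""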
        perma_uuid = str(uuid4())
        p = self.redis.pipeline()
        p.hmset(perma_uuid, query)
        p.sadd('to_scrape', perma_uuid)
        p.execute()
        return perma_uuid

    def process_scrape_queue(self) -> Union[bool, None]:
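        """Pop one request off the scrape queue and process it: None if the queue is empty, True on success, False on failure."""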
        uuid = self.redis.spop('to_scrape')
        if not uuid:
            return None
        to_scrape = self.redis.hgetall(uuid)
        self.redis.delete(uuid)
        to_scrape['perma_uuid'] = uuid
        if self.scrape(**to_scrape):
            self.logger.info(f'Processed {to_scrape["url"]}')
            return True
        return False

    def load_tree(self, report_dir: Path) -> Tuple[str, dict, str, str, str, dict]:
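        """Build the crawled tree from the HAR files, pickle it to a temporary file and return
        the pickle path, the serialised tree, start time, user agent, root URL and meta information."""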
        har_files = sorted(report_dir.glob('*.har'))
        try:
            meta = {}
            if (report_dir / 'meta').exists():
                with open((report_dir / 'meta'), 'r') as f:
                    meta = json.load(f)
            ct = CrawledTree(har_files)
            ct.find_parents()
            ct.join_trees()
            temp = tempfile.NamedTemporaryFile(prefix='lookyloo', delete=False)
            pickle.dump(ct, temp)
            temp.close()
            return temp.name, ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url, meta
        except Har2TreeError as e:
            raise NoValidHarFile(e.message)

    def cleanup_old_tmpfiles(self) -> None:
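        """Delete pickled trees in the system temp directory that have not been accessed for more than ten hours."""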
        for tmpfile in pathlib.Path(tempfile.gettempdir()).glob('lookyloo*'):
            if time.time() - tmpfile.stat().st_atime > 36000:
                tmpfile.unlink()

    def load_image(self, report_dir: Path) -> BytesIO:
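        """Return the first screenshot of the report as an in-memory buffer."""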
        with open(list(report_dir.glob('*.png'))[0], 'rb') as f:
            return BytesIO(f.read())

    def sane_js_query(self, sha512: str) -> Dict:
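        """Query SaneJS for a script with the given SHA512; return an empty response when SaneJS is not available."""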
        if self.sanejs:
            return self.sanejs.sha512(sha512)
        return {'response': []}

    def scrape(self, url: str, depth: int=1, listing: bool=True, user_agent: Optional[str]=None, perma_uuid: Optional[str]=None,
               os: Optional[str]=None, browser: Optional[str]=None) -> Union[bool, str]:
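        """Crawl the URL through Splash and store the HAR, screenshot, HTML and frames of each crawled page
        in a new report directory; return the permanent UUID of the capture, or False on failure."""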
        if not url.startswith('http'):
            url = f'http://{url}'
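        # When only_global_lookups is enabled, refuse to scrape hosts that resolve to
        # non-global (private, loopback or reserved) IP addresses.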
        if self.only_global_lookups:
            splitted_url = urlsplit(url)
            if splitted_url.netloc:
                if splitted_url.hostname:
                    ip = socket.gethostbyname(splitted_url.hostname)
                    if not ipaddress.ip_address(ip).is_global:
                        return False
            else:
                return False
        items = crawl(self.splash_url, url, depth, user_agent=user_agent, log_enabled=True, log_level='INFO')
        if not items:
            # The crawl returned nothing, there is nothing to store.
            return False
        if not perma_uuid:
            perma_uuid = str(uuid4())
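        # Zero-pad the per-page file names so they sort in crawl order.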
        width = len(str(len(items)))
        dirpath = self.scrape_dir / datetime.now().isoformat()
        dirpath.mkdir()
        for i, item in enumerate(items):
            harfile = item['har']
            png = base64.b64decode(item['png'])
            child_frames = item['childFrames']
            html = item['html']
            with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as _har:
                json.dump(harfile, _har)
            with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as _img:
                _img.write(png)
            with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as _html:
                _html.write(html)
            with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as _iframes:
                json.dump(child_frames, _iframes)
        with (dirpath / 'uuid').open('w') as _uuid:
            _uuid.write(perma_uuid)
        if not listing:  # Write no_index marker
            (dirpath / 'no_index').touch()
        if os or browser:
            meta = {}
            if os:
                meta['os'] = os
            if browser:
                meta['browser'] = browser
            with (dirpath / 'meta').open('w') as _meta:
                json.dump(meta, _meta)
        self._set_report_cache(dirpath)
        return perma_uuid