lookyloo/lookyloo/lookyloo.py


#!/usr/bin/env python3
# -*- coding: utf-8 -*-
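"""Core of Lookyloo: drives the Splash crawler, writes each capture
(HAR, screenshot, HTML, child frames) to disk, and keeps a Redis-backed
queue of pending scrapes. SaneJS is queried, when reachable, to identify
known JavaScript by SHA512 hash."""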
import base64
import json
import logging
import pickle
import tempfile
import time
from datetime import datetime
from io import BytesIO
from pathlib import Path
from typing import Optional
from uuid import uuid4

from har2tree import CrawledTree
from pysanejs import SaneJS
from redis import Redis
from scrapysplashwrapper import crawl

from .helpers import get_homedir, get_socket_path


class Lookyloo():
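    """Scrape URLs with Splash, store the captures under `scraped/`, queue
    scrape requests in Redis, rebuild har2tree trees and query SaneJS."""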

    def __init__(self, splash_url: str='http://127.0.0.1:8050', loglevel: int=logging.DEBUG):
        self.__init_logger(loglevel)
        self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
        self.scrape_dir = get_homedir() / 'scraped'
        self.splash_url = splash_url
        if not self.scrape_dir.exists():
            self.scrape_dir.mkdir(parents=True, exist_ok=True)
        # Try to reach SaneJS; keep None so sane_js_query() can short-circuit.
        self.sanejs = SaneJS()
        if not self.sanejs.is_up:
            self.sanejs = None

    def __init_logger(self, loglevel) -> None:
        self.logger = logging.getLogger(f'{self.__class__.__name__}')
        self.logger.setLevel(loglevel)

    @property
    def report_dirs(self):
        for report_dir in self.scrape_dir.iterdir():
            if not report_dir.is_dir():
                continue
            if not any(report_dir.iterdir()):
                # Cleanup self.scrape_dir of failed runs (empty capture directories).
                report_dir.rmdir()
                continue
            if not (report_dir / 'uuid').exists():
                # Create uuid if missing
                with (report_dir / 'uuid').open('w') as f:
                    f.write(str(uuid4()))
        return sorted(self.scrape_dir.iterdir(), reverse=True)

    @property
    def lookup_dirs(self):
        # Map the UUID stored in each capture directory to its path.
        lookup_dirs = {}
        for report_dir in self.report_dirs:
            with (report_dir / 'uuid').open() as f:
                lookup_dirs[f.read().strip()] = report_dir
        return lookup_dirs

    def enqueue_scrape(self, query: dict) -> str:
        perma_uuid = str(uuid4())
        p = self.redis.pipeline()
        # Store the query under its UUID and add it to the queue consumed by
        # process_scrape_queue(). Note: hmset is deprecated in recent redis-py;
        # hset(perma_uuid, mapping=query) is the modern equivalent.
        p.hmset(perma_uuid, query)
        p.sadd('to_scrape', perma_uuid)
        p.execute()
        return perma_uuid

    def process_scrape_queue(self):
        uuid = self.redis.spop('to_scrape')
        if not uuid:
            return
        to_scrape = self.redis.hgetall(uuid)
        self.redis.delete(uuid)
        to_scrape['perma_uuid'] = uuid
        self.scrape(**to_scrape)

    def load_tree(self, report_dir: Path):
        har_files = sorted(report_dir.glob('*.har'))
        ct = CrawledTree(har_files)
        ct.find_parents()
        ct.join_trees()
        # Pickle the assembled tree to a temp file; cleanup_old_tmpfiles() purges stale ones.
        temp = tempfile.NamedTemporaryFile(prefix='lookyloo', delete=False)
        pickle.dump(ct, temp)
        temp.close()
        return temp.name, ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url

    def cleanup_old_tmpfiles(self):
        # Remove pickled trees that have not been accessed for 10 hours (36000 seconds).
        for tmpfile in Path(tempfile.gettempdir()).glob('lookyloo*'):
            if time.time() - tmpfile.stat().st_atime > 36000:
                tmpfile.unlink()

    def load_image(self, report_dir: Path) -> BytesIO:
        # Return the first screenshot of the capture as an in-memory buffer.
        with sorted(report_dir.glob('*.png'))[0].open('rb') as f:
            return BytesIO(f.read())

    def sane_js_query(self, sha512: str):
        # Only query SaneJS if the service was reachable at startup.
        if self.sanejs:
            return self.sanejs.sha512(sha512)
        return {'response': []}

    def scrape(self, url: str, depth: int=1, user_agent: Optional[str]=None, perma_uuid: Optional[str]=None):
        if not url.startswith('http'):
            url = f'http://{url}'
        # Values pulled back from Redis by process_scrape_queue() are strings.
        depth = int(depth)
        items = crawl(self.splash_url, url, depth, user_agent=user_agent, log_enabled=True, log_level='INFO')
        if not items:
            # The crawl failed, there is nothing to store.
            self.logger.warning(f'Scraping of {url} failed.')
            return None
        if not perma_uuid:
            perma_uuid = str(uuid4())
        width = len(str(len(items)))
        dirpath = self.scrape_dir / datetime.now().isoformat()
        dirpath.mkdir()
        for i, item in enumerate(items):
            harfile = item['har']
            png = base64.b64decode(item['png'])
            child_frames = item['childFrames']
            html = item['html']
            # One set of files per crawled page, zero-padded to keep them in crawl order.
            with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as f:
                json.dump(harfile, f)
            with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as f:
                f.write(png)
            with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as f:
                f.write(html)
            with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as f:
                json.dump(child_frames, f)
        with (dirpath / 'uuid').open('w') as f:
            f.write(perma_uuid)
        return perma_uuid
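

if __name__ == '__main__':
    # Minimal usage sketch, assuming a Splash instance on http://127.0.0.1:8050
    # and the Redis cache socket resolved by get_socket_path('cache') are both up;
    # the target URL is only an example.
    lookyloo = Lookyloo()
    lookyloo.cleanup_old_tmpfiles()
    uuid = lookyloo.enqueue_scrape({'url': 'https://www.example.com'})
    lookyloo.process_scrape_queue()
    report_dir = lookyloo.lookup_dirs.get(uuid)
    if report_dir:
        pickle_path, tree_json, start_time, user_agent, root_url = lookyloo.load_tree(report_dir)
        print(f'Captured {root_url} at {start_time}; tree pickled to {pickle_path}')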