new: Paginate URL index on UI

2024-11-27 14:09:40 +01:00 · 2024-11-27 14:09:40 +01:00 · 556b7a4ac3
parent e0b12d6df3
commit 556b7a4ac3
7 changed files with 47 additions and 52 deletions
--- a/lookyloo/indexing.py
+++ b/lookyloo/indexing.py
@ -485,22 +485,24 @@ class Indexing():
        self.logger.debug(f'done with URLs for {crawled_tree.uuid}.')

    def get_captures_url(self, url: str, most_recent_capture: datetime | None = None,
-                         oldest_capture: datetime | None= None) -> list[tuple[str, float]]:
+                         oldest_capture: datetime | None= None,
+                         offset: int | None=None, limit: int | None=None) -> tuple[int, list[tuple[str, float]]]:
        """Get all the captures for a specific URL, on a time interval starting from the most recent one.

        :param url: The URL
        :param most_recent_capture: The capture time of the most recent capture to consider
-        :param oldest_capture: The capture time of the oldest capture to consider, defaults to 15 days ago.
+        :param oldest_capture: The capture time of the oldest capture to consider.
        """
        max_score: str | float = most_recent_capture.timestamp() if most_recent_capture else '+Inf'
-        min_score: str | float = oldest_capture.timestamp() if oldest_capture else (datetime.now() - timedelta(days=15)).timestamp()
+        min_score: str | float = oldest_capture.timestamp() if oldest_capture else '-Inf'
        md5 = hashlib.md5(url.encode()).hexdigest()
        if self.redis.type(f'urls|{md5}|captures') == 'set':  # type: ignore[no-untyped-call]
            # triggers the re-index soon.
            self.redis.srem('indexed_urls', *self.redis.smembers(f'urls|{md5}|captures'))
            self.redis.delete(f'urls|{md5}|captures')
-            return []
-        return self.redis.zrevrangebyscore(f'urls|{md5}|captures', max_score, min_score, withscores=True)
+            return 0, []
+        total = self.redis.zcard(f'urls|{md5}|captures')
+        return total, self.redis.zrevrangebyscore(f'urls|{md5}|captures', max_score, min_score, withscores=True, start=offset, num=limit)

    def get_captures_url_count(self, url: str) -> int:
        md5 = hashlib.md5(url.encode()).hexdigest()
--- a/website/web/init.py
+++ b/website/web/init.py
@ -409,14 +409,15 @@ def get_hostname_investigator(hostname: str, offset: int | None=None, limit: int
                    ) for cache in cached_captures]


-def get_url_investigator(url: str) -> list[tuple[str, str, str, datetime, set[str]]]:
+def get_url_investigator(url: str, offset: int | None=None, limit: int | None=None) -> tuple[int, list[tuple[str, str, str, datetime, set[str]]]]:
    '''Returns all the captures loading content from that url, used in the web interface.'''
+    indexing = get_indexing(flask_login.current_user)  # resolved once, reused for every capture below
+    total, entries = indexing.get_captures_url(url=url, offset=offset, limit=limit)
    cached_captures = lookyloo.sorted_capture_cache(
-        [uuid for uuid, _ in get_indexing(flask_login.current_user).get_captures_url(url=url)],
+        [uuid for uuid, _ in entries],
        cached_captures_only=True)
-    return [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp,
-             get_indexing(flask_login.current_user).get_capture_url_nodes(cache.uuid, url)
-             ) for cache in cached_captures]
+    return total, [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp,
+                    indexing.get_capture_url_nodes(cache.uuid, url)) for cache in cached_captures]


 def get_cookie_name_investigator(cookie_name: str, /) -> list[tuple[str, str, datetime, set[str]]]:
@ -1790,9 +1791,9 @@ def body_hash_details(body_hash: str) -> str:
@app.route('/urls/<string:url>', methods=['GET'])
 def url_details(url: str) -> str:
    from_popup = True if (request.args.get('from_popup') and request.args.get('from_popup') == 'True') else False
-    url = unquote_plus(url).strip()
-    captures = get_url_investigator(url)
-    return render_template('url.html', url=url, captures=captures, from_popup=from_popup)
+    url_unquoted = unquote_plus(url).strip()
+    url_b64 = base64.b64encode(url_unquoted.encode()).decode()
+    return render_template('url.html', url=url_unquoted, url_quoted=url_b64, from_popup=from_popup)


@app.route('/hostnames/<string:hostname>', methods=['GET'])
@ -1979,7 +1980,7 @@ def add_context(tree_uuid: str, node_uuid: str) -> WerkzeugResponse | None:


 def __prepare_node_view(capture_uuid: str, nodes: set[str]) -> str:
-    to_return = f'The capture contains this hostname in {len(nodes)} nodes, click below to see them on the tree:'
+    to_return = f'The capture contains this value in {len(nodes)} nodes, click below to see them on the tree:'
    to_return += '<ul>'
    for node in nodes:
        to_return += f'<li><a href="{url_for("tree", tree_uuid=capture_uuid, node_uuid=node)}">{node}</a></li>'
@ -2004,6 +2005,19 @@ def post_table(table_name: str, value: str) -> Response:
            }
            prepared_captures.append(to_append)
        return jsonify({'draw': draw, 'recordsTotal': total, 'recordsFiltered': total, 'data': prepared_captures})
+    if table_name == 'urlTable':
+        url = base64.b64decode(value.strip()).decode()
+        total, captures = get_url_investigator(url, offset=start, limit=length)
+        prepared_captures = []
+        for capture_uuid, title, landing_page, capture_time, nodes in captures:
+            _nodes = __prepare_node_view(capture_uuid, nodes)
+            to_append = {
+                'capture_time': capture_time.isoformat(),
+                'capture_title': f"""<a href="{url_for('tree', tree_uuid=capture_uuid)}">{title}</a></br>{_nodes}""",
+                'landing_page': f"""<span class="d-inline-block text-break" style="max-width: 400px;">{landing_page}</span>"""
+            }
+            prepared_captures.append(to_append)
+        return jsonify({'draw': draw, 'recordsTotal': total, 'recordsFiltered': total, 'data': prepared_captures})
    return jsonify({})


--- a/website/web/genericapi.py
+++ b/website/web/genericapi.py
@ -339,8 +339,9 @@ class HashInfo(Resource):  # type: ignore[misc]

 def get_url_occurrences(url: str, /, limit: int=20, cached_captures_only: bool=True) -> list[dict[str, Any]]:
    '''Get the most recent captures and URL nodes where the URL has been seen.'''
+    _, entries = get_indexing(flask_login.current_user).get_captures_url(url, offset=0, limit=limit)
    captures = lookyloo.sorted_capture_cache(
-        [uuid for uuid, _ in get_indexing(flask_login.current_user).get_captures_url(url)],
+        [uuid for uuid, _ in entries],
        cached_captures_only=cached_captures_only)

    to_return: list[dict[str, Any]] = []
--- a/website/web/sri.txt
+++ b/website/web/sri.txt
@ -33,7 +33,7 @@
    "loader.gif": "ZZKD5vLSKBWKeUpa2KI9qheUJ49iTI/UULmVU/AX28fBfH00K3lLc2v5pVJZ4qXG1BbB13LTXzRKKU35H2XfNg==",
    "lookyloo.jpeg": "i6wBj8CsIM5YAQLEMQfhs3CNOSKkErF8AMqqM6ZygSwCyQgv9CU8xt94veMZhM/ufBWoz7kAXmR+yywmxsTxug==",
    "redirect.png": "PAjzlPV97rEFvH55mG1ZC9wRl98be3yMeX/nENuFkJcds6/AXgSR2ig/QyPULgobSnNgiYieLVWY/oqsgeywrQ==",
-    "render_tables.js": "wGoy3iMmLKFtMD2rC2dsgqtjIzBSCUwQ87yqmmIGHyXvp9hbltevbfPDLjKNv5OLLD/qIdYubmPRc8HdMAmmNw==",
+    "render_tables.js": "Su4TaO5d7pZ+hZM3b6JQAhtW5vJJ9vUIsOatzJR/wZOLykpPe9HuzNV48AxH4wumWKsK5HVt0wW7njOjHl8p7A==",
    "secure.svg": "H8ni7t0d60nCJDVGuZpuxC+RBy/ipAjWT627D12HlZGg6LUmjSwPTQTUekm3UJupEP7TUkhXyq6WHc5gy7QBjg==",
    "stats.css": "/kY943FwWBTne4IIyf7iBROSfbGd82TeBicEXqKkRwawMVRIvM/Pk5MRa7okUyGIxaDjFQGmV/U1vy+PhN6Jbw==",
    "stats_graph.js": "S/sMNQK1UMMLD0xQeEa7sq3ce8o6oPxwxGlyKVtaHOODjair86dbBDm7cu6pa/elMRDJT1j09jEFjWp+5GbhTw==",
--- a/website/web/static/render_tables.js
+++ b/website/web/static/render_tables.js
@ -156,8 +156,20 @@
  }

  if (document.getElementById('urlTable')) {
+      url = document.getElementById('urlTable').dataset.url;
      new DataTable('#urlTable', {
+        processing: true,
+        serverSide: true,
        retrieve: true,
+        ajax: {
+            url: `/tables/urlTable/${url}`,
+            type: 'POST'
+        },
+        columns : [
+            { data: 'capture_time' },
+            { data: 'capture_title' },
+            { data: 'landing_page' }
+        ],
        order: [[ 0, "desc" ]],
        columnDefs: [{ width: '20%', targets: 0,
                       render: (data) => {
--- a/website/web/templates/hostname.html
+++ b/website/web/templates/hostname.html
@ -17,7 +17,6 @@

 <center>
    <h4>{{ hostname }}</h4>
-    <h6>Only the most recent captures are listed below, this will change soon.</h6>
 </center>

 <table id="hostnameTable" class="table table-striped" style="width:100%" data-hostname="{{hostname}}">
--- a/website/web/templates/url.html
+++ b/website/web/templates/url.html
@ -9,20 +9,17 @@

 {%endif%}

-
 {% block content %}

 {% if from_popup %}
 <center><button class="btn btn-primary goBack" type="button">Go Back</button></center>
 {%endif%}

-
 <center>
-  <h4>{{ url }}</h4>
-  <h6>Only the most recent captures are listed below, this will change soon.</h6>
+    <h4>{{ url }}</h4>
 </center>

-<table id="urlTable" class="table table-striped" style="width:100%">
+<table id="urlTable" class="table table-striped" style="width:100%" data-url="{{url_quoted}}">
  <thead>
   <tr>
     <th>Capture Time</th>
@ -30,35 +27,5 @@
     <th>Landing page</th>
   </tr>
  </thead>
-  <tbody>
-    {% for capture_uuid, title, landing_page, capture_time, nodes in captures %}
-    <tr>
-      <td>
-        {{capture_time}}
-      </td>
-      <td>
-        <a href="{{ url_for('tree', tree_uuid=capture_uuid) }}">
-          {{ title }}
-        </a>
-        <br>
-        The capture contains this URL in {{ nodes|length }} nodes, click below to see them on the tree:
-        <ul>
-            {% for node in nodes %}
-            <li>
-                <a href="{{ url_for('tree', tree_uuid=capture_uuid, node_uuid=node) }}">
-                {{ shorten_string(node, 50) }}
-                </a>
-            </li>
-            {% endfor %}
-        </ul>
-      </td>
-      <td>
-        <span class="d-inline-block text-break" style="max-width: 400px;">
-          {{ landing_page }}
-        </span>
-      </td>
-    </tr>
-    {% endfor %}
-  </tbody>
 </table>
 {% endblock %}