chg: Support new format on TLD index

pull/1008/head
Raphaël Vinot 2024-12-03 00:10:16 +01:00
parent 73d90070c6
commit cce300034f
2 changed files with 7 additions and 5 deletions

View File

@ -619,16 +619,18 @@ class Indexing():
self.logger.debug(f'done with TLDs for {crawled_tree.uuid}.')
def get_captures_tld(self, tld: str, most_recent_capture: datetime | None = None,
oldest_capture: datetime | None= None) -> list[tuple[str, float]]:
oldest_capture: datetime | None=None,
offset: int | None=None, limit: int | None=None) -> tuple[int, list[tuple[str, float]]]:
"""Get all the captures for a specific TLD, on a time interval starting from the most recent one.
:param tld: The TLD
:param most_recent_capture: The capture time of the most recent capture to consider
:param oldest_capture: The capture time of the oldest capture to consider, defaults to 5 days ago.
:param oldest_capture: The capture time of the oldest capture to consider.
"""
max_score: str | float = most_recent_capture.timestamp() if most_recent_capture else '+Inf'
min_score: str | float = oldest_capture.timestamp() if oldest_capture else (datetime.now() - timedelta(days=5)).timestamp()
return self.redis.zrevrangebyscore(f'tlds|{tld}|captures', max_score, min_score, withscores=True)
min_score: str | float = oldest_capture.timestamp() if oldest_capture else '-Inf'
total = self.redis.zcard(f'tlds|{tld}|captures')
return total, self.redis.zrevrangebyscore(f'tlds|{tld}|captures', max_score, min_score, withscores=True, start=offset, num=limit)
def get_capture_tld_counter(self, capture_uuid: str, tld: str) -> int:
# NOTE: what to do when the capture isn't indexed yet? Raise an exception?

View File

@ -873,7 +873,7 @@ class TLDCaptures(Resource): # type: ignore[misc]
except Exception:
oldest_capture = None
recent_captures_with_tld = get_indexing(flask_login.current_user).get_captures_tld(tld, most_recent_capture, oldest_capture)
_, recent_captures_with_tld = get_indexing(flask_login.current_user).get_captures_tld(tld, most_recent_capture, oldest_capture)
if not recent_captures_with_tld:
return []
if not urls_only: