mirror of https://github.com/CIRCL/lookyloo
fix: Avoid indexing URLs multiple times
parent
aff9c62508
commit
e90fc151bd
|
@@ -170,8 +170,8 @@ class Indexing():
|
||||||
if self.redis.sismember('indexed_urls', crawled_tree.uuid):
|
if self.redis.sismember('indexed_urls', crawled_tree.uuid):
|
||||||
# Do not reindex
|
# Do not reindex
|
||||||
return
|
return
|
||||||
|
self.redis.sadd('indexed_urls', crawled_tree.uuid)
|
||||||
pipeline = self.redis.pipeline()
|
pipeline = self.redis.pipeline()
|
||||||
pipeline.sadd('indexed_urls', crawled_tree.uuid)
|
|
||||||
for urlnode in crawled_tree.root_hartree.url_tree.traverse():
|
for urlnode in crawled_tree.root_hartree.url_tree.traverse():
|
||||||
if not urlnode.hostname or not urlnode.name:
|
if not urlnode.hostname or not urlnode.name:
|
||||||
continue
|
continue
|
||||||
|
@@ -203,7 +203,6 @@ class Indexing():
|
||||||
def index_categories_capture(self, capture_uuid: str, categories: Iterable[str]):
|
def index_categories_capture(self, capture_uuid: str, categories: Iterable[str]):
|
||||||
if not categories:
|
if not categories:
|
||||||
return
|
return
|
||||||
print(capture_uuid, categories)
|
|
||||||
if self.redis.sismember('indexed_categories', capture_uuid):
|
if self.redis.sismember('indexed_categories', capture_uuid):
|
||||||
# do not reindex
|
# do not reindex
|
||||||
return
|
return
|
||||||
|
|
Loading…
Reference in New Issue