fix: [crawler] fix down domain + domain redirection history

pull/604/head
Terrtia 2023-07-18 14:30:00 +02:00
parent 77e8cc6c02
commit fe2769308b
No known key found for this signature in database
GPG Key ID: 1E1B1F50D84613D0
1 changed file with 13 additions and 3 deletions

View File

@@ -60,6 +60,7 @@ class Crawler(AbstractModule):
self.root_item = None
self.date = None
self.items_dir = None
self.original_domain = None
self.domain = None
# TODO Replace with warning list ???
@@ -190,6 +191,7 @@ class Crawler(AbstractModule):
print(domain)
self.domain = Domain(domain)
self.original_domain = Domain(domain)
epoch = int(time.time())
parent_id = task.get_parent()
@@ -212,12 +214,20 @@ class Crawler(AbstractModule):
# Origin + History + tags
if self.root_item:
self.domain.set_last_origin(parent_id)
self.domain.add_history(epoch, root_item=self.root_item)
# Tags
for tag in task.get_tags():
self.domain.add_tag(tag)
elif self.domain.was_up():
self.domain.add_history(epoch, root_item=epoch)
self.domain.add_history(epoch, root_item=self.root_item)
if self.domain != self.original_domain:
self.original_domain.update_daterange(self.date.replace('/', ''))
if self.root_item:
self.original_domain.set_last_origin(parent_id)
# Tags
for tag in task.get_tags():
self.domain.add_tag(tag)
self.original_domain.add_history(epoch, root_item=self.root_item)
crawlers.update_last_crawled_domain(self.original_domain.get_domain_type(), self.original_domain.id, epoch)
crawlers.update_last_crawled_domain(self.domain.get_domain_type(), self.domain.id, epoch)
print('capture:', capture.uuid, 'completed')