From 179fba2eccf38e8fbb74dea735e99ab82b15a9b4 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Wed, 1 Apr 2020 14:58:27 +0200
Subject: [PATCH] fix: [crawler] error catcher

---
 bin/torcrawler/TorSplashCrawler.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py
index 9ff94883..f1ed527d 100644
--- a/bin/torcrawler/TorSplashCrawler.py
+++ b/bin/torcrawler/TorSplashCrawler.py
@@ -156,7 +156,7 @@ class TorSplashCrawler():
                 self.parse,
                 errback=self.errback_catcher,
                 endpoint='execute',
-                meta={'father': self.original_item},
+                meta={'father': self.original_item, 'current_url': self.start_urls},
                 args=l_cookies
             )
 
@@ -217,7 +217,7 @@ class TorSplashCrawler():
                             self.parse,
                             errback=self.errback_catcher,
                             endpoint='execute',
-                            meta={'father': item_id},
+                            meta={'father': item_id, 'current_url': link.url},
                             args=l_cookies
                         )
 
@@ -227,7 +227,7 @@ class TorSplashCrawler():
 
             if failure.check(ResponseNeverReceived):
                 request = failure.request
-                url= response.data['last_url']
+                url= request.meta['current_url']
                 father = request.meta['father']
 
                 self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
@@ -242,7 +242,7 @@ class TorSplashCrawler():
                     errback=self.errback_catcher,
                     endpoint='execute',
                     cache_args=['lua_source'],
-                    meta={'father': father},
+                    meta={'father': father, 'current_url': url},
                     args=self.build_request_arg(response.cookiejar)
                 )
 