From 0c443e87e67d4698f38c2818a79391c7524d9080 Mon Sep 17 00:00:00 2001 From: sneakers-the-rat Date: Fri, 15 Nov 2024 02:26:17 -0800 Subject: [PATCH] add maximum page limit --- .env.production.sample | 3 +++ .../activitypub/fetch_all_replies_service.rb | 6 +++--- .../activitypub/fetch_replies_service.rb | 11 +++++++---- .../activitypub/fetch_all_replies_worker.rb | 12 +++++++----- .../activitypub/fetch_all_replies_worker_spec.rb | 16 ++++++++++------ 5 files changed, 30 insertions(+), 18 deletions(-) diff --git a/.env.production.sample b/.env.production.sample index c934a76e46..277f31b73b 100644 --- a/.env.production.sample +++ b/.env.production.sample @@ -104,3 +104,6 @@ FETCH_REPLIES_MAX_GLOBAL=1000 # Max number of replies to fetch - for a single post FETCH_REPLIES_MAX_SINGLE=500 + +# Max number of replies Collection pages to fetch - total +FETCH_REPLIES_MAX_PAGES=500 diff --git a/app/services/activitypub/fetch_all_replies_service.rb b/app/services/activitypub/fetch_all_replies_service.rb index 1c17ba4732..8771096feb 100644 --- a/app/services/activitypub/fetch_all_replies_service.rb +++ b/app/services/activitypub/fetch_all_replies_service.rb @@ -6,18 +6,18 @@ class ActivityPub::FetchAllRepliesService < ActivityPub::FetchRepliesService # Limit of replies to fetch per status MAX_REPLIES = (ENV['FETCH_REPLIES_MAX_SINGLE'] || 500).to_i - def call(collection_or_uri, allow_synchronous_requests: true, request_id: nil) + def call(collection_or_uri, max_pages = nil, allow_synchronous_requests: true, request_id: nil) @allow_synchronous_requests = allow_synchronous_requests @filter_by_host = false @collection_or_uri = collection_or_uri - @items = collection_items(collection_or_uri) + @items, n_pages = collection_items(collection_or_uri, max_pages) @items = filtered_replies return if @items.nil? FetchReplyWorker.push_bulk(@items) { |reply_uri| [reply_uri, { 'request_id' => request_id }] } - @items + [@items, n_pages] end private diff --git a/app/services/activitypub/fetch_replies_service.rb b/app/services/activitypub/fetch_replies_service.rb index bb20e66649..b26e75f125 100644 --- a/app/services/activitypub/fetch_replies_service.rb +++ b/app/services/activitypub/fetch_replies_service.rb @@ -11,7 +11,7 @@ class ActivityPub::FetchRepliesService < BaseService @allow_synchronous_requests = allow_synchronous_requests @filter_by_host = filter_by_host - @items = collection_items(collection_or_uri) + @items, = collection_items(collection_or_uri) return if @items.nil? FetchReplyWorker.push_bulk(filtered_replies) { |reply_uri| [reply_uri, { 'request_id' => request_id }] } @@ -21,7 +21,7 @@ class ActivityPub::FetchRepliesService < BaseService private - def collection_items(collection_or_uri) + def collection_items(collection_or_uri, max_pages = nil) collection = fetch_collection(collection_or_uri) return unless collection.is_a?(Hash) @@ -29,6 +29,7 @@ class ActivityPub::FetchRepliesService < BaseService return unless collection.is_a?(Hash) all_items = [] + n_pages = 1 while collection.is_a?(Hash) items = case collection['type'] when 'Collection', 'CollectionPage' @@ -39,12 +40,14 @@ class ActivityPub::FetchRepliesService < BaseService all_items.concat(as_array(items)) - break if all_items.size > MAX_REPLIES + break if all_items.size >= MAX_REPLIES + break if !max_pages.nil? && n_pages >= max_pages collection = collection['next'].present? ? fetch_collection(collection['next']) : nil + n_pages += 1 end - all_items + [all_items, n_pages] end def fetch_collection(collection_or_uri) diff --git a/app/workers/activitypub/fetch_all_replies_worker.rb b/app/workers/activitypub/fetch_all_replies_worker.rb index c8d0546891..c5351b77af 100644 --- a/app/workers/activitypub/fetch_all_replies_worker.rb +++ b/app/workers/activitypub/fetch_all_replies_worker.rb @@ -13,28 +13,30 @@ class ActivityPub::FetchAllRepliesWorker # Global max replies to fetch per request (all replies, recursively) MAX_REPLIES = (ENV['FETCH_REPLIES_MAX_GLOBAL'] || 1000).to_i + MAX_PAGES = (ENV['FETCH_REPLIES_MAX_PAGES'] || 500).to_i def perform(parent_status_id, options = {}) @parent_status = Status.find(parent_status_id) Rails.logger.debug { "FetchAllRepliesWorker - #{@parent_status.uri}: Fetching all replies for status: #{@parent_status}" } - uris_to_fetch = get_replies(@parent_status.uri, options) + uris_to_fetch, n_pages = get_replies(@parent_status.uri, MAX_PAGES, options) return if uris_to_fetch.nil? @parent_status.touch(:fetched_replies_at) fetched_uris = uris_to_fetch.clone.to_set - until uris_to_fetch.empty? || fetched_uris.length >= MAX_REPLIES + until uris_to_fetch.empty? || fetched_uris.length >= MAX_REPLIES || n_pages >= MAX_PAGES next_reply = uris_to_fetch.pop next if next_reply.nil? - new_reply_uris = get_replies(next_reply, options) + new_reply_uris, new_n_pages = get_replies(next_reply, MAX_PAGES - n_pages, options) next if new_reply_uris.nil? new_reply_uris = new_reply_uris.reject { |uri| fetched_uris.include?(uri) } uris_to_fetch.concat(new_reply_uris) fetched_uris = fetched_uris.merge(new_reply_uris) + n_pages += new_n_pages end Rails.logger.debug { "FetchAllRepliesWorker - #{parent_status_id}: fetched #{fetched_uris.length} replies" } @@ -43,11 +45,11 @@ class ActivityPub::FetchAllRepliesWorker private - def get_replies(status_uri, options = {}) + def get_replies(status_uri, max_pages, options = {}) replies_collection_or_uri = get_replies_uri(status_uri) return if replies_collection_or_uri.nil? - ActivityPub::FetchAllRepliesService.new.call(replies_collection_or_uri, **options.deep_symbolize_keys) + ActivityPub::FetchAllRepliesService.new.call(replies_collection_or_uri, max_pages, **options.deep_symbolize_keys) end def get_replies_uri(parent_status_uri) diff --git a/spec/workers/activitypub/fetch_all_replies_worker_spec.rb b/spec/workers/activitypub/fetch_all_replies_worker_spec.rb index 35fadb6caf..7346088371 100644 --- a/spec/workers/activitypub/fetch_all_replies_worker_spec.rb +++ b/spec/workers/activitypub/fetch_all_replies_worker_spec.rb @@ -122,20 +122,18 @@ RSpec.describe ActivityPub::FetchAllRepliesWorker do stub_request(:get, item).to_return(status: 200, body: Oj.dump(empty_object), headers: { 'Content-Type': 'application/activity+json' }) end + + stub_request(:get, top_note_uri).to_return(status: 200, body: Oj.dump(top_object), headers: { 'Content-Type': 'application/activity+json' }) + stub_request(:get, reply_note_uri).to_return(status: 200, body: Oj.dump(reply_object), headers: { 'Content-Type': 'application/activity+json' }) end shared_examples 'fetches all replies' do - before do - stub_request(:get, top_note_uri).to_return(status: 200, body: Oj.dump(top_object), headers: { 'Content-Type': 'application/activity+json' }) - stub_request(:get, reply_note_uri).to_return(status: 200, body: Oj.dump(reply_object), headers: { 'Content-Type': 'application/activity+json' }) - end - it 'fetches statuses recursively' do got_uris = subject.perform(status.id) expect(got_uris).to match_array(all_items) end - it 'respects the maxium limits set by not recursing after the max is reached' do + it 'respects the maximum limits set by not recursing after the max is reached' do stub_const('ActivityPub::FetchAllRepliesWorker::MAX_REPLIES', 5) got_uris = subject.perform(status.id) expect(got_uris).to match_array(top_items + top_items_paged) @@ -249,6 +247,12 @@ RSpec.describe ActivityPub::FetchAllRepliesWorker do end it_behaves_like 'fetches all replies' + + it 'limits by max pages' do + stub_const('ActivityPub::FetchAllRepliesWorker::MAX_PAGES', 3) + got_uris = subject.perform(status.id) + expect(got_uris).to match_array(top_items + top_items_paged + nested_items) + end end end end