From 887bccd622270761d5b6c68a0768d131b2689106 Mon Sep 17 00:00:00 2001 From: sneakers-the-rat Date: Sat, 12 Oct 2024 21:26:12 -0700 Subject: [PATCH] Fix limit in fetch_replies_service to not always limit by 5 (which always caused us to only do one page). Rename some variables to make purpose clearer. Return the array of all fetched uris instead of just the number we got Signed-off-by: sneakers-the-rat --- .../activitypub/fetch_all_replies_service.rb | 4 +++- .../activitypub/fetch_replies_service.rb | 8 ++++--- .../activitypub/fetch_all_replies_worker.rb | 22 +++++++++---------- 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/app/services/activitypub/fetch_all_replies_service.rb b/app/services/activitypub/fetch_all_replies_service.rb index 19fae824df..cba476ab46 100644 --- a/app/services/activitypub/fetch_all_replies_service.rb +++ b/app/services/activitypub/fetch_all_replies_service.rb @@ -10,7 +10,7 @@ class ActivityPub::FetchAllRepliesService < ActivityPub::FetchRepliesService @allow_synchronous_requests = allow_synchronous_requests @filter_by_host = false - @items = collection_items(collection_or_uri) + @items = collection_items(collection_or_uri, fetch_all: true) @items = filtered_replies return if @items.nil? @@ -25,6 +25,8 @@ class ActivityPub::FetchAllRepliesService < ActivityPub::FetchRepliesService return if @items.nil? # find all statuses that we *shouldn't* update the replies for, and use that as a filter + # Typically we assume this is smaller than the replies we *should* fetch, + # so we minimize the number of uris we should load here. uris = @items.map { |item| value_or_id(item) } dont_update = Status.where(uri: uris).shouldnt_fetch_replies.pluck(:uri) diff --git a/app/services/activitypub/fetch_replies_service.rb b/app/services/activitypub/fetch_replies_service.rb index 0dcdb638e1..2963f616e2 100644 --- a/app/services/activitypub/fetch_replies_service.rb +++ b/app/services/activitypub/fetch_replies_service.rb @@ -5,6 +5,8 @@ class ActivityPub::FetchRepliesService < BaseService # Limit of fetched replies MAX_REPLIES = 5 + # Limit when fetching all (to prevent infinite fetch attack) + FETCH_ALL_MAX_REPLIES = 500 def call(parent_status, collection_or_uri, allow_synchronous_requests: true, request_id: nil, filter_by_host: true) @account = parent_status.account @@ -21,7 +23,7 @@ class ActivityPub::FetchRepliesService < BaseService private - def collection_items(collection_or_uri) + def collection_items(collection_or_uri, fetch_all: false) collection = fetch_collection(collection_or_uri) return unless collection.is_a?(Hash) @@ -39,8 +41,8 @@ class ActivityPub::FetchRepliesService < BaseService all_items.concat(as_array(items)) - # Quit early if we are not fetching all replies - break if all_items.size >= MAX_REPLIES + # Quit early if we are not fetching all replies or we've reached the absolute max + break if (!fetch_all && all_items.size >= MAX_REPLIES) || (all_items.size >= FETCH_ALL_MAX_REPLIES) collection = collection['next'].present? ? fetch_collection(collection['next']) : nil end diff --git a/app/workers/activitypub/fetch_all_replies_worker.rb b/app/workers/activitypub/fetch_all_replies_worker.rb index af2cf40e91..222736224b 100644 --- a/app/workers/activitypub/fetch_all_replies_worker.rb +++ b/app/workers/activitypub/fetch_all_replies_worker.rb @@ -11,7 +11,7 @@ class ActivityPub::FetchAllRepliesWorker sidekiq_options queue: 'pull', retry: 3 - # Global max replies to fetch per request + # Global max replies to fetch per request (all replies, recursively) MAX_REPLIES = 1000 def perform(parent_status_id, current_account_id = nil, options = {}) @@ -20,21 +20,21 @@ class ActivityPub::FetchAllRepliesWorker @current_account = @current_account_id.nil? ? nil : Account.find(@current_account_id) Rails.logger.debug { "FetchAllRepliesWorker - #{parent_status_id}: Fetching all replies for status: #{@parent_status}" } - all_replies = get_replies(@parent_status.uri, options) - got_replies = all_replies.length - until all_replies.empty? || got_replies >= MAX_REPLIES - next_reply = all_replies.pop + uris_to_fetch = get_replies(@parent_status.uri, options) + fetched_uris = uris_to_fetch + until uris_to_fetch.empty? || fetched_uris.length >= MAX_REPLIES + next_reply = uris_to_fetch.pop next if next_reply.nil? - new_replies = get_replies(next_reply, options) - next if new_replies.nil? + new_reply_uris = get_replies(next_reply, options) + next if new_reply_uris.nil? - got_replies += new_replies.length - all_replies.concat(new_replies) + uris_to_fetch.concat(new_reply_uris) + fetched_uris.concat(new_reply_uris) end - Rails.logger.debug { "FetchAllRepliesWorker - #{parent_status_id}: fetched #{got_replies} replies" } - got_replies + Rails.logger.debug { "FetchAllRepliesWorker - #{parent_status_id}: fetched #{fetched_uris.length} replies" } + fetched_uris end private