From e2dc3ea492bdd7e351e222da242d404384f743ed Mon Sep 17 00:00:00 2001 From: Mike Johnston Date: Sat, 21 Mar 2026 18:00:02 -0400 Subject: [PATCH] new post fetch strategy --- src/lib/mastodon.ts | 67 ++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 40 deletions(-) diff --git a/src/lib/mastodon.ts b/src/lib/mastodon.ts index 09dc23d..815d255 100644 --- a/src/lib/mastodon.ts +++ b/src/lib/mastodon.ts @@ -71,18 +71,13 @@ export async function getPostsPerHour(tag: string): Promise { * (lowercased, excluding the queried tag itself). * * Pagination strategy: - * - Fetch pages until the oldest post in a batch falls before the 1-hour cutoff - * (the horizon), OR the timeline is exhausted, OR MAX_PAGES_PER_HASHTAG is reached. - * - When we first cross the horizon, keep fetching additional pages as long as each - * new page contributes at least one post within the cutoff and not beyond 24 hours. - * Only stop when a page adds nothing new to the in-window count — at that point the - * window is stable. - * This handles out-of-order federation: Mastodon timelines are ordered by post ID - * (local receive time), not created_at. A post authored at 10:45 on a remote server - * may arrive at 11:05, get a recent ID and appear near the top of the stream — but - * its created_at is in the past/out of order. Continuing until the count stabilises - * ensures all such late-arriving posts are captured regardless of how many pages - * they span. + * - Keep fetching pages until >= 50% of posts in a page fall outside the 1-hour window, + * OR the timeline is exhausted, OR MAX_PAGES_PER_HASHTAG is reached. + * - The 50% rule handles federated out-of-order posts gracefully: Mastodon timelines are + * ordered by post ID (local receive time), not created_at. A remote post from hours or + * even years ago can arrive late, get a fresh ID, and appear at the top of the stream. + * A minority of such posts won't trigger the stop condition; only once the majority of + * a page is old content do we consider the 1-hour window fully covered. * - After collecting all pages, sort by created_at and filter to the last hour for an * accurate count regardless of any remaining ordering noise. * @@ -105,15 +100,8 @@ export async function getPostsData( let allPosts: MastodonPost[] = [] let maxId: string | undefined let hitPageCap = false - let crossedHorizon = false for (let page = 0; page < maxPages; page++) { - // Once we've crossed the horizon, snapshot the in-window count before this fetch - // so we can detect whether the page contributed anything new. - const inWindowBefore = crossedHorizon - ? allPosts.filter((p) => new Date(p.created_at).getTime() >= cutoff).length - : 0 - const { posts, nextMaxId } = await fetchPage(tag, maxId, postLimit) if (posts.length === 0) break @@ -122,24 +110,15 @@ export async function getPostsData( // End of timeline or no more pages if (posts.length < postLimit || !nextMaxId) break - if (crossedHorizon) { - // Keep fetching while this page added new in-window posts; stop when count stabilises - const inWindowAfter = allPosts.filter((p) => new Date(p.created_at).getTime() >= cutoff).length - if (inWindowAfter === inWindowBefore) break - } - - // Check if this batch first crosses the 1-hour horizon - const oldestInBatch = Math.min(...posts.map((p) => new Date(p.created_at).getTime())) - if (oldestInBatch < cutoff) crossedHorizon = true - - // If the oldest post is more than 24 hours old the window is well covered — no need - // to keep fetching for federation stragglers this far back - if (oldestInBatch < now - 24 * ONE_HOUR_MS) break + // Stop when >= 50% of this page's posts are outside the 1-hour window. + // A handful of old federated posts won't trigger this; once the majority of a page + // is old content we have a reliable picture of the last hour. + const outsideWindow = posts.filter((p) => new Date(p.created_at).getTime() < cutoff).length + if (outsideWindow / posts.length >= 0.5) break maxId = nextMaxId - // Only mark as hit-cap when we never found old enough data (true burst scenario) - if (page === maxPages - 1 && !crossedHorizon) hitPageCap = true + if (page === maxPages - 1) hitPageCap = true } if (allPosts.length === 0) return { postsPerHour: 0, relatedTags: [], hasAnyPosts: false } @@ -156,14 +135,21 @@ export async function getPostsData( // We reached (or passed) the 1-hour horizon — count posts within the last hour directly postsPerHour = allPosts.filter((p) => new Date(p.created_at).getTime() >= cutoff).length } else if (hitPageCap) { - // Hit the page limit — more posts likely exist beyond what we fetched (burst scenario). - // Extrapolate from the covered span. Minimum 1-minute span to avoid divide-by-zero. - const coveredMs = Math.max(newestMs - oldestMs, 60_000) - postsPerHour = allPosts.length / (coveredMs / ONE_HOUR_MS) + // Hit the page cap and never reached the horizon — burst scenario, more posts exist + // beyond what we fetched. Extrapolate using only in-window posts: + // rate = inWindowCount / coveredHours, where coveredHours = (now - oldestInWindowPost) / ONE_HOUR_MS + // This gives posts-per-hour as if the same rate continued for the full 60 minutes. + // Minimum 1-minute covered span to avoid divide-by-zero on a single-post window. + const inWindowPosts = allPosts.filter((p) => new Date(p.created_at).getTime() >= cutoff) + const oldestInWindowMs = inWindowPosts.length > 0 + ? Math.min(...inWindowPosts.map((p) => new Date(p.created_at).getTime())) + : newestMs + const coveredMs = Math.max(now - oldestInWindowMs, 60_000) + postsPerHour = inWindowPosts.length / (coveredMs / ONE_HOUR_MS) } else { // Timeline exhausted — these are all the posts that exist within the last hour. // Use the raw count directly; extrapolating would inflate a sparse tag. - postsPerHour = allPosts.length + postsPerHour = allPosts.filter((p) => new Date(p.created_at).getTime() >= cutoff).length } // Count co-occurring tags from the API tags object (authoritative for membership) @@ -222,9 +208,10 @@ export async function getPostsData( return `${d}d ${h}h ${m}m ago` } + const inWindowCount = allPosts.filter((p) => new Date(p.created_at).getTime() >= cutoff).length const method = oldestMs < cutoff ? 'direct' : hitPageCap ? 'extrapolated' : 'raw' console.log( - `[mastodon] #${tag} — pages: ${pagesFetched}, posts: ${allPosts.length}, ` + + `[mastodon] #${tag} — pages: ${pagesFetched}, posts: ${allPosts.length} (${inWindowCount} in-window), ` + `between: ${relAge(oldestMs)} - ${relAge(newestMs)}, ` + `pph: ${postsPerHour.toFixed(2)} (${method})`, )