From e067d3f5c78ea8d20d49cd3324146adc7c84d139 Mon Sep 17 00:00:00 2001 From: Mike Johnston Date: Sat, 21 Mar 2026 14:19:11 -0400 Subject: [PATCH] redefine logic to try and get a stable price with unstable timeline --- src/lib/mastodon.ts | 57 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 43 insertions(+), 14 deletions(-) diff --git a/src/lib/mastodon.ts b/src/lib/mastodon.ts index 723ac7f..6ffcd45 100644 --- a/src/lib/mastodon.ts +++ b/src/lib/mastodon.ts @@ -70,14 +70,27 @@ export async function getPostsPerHour(tag: string): Promise { * Returns posts-per-hour AND a sorted list of co-occurring tag names * (lowercased, excluding the queried tag itself). * - * Strategy: - * - Paginate until we have at least one post older than 1 hour (a complete picture), - * OR we exhaust the timeline, OR we hit MAX_PAGES_PER_HASHTAG. - * - Oldest post >= 1 hour old: count posts in the last hour directly (full window). - * - Hit the page cap (burst): more posts exist beyond what we fetched — extrapolate from - * the covered span (postsPerHour = count / coveredHours). - * - Timeline exhausted (sparse): these are all the posts that exist — use the raw count. - * Extrapolating would artificially inflate a tag with 3 posts clustered in 10 minutes. + * Pagination strategy: + * - Fetch pages until the oldest post in a batch falls before the 1-hour cutoff + * (the horizon), OR the timeline is exhausted, OR MAX_PAGES_PER_HASHTAG is reached. + * - When we first cross the horizon, keep fetching additional pages as long as each + * new page contributes at least one post within the cutoff. Only stop when a page + * adds nothing new to the in-window count — at that point the window is stable. + * This handles out-of-order federation: Mastodon timelines are ordered by post ID + * (local receive time), not created_at. A post authored at 10:45 on a remote server + * may arrive at 11:05, get a recent ID and appear near the top of the stream — but + * its created_at is in the past/out of order. Continuing until the count stabilises + * ensures all such late-arriving posts are captured regardless of how many pages + * they span. + * - After collecting all pages, sort by created_at and filter to the last hour for an + * accurate count regardless of any remaining ordering noise. + * + * PPH calculation: + * - Crossed horizon (direct): we have a full window — count posts with created_at >= cutoff. + * - Hit page cap without crossing (burst): more posts exist beyond what we fetched — + * extrapolate from the covered time span (count / coveredHours). + * - Timeline exhausted without crossing (sparse): all posts in the last hour are accounted + * for — use the raw count directly (no extrapolation). */ export async function getPostsData( tag: string, @@ -91,8 +104,15 @@ export async function getPostsData( let allPosts: MastodonPost[] = [] let maxId: string | undefined let hitPageCap = false + let crossedHorizon = false for (let page = 0; page < maxPages; page++) { + // Once we've crossed the horizon, snapshot the in-window count before this fetch + // so we can detect whether the page contributed anything new. + const inWindowBefore = crossedHorizon + ? allPosts.filter((p) => new Date(p.created_at).getTime() >= cutoff).length + : 0 + const { posts, nextMaxId } = await fetchPage(tag, maxId, postLimit) if (posts.length === 0) break @@ -101,21 +121,30 @@ export async function getPostsData( // End of timeline or no more pages if (posts.length < postLimit || !nextMaxId) break - // If the oldest post in this batch is already beyond 1 hour, we have a full window + if (crossedHorizon) { + // Keep fetching while this page added new in-window posts; stop when count stabilises + const inWindowAfter = allPosts.filter((p) => new Date(p.created_at).getTime() >= cutoff).length + if (inWindowAfter === inWindowBefore) break + } + + // Check if this batch first crosses the 1-hour horizon const oldestInBatch = Math.min(...posts.map((p) => new Date(p.created_at).getTime())) - if (oldestInBatch < cutoff) break + if (oldestInBatch < cutoff) crossedHorizon = true maxId = nextMaxId - // Mark if we completed the final allowed page without breaking - if (page === maxPages - 1) hitPageCap = true + // Only mark as hit-cap when we never found old enough data (true burst scenario) + if (page === maxPages - 1 && !crossedHorizon) hitPageCap = true } if (allPosts.length === 0) return { postsPerHour: 0, relatedTags: [], hasAnyPosts: false } + // Sort globally by created_at so the window filter is accurate regardless of federation order + allPosts.sort((a, b) => new Date(b.created_at).getTime() - new Date(a.created_at).getTime()) + const times = allPosts.map((p) => new Date(p.created_at).getTime()) - const newestMs = Math.max(...times) - const oldestMs = Math.min(...times) + const newestMs = times[0] + const oldestMs = times[times.length - 1] let postsPerHour: number if (oldestMs < cutoff) {