This commit is contained in:
+27
-40
@@ -71,18 +71,13 @@ export async function getPostsPerHour(tag: string): Promise<number> {
|
|||||||
* (lowercased, excluding the queried tag itself).
|
* (lowercased, excluding the queried tag itself).
|
||||||
*
|
*
|
||||||
* Pagination strategy:
|
* Pagination strategy:
|
||||||
* - Fetch pages until the oldest post in a batch falls before the 1-hour cutoff
|
* - Keep fetching pages until >= 50% of posts in a page fall outside the 1-hour window,
|
||||||
* (the horizon), OR the timeline is exhausted, OR MAX_PAGES_PER_HASHTAG is reached.
|
* OR the timeline is exhausted, OR MAX_PAGES_PER_HASHTAG is reached.
|
||||||
* - When we first cross the horizon, keep fetching additional pages as long as each
|
* - The 50% rule handles federated out-of-order posts gracefully: Mastodon timelines are
|
||||||
* new page contributes at least one post within the cutoff and not beyond 24 hours.
|
* ordered by post ID (local receive time), not created_at. A remote post from hours or
|
||||||
* Only stop when a page adds nothing new to the in-window count — at that point the
|
* even years ago can arrive late, get a fresh ID, and appear at the top of the stream.
|
||||||
* window is stable.
|
* A minority of such posts won't trigger the stop condition; only once at least half of
|
||||||
* This handles out-of-order federation: Mastodon timelines are ordered by post ID
|
* a page is old content do we consider the 1-hour window fully covered.
|
||||||
* (local receive time), not created_at. A post authored at 10:45 on a remote server
|
|
||||||
* may arrive at 11:05, get a recent ID and appear near the top of the stream — but
|
|
||||||
* its created_at is in the past/out of order. Continuing until the count stabilises
|
|
||||||
* ensures all such late-arriving posts are captured regardless of how many pages
|
|
||||||
* they span.
|
|
||||||
* - After collecting all pages, sort by created_at and filter to the last hour for an
|
* - After collecting all pages, sort by created_at and filter to the last hour for an
|
||||||
* accurate count regardless of any remaining ordering noise.
|
* accurate count regardless of any remaining ordering noise.
|
||||||
*
|
*
|
||||||
@@ -105,15 +100,8 @@ export async function getPostsData(
|
|||||||
let allPosts: MastodonPost[] = []
|
let allPosts: MastodonPost[] = []
|
||||||
let maxId: string | undefined
|
let maxId: string | undefined
|
||||||
let hitPageCap = false
|
let hitPageCap = false
|
||||||
let crossedHorizon = false
|
|
||||||
|
|
||||||
for (let page = 0; page < maxPages; page++) {
|
for (let page = 0; page < maxPages; page++) {
|
||||||
// Once we've crossed the horizon, snapshot the in-window count before this fetch
|
|
||||||
// so we can detect whether the page contributed anything new.
|
|
||||||
const inWindowBefore = crossedHorizon
|
|
||||||
? allPosts.filter((p) => new Date(p.created_at).getTime() >= cutoff).length
|
|
||||||
: 0
|
|
||||||
|
|
||||||
const { posts, nextMaxId } = await fetchPage(tag, maxId, postLimit)
|
const { posts, nextMaxId } = await fetchPage(tag, maxId, postLimit)
|
||||||
|
|
||||||
if (posts.length === 0) break
|
if (posts.length === 0) break
|
||||||
@@ -122,24 +110,15 @@ export async function getPostsData(
|
|||||||
// End of timeline or no more pages
|
// End of timeline or no more pages
|
||||||
if (posts.length < postLimit || !nextMaxId) break
|
if (posts.length < postLimit || !nextMaxId) break
|
||||||
|
|
||||||
if (crossedHorizon) {
|
// Stop when >= 50% of this page's posts are outside the 1-hour window.
|
||||||
// Keep fetching while this page added new in-window posts; stop when count stabilises
|
// A handful of old federated posts won't trigger this; once at least half of a page
|
||||||
const inWindowAfter = allPosts.filter((p) => new Date(p.created_at).getTime() >= cutoff).length
|
// is old content we have a reliable picture of the last hour.
|
||||||
if (inWindowAfter === inWindowBefore) break
|
const outsideWindow = posts.filter((p) => new Date(p.created_at).getTime() < cutoff).length
|
||||||
}
|
if (outsideWindow / posts.length >= 0.5) break
|
||||||
|
|
||||||
// Check if this batch first crosses the 1-hour horizon
|
|
||||||
const oldestInBatch = Math.min(...posts.map((p) => new Date(p.created_at).getTime()))
|
|
||||||
if (oldestInBatch < cutoff) crossedHorizon = true
|
|
||||||
|
|
||||||
// If the oldest post is more than 24 hours old the window is well covered — no need
|
|
||||||
// to keep fetching for federation stragglers this far back
|
|
||||||
if (oldestInBatch < now - 24 * ONE_HOUR_MS) break
|
|
||||||
|
|
||||||
maxId = nextMaxId
|
maxId = nextMaxId
|
||||||
|
|
||||||
// Only mark as hit-cap when we never found old enough data (true burst scenario)
|
if (page === maxPages - 1) hitPageCap = true
|
||||||
if (page === maxPages - 1 && !crossedHorizon) hitPageCap = true
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (allPosts.length === 0) return { postsPerHour: 0, relatedTags: [], hasAnyPosts: false }
|
if (allPosts.length === 0) return { postsPerHour: 0, relatedTags: [], hasAnyPosts: false }
|
||||||
@@ -156,14 +135,21 @@ export async function getPostsData(
|
|||||||
// We reached (or passed) the 1-hour horizon — count posts within the last hour directly
|
// We reached (or passed) the 1-hour horizon — count posts within the last hour directly
|
||||||
postsPerHour = allPosts.filter((p) => new Date(p.created_at).getTime() >= cutoff).length
|
postsPerHour = allPosts.filter((p) => new Date(p.created_at).getTime() >= cutoff).length
|
||||||
} else if (hitPageCap) {
|
} else if (hitPageCap) {
|
||||||
// Hit the page limit — more posts likely exist beyond what we fetched (burst scenario).
|
// Hit the page cap and never reached the horizon — burst scenario, more posts exist
|
||||||
// Extrapolate from the covered span. Minimum 1-minute span to avoid divide-by-zero.
|
// beyond what we fetched. Extrapolate using only in-window posts:
|
||||||
const coveredMs = Math.max(newestMs - oldestMs, 60_000)
|
// rate = inWindowCount / coveredHours, where coveredHours = (now - oldestInWindowPost) / ONE_HOUR_MS
|
||||||
postsPerHour = allPosts.length / (coveredMs / ONE_HOUR_MS)
|
// This gives posts-per-hour as if the same rate continued for the full 60 minutes.
|
||||||
|
// Minimum 1-minute covered span to avoid divide-by-zero on a single-post window.
|
||||||
|
const inWindowPosts = allPosts.filter((p) => new Date(p.created_at).getTime() >= cutoff)
|
||||||
|
const oldestInWindowMs = inWindowPosts.length > 0
|
||||||
|
? Math.min(...inWindowPosts.map((p) => new Date(p.created_at).getTime()))
|
||||||
|
: newestMs
|
||||||
|
const coveredMs = Math.max(now - oldestInWindowMs, 60_000)
|
||||||
|
postsPerHour = inWindowPosts.length / (coveredMs / ONE_HOUR_MS)
|
||||||
} else {
|
} else {
|
||||||
// Timeline exhausted — these are all the posts that exist within the last hour.
|
// Timeline exhausted — these are all the posts that exist within the last hour.
|
||||||
// Use the raw count directly; extrapolating would inflate a sparse tag.
|
// Use the raw count directly; extrapolating would inflate a sparse tag.
|
||||||
postsPerHour = allPosts.length
|
postsPerHour = allPosts.filter((p) => new Date(p.created_at).getTime() >= cutoff).length
|
||||||
}
|
}
|
||||||
|
|
||||||
// Count co-occurring tags from the API tags object (authoritative for membership)
|
// Count co-occurring tags from the API tags object (authoritative for membership)
|
||||||
@@ -222,9 +208,10 @@ export async function getPostsData(
|
|||||||
return `${d}d ${h}h ${m}m ago`
|
return `${d}d ${h}h ${m}m ago`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const inWindowCount = allPosts.filter((p) => new Date(p.created_at).getTime() >= cutoff).length
|
||||||
const method = oldestMs < cutoff ? 'direct' : hitPageCap ? 'extrapolated' : 'raw'
|
const method = oldestMs < cutoff ? 'direct' : hitPageCap ? 'extrapolated' : 'raw'
|
||||||
console.log(
|
console.log(
|
||||||
`[mastodon] #${tag} — pages: ${pagesFetched}, posts: ${allPosts.length}, ` +
|
`[mastodon] #${tag} — pages: ${pagesFetched}, posts: ${allPosts.length} (${inWindowCount} in-window), ` +
|
||||||
`between: ${relAge(oldestMs)} - ${relAge(newestMs)}, ` +
|
`between: ${relAge(oldestMs)} - ${relAge(newestMs)}, ` +
|
||||||
`pph: ${postsPerHour.toFixed(2)} (${method})`,
|
`pph: ${postsPerHour.toFixed(2)} (${method})`,
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user