This commit is contained in:
+47
-13
@@ -17,6 +17,21 @@ function extractMaxId(linkHeader: string | null): string | null {
|
||||
return match ? match[1] : null
|
||||
}
|
||||
|
||||
/**
 * Extracts hashtag names from Mastodon HTML content as a supplement to post.tags.
 * Mastodon renders hashtags as: #<span>TagName</span> inside an anchor.
 *
 * @param html - Rendered HTML content of a post.
 * @returns Lowercased tag names in document order. A tag that appears several
 *   times is returned once per occurrence; no de-duplication is done here.
 */
function extractTagsFromHtml(html: string): string[] {
  const results: string[] = []
  // Defensive: a non-string value yields no tags instead of throwing on matchAll.
  if (typeof html !== 'string') return results

  // Match: #<span>TagName</span>
  for (const match of html.matchAll(/#<span>([^<]+)<\/span>/gi)) {
    // Guard the capture group so this also type-checks under noUncheckedIndexedAccess.
    const name = match[1]
    if (name !== undefined) results.push(name.toLowerCase())
  }
  return results
}
|
||||
|
||||
async function fetchPage(tag: string, maxId?: string): Promise<TimelineResult> {
|
||||
const instance = process.env.MASTODON_INSTANCE
|
||||
if (!instance) throw new Error('MASTODON_INSTANCE is not configured')
|
||||
@@ -46,8 +61,6 @@ async function fetchPage(tag: string, maxId?: string): Promise<TimelineResult> {
|
||||
|
||||
/**
|
||||
* Fetches recent posts for a hashtag and returns posts-per-hour.
|
||||
* Paginates when all fetched posts share a very tight timestamp window
|
||||
* (e.g., #happynewyear at midnight) up to MAX_PAGES_PER_HASHTAG pages.
|
||||
*/
|
||||
export async function getPostsPerHour(tag: string): Promise<number> {
|
||||
const { postsPerHour } = await getPostsData(tag)
|
||||
@@ -57,11 +70,22 @@ export async function getPostsPerHour(tag: string): Promise<number> {
|
||||
/**
|
||||
* Returns posts-per-hour AND a sorted list of co-occurring tag names
|
||||
* (lowercased, excluding the queried tag itself).
|
||||
*
|
||||
* Strategy:
|
||||
* - Paginate until we have at least one post older than 1 hour (a complete picture),
|
||||
* OR we exhaust the timeline, OR we hit MAX_PAGES_PER_HASHTAG.
|
||||
* - If the oldest fetched post is >= 1 hour old: postsPerHour = count of posts in the
|
||||
* last hour (direct measurement over a full window).
|
||||
* - If all fetched posts are within the last hour (hit page limit or timeline exhausted
|
||||
* with a narrow window): extrapolate — postsPerHour = count / (coveredHours).
|
||||
*/
|
||||
export async function getPostsData(
|
||||
tag: string,
|
||||
): Promise<{ postsPerHour: number; relatedTags: string[]; displayTag?: string }> {
|
||||
const maxPages = parseInt(process.env.MAX_PAGES_PER_HASHTAG ?? '5', 10)
|
||||
const ONE_HOUR_MS = 60 * 60 * 1000
|
||||
const now = Date.now()
|
||||
const cutoff = now - ONE_HOUR_MS
|
||||
|
||||
let allPosts: MastodonPost[] = []
|
||||
let maxId: string | undefined
|
||||
@@ -72,13 +96,12 @@ export async function getPostsData(
|
||||
if (posts.length === 0) break
|
||||
allPosts = [...allPosts, ...posts]
|
||||
|
||||
// Stop paginating if we got fewer than 40 posts (end of timeline)
|
||||
// End of timeline or no more pages
|
||||
if (posts.length < 40 || !nextMaxId) break
|
||||
|
||||
// Stop paginating if the time span of what we have is already > 5 minutes
|
||||
const times = allPosts.map((p) => new Date(p.created_at).getTime())
|
||||
const spanMs = Math.max(...times) - Math.min(...times)
|
||||
if (spanMs > 5 * 60 * 1000) break
|
||||
// If the oldest post in this batch is already beyond 1 hour, we have a full window
|
||||
const oldestInBatch = Math.min(...posts.map((p) => new Date(p.created_at).getTime()))
|
||||
if (oldestInBatch < cutoff) break
|
||||
|
||||
maxId = nextMaxId
|
||||
}
|
||||
@@ -89,16 +112,26 @@ export async function getPostsData(
|
||||
const newestMs = Math.max(...times)
|
||||
const oldestMs = Math.min(...times)
|
||||
|
||||
// Minimum 1-minute span to handle flood scenario (all same timestamp)
|
||||
const spanHours = Math.max((newestMs - oldestMs) / (1000 * 60 * 60), 1 / 60)
|
||||
const postsPerHour = allPosts.length / spanHours
|
||||
let postsPerHour: number
|
||||
if (oldestMs < cutoff) {
|
||||
// We reached (or passed) the 1-hour horizon — count posts within the last hour directly
|
||||
postsPerHour = allPosts.filter((p) => new Date(p.created_at).getTime() >= cutoff).length
|
||||
} else {
|
||||
// All posts are within the last hour (burst scenario or very sparse tag).
|
||||
// Extrapolate from the covered span. Minimum 1-minute span to avoid divide-by-zero.
|
||||
const coveredMs = Math.max(newestMs - oldestMs, 60_000)
|
||||
postsPerHour = allPosts.length / (coveredMs / ONE_HOUR_MS)
|
||||
}
|
||||
|
||||
// Count co-occurring tags
|
||||
// Count co-occurring tags — merge post.tags with tags parsed from HTML content
|
||||
const counts = new Map<string, number>()
|
||||
const lowerTag = tag.toLowerCase()
|
||||
for (const post of allPosts) {
|
||||
for (const t of post.tags ?? []) {
|
||||
const name = t.name.toLowerCase()
|
||||
const fromApi = (post.tags ?? []).map((t) => t.name.toLowerCase())
|
||||
const fromContent = extractTagsFromHtml(post.content)
|
||||
// Union of both sources
|
||||
const allTagNames = [...new Set([...fromApi, ...fromContent])]
|
||||
for (const name of allTagNames) {
|
||||
if (name !== lowerTag && name.length >= 2 && name.length <= 100) {
|
||||
counts.set(name, (counts.get(name) ?? 0) + 1)
|
||||
}
|
||||
@@ -128,3 +161,4 @@ export async function getPostsData(
|
||||
|
||||
return { postsPerHour, relatedTags, displayTag }
|
||||
}
|
||||
|
||||
|
||||
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user