This commit is contained in:
+47
-13
@@ -17,6 +17,21 @@ function extractMaxId(linkHeader: string | null): string | null {
|
|||||||
return match ? match[1] : null
|
return match ? match[1] : null
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Matches the hashtag markup found in Mastodon HTML content: a literal `#`
 * immediately followed by `<span>TagName</span>` (normally inside an anchor).
 * Capture group 1 is the tag name without the leading `#`.
 *
 * The pattern is global so it can be used with `String.prototype.matchAll`,
 * which works on a clone of the regex; sharing this constant between calls
 * therefore never leaks `lastIndex` state.
 */
const HASHTAG_SPAN_PATTERN = /#<span>([^<]+)<\/span>/gi

/**
 * Extracts hashtag names from Mastodon HTML content as a supplement to post.tags.
 * Mastodon renders hashtags as: #<span>TagName</span> inside an anchor.
 *
 * @param html - HTML content of a post.
 * @returns The lowercased tag names (without `#`) in document order. Duplicates
 *   are preserved; callers that need unique names must de-duplicate. Returns an
 *   empty array when no hashtag markup is present.
 */
function extractTagsFromHtml(html: string): string[] {
  const names: string[] = []
  for (const match of html.matchAll(HASHTAG_SPAN_PATTERN)) {
    // Group 1 always participates in a successful match; the explicit check
    // keeps the code sound under `noUncheckedIndexedAccess`.
    const rawName = match[1]
    if (rawName !== undefined) {
      names.push(rawName.toLowerCase())
    }
  }
  return names
}
|
||||||
|
|
||||||
async function fetchPage(tag: string, maxId?: string): Promise<TimelineResult> {
|
async function fetchPage(tag: string, maxId?: string): Promise<TimelineResult> {
|
||||||
const instance = process.env.MASTODON_INSTANCE
|
const instance = process.env.MASTODON_INSTANCE
|
||||||
if (!instance) throw new Error('MASTODON_INSTANCE is not configured')
|
if (!instance) throw new Error('MASTODON_INSTANCE is not configured')
|
||||||
@@ -46,8 +61,6 @@ async function fetchPage(tag: string, maxId?: string): Promise<TimelineResult> {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Fetches recent posts for a hashtag and returns posts-per-hour.
|
* Fetches recent posts for a hashtag and returns posts-per-hour.
|
||||||
* Paginates when all fetched posts share a very tight timestamp window
|
|
||||||
* (e.g., #happynewyear at midnight) up to MAX_PAGES_PER_HASHTAG pages.
|
|
||||||
*/
|
*/
|
||||||
export async function getPostsPerHour(tag: string): Promise<number> {
|
export async function getPostsPerHour(tag: string): Promise<number> {
|
||||||
const { postsPerHour } = await getPostsData(tag)
|
const { postsPerHour } = await getPostsData(tag)
|
||||||
@@ -57,11 +70,22 @@ export async function getPostsPerHour(tag: string): Promise<number> {
|
|||||||
/**
|
/**
|
||||||
* Returns posts-per-hour AND a sorted list of co-occurring tag names
|
* Returns posts-per-hour AND a sorted list of co-occurring tag names
|
||||||
* (lowercased, excluding the queried tag itself).
|
* (lowercased, excluding the queried tag itself).
|
||||||
|
*
|
||||||
|
* Strategy:
|
||||||
|
* - Paginate until we have at least one post older than 1 hour (a complete picture),
|
||||||
|
* OR we exhaust the timeline, OR we hit MAX_PAGES_PER_HASHTAG.
|
||||||
|
* - If the oldest fetched post is >= 1 hour old: postsPerHour = count of posts in the
|
||||||
|
* last hour (direct measurement over a full window).
|
||||||
|
* - If all fetched posts are within the last hour (hit page limit or timeline exhausted
|
||||||
|
* with a narrow window): extrapolate — postsPerHour = count / (coveredHours).
|
||||||
*/
|
*/
|
||||||
export async function getPostsData(
|
export async function getPostsData(
|
||||||
tag: string,
|
tag: string,
|
||||||
): Promise<{ postsPerHour: number; relatedTags: string[]; displayTag?: string }> {
|
): Promise<{ postsPerHour: number; relatedTags: string[]; displayTag?: string }> {
|
||||||
const maxPages = parseInt(process.env.MAX_PAGES_PER_HASHTAG ?? '5', 10)
|
const maxPages = parseInt(process.env.MAX_PAGES_PER_HASHTAG ?? '5', 10)
|
||||||
|
const ONE_HOUR_MS = 60 * 60 * 1000
|
||||||
|
const now = Date.now()
|
||||||
|
const cutoff = now - ONE_HOUR_MS
|
||||||
|
|
||||||
let allPosts: MastodonPost[] = []
|
let allPosts: MastodonPost[] = []
|
||||||
let maxId: string | undefined
|
let maxId: string | undefined
|
||||||
@@ -72,13 +96,12 @@ export async function getPostsData(
|
|||||||
if (posts.length === 0) break
|
if (posts.length === 0) break
|
||||||
allPosts = [...allPosts, ...posts]
|
allPosts = [...allPosts, ...posts]
|
||||||
|
|
||||||
// Stop paginating if we got fewer than 40 posts (end of timeline)
|
// End of timeline or no more pages
|
||||||
if (posts.length < 40 || !nextMaxId) break
|
if (posts.length < 40 || !nextMaxId) break
|
||||||
|
|
||||||
// Stop paginating if the time span of what we have is already > 5 minutes
|
// If the oldest post in this batch is already beyond 1 hour, we have a full window
|
||||||
const times = allPosts.map((p) => new Date(p.created_at).getTime())
|
const oldestInBatch = Math.min(...posts.map((p) => new Date(p.created_at).getTime()))
|
||||||
const spanMs = Math.max(...times) - Math.min(...times)
|
if (oldestInBatch < cutoff) break
|
||||||
if (spanMs > 5 * 60 * 1000) break
|
|
||||||
|
|
||||||
maxId = nextMaxId
|
maxId = nextMaxId
|
||||||
}
|
}
|
||||||
@@ -89,16 +112,26 @@ export async function getPostsData(
|
|||||||
const newestMs = Math.max(...times)
|
const newestMs = Math.max(...times)
|
||||||
const oldestMs = Math.min(...times)
|
const oldestMs = Math.min(...times)
|
||||||
|
|
||||||
// Minimum 1-minute span to handle flood scenario (all same timestamp)
|
let postsPerHour: number
|
||||||
const spanHours = Math.max((newestMs - oldestMs) / (1000 * 60 * 60), 1 / 60)
|
if (oldestMs < cutoff) {
|
||||||
const postsPerHour = allPosts.length / spanHours
|
// We reached (or passed) the 1-hour horizon — count posts within the last hour directly
|
||||||
|
postsPerHour = allPosts.filter((p) => new Date(p.created_at).getTime() >= cutoff).length
|
||||||
|
} else {
|
||||||
|
// All posts are within the last hour (burst scenario or very sparse tag).
|
||||||
|
// Extrapolate from the covered span. Minimum 1-minute span to avoid divide-by-zero.
|
||||||
|
const coveredMs = Math.max(newestMs - oldestMs, 60_000)
|
||||||
|
postsPerHour = allPosts.length / (coveredMs / ONE_HOUR_MS)
|
||||||
|
}
|
||||||
|
|
||||||
// Count co-occurring tags
|
// Count co-occurring tags — merge post.tags with tags parsed from HTML content
|
||||||
const counts = new Map<string, number>()
|
const counts = new Map<string, number>()
|
||||||
const lowerTag = tag.toLowerCase()
|
const lowerTag = tag.toLowerCase()
|
||||||
for (const post of allPosts) {
|
for (const post of allPosts) {
|
||||||
for (const t of post.tags ?? []) {
|
const fromApi = (post.tags ?? []).map((t) => t.name.toLowerCase())
|
||||||
const name = t.name.toLowerCase()
|
const fromContent = extractTagsFromHtml(post.content)
|
||||||
|
// Union of both sources
|
||||||
|
const allTagNames = [...new Set([...fromApi, ...fromContent])]
|
||||||
|
for (const name of allTagNames) {
|
||||||
if (name !== lowerTag && name.length >= 2 && name.length <= 100) {
|
if (name !== lowerTag && name.length >= 2 && name.length <= 100) {
|
||||||
counts.set(name, (counts.get(name) ?? 0) + 1)
|
counts.set(name, (counts.get(name) ?? 0) + 1)
|
||||||
}
|
}
|
||||||
@@ -128,3 +161,4 @@ export async function getPostsData(
|
|||||||
|
|
||||||
return { postsPerHour, relatedTags, displayTag }
|
return { postsPerHour, relatedTags, displayTag }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user