Better hashtag handling
Build Images and Deploy / Update-PROD-Stack (push) Successful in 1m19s

This commit is contained in:
2026-03-19 00:52:17 -04:00
parent 20f939799d
commit ec275dd858
2 changed files with 48 additions and 14 deletions
+47 -13
View File
@@ -17,6 +17,21 @@ function extractMaxId(linkHeader: string | null): string | null {
return match ? match[1] : null
}
/**
 * Parses hashtag names out of rendered Mastodon HTML, supplementing the
 * structured `post.tags` array (which can miss tags in some federation cases).
 *
 * Mastodon's renderer emits hashtags as `#<span>TagName</span>` nested in an
 * anchor, so that literal pattern is matched directly.
 *
 * @param html - The post's `content` field (sanitized HTML from the API).
 * @returns Tag names in document order, lowercased; may contain duplicates
 *          (the caller deduplicates via a Set).
 */
function extractTagsFromHtml(html: string): string[] {
  const hashtagPattern = /#<span>([^<]+)<\/span>/gi
  return Array.from(html.matchAll(hashtagPattern), (match) => match[1].toLowerCase())
}
async function fetchPage(tag: string, maxId?: string): Promise<TimelineResult> {
const instance = process.env.MASTODON_INSTANCE
if (!instance) throw new Error('MASTODON_INSTANCE is not configured')
@@ -46,8 +61,6 @@ async function fetchPage(tag: string, maxId?: string): Promise<TimelineResult> {
/**
* Fetches recent posts for a hashtag and returns posts-per-hour.
* Paginates when all fetched posts share a very tight timestamp window
* (e.g., #happynewyear at midnight) up to MAX_PAGES_PER_HASHTAG pages.
*/
export async function getPostsPerHour(tag: string): Promise<number> {
const { postsPerHour } = await getPostsData(tag)
@@ -57,11 +70,22 @@ export async function getPostsPerHour(tag: string): Promise<number> {
/**
* Returns posts-per-hour AND a sorted list of co-occurring tag names
* (lowercased, excluding the queried tag itself).
*
* Strategy:
* - Paginate until we have at least one post older than 1 hour (a complete picture),
* OR we exhaust the timeline, OR we hit MAX_PAGES_PER_HASHTAG.
* - If the oldest fetched post is >= 1 hour old: postsPerHour = count of posts in the
* last hour (direct measurement over a full window).
* - If all fetched posts are within the last hour (hit page limit or timeline exhausted
* with a narrow window): extrapolate — postsPerHour = count / (coveredHours).
*/
export async function getPostsData(
tag: string,
): Promise<{ postsPerHour: number; relatedTags: string[]; displayTag?: string }> {
const maxPages = parseInt(process.env.MAX_PAGES_PER_HASHTAG ?? '5', 10)
const ONE_HOUR_MS = 60 * 60 * 1000
const now = Date.now()
const cutoff = now - ONE_HOUR_MS
let allPosts: MastodonPost[] = []
let maxId: string | undefined
@@ -72,13 +96,12 @@ export async function getPostsData(
if (posts.length === 0) break
allPosts = [...allPosts, ...posts]
// Stop paginating if we got fewer than 40 posts (end of timeline)
// End of timeline or no more pages
if (posts.length < 40 || !nextMaxId) break
// Stop paginating if the time span of what we have is already > 5 minutes
const times = allPosts.map((p) => new Date(p.created_at).getTime())
const spanMs = Math.max(...times) - Math.min(...times)
if (spanMs > 5 * 60 * 1000) break
// If the oldest post in this batch is already beyond 1 hour, we have a full window
const oldestInBatch = Math.min(...posts.map((p) => new Date(p.created_at).getTime()))
if (oldestInBatch < cutoff) break
maxId = nextMaxId
}
@@ -89,16 +112,26 @@ export async function getPostsData(
const newestMs = Math.max(...times)
const oldestMs = Math.min(...times)
// Minimum 1-minute span to handle flood scenario (all same timestamp)
const spanHours = Math.max((newestMs - oldestMs) / (1000 * 60 * 60), 1 / 60)
const postsPerHour = allPosts.length / spanHours
let postsPerHour: number
if (oldestMs < cutoff) {
// We reached (or passed) the 1-hour horizon — count posts within the last hour directly
postsPerHour = allPosts.filter((p) => new Date(p.created_at).getTime() >= cutoff).length
} else {
// All posts are within the last hour (burst scenario or very sparse tag).
// Extrapolate from the covered span. Minimum 1-minute span to avoid divide-by-zero.
const coveredMs = Math.max(newestMs - oldestMs, 60_000)
postsPerHour = allPosts.length / (coveredMs / ONE_HOUR_MS)
}
// Count co-occurring tags
// Count co-occurring tags — merge post.tags with tags parsed from HTML content
const counts = new Map<string, number>()
const lowerTag = tag.toLowerCase()
for (const post of allPosts) {
for (const t of post.tags ?? []) {
const name = t.name.toLowerCase()
const fromApi = (post.tags ?? []).map((t) => t.name.toLowerCase())
const fromContent = extractTagsFromHtml(post.content)
// Union of both sources
const allTagNames = [...new Set([...fromApi, ...fromContent])]
for (const name of allTagNames) {
if (name !== lowerTag && name.length >= 2 && name.length <= 100) {
counts.set(name, (counts.get(name) ?? 0) + 1)
}
@@ -128,3 +161,4 @@ export async function getPostsData(
return { postsPerHour, relatedTags, displayTag }
}
+1 -1
View File
File diff suppressed because one or more lines are too long