better hashtag handling

2026-03-19 00:52:17 -04:00
parent 20f939799d
commit ec275dd858
2 changed files with 48 additions and 14 deletions
@@ -17,6 +17,21 @@ function extractMaxId(linkHeader: string | null): string | null {
  return match ? match[1] : null
 }
 /**
 * Pulls hashtag names out of a post's rendered HTML body.
 *
 * Only the literal shape `#<span>TagName</span>` is recognised (matched
 * case-insensitively); a span carrying attributes or an empty span is not matched.
 *
 * @param html - HTML content string to scan.
 * @returns The captured tag names, lowercased, in order of appearance.
 *          Repeated tags are NOT de-duplicated here.
 */
 function extractTagsFromHtml(html: string): string[] {
  const hashtagSpan = /#<span>([^<]+)<\/span>/gi
  // matchAll walks every non-overlapping occurrence; group 1 is the tag text.
  return Array.from(html.matchAll(hashtagSpan), (hit) => hit[1].toLowerCase())
 }
 async function fetchPage(tag: string, maxId?: string): Promise<TimelineResult> {
  const instance = process.env.MASTODON_INSTANCE
  if (!instance) throw new Error('MASTODON_INSTANCE is not configured')
@@ -46,8 +61,6 @@ async function fetchPage(tag: string, maxId?: string): Promise<TimelineResult> {
 /**
 * Fetches recent posts for a hashtag and returns posts-per-hour.
 * Paginates when all fetched posts share a very tight timestamp window
 * (e.g., #happynewyear at midnight) up to MAX_PAGES_PER_HASHTAG pages.
 */
 export async function getPostsPerHour(tag: string): Promise<number> {
  const { postsPerHour } = await getPostsData(tag)
@@ -57,11 +70,22 @@ export async function getPostsPerHour(tag: string): Promise<number> {
 /**
 * Returns posts-per-hour AND a sorted list of co-occurring tag names
 * (lowercased, excluding the queried tag itself).
 *
 * Strategy:
 *   - Paginate until we have at least one post older than 1 hour (a complete picture),
 *     OR we exhaust the timeline, OR we hit MAX_PAGES_PER_HASHTAG.
 *   - If the oldest fetched post is >= 1 hour old: postsPerHour = count of posts in the
 *     last hour (direct measurement over a full window).
 *   - If all fetched posts are within the last hour (hit page limit or timeline exhausted
 *     with a narrow window): extrapolate — postsPerHour = count / coveredHours, where
 *     coveredHours is the newest-to-oldest post span, floored at 1 minute to avoid divide-by-zero.
 */
 export async function getPostsData(
  tag: string,
 ): Promise<{ postsPerHour: number; relatedTags: string[]; displayTag?: string }> {
  const maxPages = parseInt(process.env.MAX_PAGES_PER_HASHTAG ?? '5', 10)
  const ONE_HOUR_MS = 60 * 60 * 1000
  const now = Date.now()
  const cutoff = now - ONE_HOUR_MS
  let allPosts: MastodonPost[] = []
  let maxId: string | undefined
@@ -72,13 +96,12 @@ export async function getPostsData(
    if (posts.length === 0) break
    allPosts = [...allPosts, ...posts]
-    // Stop paginating if we got fewer than 40 posts (end of timeline)
+    // End of timeline or no more pages
    if (posts.length < 40 || !nextMaxId) break
-    // Stop paginating if the time span of what we have is already > 5 minutes
+    // If the oldest post in this batch is already beyond 1 hour, we have a full window
-    const times = allPosts.map((p) => new Date(p.created_at).getTime())
+    const oldestInBatch = Math.min(...posts.map((p) => new Date(p.created_at).getTime()))
-    const spanMs = Math.max(...times) - Math.min(...times)
+    if (oldestInBatch < cutoff) break
    if (spanMs > 5 * 60 * 1000) break
    maxId = nextMaxId
  }
@@ -89,16 +112,26 @@ export async function getPostsData(
  const newestMs = Math.max(...times)
  const oldestMs = Math.min(...times)
-  // Minimum 1-minute span to handle flood scenario (all same timestamp)
+  let postsPerHour: number
-  const spanHours = Math.max((newestMs - oldestMs) / (1000 * 60 * 60), 1 / 60)
+  if (oldestMs < cutoff) {
-  const postsPerHour = allPosts.length / spanHours
+    // We reached (or passed) the 1-hour horizon — count posts within the last hour directly
    postsPerHour = allPosts.filter((p) => new Date(p.created_at).getTime() >= cutoff).length
  } else {
    // All posts are within the last hour (burst scenario or very sparse tag).
    // Extrapolate from the covered span. Minimum 1-minute span to avoid divide-by-zero.
    const coveredMs = Math.max(newestMs - oldestMs, 60_000)
    postsPerHour = allPosts.length / (coveredMs / ONE_HOUR_MS)
  }
-  // Count co-occurring tags
+  // Count co-occurring tags — merge post.tags with tags parsed from HTML content
  const counts = new Map<string, number>()
  const lowerTag = tag.toLowerCase()
  for (const post of allPosts) {
-    for (const t of post.tags ?? []) {
+    const fromApi = (post.tags ?? []).map((t) => t.name.toLowerCase())
-      const name = t.name.toLowerCase()
+    const fromContent = extractTagsFromHtml(post.content)
    // Union of both sources
    const allTagNames = [...new Set([...fromApi, ...fromContent])]
    for (const name of allTagNames) {
      if (name !== lowerTag && name.length >= 2 && name.length <= 100) {
        counts.set(name, (counts.get(name) ?? 0) + 1)
      }
@@ -128,3 +161,4 @@ export async function getPostsData(
  return { postsPerHour, relatedTags, displayTag }
 }