better hashtag handling

2026-03-19 00:52:17 -04:00
parent 20f939799d
commit ec275dd858
2 changed files with 48 additions and 14 deletions
@@ -17,6 +17,21 @@ function extractMaxId(linkHeader: string | null): string | null {
  return match ? match[1] : null
 }

+/**
+ * Extracts hashtag names from Mastodon HTML content (rendered as #<span>TagName</span> inside an anchor)
+ * as a supplement to post.tags. Names are lowercased and returned in match order, duplicates kept.
+ */
+function extractTagsFromHtml(html: string): string[] {
+  const results: string[] = []
+  // Match: #<span>TagName</span> — the `g` flag lets exec() advance through every occurrence
+  const re = /#<span>([^<]+)<\/span>/gi
+  let m: RegExpExecArray | null
+  while ((m = re.exec(html)) !== null) {
+    results.push(m[1].toLowerCase())
+  }
+  return results
+}
+
 async function fetchPage(tag: string, maxId?: string): Promise<TimelineResult> {
  const instance = process.env.MASTODON_INSTANCE
  if (!instance) throw new Error('MASTODON_INSTANCE is not configured')
@@ -46,8 +61,6 @@ async function fetchPage(tag: string, maxId?: string): Promise<TimelineResult> {

 /**
 * Fetches recent posts for a hashtag and returns posts-per-hour.
- * Paginates when all fetched posts share a very tight timestamp window
- * (e.g., #happynewyear at midnight) up to MAX_PAGES_PER_HASHTAG pages.
 */
 export async function getPostsPerHour(tag: string): Promise<number> {
  const { postsPerHour } = await getPostsData(tag)
@@ -57,11 +70,22 @@ export async function getPostsPerHour(tag: string): Promise<number> {
 /**
 * Returns posts-per-hour AND a sorted list of co-occurring tag names
 * (lowercased, excluding the queried tag itself).
+ *
+ * Strategy:
+ *   - Paginate until we have at least one post older than 1 hour (a complete picture),
+ *     OR we exhaust the timeline, OR we hit MAX_PAGES_PER_HASHTAG.
+ *   - If the oldest fetched post is more than 1 hour old: postsPerHour = count of posts in
+ *     the last hour (direct measurement over a full window).
+ *   - If all fetched posts are within the last hour (hit page limit or timeline exhausted
+ *     with a narrow window): extrapolate — postsPerHour = count / coveredHours (span >= 1 min).
 */
 export async function getPostsData(
  tag: string,
 ): Promise<{ postsPerHour: number; relatedTags: string[]; displayTag?: string }> {
  const maxPages = parseInt(process.env.MAX_PAGES_PER_HASHTAG ?? '5', 10)
+  const ONE_HOUR_MS = 60 * 60 * 1000
+  const now = Date.now()
+  const cutoff = now - ONE_HOUR_MS

  let allPosts: MastodonPost[] = []
  let maxId: string | undefined
@@ -72,13 +96,12 @@ export async function getPostsData(
    if (posts.length === 0) break
    allPosts = [...allPosts, ...posts]

-    // Stop paginating if we got fewer than 40 posts (end of timeline)
+    // End of timeline or no more pages
    if (posts.length < 40 || !nextMaxId) break

-    // Stop paginating if the time span of what we have is already > 5 minutes
-    const times = allPosts.map((p) => new Date(p.created_at).getTime())
-    const spanMs = Math.max(...times) - Math.min(...times)
-    if (spanMs > 5 * 60 * 1000) break
+    // If the oldest post in this batch is already beyond 1 hour, we have a full window
+    const oldestInBatch = Math.min(...posts.map((p) => new Date(p.created_at).getTime()))
+    if (oldestInBatch < cutoff) break

    maxId = nextMaxId
  }
@@ -89,16 +112,26 @@ export async function getPostsData(
  const newestMs = Math.max(...times)
  const oldestMs = Math.min(...times)

-  // Minimum 1-minute span to handle flood scenario (all same timestamp)
-  const spanHours = Math.max((newestMs - oldestMs) / (1000 * 60 * 60), 1 / 60)
-  const postsPerHour = allPosts.length / spanHours
+  let postsPerHour: number
+  if (oldestMs < cutoff) {
+    // We reached (or passed) the 1-hour horizon — count posts within the last hour directly
+    postsPerHour = allPosts.filter((p) => new Date(p.created_at).getTime() >= cutoff).length
+  } else {
+    // All posts are within the last hour (burst scenario or very sparse tag).
+    // Extrapolate from the covered span. Minimum 1-minute span to avoid divide-by-zero.
+    const coveredMs = Math.max(newestMs - oldestMs, 60_000)
+    postsPerHour = allPosts.length / (coveredMs / ONE_HOUR_MS)
+  }

-  // Count co-occurring tags
+  // Count co-occurring tags — merge post.tags with tags parsed from HTML content
  const counts = new Map<string, number>()
  const lowerTag = tag.toLowerCase()
  for (const post of allPosts) {
-    for (const t of post.tags ?? []) {
-      const name = t.name.toLowerCase()
+    const fromApi = (post.tags ?? []).map((t) => t.name.toLowerCase())
+    const fromContent = extractTagsFromHtml(post.content)
+    // Union of both sources
+    const allTagNames = [...new Set([...fromApi, ...fromContent])]
+    for (const name of allTagNames) {
      if (name !== lowerTag && name.length >= 2 && name.length <= 100) {
        counts.set(name, (counts.get(name) ?? 0) + 1)
      }
@@ -128,3 +161,4 @@ export async function getPostsData(

  return { postsPerHour, relatedTags, displayTag }
 }
+