diff --git a/AGENTS.md b/AGENTS.md index 8308e70..f4d0e4f 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -72,7 +72,7 @@ src/ calendar.ts # Fetches /{username}/calendar/ → extracts available years year.ts # Fetches /{username}/{year}/ → extracts dates with entries day.ts # Fetches /{username}/{year}/{mm}/{dd}/ → extracts entries - comments.ts # Fetches /{post-id}.html?view=comments → extracts comments with nesting depth + comments.ts # Fetches /{post-id}.html?nojs=1&view=comments → extracts comments with nesting depth across modern (b-tree-twig) and S1 legacy (ljcmt{id}) themes tui/ tty.ts # isTTY() — checks whether stdout is an interactive terminal logger.ts # TuiLogger (extends Logger) — routes log calls through clack/spinners/progress diff --git a/README.md b/README.md index 6b68c3e..41e659b 100644 --- a/README.md +++ b/README.md @@ -203,7 +203,7 @@ src/ calendar.ts # Discovers available years year.ts # Discovers entry dates in a year day.ts # Extracts entries from a day page - comments.ts # Extracts comments from an entry's comment page + comments.ts # Fetches /{post-id}.html?nojs=1&view=comments → extracts comments with nesting depth across modern (b-tree-twig) and S1 legacy (ljcmt{id}) themes converters/ html-to-markdown.ts # HTML to Markdown conversion writers/ diff --git a/src/scrapers/comments.ts b/src/scrapers/comments.ts index 163f672..a3f1ed7 100644 --- a/src/scrapers/comments.ts +++ b/src/scrapers/comments.ts @@ -1,5 +1,5 @@ import * as cheerio from "cheerio"; -import type { AnyNode } from "domhandler"; +import type { AnyNode, Element } from "domhandler"; import { fetchWithRetry, sleep } from "../utils/http.ts"; import type { Logger } from "../utils/logger.ts"; @@ -20,6 +20,9 @@ const LJ_COMMENT_ACTION_PATTERNS = [ /b-pseudo/, ]; +// LJ S1 legacy themes indent threaded replies in 25px increments per nesting level +const LEGACY_INDENT_PX = 25; + export async function scrapeComments( entryUrl: string, retries: number, @@ -35,16 +38,18 @@ export async function 
scrapeComments( export function buildCommentUrl(entryUrl: string): string { const base = entryUrl.replace(/[?#].*$/, ""); - return `${base}?view=comments`; + // ?nojs=1 forces LJ to render the static, server-rendered comment tree. + // Without it, modern themes return an empty .b-tree-root and load comments via JS. + return `${base}?nojs=1&view=comments`; } export function extractCommentsFromHtml(html: string): Comment[] { const $ = cheerio.load(html); - const comments: Comment[] = []; // Modern LJ theme: b-tree-twig wrappers inside b-tree-root const twigs = $(".b-tree-twig"); if (twigs.length > 0) { + const comments: Comment[] = []; twigs.each((_i, twig) => { const $twig = $(twig); const leaf = $twig.children(".b-leaf.comment").first(); @@ -57,20 +62,24 @@ export function extractCommentsFromHtml(html: string): Comment[] { return comments; } - // Fallback: look for comment elements with thread links (older/custom themes) - const threadLinks = $('a[href*="thread="]'); - if (threadLinks.length > 0) { - return extractLegacyComments($); + // S1 legacy theme:
+ const legacyContainers = $('[id^="ljcmt"]'); + if (legacyContainers.length > 0) { + const comments: Comment[] = []; + legacyContainers.each((_i, el) => { + const comment = parseLegacyComment($, $(el)); + if (comment) comments.push(comment); + }); + return comments; } - return comments; + return []; } function extractTwigDepth(className: string): number { // b-tree-twig-N where N is the 1-based depth level const match = /\bb-tree-twig-(\d+)\b/.exec(className); if (match) return parseInt(match[1]!, 10) - 1; // convert to 0-based - // Fallback: parse margin-left from style (30px per level) return 0; } @@ -87,7 +96,7 @@ function parseModernComment( // User profile link — the avatar/name link; anonymous has empty href const userLinkEl = $leaf.find(".b-leaf-username a, .b-leaf-userpic-inner").first(); const rawUserUrl = userLinkEl.attr("href") ?? ""; - const userUrl = rawUserUrl && rawUserUrl !== "#" && rawUserUrl !== "" ? rawUserUrl : ""; + const userUrl = rawUserUrl && rawUserUrl !== "#" ? rawUserUrl : ""; const permalinkEl = $leaf.find(".b-leaf-permalink").first(); const permalinkUrl = permalinkEl.attr("href") ?? ""; @@ -116,45 +125,198 @@ function isCommentActionLink(href: string, cls: string): boolean { return false; } -function extractLegacyComments($: cheerio.CheerioAPI): Comment[] { - // Older LJ themes may not use b-tree-twig; use thread link anchors as anchors - const comments: Comment[] = []; - const seen = new Set(); - const threadPattern = /[?&]thread=(\d+)/; +function parseLegacyComment( + $: cheerio.CheerioAPI, + $cmt: cheerio.Cheerio +): Comment | null { + const idAttr = $cmt.attr("id") ?? ""; + const threadNumeric = idAttr.replace(/^ljcmt/, ""); + if (!threadNumeric) return null; + const id = `t${threadNumeric}`; - $('a[href*="thread="]').each((_i, el) => { - const href = $(el).attr("href") ?? 
""; - const match = threadPattern.exec(href); - if (!match) return; - const threadId = `t${match[1]}`; - if (seen.has(threadId)) return; - seen.add(threadId); - - // Walk up to find the comment container - let $container = $(el).closest("[id^='t']"); - if ($container.length === 0) { - $container = $(el).closest("div, li, td"); + const depth = extractLegacyDepth($cmt.attr("style") ?? ""); + + const $ljuser = $cmt.find("span.ljuser[data-ljuser]").first(); + let username = ($ljuser.attr("data-ljuser") ?? "").trim(); + let userUrl = ""; + if (username) { + userUrl = ($ljuser.find("a.i-ljuser-username").first().attr("href") ?? "").trim(); + if (!userUrl) { + // Fall back to any non-profile anchor inside the ljuser span + userUrl = ($ljuser.find("a").not(".i-ljuser-profile").first().attr("href") ?? "").trim(); } - if ($container.length === 0) return; - - const username = $container.find('[class*="username"]').first().text().trim() || "Anonymous"; - const timestampText = $(el).text().trim(); - const $contentClone = $container.clone(); - $contentClone.find("a").each((_j, a) => { - const aHref = $(a).attr("href") ?? ""; - if (/[?&]replyto=|[?&]mode=reply/.test(aHref)) $(a).remove(); - }); + } else { + username = "Anonymous"; + } - comments.push({ - id: threadId, - depth: 0, - username, - userUrl: "", - timestampText, - permalinkUrl: href, - contentHtml: $contentClone.html()?.trim() ?? 
"", + const permalinkUrl = findLegacyPermalink($, $cmt, threadNumeric); + const timestampText = findLegacyTimestamp($, $cmt); + const contentHtml = extractLegacyBody($, $cmt, threadNumeric); + + return { id, depth, username, userUrl, timestampText, permalinkUrl, contentHtml }; +} + +function extractLegacyDepth(style: string): number { + const match = /margin-left\s*:\s*(\d+)\s*px/i.exec(style); + if (!match) return 0; + const px = parseInt(match[1]!, 10); + if (!Number.isFinite(px) || px <= 0) return 0; + return Math.round(px / LEGACY_INDENT_PX); +} + +function findLegacyPermalink( + $: cheerio.CheerioAPI, + $cmt: cheerio.Cheerio, + threadNumeric: string +): string { + const exactPattern = new RegExp(`[?&]thread=${threadNumeric}(?:#t${threadNumeric}\\b|\\b)`); + let permalinkUrl = ""; + $cmt.find("a[href*='thread=']").each((_i, a) => { + const href = $(a).attr("href") ?? ""; + if (!exactPattern.test(href)) return; + if (LJ_COMMENT_ACTION_PATTERNS.some((p) => p.test(href))) return; + permalinkUrl = href; + return false; + }); + return permalinkUrl; +} + +function findLegacyTimestamp( + $: cheerio.CheerioAPI, + $cmt: cheerio.Cheerio +): string { + // The timestamp is inside a within the comment header. + // Scope to the header region (comment_bar_one for one S1 variant, the first + // row of the cmtbar table for the other) so user-authored body content like + // "We met in 2003" can't shadow the real timestamp. + let $headers: cheerio.Cheerio = $cmt.find(".comment_bar_one, .comment_bar_alt"); + if ($headers.length === 0) { + $headers = $cmt.find('table[id^="cmtbar"]').find("> tbody > tr, > tr").first(); + } + // Last-resort fallback for unknown S1 layouts: search the whole comment. + // Re-introduces the body-shadowing risk, so prefer the scoped paths above. 
+ if ($headers.length === 0) $headers = $cmt; + + let timestampText = ""; + $headers.find("span[title]").each((_i, sp) => { + const text = $(sp).text().trim(); + if (/(?:19|20)\d{2}/.test(text)) { + timestampText = text; + return false; + } + }); + return timestampText; +} + +function extractLegacyBody( + $: cheerio.CheerioAPI, + $cmt: cheerio.Cheerio, + threadNumeric: string +): string { + const $clone = $cmt.clone(); + + // Drop the named anchor target () — has no body content + $clone.find('a[name^="t"]').remove(); + + // Strip metadata header used by some S1 themes ("comment_bar_one" / "comment_bar_alt" + // contain the userpic + From/Date sub-table; not part of the body) + $clone.find(".comment_bar_one, .comment_bar_alt").remove(); + + // For themes that wrap the entire comment (header + body + footer) in a single + // , surgically extract just the body row(s). Walk only the + // outer rows — find("tr") would also descend into the inner metadata sub-table. + $clone.find(`table[id^="cmtbar"]`).each((_i, tbl) => { + const $table = $(tbl); + const bodyHtmlParts: string[] = []; + const $rows = $table.find("> tbody > tr, > thead > tr, > tfoot > tr, > tr"); + $rows.each((_j, tr) => { + const $tr = $(tr); + // Skip header rows (contain a sub-table or userpic/ljuser markup) + if ($tr.find("table, span.ljuser, .i-ljuser, img.i-ljuser-userhead").length > 0) return; + // Skip footer rows (Reply/Parent/Thread links) + if (rowHasReplyOrParentLink($, $tr)) return; + $tr.find("> td, > th").each((_k, cell) => { + bodyHtmlParts.push($(cell).html() ?? ""); + }); }); + const replacement = bodyHtmlParts.join(" ").trim(); + $table.replaceWith(replacement); + }); + + // Strip any empty wrapper divs that previously contained the metadata table + $clone.find('div[align="right"].entry').each((_i, d) => { + const $d = $(d); + if (($d.text() ?? 
"").trim() === "" && $d.find("img").length === 0) $d.remove(); + }); + + // Strip footer-only divs containing reply/parent/thread links + $clone.find("div").each((_i, d) => { + const $d = $(d); + if (!divIsLegacyFooter($, $d)) return; + $d.remove(); + }); + + // Hidden quick-reply containers + $clone.find('[id^="ljqrt"], [id="ljqrttopcomment"], [id="ljqrtbottomcomment"]').remove(); + $clone.find("form, input, button, select, textarea").remove(); + + // Strip orphan permalink anchors (e.g. a leftover bare "(Link)" line). Use a + // word-boundary regex so a body link to ?thread=500 isn't matched when this + // comment's id is "5". + const exactThreadPattern = new RegExp(`[?&]thread=${threadNumeric}(?:#t${threadNumeric}\\b|\\b)`); + $clone.find("a[href*='thread=']").each((_i, a) => { + const href = $(a).attr("href") ?? ""; + if (exactThreadPattern.test(href)) { + $(a).remove(); + } + }); + // After removing those, prune empty (...) wrappers left behind + $clone.find("strong").each((_i, s) => { + const $s = $(s); + const text = ($s.text() ?? "").replace(/[\s()]/g, ""); + if (text === "") $s.remove(); + }); + + return ($clone.html() ?? "").trim(); +} + +function divIsLegacyFooter( + $: cheerio.CheerioAPI, + $d: cheerio.Cheerio +): boolean { + const $links = $d.find("a"); + if ($links.length === 0) return false; + // Footer divs only contain Reply/Parent/Thread/Link anchors and decorative punctuation + const hasReply = $links.toArray().some((a) => { + const href = $(a as Element).attr("href") ?? ""; + return /[?&]replyto=|[?&]mode=reply/i.test(href); + }); + if (!hasReply) return false; + // Don't yank a div that also has substantial non-link prose + let nonLinkText = ($d.text() ?? "").trim(); + $links.toArray().forEach((a) => { + const linkText = ($(a as Element).text() ?? 
"").trim(); + if (linkText) nonLinkText = nonLinkText.replaceAll(linkText, ""); }); + nonLinkText = nonLinkText.replace(/[()|·•\s]/g, ""); + return nonLinkText.length === 0; +} - return comments; +function rowHasReplyOrParentLink( + $: cheerio.CheerioAPI, + $tr: cheerio.Cheerio +): boolean { + const links = $tr.find("a").toArray(); + if (links.length === 0) return false; + // The text-only match (Parent/Thread/Link) is gated on a thread= href so a + // commenter who wrote Link with an unrelated href can't make their + // body row look like an LJ footer. The Reply arm is gated by replyto/mode. + return links.some((a) => { + const $a = $(a as Element); + const href = $a.attr("href") ?? ""; + if (/[?&]replyto=|[?&]mode=reply/i.test(href)) return true; + if (!/[?&]thread=/i.test(href)) return false; + const text = ($a.text() ?? "").trim(); + return /^(?:Reply|Parent|Thread|Link)$/i.test(text); + }); } diff --git a/tests/scrapers/comments.test.ts b/tests/scrapers/comments.test.ts index 1d34c4e..04c98ad 100644 --- a/tests/scrapers/comments.test.ts +++ b/tests/scrapers/comments.test.ts @@ -93,19 +93,19 @@ const NO_COMMENTS_HTML = ` `; describe("buildCommentUrl", () => { - it("appends ?view=comments to a plain entry URL", () => { + it("appends ?nojs=1&view=comments to a plain entry URL", () => { expect(buildCommentUrl("https://user.livejournal.com/12345.html")) - .toBe("https://user.livejournal.com/12345.html?view=comments"); + .toBe("https://user.livejournal.com/12345.html?nojs=1&view=comments"); }); it("strips existing query params before appending", () => { expect(buildCommentUrl("https://user.livejournal.com/12345.html?view=flat")) - .toBe("https://user.livejournal.com/12345.html?view=comments"); + .toBe("https://user.livejournal.com/12345.html?nojs=1&view=comments"); }); it("strips fragment before appending", () => { expect(buildCommentUrl("https://user.livejournal.com/12345.html#comments")) - .toBe("https://user.livejournal.com/12345.html?view=comments"); + 
.toBe("https://user.livejournal.com/12345.html?nojs=1&view=comments"); }); }); @@ -181,52 +181,283 @@ describe("extractCommentsFromHtml", () => { }); }); -const LEGACY_COMMENTS_HTML = ` +// Mirrors the LJ S1 "comment_bar_one" theme: each comment is a
"ljcmt{id}" container element +// containing a header element (class "comment_bar_one")
 with the userpic + From/Date table, +// then a sibling element
 for the body, then a small footer div with reply/thread links. +const LEGACY_COMMENT_BAR_HTML = ` -
- - - - - - -
-
olduser
- January 5 2003, 12:00:00 UTC -

Legacy comment content here.

- Reply -
-
otheruser
- January 5 2003, 14:00:00 UTC -

Another legacy comment.

- Reply -
+
+
+ +
+ +
+ + + + +
From:commenter1
Date:June 18th, 2003 09:57 am (UTC)
+
(Link)
+
+
ur no fun!
+ +
+
+ +
+ +
+ + + + +
From:commenter2
Date:June 18th, 2003 07:34 pm (UTC)
+
(Link)
+
+
second reply on this entry
+ +
+
+ + +`; + +// Mirrors the LJ S1 "cmtbar table" theme used by some custom layouts: the entire +// comment (header + body + footer) lives inside a single "cmtbar{id}" table. +const LEGACY_CMTBAR_HTML = ` + + + +
+ +
+
+ + + +
+ + + +
commenter3
Subject:Re:
Link:(Link)
Time:2004-02-09 05:46 am (UTC)
+
top-level body content
(Reply) (Thread)
+
+ +
+ +
+ + + + +
+ + + +
commenter4
Subject:Re:
Link:(Link)
Time:2004-02-10 02:17 am (UTC)
+
nested reply content
(Reply) (Parent) (Thread)
+
+
`; -describe("extractCommentsFromHtml legacy fallback", () => { - it("extracts comments from pages without b-tree-twig structure", () => { - const comments = extractCommentsFromHtml(LEGACY_COMMENTS_HTML); +describe("extractCommentsFromHtml S1 legacy comment_bar layout", () => { + it("extracts comments from comment_bar_one structure", () => { + const comments = extractCommentsFromHtml(LEGACY_COMMENT_BAR_HTML); expect(comments.length).toBe(2); }); - it("extracts thread id from thread= URL without #t anchor", () => { - const comments = extractCommentsFromHtml(LEGACY_COMMENTS_HTML); + it("extracts thread id from ljcmt{id} container", () => { + const comments = extractCommentsFromHtml(LEGACY_COMMENT_BAR_HTML); expect(comments[0]?.id).toBe("t500"); + expect(comments[1]?.id).toBe("t600"); }); - it("extracts thread id from thread= URL with #t anchor", () => { - const comments = extractCommentsFromHtml(LEGACY_COMMENTS_HTML); - expect(comments[1]?.id).toBe("t600"); + it("extracts username from data-ljuser attribute", () => { + const comments = extractCommentsFromHtml(LEGACY_COMMENT_BAR_HTML); + expect(comments[0]?.username).toBe("commenter1"); + expect(comments[1]?.username).toBe("commenter2"); }); - it("strips reply links from legacy comment content", () => { - const comments = extractCommentsFromHtml(LEGACY_COMMENTS_HTML); - for (const comment of comments) { - expect(comment.contentHtml).not.toContain("replyto="); + it("extracts user profile URL from i-ljuser-username anchor", () => { + const comments = extractCommentsFromHtml(LEGACY_COMMENT_BAR_HTML); + expect(comments[0]?.userUrl).toBe("https://commenter1.livejournal.com/"); + }); + + it("extracts visible date text from the title-bearing span", () => { + const comments = extractCommentsFromHtml(LEGACY_COMMENT_BAR_HTML); + expect(comments[0]?.timestampText).toBe("June 18th, 2003 09:57 am (UTC)"); + }); + + it("extracts permalink URL pointing at the comment thread", () => { + const comments = 
extractCommentsFromHtml(LEGACY_COMMENT_BAR_HTML); + expect(comments[0]?.permalinkUrl).toBe("https://author.livejournal.com/99.html?thread=500#t500"); + }); + + it("captures the body text and excludes header/footer artifacts", () => { + const comments = extractCommentsFromHtml(LEGACY_COMMENT_BAR_HTML); + expect(comments[0]?.contentHtml).toContain("ur no fun!"); + expect(comments[0]?.contentHtml).not.toContain("From:"); + expect(comments[0]?.contentHtml).not.toContain("replyto="); + expect(comments[0]?.contentHtml).not.toContain("Reply"); + expect(comments[0]?.contentHtml).not.toContain("Thread"); + expect(comments[0]?.contentHtml).not.toContain("data-ljuser"); + }); + + it("computes nesting depth from margin-left in comment_bar_one variant", () => { + const indented = LEGACY_COMMENT_BAR_HTML.replace( + `id="ljcmt600" style="margin-left: 0px; margin-top: 5px"`, + `id="ljcmt600" style="margin-left: 25px; margin-top: 5px"` + ); + const comments = extractCommentsFromHtml(indented); + expect(comments[0]?.depth).toBe(0); + expect(comments[1]?.depth).toBe(1); + }); + + it("ignores year-bearing spans in the body when extracting timestamp", () => { + const html = ` +
+ +
+
+ + + + +
From:commenter9
Date:March 1st, 2005 09:00 am (UTC)
+
+
+
we met in 2003 and it was great
+
+ `; + const comments = extractCommentsFromHtml(html); + expect(comments[0]?.timestampText).toBe("March 1st, 2005 09:00 am (UTC)"); + }); + + it("treats anonymous comments as Anonymous when no ljuser span is present", () => { + const html = ` +
+ +
+
+ +
From:(Anonymous)
Date:January 1st, 2004 12:00:00 (UTC)
(Link)
+
+
anon body
+
+ `; + const comments = extractCommentsFromHtml(html); + expect(comments[0]?.username).toBe("Anonymous"); + expect(comments[0]?.userUrl).toBe(""); + expect(comments[0]?.contentHtml).toContain("anon body"); + }); + + it("does not strip body links whose thread id is a substring of this comment's id", () => { + // Comment id is t5; body has a link to ?thread=500 — must survive. + const html = ` +
+ +
+
+ +
From:commenter10
Date:June 18th, 2003 09:57 am (UTC)
+
(Link)
+
+
see this other thread for context
+
+ `; + const comments = extractCommentsFromHtml(html); + expect(comments[0]?.contentHtml).toContain("thread=500"); + expect(comments[0]?.contentHtml).toContain("this other thread"); + }); +}); + +describe("extractCommentsFromHtml S1 legacy cmtbar layout", () => { + it("extracts comments wrapped in cmtbar{id} table structure", () => { + const comments = extractCommentsFromHtml(LEGACY_CMTBAR_HTML); + expect(comments.length).toBe(2); + }); + + it("computes nesting depth from margin-left style", () => { + const comments = extractCommentsFromHtml(LEGACY_CMTBAR_HTML); + expect(comments[0]?.depth).toBe(0); + expect(comments[1]?.depth).toBe(1); + }); + + it("captures the body row content from inside cmtbar table", () => { + const comments = extractCommentsFromHtml(LEGACY_CMTBAR_HTML); + expect(comments[0]?.contentHtml).toContain("top-level body content"); + expect(comments[1]?.contentHtml).toContain("nested reply content"); + }); + + it("excludes metadata and footer from cmtbar body", () => { + const comments = extractCommentsFromHtml(LEGACY_CMTBAR_HTML); + for (const c of comments) { + expect(c.contentHtml).not.toContain("Subject:"); + expect(c.contentHtml).not.toContain("Time:"); + expect(c.contentHtml).not.toContain("replyto="); + expect(c.contentHtml).not.toContain("Parent"); + expect(c.contentHtml).not.toContain("data-ljuser"); } }); + + it("extracts permalink url pointing at the right thread id", () => { + const comments = extractCommentsFromHtml(LEGACY_CMTBAR_HTML); + expect(comments[0]?.permalinkUrl).toContain("thread=700"); + expect(comments[1]?.permalinkUrl).toContain("thread=750"); + }); + + it("does not treat user-authored Link in body as a footer row", () => { + // The body row contains a user-authored anchor with text "Link" pointing at + // an unrelated URL. The footer-row classifier must not mistake it for the + // LJ "(Link)" permalink and skip the row. + const html = ` +
+ +
+ + + + +
+ + + +
commenter11
Subject:Re:
Link:(Link)
Time:2004-02-15 10:00 am (UTC)
+
here is a Link for more context
(Reply) (Thread)
+
+
+ `; + const comments = extractCommentsFromHtml(html); + expect(comments[0]?.contentHtml).toContain("for more context"); + expect(comments[0]?.contentHtml).toContain("https://example.com/article"); + }); + + it("treats anonymous comments in cmtbar layout as Anonymous when no ljuser span is present", () => { + const html = ` +
+ +
+ + + + +
+ + + +
(Anonymous)
Subject:(no subject)
Link:(Link)
Time:2004-02-15 09:00 am (UTC)
+
cmtbar anon body
(Reply) (Thread)
+
+
+ `; + const comments = extractCommentsFromHtml(html); + expect(comments[0]?.username).toBe("Anonymous"); + expect(comments[0]?.userUrl).toBe(""); + expect(comments[0]?.contentHtml).toContain("cmtbar anon body"); + expect(comments[0]?.timestampText).toBe("2004-02-15 09:00 am (UTC)"); + }); });