diff --git a/AGENTS.md b/AGENTS.md
index 8308e70..f4d0e4f 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -72,7 +72,7 @@ src/
calendar.ts # Fetches /{username}/calendar/ → extracts available years
year.ts # Fetches /{username}/{year}/ → extracts dates with entries
day.ts # Fetches /{username}/{year}/{mm}/{dd}/ → extracts entries
- comments.ts # Fetches /{post-id}.html?view=comments → extracts comments with nesting depth
+ comments.ts # Fetches /{post-id}.html?nojs=1&view=comments → extracts comments with nesting depth across modern (b-tree-twig) and S1 legacy (ljcmt{id}) themes
tui/
tty.ts # isTTY() — checks whether stdout is an interactive terminal
logger.ts # TuiLogger (extends Logger) — routes log calls through clack/spinners/progress
diff --git a/README.md b/README.md
index 6b68c3e..41e659b 100644
--- a/README.md
+++ b/README.md
@@ -203,7 +203,7 @@ src/
calendar.ts # Discovers available years
year.ts # Discovers entry dates in a year
day.ts # Extracts entries from a day page
- comments.ts # Extracts comments from an entry's comment page
+ comments.ts # Fetches /{post-id}.html?nojs=1&view=comments → extracts comments with nesting depth across modern (b-tree-twig) and S1 legacy (ljcmt{id}) themes
converters/
html-to-markdown.ts # HTML to Markdown conversion
writers/
diff --git a/src/scrapers/comments.ts b/src/scrapers/comments.ts
index 163f672..a3f1ed7 100644
--- a/src/scrapers/comments.ts
+++ b/src/scrapers/comments.ts
@@ -1,5 +1,5 @@
import * as cheerio from "cheerio";
-import type { AnyNode } from "domhandler";
+import type { AnyNode, Element } from "domhandler";
import { fetchWithRetry, sleep } from "../utils/http.ts";
import type { Logger } from "../utils/logger.ts";
@@ -20,6 +20,9 @@ const LJ_COMMENT_ACTION_PATTERNS = [
/b-pseudo/,
];
+// LJ S1 legacy themes indent threaded replies in 25px increments per nesting level
+const LEGACY_INDENT_PX = 25;
+
export async function scrapeComments(
entryUrl: string,
retries: number,
@@ -35,16 +38,18 @@ export async function scrapeComments(
export function buildCommentUrl(entryUrl: string): string {
const base = entryUrl.replace(/[?#].*$/, "");
- return `${base}?view=comments`;
+ // ?nojs=1 forces LJ to render the static, server-rendered comment tree.
+ // Without it, modern themes return an empty .b-tree-root and load comments via JS.
+ return `${base}?nojs=1&view=comments`;
}
export function extractCommentsFromHtml(html: string): Comment[] {
const $ = cheerio.load(html);
- const comments: Comment[] = [];
// Modern LJ theme: b-tree-twig wrappers inside b-tree-root
const twigs = $(".b-tree-twig");
if (twigs.length > 0) {
+ const comments: Comment[] = [];
twigs.each((_i, twig) => {
const $twig = $(twig);
const leaf = $twig.children(".b-leaf.comment").first();
@@ -57,20 +62,24 @@ export function extractCommentsFromHtml(html: string): Comment[] {
return comments;
}
- // Fallback: look for comment elements with thread links (older/custom themes)
- const threadLinks = $('a[href*="thread="]');
- if (threadLinks.length > 0) {
- return extractLegacyComments($);
+ // S1 legacy theme: each comment is a container whose id has the form "ljcmt{id}"
+ const legacyContainers = $('[id^="ljcmt"]');
+ if (legacyContainers.length > 0) {
+ const comments: Comment[] = [];
+ legacyContainers.each((_i, el) => {
+ const comment = parseLegacyComment($, $(el));
+ if (comment) comments.push(comment);
+ });
+ return comments;
}
- return comments;
+ return [];
}
function extractTwigDepth(className: string): number {
// b-tree-twig-N where N is the 1-based depth level
const match = /\bb-tree-twig-(\d+)\b/.exec(className);
if (match) return parseInt(match[1]!, 10) - 1; // convert to 0-based
- // Fallback: parse margin-left from style (30px per level)
return 0;
}
@@ -87,7 +96,7 @@ function parseModernComment(
// User profile link — the avatar/name link; anonymous has empty href
const userLinkEl = $leaf.find(".b-leaf-username a, .b-leaf-userpic-inner").first();
const rawUserUrl = userLinkEl.attr("href") ?? "";
- const userUrl = rawUserUrl && rawUserUrl !== "#" && rawUserUrl !== "" ? rawUserUrl : "";
+ const userUrl = rawUserUrl && rawUserUrl !== "#" ? rawUserUrl : "";
const permalinkEl = $leaf.find(".b-leaf-permalink").first();
const permalinkUrl = permalinkEl.attr("href") ?? "";
@@ -116,45 +125,198 @@ function isCommentActionLink(href: string, cls: string): boolean {
return false;
}
-function extractLegacyComments($: cheerio.CheerioAPI): Comment[] {
- // Older LJ themes may not use b-tree-twig; use thread link anchors as anchors
- const comments: Comment[] = [];
- const seen = new Set
();
- const threadPattern = /[?&]thread=(\d+)/;
+function parseLegacyComment(
+ $: cheerio.CheerioAPI,
+ $cmt: cheerio.Cheerio
+): Comment | null {
+ const idAttr = $cmt.attr("id") ?? "";
+ const threadNumeric = idAttr.replace(/^ljcmt/, "");
+ if (!threadNumeric) return null;
+ const id = `t${threadNumeric}`;
- $('a[href*="thread="]').each((_i, el) => {
- const href = $(el).attr("href") ?? "";
- const match = threadPattern.exec(href);
- if (!match) return;
- const threadId = `t${match[1]}`;
- if (seen.has(threadId)) return;
- seen.add(threadId);
-
- // Walk up to find the comment container
- let $container = $(el).closest("[id^='t']");
- if ($container.length === 0) {
- $container = $(el).closest("div, li, td");
+ const depth = extractLegacyDepth($cmt.attr("style") ?? "");
+
+ const $ljuser = $cmt.find("span.ljuser[data-ljuser]").first();
+ let username = ($ljuser.attr("data-ljuser") ?? "").trim();
+ let userUrl = "";
+ if (username) {
+ userUrl = ($ljuser.find("a.i-ljuser-username").first().attr("href") ?? "").trim();
+ if (!userUrl) {
+ // Fall back to any non-profile anchor inside the ljuser span
+ userUrl = ($ljuser.find("a").not(".i-ljuser-profile").first().attr("href") ?? "").trim();
}
- if ($container.length === 0) return;
-
- const username = $container.find('[class*="username"]').first().text().trim() || "Anonymous";
- const timestampText = $(el).text().trim();
- const $contentClone = $container.clone();
- $contentClone.find("a").each((_j, a) => {
- const aHref = $(a).attr("href") ?? "";
- if (/[?&]replyto=|[?&]mode=reply/.test(aHref)) $(a).remove();
- });
+ } else {
+ username = "Anonymous";
+ }
- comments.push({
- id: threadId,
- depth: 0,
- username,
- userUrl: "",
- timestampText,
- permalinkUrl: href,
- contentHtml: $contentClone.html()?.trim() ?? "",
+ const permalinkUrl = findLegacyPermalink($, $cmt, threadNumeric);
+ const timestampText = findLegacyTimestamp($, $cmt);
+ const contentHtml = extractLegacyBody($, $cmt, threadNumeric);
+
+ return { id, depth, username, userUrl, timestampText, permalinkUrl, contentHtml };
+}
+
+/**
+ * Derives a legacy comment's nesting depth from its inline style attribute.
+ *
+ * Reads an integer-pixel `margin-left` declaration and divides it by
+ * LEGACY_INDENT_PX, rounding to the nearest level. Yields 0 when no such
+ * declaration is present, or when the parsed value is zero or not finite.
+ *
+ * @param style - Raw value of the comment container's `style` attribute.
+ * @returns The nesting depth; 0 for a top-level comment.
+ */
+function extractLegacyDepth(style: string): number {
+  const marginMatch = style.match(/margin-left\s*:\s*(\d+)\s*px/i);
+  const marginPx = marginMatch ? parseInt(marginMatch[1]!, 10) : 0;
+  const usable = Number.isFinite(marginPx) && marginPx > 0;
+  return usable ? Math.round(marginPx / LEGACY_INDENT_PX) : 0;
+}
+
+/**
+ * Locates the permalink of an S1 legacy comment.
+ *
+ * Scans anchors inside the comment container whose href contains "thread=" and
+ * returns the first href that targets exactly this comment's thread id and does
+ * not match any LJ_COMMENT_ACTION_PATTERNS entry.
+ *
+ * @param $ - Cheerio API for the loaded document.
+ * @param $cmt - The comment container element.
+ * @param threadNumeric - The container id with its "ljcmt" prefix removed.
+ * @returns The permalink href, or "" when no anchor qualifies.
+ */
+function findLegacyPermalink(
+  $: cheerio.CheerioAPI,
+  $cmt: cheerio.Cheerio<AnyNode>,
+  threadNumeric: string
+): string {
+  // threadNumeric comes from scraped markup; escape it so an unexpected id
+  // containing regex metacharacters cannot throw or broaden the match.
+  const escapedId = threadNumeric.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+  // The trailing \b keeps thread=5 from matching thread=500.
+  const exactPattern = new RegExp(`[?&]thread=${escapedId}(?:#t${escapedId}\\b|\\b)`);
+  let permalinkUrl = "";
+  $cmt.find("a[href*='thread=']").each((_i, a) => {
+    const href = $(a).attr("href") ?? "";
+    if (!exactPattern.test(href)) return;
+    if (LJ_COMMENT_ACTION_PATTERNS.some((p) => p.test(href))) return;
+    permalinkUrl = href;
+    return false; // stop at the first qualifying anchor
+  });
+  return permalinkUrl;
+}
+
+/**
+ * Extracts the timestamp text from an S1 legacy comment.
+ *
+ * Looks at `span[title]` elements in the comment's header region and returns
+ * the trimmed text of the first one containing a four-digit year starting
+ * with 19 or 20.
+ *
+ * @param $ - Cheerio API for the loaded document.
+ * @param $cmt - The comment container element.
+ * @returns The timestamp text, or "" when no matching span is found.
+ */
+function findLegacyTimestamp(
+  $: cheerio.CheerioAPI,
+  $cmt: cheerio.Cheerio<AnyNode>
+): string {
+  // The timestamp is inside a <span title="…"> within the comment header.
+  // Scope to the header region (comment_bar_one / comment_bar_alt for one S1
+  // variant, the first row of the cmtbar table for the other) so user-authored
+  // body content like "We met in 2003" can't shadow the real timestamp.
+  let $headers: cheerio.Cheerio<AnyNode> = $cmt.find(".comment_bar_one, .comment_bar_alt");
+  if ($headers.length === 0) {
+    $headers = $cmt.find('table[id^="cmtbar"]').find("> tbody > tr, > tr").first();
+  }
+  // Last-resort fallback for unknown S1 layouts: search the whole comment.
+  // Re-introduces the body-shadowing risk, so prefer the scoped paths above.
+  if ($headers.length === 0) $headers = $cmt;
+
+  let timestampText = "";
+  $headers.find("span[title]").each((_i, sp) => {
+    const text = $(sp).text().trim();
+    if (/(?:19|20)\d{2}/.test(text)) {
+      timestampText = text;
+      return false; // first year-bearing span wins
+    }
+  });
+  return timestampText;
+}
+
+/**
+ * Produces the body HTML of an S1 legacy comment.
+ *
+ * Works on a clone of the comment container and strips everything that is not
+ * the commenter's own content: the named anchor, header bars, the rows of a
+ * cmtbar table that hold header or footer markup, reply/parent footer divs,
+ * quick-reply containers, form controls and this comment's own permalink.
+ *
+ * @param $ - Cheerio API for the loaded document.
+ * @param $cmt - The comment container element; it is cloned, not mutated.
+ * @param threadNumeric - The container id with its "ljcmt" prefix removed.
+ * @returns The trimmed inner HTML that remains after stripping.
+ */
+function extractLegacyBody(
+  $: cheerio.CheerioAPI,
+  $cmt: cheerio.Cheerio<AnyNode>,
+  threadNumeric: string
+): string {
+  const $clone = $cmt.clone();
+
+  // Drop the named anchor target (<a name="t…">) — has no body content
+  $clone.find('a[name^="t"]').remove();
+
+  // Strip metadata header used by some S1 themes ("comment_bar_one" / "comment_bar_alt"
+  // contain the userpic + From/Date sub-table; not part of the body)
+  $clone.find(".comment_bar_one, .comment_bar_alt").remove();
+
+  // For themes that wrap the entire comment (header + body + footer) in a single
+  // <table id="cmtbar…">, surgically extract just the body row(s). Walk only the
+  // outer rows — find("tr") would also descend into the inner metadata sub-table.
+  $clone.find(`table[id^="cmtbar"]`).each((_i, tbl) => {
+    const $table = $(tbl);
+    const bodyHtmlParts: string[] = [];
+    const $rows = $table.find("> tbody > tr, > thead > tr, > tfoot > tr, > tr");
+    $rows.each((_j, tr) => {
+      const $tr = $(tr);
+      // Skip header rows (contain a sub-table or userpic/ljuser markup)
+      if ($tr.find("table, span.ljuser, .i-ljuser, img.i-ljuser-userhead").length > 0) return;
+      // Skip footer rows (Reply/Parent/Thread links)
+      if (rowHasReplyOrParentLink($, $tr)) return;
+      $tr.find("> td, > th").each((_k, cell) => {
+        bodyHtmlParts.push($(cell).html() ?? "");
+      });
});
+    const replacement = bodyHtmlParts.join(" ").trim();
+    $table.replaceWith(replacement);
+  });
+
+  // Strip any empty wrapper divs that previously contained the metadata table
+  $clone.find('div[align="right"].entry').each((_i, d) => {
+    const $d = $(d);
+    if (($d.text() ?? "").trim() === "" && $d.find("img").length === 0) $d.remove();
+  });
+
+  // Strip footer-only divs containing reply/parent/thread links
+  $clone.find("div").each((_i, d) => {
+    const $d = $(d);
+    if (!divIsLegacyFooter($, $d)) return;
+    $d.remove();
+  });
+
+  // Hidden quick-reply containers
+  $clone.find('[id^="ljqrt"], [id="ljqrttopcomment"], [id="ljqrtbottomcomment"]').remove();
+  $clone.find("form, input, button, select, textarea").remove();
+
+  // Strip orphan permalink anchors (e.g. a leftover bare "(Link)" line). Use a
+  // word-boundary regex so a body link to ?thread=500 isn't matched when this
+  // comment's id is "5". The id comes from scraped markup, so escape it first:
+  // an id containing regex metacharacters would otherwise throw or over-match.
+  const escapedId = threadNumeric.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+  const exactThreadPattern = new RegExp(`[?&]thread=${escapedId}(?:#t${escapedId}\\b|\\b)`);
+  $clone.find("a[href*='thread=']").each((_i, a) => {
+    const href = $(a).attr("href") ?? "";
+    if (exactThreadPattern.test(href)) {
+      $(a).remove();
+    }
+  });
+  // After removing those, prune empty <strong>(...)</strong> wrappers left behind
+  $clone.find("strong").each((_i, s) => {
+    const $s = $(s);
+    const text = ($s.text() ?? "").replace(/[\s()]/g, "");
+    if (text === "") $s.remove();
+  });
+
+  return ($clone.html() ?? "").trim();
+}
+
+/**
+ * Decides whether a div is a legacy comment footer that can be dropped.
+ *
+ * A div qualifies when it contains at least one anchor whose href carries
+ * `replyto=` or `mode=reply`, and nothing remains of its text once every
+ * anchor's text, whitespace and the characters ( ) | · • are removed.
+ *
+ * @param $ - Cheerio API for the loaded document.
+ * @param $d - The div to inspect.
+ * @returns true when the div holds only footer links and punctuation.
+ */
+function divIsLegacyFooter(
+  $: cheerio.CheerioAPI,
+  $d: cheerio.Cheerio<AnyNode>
+): boolean {
+  const $links = $d.find("a");
+  if ($links.length === 0) return false;
+  // Footer divs only contain Reply/Parent/Thread/Link anchors and decorative punctuation
+  const hasReply = $links.toArray().some((a) => {
+    const href = $(a as Element).attr("href") ?? "";
+    return /[?&]replyto=|[?&]mode=reply/i.test(href);
+  });
+  if (!hasReply) return false;
+  // Don't yank a div that also has substantial non-link prose
+  let nonLinkText = ($d.text() ?? "").trim();
+  $links.toArray().forEach((a) => {
+    const linkText = ($(a as Element).text() ?? "").trim();
+    if (linkText) nonLinkText = nonLinkText.replaceAll(linkText, "");
});
+  nonLinkText = nonLinkText.replace(/[()|·•\s]/g, "");
+  return nonLinkText.length === 0;
+}
- return comments;
+/**
+ * Reports whether a table row contains an LJ footer link.
+ *
+ * A link counts when its href carries `replyto=` or `mode=reply`, or when its
+ * href carries `thread=` and its text is exactly Reply, Parent, Thread or Link
+ * (case-insensitive).
+ *
+ * @param $ - Cheerio API for the loaded document.
+ * @param $tr - The row to inspect.
+ * @returns true when at least one anchor in the row is a footer link.
+ */
+function rowHasReplyOrParentLink(
+  $: cheerio.CheerioAPI,
+  $tr: cheerio.Cheerio<AnyNode>
+): boolean {
+  const links = $tr.find("a").toArray();
+  if (links.length === 0) return false;
+  // The text-only match (Parent/Thread/Link) is gated on a thread= href so a
+  // commenter who wrote <a>Link</a> with an unrelated href can't make their
+  // body row look like an LJ footer. The Reply arm is gated by replyto/mode.
+  return links.some((a) => {
+    const $a = $(a as Element);
+    const href = $a.attr("href") ?? "";
+    if (/[?&]replyto=|[?&]mode=reply/i.test(href)) return true;
+    if (!/[?&]thread=/i.test(href)) return false;
+    const text = ($a.text() ?? "").trim();
+    return /^(?:Reply|Parent|Thread|Link)$/i.test(text);
+  });
}
diff --git a/tests/scrapers/comments.test.ts b/tests/scrapers/comments.test.ts
index 1d34c4e..04c98ad 100644
--- a/tests/scrapers/comments.test.ts
+++ b/tests/scrapers/comments.test.ts
@@ -93,19 +93,19 @@ const NO_COMMENTS_HTML = `
`;
describe("buildCommentUrl", () => {
- it("appends ?view=comments to a plain entry URL", () => {
+ it("appends ?nojs=1&view=comments to a plain entry URL", () => {
expect(buildCommentUrl("https://user.livejournal.com/12345.html"))
- .toBe("https://user.livejournal.com/12345.html?view=comments");
+ .toBe("https://user.livejournal.com/12345.html?nojs=1&view=comments");
});
it("strips existing query params before appending", () => {
expect(buildCommentUrl("https://user.livejournal.com/12345.html?view=flat"))
- .toBe("https://user.livejournal.com/12345.html?view=comments");
+ .toBe("https://user.livejournal.com/12345.html?nojs=1&view=comments");
});
it("strips fragment before appending", () => {
expect(buildCommentUrl("https://user.livejournal.com/12345.html#comments"))
- .toBe("https://user.livejournal.com/12345.html?view=comments");
+ .toBe("https://user.livejournal.com/12345.html?nojs=1&view=comments");
});
});
@@ -181,52 +181,283 @@ describe("extractCommentsFromHtml", () => {
});
});
-const LEGACY_COMMENTS_HTML = `
+// Mirrors the LJ S1 "comment_bar_one" theme: each comment is a
+// containing a header
Legacy comment content here.
- Reply -Another legacy comment.
- Reply -