Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ src/
calendar.ts # Fetches /{username}/calendar/ → extracts available years
year.ts # Fetches /{username}/{year}/ → extracts dates with entries
day.ts # Fetches /{username}/{year}/{mm}/{dd}/ → extracts entries
comments.ts # Fetches /{post-id}.html?view=comments → extracts comments with nesting depth
comments.ts # Fetches /{post-id}.html?nojs=1&view=comments → extracts comments with nesting depth across modern (b-tree-twig) and S1 legacy (ljcmt{id}) themes
tui/
tty.ts # isTTY() — checks whether stdout is an interactive terminal
logger.ts # TuiLogger (extends Logger) — routes log calls through clack/spinners/progress
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ src/
calendar.ts # Discovers available years
year.ts # Discovers entry dates in a year
day.ts # Extracts entries from a day page
comments.ts # Extracts comments from an entry's comment page
comments.ts # Fetches /{post-id}.html?nojs=1&view=comments → extracts comments with nesting depth across modern (b-tree-twig) and S1 legacy (ljcmt{id}) themes
converters/
html-to-markdown.ts # HTML to Markdown conversion
writers/
Expand Down
252 changes: 207 additions & 45 deletions src/scrapers/comments.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import * as cheerio from "cheerio";
import type { AnyNode } from "domhandler";
import type { AnyNode, Element } from "domhandler";
import { fetchWithRetry, sleep } from "../utils/http.ts";
import type { Logger } from "../utils/logger.ts";

Expand All @@ -20,6 +20,9 @@ const LJ_COMMENT_ACTION_PATTERNS = [
/b-pseudo/,
];

// Horizontal indent, in pixels, that LJ S1 legacy themes add per nesting level of a
// threaded reply (25px increments); used as the divisor when recovering comment depth.
const LEGACY_INDENT_PX = 25;

export async function scrapeComments(
entryUrl: string,
retries: number,
Expand All @@ -35,16 +38,18 @@ export async function scrapeComments(

/**
 * Builds the URL of the static comment page for a LiveJournal entry.
 *
 * Any existing query string or fragment on `entryUrl` is discarded before the
 * comment-view parameters are appended.
 *
 * @param entryUrl - URL of the entry page; may carry a query string and/or fragment.
 * @returns The entry URL with `?nojs=1&view=comments` as its only query string.
 */
export function buildCommentUrl(entryUrl: string): string {
  const base = entryUrl.replace(/[?#].*$/, "");
  // ?nojs=1 forces LJ to render the static, server-rendered comment tree.
  // Without it, modern themes return an empty .b-tree-root and load comments via JS.
  return `${base}?nojs=1&view=comments`;
}

export function extractCommentsFromHtml(html: string): Comment[] {
const $ = cheerio.load(html);
const comments: Comment[] = [];

// Modern LJ theme: b-tree-twig wrappers inside b-tree-root
const twigs = $(".b-tree-twig");
if (twigs.length > 0) {
const comments: Comment[] = [];
twigs.each((_i, twig) => {
const $twig = $(twig);
const leaf = $twig.children(".b-leaf.comment").first();
Expand All @@ -57,20 +62,24 @@ export function extractCommentsFromHtml(html: string): Comment[] {
return comments;
}

// Fallback: look for comment elements with thread links (older/custom themes)
const threadLinks = $('a[href*="thread="]');
if (threadLinks.length > 0) {
return extractLegacyComments($);
// S1 legacy theme: <div id="ljcmt{id}" style="margin-left: Npx">
const legacyContainers = $('[id^="ljcmt"]');
if (legacyContainers.length > 0) {
const comments: Comment[] = [];
legacyContainers.each((_i, el) => {
const comment = parseLegacyComment($, $(el));
if (comment) comments.push(comment);
});
return comments;
}

return comments;
return [];
}

/**
 * Derives a 0-based nesting depth from a twig wrapper's class attribute.
 *
 * Looks for a `b-tree-twig-N` class token, where N is the 1-based depth level.
 *
 * @param className - The raw `class` attribute value of the twig element.
 * @returns N - 1 when such a token is present, otherwise 0.
 */
function extractTwigDepth(className: string): number {
  const depthMatch = className.match(/\bb-tree-twig-(\d+)\b/);
  // No depth token at all: treat the comment as top-level.
  if (depthMatch === null) return 0;
  // Convert the 1-based level to a 0-based depth.
  return Number.parseInt(depthMatch[1]!, 10) - 1;
}

Expand All @@ -87,7 +96,7 @@ function parseModernComment(
// User profile link — the avatar/name link; anonymous has empty href
const userLinkEl = $leaf.find(".b-leaf-username a, .b-leaf-userpic-inner").first();
const rawUserUrl = userLinkEl.attr("href") ?? "";
const userUrl = rawUserUrl && rawUserUrl !== "#" && rawUserUrl !== "" ? rawUserUrl : "";
const userUrl = rawUserUrl && rawUserUrl !== "#" ? rawUserUrl : "";

const permalinkEl = $leaf.find(".b-leaf-permalink").first();
const permalinkUrl = permalinkEl.attr("href") ?? "";
Expand Down Expand Up @@ -116,45 +125,198 @@ function isCommentActionLink(href: string, cls: string): boolean {
return false;
}

function extractLegacyComments($: cheerio.CheerioAPI): Comment[] {
// Older LJ themes may not use b-tree-twig; use thread link anchors as anchors
const comments: Comment[] = [];
const seen = new Set<string>();
const threadPattern = /[?&]thread=(\d+)/;
function parseLegacyComment(
$: cheerio.CheerioAPI,
$cmt: cheerio.Cheerio<AnyNode>
): Comment | null {
const idAttr = $cmt.attr("id") ?? "";
const threadNumeric = idAttr.replace(/^ljcmt/, "");
if (!threadNumeric) return null;
const id = `t${threadNumeric}`;

$('a[href*="thread="]').each((_i, el) => {
const href = $(el).attr("href") ?? "";
const match = threadPattern.exec(href);
if (!match) return;
const threadId = `t${match[1]}`;
if (seen.has(threadId)) return;
seen.add(threadId);

// Walk up to find the comment container
let $container = $(el).closest("[id^='t']");
if ($container.length === 0) {
$container = $(el).closest("div, li, td");
const depth = extractLegacyDepth($cmt.attr("style") ?? "");

const $ljuser = $cmt.find("span.ljuser[data-ljuser]").first();
let username = ($ljuser.attr("data-ljuser") ?? "").trim();
let userUrl = "";
if (username) {
userUrl = ($ljuser.find("a.i-ljuser-username").first().attr("href") ?? "").trim();
if (!userUrl) {
// Fall back to any non-profile anchor inside the ljuser span
userUrl = ($ljuser.find("a").not(".i-ljuser-profile").first().attr("href") ?? "").trim();
}
if ($container.length === 0) return;

const username = $container.find('[class*="username"]').first().text().trim() || "Anonymous";
const timestampText = $(el).text().trim();
const $contentClone = $container.clone();
$contentClone.find("a").each((_j, a) => {
const aHref = $(a).attr("href") ?? "";
if (/[?&]replyto=|[?&]mode=reply/.test(aHref)) $(a).remove();
});
} else {
username = "Anonymous";
}

comments.push({
id: threadId,
depth: 0,
username,
userUrl: "",
timestampText,
permalinkUrl: href,
contentHtml: $contentClone.html()?.trim() ?? "",
const permalinkUrl = findLegacyPermalink($, $cmt, threadNumeric);
const timestampText = findLegacyTimestamp($, $cmt);
const contentHtml = extractLegacyBody($, $cmt, threadNumeric);

return { id, depth, username, userUrl, timestampText, permalinkUrl, contentHtml };
}

/**
 * Converts an inline `margin-left: Npx` declaration into a nesting depth.
 *
 * @param style - The raw `style` attribute value of the comment container.
 * @returns The pixel margin divided by LEGACY_INDENT_PX, rounded to the nearest
 *   integer; 0 when there is no integer-pixel margin-left or it is not positive.
 */
function extractLegacyDepth(style: string): number {
  const marginMatch = style.match(/margin-left\s*:\s*(\d+)\s*px/i);
  if (marginMatch === null) return 0;

  const pixels = Number.parseInt(marginMatch[1]!, 10);
  // A huge digit run parses to Infinity; treat that like "no indent".
  const usable = Number.isFinite(pixels) && pixels > 0;
  return usable ? Math.round(pixels / LEGACY_INDENT_PX) : 0;
}

/**
 * Finds the permalink href for a single S1 legacy comment.
 *
 * Scans the `a[href*='thread=']` anchors inside the comment container and picks
 * the first href whose `thread=` value is exactly this comment's id (optionally
 * followed by `#t{id}`) and that matches none of LJ_COMMENT_ACTION_PATTERNS.
 *
 * @param $ - Cheerio instance used to wrap the matched anchors.
 * @param $cmt - The comment container to search within.
 * @param threadNumeric - The comment id with its `ljcmt` prefix removed.
 * @returns The matching href, or "" when no anchor qualifies.
 */
function findLegacyPermalink(
  $: cheerio.CheerioAPI,
  $cmt: cheerio.Cheerio<AnyNode>,
  threadNumeric: string
): string {
  // The id is taken from scraped markup, so escape regex metacharacters before
  // interpolating it: an id such as "ljcmt(" would otherwise make the RegExp
  // constructor throw, and "." or "+" would silently change what is matched.
  const escapedId = threadNumeric.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const exactPattern = new RegExp(`[?&]thread=${escapedId}(?:#t${escapedId}\\b|\\b)`);
  let permalinkUrl = "";
  $cmt.find("a[href*='thread=']").each((_i, a) => {
    const href = $(a).attr("href") ?? "";
    if (!exactPattern.test(href)) return;
    // Reply/action links also carry thread=; they are not the permalink.
    if (LJ_COMMENT_ACTION_PATTERNS.some((p) => p.test(href))) return;
    permalinkUrl = href;
    // Returning false stops the cheerio iteration at the first match.
    return false;
  });
  return permalinkUrl;
}

/**
 * Extracts the timestamp text of a single S1 legacy comment.
 *
 * The timestamp sits in a `<span title="...">` within the comment header. The
 * search is limited to the header region where possible, so a user-authored
 * span in the body (e.g. "<span>We met in 2003</span>") cannot be mistaken for
 * the real timestamp.
 *
 * @param $ - Cheerio instance used to wrap candidate spans.
 * @param $cmt - The comment container to search within.
 * @returns Trimmed text of the first titled span containing a 19xx/20xx year,
 *   or "" when none is found.
 */
function findLegacyTimestamp(
  $: cheerio.CheerioAPI,
  $cmt: cheerio.Cheerio<AnyNode>
): string {
  const yearLike = /(?:19|20)\d{2}/;

  // Preferred scope: the comment_bar_one / comment_bar_alt header (one S1 variant).
  let $scope: cheerio.Cheerio<AnyNode> = $cmt.find(".comment_bar_one, .comment_bar_alt");
  if ($scope.length === 0) {
    // Other S1 variant: the first row of the cmtbar table holds the header.
    $scope = $cmt.find('table[id^="cmtbar"]').find("> tbody > tr, > tr").first();
  }
  if ($scope.length === 0) {
    // Unknown layout: fall back to the whole comment. This brings back the
    // body-shadowing risk, which is why the scoped lookups are tried first.
    $scope = $cmt;
  }

  for (const span of $scope.find("span[title]").toArray()) {
    const label = $(span).text().trim();
    if (yearLike.test(label)) return label;
  }
  return "";
}

/**
 * Extracts the body HTML of a single S1 legacy comment.
 *
 * Works on a clone of the comment container and strips everything that is not
 * body content: the named anchor target, metadata headers, header/footer rows
 * of a wrapping cmtbar table, footer divs, quick-reply containers, form
 * controls, and this comment's own permalink anchors.
 *
 * @param $ - Cheerio instance used to wrap nodes found in the clone.
 * @param $cmt - The comment container; it is cloned, not mutated.
 * @param threadNumeric - The comment id with its `ljcmt` prefix removed.
 * @returns The trimmed inner HTML that remains after stripping.
 */
function extractLegacyBody(
  $: cheerio.CheerioAPI,
  $cmt: cheerio.Cheerio<AnyNode>,
  threadNumeric: string
): string {
  const $clone = $cmt.clone();

  // Drop the named anchor target (<a name="t...">) — has no body content
  $clone.find('a[name^="t"]').remove();

  // Strip metadata header used by some S1 themes ("comment_bar_one" / "comment_bar_alt"
  // contain the userpic + From/Date sub-table; not part of the body)
  $clone.find(".comment_bar_one, .comment_bar_alt").remove();

  // For themes that wrap the entire comment (header + body + footer) in a single
  // <table id="cmtbar{id}">, surgically extract just the body row(s). Walk only the
  // outer rows — find("tr") would also descend into the inner metadata sub-table.
  $clone.find(`table[id^="cmtbar"]`).each((_i, tbl) => {
    const $table = $(tbl);
    const bodyHtmlParts: string[] = [];
    const $rows = $table.find("> tbody > tr, > thead > tr, > tfoot > tr, > tr");
    $rows.each((_j, tr) => {
      const $tr = $(tr);
      // Skip header rows (contain a sub-table or userpic/ljuser markup)
      if ($tr.find("table, span.ljuser, .i-ljuser, img.i-ljuser-userhead").length > 0) return;
      // Skip footer rows (Reply/Parent/Thread links)
      if (rowHasReplyOrParentLink($, $tr)) return;
      $tr.find("> td, > th").each((_k, cell) => {
        bodyHtmlParts.push($(cell).html() ?? "");
      });
    });
    // Replace the whole table with the HTML of its surviving body cells.
    const replacement = bodyHtmlParts.join(" ").trim();
    $table.replaceWith(replacement);
  });

  // Strip any empty wrapper divs that previously contained the metadata table
  $clone.find('div[align="right"].entry').each((_i, d) => {
    const $d = $(d);
    if (($d.text() ?? "").trim() === "" && $d.find("img").length === 0) $d.remove();
  });

  // Strip footer-only divs containing reply/parent/thread links
  $clone.find("div").each((_i, d) => {
    const $d = $(d);
    if (!divIsLegacyFooter($, $d)) return;
    $d.remove();
  });

  // Hidden quick-reply containers
  $clone.find('[id^="ljqrt"], [id="ljqrttopcomment"], [id="ljqrtbottomcomment"]').remove();
  $clone.find("form, input, button, select, textarea").remove();

  // Strip orphan permalink anchors (e.g. a leftover bare "(Link)" line). Use a
  // word-boundary regex so a body link to ?thread=500 isn't matched when this
  // comment's id is "5". The id comes from scraped markup, so regex
  // metacharacters are escaped first; otherwise an id such as "ljcmt(" would
  // make the RegExp constructor throw.
  const escapedId = threadNumeric.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const exactThreadPattern = new RegExp(`[?&]thread=${escapedId}(?:#t${escapedId}\\b|\\b)`);
  $clone.find("a[href*='thread=']").each((_i, a) => {
    const href = $(a).attr("href") ?? "";
    if (exactThreadPattern.test(href)) {
      $(a).remove();
    }
  });
  // After removing those, prune empty <strong>(...)</strong> wrappers left behind
  $clone.find("strong").each((_i, s) => {
    const $s = $(s);
    const text = ($s.text() ?? "").replace(/[\s()]/g, "");
    if (text === "") $s.remove();
  });

  return ($clone.html() ?? "").trim();
}

/**
 * Decides whether a div is a comment footer, i.e. one that holds only
 * Reply/Parent/Thread/Link anchors plus decorative punctuation.
 *
 * @param $ - Cheerio instance used to wrap the div's anchors.
 * @param $d - The div to classify.
 * @returns true when the div has at least one reply link (href with `replyto=`
 *   or `mode=reply`) and no text beyond its link labels, brackets, separators
 *   and whitespace; false otherwise.
 */
function divIsLegacyFooter(
  $: cheerio.CheerioAPI,
  $d: cheerio.Cheerio<AnyNode>
): boolean {
  const anchors = $d.find("a").toArray();
  if (anchors.length === 0) return false;

  // A footer must contain a reply link; plain body divs with links do not qualify.
  const replyHref = /[?&]replyto=|[?&]mode=reply/i;
  let sawReplyLink = false;
  for (const anchor of anchors) {
    if (replyHref.test($(anchor as Element).attr("href") ?? "")) {
      sawReplyLink = true;
      break;
    }
  }
  if (!sawReplyLink) return false;

  // Remove every link label from the div's text; anything substantial that is
  // left over is prose, so the div must not be discarded.
  let residue = ($d.text() ?? "").trim();
  for (const anchor of anchors) {
    const label = ($(anchor as Element).text() ?? "").trim();
    if (label !== "") residue = residue.replaceAll(label, "");
  }
  return residue.replace(/[()|·•\s]/g, "").length === 0;
}

return comments;
function rowHasReplyOrParentLink(
$: cheerio.CheerioAPI,
$tr: cheerio.Cheerio<AnyNode>
): boolean {
const links = $tr.find("a").toArray();
if (links.length === 0) return false;
// The text-only match (Parent/Thread/Link) is gated on a thread= href so a
// commenter who wrote <a>Link</a> with an unrelated href can't make their
// body row look like an LJ footer. The Reply arm is gated by replyto/mode.
return links.some((a) => {
const $a = $(a as Element);
const href = $a.attr("href") ?? "";
if (/[?&]replyto=|[?&]mode=reply/i.test(href)) return true;
if (!/[?&]thread=/i.test(href)) return false;
const text = ($a.text() ?? "").trim();
return /^(?:Reply|Parent|Thread|Link)$/i.test(text);
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Matching "Link" as a footer indicator is safe in the cmtbar layout because the Link anchor lives inside the header sub-table and is excluded by the sub-table guard on line 226 before this function is ever called. But the comment that explains that guard only mentions the sub-table check, so a brief inline note here would make the reason explicit and keep a future reader from stripping the "Link" arm thinking it's overly broad.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Link arm here is the one worth reconsidering. The inline comment on lines 309–311 explains why the LJ-generated (Link) permalink can't appear in a non-header row — it's inside the metadata sub-table, which is already excluded by the nested-table guard above. So this arm is never needed to catch LJ-generated links.

What it does catch is any user-authored body anchor whose visible text happens to be the single word "Link" (case-insensitive). For example, a commenter who wrote:

<a href="https://example.com">Link</a> for more context

would cause rowHasReplyOrParentLink to return true, the row would be skipped, and the comment body would be silently empty.

The fix is to scope the Link arm to anchors that also have thread= in their href (which is how LJ always generates the permalink):

Suggested change
return /^(?:Reply|Parent|Thread|Link)$/i.test(text);
if (/[?&]replyto=|[?&]mode=reply/i.test(href)) return true;
if (/[?&]thread=/.test(href) && /^Link$/i.test(($a.text() ?? "").trim())) return true;
const text = ($a.text() ?? "").trim();
return /^(?:Reply|Parent|Thread)$/i.test(text);

No existing test covers a cmtbar body row with a user-authored <a>Link</a> — adding one would lock in the correct behavior.

});
}
Loading
Loading