From d2e3a4ba193dbcc9e7656bc934be4efe42c6c0d8 Mon Sep 17 00:00:00 2001 From: lihaidong Date: Mon, 25 May 2026 19:52:44 +0800 Subject: [PATCH] feat(douban): add short comments reader --- cli-manifest.json | 63 ++++++++++++++ clis/douban/comments.js | 25 ++++++ clis/douban/comments.test.js | 14 +++ clis/douban/utils.js | 162 +++++++++++++++++++++++++++++++++++ clis/douban/utils.test.js | 81 ++++++++++++++++++ 5 files changed, 345 insertions(+) create mode 100644 clis/douban/comments.js create mode 100644 clis/douban/comments.test.js diff --git a/cli-manifest.json b/cli-manifest.json index e79ac2eb1..60ca6747a 100644 --- a/cli-manifest.json +++ b/cli-manifest.json @@ -8268,6 +8268,69 @@ "sourceFile": "douban/book-hot.js", "navigateBefore": "https://book.douban.com" }, + { + "site": "douban", + "name": "comments", + "description": "获取豆瓣条目短评", + "access": "read", + "domain": "movie.douban.com", + "strategy": "cookie", + "browser": true, + "args": [ + { + "name": "id", + "type": "str", + "required": true, + "positional": true, + "help": "豆瓣条目 ID" + }, + { + "name": "type", + "type": "str", + "default": "movie", + "required": false, + "help": "条目类型(movie=电影, book=图书, music=音乐)", + "choices": [ + "movie", + "book", + "music" + ] + }, + { + "name": "limit", + "type": "int", + "default": 100, + "required": false, + "help": "最多返回多少条短评" + }, + { + "name": "sort", + "type": "str", + "default": "new_score", + "required": false, + "help": "排序方式(new_score=热门, time=最新)", + "choices": [ + "new_score", + "time" + ] + } + ], + "columns": [ + "index", + "id", + "userName", + "rating", + "ratingText", + "votes", + "time", + "content", + "url" + ], + "type": "js", + "modulePath": "douban/comments.js", + "sourceFile": "douban/comments.js", + "navigateBefore": false + }, { "site": "douban", "name": "download", diff --git a/clis/douban/comments.js b/clis/douban/comments.js new file mode 100644 index 000000000..0ba946bc3 --- /dev/null +++ b/clis/douban/comments.js @@ -0,0 +1,25 @@ +import { 
// Registers the `douban comments` command. The command navigates on its own
// (navigateBefore: false) because the target host depends on the `type` argument.
cli({
    site: 'douban',
    name: 'comments',
    access: 'read',
    description: '获取豆瓣条目短评',
    domain: 'movie.douban.com',
    strategy: Strategy.COOKIE,
    browser: true,
    navigateBefore: false,
    args: [
        { name: 'id', positional: true, required: true, help: '豆瓣条目 ID' },
        { name: 'type', default: 'movie', choices: ['movie', 'book', 'music'], help: '条目类型(movie=电影, book=图书, music=音乐)' },
        { name: 'limit', type: 'int', default: 100, help: '最多返回多少条短评' },
        { name: 'sort', default: 'new_score', choices: ['new_score', 'time'], help: '排序方式(new_score=热门, time=最新)' },
    ],
    columns: ['index', 'id', 'userName', 'rating', 'ratingText', 'votes', 'time', 'content', 'url'],
    func: async (page, kwargs) => {
        // Validate the subject ID before any option handling so a bad ID fails first.
        const subjectId = normalizeDoubanSubjectId(String(kwargs.id || ''));
        const loaderOptions = {
            type: String(kwargs.type || 'movie'),
            limit: Number(kwargs.limit) || 100,
            sort: String(kwargs.sort || 'new_score'),
        };
        return loadDoubanComments(page, subjectId, loaderOptions);
    },
});
/**
 * Validates a Douban subject type and returns its canonical lowercase form.
 *
 * A missing/empty value falls back to 'movie'. Matching is case-insensitive
 * because the canonical value is also used as the lowercase sub-domain.
 *
 * @param {unknown} type - Requested subject type.
 * @returns {'movie' | 'book' | 'music'} The canonical subject type.
 * @throws {ArgumentError} When the value is not one of movie, book, music.
 */
function normalizeDoubanSubjectType(type) {
    const normalized = String(type || 'movie').trim().toLowerCase();
    if (normalized === 'movie' || normalized === 'book' || normalized === 'music') {
        return normalized;
    }
    throw new ArgumentError(`Invalid Douban subject type: ${type}`, 'Use one of: movie, book, music');
}
/**
 * Returns the Douban host name that serves subjects of the given type.
 *
 * @param {unknown} type - Subject type; validated by normalizeDoubanSubjectType.
 * @returns {string} Host such as `book.douban.com`.
 * @throws {ArgumentError} When the type is invalid.
 */
function doubanSubjectHost(type) {
    return `${normalizeDoubanSubjectType(type)}.douban.com`;
}
/**
 * Builds the short-comments listing URL for one page of a subject.
 *
 * @param {unknown} subjectId - Subject ID; validated by normalizeDoubanSubjectId.
 * @param {unknown} type - Subject type (movie, book or music).
 * @param {number} [start=0] - Offset of the first comment on the page.
 * @param {string} [sort='new_score'] - Sort key; omitted from the URL when falsy.
 * @returns {string} Absolute URL of the comments page.
 * @throws {ArgumentError} When the type is invalid (ID validation errors come
 *   from normalizeDoubanSubjectId).
 */
function buildDoubanCommentsUrl(subjectId, type, start = 0, sort = 'new_score') {
    const normalizedId = normalizeDoubanSubjectId(subjectId);
    const host = doubanSubjectHost(type);
    // Only a finite, non-negative whole offset is meaningful; anything else
    // (NaN, Infinity, negatives, fractions) collapses to a safe integer.
    const offset = Math.floor(Number(start));
    const safeStart = Number.isFinite(offset) && offset > 0 ? offset : 0;
    const url = new URL(`https://${host}/subject/${normalizedId}/comments/`);
    url.searchParams.set('start', String(safeStart));
    url.searchParams.set('limit', String(DOUBAN_COMMENT_PAGE_SIZE));
    url.searchParams.set('status', 'P');
    if (sort) {
        url.searchParams.set('sort', String(sort));
    }
    return url.toString();
}
/**
 * Converts a raw scraped comment row into the normalized output shape.
 *
 * Text fields are whitespace-collapsed via normalizeText; `rating` and `votes`
 * go through parseDoubanCount (defined elsewhere in this module).
 *
 * @param {object} raw - Row as produced by the in-page scraper.
 * @param {{index?: number, subjectId?: string, type?: string}} [fallback]
 *   Values used when the row does not carry them; `fallback.index` takes
 *   precedence over `raw.index`.
 * @returns {object} Normalized comment record.
 * @throws {ArgumentError} When the subject type is invalid (subject ID
 *   validation errors come from normalizeDoubanSubjectId).
 */
export function normalizeDoubanComment(raw, fallback = {}) {
    const content = normalizeText(raw?.content);
    const userName = normalizeText(raw?.userName);
    const userUrl = normalizeText(raw?.userUrl);
    const url = normalizeText(raw?.url);
    // Prefer an explicit comment ID, then the `#comment-<id>` fragment of the
    // permalink, then the generic `id` field.
    const commentId = normalizeText(raw?.commentId)
        || url.match(/comment-(\d+)/)?.[1]
        || normalizeText(raw?.id);
    return {
        index: Number(fallback.index) || Number(raw?.index) || 0,
        id: commentId,
        subjectId: normalizeDoubanSubjectId(raw?.subjectId || fallback.subjectId),
        type: normalizeDoubanSubjectType(raw?.type || fallback.type),
        userName,
        userUrl,
        rating: parseDoubanCount(raw?.rating),
        ratingText: normalizeText(raw?.ratingText),
        votes: parseDoubanCount(raw?.votes),
        time: normalizeText(raw?.time),
        content,
        url,
    };
}
/**
 * Loads short comments for a Douban subject.
 *
 * Navigates the page to the first comments listing, scrapes it from the live
 * DOM, then fetches and parses further listing pages inside the browser until
 * the requested number of comments is collected or the listing runs out.
 *
 * @param {object} page - Browser page handle exposing goto/wait/evaluate.
 * @param {unknown} subjectId - Douban subject ID.
 * @param {{type?: string, limit?: number, sort?: string}} [options]
 *   `type` defaults to 'movie', `limit` to 100 (clamped by clampCommentLimit),
 *   `sort` to 'new_score'.
 * @returns {Promise<object[]>} Normalized comments, at most `limit` entries.
 * @throws {EmptyResultError} When no comment could be scraped.
 * @throws {ArgumentError} When the subject type is invalid.
 */
export async function loadDoubanComments(page, subjectId, options = {}) {
    const normalizedId = normalizeDoubanSubjectId(subjectId);
    const type = normalizeDoubanSubjectType(options.type);
    const sort = String(options.sort || 'new_score').trim() || 'new_score';
    const safeLimit = clampCommentLimit(Number(options.limit) || 100);
    const startUrl = buildDoubanCommentsUrl(normalizedId, type, 0, sort);
    await page.goto(startUrl, { waitUntil: 'load', settleMs: 1500 });
    await ensureDoubanReady(page);
    // Best effort only: a missing selector is handled below by the empty-result check.
    await page.wait({ selector: DOUBAN_COMMENT_READY_SELECTOR, timeout: 8 }).catch(() => { });
    // The script below runs inside the browser page; all inputs are embedded as
    // JSON literals so no value is interpreted as code.
    const data = await page.evaluate(`
    (async () => {
      const subjectId = ${JSON.stringify(normalizedId)};
      const type = ${JSON.stringify(type)};
      const sort = ${JSON.stringify(sort)};
      const limit = ${safeLimit};
      const pageSize = ${DOUBAN_COMMENT_PAGE_SIZE};
      const normalize = (value) => String(value || '').replace(/\\s+/g, ' ').trim();
      const toAbsoluteUrl = (value) => {
        if (!value) return '';
        try {
          return new URL(value, location.href).toString();
        } catch {
          return value;
        }
      };
      const buildUrl = (start) => {
        const url = new URL(location.href);
        url.searchParams.set('start', String(start));
        url.searchParams.set('limit', String(pageSize));
        url.searchParams.set('status', 'P');
        if (sort) url.searchParams.set('sort', sort);
        return url.toString();
      };
      // Sets the fragment through the URL API so an existing hash on the page
      // URL is replaced instead of producing a double '#'.
      const buildPermalink = (pageUrl, commentId) => {
        if (!commentId) return pageUrl;
        try {
          const url = new URL(pageUrl);
          url.hash = 'comment-' + commentId;
          return url.toString();
        } catch {
          return pageUrl + '#comment-' + commentId;
        }
      };
      // Returns the extracted rows plus the number of comment nodes seen, so the
      // caller can tell a short (last) page from a full page with skipped rows.
      const extractCommentRows = (doc, pageUrl) => {
        const rows = [];
        const primaryNodes = Array.from(doc.querySelectorAll('.comment-item'));
        const nodes = primaryNodes.length ? primaryNodes : Array.from(doc.querySelectorAll('.comment'));
        for (const node of nodes) {
          const contentEl = node.querySelector('.short, .comment-content, p.comment-content, .comment-content span');
          const content = normalize(contentEl?.textContent);
          if (!content) continue;

          const info = node.querySelector('.comment-info') || node;
          const userEl = info.querySelector('a[href*="/people/"]') || node.querySelector('a[href*="/people/"]');
          const ratingEl = info.querySelector('span[class*="allstar"], span[class*="rating"]');
          const timeEl = info.querySelector('.comment-time, time') || node.querySelector('.comment-time, time');
          const voteEl = node.querySelector('.votes, .vote-count');
          const commentId = normalize(node.getAttribute('data-cid'))
            || normalize(node.id).replace(/^comment-/, '')
            || normalize(node.querySelector('[data-cid]')?.getAttribute('data-cid'));
          const ratingClass = ratingEl?.className || '';
          // The star digit taken from the class name is doubled before being reported.
          const ratingValue = Number(ratingClass.match(/allstar(\\d)0/)?.[1] || ratingClass.match(/rating(\\d)-t/)?.[1] || 0) * 2;

          rows.push({
            id: commentId,
            subjectId,
            type,
            userName: normalize(userEl?.textContent),
            userUrl: toAbsoluteUrl(userEl?.getAttribute('href') || ''),
            rating: ratingValue || 0,
            ratingText: normalize(ratingEl?.getAttribute('title')),
            votes: normalize(voteEl?.textContent),
            time: normalize(timeEl?.getAttribute('title')) || normalize(timeEl?.textContent),
            content,
            url: buildPermalink(pageUrl, commentId),
          });
        }
        return { rows, nodeCount: nodes.length };
      };

      const seen = new Set();
      const comments = [];
      for (let start = 0; comments.length < limit; start += pageSize) {
        let doc = document;
        let pageUrl = location.href;
        if (start > 0) {
          pageUrl = buildUrl(start);
          // A failed follow-up request (network error or HTTP error) ends
          // pagination but keeps the comments collected so far.
          let html;
          try {
            const response = await fetch(pageUrl, { credentials: 'include' });
            if (!response.ok) break;
            html = await response.text();
          } catch {
            break;
          }
          doc = new DOMParser().parseFromString(html, 'text/html');
        }
        const { rows, nodeCount } = extractCommentRows(doc, pageUrl);
        if (!rows.length) break;
        let appended = 0;
        for (const row of rows) {
          const key = row.id || row.userUrl + '\\n' + row.time + '\\n' + row.content;
          if (seen.has(key)) continue;
          seen.add(key);
          comments.push({
            index: comments.length + 1,
            ...row,
          });
          appended += 1;
          if (comments.length >= limit) break;
        }
        // Stop on a short page (judged by node count, so rows skipped for empty
        // text do not end pagination early) or when a page added nothing new.
        if (nodeCount < pageSize || appended === 0) break;
        await new Promise((resolve) => setTimeout(resolve, 150));
      }
      return comments;
    })()
  `);
    const comments = Array.isArray(data) ? data : [];
    if (!comments.length) {
        throw new EmptyResultError('douban comments', `No short comments found for ${type} subject ${normalizedId}.`);
    }
    // Re-number after slicing so indexes are always 1..n in the returned list.
    return comments.slice(0, safeLimit).map((comment, index) => normalizeDoubanComment(comment, {
        index: index + 1,
        subjectId: normalizedId,
        type,
    }));
}
expect(promoteDoubanPhotoUrl('https://img1.doubanio.com/view/photo/m/public/p2913450214.webp')).toBe('https://img1.doubanio.com/view/photo/l/public/p2913450214.webp'); expect(promoteDoubanPhotoUrl('https://img9.doubanio.com/view/photo/s_ratio_poster/public/p2578474613.jpg')).toBe('https://img9.doubanio.com/view/photo/l/public/p2578474613.jpg'); @@ -185,6 +216,56 @@ describe('douban utils', () => { expect(getDoubanPhotoExtension('https://img1.doubanio.com/view/photo/l/public/p2913450214.jpeg')).toBe('.jpeg'); }); + it('loads comments from the matching subject host and preserves a 100 item limit', async () => { + const page = { + goto: vi.fn().mockResolvedValue(undefined), + wait: vi.fn().mockResolvedValue(undefined), + evaluate: vi.fn() + .mockResolvedValueOnce({ blocked: false, title: '短评', href: 'https://book.douban.com/subject/2567698/comments/' }) + .mockResolvedValueOnce([ + { + index: 1, + id: '101', + subjectId: '2567698', + type: 'book', + userName: 'reader', + userUrl: 'https://www.douban.com/people/reader/', + rating: 10, + ratingText: '力荐', + votes: 7, + time: '2026-05-25', + content: '短评正文', + url: 'https://book.douban.com/subject/2567698/comments/#comment-101', + }, + ]), + }; + + await expect(loadDoubanComments(page, '2567698', { + type: 'book', + limit: 100, + sort: 'time', + })).resolves.toMatchObject([ + { + index: 1, + id: '101', + subjectId: '2567698', + type: 'book', + rating: 10, + content: '短评正文', + }, + ]); + expect(page.goto).toHaveBeenCalledWith('https://book.douban.com/subject/2567698/comments/?start=0&limit=20&status=P&sort=time', { + waitUntil: 'load', + settleMs: 1500, + }); + expect(page.wait).toHaveBeenCalledWith({ + selector: '.comment-item, .comment, #comments', + timeout: 8, + }); + const scanScript = page.evaluate.mock.calls[1]?.[0]; + expect(scanScript).toContain('const limit = 100;'); + }); + it('maps tv series results to tvshow in searchDouban output', async () => { const domItems = [ createFakeSearchItem({