From c93eb4e602121d5f5227b926aa0733ff04f7b34a Mon Sep 17 00:00:00 2001 From: xuews Date: Mon, 23 Mar 2026 17:08:50 +0800 Subject: [PATCH 1/2] fix(xiaohongshu): improve search login-wall handling and detail output --- src/clis/xiaohongshu/search.test.ts | 103 +++++++++++++++ src/clis/xiaohongshu/search.ts | 198 ++++++++++++++++++++++++---- 2 files changed, 277 insertions(+), 24 deletions(-) create mode 100644 src/clis/xiaohongshu/search.test.ts diff --git a/src/clis/xiaohongshu/search.test.ts b/src/clis/xiaohongshu/search.test.ts new file mode 100644 index 00000000..afec06f9 --- /dev/null +++ b/src/clis/xiaohongshu/search.test.ts @@ -0,0 +1,103 @@ +import { describe, expect, it, vi } from 'vitest'; +import type { IPage } from '../../types.js'; +import { getRegistry } from '../../registry.js'; +import './search.js'; + +function createPageMock(evaluateResults: any[]): IPage { + const evaluate = vi.fn(); + for (const result of evaluateResults) { + evaluate.mockResolvedValueOnce(result); + } + + return { + goto: vi.fn().mockResolvedValue(undefined), + evaluate, + snapshot: vi.fn().mockResolvedValue(undefined), + click: vi.fn().mockResolvedValue(undefined), + typeText: vi.fn().mockResolvedValue(undefined), + pressKey: vi.fn().mockResolvedValue(undefined), + scrollTo: vi.fn().mockResolvedValue(undefined), + getFormState: vi.fn().mockResolvedValue({ forms: [], orphanFields: [] }), + wait: vi.fn().mockResolvedValue(undefined), + tabs: vi.fn().mockResolvedValue([]), + closeTab: vi.fn().mockResolvedValue(undefined), + newTab: vi.fn().mockResolvedValue(undefined), + selectTab: vi.fn().mockResolvedValue(undefined), + networkRequests: vi.fn().mockResolvedValue([]), + consoleMessages: vi.fn().mockResolvedValue([]), + scroll: vi.fn().mockResolvedValue(undefined), + autoScroll: vi.fn().mockResolvedValue(undefined), + installInterceptor: vi.fn().mockResolvedValue(undefined), + getInterceptedRequests: vi.fn().mockResolvedValue([]), + getCookies: vi.fn().mockResolvedValue([]), + screenshot: vi.fn().mockResolvedValue(''), + }; +} + +describe('xiaohongshu search', () => { + it('throws a clear error when the search page is blocked by a login wall', async () => { + const cmd = getRegistry().get('xiaohongshu/search'); + expect(cmd?.func).toBeTypeOf('function'); + + const page = createPageMock([ + { + loginWall: true, + bodyPreview: '登录后查看搜索结果', + results: [], + }, + ]); + + await expect(cmd!.func!(page, { query: '特斯拉', limit: 5 })).rejects.toThrow( + 'Xiaohongshu search results are blocked behind a login wall' + ); + }); + + it('keeps the search_result url and enriches rows with note details', async () => { + const cmd = getRegistry().get('xiaohongshu/search'); + expect(cmd?.func).toBeTypeOf('function'); + + const detailUrl = + 'https://www.xiaohongshu.com/search_result/68e90be80000000004022e66?xsec_token=test-token&xsec_source='; + const page = createPageMock([ + { + loginWall: false, + bodyPreview: '', + results: [ + { + title: '某鱼买FSD被坑了4万', + author: '随风', + likes: '261', + url: detailUrl, + author_url: + 'https://www.xiaohongshu.com/user/profile/635a9c720000000018028b40?xsec_token=user-token&xsec_source=pc_search', + }, + ], + }, + { + title: '某鱼买FSD被坑了4万', + author: '随风', + content: '今天早上提车,昨天深夜,心血来潮搜了一下x鱼。', + comment_count: '302', + comments: ['KA330: 没有被坑啊。', 'NONO: 你怎么敢某鱼花4.3W买的'], + }, + ]); + + const result = await cmd!.func!(page, { query: '特斯拉', limit: 1 }); + + expect((page.goto as any).mock.calls[1][0]).toBe(detailUrl); + expect(result).toEqual([ + { + rank: 1, + title: '某鱼买FSD被坑了4万', + author: '随风', + likes: '261', + url: detailUrl, + author_url: + 'https://www.xiaohongshu.com/user/profile/635a9c720000000018028b40?xsec_token=user-token&xsec_source=pc_search', + content: '今天早上提车,昨天深夜,心血来潮搜了一下x鱼。', + comment_count: '302', + comments: ['KA330: 没有被坑啊。', 'NONO: 你怎么敢某鱼花4.3W买的'], + }, + ]); + }); +}); diff --git a/src/clis/xiaohongshu/search.ts b/src/clis/xiaohongshu/search.ts index 4d6232ac..25401db7 100644 --- a/src/clis/xiaohongshu/search.ts +++ b/src/clis/xiaohongshu/search.ts @@ -8,6 +8,76 @@ import { cli, Strategy } from '../../registry.js'; +type SearchRow = { + title: string; + author: string; + likes: string; + url: string; + author_url: string; + content: string; + comment_count: string; + comments: string[]; +}; + +type SearchListRow = { + title: string; + author: string; + likes: string; + url: string; + author_url: string; +}; + +async function readNoteDetail(page: any, url: string): Promise> { + await page.goto(url); + await page.wait(3); + + const payload = await page.evaluate(` + (() => { + const state = window.__INITIAL_STATE__ || {}; + const noteState = state.note || {}; + const detailMap = noteState.noteDetailMap || {}; + const detailKeys = Object.keys(detailMap || {}); + const firstDetail = detailKeys.length ? detailMap[detailKeys[0]] : null; + const note = firstDetail?.note || {}; + const comments = firstDetail?.comments?.list || []; + + const title = (note.title || '').trim(); + const content = (note.desc || '').trim(); + const author = (note.user?.nickname || '').trim(); + const commentCount = String(note.interactInfo?.commentCount || note.interact_info?.comment_count || comments.length || 0); + const topComments = comments + .map((item) => { + const nickname = (item?.userInfo?.nickname || '').trim(); + const text = (item?.content || '').trim(); + if (!text) return ''; + return nickname ? nickname + ': ' + text : text; + }) + .filter(Boolean) + .slice(0, 3); + + return { + title, + author, + content, + comment_count: commentCount, + comments: topComments, + }; + })() + `); + + if (!payload || typeof payload !== 'object') { + return { title: '', author: '', content: '', comment_count: '0', comments: [] }; + } + + return { + title: typeof (payload as any).title === 'string' ? (payload as any).title : '', + author: typeof (payload as any).author === 'string' ? (payload as any).author : '', + content: typeof (payload as any).content === 'string' ? (payload as any).content : '', + comment_count: typeof (payload as any).comment_count === 'string' ? (payload as any).comment_count : '0', + comments: Array.isArray((payload as any).comments) ? (payload as any).comments : [], + }; +} + cli({ site: 'xiaohongshu', name: 'search', @@ -18,7 +88,7 @@ cli({ { name: 'query', required: true, positional: true, help: 'Search keyword' }, { name: 'limit', type: 'int', default: 20, help: 'Number of results' }, ], - columns: ['rank', 'title', 'author', 'likes'], + columns: ['rank', 'title', 'author', 'likes', 'comment_count', 'url'], func: async (page, kwargs) => { const keyword = encodeURIComponent(kwargs.query); await page.goto( @@ -29,40 +99,120 @@ cli({ // Scroll a couple of times to load more results await page.autoScroll({ times: 2 }); - const data = await page.evaluate(` + const payload = await page.evaluate(` (() => { - const notes = document.querySelectorAll('section.note-item'); + const loginWall = /登录后查看搜索结果/.test(document.body.innerText || ''); const results = []; + + const pushResult = (raw) => { + const url = (raw?.url || '').trim(); + if (!url) return; + results.push({ + title: (raw?.title || '').trim(), + author: (raw?.author || '').trim(), + likes: (raw?.likes || '0').trim(), + url, + author_url: (raw?.author_url || '').trim(), + }); + }; + + const normalizeUrl = (href) => { + if (!href) return ''; + if (href.startsWith('http://') || href.startsWith('https://')) return href; + if (href.startsWith('/')) return 'https://www.xiaohongshu.com' + href; + return ''; + }; + + const cleanText = (value) => (value || '').replace(/\s+/g, ' ').trim(); + const notes = document.querySelectorAll('section.note-item'); notes.forEach(el => { - // Skip "related searches" sections if (el.classList.contains('query-note-item')) return; - const titleEl = el.querySelector('.title, .note-title, a.title'); - const nameEl = el.querySelector('.name, .author-name, .nick-name'); + const titleEl = el.querySelector('.title, .note-title, a.title, .footer .title span'); + const nameEl = el.querySelector('a.author .name, .name, .author-name, .nick-name, a.author'); const likesEl = el.querySelector('.count, .like-count, .like-wrapper .count'); - const linkEl = el.querySelector('a[href*="/explore/"], a[href*="/search_result/"], a[href*="/note/"]'); + const detailLinkEl = + el.querySelector('a.cover.mask') || + el.querySelector('a[href*="/search_result/"]') || + el.querySelector('a[href*="/explore/"]') || + el.querySelector('a[href*="/note/"]'); + const authorLinkEl = el.querySelector('a.author, a[href*="/user/profile/"]'); - const href = linkEl?.getAttribute('href') || ''; - const noteId = href.match(/\\/(?:explore|note)\\/([a-zA-Z0-9]+)/)?.[1] || ''; - - results.push({ - title: (titleEl?.textContent || '').trim(), - author: (nameEl?.textContent || '').trim(), - likes: (likesEl?.textContent || '0').trim(), - url: noteId ? 'https://www.xiaohongshu.com/explore/' + noteId : '', + pushResult({ + title: cleanText(titleEl?.textContent || ''), + author: cleanText(nameEl?.textContent || ''), + likes: cleanText(likesEl?.textContent || '0'), + url: normalizeUrl(detailLinkEl?.getAttribute('href') || ''), + author_url: normalizeUrl(authorLinkEl?.getAttribute('href') || ''), }); }); - return results; + + if (results.length === 0) { + const anchors = Array.from(document.querySelectorAll('a.cover.mask, a[href*="/search_result/"]')); + anchors.forEach(anchor => { + const card = anchor.closest('section, article, div') || anchor.parentElement; + if (!card) return; + const titleEl = card.querySelector('.title, .note-title, .footer .title span, [class*="title"]'); + const nameEl = card.querySelector('a.author .name, .name, .author-name, .nick-name, a.author, [class*="author"], [class*="user"]'); + const likesEl = card.querySelector('.count, .like-count, .like-wrapper .count, [class*="like"]'); + const authorLinkEl = card.querySelector('a.author, a[href*="/user/profile/"]'); + pushResult({ + title: cleanText(titleEl?.textContent || anchor.textContent || ''), + author: cleanText(nameEl?.textContent || ''), + likes: cleanText(likesEl?.textContent || '0'), + url: normalizeUrl(anchor.getAttribute('href') || ''), + author_url: normalizeUrl(authorLinkEl?.getAttribute('href') || ''), + }); + }); + } + + const deduped = []; + const seen = new Set(); + for (const item of results) { + const key = item.url || item.title; + if (!key || seen.has(key)) continue; + seen.add(key); + deduped.push(item); + } + + return { + loginWall, + bodyPreview: (document.body.innerText || '').slice(0, 400), + results: deduped, + }; })() `); - if (!Array.isArray(data)) return []; - return data - .filter((item: any) => item.title) - .slice(0, kwargs.limit) - .map((item: any, i: number) => ({ - rank: i + 1, - ...item, - })); + if (!payload || typeof payload !== 'object') return []; + if ((payload as any).loginWall) { + throw new Error( + 'Xiaohongshu search results are blocked behind a login wall for the current browser session. ' + + 'Open https://www.xiaohongshu.com/search_result in Chrome and sign in, then retry.' + ); + } + + const data = Array.isArray((payload as any).results) ? (payload as any).results as SearchListRow[] : []; + const limited = data.slice(0, kwargs.limit); + const enriched: SearchRow[] = []; + + for (const item of limited) { + const detail = await readNoteDetail(page, item.url); + const fallbackTitle = detail.content.split('\n').map((line) => line.trim()).find(Boolean) || ''; + enriched.push({ + title: detail.title || item.title || fallbackTitle, + author: detail.author || item.author, + likes: item.likes, + url: item.url, + author_url: item.author_url, + content: detail.content, + comment_count: detail.comment_count, + comments: detail.comments, + }); + } + + return enriched.map((item, i) => ({ + rank: i + 1, + ...item, + })); }, }); From ed576e323a85c9a9830196da5eafeb2df298ac12 Mon Sep 17 00:00:00 2001 From: jackwener Date: Mon, 23 Mar 2026 17:33:24 +0800 Subject: [PATCH 2/2] fix(xiaohongshu/search): keep login-wall detection & URL improvements, remove serial per-note enrichment - Detect login wall and throw a clear error message (from original PR) - Preserve search_result/ URL with xsec_token instead of degrading to /explore/ - Add author_url to results - Remove readNoteDetail() + sequential page.goto() per note (caused 60s+ delays for default limit=20 with 3s wait each) - Simplify and unify DOM extraction logic (remove unused fallback anchor scan) - Update tests: cover login-wall, URL preservation (assert single goto), and limit/filter --- src/clis/xiaohongshu/search.test.ts | 67 ++++++++--- src/clis/xiaohongshu/search.ts | 170 +++++----------------------- 2 files changed, 77 insertions(+), 160 deletions(-) diff --git a/src/clis/xiaohongshu/search.test.ts b/src/clis/xiaohongshu/search.test.ts index afec06f9..25c94879 100644 --- a/src/clis/xiaohongshu/search.test.ts +++ b/src/clis/xiaohongshu/search.test.ts @@ -42,7 +42,6 @@ describe('xiaohongshu search', () => { const page = createPageMock([ { loginWall: true, - bodyPreview: '登录后查看搜索结果', results: [], }, ]); @@ -52,39 +51,35 @@ describe('xiaohongshu search', () => { ); }); - it('keeps the search_result url and enriches rows with note details', async () => { + it('returns ranked results with search_result url and author_url preserved', async () => { const cmd = getRegistry().get('xiaohongshu/search'); expect(cmd?.func).toBeTypeOf('function'); const detailUrl = 'https://www.xiaohongshu.com/search_result/68e90be80000000004022e66?xsec_token=test-token&xsec_source='; + const authorUrl = + 'https://www.xiaohongshu.com/user/profile/635a9c720000000018028b40?xsec_token=user-token&xsec_source=pc_search'; + const page = createPageMock([ { loginWall: false, - bodyPreview: '', results: [ { title: '某鱼买FSD被坑了4万', author: '随风', likes: '261', url: detailUrl, - author_url: - 'https://www.xiaohongshu.com/user/profile/635a9c720000000018028b40?xsec_token=user-token&xsec_source=pc_search', + author_url: authorUrl, }, ], }, - { - title: '某鱼买FSD被坑了4万', - author: '随风', - content: '今天早上提车,昨天深夜,心血来潮搜了一下x鱼。', - comment_count: '302', - comments: ['KA330: 没有被坑啊。', 'NONO: 你怎么敢某鱼花4.3W买的'], - }, ]); const result = await cmd!.func!(page, { query: '特斯拉', limit: 1 }); - expect((page.goto as any).mock.calls[1][0]).toBe(detailUrl); + // Should only do one goto (the search page itself), no per-note detail navigation + expect((page.goto as any).mock.calls).toHaveLength(1); + expect(result).toEqual([ { rank: 1, @@ -92,12 +87,48 @@ describe('xiaohongshu search', () => { author: '随风', likes: '261', url: detailUrl, - author_url: - 'https://www.xiaohongshu.com/user/profile/635a9c720000000018028b40?xsec_token=user-token&xsec_source=pc_search', - content: '今天早上提车,昨天深夜,心血来潮搜了一下x鱼。', - comment_count: '302', - comments: ['KA330: 没有被坑啊。', 'NONO: 你怎么敢某鱼花4.3W买的'], + author_url: authorUrl, + }, + ]); + }); + + it('filters out results with no title and respects the limit', async () => { + const cmd = getRegistry().get('xiaohongshu/search'); + expect(cmd?.func).toBeTypeOf('function'); + + const page = createPageMock([ + { + loginWall: false, + results: [ + { + title: 'Result A', + author: 'UserA', + likes: '10', + url: 'https://www.xiaohongshu.com/search_result/aaa', + author_url: '', + }, + { + title: '', + author: 'UserB', + likes: '5', + url: 'https://www.xiaohongshu.com/search_result/bbb', + author_url: '', + }, + { + title: 'Result C', + author: 'UserC', + likes: '3', + url: 'https://www.xiaohongshu.com/search_result/ccc', + author_url: '', + }, + ], }, ]); + + const result = (await cmd!.func!(page, { query: '测试', limit: 1 })) as any[]; + + // limit=1 should return only the first valid-titled result + expect(result).toHaveLength(1); + expect(result[0]).toMatchObject({ rank: 1, title: 'Result A' }); }); }); diff --git a/src/clis/xiaohongshu/search.ts b/src/clis/xiaohongshu/search.ts index 25401db7..cde23366 100644 --- a/src/clis/xiaohongshu/search.ts +++ b/src/clis/xiaohongshu/search.ts @@ -8,76 +8,6 @@ import { cli, Strategy } from '../../registry.js'; -type SearchRow = { - title: string; - author: string; - likes: string; - url: string; - author_url: string; - content: string; - comment_count: string; - comments: string[]; -}; - -type SearchListRow = { - title: string; - author: string; - likes: string; - url: string; - author_url: string; -}; - -async function readNoteDetail(page: any, url: string): Promise> { - await page.goto(url); - await page.wait(3); - - const payload = await page.evaluate(` - (() => { - const state = window.__INITIAL_STATE__ || {}; - const noteState = state.note || {}; - const detailMap = noteState.noteDetailMap || {}; - const detailKeys = Object.keys(detailMap || {}); - const firstDetail = detailKeys.length ? detailMap[detailKeys[0]] : null; - const note = firstDetail?.note || {}; - const comments = firstDetail?.comments?.list || []; - - const title = (note.title || '').trim(); - const content = (note.desc || '').trim(); - const author = (note.user?.nickname || '').trim(); - const commentCount = String(note.interactInfo?.commentCount || note.interact_info?.comment_count || comments.length || 0); - const topComments = comments - .map((item) => { - const nickname = (item?.userInfo?.nickname || '').trim(); - const text = (item?.content || '').trim(); - if (!text) return ''; - return nickname ? nickname + ': ' + text : text; - }) - .filter(Boolean) - .slice(0, 3); - - return { - title, - author, - content, - comment_count: commentCount, - comments: topComments, - }; - })() - `); - - if (!payload || typeof payload !== 'object') { - return { title: '', author: '', content: '', comment_count: '0', comments: [] }; - } - - return { - title: typeof (payload as any).title === 'string' ? (payload as any).title : '', - author: typeof (payload as any).author === 'string' ? (payload as any).author : '', - content: typeof (payload as any).content === 'string' ? (payload as any).content : '', - comment_count: typeof (payload as any).comment_count === 'string' ? (payload as any).comment_count : '0', - comments: Array.isArray((payload as any).comments) ? (payload as any).comments : [], - }; -} - cli({ site: 'xiaohongshu', name: 'search', @@ -88,7 +18,7 @@ cli({ { name: 'query', required: true, positional: true, help: 'Search keyword' }, { name: 'limit', type: 'int', default: 20, help: 'Number of results' }, ], - columns: ['rank', 'title', 'author', 'likes', 'comment_count', 'url'], + columns: ['rank', 'title', 'author', 'likes', 'url'], func: async (page, kwargs) => { const keyword = encodeURIComponent(kwargs.query); await page.goto( @@ -102,19 +32,6 @@ cli({ const payload = await page.evaluate(` (() => { const loginWall = /登录后查看搜索结果/.test(document.body.innerText || ''); - const results = []; - - const pushResult = (raw) => { - const url = (raw?.url || '').trim(); - if (!url) return; - results.push({ - title: (raw?.title || '').trim(), - author: (raw?.author || '').trim(), - likes: (raw?.likes || '0').trim(), - url, - author_url: (raw?.author_url || '').trim(), - }); - }; const normalizeUrl = (href) => { if (!href) return ''; @@ -123,14 +40,19 @@ cli({ return ''; }; - const cleanText = (value) => (value || '').replace(/\s+/g, ' ').trim(); - const notes = document.querySelectorAll('section.note-item'); - notes.forEach(el => { + const cleanText = (value) => (value || '').replace(/\\s+/g, ' ').trim(); + + const results = []; + const seen = new Set(); + + document.querySelectorAll('section.note-item').forEach(el => { + // Skip "related searches" sections if (el.classList.contains('query-note-item')) return; const titleEl = el.querySelector('.title, .note-title, a.title, .footer .title span'); const nameEl = el.querySelector('a.author .name, .name, .author-name, .nick-name, a.author'); const likesEl = el.querySelector('.count, .like-count, .like-wrapper .count'); + // Prefer search_result link (preserves xsec_token) over generic /explore/ link const detailLinkEl = el.querySelector('a.cover.mask') || el.querySelector('a[href*="/search_result/"]') || @@ -138,52 +60,31 @@ cli({ el.querySelector('a[href*="/note/"]'); const authorLinkEl = el.querySelector('a.author, a[href*="/user/profile/"]'); - pushResult({ + const url = normalizeUrl(detailLinkEl?.getAttribute('href') || ''); + if (!url) return; + + const key = url; + if (seen.has(key)) return; + seen.add(key); + + results.push({ title: cleanText(titleEl?.textContent || ''), author: cleanText(nameEl?.textContent || ''), likes: cleanText(likesEl?.textContent || '0'), - url: normalizeUrl(detailLinkEl?.getAttribute('href') || ''), + url, author_url: normalizeUrl(authorLinkEl?.getAttribute('href') || ''), }); }); - if (results.length === 0) { - const anchors = Array.from(document.querySelectorAll('a.cover.mask, a[href*="/search_result/"]')); - anchors.forEach(anchor => { - const card = anchor.closest('section, article, div') || anchor.parentElement; - if (!card) return; - const titleEl = card.querySelector('.title, .note-title, .footer .title span, [class*="title"]'); - const nameEl = card.querySelector('a.author .name, .name, .author-name, .nick-name, a.author, [class*="author"], [class*="user"]'); - const likesEl = card.querySelector('.count, .like-count, .like-wrapper .count, [class*="like"]'); - const authorLinkEl = card.querySelector('a.author, a[href*="/user/profile/"]'); - pushResult({ - title: cleanText(titleEl?.textContent || anchor.textContent || ''), - author: cleanText(nameEl?.textContent || ''), - likes: cleanText(likesEl?.textContent || '0'), - url: normalizeUrl(anchor.getAttribute('href') || ''), - author_url: normalizeUrl(authorLinkEl?.getAttribute('href') || ''), - }); - }); - } - - const deduped = []; - const seen = new Set(); - for (const item of results) { - const key = item.url || item.title; - if (!key || seen.has(key)) continue; - seen.add(key); - deduped.push(item); - } - return { loginWall, - bodyPreview: (document.body.innerText || '').slice(0, 400), - results: deduped, + results, }; })() `); if (!payload || typeof payload !== 'object') return []; + if ((payload as any).loginWall) { throw new Error( 'Xiaohongshu search results are blocked behind a login wall for the current browser session. ' + @@ -191,28 +92,13 @@ cli({ ); } - const data = Array.isArray((payload as any).results) ? (payload as any).results as SearchListRow[] : []; - const limited = data.slice(0, kwargs.limit); - const enriched: SearchRow[] = []; - - for (const item of limited) { - const detail = await readNoteDetail(page, item.url); - const fallbackTitle = detail.content.split('\n').map((line) => line.trim()).find(Boolean) || ''; - enriched.push({ - title: detail.title || item.title || fallbackTitle, - author: detail.author || item.author, - likes: item.likes, - url: item.url, - author_url: item.author_url, - content: detail.content, - comment_count: detail.comment_count, - comments: detail.comments, - }); - } - - return enriched.map((item, i) => ({ - rank: i + 1, - ...item, - })); + const data: any[] = Array.isArray((payload as any).results) ? (payload as any).results : []; + return data + .filter((item: any) => item.title) + .slice(0, kwargs.limit) + .map((item: any, i: number) => ({ + rank: i + 1, + ...item, + })); }, });