From d76c4d9994bea794f43d332dfb59a83d54859435 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E4=BB=A5=E7=90=B3?= Date: Tue, 26 May 2026 22:53:17 +0800 Subject: [PATCH 1/3] feat(douyin): add search command for keyword video search DOM extraction from www.douyin.com/search/?type=video. Requires logged-in profile. plays/comments/shares exposed as 0 (card markup only surfaces likes); see Follow-ups for full-counter path. Schema aligned with tiktok search. Refs https://github.com/Daily-AC/omnireach/issues/12 --- cli-manifest.json | 39 ++++++ clis/douyin/search.js | 264 +++++++++++++++++++++++++++++++++++++ clis/douyin/search.test.js | 232 ++++++++++++++++++++++++++++++++ 3 files changed, 535 insertions(+) create mode 100644 clis/douyin/search.js create mode 100644 clis/douyin/search.test.js diff --git a/cli-manifest.json b/cli-manifest.json index e79ac2eb1..d1fa195bf 100644 --- a/cli-manifest.json +++ b/cli-manifest.json @@ -9418,6 +9418,45 @@ "sourceFile": "douyin/publish.js", "navigateBefore": "https://creator.douyin.com" }, + { + "site": "douyin", + "name": "search", + "description": "关键词搜索抖音视频", + "access": "read", + "domain": "www.douyin.com", + "strategy": "cookie", + "browser": true, + "args": [ + { + "name": "query", + "type": "str", + "required": true, + "positional": true, + "help": "搜索关键词" + }, + { + "name": "limit", + "type": "int", + "default": 10, + "required": false, + "help": "结果数量 (1-30)" + } + ], + "columns": [ + "rank", + "desc", + "author", + "url", + "plays", + "likes", + "comments", + "shares" + ], + "type": "js", + "modulePath": "douyin/search.js", + "sourceFile": "douyin/search.js", + "navigateBefore": "https://www.douyin.com" + }, { "site": "douyin", "name": "stats", diff --git a/clis/douyin/search.js b/clis/douyin/search.js new file mode 100644 index 000000000..9540cbff6 --- /dev/null +++ b/clis/douyin/search.js @@ -0,0 +1,264 @@ +/** + * Douyin search — keyword video search on www.douyin.com. 
+ * + * Strategy: DOM extraction from the server-rendered search results page. + * + * Why not XHR interception: + * The `www.douyin.com/search/?type=video` page renders results into + * `<ul data-e2e="scroll-list">` server-side during initial navigation + * and (for the OpenCLI-bridged browser context) does NOT fire a + * subsequent `/aweme/v1/web/general/search/single/` XHR — we confirmed + * this by `wait xhr "general/search/single"` timing out at 20s on a + * logged-in profile that has visible result cards in the DOM. Direct + * synthesis of the XHR from page context returns + * `status_code: 0, data: [], search_nil_info: { search_nil_type: + * "verify_check" }` because the bare URL lacks the SPA-computed + * `a_bogus` / `msToken` signature. + * + * DOM extraction sidesteps both blockers: the data is already in the + * rendered HTML at the moment of navigation, signature-free. + * + * Selector approach: + * Douyin obfuscates card classnames (e.g. `.ckopQfVu`, `.cIiU4Muu`) + * and they churn between builds. We pin only the stable hooks: + * - container: `[data-e2e="scroll-list"]` + * - row: `li` inside the container + * - url: `a[href*="/video/"]` + * - other fields are extracted from the row's leaf text nodes by + * SHAPE (digit+万/亿 → likes; HH:MM or MM:SS → duration; text after + * `@` → author nickname; longest remaining → desc). + * + * Output fields mirror `tiktok search` (rank, desc, author, url, plays, + * likes, comments, shares) so downstream tools that already normalize + * tiktok rows can consume douyin rows without per-adapter glue. The + * search results page only surfaces the like count — plays/comments/ + * shares are not in the card markup and we expose them as 0 rather + * than fabricate values; clients that need them should fetch + * /aweme/v1/web/aweme/detail/?aweme_id=... for the relevant id. + * + * Prerequisite: the bound Chrome profile must be logged in to + * https://www.douyin.com. The search results page renders an empty + * skeleton for anonymous visitors, which we surface as AuthRequiredError. 
+ */ +import { cli, Strategy } from '@jackwener/opencli/registry'; +import { ArgumentError, AuthRequiredError, CommandExecutionError } from '@jackwener/opencli/errors'; + +export const MAX_SEARCH_LIMIT = 30; +// Time budget for the SPA's initial DOM commit. Empirically the +// scroll-list `
  • ` rows appear within 2-4s of navigation when logged +// in; 15s covers slow networks without blocking on a permanently-empty +// page (anonymous gate, network error). +export const RENDER_TIMEOUT_MS = 15000; + +export function parseSearchLimit(raw) { + const parsed = Number(raw ?? 10); + if (!Number.isFinite(parsed) || !Number.isInteger(parsed)) { + throw new ArgumentError(`--limit must be an integer between 1 and ${MAX_SEARCH_LIMIT}, got ${JSON.stringify(raw)}`); + } + if (parsed < 1 || parsed > MAX_SEARCH_LIMIT) { + throw new ArgumentError(`--limit must be between 1 and ${MAX_SEARCH_LIMIT}, got ${parsed}`); + } + return parsed; +} + +/** + * Parse a Douyin display count like "1.9万", "3.1万", "4702", "1.2亿" + * into a plain integer. Returns 0 for unparseable input rather than + * throwing — the CLI promises numeric columns and missing data is + * common enough on real result rows that a soft fallback is the right + * choice. + */ +export function parseDouyinCount(text) { + if (typeof text !== 'string') return 0; + const m = text.replace(/\s/g, '').match(/^(\d+(?:\.\d+)?)([万亿])?$/); + if (!m) { + const plain = Number(text.replace(/[,\s]/g, '')); + return Number.isFinite(plain) ? Math.round(plain) : 0; + } + const n = Number(m[1]); + if (!Number.isFinite(n)) return 0; + if (m[2] === '万') return Math.round(n * 10_000); + if (m[2] === '亿') return Math.round(n * 100_000_000); + return Math.round(n); +} + +/** + * Resolve scheme-relative or absolute Douyin video links to the canonical + * https://www.douyin.com/video/ shape. Returns '' for unparseable + * input rather than throwing — callers expect a string column. 
+ */ +export function normalizeDouyinVideoUrl(href) { + if (typeof href !== 'string' || !href) return ''; + let full = href; + if (full.startsWith('//')) full = 'https:' + full; + else if (full.startsWith('/')) full = 'https://www.douyin.com' + full; + const idMatch = full.match(/\/video\/(\d+)/); + if (idMatch) return `https://www.douyin.com/video/${idMatch[1]}`; + return full; +} + +/** + * Project a single rendered card into the canonical row shape. Operates + * on a serialized card payload (the raw `{url, leafTexts}` we collect + * via page.evaluate) so this function is unit-testable without a real + * browser. + * + * `leafTexts` is the ordered list of `textContent.trim()` for every leaf + * element inside the card (no children). The fields we want are + * identified by shape: + * - duration: matches `HH:MM:SS` or `MM:SS` + * - likes: matches `(.)?(万|亿)?` and ISN'T the duration + * - author: the text node immediately following an `@` text node + * - desc: the longest remaining leaf text + */ +export function projectCard(card, index) { + const url = normalizeDouyinVideoUrl(card?.url); + const texts = Array.isArray(card?.leafTexts) ? card.leafTexts.map((t) => String(t ?? '').trim()).filter(Boolean) : []; + + const DURATION_RE = /^\d{1,2}:\d{2}(?::\d{2})?$/; + const COUNT_RE = /^\d+(?:\.\d+)?[万亿]?$/; + + let likes = 0; + let author = ''; + let longest = ''; + + for (let i = 0; i < texts.length; i++) { + const t = texts[i]; + if (DURATION_RE.test(t)) continue; + if (!likes && COUNT_RE.test(t)) { + likes = parseDouyinCount(t); + continue; + } + if (t === '@' && !author) { + author = (texts[i + 1] ?? '').trim(); + continue; + } + if (t === author) continue; + if (t.length > longest.length) longest = t; + } + let desc = longest; + // Strip a leading "@author" that some renders fuse into the desc text node. 
+ if (author && desc.startsWith('@' + author)) { + desc = desc.slice(author.length + 1).trim(); + } + return { + rank: index + 1, + desc, + author, + url, + plays: 0, + likes, + comments: 0, + shares: 0, + }; +} + +// JS snippet that waits for the scroll-list to populate, then returns +// `{state: 'rendered', cards}` or `{state: 'login_wall'}` / +// `{state: 'timeout'}`. Runs inside page.evaluate so we don't pay a +// round-trip per poll iteration. +const WAIT_AND_EXTRACT_JS = (timeoutMs) => ` + new Promise((resolve) => { + const collectCards = () => { + const cards = []; + const lis = document.querySelectorAll('[data-e2e="scroll-list"] li'); + for (const li of lis) { + const a = li.querySelector('a[href*="/video/"]'); + if (!a) continue; + const leafTexts = []; + for (const el of li.querySelectorAll('*')) { + if (el.children.length > 0) continue; + const t = (el.textContent || '').trim(); + if (t) leafTexts.push(t); + } + cards.push({ url: a.getAttribute('href') || '', leafTexts }); + } + return cards; + }; + const detectState = () => { + const cards = collectCards(); + if (cards.length > 0) return { state: 'rendered', cards }; + // Anonymous gate: Douyin renders a centered "登录后查看更多内容" + // overlay on /search/ for visitors without sessionid. Match either + // the literal Chinese prompt or a visible login modal/mask. 
+ const text = (document.body && document.body.innerText) || ''; + if (/登录后查看|请先登录|登录抖音/.test(text)) return { state: 'login_wall' }; + const modal = document.querySelector('[class*="login-mask"], [class*="LoginMask"], [class*="login-modal"], dialog[role="dialog"]'); + if (modal && modal instanceof HTMLElement) { + const r = modal.getBoundingClientRect(); + const s = getComputedStyle(modal); + if (r.width > 0 && r.height > 0 && s.display !== 'none' && s.visibility !== 'hidden') { + return { state: 'login_wall' }; + } + } + return null; + }; + const found = detectState(); + if (found) return resolve(found); + const observer = new MutationObserver(() => { + const s = detectState(); + if (s) { observer.disconnect(); resolve(s); } + }); + observer.observe(document.body, { childList: true, subtree: true }); + setTimeout(() => { + observer.disconnect(); + const fallback = detectState(); + resolve(fallback ?? { state: 'timeout' }); + }, ${timeoutMs}); + }) +`; + +function unwrapEvaluateResult(payload) { + if (payload && !Array.isArray(payload) && typeof payload === 'object' && 'session' in payload && 'data' in payload) { + return payload.data; + } + return payload; +} + +cli({ + site: 'douyin', + name: 'search', + access: 'read', + description: '关键词搜索抖音视频', + domain: 'www.douyin.com', + strategy: Strategy.COOKIE, + args: [ + { name: 'query', required: true, positional: true, help: '搜索关键词' }, + { name: 'limit', type: 'int', default: 10, help: `结果数量 (1-${MAX_SEARCH_LIMIT})` }, + ], + columns: ['rank', 'desc', 'author', 'url', 'plays', 'likes', 'comments', 'shares'], + func: async (page, kwargs) => { + const limit = parseSearchLimit(kwargs.limit); + const keyword = String(kwargs.query ?? 
'').trim(); + if (!keyword) { + throw new ArgumentError('douyin search 需要 关键词'); + } + await page.goto(`https://www.douyin.com/search/${encodeURIComponent(keyword)}?type=video`); + let result; + try { + result = unwrapEvaluateResult(await page.evaluate(WAIT_AND_EXTRACT_JS(RENDER_TIMEOUT_MS))); + } catch (error) { + throw new CommandExecutionError(`Douyin search extraction failed: ${error instanceof Error ? error.message : String(error)}`); + } + if (!result || typeof result !== 'object') { + throw new CommandExecutionError('Douyin search: unexpected evaluator payload shape'); + } + if (result.state === 'login_wall') { + throw new AuthRequiredError( + 'www.douyin.com', + 'Douyin search results are blocked behind a login wall — log in at https://www.douyin.com in Chrome first.', + ); + } + if (result.state === 'timeout' || !Array.isArray(result.cards) || result.cards.length === 0) { + // No cards rendered within the budget AND no explicit login + // wall detected. Most common cause is still an unauthenticated + // session (the page just hides results silently); surface as + // AuthRequiredError with the same actionable message. + throw new AuthRequiredError( + 'www.douyin.com', + 'Douyin search returned no results. 
Log in to https://www.douyin.com in Chrome — anonymous sessions get an empty results page without a visible login prompt.', + ); + } + return result.cards.slice(0, limit).map((card, index) => projectCard(card, index)); + }, +}); diff --git a/clis/douyin/search.test.js b/clis/douyin/search.test.js new file mode 100644 index 000000000..63659c3b3 --- /dev/null +++ b/clis/douyin/search.test.js @@ -0,0 +1,232 @@ +import { describe, expect, it, vi } from 'vitest'; +import { getRegistry } from '@jackwener/opencli/registry'; +import { + MAX_SEARCH_LIMIT, + normalizeDouyinVideoUrl, + parseDouyinCount, + parseSearchLimit, + projectCard, +} from './search.js'; + +function createPageMock({ evaluateResult } = {}) { + return { + goto: vi.fn().mockResolvedValue(undefined), + wait: vi.fn().mockResolvedValue(undefined), + evaluate: vi.fn().mockResolvedValue(evaluateResult), + }; +} + +describe('douyin search', () => { + it('registers the command on www.douyin.com', () => { + const registry = getRegistry(); + const cmd = [...registry.values()].find((c) => c.site === 'douyin' && c.name === 'search'); + expect(cmd).toBeDefined(); + expect(cmd?.domain).toBe('www.douyin.com'); + }); + + it('rejects invalid limit before navigation', async () => { + const cmd = getRegistry().get('douyin/search'); + const page = createPageMock(); + await expect(cmd.func(page, { query: '咖啡', limit: 0 })).rejects.toMatchObject({ + code: 'ARGUMENT', + message: expect.stringContaining('--limit'), + }); + expect(page.goto).not.toHaveBeenCalled(); + expect(page.evaluate).not.toHaveBeenCalled(); + }); + + it('rejects limit above MAX_SEARCH_LIMIT', () => { + expect(() => parseSearchLimit(MAX_SEARCH_LIMIT + 1)).toThrow(/--limit/); + }); + + it('rejects an empty query', async () => { + const cmd = getRegistry().get('douyin/search'); + const page = createPageMock(); + await expect(cmd.func(page, { query: ' ', limit: 5 })).rejects.toMatchObject({ + code: 'ARGUMENT', + }); + expect(page.goto).not.toHaveBeenCalled(); + 
}); + + it('returns ranked cards from the rendered scroll-list', async () => { + const cmd = getRegistry().get('douyin/search'); + const page = createPageMock({ + evaluateResult: { + state: 'rendered', + cards: [ + { + url: '//www.douyin.com/video/7585120459717365001', + leafTexts: [ + '合集', + '03:55', + '1.9万', + 'Python邪修,5分钟学完Python基础 #python #编程', + '@', + '校长讲python(无小号)', + '5月前', + ], + }, + ], + }, + }); + const rows = await cmd.func(page, { query: 'python', limit: 5 }); + expect(page.goto).toHaveBeenCalledWith('https://www.douyin.com/search/python?type=video'); + expect(rows).toEqual([ + { + rank: 1, + desc: 'Python邪修,5分钟学完Python基础 #python #编程', + author: '校长讲python(无小号)', + url: 'https://www.douyin.com/video/7585120459717365001', + plays: 0, + likes: 19000, + comments: 0, + shares: 0, + }, + ]); + }); + + it('encodes Chinese keywords in the URL path', async () => { + const cmd = getRegistry().get('douyin/search'); + const page = createPageMock({ evaluateResult: { state: 'rendered', cards: [{ url: '/video/1', leafTexts: ['hi'] }] } }); + await cmd.func(page, { query: 'AI 编程', limit: 1 }); + expect(page.goto).toHaveBeenCalledWith('https://www.douyin.com/search/AI%20%E7%BC%96%E7%A8%8B?type=video'); + }); + + it('respects --limit cap when the page rendered more cards than requested', async () => { + const cmd = getRegistry().get('douyin/search'); + const cards = Array.from({ length: 12 }, (_, i) => ({ + url: `//www.douyin.com/video/100000${i}`, + leafTexts: ['03:00', `${i + 1}万`, `video ${i}`, '@', `user${i}`], + })); + const page = createPageMock({ evaluateResult: { state: 'rendered', cards } }); + const rows = await cmd.func(page, { query: 'x', limit: 3 }); + expect(rows).toHaveLength(3); + expect(rows.map((r) => r.url)).toEqual([ + 'https://www.douyin.com/video/1000000', + 'https://www.douyin.com/video/1000001', + 'https://www.douyin.com/video/1000002', + ]); + }); + + it('maps the explicit login-wall state to AuthRequiredError', async () => { + const cmd 
= getRegistry().get('douyin/search'); + const page = createPageMock({ evaluateResult: { state: 'login_wall' } }); + await expect(cmd.func(page, { query: 'x', limit: 1 })).rejects.toMatchObject({ + code: 'AUTH_REQUIRED', + message: expect.stringContaining('login wall'), + }); + }); + + it('maps the timeout/empty state to AuthRequiredError (anonymous sessions get a silent empty page)', async () => { + const cmd = getRegistry().get('douyin/search'); + const page = createPageMock({ evaluateResult: { state: 'timeout' } }); + await expect(cmd.func(page, { query: 'x', limit: 1 })).rejects.toMatchObject({ + code: 'AUTH_REQUIRED', + }); + }); + + it('unwraps Browser Bridge {session, data} envelopes before inspecting state', async () => { + const cmd = getRegistry().get('douyin/search'); + const page = createPageMock({ + evaluateResult: { + session: 'site:douyin', + data: { state: 'rendered', cards: [{ url: '/video/9', leafTexts: ['demo'] }] }, + }, + }); + const rows = await cmd.func(page, { query: 'x', limit: 1 }); + expect(rows).toHaveLength(1); + expect(rows[0].url).toBe('https://www.douyin.com/video/9'); + }); + + it('throws CommandExecutionError on malformed evaluator payload', async () => { + const cmd = getRegistry().get('douyin/search'); + const page = createPageMock({ evaluateResult: 'not-an-object' }); + await expect(cmd.func(page, { query: 'x', limit: 1 })).rejects.toMatchObject({ + code: 'COMMAND_EXEC', + }); + }); +}); + +describe('parseDouyinCount', () => { + it.each([ + ['1.9万', 19_000], + ['3万', 30_000], + ['4702', 4702], + ['1,234', 1234], + ['1.2亿', 120_000_000], + ['', 0], + ['unknown', 0], + [null, 0], + [undefined, 0], + ])('parses %j as %i', (input, expected) => { + expect(parseDouyinCount(input)).toBe(expected); + }); +}); + +describe('normalizeDouyinVideoUrl', () => { + it.each([ + ['//www.douyin.com/video/123', 'https://www.douyin.com/video/123'], + ['/video/123?foo=bar', 'https://www.douyin.com/video/123'], + 
['https://www.douyin.com/video/123?something', 'https://www.douyin.com/video/123'], + ['', ''], + [null, ''], + ])('normalizes %j → %j', (input, expected) => { + expect(normalizeDouyinVideoUrl(input)).toBe(expected); + }); +}); + +describe('projectCard', () => { + it('extracts duration/likes/desc/author by leaf-text shape, classname-agnostic', () => { + const row = projectCard({ + url: '//www.douyin.com/video/7585120459717365001', + leafTexts: ['合集', '03:55', '1.9万', 'Python邪修', '@', '校长', '5月前'], + }, 0); + expect(row).toEqual({ + rank: 1, + desc: 'Python邪修', + author: '校长', + url: 'https://www.douyin.com/video/7585120459717365001', + plays: 0, + likes: 19000, + comments: 0, + shares: 0, + }); + }); + + it('returns the longest non-skipped text as desc, not the publish-date suffix', () => { + const row = projectCard({ + url: '/video/1', + leafTexts: ['02:00', '4702', 'hi long-text', '@', 'user', '1月前'], + }, 0); + expect(row.desc).toBe('hi long-text'); + expect(row.author).toBe('user'); + }); + + it('strips a fused @author prefix from the desc when present', () => { + const row = projectCard({ + url: '/video/1', + leafTexts: ['02:00', '100', '@alice this is the caption', '@', 'alice'], + }, 0); + expect(row.author).toBe('alice'); + expect(row.desc).toBe('this is the caption'); + }); + + it('returns safe defaults when leafTexts is missing', () => { + const row = projectCard({ url: '/video/42', leafTexts: undefined }, 4); + expect(row).toEqual({ + rank: 5, + desc: '', + author: '', + url: 'https://www.douyin.com/video/42', + plays: 0, + likes: 0, + comments: 0, + shares: 0, + }); + }); + + it('returns rank=index+1 regardless of input', () => { + const row = projectCard({ url: '/video/1', leafTexts: ['x'] }, 9); + expect(row.rank).toBe(10); + }); +}); From 22cf2b780238068bd282f4b4525fab4d960e8515 Mon Sep 17 00:00:00 2001 From: jackwener Date: Wed, 27 May 2026 03:32:09 +0800 Subject: [PATCH 2/3] fix(douyin): harden search result identity --- clis/douyin/search.js | 76 
+++++++++++++++++++++++++++----------- clis/douyin/search.test.js | 56 +++++++++++++++++++++++++++- 2 files changed, 108 insertions(+), 24 deletions(-) diff --git a/clis/douyin/search.js b/clis/douyin/search.js index 9540cbff6..af4880602 100644 --- a/clis/douyin/search.js +++ b/clis/douyin/search.js @@ -41,7 +41,7 @@ * skeleton for anonymous visitors, which we surface as AuthRequiredError. */ import { cli, Strategy } from '@jackwener/opencli/registry'; -import { ArgumentError, AuthRequiredError, CommandExecutionError } from '@jackwener/opencli/errors'; +import { ArgumentError, AuthRequiredError, CommandExecutionError, EmptyResultError } from '@jackwener/opencli/errors'; export const MAX_SEARCH_LIMIT = 30; // Time budget for the SPA's initial DOM commit. Empirically the @@ -82,24 +82,35 @@ export function parseDouyinCount(text) { return Math.round(n); } +export function extractDouyinVideoId(href) { + if (typeof href !== 'string' || !href) return ''; + let full = href; + if (full.startsWith('//')) full = 'https:' + full; + else if (full.startsWith('/')) full = 'https://www.douyin.com' + full; + try { + const parsed = new URL(full); + if (!/(^|\.)douyin\.com$/.test(parsed.hostname)) return ''; + const match = parsed.pathname.match(/^\/video\/(\d+)$/); + return match?.[1] ?? ''; + } + catch { + return ''; + } +} + /** * Resolve scheme-relative or absolute Douyin video links to the canonical * https://www.douyin.com/video/ shape. Returns '' for unparseable * input rather than throwing — callers expect a string column. */ export function normalizeDouyinVideoUrl(href) { - if (typeof href !== 'string' || !href) return ''; - let full = href; - if (full.startsWith('//')) full = 'https:' + full; - else if (full.startsWith('/')) full = 'https://www.douyin.com' + full; - const idMatch = full.match(/\/video\/(\d+)/); - if (idMatch) return `https://www.douyin.com/video/${idMatch[1]}`; - return full; + const id = extractDouyinVideoId(href); + return id ? 
`https://www.douyin.com/video/${id}` : ''; } /** * Project a single rendered card into the canonical row shape. Operates - * on a serialized card payload (the raw `{url, leafTexts}` we collect + * on a serialized card payload (the raw `{href, leafTexts}` we collect * via page.evaluate) so this function is unit-testable without a real * browser. * @@ -112,7 +123,7 @@ export function normalizeDouyinVideoUrl(href) { * - desc: the longest remaining leaf text */ export function projectCard(card, index) { - const url = normalizeDouyinVideoUrl(card?.url); + const url = normalizeDouyinVideoUrl(card?.url ?? card?.href); const texts = Array.isArray(card?.leafTexts) ? card.leafTexts.map((t) => String(t ?? '').trim()).filter(Boolean) : []; const DURATION_RE = /^\d{1,2}:\d{2}(?::\d{2})?$/; @@ -153,6 +164,17 @@ export function projectCard(card, index) { }; } +function isProjectedRowUsable(row) { + return Boolean(row?.url && row?.desc); +} + +export function projectSearchCards(cards, limit) { + const window = Array.isArray(cards) ? cards.slice(0, limit) : []; + const rows = window.map((card, index) => projectCard(card, index)); + const invalidCount = rows.filter((row) => !isProjectedRowUsable(row)).length; + return { rows: rows.filter(isProjectedRowUsable), invalidCount }; +} + // JS snippet that waits for the scroll-list to populate, then returns // `{state: 'rendered', cards}` or `{state: 'login_wall'}` / // `{state: 'timeout'}`. Runs inside page.evaluate so we don't pay a @@ -171,7 +193,7 @@ const WAIT_AND_EXTRACT_JS = (timeoutMs) => ` const t = (el.textContent || '').trim(); if (t) leafTexts.push(t); } - cards.push({ url: a.getAttribute('href') || '', leafTexts }); + cards.push({ href: a.getAttribute('href') || '', leafTexts }); } return cards; }; @@ -182,7 +204,8 @@ const WAIT_AND_EXTRACT_JS = (timeoutMs) => ` // overlay on /search/ for visitors without sessionid. Match either // the literal Chinese prompt or a visible login modal/mask. 
const text = (document.body && document.body.innerText) || ''; - if (/登录后查看|请先登录|登录抖音/.test(text)) return { state: 'login_wall' }; + if (/登录后查看|请先登录|登录抖音|验证码|验证|verify_check|安全校验/.test(text)) return { state: 'login_wall' }; + if (/暂无相关搜索结果|没有找到相关结果|搜索结果为空|暂无结果/.test(text)) return { state: 'empty' }; const modal = document.querySelector('[class*="login-mask"], [class*="LoginMask"], [class*="login-modal"], dialog[role="dialog"]'); if (modal && modal instanceof HTMLElement) { const r = modal.getBoundingClientRect(); @@ -249,16 +272,25 @@ cli({ 'Douyin search results are blocked behind a login wall — log in at https://www.douyin.com in Chrome first.', ); } - if (result.state === 'timeout' || !Array.isArray(result.cards) || result.cards.length === 0) { - // No cards rendered within the budget AND no explicit login - // wall detected. Most common cause is still an unauthenticated - // session (the page just hides results silently); surface as - // AuthRequiredError with the same actionable message. - throw new AuthRequiredError( - 'www.douyin.com', - 'Douyin search returned no results. Log in to https://www.douyin.com in Chrome — anonymous sessions get an empty results page without a visible login prompt.', - ); + if (result.state === 'empty') { + throw new EmptyResultError('douyin search', `No Douyin videos matched "${keyword}".`); + } + if (result.state === 'timeout') { + throw new CommandExecutionError('Douyin search did not render result cards within the timeout. 
Open the same search in Chrome and verify login/security state before retrying.'); + } + if (!Array.isArray(result.cards)) { + throw new CommandExecutionError('Douyin search: evaluator returned malformed cards payload'); + } + if (result.cards.length === 0) { + throw new EmptyResultError('douyin search', `No Douyin videos matched "${keyword}".`); + } + const projected = projectSearchCards(result.cards, limit); + if (projected.invalidCount > 0) { + throw new CommandExecutionError('Douyin search parser found result cards without stable video url or description'); + } + if (projected.rows.length === 0) { + throw new EmptyResultError('douyin search', `No Douyin videos matched "${keyword}".`); } - return result.cards.slice(0, limit).map((card, index) => projectCard(card, index)); + return projected.rows; }, }); diff --git a/clis/douyin/search.test.js b/clis/douyin/search.test.js index 63659c3b3..6c7b3b516 100644 --- a/clis/douyin/search.test.js +++ b/clis/douyin/search.test.js @@ -1,11 +1,13 @@ import { describe, expect, it, vi } from 'vitest'; import { getRegistry } from '@jackwener/opencli/registry'; import { + extractDouyinVideoId, MAX_SEARCH_LIMIT, normalizeDouyinVideoUrl, parseDouyinCount, parseSearchLimit, projectCard, + projectSearchCards, } from './search.js'; function createPageMock({ evaluateResult } = {}) { @@ -117,11 +119,19 @@ describe('douyin search', () => { }); }); - it('maps the timeout/empty state to AuthRequiredError (anonymous sessions get a silent empty page)', async () => { + it('maps explicit empty search state to EmptyResultError', async () => { + const cmd = getRegistry().get('douyin/search'); + const page = createPageMock({ evaluateResult: { state: 'empty' } }); + await expect(cmd.func(page, { query: 'x', limit: 1 })).rejects.toMatchObject({ + code: 'EMPTY_RESULT', + }); + }); + + it('maps timeout state to CommandExecutionError instead of treating parser drift as auth or empty', async () => { const cmd = getRegistry().get('douyin/search'); 
const page = createPageMock({ evaluateResult: { state: 'timeout' } }); await expect(cmd.func(page, { query: 'x', limit: 1 })).rejects.toMatchObject({ - code: 'AUTH_REQUIRED', + code: 'COMMAND_EXEC', }); }); @@ -145,6 +155,30 @@ describe('douyin search', () => { code: 'COMMAND_EXEC', }); }); + + it('throws CommandExecutionError on malformed cards payload', async () => { + const cmd = getRegistry().get('douyin/search'); + const page = createPageMock({ evaluateResult: { state: 'rendered', cards: { bad: true } } }); + await expect(cmd.func(page, { query: 'x', limit: 1 })).rejects.toMatchObject({ + code: 'COMMAND_EXEC', + }); + }); + + it('fails closed instead of partially returning cards missing stable url or desc', async () => { + const cmd = getRegistry().get('douyin/search'); + const page = createPageMock({ + evaluateResult: { + state: 'rendered', + cards: [ + { url: '/video/123', leafTexts: ['03:00', 'valid desc'] }, + { url: 'https://evil.test/video/456', leafTexts: ['03:00', 'invalid url'] }, + ], + }, + }); + await expect(cmd.func(page, { query: 'x', limit: 2 })).rejects.toMatchObject({ + code: 'COMMAND_EXEC', + }); + }); }); describe('parseDouyinCount', () => { @@ -168,11 +202,19 @@ describe('normalizeDouyinVideoUrl', () => { ['//www.douyin.com/video/123', 'https://www.douyin.com/video/123'], ['/video/123?foo=bar', 'https://www.douyin.com/video/123'], ['https://www.douyin.com/video/123?something', 'https://www.douyin.com/video/123'], + ['https://evil.test/video/123', ''], + ['https://www.douyin.com/user/video/123', ''], ['', ''], [null, ''], ])('normalizes %j → %j', (input, expected) => { expect(normalizeDouyinVideoUrl(input)).toBe(expected); }); + + it('extracts only stable Douyin video ids', () => { + expect(extractDouyinVideoId('https://www.douyin.com/video/123')).toBe('123'); + expect(extractDouyinVideoId('//www.douyin.com/video/456')).toBe('456'); + expect(extractDouyinVideoId('https://evil.test/video/123')).toBe(''); + }); }); describe('projectCard', () 
=> { @@ -229,4 +271,14 @@ describe('projectCard', () => { const row = projectCard({ url: '/video/1', leafTexts: ['x'] }, 9); expect(row.rank).toBe(10); }); + + it('projects cards and reports malformed rows in the returned window', () => { + const result = projectSearchCards([ + { url: '/video/1', leafTexts: ['caption'] }, + { url: '/video/not-numeric', leafTexts: ['bad'] }, + { url: '/video/3', leafTexts: [] }, + ], 3); + expect(result.rows).toHaveLength(1); + expect(result.invalidCount).toBe(2); + }); }); From 5cebe416a7ceec8ffdcf49d9d2cce6f4c0f275b5 Mon Sep 17 00:00:00 2001 From: jackwener Date: Wed, 27 May 2026 03:38:58 +0800 Subject: [PATCH 3/3] fix(douyin): reject metadata-only search cards --- clis/douyin/search.js | 12 ++++++++++++ clis/douyin/search.test.js | 23 +++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/clis/douyin/search.js b/clis/douyin/search.js index af4880602..22079c18b 100644 --- a/clis/douyin/search.js +++ b/clis/douyin/search.js @@ -108,6 +108,17 @@ export function normalizeDouyinVideoUrl(href) { return id ? `https://www.douyin.com/video/${id}` : ''; } +function isSearchCardMetadataText(text) { + if (!text) return true; + if (/^\d{1,2}:\d{2}(?::\d{2})?$/.test(text)) return true; + if (/^\d+(?:\.\d+)?[万亿]?$/.test(text)) return true; + if (/^(合集|视频|作者)$/.test(text)) return true; + if (/^(刚刚|今天|昨天|前天)$/.test(text)) return true; + if (/^\d+\s*(秒|分钟|小时|天|周|个月|月|年)前$/.test(text)) return true; + if (/^\d{4}[-/.年]\d{1,2}(?:[-/.月]\d{1,2}日?)?$/.test(text)) return true; + return false; +} + /** * Project a single rendered card into the canonical row shape. 
Operates * on a serialized card payload (the raw `{href, leafTexts}` we collect @@ -145,6 +156,7 @@ export function projectCard(card, index) { continue; } if (t === author) continue; + if (isSearchCardMetadataText(t)) continue; if (t.length > longest.length) longest = t; } let desc = longest; diff --git a/clis/douyin/search.test.js b/clis/douyin/search.test.js index 6c7b3b516..8741e7962 100644 --- a/clis/douyin/search.test.js +++ b/clis/douyin/search.test.js @@ -179,6 +179,21 @@ describe('douyin search', () => { code: 'COMMAND_EXEC', }); }); + + it('fails closed when a card only has metadata text and no stable desc', async () => { + const cmd = getRegistry().get('douyin/search'); + const page = createPageMock({ + evaluateResult: { + state: 'rendered', + cards: [ + { url: '/video/123', leafTexts: ['合集', '03:00', '1.2万', '@', '作者名', '5月前'] }, + ], + }, + }); + await expect(cmd.func(page, { query: 'x', limit: 1 })).rejects.toMatchObject({ + code: 'COMMAND_EXEC', + }); + }); }); describe('parseDouyinCount', () => { @@ -281,4 +296,12 @@ describe('projectCard', () => { expect(result.rows).toHaveLength(1); expect(result.invalidCount).toBe(2); }); + + it('does not treat metadata-only leaf text as a stable desc', () => { + const result = projectSearchCards([ + { url: '/video/1', leafTexts: ['合集', '03:55', '1.9万', '@', '校长', '5月前'] }, + ], 1); + expect(result.rows).toHaveLength(0); + expect(result.invalidCount).toBe(1); + }); });