From e591f70acff00b09d3a653f4f01addaac16ab9ef Mon Sep 17 00:00:00 2001 From: jackwener Date: Mon, 1 Jun 2026 13:54:12 +0800 Subject: [PATCH] feat(author): require network-first strategy evidence --- skills/opencli-adapter-author/SKILL.md | 40 +++- .../references/api-discovery.md | 9 + src/browser/analyze.test.ts | 55 ++++- src/browser/analyze.ts | 194 +++++++++++++++++- src/cli.test.ts | 12 +- src/cli.ts | 3 +- 6 files changed, 300 insertions(+), 13 deletions(-) diff --git a/skills/opencli-adapter-author/SKILL.md b/skills/opencli-adapter-author/SKILL.md index 327c2c01b..8af3eee7a 100644 --- a/skills/opencli-adapter-author/SKILL.md +++ b/skills/opencli-adapter-author/SKILL.md @@ -28,6 +28,39 @@ allowed-tools: Bash(opencli:*), Read, Edit, Write, Grep ## 顶层决策树 +**先定 strategy,再写 adapter。** 每次进入 Step 3/4 后、写代码前,必须产出一段 strategy note。没有这段 note,不要开始写 `clis//.js`。 + +核心判断不是 "API 比 DOM 高级",而是 **数据源有没有外部契约**。实测维护成本显示:公开/官方接口最稳;UI/DOM 语义通常也有用户可见契约;站内未文档化 XHR/GraphQL/signature endpoint 最容易漂。不要为了 "API-first" 把稳定的 UI/DOM 实现盲目迁到无契约内部接口。 + +```md +Strategy: PUBLIC_API | COOKIE_API | PAGE_FETCH | INTERCEPT | DOM_STATE | UI_SELECTOR +Contract: stable | visible-ui | internal-unstable +Evidence: +- observed request/state: +- auth source: +- replay result: + +If Strategy is PAGE_FETCH or INTERCEPT: +- why PUBLIC_API / COOKIE_API are unavailable: +- why UI_SELECTOR / DOM_STATE are not safer: +- why the maintenance cost is acceptable: +``` + +Strategy classes: + +| Strategy | 契约级别 | 用在什么时候 | 证据要求 | +|---|---|---|---| +| `PUBLIC_API` | stable | 不需要登录,Node-side `fetch` 直接拿到目标数据 | 200 + JSON/HTML 含目标数据,不是埋点/广告 | +| `COOKIE_API` | stable | Node-side `fetch` + `page.getCookies()` / header helper 能拿数据 | cookie/CSRF 来源清楚,replay 非空 | +| `UI_SELECTOR` | visible-ui | publish/upload/click/表单,或页面语义比内部接口更稳 | selector 有语义锚点;错误路径是 typed error | +| `DOM_STATE` | visible-ui | 数据在 hydration state / bootstrap JSON / SSR HTML 里 | state key / script JSON / HTML 结构明确 | +| `PAGE_FETCH` | internal-unstable | 
只能在页面上下文 `fetch` 才能复用 same-origin/session/runtime | `opencli browser eval fetch(...)` 非空;必须解释为什么避不开内部接口 | +| `INTERCEPT` | internal-unstable | 请求签名复杂,但页面自己能自然发出请求 | 触发 UI 后能截到目标 response;必须解释为什么 UI/DOM 不够 | + +选择规则:优先 `PUBLIC_API` / `COOKIE_API`。如果 UI/DOM 语义稳定,不要强行升级到 `PAGE_FETCH` / `INTERCEPT`。只有公开/官方接口不可用、UI/DOM 无法表达目标数据或操作时,才承担无契约内部接口的维护成本。 + +边界:只复用页面自己已经合法获得的数据/能力。不教破解签名、不绕验证码/风控/访问控制;遇到不可复用签名(如必须由页面 runtime 生成且不能安全抽象)就降级到 `UI_SELECTOR` / `DOM_STATE` / `INTERCEPT`。 + ``` START │ @@ -123,7 +156,12 @@ DONE [ ] 5. 直接 fetch 候选 endpoint 验证: [ ] 返回 200 [ ] 响应含目标数据(不是 HTML / 广告) -[ ] 6. 定鉴权策略:裸 fetch 通 → PUBLIC;要 cookie / token / csrf → COOKIE;拿不到签名 → INTERCEPT;只能点 UI → UI +[ ] 6. 写 strategy note(写代码前的强制产物): + [ ] 从 `PUBLIC_API / COOKIE_API / PAGE_FETCH / INTERCEPT / DOM_STATE / UI_SELECTOR` 选一个 + [ ] 填 Contract:`stable / visible-ui / internal-unstable` + [ ] 填 Evidence:observed request/state、auth source、replay result + [ ] 如果选 `PAGE_FETCH` / `INTERCEPT`,必须解释为什么 `PUBLIC_API` / `COOKIE_API` / `UI_SELECTOR` / `DOM_STATE` 都不适合 + [ ] 如果选 `UI_SELECTOR` / `DOM_STATE`,不需要为 "为什么不是 API" 过度辩护;只要说明语义锚点和 typed error 路径 [ ] 7. 
字段解码: [ ] 自解释 → 直接用 key [ ] 已知代号 → field-conventions.md 查表 diff --git a/skills/opencli-adapter-author/references/api-discovery.md b/skills/opencli-adapter-author/references/api-discovery.md index e919087a2..c3eebab83 100644 --- a/skills/opencli-adapter-author/references/api-discovery.md +++ b/skills/opencli-adapter-author/references/api-discovery.md @@ -59,6 +59,15 @@ opencli browser network 静态资源 / 埋点 / 追踪默认已过滤。默认会保留 JSON / XML / plain text / `text/javascript` 这类 API 响应;如果你确定浏览器 DevTools 里有目标请求但这里缺失,用 `--all` 查一遍是否被 content-type 或 URL 噪音过滤挡掉。 +如果是冷启动,先看 `opencli browser analyze <url>` 里的 `api_candidates`: + +- `verdict: "likely_data"`:优先 replay 这条,拿 status / content-type / sample shape 填 strategy note +- `verdict: "maybe_data"`:可以试,但必须人工核对字段是否是目标业务数据 +- `verdict: "noise"`:多半是 analytics / beacon / personalization,不要因为 XHR 数量多就判 Pattern A +- `verdict: "blocked"`:401/403;先排 cookie / token / CSRF,别直接退到 selector + +`real_data_score` 是证据,不是自动 strategy。最终仍要在 strategy note 里写 replay 结果和降级理由。 + ### 按 shape 初筛 挑 `key` 里含业务词(`list / detail / Timeline / User / Tweets / Quote`)的优先看 `shape`: diff --git a/src/browser/analyze.test.ts b/src/browser/analyze.test.ts index 9c743d6a7..36f89e52f 100644 --- a/src/browser/analyze.test.ts +++ b/src/browser/analyze.test.ts @@ -4,6 +4,7 @@ import { detectAntiBot, classifyPattern, findNearestAdapter, + scoreEndpointEvidence, type PageSignals, } from './analyze.js'; import type { CliCommand } from '../registry.js'; @@ -87,13 +88,29 @@ describe('classifyPattern', () => { const v = classifyPattern( mkSignals({ networkEntries: [ - { url: 'https://x.com/api/a', status: 200, contentType: 'application/json', bodyPreview: '{}' }, - { url: 'https://x.com/api/b', status: 200, contentType: 'application/json;charset=utf-8', bodyPreview: '{}' }, + { url: 'https://x.com/api/a', status: 200, contentType: 'application/json', bodyPreview: '{"items":[{"title":"A","id":"1"}]}' }, + { url: 'https://x.com/api/b', status: 200, contentType: 
'application/json;charset=utf-8', bodyPreview: '{"data":{"results":[{"name":"B","url":"/b"}]}}' }, ], }), ); expect(v.pattern).toBe('A'); expect(v.json_responses).toBe(2); + expect(v.real_data_candidates).toBe(2); + }); + + it('does not call analytics JSON a real API pattern', () => { + const v = classifyPattern( + mkSignals({ + networkEntries: [ + { url: 'https://x.com/analytics/collect', status: 200, contentType: 'application/json', bodyPreview: '{"event":"view","clientId":"abc","experiment":"A"}' }, + { url: 'https://x.com/personalization', status: 200, contentType: 'application/json', bodyPreview: '{"sessionId":"s1","metrics":{"latency":12}}' }, + ], + }), + ); + expect(v.pattern).toBe('C'); + expect(v.json_responses).toBe(2); + expect(v.real_data_candidates).toBe(0); + expect(v.reason).toMatch(/telemetry|side-channel/); }); it('returns B when __INITIAL_STATE__ is present, beating JSON signals', () => { @@ -127,6 +144,40 @@ describe('classifyPattern', () => { }); }); +describe('scoreEndpointEvidence', () => { + it('scores non-empty business JSON above telemetry side-channel JSON', () => { + const data = scoreEndpointEvidence({ + url: 'https://x.com/api/search', + status: 200, + contentType: 'application/json', + bodyPreview: '{"data":{"items":[{"title":"A","price":12,"url":"/a"}],"total":1}}', + }); + const telemetry = scoreEndpointEvidence({ + url: 'https://x.com/analytics/collect', + status: 200, + contentType: 'application/json', + bodyPreview: '{"event":"view","clientId":"abc"}', + }); + + expect(data.verdict).toBe('likely_data'); + expect(data.real_data_score).toBeGreaterThan(telemetry.real_data_score); + expect(data.sample_paths).toContain('$.data.items:array(1)'); + expect(telemetry.verdict).toBe('noise'); + }); + + it('marks auth-gated JSON as blocked rather than data', () => { + const evidence = scoreEndpointEvidence({ + url: 'https://x.com/api/private', + status: 403, + contentType: 'application/json', + bodyPreview: '{"error":"forbidden"}', + }); + + 
expect(evidence.verdict).toBe('blocked'); + expect(evidence.real_data_score).toBeLessThan(0.1); + }); +}); + describe('findNearestAdapter', () => { it('matches by domain suffix', () => { const reg = new Map([ diff --git a/src/browser/analyze.ts b/src/browser/analyze.ts index 073023cff..d584907f0 100644 --- a/src/browser/analyze.ts +++ b/src/browser/analyze.ts @@ -51,6 +51,18 @@ export interface PageSignals { title: string; } +export type EndpointEvidenceVerdict = 'likely_data' | 'maybe_data' | 'noise' | 'blocked'; + +export interface EndpointEvidence { + url: string; + status: number; + contentType: string; + real_data_score: number; + verdict: EndpointEvidenceVerdict; + reasons: string[]; + sample_paths: string[]; +} + // ── Anti-bot detection ──────────────────────────────────────────────────── export type AntiBotVendor = @@ -160,10 +172,173 @@ export interface PatternVerdict { reason: string; /** How many JSON XHR/fetch responses we saw during navigation. */ json_responses: number; + /** How many observed responses look like real business data, not telemetry. */ + real_data_candidates: number; /** Count of non-2xx API responses — hint for token-gated (Pattern D). 
*/ auth_failures: number; } +const NOISE_URL_RE = /(?:analytics|beacon|collect|telemetry|tracking|sentry|doubleclick|google-analytics|googletagmanager|adservice|\/ads?(?:[/?#]|$)|metrics?|pixel|personalization|experiment|\/events?(?:[/?#]|$))/i; +const BUSINESS_KEY_RE = /^(?:data|items?|results?|records?|list|rows?|edges?|nodes?|timeline|users?|title|name|text|content|body|price|amount|id|url|avatar|nickname|desc|comments?|likes?|shares?|total|page|cursor|next|rank|score|date|time|author)$/i; +const TRACKING_KEY_RE = /^(?:event|events|trace|traceid|sessionid|clientid|visitorid|experiment|abtest|beacon|analytics|metrics?|pixel|log|logs)$/i; + +function clampScore(value: number): number { + return Math.max(0, Math.min(1, Math.round(value * 100) / 100)); +} + +function parseBodyPreview(preview: string | null): unknown { + if (!preview) return null; + const trimmed = preview.trim(); + if (!trimmed) return null; + try { + return JSON.parse(trimmed); + } catch { + return trimmed; + } +} + +function collectJsonPaths(value: unknown, prefix = '$', out: string[] = [], depth = 0): string[] { + if (depth > 4 || out.length >= 24) return out; + if (Array.isArray(value)) { + out.push(`${prefix}:array(${value.length})`); + if (value.length > 0) collectJsonPaths(value[0], `${prefix}[0]`, out, depth + 1); + return out; + } + if (!value || typeof value !== 'object') { + out.push(`${prefix}:${typeof value}`); + return out; + } + const entries = Object.entries(value as Record).slice(0, 12); + for (const [key, child] of entries) { + out.push(`${prefix}.${key}`); + if (Array.isArray(child)) out.push(`${prefix}.${key}:array(${child.length})`); + else if (child && typeof child === 'object') collectJsonPaths(child, `${prefix}.${key}`, out, depth + 1); + else out.push(`${prefix}.${key}:${typeof child}`); + } + return out; +} + +function countKeys(value: unknown, predicate: (key: string) => boolean, depth = 0): number { + if (depth > 4 || !value || typeof value !== 'object') return 0; + if 
(Array.isArray(value)) return value.slice(0, 3).reduce((sum, item) => sum + countKeys(item, predicate, depth + 1), 0); + return Object.entries(value as Record).reduce((sum, [key, child]) => ( + sum + (predicate(key) ? 1 : 0) + countKeys(child, predicate, depth + 1) + ), 0); +} + +function hasNonEmptyArray(value: unknown, depth = 0): boolean { + if (depth > 4 || !value || typeof value !== 'object') return false; + if (Array.isArray(value)) return value.length > 0; + return Object.values(value as Record).some((child) => hasNonEmptyArray(child, depth + 1)); +} + +export function scoreEndpointEvidence(entry: PageSignals['networkEntries'][number]): EndpointEvidence { + const reasons: string[] = []; + const body = parseBodyPreview(entry.bodyPreview); + let score = 0; + + if (entry.status >= 200 && entry.status < 300) { + score += 0.15; + reasons.push('2xx status'); + } else if (entry.status === 401 || entry.status === 403) { + return { + url: entry.url, + status: entry.status, + contentType: entry.contentType, + real_data_score: 0.05, + verdict: 'blocked', + reasons: ['auth-blocked status'], + sample_paths: [], + }; + } else { + reasons.push(`non-2xx status ${entry.status}`); + } + + if (/json/i.test(entry.contentType)) { + score += 0.2; + reasons.push('json content-type'); + } else if (/html/i.test(entry.contentType)) { + score -= 0.25; + reasons.push('html content-type'); + } else if (/javascript|text/i.test(entry.contentType)) { + score += 0.05; + reasons.push('text/script content-type'); + } + + if (NOISE_URL_RE.test(entry.url)) { + score -= 0.3; + reasons.push('telemetry-like url'); + } + + const samplePaths = collectJsonPaths(body).slice(0, 8); + if (typeof body === 'string') { + if (/^\s* 20) { + score += 0.05; + reasons.push('non-empty text body'); + } + } else if (Array.isArray(body)) { + if (body.length > 0) { + score += 0.3; + reasons.push('non-empty top-level array'); + } else { + score -= 0.15; + reasons.push('empty array'); + } + } else if (body && typeof 
body === 'object') { + const keys = Object.keys(body as Record); + if (keys.length === 0) { + score -= 0.15; + reasons.push('empty object'); + } else { + score += 0.12; + reasons.push('json object body'); + } + + const businessKeys = countKeys(body, (key) => BUSINESS_KEY_RE.test(key)); + if (businessKeys > 0) { + score += Math.min(0.3, businessKeys * 0.05); + reasons.push(`${businessKeys} business-like key${businessKeys === 1 ? '' : 's'}`); + } + if (hasNonEmptyArray(body)) { + score += 0.2; + reasons.push('nested non-empty array'); + } + const trackingKeys = countKeys(body, (key) => TRACKING_KEY_RE.test(key)); + if (trackingKeys > 0 && businessKeys === 0) { + score -= Math.min(0.25, trackingKeys * 0.08); + reasons.push(`${trackingKeys} tracking-like key${trackingKeys === 1 ? '' : 's'} without business keys`); + } + } + + const realDataScore = clampScore(score); + const verdict: EndpointEvidenceVerdict = realDataScore >= 0.65 + ? 'likely_data' + : realDataScore >= 0.35 + ? 'maybe_data' + : 'noise'; + + return { + url: entry.url, + status: entry.status, + contentType: entry.contentType, + real_data_score: realDataScore, + verdict, + reasons, + sample_paths: samplePaths, + }; +} + +export function scoreNetworkEvidence(signals: PageSignals): EndpointEvidence[] { + return signals.networkEntries + .map(scoreEndpointEvidence) + .filter((evidence) => evidence.verdict !== 'noise' || evidence.real_data_score > 0) + .sort((a, b) => b.real_data_score - a.real_data_score) + .slice(0, 8); +} + /** * Apply the decision tree from `site-recon.md` mechanically. 
* @@ -174,6 +349,8 @@ export interface PatternVerdict { */ export function classifyPattern(signals: PageSignals): PatternVerdict { const jsonEntries = signals.networkEntries.filter((e) => /json/i.test(e.contentType)); + const endpointEvidence = scoreNetworkEvidence(signals); + const realDataCandidates = endpointEvidence.filter((e) => e.verdict === 'likely_data' || e.verdict === 'maybe_data').length; const authFailures = signals.networkEntries.filter( (e) => e.status === 401 || e.status === 403, ).length; @@ -188,6 +365,7 @@ export function classifyPattern(signals: PageSignals): PatternVerdict { pattern: 'D', reason: `${authFailures} auth-failing API responses seen — endpoint is token-gated`, json_responses: jsonEntries.length, + real_data_candidates: realDataCandidates, auth_failures: authFailures, }; } @@ -200,15 +378,17 @@ export function classifyPattern(signals: PageSignals): PatternVerdict { pattern: 'B', reason: `SSR state global present: ${which.join(', ')}`, json_responses: jsonEntries.length, + real_data_candidates: realDataCandidates, auth_failures: authFailures, }; } - if (jsonEntries.length >= 1) { + if (realDataCandidates >= 1) { return { pattern: 'A', - reason: `${jsonEntries.length} JSON XHR/fetch responses observed — classic API pattern`, + reason: `${realDataCandidates} captured response${realDataCandidates === 1 ? '' : 's'} look like real data — inspect api_candidates before choosing a strategy`, json_responses: jsonEntries.length, + real_data_candidates: realDataCandidates, auth_failures: authFailures, }; } @@ -219,8 +399,11 @@ export function classifyPattern(signals: PageSignals): PatternVerdict { // leave the agent to upgrade to E manually if they see WS traffic. return { pattern: 'C', - reason: 'No JSON XHR and no SSR state — HTML scrape (Pattern C); escalate to E manually if WebSocket traffic appears', + reason: jsonEntries.length > 0 + ? `${jsonEntries.length} JSON response${jsonEntries.length === 1 ? 
'' : 's'} observed, but none look like target data — likely telemetry/side-channel; treat as HTML/DOM until an endpoint validates` + : 'No JSON XHR and no SSR state — HTML scrape (Pattern C); escalate to E manually if WebSocket traffic appears', json_responses: jsonEntries.length, + real_data_candidates: realDataCandidates, auth_failures: authFailures, }; } @@ -297,6 +480,7 @@ export interface AnalyzeReport { pattern: PatternVerdict; anti_bot: AntiBotVerdict; initial_state: PageSignals['initialState']; + api_candidates: EndpointEvidence[]; nearest_adapter: NearestAdapter | null; recommended_next_step: string; } @@ -314,13 +498,14 @@ export function analyzeSite( ): AnalyzeReport { const pattern = classifyPattern(signals); const antiBot = detectAntiBot(signals); + const apiCandidates = scoreNetworkEvidence(signals); const nearest = findNearestAdapter(signals.finalUrl, registry); let next: string; if (antiBot.detected) { next = antiBot.implication; } else if (pattern.pattern === 'A') { - next = 'Pick the most specific JSON endpoint from `opencli browser network` and try a bare Node fetch with cookies; escalate to browser-context fetch only if blocked.'; + next = 'Inspect `api_candidates`, then replay the best endpoint and record the status/content-type/sample shape in your strategy note; do not choose API strategy from XHR count alone.'; } else if (pattern.pattern === 'B') { next = 'Read the SSR global via `opencli browser eval "JSON.stringify(window.__INITIAL_STATE__ ?? window.__NUXT__ ?? window.__NEXT_DATA__ ?? 
window.__APOLLO_STATE__)"` — no API needed.'; } else if (pattern.pattern === 'C') { @@ -340,6 +525,7 @@ export function analyzeSite( pattern, anti_bot: antiBot, initial_state: signals.initialState, + api_candidates: apiCandidates, nearest_adapter: nearest, recommended_next_step: next, }; diff --git a/src/cli.test.ts b/src/cli.test.ts index 2576f761b..ea67389ee 100644 --- a/src/cli.test.ts +++ b/src/cli.test.ts @@ -1332,11 +1332,11 @@ describe('browser tab targeting commands', () => { ]) .mockResolvedValueOnce([ { - url: 'https://target.example/waf', + url: 'https://target.example/api/items', method: 'GET', - responseStatus: 403, - responseContentType: 'text/html', - responsePreview: 'Cloudflare Ray ID', + responseStatus: 200, + responseContentType: 'application/json', + responsePreview: '{"items":[{"title":"A","id":"1"}]}', }, ]), } as unknown as IPage; @@ -1346,7 +1346,9 @@ describe('browser tab targeting commands', () => { const out = lastJsonLog(); expect(browserState.page?.readNetworkCapture).toHaveBeenCalledTimes(2); - expect(out.anti_bot.vendor).toBe('cloudflare'); + expect(out.pattern.pattern).toBe('A'); + expect(out.api_candidates[0].url).toBe('https://target.example/api/items'); + expect(out.api_candidates[0].verdict).toBe('likely_data'); expect(out.anti_bot.evidence).toContain('cookie:cf_clearance'); }); diff --git a/src/cli.ts b/src/cli.ts index ad68def90..77169c627 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -1134,6 +1134,7 @@ Examples: // // - pattern: A/B/C/D (mapped from network + SSR-globals signals) // - anti_bot: vendor + evidence + the one-liner for "what to do next" + // - api_candidates: captured endpoints scored as real data vs telemetry // - initial_state: which window globals are populated // - nearest_adapter: existing commands for the same site, if any // - recommended_next_step: a single imperative sentence @@ -1142,7 +1143,7 @@ Examples: // feedback loop with a single deterministic verdict. 
Without this, agents // burn ~20min per WAF-protected site re-discovering anti-bot posture. addBrowserTabOption(browser.command('analyze').argument('')) - .description('Classify site: anti-bot vendor, pattern (A/B/C/D), nearest adapter, recommended next step') + .description('Classify site: anti-bot vendor, real-data API candidates, pattern (A/B/C/D), nearest adapter, next step') .action(browserAction(async (page, url) => { const hasSessionCapture = await page.startNetworkCapture?.() ?? false; await page.goto(url);