/**
 * Detect likely headless/automated browsers from request headers.
 *
 * Playwright, Puppeteer, and similar tools present a browser-like
 * `User-Agent` but often omit headers that real browsers send. Requests whose
 * UA does not look like a browser at all are never scored and always return
 * `{ score: 0, signals: [], likely: false }`.
 *
 * Signals checked (each adds 1 to the score):
 * - `missing-accept-language` — no `Accept-Language` header.
 * - `missing-sec-fetch-mode` — no `Sec-Fetch-Mode` header.
 * - `missing-sec-ch-ua` — no `Sec-CH-UA` header although the UA claims to be
 *   Chromium-based. Firefox and Safari do not send UA client hints, so the
 *   absence is not counted against them.
 * - `headless-chrome-hint` — "HeadlessChrome" appears in `Sec-CH-UA` or in
 *   the `User-Agent` itself.
 * - `missing-or-bare-accept` — `Accept` is absent or exactly `*&#47;*`.
 * - `connection-close` — `Connection: close`.
 *
 * @param req - Incoming request; only its headers are read.
 * @returns The score (0-6), the names of the signals that fired (in the order
 *   listed above), and `likely`, which is `true` when an explicit
 *   HeadlessChrome marker is present or at least two signals fired.
 */
export function detectHeadless(req: Request): HeadlessDetection {
  const { headers } = req
  const ua = (headers.get('user-agent') ?? '').toLowerCase()
  const isBrowserUA = ['mozilla', 'chrome', 'safari', 'firefox'].some((token) => ua.includes(token))

  if (!isBrowserUA) return { score: 0, signals: [], likely: false }

  const signals: string[] = []

  if (!headers.get('accept-language')) signals.push('missing-accept-language')
  if (!headers.get('sec-fetch-mode')) signals.push('missing-sec-fetch-mode')

  // Only Chromium-based browsers send Sec-CH-UA, so its absence is only
  // suspicious when the UA itself claims to be Chrome/Chromium.
  const secChUa = (headers.get('sec-ch-ua') ?? '').toLowerCase()
  const claimsChromium = ua.includes('chrome/') || ua.includes('chromium/')
  if (!secChUa && claimsChromium) signals.push('missing-sec-ch-ua')

  // A self-declared HeadlessChrome brand is conclusive on its own, wherever
  // it shows up, so it also forces `likely` below.
  const explicitHeadless = secChUa.includes('headlesschrome') || ua.includes('headlesschrome')
  if (explicitHeadless) signals.push('headless-chrome-hint')

  const accept = headers.get('accept') ?? ''
  if (!accept || accept === '*/*') signals.push('missing-or-bare-accept')

  if ((headers.get('connection') ?? '').toLowerCase() === 'close') signals.push('connection-close')

  const score = signals.length
  return { score, signals, likely: explicitHeadless || score >= 2 }
}

/** Result of the header-based headless browser heuristics. */
export interface HeadlessDetection {
  /** Number of suspicious signals found (0-6). */
  score: number
  /** Names of the specific signals that fired. */
  signals: string[]
  /**
   * True when the request explicitly identifies as HeadlessChrome or when
   * score >= 2 — strong headless indication.
   */
  likely: boolean
}

/** Categorical tag assigned to a classified request or user-agent. */
export type AgentKind =
  | 'declared-crawler'
  | 'coding-agent-hint'
  | 'headless-likely'
  | 'browser'
  | 'other'
/**
 * Classify a whole request rather than just its user-agent string.
 *
 * Runs the UA-only {@link classifyAgent} on the request's `User-Agent`
 * header (an absent header is passed as the empty string) and the
 * header-based {@link detectHeadless} heuristics. When the UA classifies as
 * `'browser'` but the headless heuristics report `likely`, the returned
 * `kind` is `'headless-likely'`; every other kind is passed through as-is.
 *
 * @param req - Incoming request; only its headers are read.
 * @returns The UA classification with `kind` possibly promoted and the
 *   headless detection result attached as `headless`.
 */
export function classifyRequest(req: Request): AgentClassification {
  const uaClassification = classifyAgent(req.headers.get('user-agent') || '')
  const headless = detectHeadless(req)

  // Only a UA that otherwise passes as a plain browser can be promoted.
  const promote = uaClassification.kind === 'browser' && headless.likely

  return {
    ...uaClassification,
    kind: promote ? 'headless-likely' : uaClassification.kind,
    headless
  }
}
+ if (!detectHeadless(req).likely) return + } let pathname = '/' let originFromUrl = '' @@ -37,7 +42,7 @@ export async function trackVisit( const forwardedFor = req.headers.get('x-forwarded-for') || '' const ip = forwardedFor.split(',')[0]?.trim() ?? '' const referer = req.headers.get('referer') - const classification = classifyAgent(userAgent) + const classification = classifyRequest(req) const event = { event: opts.eventName ?? 'agent_visit', @@ -52,6 +57,8 @@ export async function trackVisit( bot_name: classification.label, ua_category: classification.kind, coding_agent_hint: classification.codingAgentHint, + headless_score: classification.headless?.score ?? 0, + headless_likely: classification.headless?.likely ?? false, referer, source: opts.source ?? null, ...opts.properties diff --git a/test/track.test.ts b/test/track.test.ts index d89706f..6b43086 100644 --- a/test/track.test.ts +++ b/test/track.test.ts @@ -44,11 +44,15 @@ describe('trackVisit', () => { }) }) - it('sets bot_name to Browser for human traffic when onlyBots is false', async () => { + it('sets bot_name to Browser for real browser traffic', async () => { const spy = vi.fn() await trackVisit( makeRequest('https://example.com/page', { - 'user-agent': 'Mozilla/5.0 (Macintosh) Chrome/120' + 'user-agent': 'Mozilla/5.0 (Macintosh) Chrome/120', + 'accept-language': 'en-US,en;q=0.9', + 'sec-fetch-mode': 'navigate', + 'sec-ch-ua': '"Chromium";v="120"', + accept: 'text/html,application/xhtml+xml' }), { analytics: customAnalytics(spy), onlyBots: false } ) @@ -57,6 +61,7 @@ describe('trackVisit', () => { expect(event.properties.bot_name).toBe('Browser') expect(event.properties.ua_category).toBe('browser') expect(event.properties.coding_agent_hint).toBe(false) + expect(event.properties.headless_likely).toBe(false) }) it('sets coding_agent_hint and ua_category for HTTP-library UAs (onlyBots: false)', async () => { @@ -129,17 +134,37 @@ describe('trackVisit', () => { 
expect(event.properties.coding_agent_hint).toBe(true) }) - it('skipBrowsers skips regular browsers', async () => { + it('skipBrowsers skips real browsers (with standard headers)', async () => { const spy = vi.fn() await trackVisit( makeRequest('https://example.com/page', { - 'user-agent': 'Mozilla/5.0 (Macintosh) Chrome/120' + 'user-agent': 'Mozilla/5.0 (Macintosh) Chrome/120', + 'accept-language': 'en-US,en;q=0.9', + 'sec-fetch-mode': 'navigate', + 'sec-ch-ua': '"Chromium";v="120", "Google Chrome";v="120"', + accept: 'text/html,application/xhtml+xml' }), { analytics: customAnalytics(spy), skipBrowsers: true } ) expect(spy).not.toHaveBeenCalled() }) + it('skipBrowsers captures headless browsers (missing standard headers)', async () => { + const spy = vi.fn() + await trackVisit( + makeRequest('https://example.com/page', { + 'user-agent': 'Mozilla/5.0 (Macintosh) Chrome/120' + // Missing: accept-language, sec-fetch-mode, sec-ch-ua, proper accept + }), + { analytics: customAnalytics(spy), skipBrowsers: true } + ) + expect(spy).toHaveBeenCalledOnce() + const event = spy.mock.calls[0]![0] as CaptureEvent + expect(event.properties.ua_category).toBe('headless-likely') + expect(event.properties.headless_likely).toBe(true) + expect(event.properties.headless_score).toBeGreaterThanOrEqual(2) + }) + it('honours a custom event name', async () => { const spy = vi.fn() await trackVisit(