diff --git a/src/bots.ts b/src/bots.ts index c5e756f..7e923da 100644 --- a/src/bots.ts +++ b/src/bots.ts @@ -15,7 +15,7 @@ * Anthropic, Google, Perplexity, Cohere, Apple, Bytedance. */ export const AI_BOT_PATTERN = - /ClaudeBot|Claude-User|Anthropic|ChatGPT-User|GPTBot|OAI-SearchBot|PerplexityBot|Perplexity-User|Google-Extended|Applebot-Extended|cohere-ai|Bytespider|CCBot|Amazonbot|Meta-ExternalAgent|FacebookBot|DuckAssistBot|MistralAI-User|YouBot|AI2Bot|Diffbot|Cursor|Windsurf/i + /ClaudeBot|Claude-User|Claude-SearchBot|Claude-Web|Anthropic|GPTBot|ChatGPT-User|OAI-SearchBot|PerplexityBot|Perplexity-User|Google-Extended|Google-CloudVertexBot|Google-Agent|GoogleAgent-Mariner|Gemini-Deep-Research|Applebot|cohere|Bytespider|CCBot|Amazonbot|Amzn-SearchBot|NovaAct|AzureAI-SearchBot|Meta-ExternalAgent|meta-externalfetcher|meta-webindexer|FacebookBot|DuckAssistBot|MistralAI-User|YouBot|AI2Bot|Diffbot|DeepSeek|PanguBot|Webzio-Extended|omgili|Timpibot|Grok|Manus-User|quillbot|MyCentralAIScraperBot|Cursor|Windsurf/i /** * HTTP library / runtime signatures frequently used by coding agents. Matching @@ -67,15 +67,36 @@ export function parseBotName(userAgent: string | null | undefined): string { // Publicly declared AI crawlers (high confidence). if (s.includes('chatgpt-user') || s.includes('gptbot') || s.includes('oai-searchbot') || s.includes('openai')) return 'ChatGPT' - if (s.includes('claudebot') || s.includes('claude-user') || s.includes('anthropic')) return 'Claude' + if ( + s.includes('claudebot') || + s.includes('claude-user') || + s.includes('claude-searchbot') || + s.includes('claude-web') || + s.includes('anthropic') + ) + return 'Claude' if (s.includes('perplexitybot') || s.includes('perplexity-user')) return 'Perplexity' if (s.includes('ccbot')) return 'Common Crawl' - if (s.includes('google-extended') || s.includes('googlebot')) return 'Google' - if (s.includes('applebot-extended') || s.includes('applebot')) return 'Apple' + if ( + s.includes('google-extended') || + s.includes('googlebot') || + s.includes('google-cloudvertexbot') || + s.includes('google-agent') || + s.includes('googleagent-mariner') || + s.includes('gemini-deep-research') + ) + return 'Google' + if (s.includes('applebot')) return 'Apple' if (s.includes('bingbot')) return 'Bing' if (s.includes('bytespider')) return 'Bytespider' - if (s.includes('amazonbot')) return 'Amazon' - if (s.includes('meta-externalagent') || s.includes('facebookbot')) return 'Meta' + if (s.includes('amazonbot') || s.includes('amzn-searchbot') || s.includes('novaact')) return 'Amazon' + if ( + s.includes('meta-externalagent') || + s.includes('meta-externalfetcher') || + s.includes('meta-webindexer') || + s.includes('facebookbot') + ) + return 'Meta' if (s.includes('mistralai-user')) return 'Mistral' if (s.includes('duckassistbot')) return 'DuckDuckGo' if (s.includes('youbot')) return 'You.com' @@ -84,6 +105,15 @@ export function parseBotName(userAgent: string | null | undefined): string { if (s.includes('cohere')) return 'Cohere' if (s.includes('cursor')) return 'Cursor' if (s.includes('windsurf')) return 'Windsurf' + if (s.includes('deepseek')) return 'DeepSeek' + if (s.includes('pangubot')) return 'Huawei' + if (s.includes('webzio') || s.includes('omgili')) return 'Webz.io' + if (s.includes('timpibot')) return 'Timpi' + if (s.includes('grok') || s.includes('xai-')) return 'xAI' + if (s.includes('manus-user')) return 'Manus' + if (s.includes('quillbot')) return 'QuillBot' + if (s.includes('azureai-searchbot')) return 'Microsoft' + if (s.includes('mycentralaiscraperbot')) return 'MyCentralAI' if (s.includes('petalbot')) return 'PetalBot' // SEO crawlers and monitoring bots. diff --git a/test/bots.test.ts b/test/bots.test.ts index 9589e78..ea3be97 100644 --- a/test/bots.test.ts +++ b/test/bots.test.ts @@ -200,12 +200,75 @@ describe('parseBotName — additional branded crawlers', () => { ['Windsurf/0.1', 'Windsurf'], ['PetalBot; +http://aspiegel.com/petalbot', 'PetalBot'], ['Mozilla/5.0 (compatible; bingbot/2.0)', 'Bing'], - ['Mozilla/5.0 (compatible; Meta-ExternalAgent/1.0)', 'Meta'] + ['Mozilla/5.0 (compatible; Meta-ExternalAgent/1.0)', 'Meta'], + ['Claude-SearchBot/1.0', 'Claude'], + ['Claude-Web/1.0', 'Claude'], + ['Applebot/0.1', 'Apple'], + ['Mozilla/5.0 (compatible; Google-CloudVertexBot/1.0)', 'Google'], + ['Gemini-Deep-Research/1.0', 'Google'], + ['GoogleAgent-Mariner/1.0', 'Google'], + ['Amzn-SearchBot/1.0', 'Amazon'], + ['NovaAct/1.0', 'Amazon'], + ['AzureAI-SearchBot/1.0', 'Microsoft'], + ['meta-externalfetcher/1.0', 'Meta'], + ['meta-webindexer/1.0', 'Meta'], + ['DeepSeekBot/1.0', 'DeepSeek'], + ['PanguBot/1.0', 'Huawei'], + ['Webzio-Extended/1.0', 'Webz.io'], + ['omgili/0.5 (+http://omgili.com)', 'Webz.io'], + ['omgilibot/0.5', 'Webz.io'], + ['Timpibot/0.8', 'Timpi'], + ['GrokBot/1.0', 'xAI'], + ['Grok-DeepSearch/1.0', 'xAI'], + ['xAI-Grok/1.0', 'xAI'], + ['Manus-User/1.0', 'Manus'], + ['quillbot.com/1.0', 'QuillBot'], + ['MyCentralAIScraperBot/1.0', 'MyCentralAI'], + ['cohere-training-data-crawler/1.0', 'Cohere'] ])('labels %s as %s', (ua, label) => { expect(parseBotName(ua)).toBe(label) }) }) +describe('isAiBot — expanded crawler coverage', () => { + it.each([ + 'Claude-SearchBot/1.0', + 'Claude-Web/1.0', + 'Applebot/0.1', + 'Google-CloudVertexBot/1.0', + 'Google-Agent/1.0', + 'GoogleAgent-Mariner/1.0', + 'Gemini-Deep-Research/1.0', + 'Amzn-SearchBot/1.0', + 'NovaAct/1.0', + 'AzureAI-SearchBot/1.0', + 'meta-externalfetcher/1.0', + 'meta-webindexer/1.0', + 'DeepSeekBot/1.0', + 'PanguBot/1.0', + 'Webzio-Extended/1.0', + 'omgili/0.5', + 'omgilibot/0.5', + 'Timpibot/0.8', + 'GrokBot/1.0', + 'Grok-DeepSearch/1.0', + 'xAI-Grok/1.0', + 'Manus-User/1.0', + 'quillbot.com/1.0', + 'MyCentralAIScraperBot/1.0', + 'cohere-training-data-crawler/1.0', + 'Ai2Bot-Dolma/1.0' + ])('flags %s as an AI bot', (ua) => { + expect(isAiBot(ua)).toBe(true) + }) + + it('does not flag Claude-Code (coding-agent UA, intentionally excluded)', () => { + // Claude-Code goes through HTTP_CLIENT_PATTERN heuristics, not the + // declared-crawler set — see comment at AI_BOT_PATTERN. + expect(isAiBot('Claude-Code/1.0')).toBe(false) + }) +}) + describe('classifyAgent — spoofed UAs and edge cases', () => { it('treats a Playwright-driven Mozilla UA as a plain browser (Aider/OpenCode look like users at the UA layer)', () => { const playwrightish =