Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 36 additions & 6 deletions src/bots.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
* Anthropic, Google, Perplexity, Cohere, Apple, Bytedance.
*/
export const AI_BOT_PATTERN =
/ClaudeBot|Claude-User|Anthropic|ChatGPT-User|GPTBot|OAI-SearchBot|PerplexityBot|Perplexity-User|Google-Extended|Applebot-Extended|cohere-ai|Bytespider|CCBot|Amazonbot|Meta-ExternalAgent|FacebookBot|DuckAssistBot|MistralAI-User|YouBot|AI2Bot|Diffbot|Cursor|Windsurf/i
/ClaudeBot|Claude-User|Claude-SearchBot|Claude-Web|Anthropic|GPTBot|ChatGPT-User|OAI-SearchBot|PerplexityBot|Perplexity-User|Google-Extended|Google-CloudVertexBot|Google-Agent|GoogleAgent-Mariner|Gemini-Deep-Research|Applebot|cohere|Bytespider|CCBot|Amazonbot|Amzn-SearchBot|NovaAct|AzureAI-SearchBot|Meta-ExternalAgent|meta-externalfetcher|meta-webindexer|FacebookBot|DuckAssistBot|MistralAI-User|YouBot|AI2Bot|Diffbot|DeepSeek|PanguBot|Webzio-Extended|omgili|Timpibot|Grok|Manus-User|quillbot|MyCentralAIScraperBot|Cursor|Windsurf/i

/**
* HTTP library / runtime signatures frequently used by coding agents. Matching
Expand Down Expand Up @@ -67,15 +67,36 @@ export function parseBotName(userAgent: string | null | undefined): string {
// Publicly declared AI crawlers (high confidence).
if (s.includes('chatgpt-user') || s.includes('gptbot') || s.includes('oai-searchbot') || s.includes('openai'))
return 'ChatGPT'
if (s.includes('claudebot') || s.includes('claude-user') || s.includes('anthropic')) return 'Claude'
if (
s.includes('claudebot') ||
s.includes('claude-user') ||
s.includes('claude-searchbot') ||
s.includes('claude-web') ||
s.includes('anthropic')
)
return 'Claude'
if (s.includes('perplexitybot') || s.includes('perplexity-user')) return 'Perplexity'
if (s.includes('ccbot')) return 'Common Crawl'
if (s.includes('google-extended') || s.includes('googlebot')) return 'Google'
if (s.includes('applebot-extended') || s.includes('applebot')) return 'Apple'
if (
s.includes('google-extended') ||
s.includes('googlebot') ||
s.includes('google-cloudvertexbot') ||
s.includes('google-agent') ||
s.includes('googleagent-mariner') ||
s.includes('gemini-deep-research')
)
return 'Google'
if (s.includes('applebot')) return 'Apple'
if (s.includes('bingbot')) return 'Bing'
if (s.includes('bytespider')) return 'Bytespider'
if (s.includes('amazonbot')) return 'Amazon'
if (s.includes('meta-externalagent') || s.includes('facebookbot')) return 'Meta'
if (s.includes('amazonbot') || s.includes('amzn-searchbot') || s.includes('novaact')) return 'Amazon'
if (
s.includes('meta-externalagent') ||
s.includes('meta-externalfetcher') ||
s.includes('meta-webindexer') ||
s.includes('facebookbot')
)
return 'Meta'
if (s.includes('mistralai-user')) return 'Mistral'
if (s.includes('duckassistbot')) return 'DuckDuckGo'
if (s.includes('youbot')) return 'You.com'
Expand All @@ -84,6 +105,15 @@ export function parseBotName(userAgent: string | null | undefined): string {
if (s.includes('cohere')) return 'Cohere'
if (s.includes('cursor')) return 'Cursor'
if (s.includes('windsurf')) return 'Windsurf'
if (s.includes('deepseek')) return 'DeepSeek'
if (s.includes('pangubot')) return 'Huawei'
if (s.includes('webzio') || s.includes('omgili')) return 'Webz.io'
if (s.includes('timpibot')) return 'Timpi'
if (s.includes('grok') || s.includes('xai-')) return 'xAI'
if (s.includes('manus-user')) return 'Manus'
if (s.includes('quillbot')) return 'QuillBot'
if (s.includes('azureai-searchbot')) return 'Microsoft'
if (s.includes('mycentralaiscraperbot')) return 'MyCentralAI'
if (s.includes('petalbot')) return 'PetalBot'

// SEO crawlers and monitoring bots.
Expand Down
65 changes: 64 additions & 1 deletion test/bots.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -200,12 +200,75 @@ describe('parseBotName — additional branded crawlers', () => {
['Windsurf/0.1', 'Windsurf'],
['PetalBot; +http://aspiegel.com/petalbot', 'PetalBot'],
['Mozilla/5.0 (compatible; bingbot/2.0)', 'Bing'],
['Mozilla/5.0 (compatible; Meta-ExternalAgent/1.0)', 'Meta']
['Mozilla/5.0 (compatible; Meta-ExternalAgent/1.0)', 'Meta'],
['Claude-SearchBot/1.0', 'Claude'],
['Claude-Web/1.0', 'Claude'],
['Applebot/0.1', 'Apple'],
['Mozilla/5.0 (compatible; Google-CloudVertexBot/1.0)', 'Google'],
['Gemini-Deep-Research/1.0', 'Google'],
['GoogleAgent-Mariner/1.0', 'Google'],
['Amzn-SearchBot/1.0', 'Amazon'],
['NovaAct/1.0', 'Amazon'],
['AzureAI-SearchBot/1.0', 'Microsoft'],
['meta-externalfetcher/1.0', 'Meta'],
['meta-webindexer/1.0', 'Meta'],
['DeepSeekBot/1.0', 'DeepSeek'],
['PanguBot/1.0', 'Huawei'],
['Webzio-Extended/1.0', 'Webz.io'],
['omgili/0.5 (+http://omgili.com)', 'Webz.io'],
['omgilibot/0.5', 'Webz.io'],
['Timpibot/0.8', 'Timpi'],
['GrokBot/1.0', 'xAI'],
['Grok-DeepSearch/1.0', 'xAI'],
['xAI-Grok/1.0', 'xAI'],
['Manus-User/1.0', 'Manus'],
['quillbot.com/1.0', 'QuillBot'],
['MyCentralAIScraperBot/1.0', 'MyCentralAI'],
['cohere-training-data-crawler/1.0', 'Cohere']
])('labels %s as %s', (ua, label) => {
expect(parseBotName(ua)).toBe(label)
})
})

/**
 * Coverage for `isAiBot` against a table of crawler user-agent strings, plus
 * one negative case for a coding-agent user agent.
 *
 * Every string in the table is passed to `isAiBot` on its own and must yield
 * `true`; `Claude-Code/1.0` must yield `false`.
 */
describe('isAiBot — expanded crawler coverage', () => {
// Table-driven: one generated case per user-agent string below. The `%s` in
// the title is filled with the string under test.
it.each([
'Claude-SearchBot/1.0',
'Claude-Web/1.0',
'Applebot/0.1',
'Google-CloudVertexBot/1.0',
'Google-Agent/1.0',
'GoogleAgent-Mariner/1.0',
'Gemini-Deep-Research/1.0',
'Amzn-SearchBot/1.0',
'NovaAct/1.0',
'AzureAI-SearchBot/1.0',
'meta-externalfetcher/1.0',
'meta-webindexer/1.0',
// Longer than the bare product token ("DeepSeek" + "Bot").
// NOTE(review): presumably exercises unanchored substring matching in
// AI_BOT_PATTERN — confirm how isAiBot consumes the pattern.
'DeepSeekBot/1.0',
'PanguBot/1.0',
'Webzio-Extended/1.0',
// Two spellings of the same crawler family: bare and with a "bot" suffix.
'omgili/0.5',
'omgilibot/0.5',
'Timpibot/0.8',
// Three different Grok-branded spellings.
'GrokBot/1.0',
'Grok-DeepSearch/1.0',
'xAI-Grok/1.0',
'Manus-User/1.0',
'quillbot.com/1.0',
'MyCentralAIScraperBot/1.0',
'cohere-training-data-crawler/1.0',
// Mixed-case and suffixed variant of "AI2Bot"; only passes if matching is
// case-insensitive.
'Ai2Bot-Dolma/1.0'
])('flags %s as an AI bot', (ua) => {
expect(isAiBot(ua)).toBe(true)
})

// Negative case: a "Claude-"-prefixed UA that must NOT be treated as a
// declared crawler.
it('does not flag Claude-Code (coding-agent UA, intentionally excluded)', () => {
// Claude-Code goes through HTTP_CLIENT_PATTERN heuristics, not the
// declared-crawler set — see comment at AI_BOT_PATTERN.
expect(isAiBot('Claude-Code/1.0')).toBe(false)
})
})

describe('classifyAgent — spoofed UAs and edge cases', () => {
it('treats a Playwright-driven Mozilla UA as a plain browser (Aider/OpenCode look like users at the UA layer)', () => {
const playwrightish =
Expand Down
Loading