From 6195ddf2b9c6106760659c6a262e4f790a611f1b Mon Sep 17 00:00:00 2001 From: Jared McFarland Date: Wed, 18 Feb 2026 18:51:23 -0800 Subject: [PATCH 1/7] test: add AI bot classification tests Part of AI bot classification feature for Node.js SDK. --- test/ai_bot_classifier.js | 246 ++++++++++++++++++++++++++++++++++++ test/ai_bot_middleware.js | 260 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 506 insertions(+) create mode 100644 test/ai_bot_classifier.js create mode 100644 test/ai_bot_middleware.js diff --git a/test/ai_bot_classifier.js b/test/ai_bot_classifier.js new file mode 100644 index 0000000..cac309d --- /dev/null +++ b/test/ai_bot_classifier.js @@ -0,0 +1,246 @@ +// test/ai_bot_classifier.js + +// These tests define the expected behavior of the classifier +// Write these BEFORE implementing lib/ai_bot_classifier.js + +describe('AiBotClassifier', () => { + + let classify; + + beforeEach(() => { + const { classify_user_agent } = require('../lib/ai_bot_classifier'); + classify = classify_user_agent; + }); + + // === CORE CLASSIFICATION === + + describe('classify_user_agent', () => { + + // --- OpenAI Bots --- + + it('should classify GPTBot user agent', () => { + const result = classify('Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)'); + expect(result.$is_ai_bot).toBe(true); + expect(result.$ai_bot_name).toBe('GPTBot'); + expect(result.$ai_bot_provider).toBe('OpenAI'); + expect(result.$ai_bot_category).toBe('indexing'); + }); + + it('should classify ChatGPT-User agent', () => { + const result = classify('Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; ChatGPT-User/1.0; +https://openai.com/bot)'); + expect(result.$is_ai_bot).toBe(true); + expect(result.$ai_bot_name).toBe('ChatGPT-User'); + expect(result.$ai_bot_provider).toBe('OpenAI'); + expect(result.$ai_bot_category).toBe('retrieval'); + }); + + it('should classify OAI-SearchBot agent', () => { + const result = classify('Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; OAI-SearchBot/1.0; +https://openai.com/searchbot)'); + expect(result.$is_ai_bot).toBe(true); + expect(result.$ai_bot_name).toBe('OAI-SearchBot'); + expect(result.$ai_bot_provider).toBe('OpenAI'); + expect(result.$ai_bot_category).toBe('indexing'); + }); + + // --- Anthropic Bots --- + + it('should classify ClaudeBot agent', () => { + const result = classify('Mozilla/5.0 (compatible; ClaudeBot/1.0; +claudebot@anthropic.com)'); + expect(result.$is_ai_bot).toBe(true); + expect(result.$ai_bot_name).toBe('ClaudeBot'); + expect(result.$ai_bot_provider).toBe('Anthropic'); + expect(result.$ai_bot_category).toBe('indexing'); + }); + + it('should classify Claude-User agent', () => { + const result = classify('Mozilla/5.0 (compatible; Claude-User/1.0)'); + expect(result.$is_ai_bot).toBe(true); + expect(result.$ai_bot_name).toBe('Claude-User'); + expect(result.$ai_bot_provider).toBe('Anthropic'); + expect(result.$ai_bot_category).toBe('retrieval'); + }); + + // --- Google Bots --- + + it('should classify Google-Extended agent', () => { + const result = classify('Mozilla/5.0 (compatible; Google-Extended/1.0)'); + expect(result.$is_ai_bot).toBe(true); + expect(result.$ai_bot_name).toBe('Google-Extended'); + expect(result.$ai_bot_provider).toBe('Google'); + expect(result.$ai_bot_category).toBe('indexing'); + }); + + // --- Perplexity --- + + it('should classify PerplexityBot agent', () => { + const result = classify('Mozilla/5.0 (compatible; PerplexityBot/1.0)'); + expect(result.$is_ai_bot).toBe(true); + expect(result.$ai_bot_name).toBe('PerplexityBot'); + expect(result.$ai_bot_provider).toBe('Perplexity'); + expect(result.$ai_bot_category).toBe('retrieval'); + }); + + // --- ByteDance --- + + it('should classify Bytespider agent', () => { + const result = classify('Mozilla/5.0 (compatible; Bytespider/1.0)'); + expect(result.$is_ai_bot).toBe(true); + expect(result.$ai_bot_name).toBe('Bytespider'); + expect(result.$ai_bot_provider).toBe('ByteDance'); + }); + + // --- Common Crawl --- + + it('should classify CCBot agent', () => { + const result = classify('CCBot/2.0 (https://commoncrawl.org/faq/)'); + expect(result.$is_ai_bot).toBe(true); + expect(result.$ai_bot_name).toBe('CCBot'); + expect(result.$ai_bot_provider).toBe('Common Crawl'); + }); + + // --- Apple --- + + it('should classify Applebot-Extended agent', () => { + const result = classify('Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Applebot-Extended/0.1'); + expect(result.$is_ai_bot).toBe(true); + expect(result.$ai_bot_name).toBe('Applebot-Extended'); + expect(result.$ai_bot_provider).toBe('Apple'); + }); + + // --- Meta --- + + it('should classify Meta-ExternalAgent agent', () => { + const result = classify('Mozilla/5.0 (compatible; Meta-ExternalAgent/1.0)'); + expect(result.$is_ai_bot).toBe(true); + expect(result.$ai_bot_name).toBe('Meta-ExternalAgent'); + expect(result.$ai_bot_provider).toBe('Meta'); + }); + + // --- Cohere --- + + it('should classify cohere-ai agent', () => { + const result = classify('cohere-ai/1.0 (https://cohere.com)'); + expect(result.$is_ai_bot).toBe(true); + expect(result.$ai_bot_name).toBe('cohere-ai'); + expect(result.$ai_bot_provider).toBe('Cohere'); + expect(result.$ai_bot_category).toBe('indexing'); + }); + + // === NEGATIVE CASES === + + it('should NOT classify regular Chrome browser as AI bot', () => { + const result = classify('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'); + expect(result.$is_ai_bot).toBe(false); + expect(result.$ai_bot_name).toBeUndefined(); + }); + + it('should NOT classify Googlebot (regular) as AI bot', () => { + const result = classify('Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'); + expect(result.$is_ai_bot).toBe(false); + }); + + it('should NOT classify Bingbot (regular) as AI bot', () => { + const result = classify('Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)'); + expect(result.$is_ai_bot).toBe(false); + }); + + it('should NOT classify curl as AI bot', () => { + const result = classify('curl/7.64.1'); + expect(result.$is_ai_bot).toBe(false); + }); + + it('should handle empty user agent', () => { + const result = classify(''); + expect(result.$is_ai_bot).toBe(false); + }); + + it('should handle undefined user agent', () => { + const result = classify(undefined); + expect(result.$is_ai_bot).toBe(false); + }); + + it('should handle null user agent', () => { + const result = classify(null); + expect(result.$is_ai_bot).toBe(false); + }); + + // === CASE SENSITIVITY === + + it('should match case-insensitively', () => { + const result = classify('Mozilla/5.0 (compatible; gptbot/1.2)'); + expect(result.$is_ai_bot).toBe(true); + expect(result.$ai_bot_name).toBe('GPTBot'); + }); + + // === RETURN SHAPE === + + it('should return all expected fields for a match', () => { + const result = classify('GPTBot/1.2'); + expect(result).toHaveProperty('$is_ai_bot', true); + expect(result).toHaveProperty('$ai_bot_name'); + expect(result).toHaveProperty('$ai_bot_provider'); + expect(result).toHaveProperty('$ai_bot_category'); + expect(typeof result.$ai_bot_name).toBe('string'); + expect(typeof result.$ai_bot_provider).toBe('string'); + expect(['indexing', 'retrieval', 'agent']).toContain(result.$ai_bot_category); + }); + + it('should return only $is_ai_bot:false for non-matches', () => { + const result = classify('Mozilla/5.0 Chrome/120'); + expect(Object.keys(result)).toEqual(['$is_ai_bot']); + expect(result.$is_ai_bot).toBe(false); + }); + }); + + // === BOT DATABASE === + + describe('get_bot_database', () => { + it('should expose the bot database for inspection', () => { + const { get_bot_database } = require('../lib/ai_bot_classifier'); + const db = get_bot_database(); + expect(Array.isArray(db)).toBe(true); + expect(db.length).toBeGreaterThan(0); + expect(db[0]).toHaveProperty('pattern'); + expect(db[0]).toHaveProperty('name'); + expect(db[0]).toHaveProperty('provider'); + expect(db[0]).toHaveProperty('category'); + }); + }); + + // === CUSTOM BOTS === + + describe('custom bot registration', () => { + it('should allow adding custom bot patterns', () => { + const { create_classifier } = require('../lib/ai_bot_classifier'); + const classifier = create_classifier({ + additional_bots: [ + { + pattern: /MyCustomBot\//i, + name: 'MyCustomBot', + provider: 'CustomCorp', + category: 'indexing' + } + ] + }); + const result = classifier('Mozilla/5.0 (compatible; MyCustomBot/1.0)'); + expect(result.$is_ai_bot).toBe(true); + expect(result.$ai_bot_name).toBe('MyCustomBot'); + }); + + it('should check custom bots before built-in bots', () => { + const { create_classifier } = require('../lib/ai_bot_classifier'); + const classifier = create_classifier({ + additional_bots: [ + { + pattern: /GPTBot\//i, + name: 'GPTBot-Custom', + provider: 'CustomProvider', + category: 'retrieval' + } + ] + }); + const result = classifier('GPTBot/1.2'); + expect(result.$ai_bot_name).toBe('GPTBot-Custom'); + }); + }); +}); diff --git a/test/ai_bot_middleware.js b/test/ai_bot_middleware.js new file mode 100644 index 0000000..64d853d --- /dev/null +++ b/test/ai_bot_middleware.js @@ -0,0 +1,260 @@ +// test/ai_bot_middleware.js + +describe('AI Bot Middleware Integration', () => { + + let Mixpanel, mixpanel; + + beforeEach(() => { + Mixpanel = require('../lib/mixpanel-node'); + }); + + describe('enable_bot_classification', () => { + + it('should enrich track() calls with bot classification when $user_agent is present', () => { + const { enable_bot_classification } = require('../lib/ai_bot_middleware'); + mixpanel = Mixpanel.init('test-token'); + enable_bot_classification(mixpanel); + + vi.spyOn(mixpanel, 'send_request'); + + mixpanel.track('page_view', { + distinct_id: 'user123', + $user_agent: 'Mozilla/5.0 (compatible; GPTBot/1.2; +https://openai.com/gptbot)' + }); + + expect(mixpanel.send_request).toHaveBeenCalledWith( + expect.objectContaining({ + data: expect.objectContaining({ + properties: expect.objectContaining({ + $is_ai_bot: true, + $ai_bot_name: 'GPTBot', + $ai_bot_provider: 'OpenAI', + $ai_bot_category: 'indexing' + }) + }) + }), + undefined + ); + }); + + it('should NOT add bot properties when $user_agent is not present', () => { + const { enable_bot_classification } = require('../lib/ai_bot_middleware'); + mixpanel = Mixpanel.init('test-token'); + enable_bot_classification(mixpanel); + + vi.spyOn(mixpanel, 'send_request'); + + mixpanel.track('page_view', { distinct_id: 'user123' }); + + const callData = mixpanel.send_request.mock.calls[0][0].data; + expect(callData.properties.$is_ai_bot).toBeUndefined(); + }); + + it('should set $is_ai_bot:false when $user_agent is present but not an AI bot', () => { + const { enable_bot_classification } = require('../lib/ai_bot_middleware'); + mixpanel = Mixpanel.init('test-token'); + enable_bot_classification(mixpanel); + + vi.spyOn(mixpanel, 'send_request'); + + mixpanel.track('page_view', { + distinct_id: 'user123', + $user_agent: 'Mozilla/5.0 Chrome/120.0.0.0' + }); + + const callData = mixpanel.send_request.mock.calls[0][0].data; + expect(callData.properties.$is_ai_bot).toBe(false); + expect(callData.properties.$ai_bot_name).toBeUndefined(); + }); + + it('should preserve existing properties alongside bot classification', () => { + const { enable_bot_classification } = require('../lib/ai_bot_middleware'); + mixpanel = Mixpanel.init('test-token'); + enable_bot_classification(mixpanel); + + vi.spyOn(mixpanel, 'send_request'); + + mixpanel.track('page_view', { + distinct_id: 'user123', + $user_agent: 'GPTBot/1.2', + page_url: '/products', + custom_prop: 'value' + }); + + const props = mixpanel.send_request.mock.calls[0][0].data.properties; + expect(props.page_url).toBe('/products'); + expect(props.custom_prop).toBe('value'); + expect(props.$is_ai_bot).toBe(true); + }); + + it('should preserve callback functionality', () => { + const { enable_bot_classification } = require('../lib/ai_bot_middleware'); + mixpanel = Mixpanel.init('test-token'); + enable_bot_classification(mixpanel); + + vi.spyOn(mixpanel, 'send_request'); + + const callback = vi.fn(); + mixpanel.track('page_view', { $user_agent: 'GPTBot/1.2' }, callback); + + expect(mixpanel.send_request).toHaveBeenCalledWith( + expect.anything(), + callback + ); + }); + + it('should support callback as second argument (no properties)', () => { + const { enable_bot_classification } = require('../lib/ai_bot_middleware'); + mixpanel = Mixpanel.init('test-token'); + enable_bot_classification(mixpanel); + + vi.spyOn(mixpanel, 'send_request'); + + const callback = vi.fn(); + mixpanel.track('page_view', callback); + + // When callback is passed as 2nd arg, properties should be empty + // and no bot classification should be added + expect(mixpanel.send_request).toHaveBeenCalled(); + }); + + it('should NOT enrich track_batch events (known limitation — track_batch bypasses send_event_request)', () => { + const { enable_bot_classification } = require('../lib/ai_bot_middleware'); + mixpanel = Mixpanel.init('test-token'); + enable_bot_classification(mixpanel); + + vi.spyOn(mixpanel, 'send_request'); + + mixpanel.track_batch([ + { event: 'page_view', properties: { $user_agent: 'GPTBot/1.2', distinct_id: 'bot1' } }, + { event: 'page_view', properties: { $user_agent: 'Chrome/120', distinct_id: 'user1' } } + ]); + + // track_batch goes through send_batch_requests -> send_request, NOT send_event_request + // so bot classification is not applied + const call = mixpanel.send_request.mock.calls[0][0]; + expect(call.data[0].properties.$is_ai_bot).toBeUndefined(); + expect(call.data[1].properties.$is_ai_bot).toBeUndefined(); + }); + + it('should not modify the original properties object', () => { + const { enable_bot_classification } = require('../lib/ai_bot_middleware'); + mixpanel = Mixpanel.init('test-token'); + enable_bot_classification(mixpanel); + + vi.spyOn(mixpanel, 'send_request'); + + const props = { distinct_id: 'user123', $user_agent: 'GPTBot/1.2' }; + const originalKeys = Object.keys(props); + mixpanel.track('page_view', props); + + // Original object should not have been mutated + expect(Object.keys(props).sort()).toEqual(originalKeys.sort()); + }); + }); + + describe('configuration options', () => { + + it('should accept custom user_agent_property name', () => { + const { enable_bot_classification } = require('../lib/ai_bot_middleware'); + mixpanel = Mixpanel.init('test-token'); + enable_bot_classification(mixpanel, { + user_agent_property: 'ua_string' + }); + + vi.spyOn(mixpanel, 'send_request'); + + mixpanel.track('page_view', { + distinct_id: 'user123', + ua_string: 'GPTBot/1.2' + }); + + const props = mixpanel.send_request.mock.calls[0][0].data.properties; + expect(props.$is_ai_bot).toBe(true); + }); + + it('should accept custom property prefix', () => { + const { enable_bot_classification } = require('../lib/ai_bot_middleware'); + mixpanel = Mixpanel.init('test-token'); + enable_bot_classification(mixpanel, { + property_prefix: 'bot_' + }); + + vi.spyOn(mixpanel, 'send_request'); + + mixpanel.track('page_view', { + $user_agent: 'GPTBot/1.2' + }); + + const props = mixpanel.send_request.mock.calls[0][0].data.properties; + expect(props.bot_is_ai_bot).toBe(true); + expect(props.bot_name).toBe('GPTBot'); + }); + + it('should allow disabling classification without removing middleware', () => { + const { enable_bot_classification } = require('../lib/ai_bot_middleware'); + mixpanel = Mixpanel.init('test-token'); + const controller = enable_bot_classification(mixpanel); + + vi.spyOn(mixpanel, 'send_request'); + + controller.disable(); + mixpanel.track('page_view', { $user_agent: 'GPTBot/1.2' }); + + const props = mixpanel.send_request.mock.calls[0][0].data.properties; + expect(props.$is_ai_bot).toBeUndefined(); + + controller.enable(); + mixpanel.track('page_view', { $user_agent: 'GPTBot/1.2' }); + + const props2 = mixpanel.send_request.mock.calls[1][0].data.properties; + expect(props2.$is_ai_bot).toBe(true); + }); + }); + + describe('helper: track_request', () => { + + it('should provide a helper that extracts user-agent from HTTP request', () => { + const { enable_bot_classification, track_request } = require('../lib/ai_bot_middleware'); + mixpanel = Mixpanel.init('test-token'); + enable_bot_classification(mixpanel); + + vi.spyOn(mixpanel, 'send_request'); + + // Simulate an Express/Node.js request object + const mockReq = { + headers: { + 'user-agent': 'GPTBot/1.2', + 'x-forwarded-for': '1.2.3.4' + }, + ip: '1.2.3.4', + url: '/api/products' + }; + + track_request(mixpanel, mockReq, 'page_view', { + distinct_id: 'user123', + page_url: '/api/products' + }); + + const props = mixpanel.send_request.mock.calls[0][0].data.properties; + expect(props.$user_agent).toBe('GPTBot/1.2'); + expect(props.$is_ai_bot).toBe(true); + expect(props.ip).toBe('1.2.3.4'); + expect(props.page_url).toBe('/api/products'); + }); + + it('should handle request with no user-agent header', () => { + const { enable_bot_classification, track_request } = require('../lib/ai_bot_middleware'); + mixpanel = Mixpanel.init('test-token'); + enable_bot_classification(mixpanel); + + vi.spyOn(mixpanel, 'send_request'); + + const mockReq = { headers: {}, ip: '1.2.3.4' }; + track_request(mixpanel, mockReq, 'page_view', { distinct_id: 'user123' }); + + const props = mixpanel.send_request.mock.calls[0][0].data.properties; + expect(props.$is_ai_bot).toBeUndefined(); + }); + }); +}); From 4d6483c823bd707aa2c7ae01ae4c3e779dc07ca1 Mon Sep 17 00:00:00 2001 From: Jared McFarland Date: Wed, 18 Feb 2026 18:51:55 -0800 Subject: [PATCH 2/7] feat: implement AI bot classifier Part of AI bot classification feature for Node.js SDK. --- lib/ai_bot_classifier.js | 175 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100644 lib/ai_bot_classifier.js diff --git a/lib/ai_bot_classifier.js b/lib/ai_bot_classifier.js new file mode 100644 index 0000000..eef1580 --- /dev/null +++ b/lib/ai_bot_classifier.js @@ -0,0 +1,175 @@ +// lib/ai_bot_classifier.js + +const AI_BOT_DATABASE = [ + // === OpenAI === + { + pattern: /GPTBot\//i, + name: 'GPTBot', + provider: 'OpenAI', + category: 'indexing', + description: 'OpenAI web crawler for model training data', + ip_ranges_url: 'https://openai.com/gptbot.json' + }, + { + pattern: /ChatGPT-User\//i, + name: 'ChatGPT-User', + provider: 'OpenAI', + category: 'retrieval', + description: 'ChatGPT real-time retrieval for user queries (RAG)', + ip_ranges_url: 'https://openai.com/chatgpt-user.json' + }, + { + pattern: /OAI-SearchBot\//i, + name: 'OAI-SearchBot', + provider: 'OpenAI', + category: 'indexing', + description: 'OpenAI search indexing crawler', + ip_ranges_url: 'https://openai.com/searchbot.json' + }, + + // === Anthropic === + { + pattern: /ClaudeBot\//i, + name: 'ClaudeBot', + provider: 'Anthropic', + category: 'indexing', + description: 'Anthropic web crawler for model training', + ip_ranges_url: null // Anthropic publishes ranges but URL may vary + }, + { + pattern: /Claude-User\//i, + name: 'Claude-User', + provider: 'Anthropic', + category: 'retrieval', + description: 'Claude real-time retrieval for user queries' + }, + + // === Google === + { + pattern: /Google-Extended\//i, + name: 'Google-Extended', + provider: 'Google', + category: 'indexing', + description: 'Google AI training data crawler (separate from Googlebot)' + }, + + // === Perplexity === + { + pattern: /PerplexityBot\//i, + name: 'PerplexityBot', + provider: 'Perplexity', + category: 'retrieval', + description: 'Perplexity AI search crawler' + }, + + // === ByteDance === + { + pattern: /Bytespider\//i, + name: 'Bytespider', + provider: 'ByteDance', + category: 'indexing', + description: 'ByteDance/TikTok AI crawler' + }, + + // === Common Crawl === + { + pattern: /CCBot\//i, + name: 'CCBot', + provider: 'Common Crawl', + category: 'indexing', + description: 'Common Crawl bot (data used by many AI models)' + }, + + // === Apple === + { + pattern: /Applebot-Extended\//i, + name: 'Applebot-Extended', + provider: 'Apple', + category: 'indexing', + description: 'Apple AI/Siri training data crawler' + }, + + // === Meta === + { + pattern: /Meta-ExternalAgent\//i, + name: 'Meta-ExternalAgent', + provider: 'Meta', + category: 'indexing', + description: 'Meta/Facebook AI training data crawler' + }, + + // === Cohere === + { + pattern: /cohere-ai\//i, + name: 'cohere-ai', + provider: 'Cohere', + category: 'indexing', + description: 'Cohere AI training data crawler' + }, +]; + +/** + * Classify a user-agent string against the AI bot database. + * @param {string} userAgent - The user-agent string to classify + * @returns {Object} Classification result with $is_ai_bot and optional bot details + */ +function classify_user_agent(userAgent) { + if (!userAgent || typeof userAgent !== 'string') { + return { $is_ai_bot: false }; + } + + for (const bot of AI_BOT_DATABASE) { + if (bot.pattern.test(userAgent)) { + return { + $is_ai_bot: true, + $ai_bot_name: bot.name, + $ai_bot_provider: bot.provider, + $ai_bot_category: bot.category, + }; + } + } + + return { $is_ai_bot: false }; +} + +/** + * Create a classifier with optional additional bot patterns. + * @param {Object} options + * @param {Array} options.additional_bots - Additional bot patterns to check (checked first) + * @returns {Function} A classify_user_agent function + */ +function create_classifier(options) { + const additional = (options && options.additional_bots) || []; + const combined = [...additional, ...AI_BOT_DATABASE]; + + return function(userAgent) { + if (!userAgent || typeof userAgent !== 'string') { + return { $is_ai_bot: false }; + } + + for (const bot of combined) { + if (bot.pattern.test(userAgent)) { + return { + $is_ai_bot: true, + $ai_bot_name: bot.name, + $ai_bot_provider: bot.provider, + $ai_bot_category: bot.category, + }; + } + } + + return { $is_ai_bot: false }; + }; +} + +function get_bot_database() { + return AI_BOT_DATABASE.map(bot => ({ + pattern: bot.pattern, + name: bot.name, + provider: bot.provider, + category: bot.category, + description: bot.description || '', + })); +} + +module.exports = { classify_user_agent, create_classifier, get_bot_database }; From 3d91b86b4aeb84f7880bde01409db8f3d9f99594 Mon Sep 17 00:00:00 2001 From: Jared McFarland Date: Wed, 18 Feb 2026 18:52:57 -0800 Subject: [PATCH 3/7] feat: add bot classification middleware/integration Part of AI bot classification feature for Node.js SDK. --- lib/ai_bot_middleware.js | 77 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 lib/ai_bot_middleware.js diff --git a/lib/ai_bot_middleware.js b/lib/ai_bot_middleware.js new file mode 100644 index 0000000..f03da50 --- /dev/null +++ b/lib/ai_bot_middleware.js @@ -0,0 +1,77 @@ +// lib/ai_bot_middleware.js + +const { classify_user_agent, create_classifier } = require('./ai_bot_classifier'); + +/** + * Enable AI bot classification on a Mixpanel client instance. + * Wraps track() to auto-classify when $user_agent property is present. + * + * @param {Object} mixpanel - Mixpanel client from Mixpanel.init() + * @param {Object} [options] + * @param {string} [options.user_agent_property='$user_agent'] - Property name containing the UA string + * @param {string} [options.property_prefix='$'] - Prefix for classification properties + * @param {Array} [options.additional_bots] - Additional bot patterns + * @returns {Object} Controller with enable()/disable() methods + */ +function enable_bot_classification(mixpanel, options) { + const opts = options || {}; + const uaProp = opts.user_agent_property || '$user_agent'; + const prefix = opts.property_prefix || '$'; + const classify = opts.additional_bots + ? create_classifier({ additional_bots: opts.additional_bots }) + : classify_user_agent; + + let enabled = true; + + // Wrap send_event_request — the single chokepoint for all event data + const originalSendEvent = mixpanel.send_event_request; + mixpanel.send_event_request = function(endpoint, event, properties, callback) { + var enrichedProperties = properties; + if (enabled && properties && properties[uaProp]) { + const classification = classify(properties[uaProp]); + // Map classification properties with the configured prefix + if (prefix === '$') { + enrichedProperties = Object.assign({}, properties, classification); + } else { + enrichedProperties = Object.assign({}, properties); + for (const [key, value] of Object.entries(classification)) { + // $is_ai_bot -> {prefix}is_ai_bot; $ai_bot_name -> {prefix}name + const newKey = key.startsWith('$ai_bot_') + ? prefix + key.substring('$ai_bot_'.length) + : prefix + key.substring(1); + enrichedProperties[newKey] = value; + } + } + } + originalSendEvent.call(mixpanel, endpoint, event, enrichedProperties, callback); + }; + + return { + enable: function() { enabled = true; }, + disable: function() { enabled = false; }, + }; +} + +/** + * Helper: Track an event with automatic user-agent and IP extraction from an HTTP request. + * + * @param {Object} mixpanel - Mixpanel client + * @param {Object} req - Node.js HTTP IncomingMessage (or Express Request) + * @param {string} eventName - Event name + * @param {Object} [properties] - Additional properties + * @param {Function} [callback] - Callback + */ +function track_request(mixpanel, req, eventName, properties, callback) { + properties = properties || {}; + const ua = req.headers && req.headers['user-agent']; + if (ua) { + properties.$user_agent = ua; + } + const ip = req.ip || (req.headers && req.headers['x-forwarded-for']) || (req.connection && req.connection.remoteAddress); + if (ip) { + properties.ip = ip; + } + mixpanel.track(eventName, properties, callback); +} + +module.exports = { enable_bot_classification, track_request }; From 178182cfb53ba62a71006fcacfc467495feeede0 Mon Sep 17 00:00:00 2001 From: Jared McFarland Date: Wed, 18 Feb 2026 18:53:34 -0800 Subject: [PATCH 4/7] feat: add type definitions and exports Part of AI bot classification feature for Node.js SDK. --- lib/ai_bot_classifier.d.ts | 19 +++++++++++++++++++ lib/ai_bot_middleware.d.ts | 30 ++++++++++++++++++++++++++++++ lib/mixpanel-node.d.ts | 8 ++++++++ lib/mixpanel-node.js | 2 ++ 4 files changed, 59 insertions(+) create mode 100644 lib/ai_bot_classifier.d.ts create mode 100644 lib/ai_bot_middleware.d.ts diff --git a/lib/ai_bot_classifier.d.ts b/lib/ai_bot_classifier.d.ts new file mode 100644 index 0000000..80adee8 --- /dev/null +++ b/lib/ai_bot_classifier.d.ts @@ -0,0 +1,19 @@ +export interface AiBotEntry { + pattern: RegExp; + name: string; + provider: string; + category: 'indexing' | 'retrieval' | 'agent'; + description?: string; + ip_ranges_url?: string; +} + +export interface AiBotClassification { + $is_ai_bot: boolean; + $ai_bot_name?: string; + $ai_bot_provider?: string; + $ai_bot_category?: 'indexing' | 'retrieval' | 'agent'; +} + +export function classify_user_agent(userAgent: string | null | undefined): AiBotClassification; +export function create_classifier(options: { additional_bots?: AiBotEntry[] }): (userAgent: string) => AiBotClassification; +export function get_bot_database(): AiBotEntry[]; diff --git a/lib/ai_bot_middleware.d.ts b/lib/ai_bot_middleware.d.ts new file mode 100644 index 0000000..7853a9f --- /dev/null +++ b/lib/ai_bot_middleware.d.ts @@ -0,0 +1,30 @@ +import { IncomingMessage } from 'http'; + +export interface BotClassificationOptions { + user_agent_property?: string; + property_prefix?: string; + additional_bots?: Array<{ + pattern: RegExp; + name: string; + provider: string; + category: 'indexing' | 'retrieval' | 'agent'; + }>; +} + +export interface BotClassificationController { + enable(): void; + disable(): void; +} + +export function enable_bot_classification( + mixpanel: any, + options?: BotClassificationOptions +): BotClassificationController; + +export function track_request( + mixpanel: any, + req: IncomingMessage, + eventName: string, + properties?: Record, + callback?: (err?: Error) => void +): void; diff --git a/lib/mixpanel-node.d.ts b/lib/mixpanel-node.d.ts index 2982f73..eaa09de 100644 --- a/lib/mixpanel-node.d.ts +++ b/lib/mixpanel-node.d.ts @@ -406,3 +406,11 @@ declare namespace mixpanel { } export = mixpanel; + +import * as AiBotMiddleware from './ai_bot_middleware'; +import * as AiBotClassifier from './ai_bot_classifier'; + +declare module 'mixpanel' { + export const ai: typeof AiBotMiddleware; + export const AiBotClassifier: typeof AiBotClassifier; +} diff --git a/lib/mixpanel-node.js b/lib/mixpanel-node.js index d7cdb85..ce1aa16 100644 --- a/lib/mixpanel-node.js +++ b/lib/mixpanel-node.js @@ -535,4 +535,6 @@ const create_client = function (token, config) { // module exporting module.exports = { init: create_client, + ai: require('./ai_bot_middleware'), + AiBotClassifier: require('./ai_bot_classifier'), }; From 93449e9091fbb5a85c27b94592bc56cd7a6f1b6c Mon Sep 17 00:00:00 2001 From: Jared McFarland Date: Wed, 18 Feb 2026 19:08:15 -0800 Subject: [PATCH 5/7] style: format code with prettier Fix CI formatting check for AI bot classification files. --- lib/ai_bot_classifier.d.ts | 12 +- lib/ai_bot_classifier.js | 114 ++++++++--------- lib/ai_bot_middleware.d.ts | 8 +- lib/ai_bot_middleware.js | 45 +++++-- lib/mixpanel-node.d.ts | 6 +- lib/mixpanel-node.js | 4 +- test/ai_bot_classifier.js | 246 ++++++++++++++++++++----------------- test/ai_bot_middleware.js | 229 +++++++++++++++++----------------- 8 files changed, 358 insertions(+), 306 deletions(-) diff --git a/lib/ai_bot_classifier.d.ts b/lib/ai_bot_classifier.d.ts index 80adee8..f2f1868 100644 --- a/lib/ai_bot_classifier.d.ts +++ b/lib/ai_bot_classifier.d.ts @@ -2,7 +2,7 @@ export interface AiBotEntry { pattern: RegExp; name: string; provider: string; - category: 'indexing' | 'retrieval' | 'agent'; + category: "indexing" | "retrieval" | "agent"; description?: string; ip_ranges_url?: string; } @@ -11,9 +11,13 @@ export interface AiBotClassification { $is_ai_bot: boolean; $ai_bot_name?: string; $ai_bot_provider?: string; - $ai_bot_category?: 'indexing' | 'retrieval' | 'agent'; + $ai_bot_category?: "indexing" | "retrieval" | "agent"; } -export function classify_user_agent(userAgent: string | null | undefined): AiBotClassification; -export function create_classifier(options: { additional_bots?: AiBotEntry[] }): (userAgent: string) => AiBotClassification; +export function classify_user_agent( + userAgent: string | null | undefined, +): AiBotClassification; +export function create_classifier(options: { + additional_bots?: AiBotEntry[]; +}): (userAgent: string) => AiBotClassification; export function get_bot_database(): AiBotEntry[]; diff --git a/lib/ai_bot_classifier.js b/lib/ai_bot_classifier.js index eef1580..b94d259 100644 --- a/lib/ai_bot_classifier.js +++ b/lib/ai_bot_classifier.js @@ -4,107 +4,107 @@ const AI_BOT_DATABASE = [ // === OpenAI === { pattern: /GPTBot\//i, - name: 'GPTBot', - provider: 'OpenAI', - category: 'indexing', - description: 'OpenAI web crawler for model training data', - ip_ranges_url: 'https://openai.com/gptbot.json' + name: "GPTBot", + provider: "OpenAI", + category: "indexing", + description: "OpenAI web crawler for model training data", + ip_ranges_url: "https://openai.com/gptbot.json", }, { pattern: /ChatGPT-User\//i, - name: 'ChatGPT-User', - provider: 'OpenAI', - category: 'retrieval', - description: 'ChatGPT real-time retrieval for user queries (RAG)', - ip_ranges_url: 'https://openai.com/chatgpt-user.json' + name: "ChatGPT-User", + provider: "OpenAI", + category: "retrieval", + description: "ChatGPT real-time retrieval for user queries (RAG)", + ip_ranges_url: "https://openai.com/chatgpt-user.json", }, { pattern: /OAI-SearchBot\//i, - name: 'OAI-SearchBot', - provider: 'OpenAI', - category: 'indexing', - description: 'OpenAI search indexing crawler', - ip_ranges_url: 'https://openai.com/searchbot.json' + name: "OAI-SearchBot", + provider: "OpenAI", + category: "indexing", + description: "OpenAI search indexing crawler", + ip_ranges_url: "https://openai.com/searchbot.json", }, // === Anthropic === { pattern: /ClaudeBot\//i, - name: 'ClaudeBot', - provider: 'Anthropic', - category: 'indexing', - description: 'Anthropic web crawler for model training', - ip_ranges_url: null // Anthropic publishes ranges but URL may vary + name: "ClaudeBot", + provider: "Anthropic", + category: "indexing", + description: "Anthropic web crawler for model training", + ip_ranges_url: null, // Anthropic publishes ranges but URL may vary }, { pattern: /Claude-User\//i, - name: 'Claude-User', - provider: 'Anthropic', - category: 'retrieval', - description: 'Claude real-time retrieval for user queries' + name: "Claude-User", + provider: "Anthropic", + category: "retrieval", + description: "Claude real-time retrieval for user queries", }, // === Google === { pattern: /Google-Extended\//i, - name: 'Google-Extended', - provider: 'Google', - category: 'indexing', - description: 'Google AI training data crawler (separate from Googlebot)' + name: "Google-Extended", + provider: "Google", + category: "indexing", + description: "Google AI training data crawler (separate from Googlebot)", }, // === Perplexity === { pattern: /PerplexityBot\//i, - name: 'PerplexityBot', - provider: 'Perplexity', - category: 'retrieval', - description: 'Perplexity AI search crawler' + name: "PerplexityBot", + provider: "Perplexity", + category: "retrieval", + description: "Perplexity AI search crawler", }, // === ByteDance === { pattern: /Bytespider\//i, - name: 'Bytespider', - provider: 'ByteDance', - category: 'indexing', - description: 'ByteDance/TikTok AI crawler' + name: "Bytespider", + provider: "ByteDance", + category: "indexing", + description: "ByteDance/TikTok AI crawler", }, // === Common Crawl === { pattern: /CCBot\//i, - name: 'CCBot', - provider: 'Common Crawl', - category: 'indexing', - description: 'Common Crawl bot (data used by many AI models)' + name: "CCBot", + provider: "Common Crawl", + category: "indexing", + description: "Common Crawl bot (data used by many AI models)", }, // === Apple === { pattern: /Applebot-Extended\//i, - name: 'Applebot-Extended', - provider: 'Apple', - category: 'indexing', - description: 'Apple AI/Siri training data crawler' + name: "Applebot-Extended", + provider: "Apple", + category: "indexing", + description: "Apple AI/Siri training data crawler", }, // === Meta === { pattern: /Meta-ExternalAgent\//i, - name: 'Meta-ExternalAgent', - provider: 'Meta', - category: 'indexing', - description: 'Meta/Facebook AI training data crawler' + name: "Meta-ExternalAgent", + provider: "Meta", + category: "indexing", + description: "Meta/Facebook AI training data crawler", }, // === Cohere === { pattern: /cohere-ai\//i, - name: 'cohere-ai', - provider: 'Cohere', - category: 'indexing', - description: 'Cohere AI training data crawler' + name: "cohere-ai", + provider: "Cohere", + category: "indexing", + description: "Cohere AI training data crawler", }, ]; @@ -114,7 +114,7 @@ const AI_BOT_DATABASE = [ * @returns {Object} Classification result with $is_ai_bot and optional bot details */ function classify_user_agent(userAgent) { - if (!userAgent || typeof userAgent !== 'string') { + if (!userAgent || typeof userAgent !== "string") { return { $is_ai_bot: false }; } @@ -142,8 +142,8 @@ function create_classifier(options) { const additional = (options && options.additional_bots) || []; const combined = [...additional, ...AI_BOT_DATABASE]; - return function(userAgent) { - if (!userAgent || typeof userAgent !== 'string') { + return function (userAgent) { + if (!userAgent || typeof userAgent !== "string") { return { $is_ai_bot: false }; } @@ -163,12 +163,12 @@ function create_classifier(options) { } function get_bot_database() { - return AI_BOT_DATABASE.map(bot => ({ + return AI_BOT_DATABASE.map((bot) => ({ pattern: bot.pattern, name: bot.name, provider: bot.provider, category: bot.category, - description: bot.description || '', + description: bot.description || "", })); } diff --git a/lib/ai_bot_middleware.d.ts b/lib/ai_bot_middleware.d.ts index 7853a9f..8e65921 100644 --- a/lib/ai_bot_middleware.d.ts +++ b/lib/ai_bot_middleware.d.ts @@ -1,4 +1,4 @@ -import { IncomingMessage } from 'http'; +import { IncomingMessage } from "http"; export interface BotClassificationOptions { user_agent_property?: string; @@ -7,7 +7,7 @@ export interface BotClassificationOptions { pattern: RegExp; name: string; provider: string; - category: 'indexing' | 'retrieval' | 'agent'; + category: "indexing" | "retrieval" | "agent"; }>; } @@ -18,7 +18,7 @@ export interface BotClassificationController { export function enable_bot_classification( mixpanel: any, - options?: BotClassificationOptions + options?: BotClassificationOptions, ): BotClassificationController; export function track_request( @@ -26,5 +26,5 @@ export function track_request( req: IncomingMessage, eventName: string, properties?: Record, - callback?: (err?: Error) => void + callback?: (err?: Error) => void, ): void; diff --git a/lib/ai_bot_middleware.js b/lib/ai_bot_middleware.js index f03da50..4dcd27d 100644 --- a/lib/ai_bot_middleware.js +++ b/lib/ai_bot_middleware.js @@ -1,6 +1,9 @@ // lib/ai_bot_middleware.js -const { classify_user_agent, create_classifier } = require('./ai_bot_classifier'); +const { + classify_user_agent, + create_classifier, +} = require("./ai_bot_classifier"); /** * Enable AI bot classification on a Mixpanel client instance. @@ -15,8 +18,8 @@ const { classify_user_agent, create_classifier } = require('./ai_bot_classifier' */ function enable_bot_classification(mixpanel, options) { const opts = options || {}; - const uaProp = opts.user_agent_property || '$user_agent'; - const prefix = opts.property_prefix || '$'; + const uaProp = opts.user_agent_property || "$user_agent"; + const prefix = opts.property_prefix || "$"; const classify = opts.additional_bots ? create_classifier({ additional_bots: opts.additional_bots }) : classify_user_agent; @@ -25,30 +28,45 @@ function enable_bot_classification(mixpanel, options) { // Wrap send_event_request — the single chokepoint for all event data const originalSendEvent = mixpanel.send_event_request; - mixpanel.send_event_request = function(endpoint, event, properties, callback) { + mixpanel.send_event_request = function ( + endpoint, + event, + properties, + callback, + ) { var enrichedProperties = properties; if (enabled && properties && properties[uaProp]) { const classification = classify(properties[uaProp]); // Map classification properties with the configured prefix - if (prefix === '$') { + if (prefix === "$") { enrichedProperties = Object.assign({}, properties, classification); } else { enrichedProperties = Object.assign({}, properties); for (const [key, value] of Object.entries(classification)) { // $is_ai_bot -> {prefix}is_ai_bot; $ai_bot_name -> {prefix}name - const newKey = key.startsWith('$ai_bot_') - ? prefix + key.substring('$ai_bot_'.length) + const newKey = key.startsWith("$ai_bot_") + ? prefix + key.substring("$ai_bot_".length) : prefix + key.substring(1); enrichedProperties[newKey] = value; } } } - originalSendEvent.call(mixpanel, endpoint, event, enrichedProperties, callback); + originalSendEvent.call( + mixpanel, + endpoint, + event, + enrichedProperties, + callback, + ); }; return { - enable: function() { enabled = true; }, - disable: function() { enabled = false; }, + enable: function () { + enabled = true; + }, + disable: function () { + enabled = false; + }, }; } @@ -63,11 +81,14 @@ function enable_bot_classification(mixpanel, options) { */ function track_request(mixpanel, req, eventName, properties, callback) { properties = properties || {}; - const ua = req.headers && req.headers['user-agent']; + const ua = req.headers && req.headers["user-agent"]; if (ua) { properties.$user_agent = ua; } - const ip = req.ip || (req.headers && req.headers['x-forwarded-for']) || (req.connection && req.connection.remoteAddress); + const ip = + req.ip || + (req.headers && req.headers["x-forwarded-for"]) || + (req.connection && req.connection.remoteAddress); if (ip) { properties.ip = ip; } diff --git a/lib/mixpanel-node.d.ts b/lib/mixpanel-node.d.ts index eaa09de..8919e66 100644 --- a/lib/mixpanel-node.d.ts +++ b/lib/mixpanel-node.d.ts @@ -407,10 +407,10 @@ declare namespace mixpanel { export = mixpanel; -import * as AiBotMiddleware from './ai_bot_middleware'; -import * as AiBotClassifier from './ai_bot_classifier'; +import * as AiBotMiddleware from "./ai_bot_middleware"; +import * as AiBotClassifier from "./ai_bot_classifier"; -declare module 'mixpanel' { +declare module "mixpanel" { export const ai: typeof AiBotMiddleware; export const AiBotClassifier: typeof AiBotClassifier; } diff --git a/lib/mixpanel-node.js b/lib/mixpanel-node.js index ce1aa16..c60f2ef 100644 --- a/lib/mixpanel-node.js +++ b/lib/mixpanel-node.js @@ -535,6 +535,6 @@ const create_client = function (token, config) { // module exporting module.exports = { init: create_client, - ai: require('./ai_bot_middleware'), - AiBotClassifier: require('./ai_bot_classifier'), + ai: require("./ai_bot_middleware"), + AiBotClassifier: require("./ai_bot_classifier"), }; diff --git a/test/ai_bot_classifier.js b/test/ai_bot_classifier.js index cac309d..2af7791 100644 --- a/test/ai_bot_classifier.js +++ b/test/ai_bot_classifier.js @@ -3,244 +3,262 @@ // These tests define the expected behavior of the classifier // Write these BEFORE implementing lib/ai_bot_classifier.js -describe('AiBotClassifier', () => { - +describe("AiBotClassifier", () => { let classify; beforeEach(() => { - const { classify_user_agent } = require('../lib/ai_bot_classifier'); + const { classify_user_agent } = require("../lib/ai_bot_classifier"); classify = classify_user_agent; }); // === CORE CLASSIFICATION === - describe('classify_user_agent', () => { - + describe("classify_user_agent", () => { // --- OpenAI Bots --- - it('should classify GPTBot user agent', () => { - const result = classify('Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)'); + it("should classify GPTBot user agent", () => { + const result = classify( + "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)", + ); expect(result.$is_ai_bot).toBe(true); - expect(result.$ai_bot_name).toBe('GPTBot'); - expect(result.$ai_bot_provider).toBe('OpenAI'); - expect(result.$ai_bot_category).toBe('indexing'); + expect(result.$ai_bot_name).toBe("GPTBot"); + expect(result.$ai_bot_provider).toBe("OpenAI"); + expect(result.$ai_bot_category).toBe("indexing"); }); - it('should classify ChatGPT-User agent', () => { - const result = classify('Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; ChatGPT-User/1.0; +https://openai.com/bot)'); + it("should classify ChatGPT-User agent", () => { + const result = classify( + "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; ChatGPT-User/1.0; +https://openai.com/bot)", + ); expect(result.$is_ai_bot).toBe(true); - expect(result.$ai_bot_name).toBe('ChatGPT-User'); - expect(result.$ai_bot_provider).toBe('OpenAI'); - expect(result.$ai_bot_category).toBe('retrieval'); + expect(result.$ai_bot_name).toBe("ChatGPT-User"); + expect(result.$ai_bot_provider).toBe("OpenAI"); + expect(result.$ai_bot_category).toBe("retrieval"); }); - it('should classify OAI-SearchBot agent', () => { - const result = classify('Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; OAI-SearchBot/1.0; +https://openai.com/searchbot)'); + it("should classify OAI-SearchBot agent", () => { + const result = classify( + "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; OAI-SearchBot/1.0; +https://openai.com/searchbot)", + ); expect(result.$is_ai_bot).toBe(true); - expect(result.$ai_bot_name).toBe('OAI-SearchBot'); - expect(result.$ai_bot_provider).toBe('OpenAI'); - expect(result.$ai_bot_category).toBe('indexing'); + expect(result.$ai_bot_name).toBe("OAI-SearchBot"); + expect(result.$ai_bot_provider).toBe("OpenAI"); + expect(result.$ai_bot_category).toBe("indexing"); }); // --- Anthropic Bots --- - it('should classify ClaudeBot agent', () => { - const result = classify('Mozilla/5.0 (compatible; ClaudeBot/1.0; +claudebot@anthropic.com)'); + it("should classify ClaudeBot agent", () => { + const result = classify( + "Mozilla/5.0 (compatible; ClaudeBot/1.0; +claudebot@anthropic.com)", + ); expect(result.$is_ai_bot).toBe(true); - expect(result.$ai_bot_name).toBe('ClaudeBot'); - expect(result.$ai_bot_provider).toBe('Anthropic'); - expect(result.$ai_bot_category).toBe('indexing'); + expect(result.$ai_bot_name).toBe("ClaudeBot"); + expect(result.$ai_bot_provider).toBe("Anthropic"); + expect(result.$ai_bot_category).toBe("indexing"); }); - it('should classify Claude-User agent', () => { - const result = classify('Mozilla/5.0 (compatible; Claude-User/1.0)'); + it("should classify Claude-User agent", () => { + const result = classify("Mozilla/5.0 (compatible; Claude-User/1.0)"); expect(result.$is_ai_bot).toBe(true); - expect(result.$ai_bot_name).toBe('Claude-User'); - expect(result.$ai_bot_provider).toBe('Anthropic'); - expect(result.$ai_bot_category).toBe('retrieval'); + expect(result.$ai_bot_name).toBe("Claude-User"); + expect(result.$ai_bot_provider).toBe("Anthropic"); + expect(result.$ai_bot_category).toBe("retrieval"); }); // --- Google Bots --- - it('should classify Google-Extended agent', () => { - const result = classify('Mozilla/5.0 (compatible; Google-Extended/1.0)'); + it("should classify Google-Extended agent", () => { + const result = classify("Mozilla/5.0 (compatible; Google-Extended/1.0)"); expect(result.$is_ai_bot).toBe(true); - expect(result.$ai_bot_name).toBe('Google-Extended'); - expect(result.$ai_bot_provider).toBe('Google'); - expect(result.$ai_bot_category).toBe('indexing'); + expect(result.$ai_bot_name).toBe("Google-Extended"); + expect(result.$ai_bot_provider).toBe("Google"); + expect(result.$ai_bot_category).toBe("indexing"); }); // --- Perplexity --- - it('should classify PerplexityBot agent', () => { - const result = classify('Mozilla/5.0 (compatible; PerplexityBot/1.0)'); + it("should classify PerplexityBot agent", () => { + const result = classify("Mozilla/5.0 (compatible; PerplexityBot/1.0)"); expect(result.$is_ai_bot).toBe(true); - expect(result.$ai_bot_name).toBe('PerplexityBot'); - expect(result.$ai_bot_provider).toBe('Perplexity'); - expect(result.$ai_bot_category).toBe('retrieval'); + expect(result.$ai_bot_name).toBe("PerplexityBot"); + expect(result.$ai_bot_provider).toBe("Perplexity"); + expect(result.$ai_bot_category).toBe("retrieval"); }); // --- ByteDance --- - it('should classify Bytespider agent', () => { - const result = classify('Mozilla/5.0 (compatible; Bytespider/1.0)'); + it("should classify Bytespider agent", () => { + const result = classify("Mozilla/5.0 (compatible; Bytespider/1.0)"); expect(result.$is_ai_bot).toBe(true); - expect(result.$ai_bot_name).toBe('Bytespider'); - expect(result.$ai_bot_provider).toBe('ByteDance'); + expect(result.$ai_bot_name).toBe("Bytespider"); + expect(result.$ai_bot_provider).toBe("ByteDance"); }); // --- Common Crawl --- - it('should classify CCBot agent', () => { - const result = classify('CCBot/2.0 (https://commoncrawl.org/faq/)'); + it("should classify CCBot agent", () => { + const result = classify("CCBot/2.0 (https://commoncrawl.org/faq/)"); expect(result.$is_ai_bot).toBe(true); - expect(result.$ai_bot_name).toBe('CCBot'); - expect(result.$ai_bot_provider).toBe('Common Crawl'); + expect(result.$ai_bot_name).toBe("CCBot"); + expect(result.$ai_bot_provider).toBe("Common Crawl"); }); // --- Apple --- - it('should classify Applebot-Extended agent', () => { - const result = classify('Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Applebot-Extended/0.1'); + it("should classify Applebot-Extended agent", () => { + const result = classify( + "Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Applebot-Extended/0.1", + ); expect(result.$is_ai_bot).toBe(true); - expect(result.$ai_bot_name).toBe('Applebot-Extended'); - expect(result.$ai_bot_provider).toBe('Apple'); + expect(result.$ai_bot_name).toBe("Applebot-Extended"); + expect(result.$ai_bot_provider).toBe("Apple"); }); // --- Meta --- - it('should classify Meta-ExternalAgent agent', () => { - const result = classify('Mozilla/5.0 (compatible; Meta-ExternalAgent/1.0)'); + it("should classify Meta-ExternalAgent agent", () => { + const result = classify( + "Mozilla/5.0 (compatible; Meta-ExternalAgent/1.0)", + ); expect(result.$is_ai_bot).toBe(true); - expect(result.$ai_bot_name).toBe('Meta-ExternalAgent'); - expect(result.$ai_bot_provider).toBe('Meta'); + expect(result.$ai_bot_name).toBe("Meta-ExternalAgent"); + expect(result.$ai_bot_provider).toBe("Meta"); }); // --- Cohere --- - it('should classify cohere-ai agent', () => { - const result = classify('cohere-ai/1.0 (https://cohere.com)'); + it("should classify cohere-ai agent", () => { + const result = classify("cohere-ai/1.0 (https://cohere.com)"); expect(result.$is_ai_bot).toBe(true); - expect(result.$ai_bot_name).toBe('cohere-ai'); - expect(result.$ai_bot_provider).toBe('Cohere'); - expect(result.$ai_bot_category).toBe('indexing'); + expect(result.$ai_bot_name).toBe("cohere-ai"); + expect(result.$ai_bot_provider).toBe("Cohere"); + expect(result.$ai_bot_category).toBe("indexing"); }); // === NEGATIVE CASES === - it('should NOT classify regular Chrome browser as AI bot', () => { - const result = classify('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'); + it("should NOT classify regular Chrome browser as AI bot", () => { + const result = classify( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + ); expect(result.$is_ai_bot).toBe(false); expect(result.$ai_bot_name).toBeUndefined(); }); - it('should NOT classify Googlebot (regular) as AI bot', () => { - const result = classify('Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'); + it("should NOT classify Googlebot (regular) as AI bot", () => { + const result = classify( + "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", + ); expect(result.$is_ai_bot).toBe(false); }); - it('should NOT classify Bingbot (regular) as AI bot', () => { - const result = classify('Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)'); + it("should NOT classify Bingbot (regular) as AI bot", () => { + const result = classify( + "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)", + ); expect(result.$is_ai_bot).toBe(false); }); - it('should NOT classify curl as AI bot', () => { - const result = classify('curl/7.64.1'); + it("should NOT classify curl as AI bot", () => { + const result = classify("curl/7.64.1"); expect(result.$is_ai_bot).toBe(false); }); - it('should handle empty user agent', () => { - const result = classify(''); + it("should handle empty user agent", () => { + const result = classify(""); expect(result.$is_ai_bot).toBe(false); }); - it('should handle undefined user agent', () => { + it("should handle undefined user agent", () => { const result = classify(undefined); expect(result.$is_ai_bot).toBe(false); }); - it('should handle null user agent', () => { + it("should handle null user agent", () => { const result = classify(null); expect(result.$is_ai_bot).toBe(false); }); // === CASE SENSITIVITY === - it('should match case-insensitively', () => { - const result = classify('Mozilla/5.0 (compatible; gptbot/1.2)'); + it("should match case-insensitively", () => { + const result = classify("Mozilla/5.0 (compatible; gptbot/1.2)"); expect(result.$is_ai_bot).toBe(true); - expect(result.$ai_bot_name).toBe('GPTBot'); + expect(result.$ai_bot_name).toBe("GPTBot"); }); // === RETURN SHAPE === - it('should return all expected fields for a match', () => { - const result = classify('GPTBot/1.2'); - expect(result).toHaveProperty('$is_ai_bot', true); - expect(result).toHaveProperty('$ai_bot_name'); - expect(result).toHaveProperty('$ai_bot_provider'); - expect(result).toHaveProperty('$ai_bot_category'); - expect(typeof result.$ai_bot_name).toBe('string'); - expect(typeof result.$ai_bot_provider).toBe('string'); - expect(['indexing', 'retrieval', 'agent']).toContain(result.$ai_bot_category); + it("should return all expected fields for a match", () => { + const result = classify("GPTBot/1.2"); + expect(result).toHaveProperty("$is_ai_bot", true); + expect(result).toHaveProperty("$ai_bot_name"); + expect(result).toHaveProperty("$ai_bot_provider"); + expect(result).toHaveProperty("$ai_bot_category"); + expect(typeof result.$ai_bot_name).toBe("string"); + expect(typeof result.$ai_bot_provider).toBe("string"); + expect(["indexing", "retrieval", "agent"]).toContain( + result.$ai_bot_category, + ); }); - it('should return only $is_ai_bot:false for non-matches', () => { - const result = classify('Mozilla/5.0 Chrome/120'); - expect(Object.keys(result)).toEqual(['$is_ai_bot']); + it("should return only $is_ai_bot:false for non-matches", () => { + const result = classify("Mozilla/5.0 Chrome/120"); + expect(Object.keys(result)).toEqual(["$is_ai_bot"]); expect(result.$is_ai_bot).toBe(false); }); }); // === BOT DATABASE === - describe('get_bot_database', () => { - it('should expose the bot database for inspection', () => { - const { get_bot_database } = require('../lib/ai_bot_classifier'); + describe("get_bot_database", () => { + it("should expose the bot database for inspection", () => { + const { get_bot_database } = require("../lib/ai_bot_classifier"); const db = get_bot_database(); expect(Array.isArray(db)).toBe(true); expect(db.length).toBeGreaterThan(0); - expect(db[0]).toHaveProperty('pattern'); - expect(db[0]).toHaveProperty('name'); - expect(db[0]).toHaveProperty('provider'); - expect(db[0]).toHaveProperty('category'); + expect(db[0]).toHaveProperty("pattern"); + expect(db[0]).toHaveProperty("name"); + expect(db[0]).toHaveProperty("provider"); + expect(db[0]).toHaveProperty("category"); }); }); // === CUSTOM BOTS === - describe('custom bot registration', () => { - it('should allow adding custom bot patterns', () => { - const { create_classifier } = require('../lib/ai_bot_classifier'); + describe("custom bot registration", () => { + it("should allow adding custom bot patterns", () => { + const { create_classifier } = require("../lib/ai_bot_classifier"); const classifier = create_classifier({ additional_bots: [ { pattern: /MyCustomBot\//i, - name: 'MyCustomBot', - provider: 'CustomCorp', - category: 'indexing' - } - ] + name: "MyCustomBot", + provider: "CustomCorp", + category: "indexing", + }, + ], }); - const result = classifier('Mozilla/5.0 (compatible; MyCustomBot/1.0)'); + const result = classifier("Mozilla/5.0 (compatible; MyCustomBot/1.0)"); expect(result.$is_ai_bot).toBe(true); - expect(result.$ai_bot_name).toBe('MyCustomBot'); + expect(result.$ai_bot_name).toBe("MyCustomBot"); }); - it('should check custom bots before built-in bots', () => { - const { create_classifier } = require('../lib/ai_bot_classifier'); + it("should check custom bots before built-in bots", () => { + const { create_classifier } = require("../lib/ai_bot_classifier"); const classifier = create_classifier({ additional_bots: [ { pattern: /GPTBot\//i, - name: 'GPTBot-Custom', - provider: 'CustomProvider', - category: 'retrieval' - } - ] + name: "GPTBot-Custom", + provider: "CustomProvider", + category: "retrieval", + }, + ], }); - const result = classifier('GPTBot/1.2'); - expect(result.$ai_bot_name).toBe('GPTBot-Custom'); + const result = classifier("GPTBot/1.2"); + expect(result.$ai_bot_name).toBe("GPTBot-Custom"); }); }); }); diff --git a/test/ai_bot_middleware.js b/test/ai_bot_middleware.js index 64d853d..a3a4173 100644 --- a/test/ai_bot_middleware.js +++ b/test/ai_bot_middleware.js @@ -1,25 +1,24 @@ // test/ai_bot_middleware.js -describe('AI Bot Middleware Integration', () => { - +describe("AI Bot Middleware Integration", () => { let Mixpanel, mixpanel; beforeEach(() => { - Mixpanel = require('../lib/mixpanel-node'); + Mixpanel = require("../lib/mixpanel-node"); }); - describe('enable_bot_classification', () => { - - it('should enrich track() calls with bot classification when $user_agent is present', () => { - const { enable_bot_classification } = require('../lib/ai_bot_middleware'); - mixpanel = Mixpanel.init('test-token'); + describe("enable_bot_classification", () => { + it("should enrich track() calls with bot classification when $user_agent is present", () => { + const { enable_bot_classification } = require("../lib/ai_bot_middleware"); + mixpanel = Mixpanel.init("test-token"); enable_bot_classification(mixpanel); - vi.spyOn(mixpanel, 'send_request'); + vi.spyOn(mixpanel, "send_request"); - mixpanel.track('page_view', { - distinct_id: 'user123', - $user_agent: 'Mozilla/5.0 (compatible; GPTBot/1.2; +https://openai.com/gptbot)' + mixpanel.track("page_view", { + distinct_id: "user123", + $user_agent: + "Mozilla/5.0 (compatible; GPTBot/1.2; +https://openai.com/gptbot)", }); expect(mixpanel.send_request).toHaveBeenCalledWith( @@ -27,39 +26,39 @@ describe('AI Bot Middleware Integration', () => { data: expect.objectContaining({ properties: expect.objectContaining({ $is_ai_bot: true, - $ai_bot_name: 'GPTBot', - $ai_bot_provider: 'OpenAI', - $ai_bot_category: 'indexing' - }) - }) + $ai_bot_name: "GPTBot", + $ai_bot_provider: "OpenAI", + $ai_bot_category: "indexing", + }), + }), }), - undefined + undefined, ); }); - it('should NOT add bot properties when $user_agent is not present', () => { - const { enable_bot_classification } = require('../lib/ai_bot_middleware'); - mixpanel = Mixpanel.init('test-token'); + it("should NOT add bot properties when $user_agent is not present", () => { + const { enable_bot_classification } = require("../lib/ai_bot_middleware"); + mixpanel = Mixpanel.init("test-token"); enable_bot_classification(mixpanel); - vi.spyOn(mixpanel, 'send_request'); + vi.spyOn(mixpanel, "send_request"); - mixpanel.track('page_view', { distinct_id: 'user123' }); + mixpanel.track("page_view", { distinct_id: "user123" }); const callData = mixpanel.send_request.mock.calls[0][0].data; expect(callData.properties.$is_ai_bot).toBeUndefined(); }); - it('should set $is_ai_bot:false when $user_agent is present but not an AI bot', () => { - const { enable_bot_classification } = require('../lib/ai_bot_middleware'); - mixpanel = Mixpanel.init('test-token'); + it("should set $is_ai_bot:false when $user_agent is present but not an AI bot", () => { + const { enable_bot_classification } = require("../lib/ai_bot_middleware"); + mixpanel = Mixpanel.init("test-token"); enable_bot_classification(mixpanel); - vi.spyOn(mixpanel, 'send_request'); + vi.spyOn(mixpanel, "send_request"); - mixpanel.track('page_view', { - distinct_id: 'user123', - $user_agent: 'Mozilla/5.0 Chrome/120.0.0.0' + mixpanel.track("page_view", { + distinct_id: "user123", + $user_agent: "Mozilla/5.0 Chrome/120.0.0.0", }); const callData = mixpanel.send_request.mock.calls[0][0].data; @@ -67,67 +66,73 @@ describe('AI Bot Middleware Integration', () => { expect(callData.properties.$ai_bot_name).toBeUndefined(); }); - it('should preserve existing properties alongside bot classification', () => { - const { enable_bot_classification } = require('../lib/ai_bot_middleware'); - mixpanel = Mixpanel.init('test-token'); + it("should preserve existing properties alongside bot classification", () => { + const { enable_bot_classification } = require("../lib/ai_bot_middleware"); + mixpanel = Mixpanel.init("test-token"); enable_bot_classification(mixpanel); - vi.spyOn(mixpanel, 'send_request'); + vi.spyOn(mixpanel, "send_request"); - mixpanel.track('page_view', { - distinct_id: 'user123', - $user_agent: 'GPTBot/1.2', - page_url: '/products', - custom_prop: 'value' + mixpanel.track("page_view", { + distinct_id: "user123", + $user_agent: "GPTBot/1.2", + page_url: "/products", + custom_prop: "value", }); const props = mixpanel.send_request.mock.calls[0][0].data.properties; - expect(props.page_url).toBe('/products'); - expect(props.custom_prop).toBe('value'); + expect(props.page_url).toBe("/products"); + expect(props.custom_prop).toBe("value"); expect(props.$is_ai_bot).toBe(true); }); - it('should preserve callback functionality', () => { - const { enable_bot_classification } = require('../lib/ai_bot_middleware'); - mixpanel = Mixpanel.init('test-token'); + it("should preserve callback functionality", () => { + const { enable_bot_classification } = require("../lib/ai_bot_middleware"); + mixpanel = Mixpanel.init("test-token"); enable_bot_classification(mixpanel); - vi.spyOn(mixpanel, 'send_request'); + vi.spyOn(mixpanel, "send_request"); const callback = vi.fn(); - mixpanel.track('page_view', { $user_agent: 'GPTBot/1.2' }, callback); + mixpanel.track("page_view", { $user_agent: "GPTBot/1.2" }, callback); expect(mixpanel.send_request).toHaveBeenCalledWith( expect.anything(), - callback + callback, ); }); - it('should support callback as second argument (no properties)', () => { - const { enable_bot_classification } = require('../lib/ai_bot_middleware'); - mixpanel = Mixpanel.init('test-token'); + it("should support callback as second argument (no properties)", () => { + const { enable_bot_classification } = require("../lib/ai_bot_middleware"); + mixpanel = Mixpanel.init("test-token"); enable_bot_classification(mixpanel); - vi.spyOn(mixpanel, 'send_request'); + vi.spyOn(mixpanel, "send_request"); const callback = vi.fn(); - mixpanel.track('page_view', callback); + mixpanel.track("page_view", callback); // When callback is passed as 2nd arg, properties should be empty // and no bot classification should be added expect(mixpanel.send_request).toHaveBeenCalled(); }); - it('should NOT enrich track_batch events (known limitation — track_batch bypasses send_event_request)', () => { - const { enable_bot_classification } = require('../lib/ai_bot_middleware'); - mixpanel = Mixpanel.init('test-token'); + it("should NOT enrich track_batch events (known limitation — track_batch bypasses send_event_request)", () => { + const { enable_bot_classification } = require("../lib/ai_bot_middleware"); + mixpanel = Mixpanel.init("test-token"); enable_bot_classification(mixpanel); - vi.spyOn(mixpanel, 'send_request'); + vi.spyOn(mixpanel, "send_request"); mixpanel.track_batch([ - { event: 'page_view', properties: { $user_agent: 'GPTBot/1.2', distinct_id: 'bot1' } }, - { event: 'page_view', properties: { $user_agent: 'Chrome/120', distinct_id: 'user1' } } + { + event: "page_view", + properties: { $user_agent: "GPTBot/1.2", distinct_id: "bot1" }, + }, + { + event: "page_view", + properties: { $user_agent: "Chrome/120", distinct_id: "user1" }, + }, ]); // track_batch goes through send_batch_requests -> send_request, NOT send_event_request @@ -137,121 +142,125 @@ describe('AI Bot Middleware Integration', () => { expect(call.data[1].properties.$is_ai_bot).toBeUndefined(); }); - it('should not modify the original properties object', () => { - const { enable_bot_classification } = require('../lib/ai_bot_middleware'); - mixpanel = Mixpanel.init('test-token'); + it("should not modify the original properties object", () => { + const { enable_bot_classification } = require("../lib/ai_bot_middleware"); + mixpanel = Mixpanel.init("test-token"); enable_bot_classification(mixpanel); - vi.spyOn(mixpanel, 'send_request'); + vi.spyOn(mixpanel, "send_request"); - const props = { distinct_id: 'user123', $user_agent: 'GPTBot/1.2' }; + const props = { distinct_id: "user123", $user_agent: "GPTBot/1.2" }; const originalKeys = Object.keys(props); - mixpanel.track('page_view', props); + mixpanel.track("page_view", props); // Original object should not have been mutated expect(Object.keys(props).sort()).toEqual(originalKeys.sort()); }); }); - describe('configuration options', () => { - - it('should accept custom user_agent_property name', () => { - const { enable_bot_classification } = require('../lib/ai_bot_middleware'); - mixpanel = Mixpanel.init('test-token'); + describe("configuration options", () => { + it("should accept custom user_agent_property name", () => { + const { enable_bot_classification } = require("../lib/ai_bot_middleware"); + mixpanel = Mixpanel.init("test-token"); enable_bot_classification(mixpanel, { - user_agent_property: 'ua_string' + user_agent_property: "ua_string", }); - vi.spyOn(mixpanel, 'send_request'); + vi.spyOn(mixpanel, "send_request"); - mixpanel.track('page_view', { - distinct_id: 'user123', - ua_string: 'GPTBot/1.2' + mixpanel.track("page_view", { + distinct_id: "user123", + ua_string: "GPTBot/1.2", }); const props = mixpanel.send_request.mock.calls[0][0].data.properties; expect(props.$is_ai_bot).toBe(true); }); - it('should accept custom property prefix', () => { - const { enable_bot_classification } = require('../lib/ai_bot_middleware'); - mixpanel = Mixpanel.init('test-token'); + it("should accept custom property prefix", () => { + const { enable_bot_classification } = require("../lib/ai_bot_middleware"); + mixpanel = Mixpanel.init("test-token"); enable_bot_classification(mixpanel, { - property_prefix: 'bot_' + property_prefix: "bot_", }); - vi.spyOn(mixpanel, 'send_request'); + vi.spyOn(mixpanel, "send_request"); - mixpanel.track('page_view', { - $user_agent: 'GPTBot/1.2' + mixpanel.track("page_view", { + $user_agent: "GPTBot/1.2", }); const props = mixpanel.send_request.mock.calls[0][0].data.properties; expect(props.bot_is_ai_bot).toBe(true); - expect(props.bot_name).toBe('GPTBot'); + expect(props.bot_name).toBe("GPTBot"); }); - it('should allow disabling classification without removing middleware', () => { - const { enable_bot_classification } = require('../lib/ai_bot_middleware'); - mixpanel = Mixpanel.init('test-token'); + it("should allow disabling classification without removing middleware", () => { + const { enable_bot_classification } = require("../lib/ai_bot_middleware"); + mixpanel = Mixpanel.init("test-token"); const controller = enable_bot_classification(mixpanel); - vi.spyOn(mixpanel, 'send_request'); + vi.spyOn(mixpanel, "send_request"); controller.disable(); - mixpanel.track('page_view', { $user_agent: 'GPTBot/1.2' }); + mixpanel.track("page_view", { $user_agent: "GPTBot/1.2" }); const props = mixpanel.send_request.mock.calls[0][0].data.properties; expect(props.$is_ai_bot).toBeUndefined(); controller.enable(); - mixpanel.track('page_view', { $user_agent: 'GPTBot/1.2' }); + mixpanel.track("page_view", { $user_agent: "GPTBot/1.2" }); const props2 = mixpanel.send_request.mock.calls[1][0].data.properties; expect(props2.$is_ai_bot).toBe(true); }); }); - describe('helper: track_request', () => { - - it('should provide a helper that extracts user-agent from HTTP request', () => { - const { enable_bot_classification, track_request } = require('../lib/ai_bot_middleware'); - mixpanel = Mixpanel.init('test-token'); + describe("helper: track_request", () => { + it("should provide a helper that extracts user-agent from HTTP request", () => { + const { + enable_bot_classification, + track_request, + } = require("../lib/ai_bot_middleware"); + mixpanel = Mixpanel.init("test-token"); enable_bot_classification(mixpanel); - vi.spyOn(mixpanel, 'send_request'); + vi.spyOn(mixpanel, "send_request"); // Simulate an Express/Node.js request object const mockReq = { headers: { - 'user-agent': 'GPTBot/1.2', - 'x-forwarded-for': '1.2.3.4' + "user-agent": "GPTBot/1.2", + "x-forwarded-for": "1.2.3.4", }, - ip: '1.2.3.4', - url: '/api/products' + ip: "1.2.3.4", + url: "/api/products", }; - track_request(mixpanel, mockReq, 'page_view', { - distinct_id: 'user123', - page_url: '/api/products' + track_request(mixpanel, mockReq, "page_view", { + distinct_id: "user123", + page_url: "/api/products", }); const props = mixpanel.send_request.mock.calls[0][0].data.properties; - expect(props.$user_agent).toBe('GPTBot/1.2'); + expect(props.$user_agent).toBe("GPTBot/1.2"); expect(props.$is_ai_bot).toBe(true); - expect(props.ip).toBe('1.2.3.4'); - expect(props.page_url).toBe('/api/products'); + expect(props.ip).toBe("1.2.3.4"); + expect(props.page_url).toBe("/api/products"); }); - it('should handle request with no user-agent header', () => { - const { enable_bot_classification, track_request } = require('../lib/ai_bot_middleware'); - mixpanel = Mixpanel.init('test-token'); + it("should handle request with no user-agent header", () => { + const { + enable_bot_classification, + track_request, + } = require("../lib/ai_bot_middleware"); + mixpanel = Mixpanel.init("test-token"); enable_bot_classification(mixpanel); - vi.spyOn(mixpanel, 'send_request'); + vi.spyOn(mixpanel, "send_request"); - const mockReq = { headers: {}, ip: '1.2.3.4' }; - track_request(mixpanel, mockReq, 'page_view', { distinct_id: 'user123' }); + const mockReq = { headers: {}, ip: "1.2.3.4" }; + track_request(mixpanel, mockReq, "page_view", { distinct_id: "user123" }); const props = mixpanel.send_request.mock.calls[0][0].data.properties; expect(props.$is_ai_bot).toBeUndefined(); From e8bd4fefd895db959da3d30447f3ac9cf84f9764 Mon Sep 17 00:00:00 2001 From: Jared McFarland Date: Wed, 18 Feb 2026 23:13:27 -0800 Subject: [PATCH 6/7] fix: replace var with let to satisfy oxlint no-var rule Fix CI lint check in AI bot middleware. --- lib/ai_bot_middleware.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/ai_bot_middleware.js b/lib/ai_bot_middleware.js index 4dcd27d..52fe7f3 100644 --- a/lib/ai_bot_middleware.js +++ b/lib/ai_bot_middleware.js @@ -34,7 +34,7 @@ function enable_bot_classification(mixpanel, options) { properties, callback, ) { - var enrichedProperties = properties; + let enrichedProperties = properties; if (enabled && properties && properties[uaProp]) { const classification = classify(properties[uaProp]); // Map classification properties with the configured prefix From 8f7cc3dd28015ea6e7a329f3e265b4fc91454404 Mon Sep 17 00:00:00 2001 From: Jared McFarland Date: Thu, 19 Feb 2026 09:39:06 -0800 Subject: [PATCH 7/7] fix: address PR review comments - Add missing $ai_bot_category assertions for 4 bot tests - Prevent mutation of input properties in track_request - Add double-wrapping guard for enable_bot_classification - Fix JSDoc comment accuracy --- lib/ai_bot_middleware.js | 15 ++++++++++----- test/ai_bot_classifier.js | 4 ++++ 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/lib/ai_bot_middleware.js b/lib/ai_bot_middleware.js index 52fe7f3..4efb16a 100644 --- a/lib/ai_bot_middleware.js +++ b/lib/ai_bot_middleware.js @@ -7,7 +7,7 @@ const { /** * Enable AI bot classification on a Mixpanel client instance. - * Wraps track() to auto-classify when $user_agent property is present. + * Wraps send_event_request to auto-classify track() and import() calls when $user_agent property is present. * * @param {Object} mixpanel - Mixpanel client from Mixpanel.init() * @param {Object} [options] @@ -17,6 +17,11 @@ const { * @returns {Object} Controller with enable()/disable() methods */ function enable_bot_classification(mixpanel, options) { + if (mixpanel._ai_bot_classification_enabled) { + return; + } + mixpanel._ai_bot_classification_enabled = true; + const opts = options || {}; const uaProp = opts.user_agent_property || "$user_agent"; const prefix = opts.property_prefix || "$"; @@ -80,19 +85,19 @@ function enable_bot_classification(mixpanel, options) { * @param {Function} [callback] - Callback */ function track_request(mixpanel, req, eventName, properties, callback) { - properties = properties || {}; + const enrichedProperties = Object.assign({}, properties || {}); const ua = req.headers && req.headers["user-agent"]; if (ua) { - properties.$user_agent = ua; + enrichedProperties.$user_agent = ua; } const ip = req.ip || (req.headers && req.headers["x-forwarded-for"]) || (req.connection && req.connection.remoteAddress); if (ip) { - properties.ip = ip; + enrichedProperties.ip = ip; } - mixpanel.track(eventName, properties, callback); + mixpanel.track(eventName, enrichedProperties, callback); } module.exports = { enable_bot_classification, track_request }; diff --git a/test/ai_bot_classifier.js b/test/ai_bot_classifier.js index 2af7791..b2e9d7f 100644 --- a/test/ai_bot_classifier.js +++ b/test/ai_bot_classifier.js @@ -93,6 +93,7 @@ describe("AiBotClassifier", () => { expect(result.$is_ai_bot).toBe(true); expect(result.$ai_bot_name).toBe("Bytespider"); expect(result.$ai_bot_provider).toBe("ByteDance"); + expect(result.$ai_bot_category).toBe("indexing"); }); // --- Common Crawl --- @@ -102,6 +103,7 @@ describe("AiBotClassifier", () => { expect(result.$is_ai_bot).toBe(true); expect(result.$ai_bot_name).toBe("CCBot"); expect(result.$ai_bot_provider).toBe("Common Crawl"); + expect(result.$ai_bot_category).toBe("indexing"); }); // --- Apple --- @@ -113,6 +115,7 @@ describe("AiBotClassifier", () => { expect(result.$is_ai_bot).toBe(true); expect(result.$ai_bot_name).toBe("Applebot-Extended"); expect(result.$ai_bot_provider).toBe("Apple"); + expect(result.$ai_bot_category).toBe("indexing"); }); // --- Meta --- @@ -124,6 +127,7 @@ describe("AiBotClassifier", () => { expect(result.$is_ai_bot).toBe(true); expect(result.$ai_bot_name).toBe("Meta-ExternalAgent"); expect(result.$ai_bot_provider).toBe("Meta"); + expect(result.$ai_bot_category).toBe("indexing"); }); // --- Cohere ---