diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 360bbfe..1530975 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -2,8 +2,12 @@ name: Auto-release on version bump # Watches package.json on main. If its version field bumps to something that # doesn't have a matching `v` tag yet, create the tag + GitHub -# Release. That fires `publish.yml` via its `release: published` trigger, -# which runs the tests, builds, and publishes to npm via OIDC. +# Release, then explicitly kick off `publish.yml` against the new tag. +# +# Why the explicit dispatch: GitHub suppresses workflow triggers for events +# created by `GITHUB_TOKEN` (anti-recursion). So a `release: published` event +# from `gh release create` does NOT fire publish.yml. `workflow_dispatch` via +# `gh workflow run` is not subject to the same suppression and runs reliably. # # Net effect: bump `package.json`, merge, and a new npm version lands without # anyone touching the CLI. Version bumps still happen manually (either in a @@ -17,6 +21,7 @@ on: permissions: contents: write + actions: write # needed to dispatch publish.yml via `gh workflow run` concurrency: group: auto-release @@ -45,7 +50,7 @@ jobs: else echo "exists=false" >> "$GITHUB_OUTPUT" fi - - name: Create GitHub Release (triggers publish.yml) + - name: Create GitHub Release if: steps.tag.outputs.exists == 'false' env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} @@ -53,3 +58,9 @@ jobs: gh release create "v${{ steps.version.outputs.value }}" \ --title "v${{ steps.version.outputs.value }}" \ --generate-notes + - name: Trigger npm publish for the new tag + if: steps.tag.outputs.exists == 'false' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh workflow run publish.yml --ref "v${{ steps.version.outputs.value }}" diff --git a/README.md b/README.md index bdbf7ba..d362db4 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,6 @@ One line of middleware. Fire-and-forget. Zero impact on your response latency. E "$current_url": "https://example.com/docs/intro", "path": "/docs/intro", "method": "GET", - "country_code": "NL", // x-vercel-ip-country / cf-ipcountry / x-country-code "user_agent": "ClaudeBot/1.0 (+https://claude.ai/bot)", "is_ai_bot": true, // strict: matches a branded AI crawler "bot_name": "Claude", // 'Claude' | 'ChatGPT' | ... | 'curl' | 'axios' | 'Electron' | 'Browser' | 'Other' @@ -307,6 +306,50 @@ Full middleware example: [`README.md → Markdown mirror helpers`](./README.md#m --- +## Advanced: Peec.ai crawl-insights export + +[Peec.ai](https://peec.ai)'s **Agent analytics** product ingests a CSV/CLF access log and produces dashboards on top of it. The Peec docs assume you have a Vercel Log Drain → Axiom (or similar) pipeline that emits these eight columns: `timestamp, request_method, request_url, response_status, client_ip, user_agent, country_code, referer`. + +If you're already running this library, **you can skip the log drain** — your PostHog `agent_visit` events are a near-superset of that schema. Opt into the two privacy-sensitive fields: + +```ts +void trackVisit(req, { + analytics, + captureCountry: true, // emits country_code from x-vercel-ip-country / cf-ipcountry / x-country-code + captureIp: true // emits raw client_ip (first hop of x-forwarded-for) +}) +``` + +Both default to **off** so the library stays PII-free out of the box. Enable them only on the deployments you intend to export. + +Then export from PostHog with a SQL insight: + +```sql +SELECT + timestamp AS timestamp, + coalesce(properties.method, 'GET') AS request_method, + properties.$current_url AS request_url, + '200' AS response_status, -- middleware runs pre-response + coalesce(properties.client_ip, properties.$ip) AS client_ip, + properties.user_agent AS user_agent, + coalesce(properties.country_code, + properties.$geoip_country_code) AS country_code, + properties.referer AS referer +FROM events +WHERE event = 'agent_visit' + AND properties.is_ai_bot = true + AND timestamp >= now() - INTERVAL 30 DAY +ORDER BY timestamp DESC +``` + +`coalesce` makes the query work on historical events that predate the new fields and on events where `captureCountry` / `captureIp` are off (PostHog's built-in `$ip` and `$geoip_country_code` enrichment fills the gap). Click **Export → CSV** and upload to Peec. + +**Caveats:** +- `response_status` is hardcoded `200` — middleware runs before the response. If Peec filters on status, use the Vercel Log Drain path instead. +- Drop `is_ai_bot = true` from the `WHERE` clause to also include coding-agent / scraper traffic (curl, axios, headless browsers). + +--- + ## Compared to… diff --git a/src/track.ts b/src/track.ts index a84e890..4fc6687 100644 --- a/src/track.ts +++ b/src/track.ts @@ -42,11 +42,12 @@ export async function trackVisit( const forwardedFor = req.headers.get('x-forwarded-for') || '' const ip = forwardedFor.split(',')[0]?.trim() ?? '' const referer = req.headers.get('referer') - const country = - req.headers.get('x-vercel-ip-country') || - req.headers.get('cf-ipcountry') || - req.headers.get('x-country-code') || - null + const country = opts.captureCountry + ? req.headers.get('x-vercel-ip-country') || + req.headers.get('cf-ipcountry') || + req.headers.get('x-country-code') || + null + : null const classification = classifyRequest(req) const event = { @@ -58,7 +59,7 @@ export async function trackVisit( $current_url: origin ? `${origin}${pathname}` : pathname, path: pathname, method: req.method, - country_code: country, + ...(opts.captureCountry ? { country_code: country } : {}), ...(opts.captureIp ? { client_ip: ip || null } : {}), user_agent: userAgent, is_ai_bot: classification.isAiBot, diff --git a/src/types.ts b/src/types.ts index 7899b0a..69f3799 100644 --- a/src/types.ts +++ b/src/types.ts @@ -50,4 +50,11 @@ export interface TrackVisitOptions { * crawl-insights CSV) and carries privacy implications. */ captureIp?: boolean + /** + * When `true`, emit `country_code` derived from `x-vercel-ip-country`, + * `cf-ipcountry`, or `x-country-code`. Off by default to keep the event + * payload PII-free — coarse country is low-risk but still user-derived. + * Enable for log-style exports (e.g. Peec.ai's crawl-insights CSV). + */ + captureCountry?: boolean } diff --git a/test/track.test.ts b/test/track.test.ts index fee836e..653651b 100644 --- a/test/track.test.ts +++ b/test/track.test.ts @@ -283,7 +283,7 @@ describe('trackVisit', () => { expect(a.distinctId).not.toBe(b.distinctId) }) - it('captures method, country_code, and omits client_ip by default', async () => { + it('captures method by default and omits country_code/client_ip', async () => { const spy = vi.fn() await trackVisit( new Request('https://example.com/page', { @@ -298,10 +298,23 @@ describe('trackVisit', () => { ) const event = spy.mock.calls[0]![0] as CaptureEvent expect(event.properties.method).toBe('POST') - expect(event.properties.country_code).toBe('NL') + expect(event.properties).not.toHaveProperty('country_code') expect(event.properties).not.toHaveProperty('client_ip') }) + it('emits country_code from x-vercel-ip-country when captureCountry is true', async () => { + const spy = vi.fn() + await trackVisit( + makeRequest('https://example.com/page', { + 'user-agent': 'ClaudeBot', + 'x-vercel-ip-country': 'NL' + }), + { analytics: customAnalytics(spy), captureCountry: true } + ) + const event = spy.mock.calls[0]![0] as CaptureEvent + expect(event.properties.country_code).toBe('NL') + }) + it('falls back to cf-ipcountry for country_code', async () => { const spy = vi.fn() await trackVisit( @@ -309,7 +322,7 @@ describe('trackVisit', () => { 'user-agent': 'ClaudeBot', 'cf-ipcountry': 'US' }), - { analytics: customAnalytics(spy) } + { analytics: customAnalytics(spy), captureCountry: true } ) const event = spy.mock.calls[0]![0] as CaptureEvent expect(event.properties.country_code).toBe('US')