From 63cfc8725215baea94fc2400f4d98ff96caf5ec2 Mon Sep 17 00:00:00 2001 From: Menci Date: Sat, 20 Jun 2026 03:57:00 +0800 Subject: [PATCH 1/6] feat(protocols,gateway): tier-aware pricing overlay (ModelPricing.tiers + TokenUsage.tier + resolveEffectivePricing) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Captures the per-service-tier dimension that bare ModelPricing misses: distinct service tiers for the same model (Anthropic standard / fast, OpenAI default / priority / flex / scale) are priced at different rates, and the gateway needs to surface that distinction in the cost aggregate. - `ModelPricing.tiers` carries per-service-tier overrides, keyed by the wire-value the upstream stamps on the usage object. `resolveEffectivePricing` folds a tier override into a flat snapshot before any unit-price lookup, so every downstream `unitPriceForDimension` call sees one self-contained map. - `UsageRecord` and `TokenUsage` grow a `tier` slot; the usage tables key buckets on (keyId, model, upstream, modelKey, hour, tier) so distinct tiers aggregate as separate buckets with distinct unit-price snapshots. Existing rows with `tier = NULL` keep computing identically to before (the resolver treats null as base pricing and returns the snapshot sans the `tiers` key). - `recordTokenUsage` threads the tier from the parsed `TokenUsage` onto the bucket so cost compute applies the right override; `tokenUsage`'s zero-dimension filter passes `tier` through verbatim. - Control-plane export / import surfaces the tier alongside the other bucket-identity fields; missing tier defaults to null on import. - Provider config parsers iterate `BILLING_DIMENSIONS` directly instead of a hand-rolled `keyof ModelPricing` list — the latter would now include `tiers` and admit a non-numeric value into `pricing[dimension]`. Schema: the SQL repo writes the tier column directly; depends on the sibling migration adding `tier` to `usage` + `usage_requests`. 
--- packages/gateway/src/app-control_test.ts | 5 ++ .../src/control-plane/data-transfer/routes.ts | 7 +++ .../data-transfer/routes_test.ts | 2 + .../control-plane/token-usage/aggregate.ts | 5 +- .../token-usage/aggregate_test.ts | 28 ++++++++++ .../control-plane/token-usage/routes_test.ts | 1 + .../src/data-plane/shared/telemetry/usage.ts | 16 +++++- packages/gateway/src/repo/memory.ts | 13 +++-- packages/gateway/src/repo/sql.ts | 48 ++++++++-------- packages/gateway/src/repo/types.ts | 33 ++++++++--- packages/protocols/src/common/models.ts | 24 +++++++- packages/protocols/src/common/models_test.ts | 56 ++++++++++++++++++- 12 files changed, 198 insertions(+), 40 deletions(-) diff --git a/packages/gateway/src/app-control_test.ts b/packages/gateway/src/app-control_test.ts index e97456c4c..dec01ede8 100644 --- a/packages/gateway/src/app-control_test.ts +++ b/packages/gateway/src/app-control_test.ts @@ -108,6 +108,7 @@ test('/api/token-usage scopes to the actor\'s keys when called with an API key', upstream: null, modelKey: 'claude-sonnet-4', hour: '2026-03-15T10', + tier: null, requests: 2, tokens: { input: 10, output: 5, input_cache_read: 4, input_cache_write: 1 }, cost: null, @@ -118,6 +119,7 @@ test('/api/token-usage scopes to the actor\'s keys when called with an API key', upstream: null, modelKey: 'gpt-5', hour: '2026-03-15T11', + tier: null, requests: 1, tokens: { input: 20, output: 8, input_cache_read: 6, input_cache_write: 2 }, cost: null, @@ -155,6 +157,7 @@ test('/api/token-usage in self-by-key mode includes per-key metadata for the act upstream: null, modelKey: 'gpt-5', hour: '2026-03-16T10', + tier: null, requests: 1, tokens: { input: 20, output: 8 }, cost: null, @@ -182,6 +185,7 @@ test('/api/token-usage all-by-user view aggregates across keys per user', async upstream: null, modelKey: 'gpt-5', hour: '2026-03-15T10', + tier: null, requests: 1, tokens: { input: 10, output: 5 }, cost: null, @@ -213,6 +217,7 @@ test('/api/token-usage merges Claude variants 
into backend base model records', keyId: apiKey.id, hour: '2026-03-17T10', upstream: 'copilot:1', + tier: null, requests: 1, tokens: { input: 10, output: 5, input_cache_read: 2, input_cache_write: 1 }, }; diff --git a/packages/gateway/src/control-plane/data-transfer/routes.ts b/packages/gateway/src/control-plane/data-transfer/routes.ts index 0a57f7781..780a0318d 100644 --- a/packages/gateway/src/control-plane/data-transfer/routes.ts +++ b/packages/gateway/src/control-plane/data-transfer/routes.ts @@ -402,6 +402,12 @@ const parseUsageRecords = (value: unknown): { type: 'ok'; records: UsageRecord[] if (typeof record.upstream === 'string' && isLegacyUpstreamIdentity(record.upstream)) { return { type: 'invalid', index: i, error: 'upstream must use a raw upstream id, not a legacy provider-prefixed identity' }; } + if (record.tier !== undefined && record.tier !== null && typeof record.tier !== 'string') { + return { type: 'invalid', index: i, error: 'record has invalid tier (must be a string or null)' }; + } + // `tier` is absent on exports taken before the column existed; collapse + // the absent and explicit-null cases into the same wire value. + const tier: string | null = typeof record.tier === 'string' ? 
record.tier : null; const tokensResult = parseImportedTokens(record.tokens); if (tokensResult.type === 'invalid') return { type: 'invalid', index: i, error: 'record has invalid token dimension counts' }; const costResult = parseImportedCost(record.cost); @@ -412,6 +418,7 @@ const parseUsageRecords = (value: unknown): { type: 'ok'; records: UsageRecord[] upstream: record.upstream as string | null, modelKey: record.modelKey, hour: record.hour, + tier, requests: record.requests, tokens: tokensResult.tokens, cost: costResult.cost, diff --git a/packages/gateway/src/control-plane/data-transfer/routes_test.ts b/packages/gateway/src/control-plane/data-transfer/routes_test.ts index 70f108c03..d43d7a975 100644 --- a/packages/gateway/src/control-plane/data-transfer/routes_test.ts +++ b/packages/gateway/src/control-plane/data-transfer/routes_test.ts @@ -177,6 +177,7 @@ const USAGE_1: UsageRecord = { upstream: 'up_copilot_a', modelKey: 'claude-opus-4.7', hour: '2026-01-01T10', + tier: null, requests: 5, tokens: { input: 1000, output: 500, input_cache_read: 120, input_cache_write: 80 }, cost: null, @@ -188,6 +189,7 @@ const USAGE_2: UsageRecord = { upstream: 'up_azure_a', modelKey: 'gpt-prod', hour: '2026-01-01T11', + tier: null, requests: 3, tokens: { input: 2000, output: 800, input_cache_read: 200, input_cache_write: 50 }, cost: null, diff --git a/packages/gateway/src/control-plane/token-usage/aggregate.ts b/packages/gateway/src/control-plane/token-usage/aggregate.ts index 2ba67ff74..3b0009805 100644 --- a/packages/gateway/src/control-plane/token-usage/aggregate.ts +++ b/packages/gateway/src/control-plane/token-usage/aggregate.ts @@ -22,7 +22,10 @@ export interface DisplayUsageByUserRecord { // Cost is pure addition over the dimension rows: Σ tokens × unit_price / 1e6. // No subtraction is needed because the counts are disjoint and each dimension -// already carries its own resolved unit price snapshot. +// already carries its own resolved unit price snapshot. 
`record.cost` here +// is the per-row reconstruction of the per-dimension `unit_price` columns +// the repo writer already folded the bucket's tier into — so the dimension +// lookup is a direct hit, no tier resolution needed at read time. const recordCostUsd = (record: UsageRecord): number => { let total = 0; for (const dimension of BILLING_DIMENSIONS) { diff --git a/packages/gateway/src/control-plane/token-usage/aggregate_test.ts b/packages/gateway/src/control-plane/token-usage/aggregate_test.ts index 04d62e46a..b16cbac66 100644 --- a/packages/gateway/src/control-plane/token-usage/aggregate_test.ts +++ b/packages/gateway/src/control-plane/token-usage/aggregate_test.ts @@ -14,6 +14,7 @@ const baseRecord = (overrides: Partial<UsageRecord>): UsageRecord => ({ model: 'claude-opus-4-7', upstream: 'up_copilot', modelKey: 'claude-opus-4-7', + tier: null, requests: 1, tokens: { input: 100, output: 50 }, cost: opus47Pricing, @@ -83,3 +84,30 @@ test('aggregateUsageForDisplay charges image dimensions separately', () => { // 10 + 5 + 40 + 30 = $85. assertAlmostEquals(out[0].cost, 85, 1e-9); }); + +test('aggregateUsageForDisplay reads unit prices from the already-folded cost the repo writer hands back', () => { + // The repo write path (`repo/sql.ts:dimensionRows`, `repo/memory.ts:dimensionEntries`) + // resolves the bucket's tier into per-dimension unit prices BEFORE storing, + // so by the time aggregate sees a UsageRecord the `cost` field is already + // the effective pricing for that bucket's tier and tier resolution is a + // no-op. The records below (one `fast`, one base tier) model the post-write shape. + // Opus 4.8: standard $5 / $25, fast $10 / $50. 
+ const fastRow = baseRecord({ + tier: 'fast', + cost: { input: 10, output: 50 }, + tokens: { input: 1_000_000, output: 1_000_000 }, + }); + const standardRow = baseRecord({ + tier: null, + cost: { input: 5, output: 25 }, + tokens: { input: 1_000_000, output: 1_000_000 }, + }); + + const fastOut = aggregateUsageForDisplay([fastRow]); + // 1M * $10 + 1M * $50 = $60. + assertAlmostEquals(fastOut[0].cost, 60, 1e-9); + + const standardOut = aggregateUsageForDisplay([standardRow]); + // 1M * $5 + 1M * $25 = $30. + assertAlmostEquals(standardOut[0].cost, 30, 1e-9); +}); diff --git a/packages/gateway/src/control-plane/token-usage/routes_test.ts b/packages/gateway/src/control-plane/token-usage/routes_test.ts index b911e2011..51c5ed85e 100644 --- a/packages/gateway/src/control-plane/token-usage/routes_test.ts +++ b/packages/gateway/src/control-plane/token-usage/routes_test.ts @@ -16,6 +16,7 @@ const seedUsage = async ( upstream: 'up_test', modelKey: model, hour, + tier: null, requests, tokens: { input: 100, output: 50 }, cost: null, diff --git a/packages/gateway/src/data-plane/shared/telemetry/usage.ts b/packages/gateway/src/data-plane/shared/telemetry/usage.ts index 99f232f47..7b32e8ad8 100644 --- a/packages/gateway/src/data-plane/shared/telemetry/usage.ts +++ b/packages/gateway/src/data-plane/shared/telemetry/usage.ts @@ -6,14 +6,27 @@ import type { TelemetryModelIdentity } from '@floway-dev/provider'; export const hasTokenUsage = (usage: TokenUsage): boolean => BILLING_DIMENSIONS.some(dimension => (usage[dimension] ?? 0) > 0); +// Map an upstream-reported service tier onto the tier marker the gateway +// stores on the usage row. `default` and `auto` (OpenAI's response-side base +// values) and `standard` (Anthropic's response-side base value) all denote +// base pricing and collapse to null so they aggregate with rows that carry +// no tier at all. 
+// https://developers.openai.com/api/docs/guides/priority-processing +// https://docs.claude.com/en/api/service-tiers +// https://docs.claude.com/en/build-with-claude/fast-mode +export const billableServiceTier = (tier: string | null | undefined): string | null => + tier != null && tier !== 'default' && tier !== 'auto' && tier !== 'standard' ? tier : null; + // Drop zero / undefined dimensions so a usage map only carries the dimensions -// actually billed. +// actually billed. `tier` (a non-numeric service-tier marker) survives the +// filter so per-tier pricing overrides resolve at recording time. export const tokenUsage = (counts: TokenUsage): TokenUsage => { const out: TokenUsage = {}; for (const dimension of BILLING_DIMENSIONS) { const value = counts[dimension] ?? 0; if (value > 0) out[dimension] = value; } + if (counts.tier != null) out.tier = counts.tier; return out; }; @@ -82,6 +95,7 @@ export const recordTokenUsage = async (keyId: string, modelIdentity: TelemetryMo upstream: modelIdentity.upstream, modelKey: modelIdentity.modelKey, hour: currentHour(), + tier: usage.tier ?? 
null, requests: 1, tokens: usage, cost: modelIdentity.cost, diff --git a/packages/gateway/src/repo/memory.ts b/packages/gateway/src/repo/memory.ts index 81bbd8e45..89088e84d 100644 --- a/packages/gateway/src/repo/memory.ts +++ b/packages/gateway/src/repo/memory.ts @@ -43,7 +43,7 @@ import { serializeStoredState } from './upstream-json.ts'; import { latencyBucketForMs } from '../shared/performance-histogram.ts'; import { generateSessionToken } from '../shared/session-tokens.ts'; import { assertWebSearchProviderName } from '../shared/web-search-providers.ts'; -import { BILLING_DIMENSIONS, type BillingDimension, type ModelPricing, unitPriceForDimension } from '@floway-dev/protocols/common'; +import { BILLING_DIMENSIONS, type BillingDimension, type ModelPricing, resolveEffectivePricing, unitPriceForDimension } from '@floway-dev/protocols/common'; import type { UpstreamModel, UpstreamRecord } from '@floway-dev/provider'; const SEED_ADMIN_USER: User = { @@ -230,6 +230,7 @@ interface UsageBucketIdentity { upstream: string | null; modelKey: string; hour: string; + tier: string | null; } interface UsageBucketState extends UsageBucketIdentity { @@ -242,13 +243,14 @@ class MemoryUsageRepo implements UsageRepo { private store = new Map(); private key(r: UsageBucketIdentity): string { - return [r.keyId, r.model, r.upstream ?? '', r.modelKey, r.hour].join('\0'); + return [r.keyId, r.model, r.upstream ?? '', r.modelKey, r.hour, r.tier ?? ''].join('\0'); } private dimensionEntries(record: UsageRecord): { dimension: BillingDimension; tokens: number; unitPrice: number | null }[] { + const effective = resolveEffectivePricing(record.cost, record.tier); return BILLING_DIMENSIONS.flatMap(dimension => { const tokens = record.tokens[dimension] ?? 0; - return tokens > 0 ? [{ dimension, tokens, unitPrice: unitPriceForDimension(record.cost, dimension) }] : []; + return tokens > 0 ? 
[{ dimension, tokens, unitPrice: unitPriceForDimension(effective, dimension) }] : []; }); } @@ -261,14 +263,14 @@ class MemoryUsageRepo implements UsageRepo { const unitPrice = state.unitPrices[dimension]; if (unitPrice !== undefined) (cost ??= {})[dimension] = unitPrice; } - return { keyId: state.keyId, model: state.model, upstream: state.upstream ?? null, modelKey: state.modelKey, hour: state.hour, requests: state.requests, tokens, cost }; + return { keyId: state.keyId, model: state.model, upstream: state.upstream ?? null, modelKey: state.modelKey, hour: state.hour, tier: state.tier, requests: state.requests, tokens, cost }; } private bucket(record: UsageRecord): UsageBucketState { const k = this.key(record); let state = this.store.get(k); if (!state) { - state = { keyId: record.keyId, model: record.model, upstream: record.upstream ?? null, modelKey: record.modelKey, hour: record.hour, tokens: {}, unitPrices: {}, requests: 0 }; + state = { keyId: record.keyId, model: record.model, upstream: record.upstream ?? null, modelKey: record.modelKey, hour: record.hour, tier: record.tier, tokens: {}, unitPrices: {}, requests: 0 }; this.store.set(k, state); } return state; @@ -308,6 +310,7 @@ class MemoryUsageRepo implements UsageRepo { upstream: record.upstream ?? 
null, modelKey: record.modelKey, hour: record.hour, + tier: record.tier, tokens: {}, unitPrices: {}, requests: record.requests, diff --git a/packages/gateway/src/repo/sql.ts b/packages/gateway/src/repo/sql.ts index f14d31337..933ca121c 100644 --- a/packages/gateway/src/repo/sql.ts +++ b/packages/gateway/src/repo/sql.ts @@ -39,7 +39,7 @@ import { latencyBucketForMs } from '../shared/performance-histogram.ts'; import { generateSessionToken } from '../shared/session-tokens.ts'; import { assertWebSearchProviderName } from '../shared/web-search-providers.ts'; import type { SqlDatabase, SqlPreparedStatement, SqlResult } from '@floway-dev/platform'; -import { BILLING_DIMENSIONS, type BillingDimension, type ModelPricing, unitPriceForDimension } from '@floway-dev/protocols/common'; +import { BILLING_DIMENSIONS, type BillingDimension, type ModelPricing, resolveEffectivePricing, unitPriceForDimension } from '@floway-dev/protocols/common'; import type { ProxyFallbackEntry, UpstreamModel, UpstreamProviderKind, UpstreamRecord } from '@floway-dev/provider'; const runStatements = async (db: SqlDatabase, statements: SqlPreparedStatement[]): Promise => { @@ -369,11 +369,13 @@ class SqlSessionsRepo implements SessionsRepo { } } -const dimensionRows = (record: UsageRecord): { dimension: BillingDimension; tokens: number; unitPrice: number | null }[] => - BILLING_DIMENSIONS.flatMap(dimension => { +const dimensionRows = (record: UsageRecord): { dimension: BillingDimension; tokens: number; unitPrice: number | null }[] => { + const effective = resolveEffectivePricing(record.cost, record.tier); + return BILLING_DIMENSIONS.flatMap(dimension => { const tokens = record.tokens[dimension] ?? 0; - return tokens > 0 ? [{ dimension, tokens, unitPrice: unitPriceForDimension(record.cost, dimension) }] : []; + return tokens > 0 ? 
[{ dimension, tokens, unitPrice: unitPriceForDimension(effective, dimension) }] : []; }); +}; class SqlUsageRepo implements UsageRepo { constructor(private db: SqlDatabase) {} @@ -383,19 +385,19 @@ class SqlUsageRepo implements UsageRepo { const statements: SqlPreparedStatement[] = dimensionRows(record).map(row => this.db .prepare( - `INSERT INTO usage (key_id, model, upstream, model_key, hour, dimension, tokens, unit_price) VALUES (?, ?, ?, ?, ?, ?, ?, ?) + `INSERT INTO usage (key_id, model, upstream, model_key, hour, tier, dimension, tokens, unit_price) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT DO UPDATE SET tokens = tokens + excluded.tokens, unit_price = COALESCE(unit_price, excluded.unit_price)`, ) - .bind(record.keyId, record.model, upstream, record.modelKey, record.hour, row.dimension, row.tokens, row.unitPrice)); + .bind(record.keyId, record.model, upstream, record.modelKey, record.hour, record.tier, row.dimension, row.tokens, row.unitPrice)); statements.push( this.db .prepare( - `INSERT INTO usage_requests (key_id, model, upstream, model_key, hour, requests) VALUES (?, ?, ?, ?, ?, ?) + `INSERT INTO usage_requests (key_id, model, upstream, model_key, hour, tier, requests) VALUES (?, ?, ?, ?, ?, ?, ?) ON CONFLICT DO UPDATE SET requests = requests + excluded.requests`, ) - .bind(record.keyId, record.model, upstream, record.modelKey, record.hour, record.requests), + .bind(record.keyId, record.model, upstream, record.modelKey, record.hour, record.tier, record.requests), ); await runStatements(this.db, statements); } @@ -405,11 +407,11 @@ class SqlUsageRepo implements UsageRepo { const binds = opts.keyId ? 
[opts.keyId, opts.start, opts.end] : [opts.start, opts.end]; const [{ results: dimensions }, { results: requests }] = await Promise.all([ this.db - .prepare(`SELECT key_id, model, upstream, model_key, hour, dimension, tokens, unit_price FROM usage WHERE ${dimensionWhere}`) + .prepare(`SELECT key_id, model, upstream, model_key, hour, tier, dimension, tokens, unit_price FROM usage WHERE ${dimensionWhere}`) .bind(...binds) .all(), this.db - .prepare(`SELECT key_id, model, upstream, model_key, hour, requests FROM usage_requests WHERE ${dimensionWhere}`) + .prepare(`SELECT key_id, model, upstream, model_key, hour, tier, requests FROM usage_requests WHERE ${dimensionWhere}`) .bind(...binds) .all(), ]); @@ -418,8 +420,8 @@ class SqlUsageRepo implements UsageRepo { async listAll(): Promise { const [{ results: dimensions }, { results: requests }] = await Promise.all([ - this.db.prepare('SELECT key_id, model, upstream, model_key, hour, dimension, tokens, unit_price FROM usage').all(), - this.db.prepare('SELECT key_id, model, upstream, model_key, hour, requests FROM usage_requests').all(), + this.db.prepare('SELECT key_id, model, upstream, model_key, hour, tier, dimension, tokens, unit_price FROM usage').all(), + this.db.prepare('SELECT key_id, model, upstream, model_key, hour, tier, requests FROM usage_requests').all(), ]); return assembleUsageRecords(dimensions, requests); } @@ -430,20 +432,20 @@ class SqlUsageRepo implements UsageRepo { // dimensions absent from the new record do not linger. const statements: SqlPreparedStatement[] = [ this.db - .prepare("DELETE FROM usage WHERE key_id = ? AND model = ? AND COALESCE(upstream, '') = COALESCE(?, '') AND model_key = ? AND hour = ?") - .bind(record.keyId, record.model, upstream, record.modelKey, record.hour), + .prepare("DELETE FROM usage WHERE key_id = ? AND model = ? AND COALESCE(upstream, '') = COALESCE(?, '') AND model_key = ? AND hour = ? 
AND COALESCE(tier, '') = COALESCE(?, '')") + .bind(record.keyId, record.model, upstream, record.modelKey, record.hour, record.tier), ...dimensionRows(record).map(row => this.db - .prepare('INSERT INTO usage (key_id, model, upstream, model_key, hour, dimension, tokens, unit_price) VALUES (?, ?, ?, ?, ?, ?, ?, ?)') - .bind(record.keyId, record.model, upstream, record.modelKey, record.hour, row.dimension, row.tokens, row.unitPrice)), + .prepare('INSERT INTO usage (key_id, model, upstream, model_key, hour, tier, dimension, tokens, unit_price) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)') + .bind(record.keyId, record.model, upstream, record.modelKey, record.hour, record.tier, row.dimension, row.tokens, row.unitPrice)), ]; statements.push( this.db .prepare( - `INSERT INTO usage_requests (key_id, model, upstream, model_key, hour, requests) VALUES (?, ?, ?, ?, ?, ?) + `INSERT INTO usage_requests (key_id, model, upstream, model_key, hour, tier, requests) VALUES (?, ?, ?, ?, ?, ?, ?) ON CONFLICT DO UPDATE SET requests = excluded.requests`, ) - .bind(record.keyId, record.model, upstream, record.modelKey, record.hour, record.requests), + .bind(record.keyId, record.model, upstream, record.modelKey, record.hour, record.tier, record.requests), ); await runStatements(this.db, statements); } @@ -459,6 +461,7 @@ interface UsageDimensionRow { upstream: string | null; model_key: string; hour: string; + tier: string | null; dimension: string; tokens: number; unit_price: number | null; @@ -470,11 +473,12 @@ interface UsageRequestRow { upstream: string | null; model_key: string; hour: string; + tier: string | null; requests: number; } -const usageBucketKey = (row: { key_id: string; model: string; upstream: string | null; model_key: string; hour: string }): string => - [row.key_id, row.model, row.upstream ?? 
'', row.model_key, row.hour].join('\0'); +const usageBucketKey = (row: { key_id: string; model: string; upstream: string | null; model_key: string; hour: string; tier: string | null }): string => + [row.key_id, row.model, row.upstream ?? '', row.model_key, row.hour, row.tier ?? ''].join('\0'); // Reassemble per-bucket UsageRecords from the two narrow tables. The dimension // rows carry the disjoint counts and the per-dimension unit_price snapshot, @@ -483,11 +487,11 @@ const usageBucketKey = (row: { key_id: string; model: string; upstream: string | const assembleUsageRecords = (dimensions: readonly UsageDimensionRow[], requests: readonly UsageRequestRow[]): UsageRecord[] => { const byBucket = new Map(); - const ensureRecord = (row: { key_id: string; model: string; upstream: string | null; model_key: string; hour: string }): UsageRecord => { + const ensureRecord = (row: { key_id: string; model: string; upstream: string | null; model_key: string; hour: string; tier: string | null }): UsageRecord => { const key = usageBucketKey(row); let record = byBucket.get(key); if (!record) { - record = { keyId: row.key_id, model: row.model, upstream: row.upstream, modelKey: row.model_key, hour: row.hour, requests: 0, tokens: {}, cost: null }; + record = { keyId: row.key_id, model: row.model, upstream: row.upstream, modelKey: row.model_key, hour: row.hour, tier: row.tier, requests: 0, tokens: {}, cost: null }; byBucket.set(key, record); } return record; diff --git a/packages/gateway/src/repo/types.ts b/packages/gateway/src/repo/types.ts index 6c002deba..5df3ef9dd 100644 --- a/packages/gateway/src/repo/types.ts +++ b/packages/gateway/src/repo/types.ts @@ -43,19 +43,34 @@ export interface UsageRecord { upstream: string | null; modelKey: string; hour: string; + // Service tier the upstream stamped on this bucket (Anthropic `speed`, + // OpenAI `service_tier`). null = the base / default tier. 
Distinct tiers + // for the same (keyId, model, upstream, modelKey, hour) are stored as + // separate buckets so per-tier pricing overrides apply correctly. + tier: string | null; requests: number; - // Disjoint per-dimension token counts for this bucket (see TokenUsage). - tokens: TokenUsage; + // Disjoint per-dimension token counts for this bucket. The tier the bucket + // was stamped under lives on the `tier` field above — do not encode it + // inside this map. + tokens: Partial<Record<BillingDimension, number>>; // Pricing snapshot taken at write time. null means the provider did not // resolve pricing for this model (Custom upstreams, unknown Copilot // public id, etc.). The repo derives per-dimension unit prices from it via - // unitPriceForDimension; aggregation treats a null snapshot as cost 0. + // unitPriceForDimension after `resolveEffectivePricing(cost, tier)` folds + // in the bucket's tier override; aggregation treats a null snapshot as + // cost 0. cost: ModelPricing | null; } // Disjoint per-dimension token counts. Absent keys mean zero for that -// dimension. No key's count overlaps another's. -export type TokenUsage = Partial<Record<BillingDimension, number>>; +// dimension. No key's count overlaps another's. `tier` is the upstream- +// reported service-tier marker (Anthropic `usage.speed`, OpenAI's +// top-level `service_tier`) that selects an override against `cost.tiers` +// before any per-dimension unit-price lookup; absent / null = the model's +// base pricing applies. +export interface TokenUsage extends Partial<Record<BillingDimension, number>> { + tier?: string | null; +} export type SearchUsageAction = 'search' | 'fetch_page'; @@ -137,10 +152,10 @@ export interface SessionsRepo { } export interface UsageRepo { - // Additive upsert: on (keyId, model, upstream, modelKey, hour) conflict, - // token counts are summed. cost is COALESCED — the first write within a - // bucket establishes the pricing snapshot for that row, later writes that - // share the bucket keep the original snapshot. 
+ // Additive upsert: on (keyId, model, upstream, modelKey, hour, tier) + // conflict, token counts are summed. cost is COALESCED — the first write + // within a bucket establishes the pricing snapshot for that row, later + // writes that share the bucket keep the original snapshot. record(record: UsageRecord): Promise; query(opts: { keyId?: string; start: string; end: string }): Promise; listAll(): Promise; diff --git a/packages/protocols/src/common/models.ts b/packages/protocols/src/common/models.ts index a26fe8f7b..896733ffd 100644 --- a/packages/protocols/src/common/models.ts +++ b/packages/protocols/src/common/models.ts @@ -23,7 +23,14 @@ export const BILLING_DIMENSIONS: readonly BillingDimension[] = ['input', 'input_ // Keys are billing dimensions: bare `input`/`output` are the text/fallback rate // and `_image` keys are the image modality. Every key is optional; an absent key // falls back per `unitPriceForDimension` (modality → bare, cached → uncached). -export type ModelPricing = Partial>; +// +// `tiers` carries per-request service-tier overrides (Anthropic fast mode, +// OpenAI priority/flex). Each tier key is the wire-value the upstream stamps +// on the usage object (`fast`, `priority`, `flex`, ...). Resolve through +// `resolveEffectivePricing(pricing, usage.tier)` before any unit-price lookup. +export interface ModelPricing extends Partial> { + tiers?: Record>>; +} // Resolve the USD-per-million-tokens unit price for one dimension against a // pricing snapshot, applying the LiteLLM-style fallback chain: a modality with @@ -52,6 +59,21 @@ export const unitPriceForDimension = (pricing: ModelPricing | null, dimension: B } }; +// Fold the per-tier override (if any) into a flat ModelPricing snapshot, so +// every downstream `unitPriceForDimension` call sees one self-contained map. +// Per-dimension shallow merge: overlay keys win, omitted keys inherit the +// base rate (and then flow through `unitPriceForDimension`'s fallback chain). 
+// Returns a fresh object that never carries `tiers` — recursion would not +// match any real billing surface. An unknown or absent tier returns the base +// snapshot unchanged (sans `tiers`), so old usage rows with no tier carry on +// pricing identically to before. +export const resolveEffectivePricing = (pricing: ModelPricing | null, tier: string | null | undefined): ModelPricing | null => { + if (!pricing) return null; + const { tiers, ...base } = pricing; + const override = tier != null ? tiers?.[tier] : undefined; + return override ? { ...base, ...override } : base; +}; + // High-level endpoint-family discriminator. A model belongs to exactly one // kind; cross-cutting features (vision, function calling, structured // outputs) are orthogonal and modeled separately when needed. diff --git a/packages/protocols/src/common/models_test.ts b/packages/protocols/src/common/models_test.ts index ffe10d8c3..e706d4814 100644 --- a/packages/protocols/src/common/models_test.ts +++ b/packages/protocols/src/common/models_test.ts @@ -1,6 +1,6 @@ import { test } from 'vitest'; -import { unitPriceForDimension } from './models.ts'; +import { resolveEffectivePricing, unitPriceForDimension, type ModelPricing } from './models.ts'; import { assertEquals } from '../test-assert.ts'; test('unitPriceForDimension returns null when pricing snapshot is null', () => { @@ -33,3 +33,57 @@ test('unitPriceForDimension returns null when the fallback chain is empty', () = assertEquals(unitPriceForDimension({}, 'input_cache_write_1h'), null); assertEquals(unitPriceForDimension({ output: 5 }, 'input_cache_write_1h'), null); }); + +test('resolveEffectivePricing merges a tier override into the base snapshot and strips tiers', () => { + const base: ModelPricing = { + input: 5, + input_cache_read: 0.5, + input_cache_write: 6.25, + output: 25, + tiers: { fast: { input: 30, output: 150, input_cache_write: 60 } }, + }; + const effective = resolveEffectivePricing(base, 'fast'); + assertEquals(effective, { 
+ input: 30, + input_cache_read: 0.5, + input_cache_write: 60, + output: 150, + }); +}); + +test('resolveEffectivePricing shallow-merges per dimension — omitted overlay keys inherit the base rate', () => { + // The codex flex/priority overlays exploit this: they declare only the + // input/output/cache-read dimensions that differ at the tier and leave + // cache-write (and any 1h/image dimension) to inherit base. + const base: ModelPricing = { + input: 5, + input_cache_read: 0.5, + input_cache_write: 6.25, + output: 25, + tiers: { flex: { input: 2.5 } }, + }; + assertEquals(resolveEffectivePricing(base, 'flex'), { + input: 2.5, + input_cache_read: 0.5, + input_cache_write: 6.25, + output: 25, + }); +}); + +test('resolveEffectivePricing returns the base snapshot (sans tiers) when tier is unknown or absent', () => { + const base: ModelPricing = { + input: 5, + output: 25, + tiers: { fast: { input: 30 } }, + }; + const expected: ModelPricing = { input: 5, output: 25 }; + + assertEquals(resolveEffectivePricing(base, null), expected); + assertEquals(resolveEffectivePricing(base, undefined), expected); + assertEquals(resolveEffectivePricing(base, 'priority'), expected); +}); + +test('resolveEffectivePricing returns null when the base snapshot is null', () => { + assertEquals(resolveEffectivePricing(null, 'fast'), null); + assertEquals(resolveEffectivePricing(null, null), null); +}); From 5e8aa8ce6177ea00c0409d62faef2d33eaf8f2cc Mon Sep 17 00:00:00 2001 From: Menci Date: Sat, 20 Jun 2026 04:02:17 +0800 Subject: [PATCH 2/6] feat(gateway): parse usage.speed / usage.service_tier into TokenUsage.tier across protocol shapes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reads each upstream's service-tier marker off the usage object and stamps it onto TokenUsage.tier so the recording layer routes the bucket through the right tier override: - Messages: Opus 4.6+ emits `usage.speed: 'standard' | 'fast'`; only `fast` surfaces as `tier: 
'fast'`. Standard is left unset so base-tier rows aggregate with the historical no-tier rows. Streamed deltas propagate `speed` so a late delta carries the tier all the way to message_stop. - Responses: the top-level `response.service_tier` echoes the actual processing tier ('priority', 'flex', 'scale', 'default', 'auto'). We drop 'default' (matched case-insensitively) — it denotes base pricing — and surface anything else, 'auto' included, verbatim. The WebSocket path reads service_tier the same way as HTTP. - Chat Completions: same as Responses but reading the top-level `chunk.service_tier` (chat.completion[.chunk]). Protocol types grow `MessagesUsage.speed`, `ResponsesResult.service_tier`, `ChatCompletionsResult.service_tier`, and `ChatCompletionsStreamEvent.service_tier`. --- .../llm/chat-completions/respond.ts | 23 +- .../data-plane/llm/chat-completions/usage.ts | 19 ++ .../llm/chat-completions/usage_test.ts | 76 +++++ .../llm/messages/events/reassemble.ts | 1 + .../src/data-plane/llm/messages/respond.ts | 33 +- .../data-plane/llm/messages/respond_test.ts | 283 ++++++++++++++++++ .../src/data-plane/llm/responses/respond.ts | 19 +- .../src/data-plane/llm/responses/usage.ts | 21 ++ .../data-plane/llm/responses/usage_test.ts | 81 +++++ .../src/data-plane/llm/responses/websocket.ts | 13 +- .../src/data-plane/shared/telemetry/usage.ts | 23 +- .../protocols/src/chat-completions/index.ts | 5 +- packages/protocols/src/messages/index.ts | 7 +- packages/protocols/src/responses/index.ts | 6 +- 14 files changed, 547 insertions(+), 63 deletions(-) create mode 100644 packages/gateway/src/data-plane/llm/chat-completions/usage.ts create mode 100644 packages/gateway/src/data-plane/llm/chat-completions/usage_test.ts create mode 100644 packages/gateway/src/data-plane/llm/responses/usage.ts create mode 100644 packages/gateway/src/data-plane/llm/responses/usage_test.ts diff --git a/packages/gateway/src/data-plane/llm/chat-completions/respond.ts b/packages/gateway/src/data-plane/llm/chat-completions/respond.ts index
1d3f5b50b..906e6f13b 100644 --- a/packages/gateway/src/data-plane/llm/chat-completions/respond.ts +++ b/packages/gateway/src/data-plane/llm/chat-completions/respond.ts @@ -3,11 +3,11 @@ import { streamSSE } from 'hono/streaming'; import { CHAT_COMPLETIONS_MISSING_TERMINAL_MESSAGE, collectChatCompletionsProtocolEventsToResult } from './events/to-result.ts'; import { chatCompletionsProtocolFrameToSSEFrame } from './events/to-sse.ts'; -import { tokenUsage } from '../../shared/telemetry/usage.ts'; +import { tokenUsageFromChatCompletionsUsage } from './usage.ts'; import type { GatewayCtx } from '../shared/gateway-ctx.ts'; import { SourceStreamState, eventResultMetadata, forwardUpstreamHeaders, mergeForwardedUpstreamHeaders, plainResultToResponse, recordPerformance, recordUsage } from '../shared/respond.ts'; import { type StreamCompletion, writeSSEFrames } from '../shared/stream/sse.ts'; -import type { ChatCompletionsStreamEvent, ChatCompletionsResult } from '@floway-dev/protocols/chat-completions'; +import type { ChatCompletionsStreamEvent } from '@floway-dev/protocols/chat-completions'; import { chatCompletionsErrorPayloadMessage } from '@floway-dev/protocols/chat-completions'; import { type ProtocolFrame, sseCommentFrame, sseFrame } from '@floway-dev/protocols/common'; import { type ExecuteResult, type PlainResult, type InternalDebugError, toInternalDebugError } from '@floway-dev/provider'; @@ -44,7 +44,7 @@ export const respondChatCompletions = async ( try { const response = await collectChatCompletionsProtocolEventsToResult(frames); const metadata = await eventResultMetadata(result); - const usage = response.usage ? tokenUsageFromChatCompletionsUsage(response.usage) : null; + const usage = response.usage ? 
tokenUsageFromChatCompletionsUsage(response.usage, response.service_tier) : null; await recordUsage(ctx, metadata.modelIdentity, usage); recordPerformance(ctx, metadata.performance, state.failed); return { success: true, response: Response.json(response, { headers: mergeForwardedUpstreamHeaders(undefined, result.headers) }) }; @@ -77,21 +77,6 @@ export const respondChatCompletions = async ( return { success: true, response }; }; -// --- token usage --- - -// OpenAI Chat usage reports prompt_tokens inclusive of cached and -// cache-creation tokens; subtract them to recover the disjoint bare input. -const tokenUsageFromChatCompletionsUsage = (u: NonNullable) => { - const cacheRead = u.prompt_tokens_details?.cached_tokens ?? 0; - const cacheWrite = u.prompt_tokens_details?.cache_creation_input_tokens ?? 0; - return tokenUsage({ - input: u.prompt_tokens - cacheRead - cacheWrite, - input_cache_read: cacheRead, - input_cache_write: cacheWrite, - output: u.completion_tokens, - }); -}; - // --- error rendering --- const internalChatCompletionsErrorPayload = (error: InternalDebugError) => ({ @@ -119,7 +104,7 @@ const observeChatCompletionsFrames = async function* (frames: AsyncIterable, serviceTier: string | null | undefined) => { + const cacheRead = u.prompt_tokens_details?.cached_tokens ?? 0; + const cacheWrite = u.prompt_tokens_details?.cache_creation_input_tokens ?? 
0; + return tokenUsage({ + input: u.prompt_tokens - cacheRead - cacheWrite, + input_cache_read: cacheRead, + input_cache_write: cacheWrite, + output: u.completion_tokens, + tier: billableServiceTier(serviceTier), + }); +}; diff --git a/packages/gateway/src/data-plane/llm/chat-completions/usage_test.ts b/packages/gateway/src/data-plane/llm/chat-completions/usage_test.ts new file mode 100644 index 000000000..3f84e089e --- /dev/null +++ b/packages/gateway/src/data-plane/llm/chat-completions/usage_test.ts @@ -0,0 +1,76 @@ +import { test } from 'vitest'; + +import { tokenUsageFromChatCompletionsUsage } from './usage.ts'; +import { assertEquals } from '@floway-dev/test-utils'; + +test('Chat usage maps disjoint input/cache/output counts and omits tier when service_tier is absent', () => { + assertEquals( + tokenUsageFromChatCompletionsUsage( + { prompt_tokens: 100, completion_tokens: 20, total_tokens: 120, prompt_tokens_details: { cached_tokens: 30 } }, + null, + ), + { + input: 70, + input_cache_read: 30, + output: 20, + }, + ); +}); + +test('Chat usage drops service_tier=default to no-tier', () => { + assertEquals( + tokenUsageFromChatCompletionsUsage( + { prompt_tokens: 10, completion_tokens: 2, total_tokens: 12 }, + 'default', + ), + { + input: 10, + output: 2, + }, + ); +}); + +test('Chat usage forwards service_tier=priority verbatim', () => { + assertEquals( + tokenUsageFromChatCompletionsUsage( + { prompt_tokens: 10, completion_tokens: 2, total_tokens: 12 }, + 'priority', + ), + { + input: 10, + output: 2, + tier: 'priority', + }, + ); +}); + +test('Chat usage forwards service_tier=flex verbatim', () => { + assertEquals( + tokenUsageFromChatCompletionsUsage( + { prompt_tokens: 10, completion_tokens: 2, total_tokens: 12 }, + 'flex', + ), + { + input: 10, + output: 2, + tier: 'flex', + }, + ); +}); + +test('Chat usage forwards an unknown tier verbatim (forward-compat with a future wire value)', () => { + // A future OpenAI value the SDK has not minted yet must reach 
the billing + // record so the operator can backfill a per-tier pricing override for it + // rather than have it silently fold into the base bucket. + assertEquals( + tokenUsageFromChatCompletionsUsage( + { prompt_tokens: 10, completion_tokens: 2, total_tokens: 12 }, + 'super-priority', + ), + { + input: 10, + output: 2, + tier: 'super-priority', + }, + ); +}); diff --git a/packages/gateway/src/data-plane/llm/messages/events/reassemble.ts b/packages/gateway/src/data-plane/llm/messages/events/reassemble.ts index 71cea87e6..55ee37415 100644 --- a/packages/gateway/src/data-plane/llm/messages/events/reassemble.ts +++ b/packages/gateway/src/data-plane/llm/messages/events/reassemble.ts @@ -102,6 +102,7 @@ const applyMessagesUsage = (usage: MessagesUsage, update: Partial } if (update.cache_creation != null) usage.cache_creation = update.cache_creation; if (update.service_tier != null) usage.service_tier = update.service_tier; + if (update.speed != null) usage.speed = update.speed; if (update.server_tool_use != null) { usage.server_tool_use = update.server_tool_use; } diff --git a/packages/gateway/src/data-plane/llm/messages/respond.ts b/packages/gateway/src/data-plane/llm/messages/respond.ts index 3d65dbc3f..ef8f61a2e 100644 --- a/packages/gateway/src/data-plane/llm/messages/respond.ts +++ b/packages/gateway/src/data-plane/llm/messages/respond.ts @@ -3,7 +3,7 @@ import { streamSSE } from 'hono/streaming'; import { MESSAGES_MISSING_TERMINAL_MESSAGE, collectMessagesProtocolEventsToResult } from './events/to-result.ts'; import { messagesProtocolFrameToSSEFrame } from './events/to-sse.ts'; -import { tokenUsage } from '../../shared/telemetry/usage.ts'; +import { billableServiceTier, tokenUsage } from '../../shared/telemetry/usage.ts'; import type { GatewayCtx } from '../shared/gateway-ctx.ts'; import { SourceStreamState, eventResultMetadata, forwardUpstreamHeaders, mergeForwardedUpstreamHeaders, plainResultToResponse, recordPerformance, recordUsage } from 
'../shared/respond.ts'; import { type StreamCompletion, writeSSEFrames } from '../shared/stream/sse.ts'; @@ -83,16 +83,29 @@ export const respondMessages = async ( // (extended-cache-ttl-2025-04-11), split the per-TTL counts onto the 5m and // 1h dimensions; the flat `cache_creation_input_tokens` is the sum and is // only consulted when the sub-object is absent. +// +// Response usage carries two server-stamped tier fields: `speed` (fast mode) +// and `service_tier` (capacity assignment). Fast mode is documented as +// unavailable with Priority Tier and the Batch API, so at most one +// non-`standard` value lands on a single response — prefer `speed` first +// (the only multi-x override today) then fall through to `service_tier`. +// `standard` on either side collapses to null so per-tier rows aggregate +// with base; unknown values flow through verbatim so a future Anthropic +// release does not silently bill at base. +// * https://docs.claude.com/en/build-with-claude/fast-mode +// * https://docs.claude.com/en/api/service-tiers const tokenUsageFromMessagesUsage = (u: MessagesUsageLike) => { const cacheWrite5m = u.cache_creation?.ephemeral_5m_input_tokens; const cacheWrite1h = u.cache_creation?.ephemeral_1h_input_tokens; const cacheWriteRolledUp = u.cache_creation_input_tokens ?? 0; + const tier = billableServiceTier(u.speed) ?? billableServiceTier(u.service_tier); return tokenUsage({ input: u.input_tokens ?? 0, input_cache_read: u.cache_read_input_tokens ?? 0, input_cache_write: cacheWrite5m ?? cacheWriteRolledUp, input_cache_write_1h: cacheWrite1h ?? 
0, output: u.output_tokens, + tier, }); }; @@ -122,9 +135,23 @@ export const tokenUsageFromMessagesFrame = (frame: ProtocolFrame { + const state = createMessagesStreamUsageState(); + + tokenUsageFromMessagesFrame( + eventFrame({ + type: 'message_start', + message: { + id: 'msg_1', + type: 'message', + role: 'assistant', + content: [], + model: 'claude-opus-4-8', + stop_reason: null, + stop_sequence: null, + usage: { input_tokens: 5, output_tokens: 0, speed: 'fast' }, + }, + } satisfies MessagesStreamEvent), + state, + ); + + assertEquals(tokenUsageFromMessagesFrame(stop(), state), { + input: 5, + tier: 'fast', + }); +}); + +test('Messages stream usage leaves tier unset when speed is standard', () => { + const state = createMessagesStreamUsageState(); + + tokenUsageFromMessagesFrame( + eventFrame({ + type: 'message_start', + message: { + id: 'msg_1', + type: 'message', + role: 'assistant', + content: [], + model: 'claude-opus-4-8', + stop_reason: null, + stop_sequence: null, + usage: { input_tokens: 5, output_tokens: 0, speed: 'standard' }, + }, + } satisfies MessagesStreamEvent), + state, + ); + + assertEquals(tokenUsageFromMessagesFrame(stop(), state), { + input: 5, + }); +}); + +test('Messages stream usage forwards service_tier=priority verbatim', () => { + const state = createMessagesStreamUsageState(); + + tokenUsageFromMessagesFrame( + eventFrame({ + type: 'message_start', + message: { + id: 'msg_1', + type: 'message', + role: 'assistant', + content: [], + model: 'claude-sonnet-4-6', + stop_reason: null, + stop_sequence: null, + usage: { input_tokens: 5, output_tokens: 0, service_tier: 'priority' }, + }, + } satisfies MessagesStreamEvent), + state, + ); + + assertEquals(tokenUsageFromMessagesFrame(stop(), state), { + input: 5, + tier: 'priority', + }); +}); + +test('Messages stream usage forwards service_tier=batch verbatim', () => { + const state = createMessagesStreamUsageState(); + + tokenUsageFromMessagesFrame( + eventFrame({ + type: 'message_start', + 
message: { + id: 'msg_1', + type: 'message', + role: 'assistant', + content: [], + model: 'claude-sonnet-4-6', + stop_reason: null, + stop_sequence: null, + usage: { input_tokens: 5, output_tokens: 0, service_tier: 'batch' }, + }, + } satisfies MessagesStreamEvent), + state, + ); + + assertEquals(tokenUsageFromMessagesFrame(stop(), state), { + input: 5, + tier: 'batch', + }); +}); + +test('Messages stream usage forwards an unknown non-standard tier verbatim (forward-compat)', () => { + // A future Anthropic value the SDK has not minted yet must reach the + // billing record so the operator can backfill a pricing override for it + // rather than have it silently fold into the base bucket. + const state = createMessagesStreamUsageState(); + + tokenUsageFromMessagesFrame( + eventFrame({ + type: 'message_start', + message: { + id: 'msg_1', + type: 'message', + role: 'assistant', + content: [], + model: 'claude-opus-4-8', + stop_reason: null, + stop_sequence: null, + usage: { input_tokens: 5, output_tokens: 0, speed: 'turbo' }, + }, + } satisfies MessagesStreamEvent), + state, + ); + + assertEquals(tokenUsageFromMessagesFrame(stop(), state), { + input: 5, + tier: 'turbo', + }); +}); + +test('Messages stream usage prefers speed=fast over service_tier=standard', () => { + // Anthropic stamps both fields on a Priority-Tier-aware account; fast mode + // is mutually exclusive with priority/batch per docs, so a `fast` row will + // always pair with `service_tier: 'standard'`. The non-standard signal + // wins; the redundant 'standard' must not clobber it. 
+ const state = createMessagesStreamUsageState(); + + tokenUsageFromMessagesFrame( + eventFrame({ + type: 'message_start', + message: { + id: 'msg_1', + type: 'message', + role: 'assistant', + content: [], + model: 'claude-opus-4-8', + stop_reason: null, + stop_sequence: null, + usage: { input_tokens: 5, output_tokens: 0, speed: 'fast', service_tier: 'standard' }, + }, + } satisfies MessagesStreamEvent), + state, + ); + + assertEquals(tokenUsageFromMessagesFrame(stop(), state), { + input: 5, + tier: 'fast', + }); +}); + +test('Messages stream usage carries tier forward when a fully cache-hit start is followed by a delta that re-supplies input', () => { + // A fully cache-hit prompt: message_start reports bare input 0 and tier 'fast', + // and a later delta carries input_tokens without re-stamping the tier fields. + // The delta replaces state.current (gotInputFromStart was false), so without + // explicit carry-forward the fast tier would be dropped — and the row would + // bill at base. + const state = createMessagesStreamUsageState(); + + tokenUsageFromMessagesFrame( + eventFrame({ + type: 'message_start', + message: { + id: 'msg_1', + type: 'message', + role: 'assistant', + content: [], + model: 'claude-opus-4-8', + stop_reason: null, + stop_sequence: null, + usage: { input_tokens: 0, output_tokens: 0, speed: 'fast' }, + }, + } satisfies MessagesStreamEvent), + state, + ); + tokenUsageFromMessagesFrame( + eventFrame({ + type: 'message_delta', + delta: {}, + usage: { input_tokens: 11, output_tokens: 2, cache_read_input_tokens: 5 }, + } satisfies MessagesStreamEvent), + state, + ); + + assertEquals(tokenUsageFromMessagesFrame(stop(), state), { + input: 11, + input_cache_read: 5, + output: 2, + tier: 'fast', + }); +}); + +test('Messages stream usage lets a delta-stamped tier win over message_start on the cache-hit-prompt path', () => { + // The wire schema permits message_delta.usage to carry service_tier/speed + // (packages/protocols/src/messages/index.ts). 
If a future upstream reassigns + // the served tier between message_start and message_delta — or starts + // stamping the served tier only on the delta — the delta value describes + // the billing bucket and must replace the start-stamped one. + const state = createMessagesStreamUsageState(); + + tokenUsageFromMessagesFrame( + eventFrame({ + type: 'message_start', + message: { + id: 'msg_1', + type: 'message', + role: 'assistant', + content: [], + model: 'claude-opus-4-8', + stop_reason: null, + stop_sequence: null, + usage: { input_tokens: 0, output_tokens: 0, speed: 'fast' }, + }, + } satisfies MessagesStreamEvent), + state, + ); + tokenUsageFromMessagesFrame( + eventFrame({ + type: 'message_delta', + delta: {}, + usage: { input_tokens: 11, output_tokens: 2, service_tier: 'priority' }, + } satisfies MessagesStreamEvent), + state, + ); + + assertEquals(tokenUsageFromMessagesFrame(stop(), state), { + input: 11, + output: 2, + tier: 'priority', + }); +}); + +test('Messages stream usage lets a delta-stamped tier win on the normal output-only path', () => { + // Symmetric to the cache-hit branch: when message_start already carried the + // real input accounting (gotInputFromStart === true), the delta normally + // just updates the running output. The wire schema still permits the delta + // to (re)stamp service_tier/speed, and that signal describes this billing + // bucket — must replace what start stamped, not be silently dropped. 
+ const state = createMessagesStreamUsageState(); + + tokenUsageFromMessagesFrame( + eventFrame({ + type: 'message_start', + message: { + id: 'msg_1', + type: 'message', + role: 'assistant', + content: [], + model: 'claude-opus-4-8', + stop_reason: null, + stop_sequence: null, + usage: { input_tokens: 50, output_tokens: 0, service_tier: 'standard' }, + }, + } satisfies MessagesStreamEvent), + state, + ); + tokenUsageFromMessagesFrame( + eventFrame({ + type: 'message_delta', + delta: {}, + usage: { output_tokens: 7, service_tier: 'priority' }, + } satisfies MessagesStreamEvent), + state, + ); + + assertEquals(tokenUsageFromMessagesFrame(stop(), state), { + input: 50, + output: 7, + tier: 'priority', + }); +}); + // --- header forwarding --- const forwardedHeadersFixture = (): Headers => new Headers({ diff --git a/packages/gateway/src/data-plane/llm/responses/respond.ts b/packages/gateway/src/data-plane/llm/responses/respond.ts index e50712060..f40db6ac6 100644 --- a/packages/gateway/src/data-plane/llm/responses/respond.ts +++ b/packages/gateway/src/data-plane/llm/responses/respond.ts @@ -3,12 +3,12 @@ import { streamSSE } from 'hono/streaming'; import { RESPONSES_MISSING_TERMINAL_MESSAGE, collectResponsesProtocolEventsToResult } from './events/to-result.ts'; import { responsesProtocolFrameToSSEFrame } from './events/to-sse.ts'; -import { tokenUsage } from '../../shared/telemetry/usage.ts'; +import { tokenUsageFromResponsesResult } from './usage.ts'; import type { GatewayCtx } from '../shared/gateway-ctx.ts'; import { SourceStreamState, eventResultMetadata, forwardUpstreamHeaders, mergeForwardedUpstreamHeaders, plainResultToResponse, recordPerformance, recordUsage } from '../shared/respond.ts'; import { type StreamCompletion, writeSSEFrames } from '../shared/stream/sse.ts'; import { type ProtocolFrame, sseCommentFrame, sseFrame } from '@floway-dev/protocols/common'; -import { isResponsesTerminalEvent, type ResponsesResult, type ResponsesStreamEvent, 
responsesResultFromStreamEvent } from '@floway-dev/protocols/responses'; +import { isResponsesTerminalEvent, type ResponsesStreamEvent, responsesResultFromStreamEvent } from '@floway-dev/protocols/responses'; import { type ExecuteResult, type PlainResult, type InternalDebugError, toInternalDebugError } from '@floway-dev/provider'; import { upstreamErrorToResponse } from '@floway-dev/provider'; @@ -74,21 +74,6 @@ export const respondResponses = async ( return { success: true, response }; }; -// --- token usage --- - -// OpenAI Responses reports input_tokens inclusive of cached tokens; subtract -// the cached split to recover the disjoint bare input. -const tokenUsageFromResponsesResult = (r: ResponsesResult) => { - const u = r.usage; - if (!u) return null; - const cacheRead = u.input_tokens_details?.cached_tokens ?? 0; - return tokenUsage({ - input: u.input_tokens - cacheRead, - input_cache_read: cacheRead, - output: u.output_tokens, - }); -}; - // --- error rendering --- const internalResponsesErrorResponse = (status: number, error: InternalDebugError): Response => diff --git a/packages/gateway/src/data-plane/llm/responses/usage.ts b/packages/gateway/src/data-plane/llm/responses/usage.ts new file mode 100644 index 000000000..91960da58 --- /dev/null +++ b/packages/gateway/src/data-plane/llm/responses/usage.ts @@ -0,0 +1,21 @@ +import { billableServiceTier, tokenUsage } from '../../shared/telemetry/usage.ts'; +import type { ResponsesResult } from '@floway-dev/protocols/responses'; + +// OpenAI Responses reports input_tokens inclusive of cached tokens; subtract +// the cached split to recover the disjoint bare input. The top-level +// `service_tier` echoes the actual processing tier the upstream served the +// request at (e.g. `default` when capacity downgraded a `priority` request). +// We surface it via `billableServiceTier` so per-tier pricing overrides +// resolve at recording time. 
+// https://developers.openai.com/api/docs/guides/priority-processing +export const tokenUsageFromResponsesResult = (response: ResponsesResult) => { + const usage = response.usage; + if (!usage) return null; + const cacheRead = usage.input_tokens_details?.cached_tokens ?? 0; + return tokenUsage({ + input: usage.input_tokens - cacheRead, + input_cache_read: cacheRead, + output: usage.output_tokens, + tier: billableServiceTier(response.service_tier), + }); +}; diff --git a/packages/gateway/src/data-plane/llm/responses/usage_test.ts b/packages/gateway/src/data-plane/llm/responses/usage_test.ts new file mode 100644 index 000000000..cdf846c9a --- /dev/null +++ b/packages/gateway/src/data-plane/llm/responses/usage_test.ts @@ -0,0 +1,81 @@ +import { test } from 'vitest'; + +import { tokenUsageFromResponsesResult } from './usage.ts'; +import type { ResponsesResult } from '@floway-dev/protocols/responses'; +import { assertEquals } from '@floway-dev/test-utils'; + +// Bare minimum ResponsesResult to exercise the usage extractor. The mapper +// only touches `usage` and `service_tier`; the rest of the response shape is +// irrelevant to billing. 
+const minimalResult = (overrides: Partial): ResponsesResult => ({ + id: 'resp_1', + object: 'response', + model: 'gpt-test', + output: [], + status: 'completed', + incomplete_details: null, + error: null, + ...overrides, +}); + +test('Responses usage maps disjoint input/cache/output counts and omits tier when service_tier is absent', () => { + const result = minimalResult({ + usage: { input_tokens: 100, output_tokens: 20, total_tokens: 120, input_tokens_details: { cached_tokens: 30 } }, + }); + assertEquals(tokenUsageFromResponsesResult(result), { + input: 70, + input_cache_read: 30, + output: 20, + }); +}); + +test('Responses usage drops service_tier=default (OpenAI base value) to no-tier', () => { + const result = minimalResult({ + usage: { input_tokens: 10, output_tokens: 2, total_tokens: 12 }, + service_tier: 'default', + }); + assertEquals(tokenUsageFromResponsesResult(result), { + input: 10, + output: 2, + }); +}); + +test('Responses usage forwards service_tier=priority verbatim', () => { + const result = minimalResult({ + usage: { input_tokens: 10, output_tokens: 2, total_tokens: 12 }, + service_tier: 'priority', + }); + assertEquals(tokenUsageFromResponsesResult(result), { + input: 10, + output: 2, + tier: 'priority', + }); +}); + +test('Responses usage forwards service_tier=flex verbatim', () => { + const result = minimalResult({ + usage: { input_tokens: 10, output_tokens: 2, total_tokens: 12 }, + service_tier: 'flex', + }); + assertEquals(tokenUsageFromResponsesResult(result), { + input: 10, + output: 2, + tier: 'flex', + }); +}); + +test('Responses usage forwards an unknown tier verbatim (forward-compat with a future wire value)', () => { + const result = minimalResult({ + usage: { input_tokens: 10, output_tokens: 2, total_tokens: 12 }, + service_tier: 'batch', + }); + assertEquals(tokenUsageFromResponsesResult(result), { + input: 10, + output: 2, + tier: 'batch', + }); +}); + +test('Responses usage returns null when the upstream omits the usage 
object', () => { + assertEquals(tokenUsageFromResponsesResult(minimalResult({})), null); +}); diff --git a/packages/gateway/src/data-plane/llm/responses/websocket.ts b/packages/gateway/src/data-plane/llm/responses/websocket.ts index dbbd85579..1aadc245a 100644 --- a/packages/gateway/src/data-plane/llm/responses/websocket.ts +++ b/packages/gateway/src/data-plane/llm/responses/websocket.ts @@ -4,8 +4,8 @@ import { RESPONSES_MISSING_TERMINAL_MESSAGE } from './events/to-result.ts'; import { createResponsesWsSession } from './items/store.ts'; import { PreviousResponseNotFoundError } from './serve-prep.ts'; import { responsesServe } from './serve.ts'; +import { tokenUsageFromResponsesResult } from './usage.ts'; import { inboundHeadersForUpstream } from '../../shared/inbound-headers.ts'; -import { tokenUsage } from '../../shared/telemetry/usage.ts'; import { createGatewayCtxForWs, type GatewayCtx } from '../shared/gateway-ctx.ts'; import { SourceStreamState, eventResultMetadata, recordPerformance, recordUsage } from '../shared/respond.ts'; import { DOWNSTREAM_KEEP_ALIVE_INTERVAL_MS, type StreamCompletion } from '../shared/stream/sse.ts'; @@ -399,17 +399,6 @@ const serverErrorEnvelope = (error: unknown): Record => ({ code: 'internal_error', }); -const tokenUsageFromResponsesResult = (response: ResponsesResult) => { - const usage = response.usage; - if (!usage) return null; - const cacheRead = usage.input_tokens_details?.cached_tokens ?? 
0; - return tokenUsage({ - input: usage.input_tokens - cacheRead, - input_cache_read: cacheRead, - output: usage.output_tokens, - }); -}; - const responseDoneSummary = (event: unknown) => { if (!event || typeof event !== 'object') return null; const type = (event as { type?: unknown }).type; diff --git a/packages/gateway/src/data-plane/shared/telemetry/usage.ts b/packages/gateway/src/data-plane/shared/telemetry/usage.ts index 7b32e8ad8..8d1aa89ee 100644 --- a/packages/gateway/src/data-plane/shared/telemetry/usage.ts +++ b/packages/gateway/src/data-plane/shared/telemetry/usage.ts @@ -7,15 +7,21 @@ import type { TelemetryModelIdentity } from '@floway-dev/provider'; export const hasTokenUsage = (usage: TokenUsage): boolean => BILLING_DIMENSIONS.some(dimension => (usage[dimension] ?? 0) > 0); // Map an upstream-reported service tier onto the tier marker the gateway -// stores on the usage row. `default` and `auto` (OpenAI's response-side base -// values) and `standard` (Anthropic's response-side base value) all denote -// base pricing and collapse to null so they aggregate with rows that carry -// no tier at all. +// stores on the usage row. `default` (OpenAI's response-side base value) and +// `standard` (Anthropic's response-side base value) both denote base pricing +// and collapse to null so they aggregate with rows that carry no tier at all. +// Compared case-insensitively in case a future upstream stamps `'Default'` +// or `'STANDARD'` (defensive — both protocols' SDKs ship the values in +// lowercase today); non-base values pass through with their original +// casing so per-tier overrides match the wire-stamped string verbatim. 
// https://developers.openai.com/api/docs/guides/priority-processing // https://docs.claude.com/en/api/service-tiers // https://docs.claude.com/en/build-with-claude/fast-mode -export const billableServiceTier = (tier: string | null | undefined): string | null => - tier != null && tier !== 'default' && tier !== 'auto' && tier !== 'standard' ? tier : null; +export const billableServiceTier = (tier: string | null | undefined): string | null => { + if (tier == null) return null; + const normalized = tier.toLowerCase(); + return normalized === 'default' || normalized === 'standard' ? null : tier; +}; // Drop zero / undefined dimensions so a usage map only carries the dimensions // actually billed. `tier` (a non-numeric service-tier marker) survives the @@ -88,6 +94,7 @@ const splitModalityCounts = ( }; export const recordTokenUsage = async (keyId: string, modelIdentity: TelemetryModelIdentity, usage: TokenUsage): Promise => { + const { tier, ...tokens } = usage; await Promise.all([ getRepo().usage.record({ keyId, @@ -95,9 +102,9 @@ export const recordTokenUsage = async (keyId: string, modelIdentity: TelemetryMo upstream: modelIdentity.upstream, modelKey: modelIdentity.modelKey, hour: currentHour(), - tier: usage.tier ?? null, + tier: tier ?? 
null, requests: 1, - tokens: usage, + tokens, cost: modelIdentity.cost, }), (async () => { diff --git a/packages/protocols/src/chat-completions/index.ts b/packages/protocols/src/chat-completions/index.ts index fabe3f32e..394bd471c 100644 --- a/packages/protocols/src/chat-completions/index.ts +++ b/packages/protocols/src/chat-completions/index.ts @@ -20,7 +20,7 @@ export interface ChatCompletionsPayload { reasoning_effort?: string | null; prompt_cache_key?: string | null; safety_identifier?: string | null; - service_tier?: string | null; + service_tier?: 'default' | 'auto' | 'flex' | 'priority' | 'scale' | (string & {}) | null; tools?: ChatCompletionsTool[] | null; tool_choice?: 'none' | 'auto' | 'required' | { type: 'function'; function: { name: string } } | null; /** Request usage stats in streaming responses */ @@ -82,6 +82,8 @@ export interface ChatCompletionsResult { created: number; model: string; choices: ChatCompletionsChoiceNonStreaming[]; + // https://platform.openai.com/docs/api-reference/chat/object + service_tier?: 'default' | 'auto' | 'flex' | 'priority' | 'scale' | (string & {}) | null; usage?: ChatCompletionsUsage; } @@ -91,6 +93,7 @@ export interface ChatCompletionsStreamEvent { created: number; model: string; choices: ChatCompletionsChoiceStreaming[]; + service_tier?: 'default' | 'auto' | 'flex' | 'priority' | 'scale' | (string & {}) | null; usage?: ChatCompletionsUsage; } diff --git a/packages/protocols/src/messages/index.ts b/packages/protocols/src/messages/index.ts index 10076c1a8..089b34d29 100644 --- a/packages/protocols/src/messages/index.ts +++ b/packages/protocols/src/messages/index.ts @@ -233,7 +233,10 @@ export interface MessagesUsage { ephemeral_5m_input_tokens?: number; ephemeral_1h_input_tokens?: number; }; - service_tier?: 'standard' | 'priority' | 'batch'; + // https://docs.claude.com/en/api/service-tiers + service_tier?: 'standard' | 'priority' | 'batch' | (string & {}); + // https://docs.claude.com/en/build-with-claude/fast-mode + 
speed?: 'standard' | 'fast' | (string & {}); server_tool_use?: MessagesUsageServerToolUse; } @@ -312,6 +315,8 @@ export interface MessagesMessageDeltaEvent { ephemeral_5m_input_tokens?: number; ephemeral_1h_input_tokens?: number; }; + service_tier?: 'standard' | 'priority' | 'batch' | (string & {}); + speed?: 'standard' | 'fast' | (string & {}); server_tool_use?: MessagesUsageServerToolUse; }; } diff --git a/packages/protocols/src/responses/index.ts b/packages/protocols/src/responses/index.ts index 907bd0e42..a1aed6518 100644 --- a/packages/protocols/src/responses/index.ts +++ b/packages/protocols/src/responses/index.ts @@ -32,7 +32,7 @@ export interface ResponsesPayload { text?: { format?: Record | null } | null; prompt_cache_key?: string | null; safety_identifier?: string | null; - service_tier?: string | null; + service_tier?: 'default' | 'auto' | 'flex' | 'priority' | 'scale' | (string & {}) | null; } // Narrower payload for `/responses/compact`. The official endpoint accepts a @@ -50,7 +50,7 @@ export interface ResponsesCompactPayload { previous_response_id?: string | null; prompt_cache_key?: string | null; prompt_cache_retention?: 'in_memory' | '24h' | null; - service_tier?: string | null; + service_tier?: 'default' | 'auto' | 'flex' | 'priority' | 'scale' | (string & {}) | null; // Gateway-only: controls whether the compact response's output items + the // committed snapshot persist. Forwarded NEITHER to upstream nor to the // provider call body. @@ -395,6 +395,8 @@ export interface ResponsesResult { // never synthesizes it. 
incomplete_details: { reason: string } | null; error: { message: string; code: string; type?: string } | null; + // https://developers.openai.com/api/reference/resources/responses/methods/create + service_tier?: 'default' | 'auto' | 'flex' | 'priority' | 'scale' | (string & {}) | null; usage?: { input_tokens: number; output_tokens: number; From d23f6a5ef2dec43ade318d06c327d498cd0d1128 Mon Sep 17 00:00:00 2001 From: Menci Date: Sat, 20 Jun 2026 04:04:29 +0800 Subject: [PATCH 3/6] feat(codex): price flex/priority service tiers per OpenAI public rates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `tiers.flex` and `tiers.priority` overlays for every priced Codex slug so the dashboard's notional cost reflects which OpenAI service tier the request actually ran on. The gateway already captures `usage.service_tier` onto `TokenUsage.tier`; this commit completes the loop by giving the cost compute a per-tier rate row to look up. Tier overrides match OpenAI's public pricing (verified 2026-06-19 against https://platform.openai.com/docs/pricing): gpt-5.5 flex $2.5/$0.25/$15 priority $12.5/$1.25/$75 gpt-5.4 flex $1.25/$0.13/$7.5 priority $5/$0.5/$30 gpt-5.4-mini flex $0.375/$0.0375/$2.25 priority $1.5/$0.15/$9 `codex-auto-review` shares `gpt-5.4`'s pricing including the tier overrides. Codex CLI's `/fast` toggle writes `service_tier: "priority"` on the wire (per openai/codex's `ServiceTier::Fast.request_value()`), so operator-facing rows tagged "fast" cost out at the priority row. Cache-write rate stays unset on these entries — OpenAI charges cache creation at the same rate as input, which `unitPriceForDimension`'s fallback chain already covers. 
--- packages/provider-codex/src/models_test.ts | 37 ++++++++++++++- packages/provider-codex/src/pricing.ts | 54 ++++++++++++++++++---- 2 files changed, 82 insertions(+), 9 deletions(-) diff --git a/packages/provider-codex/src/models_test.ts b/packages/provider-codex/src/models_test.ts index 116daa8e6..edf04b75d 100644 --- a/packages/provider-codex/src/models_test.ts +++ b/packages/provider-codex/src/models_test.ts @@ -2,6 +2,7 @@ import { afterEach, describe, expect, test, vi } from 'vitest'; import { CODEX_CLI_VERSION } from './constants.ts'; import { codexRawToUpstreamModel, fetchCodexCatalog } from './models.ts'; +import { resolveEffectivePricing } from '@floway-dev/protocols/common'; import { directFetcher } from '@floway-dev/provider'; const okJson = (body: unknown): Response => new Response(JSON.stringify(body), { status: 200, headers: { 'content-type': 'application/json' } }); @@ -76,11 +77,45 @@ describe('codexRawToUpstreamModel', () => { test('attaches OpenAI-API-rate cost for known slugs and treats codex-auto-review as gpt-5.4', () => { const flagship = codexRawToUpstreamModel({ id: 'gpt-5.4', display_name: 'GPT-5.4', context_window: 272000 }, noFlags); - expect(flagship.cost).toEqual({ input: 2.5, input_cache_read: 0.25, output: 15 }); + expect(flagship.cost).toEqual({ + input: 2.5, + input_cache_read: 0.25, + output: 15, + tiers: { + flex: { input: 1.25, input_cache_read: 0.13, output: 7.5 }, + priority: { input: 5, input_cache_read: 0.5, output: 30 }, + }, + }); const review = codexRawToUpstreamModel({ id: 'codex-auto-review', display_name: 'Codex Auto Review', context_window: 272000 }, noFlags); expect(review.cost).toEqual(flagship.cost); }); + // End-to-end resolution check: tier keys must match the wire-value strings + // billableServiceTier persists, not the enum *names* in Codex's Rust source. + // A casing typo here (e.g. `Flex`) or a divergence from the wire value (e.g. 
+ // `fast`) would compile cleanly against the structural test above but bill + // every tiered request at base. + test('cost.tiers keys resolve through resolveEffectivePricing for the wire-value strings', () => { + const flagship = codexRawToUpstreamModel({ id: 'gpt-5.4', display_name: 'GPT-5.4', context_window: 272000 }, noFlags); + if (!flagship.cost) throw new Error('expected cost to be defined'); + + expect(resolveEffectivePricing(flagship.cost, 'priority')).toEqual({ + input: 5, + input_cache_read: 0.5, + output: 30, + }); + expect(resolveEffectivePricing(flagship.cost, 'flex')).toEqual({ + input: 1.25, + input_cache_read: 0.13, + output: 7.5, + }); + expect(resolveEffectivePricing(flagship.cost, null)).toEqual({ + input: 2.5, + input_cache_read: 0.25, + output: 15, + }); + }); + test('omits cost for unknown slugs (forward-compat with new upstream models)', () => { const m = codexRawToUpstreamModel({ id: 'gpt-future-unreleased', display_name: 'X', context_window: 1 }, noFlags); expect(m.cost).toBeUndefined(); diff --git a/packages/provider-codex/src/pricing.ts b/packages/provider-codex/src/pricing.ts index 07df17d8d..4895e157f 100644 --- a/packages/provider-codex/src/pricing.ts +++ b/packages/provider-codex/src/pricing.ts @@ -8,25 +8,63 @@ // https://github.com/anomalyco/models.dev/blob/8e6d393c01cb42d41a92f18725eef545e7190efb/packages/core/src/schema.ts // // Source of truth for OpenAI public API prices the table is derived from: -// https://openai.com/api/pricing/ +// https://developers.openai.com/api/docs/pricing +// Refresh procedure: .agents/skills/fetching-models-pricing/. +// +// Per-tier overrides cover the two OpenAI service-tier wire values reachable +// through the Codex CLI's `ServiceTier` enum (`priority` / `flex`): +// - `flex` — discounted, latency-tolerant; the CLI sets `service_tier: "flex"`. 
+// https://developers.openai.com/api/docs/guides/flex-processing +// - `priority` — premium-priced, lower-latency lane; the CLI's `/fast` toggle +// stamps `service_tier: "priority"`. +// https://developers.openai.com/api/docs/guides/priority-processing +// https://github.com/openai/codex/blob/f774455c3a831dfab2c6f37a1f624b8097f6f2c2/codex-rs/protocol/src/config_types.rs#L445 +// Whether a request actually goes through at the requested tier depends on +// what each model's catalog entry (`service_tiers` block in upstream +// `models.json`) accepts and on remaining capacity; OpenAI reports the +// actually-served tier in `usage.service_tier` and the gateway captures it +// onto `TokenUsage.tier` so cost compute picks the right row. // // Coverage: every slug surfaced by /codex/models for ChatGPT Plus today // (gpt-5.5, gpt-5.4, gpt-5.4-mini, codex-auto-review). New slugs the upstream // rolls out at higher plans (Pro / Team / Enterprise) should be added here so // the dashboard reports their cost too. -// -// Refresh procedure: .agents/skills/fetching-models-pricing/. 
import type { ModelPricing } from '@floway-dev/protocols/common'; -const GPT_5_4_PRICING: ModelPricing = { input: 2.5, input_cache_read: 0.25, output: 15 }; +const GPT_5_4_PRICING: ModelPricing = { + input: 2.5, + input_cache_read: 0.25, + output: 15, + tiers: { + flex: { input: 1.25, input_cache_read: 0.13, output: 7.5 }, + priority: { input: 5, input_cache_read: 0.5, output: 30 }, + }, +}; const CODEX_MODEL_PRICING: readonly (readonly [key: string | RegExp, pricing: ModelPricing])[] = [ - ['gpt-5.5', { input: 5, input_cache_read: 0.5, output: 30 }], + ['gpt-5.5', { + input: 5, + input_cache_read: 0.5, + output: 30, + tiers: { + flex: { input: 2.5, input_cache_read: 0.25, output: 15 }, + priority: { input: 12.5, input_cache_read: 1.25, output: 75 }, + }, + }], ['gpt-5.4', GPT_5_4_PRICING], - ['gpt-5.4-mini', { input: 0.75, input_cache_read: 0.075, output: 4.5 }], - // Internal review model gated under codex_cli_rs's auto-review feature; runs - // on the same compute as gpt-5.4 and is billed identically. + ['gpt-5.4-mini', { + input: 0.75, + input_cache_read: 0.075, + output: 4.5, + tiers: { + flex: { input: 0.375, input_cache_read: 0.0375, output: 2.25 }, + priority: { input: 1.5, input_cache_read: 0.15, output: 9 }, + }, + }], + // Internal review model gated under codex_cli_rs's auto-review feature. No + // public price surface; billed as a notional clone of gpt-5.4 (closest + // analogue we have). ['codex-auto-review', GPT_5_4_PRICING], ]; From 1914062ab1338e8b81badd156754d93657767a6d Mon Sep 17 00:00:00 2001 From: Menci Date: Sat, 20 Jun 2026 11:52:35 +0800 Subject: [PATCH 4/6] feat(gateway): migration 0036 adds tier column to usage + usage_requests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR 69's runtime already keys usage buckets on TokenUsage.tier via `COALESCE(tier, '')` and the repo writes `tier` into `usage` and `usage_requests`, but the schema column didn't exist yet. 
Add it via a dedicated migration so PR 68 can stay focused on the per-TTL cache dimension and the tier column lives in this PR's blast radius. The CHECK list on `usage.dimension` is widened here to admit `input_cache_write_1h` as well so 0035 and 0036 converge on the same end state regardless of merge order — the codebase on this branch doesn't write that dimension yet (PR 68 ships the parser), but the schema is forward-compatible. Also extend `pricingField` (and the control-plane zod schema) to admit `cost.tiers`, so an operator-supplied per-tier overlay round-trips through the upstream save path instead of being silently dropped at parse time. Without this the per-tier editor in the next commit cannot persist anything. --- .../migrations/0036_usage_tier_column.sql | 56 +++++++++++++++++++ .../src/control-plane/data-transfer/routes.ts | 10 +++- .../data-transfer/routes_test.ts | 2 +- packages/gateway/src/control-plane/schemas.ts | 28 +++++++--- packages/provider/src/model-config.ts | 18 +++++- packages/provider/src/model-config_test.ts | 56 +++++++++++++++++++ 6 files changed, 158 insertions(+), 12 deletions(-) create mode 100644 packages/gateway/migrations/0036_usage_tier_column.sql create mode 100644 packages/provider/src/model-config_test.ts diff --git a/packages/gateway/migrations/0036_usage_tier_column.sql b/packages/gateway/migrations/0036_usage_tier_column.sql new file mode 100644 index 000000000..e75125079 --- /dev/null +++ b/packages/gateway/migrations/0036_usage_tier_column.sql @@ -0,0 +1,56 @@ +-- Add the per-request service tier column to `usage` + `usage_requests`. +-- +-- `tier` is the upstream-stamped service-tier marker (Anthropic `usage.speed`, +-- OpenAI `usage.service_tier`). It participates in bucket identity so a model +-- billed at multiple tiers in one hour aggregates as separate buckets with +-- distinct unit prices; recording writes NULL for base-tier requests and a +-- non-empty string otherwise. 
The unique index uses `COALESCE(tier, '')` +-- because SQLite treats NULLs as distinct under UNIQUE. +-- +-- SQLite cannot add a column to the middle of a UNIQUE INDEX in place, so +-- both tables are recreated. Existing rows backfill `tier = NULL`, which the +-- aggregator treats as base pricing — historical buckets compute identically. + +CREATE TABLE usage_new ( + key_id TEXT NOT NULL, + model TEXT NOT NULL, + upstream TEXT, + model_key TEXT NOT NULL, + hour TEXT NOT NULL, + tier TEXT, + dimension TEXT NOT NULL CHECK (dimension IN ( + 'input', 'input_cache_read', 'input_cache_write', 'input_cache_write_1h', 'input_image', 'output', 'output_image' + )), + tokens INTEGER NOT NULL DEFAULT 0, + unit_price REAL +); + +INSERT INTO usage_new (key_id, model, upstream, model_key, hour, tier, dimension, tokens, unit_price) + SELECT key_id, model, upstream, model_key, hour, NULL, dimension, tokens, unit_price FROM usage; + +DROP TABLE usage; +ALTER TABLE usage_new RENAME TO usage; + +CREATE UNIQUE INDEX idx_usage_dimension_identity + ON usage (key_id, model, COALESCE(upstream, ''), model_key, hour, COALESCE(tier, ''), dimension); +CREATE INDEX idx_usage_dimension_hour ON usage (hour); + +CREATE TABLE usage_requests_new ( + key_id TEXT NOT NULL, + model TEXT NOT NULL, + upstream TEXT, + model_key TEXT NOT NULL, + hour TEXT NOT NULL, + tier TEXT, + requests INTEGER NOT NULL DEFAULT 0 +); + +INSERT INTO usage_requests_new (key_id, model, upstream, model_key, hour, tier, requests) + SELECT key_id, model, upstream, model_key, hour, NULL, requests FROM usage_requests; + +DROP TABLE usage_requests; +ALTER TABLE usage_requests_new RENAME TO usage_requests; + +CREATE UNIQUE INDEX idx_usage_requests_identity + ON usage_requests (key_id, model, COALESCE(upstream, ''), model_key, hour, COALESCE(tier, '')); +CREATE INDEX idx_usage_requests_hour ON usage_requests (hour); diff --git a/packages/gateway/src/control-plane/data-transfer/routes.ts 
b/packages/gateway/src/control-plane/data-transfer/routes.ts index 780a0318d..52cf72d71 100644 --- a/packages/gateway/src/control-plane/data-transfer/routes.ts +++ b/packages/gateway/src/control-plane/data-transfer/routes.ts @@ -403,10 +403,14 @@ const parseUsageRecords = (value: unknown): { type: 'ok'; records: UsageRecord[] return { type: 'invalid', index: i, error: 'upstream must use a raw upstream id, not a legacy provider-prefixed identity' }; } if (record.tier !== undefined && record.tier !== null && typeof record.tier !== 'string') { - return { type: 'invalid', index: i, error: 'record has invalid tier (must be a string or null)' }; + return { type: 'invalid', index: i, error: 'tier, when present, must be a string or null' }; } - // `tier` is absent on exports taken before the column existed; collapse - // the absent and explicit-null cases into the same wire value. + if (record.tier === '') { + return { type: 'invalid', index: i, error: 'tier must be a non-empty string or null/absent' }; + } + // Empty-string is rejected rather than normalized to null: the unique + // index folds NULL/'' under COALESCE, so a '' import would silently + // merge with base-tier rows. const tier: string | null = typeof record.tier === 'string' ? 
record.tier : null; const tokensResult = parseImportedTokens(record.tokens); if (tokensResult.type === 'invalid') return { type: 'invalid', index: i, error: 'record has invalid token dimension counts' }; diff --git a/packages/gateway/src/control-plane/data-transfer/routes_test.ts b/packages/gateway/src/control-plane/data-transfer/routes_test.ts index d43d7a975..de3c08ac0 100644 --- a/packages/gateway/src/control-plane/data-transfer/routes_test.ts +++ b/packages/gateway/src/control-plane/data-transfer/routes_test.ts @@ -177,7 +177,7 @@ const USAGE_1: UsageRecord = { upstream: 'up_copilot_a', modelKey: 'claude-opus-4.7', hour: '2026-01-01T10', - tier: null, + tier: 'fast', requests: 5, tokens: { input: 1000, output: 500, input_cache_read: 120, input_cache_write: 80 }, cost: null, diff --git a/packages/gateway/src/control-plane/schemas.ts b/packages/gateway/src/control-plane/schemas.ts index 6857e3eab..6de95e9ec 100644 --- a/packages/gateway/src/control-plane/schemas.ts +++ b/packages/gateway/src/control-plane/schemas.ts @@ -60,6 +60,18 @@ const modelEndpointsSchema = z.object({ imagesEdits: z.object({}).optional(), }); +// Shared between base pricing and per-tier overlays so the two always carry +// the same dimension set. +const pricingDimensionShape = { + input: z.number().nonnegative().optional(), + output: z.number().nonnegative().optional(), + input_cache_read: z.number().nonnegative().optional(), + input_cache_write: z.number().nonnegative().optional(), + input_cache_write_1h: z.number().nonnegative().optional(), + input_image: z.number().nonnegative().optional(), + output_image: z.number().nonnegative().optional(), +}; + // Mirrors the runtime UpstreamModelConfig in @floway-dev/provider. // Azure and custom upstreams share this per-model entry; the canonical // per-model endpoint validation lives in the runtime validator. 
@@ -70,13 +82,15 @@ const upstreamModelSchema = z.object({ endpoints: modelEndpointsSchema, display_name: z.string().optional(), cost: z.object({ - input: z.number().optional(), - output: z.number().optional(), - input_cache_read: z.number().optional(), - input_cache_write: z.number().optional(), - input_cache_write_1h: z.number().optional(), - input_image: z.number().optional(), - output_image: z.number().optional(), + ...pricingDimensionShape, + // See ModelPricing.tiers in @floway-dev/protocols/common for semantics. + tiers: z.record( + z.string().min(1), + z.object(pricingDimensionShape).refine( + t => Object.values(t).some(v => v !== undefined), + { message: 'tier overlay must declare at least one rate' }, + ), + ).optional(), }).optional(), flagOverrides: z.object({ enabled: z.boolean(), diff --git a/packages/provider/src/model-config.ts b/packages/provider/src/model-config.ts index da1692d40..ecd8e1e0c 100644 --- a/packages/provider/src/model-config.ts +++ b/packages/provider/src/model-config.ts @@ -1,5 +1,5 @@ import { isKnownFlagId } from './flags.ts'; -import { BILLING_DIMENSIONS, type ModelEndpointKey, type ModelEndpoints, type ModelKind, type ModelPricing } from '@floway-dev/protocols/common'; +import { BILLING_DIMENSIONS, type BillingDimension, type ModelEndpointKey, type ModelEndpoints, type ModelKind, type ModelPricing } from '@floway-dev/protocols/common'; import { kindForEndpoints } from '@floway-dev/protocols/common'; export interface UpstreamModelLimits { @@ -127,6 +127,22 @@ export const pricingField = (value: unknown, label: string): ModelPricing | unde for (const dimension of BILLING_DIMENSIONS) { if (record[dimension] !== undefined) pricing[dimension] = nonNegativeNumberField(record[dimension], `${label}.${dimension}`); } + if (record.tiers !== undefined) { + if (!isRecord(record.tiers)) throw new Error(`Malformed ${label}.tiers: must be an object`); + const tiers: Record<string, Partial<Record<BillingDimension, number>>> = {}; + for (const [tierName, overlay] of
Object.entries(record.tiers)) { + if (tierName === '') throw new Error(`Malformed ${label}.tiers: tier name must be non-empty`); + if (!isRecord(overlay)) throw new Error(`Malformed ${label}.tiers.${tierName}: must be an object`); + const tierPricing: Partial<Record<BillingDimension, number>> = {}; + for (const dimension of BILLING_DIMENSIONS) { + if (overlay[dimension] !== undefined) { + tierPricing[dimension] = nonNegativeNumberField(overlay[dimension], `${label}.tiers.${tierName}.${dimension}`); + } + } + if (Object.keys(tierPricing).length > 0) tiers[tierName] = tierPricing; + } + if (Object.keys(tiers).length > 0) pricing.tiers = tiers; + } return Object.keys(pricing).length > 0 ? pricing : undefined; }; diff --git a/packages/provider/src/model-config_test.ts b/packages/provider/src/model-config_test.ts new file mode 100644 index 000000000..8c25e695f --- /dev/null +++ b/packages/provider/src/model-config_test.ts @@ -0,0 +1,56 @@ +import { test } from 'vitest'; + +import { pricingField } from './model-config.ts'; +import { assertEquals, assertThrows } from '@floway-dev/test-utils'; + +test('pricingField parses bare dimensions and drops empty objects', () => { + assertEquals(pricingField(undefined, 'cost'), undefined); + assertEquals(pricingField({}, 'cost'), undefined); + assertEquals( + pricingField({ input: 5, output: 25, input_cache_read: 0.5 }, 'cost'), + { input: 5, output: 25, input_cache_read: 0.5 }, + ); +}); + +test('pricingField parses per-tier overlays alongside base rates', () => { + const result = pricingField( + { + input: 5, + output: 25, + tiers: { + fast: { input: 30, output: 150 }, + flex: { input: 2.5 }, + }, + }, + 'cost', + ); + assertEquals(result, { + input: 5, + output: 25, + tiers: { + fast: { input: 30, output: 150 }, + flex: { input: 2.5 }, + }, + }); +}); + +test('pricingField drops empty tier overlays and skips unknown keys inside them', () => { + const result = pricingField( + { + input: 5, + tiers: { + fast: { input: 30, bogus_key: 99 }, + priority: {}, + }, + },
+ 'cost', + ); + assertEquals(result, { input: 5, tiers: { fast: { input: 30 } } }); +}); + +test('pricingField rejects non-object tiers, empty names, and negative rates', () => { + assertThrows(() => pricingField({ tiers: 'nope' }, 'cost'), Error, 'tiers'); + assertThrows(() => pricingField({ tiers: { '': { input: 5 } } }, 'cost'), Error, 'tier name'); + assertThrows(() => pricingField({ tiers: { fast: 1 } }, 'cost'), Error, 'tiers.fast'); + assertThrows(() => pricingField({ tiers: { fast: { input: -1 } } }, 'cost'), Error, 'non-negative'); +}); From 3c6066eec2052a116bc1aa11d3950a819240fb6f Mon Sep 17 00:00:00 2001 From: Menci Date: Sat, 20 Jun 2026 11:52:47 +0800 Subject: [PATCH 5/6] feat(web): per-tier pricing override editor on ModelEditor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Operators editing custom upstreams can now declare per-tier pricing overlays directly from the dashboard. Each tier row carries a free-form name (matching the wire value the upstream stamps onto `usage.service_tier` / `usage.speed`) and a sparse set of dimension rates that shadow the base pricing — absent rates fall through to the base row, mirroring `resolveEffectivePricing` semantics. Drafts are tracked in local component state rather than recomputed from stored cost on every keystroke, so a tier whose name is still empty stays on screen instead of being collapsed away by the save path's non-empty-name filter. Duplicate tier names within one model render an inline warning; the save path keeps the last entry per name. Also surface `BillingDimension` as a top-level type on the SPA API module and route the model editor's pricing dim list through it, so the `tiers` field on `ModelPricing` no longer leaks into the dimension labels record. 
--- apps/web/src/api/types.ts | 8 +- .../components/upstream-edit/ModelEditor.vue | 151 +++++++++++++++++- 2 files changed, 155 insertions(+), 4 deletions(-) diff --git a/apps/web/src/api/types.ts b/apps/web/src/api/types.ts index 069233d2f..2a5313067 100644 --- a/apps/web/src/api/types.ts +++ b/apps/web/src/api/types.ts @@ -23,7 +23,13 @@ export type ModelEndpointKey = keyof ModelEndpoints; // USD per million tokens, keyed by billing dimension. export type BillingDimension = 'input' | 'input_cache_read' | 'input_cache_write' | 'input_cache_write_1h' | 'input_image' | 'output' | 'output_image'; -export type ModelPricing = Partial<Record<BillingDimension, number>>; + +// Base rates plus per-tier overlays for OpenAI service tiers (`flex`, +// `priority`, ...) and Anthropic fast mode. Each overlay is a partial that +// shadows individual dimensions; absent keys fall through to the base rate. +export interface ModelPricing extends Partial<Record<BillingDimension, number>> { + tiers?: Record<string, Partial<Record<BillingDimension, number>>>; +} export interface UpstreamModelConfig { upstreamModelId: string; diff --git a/apps/web/src/components/upstream-edit/ModelEditor.vue b/apps/web/src/components/upstream-edit/ModelEditor.vue index aa70330cd..fb44ae10f 100644 --- a/apps/web/src/components/upstream-edit/ModelEditor.vue +++ b/apps/web/src/components/upstream-edit/ModelEditor.vue @@ -1,5 +1,5 @@