diff --git a/apps/web/src/api/types.ts b/apps/web/src/api/types.ts index 6b91b7ea..069233d2 100644 --- a/apps/web/src/api/types.ts +++ b/apps/web/src/api/types.ts @@ -22,7 +22,8 @@ export interface ModelEndpoints { export type ModelEndpointKey = keyof ModelEndpoints; // USD per million tokens, keyed by billing dimension. -export type ModelPricing = Partial>; +export type BillingDimension = 'input' | 'input_cache_read' | 'input_cache_write' | 'input_cache_write_1h' | 'input_image' | 'output' | 'output_image'; +export type ModelPricing = Partial>; export interface UpstreamModelConfig { upstreamModelId: string; diff --git a/apps/web/src/components/upstream-edit/ModelEditor.vue b/apps/web/src/components/upstream-edit/ModelEditor.vue index 3d8e18ec..aa70330c 100644 --- a/apps/web/src/components/upstream-edit/ModelEditor.vue +++ b/apps/web/src/components/upstream-edit/ModelEditor.vue @@ -4,7 +4,7 @@ import { computed } from 'vue'; import EndpointsField from './EndpointsField.vue'; import FlagOverridesEditor from './FlagOverridesEditor.vue'; import { configOf, defaultEndpointsForKind, publicIdOf, titleFor, type Row } from './modelRows.ts'; -import type { FlagDef, ModelKind, ModelPricing, UpstreamModelConfig, UpstreamProviderKind } from '../../api/types.ts'; +import type { BillingDimension, FlagDef, ModelKind, ModelPricing, UpstreamModelConfig, UpstreamProviderKind } from '../../api/types.ts'; import { Button, Input, Select, Switch } from '@floway-dev/ui'; const props = defineProps<{ @@ -37,14 +37,15 @@ const kindOptions: { value: ModelKind; label: string }[] = [ const PRICING_LABELS: Record = { input: 'Input ($/MTok)', input_cache_read: 'Cache Read ($/MTok)', - input_cache_write: 'Cache Write ($/MTok)', + input_cache_write: 'Cache Write 5m ($/MTok)', + input_cache_write_1h: 'Cache Write 1h ($/MTok)', input_image: 'Image Input ($/MTok)', output: 'Output ($/MTok)', output_image: 'Image Output ($/MTok)', }; -const PRICING_BY_KIND: Record = { - chat: ['input', 'input_cache_read', 'input_cache_write', 'output'], +const PRICING_BY_KIND: Record = { + chat: ['input', 'input_cache_read', 'input_cache_write', 'input_cache_write_1h', 'output'], embedding: ['input'], image: ['input', 'input_image', 'output', 'output_image'], }; diff --git a/apps/web/src/pages/dashboard/usage.vue b/apps/web/src/pages/dashboard/usage.vue index ff490976..2e206639 100644 --- a/apps/web/src/pages/dashboard/usage.vue +++ b/apps/web/src/pages/dashboard/usage.vue @@ -6,6 +6,7 @@ import { defineBasicLoader } from 'unplugin-vue-router/data-loaders/basic'; import { computed, ref, watch } from 'vue'; import { callApi, useApi, type ApiClient } from '../../api/client.ts'; +import type { BillingDimension } from '../../api/types.ts'; import ChartCanvas from '../../components/charts/ChartCanvas.vue'; import { bucketKeyForUtcHour, chartColor, chartFont, chartXAxisTick, dashboardBuckets, dashboardRangeQuery, type DashboardRange } from '../../components/charts/dashboard-chart.ts'; import UsageSummaryMetric from '../../components/usage/UsageSummaryMetric.vue'; @@ -13,8 +14,6 @@ import { useModelsStore } from '../../composables/useModels.ts'; import { useAuthStore } from '../../stores/auth.ts'; import { OverlayScrollbars, Spinner } from '@floway-dev/ui'; -type BillingDimension = 'input' | 'input_cache_read' | 'input_cache_write' | 'input_image' | 'output' | 'output_image'; - interface DisplayUsageRecord { keyId: string; keyName?: string; @@ -190,7 +189,7 @@ const tokenSummary = computed(() => { input += dim(r, 'input'); output += dim(r, 'output'); cacheRead += dim(r, 'input_cache_read'); - cacheCreation += dim(r, 'input_cache_write'); + cacheCreation += dim(r, 'input_cache_write') + dim(r, 'input_cache_write_1h'); inputImage += dim(r, 'input_image'); outputImage += dim(r, 'output_image'); } @@ -240,12 +239,12 @@ const metricValue = (r: DisplayUsageRecord, metric: Metric): number => { switch (metric) { case 'requests': return r.requests; case 'cost': return r.cost; - case 'total': return dim(r, 'input') + dim(r, 'output') + dim(r, 'input_cache_read') + dim(r, 'input_cache_write') + dim(r, 'input_image') + dim(r, 'output_image'); - case 'input': return dim(r, 'input') + dim(r, 'input_cache_read') + dim(r, 'input_cache_write') + dim(r, 'input_image'); + case 'total': return dim(r, 'input') + dim(r, 'output') + dim(r, 'input_cache_read') + dim(r, 'input_cache_write') + dim(r, 'input_cache_write_1h') + dim(r, 'input_image') + dim(r, 'output_image'); + case 'input': return dim(r, 'input') + dim(r, 'input_cache_read') + dim(r, 'input_cache_write') + dim(r, 'input_cache_write_1h') + dim(r, 'input_image'); case 'output': return dim(r, 'output') + dim(r, 'output_image'); - case 'prefill': return dim(r, 'input') + dim(r, 'input_cache_write') + dim(r, 'input_image'); + case 'prefill': return dim(r, 'input') + dim(r, 'input_cache_write') + dim(r, 'input_cache_write_1h') + dim(r, 'input_image'); case 'cached': return dim(r, 'input_cache_read'); - case 'cacheCreation': return dim(r, 'input_cache_write'); + case 'cacheCreation': return dim(r, 'input_cache_write') + dim(r, 'input_cache_write_1h'); case 'cachedRate': case 'cacheHitRate': return 0; @@ -339,7 +338,7 @@ const aggregateTokenRecords = (records: readonly DisplayUsageRecord[], groupKey: detail.input += dim(r, 'input'); detail.output += dim(r, 'output'); detail.cacheRead += dim(r, 'input_cache_read'); - detail.cacheCreation += dim(r, 'input_cache_write'); + detail.cacheCreation += dim(r, 'input_cache_write') + dim(r, 'input_cache_write_1h'); detail.inputImage += dim(r, 'input_image'); detail.outputImage += dim(r, 'output_image'); detail.cost += r.cost; diff --git a/packages/gateway/migrations/0035_usage_input_cache_write_1h.sql b/packages/gateway/migrations/0035_usage_input_cache_write_1h.sql new file mode 100644 index 00000000..ebdc25e1 --- /dev/null +++ b/packages/gateway/migrations/0035_usage_input_cache_write_1h.sql @@ -0,0 +1,33 @@ +-- Widen the `usage.dimension` CHECK list to admit `input_cache_write_1h`. +-- +-- Anthropic's `extended-cache-ttl-2025-04-11` beta surfaces 1-hour cache +-- writes under `usage.cache_creation.ephemeral_1h_input_tokens`. Until now +-- we folded both 5m and 1h writes into the same `input_cache_write` bucket, +-- which under-bills 1h writes (priced at input × 2 vs. input × 1.25 for 5m). +-- Adding the dimension as a disjoint bucket requires recreating `usage` +-- because SQLite cannot alter a CHECK constraint in place. +-- +-- `usage_requests` is untouched: it does not carry a dimension column. + +CREATE TABLE usage_new ( + key_id TEXT NOT NULL, + model TEXT NOT NULL, + upstream TEXT, + model_key TEXT NOT NULL, + hour TEXT NOT NULL, + dimension TEXT NOT NULL CHECK (dimension IN ( + 'input', 'input_cache_read', 'input_cache_write', 'input_cache_write_1h', 'input_image', 'output', 'output_image' + )), + tokens INTEGER NOT NULL DEFAULT 0, + unit_price REAL +); + +INSERT INTO usage_new (key_id, model, upstream, model_key, hour, dimension, tokens, unit_price) + SELECT key_id, model, upstream, model_key, hour, dimension, tokens, unit_price FROM usage; + +DROP TABLE usage; +ALTER TABLE usage_new RENAME TO usage; + +CREATE UNIQUE INDEX idx_usage_dimension_identity + ON usage (key_id, model, COALESCE(upstream, ''), model_key, hour, dimension); +CREATE INDEX idx_usage_dimension_hour ON usage (hour); diff --git a/packages/gateway/src/control-plane/schemas.ts b/packages/gateway/src/control-plane/schemas.ts index 341ca347..cc428903 100644 --- a/packages/gateway/src/control-plane/schemas.ts +++ b/packages/gateway/src/control-plane/schemas.ts @@ -74,6 +74,7 @@ const upstreamModelSchema = z.object({ output: z.number().optional(), input_cache_read: z.number().optional(), input_cache_write: z.number().optional(), + input_cache_write_1h: z.number().optional(), input_image: z.number().optional(), output_image: z.number().optional(), }).optional(), diff --git a/packages/gateway/src/data-plane/llm/messages/events/reassemble.ts b/packages/gateway/src/data-plane/llm/messages/events/reassemble.ts index 70b6c026..71cea87e 100644 --- a/packages/gateway/src/data-plane/llm/messages/events/reassemble.ts +++ b/packages/gateway/src/data-plane/llm/messages/events/reassemble.ts @@ -100,6 +100,7 @@ const applyMessagesUsage = (usage: MessagesUsage, update: Partial if (update.cache_read_input_tokens != null) { usage.cache_read_input_tokens = update.cache_read_input_tokens; } + if (update.cache_creation != null) usage.cache_creation = update.cache_creation; if (update.service_tier != null) usage.service_tier = update.service_tier; if (update.server_tool_use != null) { usage.server_tool_use = update.server_tool_use; diff --git a/packages/gateway/src/data-plane/llm/messages/respond.ts b/packages/gateway/src/data-plane/llm/messages/respond.ts index d679428a..0008704c 100644 --- a/packages/gateway/src/data-plane/llm/messages/respond.ts +++ b/packages/gateway/src/data-plane/llm/messages/respond.ts @@ -77,14 +77,23 @@ export const respondMessages = async ( }; // Anthropic already reports disjoint token counts: input_tokens excludes the -// cache figures. Map them straight onto the billing dimensions without summing. -const tokenUsageFromMessagesUsage = (u: MessagesUsageLike) => - tokenUsage({ +// cache figures. Map them straight onto the billing dimensions without +// summing. When the upstream emits the `cache_creation` sub-object +// (extended-cache-ttl-2025-04-11), split the per-TTL counts onto the 5m and +// 1h dimensions; the flat `cache_creation_input_tokens` is the sum and is +// only consulted when the sub-object is absent. +const tokenUsageFromMessagesUsage = (u: MessagesUsageLike) => { + const cacheWrite5m = u.cache_creation?.ephemeral_5m_input_tokens; + const cacheWrite1h = u.cache_creation?.ephemeral_1h_input_tokens; + const cacheWriteRolledUp = u.cache_creation_input_tokens ?? 0; + return tokenUsage({ input: u.input_tokens ?? 0, input_cache_read: u.cache_read_input_tokens ?? 0, - input_cache_write: u.cache_creation_input_tokens ?? 0, + input_cache_write: cacheWrite5m ?? cacheWriteRolledUp, + input_cache_write_1h: cacheWrite1h ?? 0, output: u.output_tokens, }); +}; export const createMessagesStreamUsageState = () => ({ current: tokenUsage({}), @@ -102,7 +111,7 @@ export const tokenUsageFromMessagesFrame = (frame: ProtocolFrame 0; + state.gotInputFromStart ||= (state.current.input ?? 0) + (state.current.input_cache_read ?? 0) + (state.current.input_cache_write ?? 0) + (state.current.input_cache_write_1h ?? 0) > 0; } if (event.type === 'message_delta' && event.usage) { if (!state.gotInputFromStart && event.usage.input_tokens !== undefined) { diff --git a/packages/gateway/src/data-plane/llm/messages/respond_test.ts b/packages/gateway/src/data-plane/llm/messages/respond_test.ts index e59bd40b..91b18629 100644 --- a/packages/gateway/src/data-plane/llm/messages/respond_test.ts +++ b/packages/gateway/src/data-plane/llm/messages/respond_test.ts @@ -139,3 +139,69 @@ test('Messages stream usage keeps cache-only start when a later delta carries in output: 50, }); }); + +test('Messages stream usage splits cache_creation per-TTL when the sub-object is present', () => { + const state = createMessagesStreamUsageState(); + + tokenUsageFromMessagesFrame( + eventFrame({ + type: 'message_start', + message: { + id: 'msg_1', + type: 'message', + role: 'assistant', + content: [], + model: 'claude-opus-4-8', + stop_reason: null, + stop_sequence: null, + usage: { + input_tokens: 12, + output_tokens: 1, + // The flat field is the sum of both sub-buckets and is consulted + // only as a fallback. With the sub-object present the per-TTL split + // must take precedence — otherwise this row would double-count. + cache_creation_input_tokens: 9, + cache_creation: { ephemeral_5m_input_tokens: 4, ephemeral_1h_input_tokens: 5 }, + cache_read_input_tokens: 3, + }, + }, + } satisfies MessagesStreamEvent), + state, + ); + + assertEquals(tokenUsageFromMessagesFrame(stop(), state), { + input: 12, + input_cache_read: 3, + input_cache_write: 4, + input_cache_write_1h: 5, + output: 1, + }); +}); + +test('Messages stream usage falls back to the rolled-up cache_creation when the sub-object is absent', () => { + const state = createMessagesStreamUsageState(); + + tokenUsageFromMessagesFrame( + eventFrame({ + type: 'message_start', + message: { + id: 'msg_1', + type: 'message', + role: 'assistant', + content: [], + model: 'claude-sonnet-4-6', + stop_reason: null, + stop_sequence: null, + usage: { input_tokens: 12, output_tokens: 1, cache_creation_input_tokens: 9, cache_read_input_tokens: 3 }, + }, + } satisfies MessagesStreamEvent), + state, + ); + + assertEquals(tokenUsageFromMessagesFrame(stop(), state), { + input: 12, + input_cache_read: 3, + input_cache_write: 9, + output: 1, + }); +}); diff --git a/packages/protocols/src/common/models.ts b/packages/protocols/src/common/models.ts index 9634cc05..a26fe8f7 100644 --- a/packages/protocols/src/common/models.ts +++ b/packages/protocols/src/common/models.ts @@ -1,17 +1,22 @@ // Disjoint billing dimensions a single request can be charged on. Every count // keyed by these is non-overlapping: a prompt token is counted under exactly -// one of `input`, `input_cache_read`, `input_cache_write`, or `input_image`, -// never several at once. +// one of `input`, `input_cache_read`, `input_cache_write`, +// `input_cache_write_1h`, or `input_image`, never several at once. // // Convention borrowed from models.dev and LiteLLM: bare `input`/`output` mean // the text modality AND act as the fallback rate for any modality without a // dedicated rate; the `_image` variants are the image modality. There are no // image cache dimensions on purpose — a live probe of Azure gpt-image-2 // confirmed its usage object never emits cached fields. -export type BillingDimension = 'input' | 'input_cache_read' | 'input_cache_write' | 'input_image' | 'output' | 'output_image'; +// +// `input_cache_write` is the 5-minute (default) TTL bucket; `input_cache_write_1h` +// is the explicit 1-hour bucket Anthropic surfaces under +// `cache_creation.ephemeral_1h_input_tokens` (extended-cache-ttl-2025-04-11). +// They are disjoint subsets of `cache_creation_input_tokens`. +export type BillingDimension = 'input' | 'input_cache_read' | 'input_cache_write' | 'input_cache_write_1h' | 'input_image' | 'output' | 'output_image'; // Iteration form of BillingDimension; the type union is the source of truth. -export const BILLING_DIMENSIONS: readonly BillingDimension[] = ['input', 'input_cache_read', 'input_cache_write', 'input_image', 'output', 'output_image']; +export const BILLING_DIMENSIONS: readonly BillingDimension[] = ['input', 'input_cache_read', 'input_cache_write', 'input_cache_write_1h', 'input_image', 'output', 'output_image']; // Per-model pricing in USD per million tokens, aligned with the sst/models.dev // `Cost` schema (https://github.com/sst/models.dev/blob/main/packages/core/src/schema.ts). @@ -22,9 +27,11 @@ export type ModelPricing = Partial>; // Resolve the USD-per-million-tokens unit price for one dimension against a // pricing snapshot, applying the LiteLLM-style fallback chain: a modality with -// no dedicated rate falls back to the bare text rate, and cached input falls -// back to uncached input. Returns null when even the fallback base is absent -// (or the whole snapshot is null), which aggregation treats as cost 0. +// no dedicated rate falls back to the bare text rate, cached input falls back +// to uncached input, and the 1-hour cache write falls back to the 5-minute +// cache write before reaching uncached input. Returns null when even the +// fallback base is absent (or the whole snapshot is null), which aggregation +// treats as cost 0. export const unitPriceForDimension = (pricing: ModelPricing | null, dimension: BillingDimension): number | null => { if (!pricing) return null; switch (dimension) { @@ -34,6 +41,8 @@ export const unitPriceForDimension = (pricing: ModelPricing | null, dimension: B return pricing.input_cache_read ?? pricing.input ?? null; case 'input_cache_write': return pricing.input_cache_write ?? pricing.input ?? null; + case 'input_cache_write_1h': + return pricing.input_cache_write_1h ?? pricing.input_cache_write ?? pricing.input ?? null; case 'input_image': return pricing.input_image ?? pricing.input ?? null; case 'output': diff --git a/packages/protocols/src/common/models_test.ts b/packages/protocols/src/common/models_test.ts new file mode 100644 index 00000000..ffe10d8c --- /dev/null +++ b/packages/protocols/src/common/models_test.ts @@ -0,0 +1,35 @@ +import { test } from 'vitest'; + +import { unitPriceForDimension } from './models.ts'; +import { assertEquals } from '../test-assert.ts'; + +test('unitPriceForDimension returns null when pricing snapshot is null', () => { + assertEquals(unitPriceForDimension(null, 'input'), null); + assertEquals(unitPriceForDimension(null, 'input_cache_write_1h'), null); +}); + +test('unitPriceForDimension prefers the dimension-specific rate', () => { + const pricing = { input: 1, input_cache_read: 0.1, input_cache_write: 1.25, input_cache_write_1h: 2, output: 5 }; + assertEquals(unitPriceForDimension(pricing, 'input'), 1); + assertEquals(unitPriceForDimension(pricing, 'input_cache_read'), 0.1); + assertEquals(unitPriceForDimension(pricing, 'input_cache_write'), 1.25); + assertEquals(unitPriceForDimension(pricing, 'input_cache_write_1h'), 2); + assertEquals(unitPriceForDimension(pricing, 'output'), 5); +}); + +test('unitPriceForDimension falls input_cache_write_1h back to input_cache_write before reaching input', () => { + // 1h -> 5m -> input. When only 5m is defined, 1h reuses the 5m rate + // rather than skipping straight to the bare input rate. + const pricing = { input: 1, input_cache_write: 1.25 }; + assertEquals(unitPriceForDimension(pricing, 'input_cache_write_1h'), 1.25); +}); + +test('unitPriceForDimension falls input_cache_write_1h all the way back to input when neither cache write is set', () => { + const pricing = { input: 1 }; + assertEquals(unitPriceForDimension(pricing, 'input_cache_write_1h'), 1); +}); + +test('unitPriceForDimension returns null when the fallback chain is empty', () => { + assertEquals(unitPriceForDimension({}, 'input_cache_write_1h'), null); + assertEquals(unitPriceForDimension({ output: 5 }, 'input_cache_write_1h'), null); +}); diff --git a/packages/protocols/src/messages/index.ts b/packages/protocols/src/messages/index.ts index dc4c0243..094955a5 100644 --- a/packages/protocols/src/messages/index.ts +++ b/packages/protocols/src/messages/index.ts @@ -225,6 +225,14 @@ export interface MessagesUsage { output_tokens: number; cache_creation_input_tokens?: number; cache_read_input_tokens?: number; + // Per-TTL split for cache writes introduced by extended-cache-ttl-2025-04-11. + // Each `ephemeral_*` field is a disjoint subset of `cache_creation_input_tokens` + // (the legacy flat field is the sum of both); upstreams that have not opted + // into the beta omit `cache_creation` entirely and emit only the flat field. + cache_creation?: { + ephemeral_5m_input_tokens?: number; + ephemeral_1h_input_tokens?: number; + }; service_tier?: 'standard' | 'priority' | 'batch'; server_tool_use?: MessagesUsageServerToolUse; } @@ -300,6 +308,10 @@ export interface MessagesMessageDeltaEvent { output_tokens: number; cache_creation_input_tokens?: number; cache_read_input_tokens?: number; + cache_creation?: { + ephemeral_5m_input_tokens?: number; + ephemeral_1h_input_tokens?: number; + }; server_tool_use?: MessagesUsageServerToolUse; }; } diff --git a/packages/provider-custom/src/fetch-models.ts b/packages/provider-custom/src/fetch-models.ts index ab9ddefa..40d56774 100644 --- a/packages/provider-custom/src/fetch-models.ts +++ b/packages/provider-custom/src/fetch-models.ts @@ -11,7 +11,7 @@ import type { CustomUpstreamConfig } from './config.ts'; import { customFetchModels } from './fetch.ts'; -import type { ModelKind, ModelPricing } from '@floway-dev/protocols/common'; +import { BILLING_DIMENSIONS, type ModelKind, type ModelPricing } from '@floway-dev/protocols/common'; import { fetchUpstreamModels, type Fetcher } from '@floway-dev/provider'; export interface CustomRawModel { @@ -58,14 +58,12 @@ const parseLimits = (value: unknown): CustomRawModel['limits'] => { return Object.keys(limits).length > 0 ? limits : undefined; }; -const PRICING_DIMENSIONS: readonly (keyof ModelPricing)[] = ['input', 'input_cache_read', 'input_cache_write', 'input_image', 'output', 'output_image']; - const parseCost = (value: unknown): ModelPricing | undefined => { // Admit any subset of billing dimensions advertised on the upstream's // /v1/models cost block; drop the whole block when none are present. if (!isRecord(value)) return undefined; const cost: ModelPricing = {}; - for (const dimension of PRICING_DIMENSIONS) { + for (const dimension of BILLING_DIMENSIONS) { const rate = optionalNumberField(value[dimension]); if (rate !== undefined) cost[dimension] = rate; } diff --git a/packages/provider/src/model-config.ts b/packages/provider/src/model-config.ts index c3804052..da1692d4 100644 --- a/packages/provider/src/model-config.ts +++ b/packages/provider/src/model-config.ts @@ -1,5 +1,5 @@ import { isKnownFlagId } from './flags.ts'; -import type { ModelEndpointKey, ModelEndpoints, ModelKind, ModelPricing } from '@floway-dev/protocols/common'; +import { BILLING_DIMENSIONS, type ModelEndpointKey, type ModelEndpoints, type ModelKind, type ModelPricing } from '@floway-dev/protocols/common'; import { kindForEndpoints } from '@floway-dev/protocols/common'; export interface UpstreamModelLimits { @@ -120,13 +120,11 @@ const nonNegativeNumberField = (value: unknown, label: string): number => { return value; }; -const PRICING_DIMENSIONS: readonly (keyof ModelPricing)[] = ['input', 'input_cache_read', 'input_cache_write', 'input_image', 'output', 'output_image']; - export const pricingField = (value: unknown, label: string): ModelPricing | undefined => { const record = optionalMetadataRecord(value, label); if (!record) return undefined; const pricing: ModelPricing = {}; - for (const dimension of PRICING_DIMENSIONS) { + for (const dimension of BILLING_DIMENSIONS) { if (record[dimension] !== undefined) pricing[dimension] = nonNegativeNumberField(record[dimension], `${label}.${dimension}`); } return Object.keys(pricing).length > 0 ? pricing : undefined;