Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions apps/web/src/api/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,11 @@ export interface ModelEndpoints {

export type ModelEndpointKey = keyof ModelEndpoints;

// USD per million tokens, keyed by billing dimension.
export type BillingDimension = 'input' | 'input_cache_read' | 'input_cache_write' | 'input_cache_write_1h' | 'input_image' | 'output' | 'output_image';
export type ModelPricing = Partial<Record<BillingDimension, number>>;
// USD per million tokens, keyed by billing dimension. Imported from the
// gateway so the dashboard's pricing form stays locked to the same definition
// the backend writes against — same pattern as `ProxyRecord` below.
import type { BillingDimension, ModelPricing } from '@floway-dev/gateway/control-plane/pricing/types';
export type { BillingDimension, ModelPricing };

export interface UpstreamModelConfig {
upstreamModelId: string;
Expand Down
285 changes: 273 additions & 12 deletions apps/web/src/components/upstream-edit/ModelEditor.vue

Large diffs are not rendered by default.

56 changes: 56 additions & 0 deletions packages/gateway/migrations/0036_usage_tier_column.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
-- Add the per-request service tier column to `usage` + `usage_requests`.
--
-- `tier` is the upstream-stamped service-tier marker (Anthropic `usage.speed`,
-- OpenAI `usage.service_tier`). It participates in bucket identity so a model
-- billed at multiple tiers in one hour aggregates as separate buckets with
-- distinct unit prices; recording writes NULL for base-tier requests and a
-- non-empty string otherwise. The unique index uses `COALESCE(tier, '')`
-- because SQLite treats NULLs as distinct under UNIQUE.
--
-- SQLite cannot add a column to the middle of a UNIQUE INDEX in place, so
-- both tables are recreated. Existing rows backfill `tier = NULL`, which the
-- aggregator treats as base pricing — historical buckets compute identically.

-- ---------------------------------------------------------------------------
-- 1. `usage`: one row per (bucket, billing dimension) holding a token count
--    and a unit price. Rebuilt with a nullable `tier` column after `hour`.
-- ---------------------------------------------------------------------------
CREATE TABLE usage_new (
key_id TEXT NOT NULL,
model TEXT NOT NULL,
upstream TEXT,
model_key TEXT NOT NULL,
hour TEXT NOT NULL,
tier TEXT,
dimension TEXT NOT NULL CHECK (dimension IN (
'input', 'input_cache_read', 'input_cache_write', 'input_cache_write_1h', 'input_image', 'output', 'output_image'
)),
tokens INTEGER NOT NULL DEFAULT 0,
unit_price REAL
);

-- Copy every existing row across unchanged, stamping `tier` as NULL.
INSERT INTO usage_new (key_id, model, upstream, model_key, hour, tier, dimension, tokens, unit_price)
SELECT key_id, model, upstream, model_key, hour, NULL, dimension, tokens, unit_price FROM usage;

-- Swap the rebuilt table in under the original name. DROP TABLE also removes
-- any indexes attached to the old table, so the indexes are created afresh
-- below.
DROP TABLE usage;
ALTER TABLE usage_new RENAME TO usage;

-- Bucket identity. The nullable `upstream` and `tier` columns are folded to ''
-- so two rows that are NULL in either column still collide under UNIQUE.
CREATE UNIQUE INDEX idx_usage_dimension_identity
ON usage (key_id, model, COALESCE(upstream, ''), model_key, hour, COALESCE(tier, ''), dimension);
-- Plain (non-unique) index on `hour`; presumably serves time-range reads —
-- confirm against the query layer.
CREATE INDEX idx_usage_dimension_hour ON usage (hour);

-- ---------------------------------------------------------------------------
-- 2. `usage_requests`: one row per bucket holding a request count. Same
--    rebuild as above; the identity has no `dimension` column.
-- ---------------------------------------------------------------------------
CREATE TABLE usage_requests_new (
key_id TEXT NOT NULL,
model TEXT NOT NULL,
upstream TEXT,
model_key TEXT NOT NULL,
hour TEXT NOT NULL,
tier TEXT,
requests INTEGER NOT NULL DEFAULT 0
);

-- Copy every existing row across unchanged, stamping `tier` as NULL.
INSERT INTO usage_requests_new (key_id, model, upstream, model_key, hour, tier, requests)
SELECT key_id, model, upstream, model_key, hour, NULL, requests FROM usage_requests;

-- Swap the rebuilt table in under the original name.
DROP TABLE usage_requests;
ALTER TABLE usage_requests_new RENAME TO usage_requests;

-- Same NULL-folding identity as `usage`, without the `dimension` column.
CREATE UNIQUE INDEX idx_usage_requests_identity
ON usage_requests (key_id, model, COALESCE(upstream, ''), model_key, hour, COALESCE(tier, ''));
-- Plain (non-unique) index on `hour`; presumably serves time-range reads —
-- confirm against the query layer.
CREATE INDEX idx_usage_requests_hour ON usage_requests (hour);
1 change: 1 addition & 0 deletions packages/gateway/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
".": { "import": "./src/index.ts", "types": "./src/index.ts" },
"./app-type": { "types": "./src/app.ts" },
"./control-plane/proxies/serialize": { "types": "./src/control-plane/proxies/serialize.ts" },
"./control-plane/pricing/types": { "types": "./src/control-plane/pricing/types.ts" },
"./data-plane/tools/web-search/types": {
"import": "./src/data-plane/tools/web-search/types.ts",
"types": "./src/data-plane/tools/web-search/types.ts"
Expand Down
5 changes: 5 additions & 0 deletions packages/gateway/src/app-control_test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ test('/api/token-usage scopes to the actor\'s keys when called with an API key',
upstream: null,
modelKey: 'claude-sonnet-4',
hour: '2026-03-15T10',
tier: null,
requests: 2,
tokens: { input: 10, output: 5, input_cache_read: 4, input_cache_write: 1 },
cost: null,
Expand All @@ -118,6 +119,7 @@ test('/api/token-usage scopes to the actor\'s keys when called with an API key',
upstream: null,
modelKey: 'gpt-5',
hour: '2026-03-15T11',
tier: null,
requests: 1,
tokens: { input: 20, output: 8, input_cache_read: 6, input_cache_write: 2 },
cost: null,
Expand Down Expand Up @@ -155,6 +157,7 @@ test('/api/token-usage in self-by-key mode includes per-key metadata for the act
upstream: null,
modelKey: 'gpt-5',
hour: '2026-03-16T10',
tier: null,
requests: 1,
tokens: { input: 20, output: 8 },
cost: null,
Expand Down Expand Up @@ -182,6 +185,7 @@ test('/api/token-usage all-by-user view aggregates across keys per user', async
upstream: null,
modelKey: 'gpt-5',
hour: '2026-03-15T10',
tier: null,
requests: 1,
tokens: { input: 10, output: 5 },
cost: null,
Expand Down Expand Up @@ -213,6 +217,7 @@ test('/api/token-usage merges Claude variants into backend base model records',
keyId: apiKey.id,
hour: '2026-03-17T10',
upstream: 'copilot:1',
tier: null,
requests: 1,
tokens: { input: 10, output: 5, input_cache_read: 2, input_cache_write: 1 },
};
Expand Down
11 changes: 11 additions & 0 deletions packages/gateway/src/control-plane/data-transfer/routes.ts
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,16 @@ const parseUsageRecords = (value: unknown): { type: 'ok'; records: UsageRecord[]
if (typeof record.upstream === 'string' && isLegacyUpstreamIdentity(record.upstream)) {
return { type: 'invalid', index: i, error: 'upstream must use a raw upstream id, not a legacy provider-prefixed identity' };
}
if (record.tier !== undefined && record.tier !== null && typeof record.tier !== 'string') {
return { type: 'invalid', index: i, error: 'tier, when present, must be a string or null' };
}
if (record.tier === '') {
return { type: 'invalid', index: i, error: 'tier must be a non-empty string or null/absent' };
}
// Empty-string is rejected rather than normalized to null: the unique
// index folds NULL/'' under COALESCE, so a '' import would silently
// merge with base-tier rows.
const tier: string | null = typeof record.tier === 'string' ? record.tier : null;
const tokensResult = parseImportedTokens(record.tokens);
if (tokensResult.type === 'invalid') return { type: 'invalid', index: i, error: 'record has invalid token dimension counts' };
const costResult = parseImportedCost(record.cost);
Expand All @@ -412,6 +422,7 @@ const parseUsageRecords = (value: unknown): { type: 'ok'; records: UsageRecord[]
upstream: record.upstream as string | null,
modelKey: record.modelKey,
hour: record.hour,
tier,
requests: record.requests,
tokens: tokensResult.tokens,
cost: costResult.cost,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ const USAGE_1: UsageRecord = {
upstream: 'up_copilot_a',
modelKey: 'claude-opus-4.7',
hour: '2026-01-01T10',
tier: 'fast',
requests: 5,
tokens: { input: 1000, output: 500, input_cache_read: 120, input_cache_write: 80 },
cost: null,
Expand All @@ -188,6 +189,7 @@ const USAGE_2: UsageRecord = {
upstream: 'up_azure_a',
modelKey: 'gpt-prod',
hour: '2026-01-01T11',
tier: null,
requests: 3,
tokens: { input: 2000, output: 800, input_cache_read: 200, input_cache_write: 50 },
cost: null,
Expand Down
1 change: 1 addition & 0 deletions packages/gateway/src/control-plane/pricing/types.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
// Type-only re-export of the shared pricing types from
// `@floway-dev/protocols/common`, making them importable through a
// gateway-owned module path. Nothing is emitted at runtime.
export type { BillingDimension, ModelPricing } from '@floway-dev/protocols/common';
28 changes: 21 additions & 7 deletions packages/gateway/src/control-plane/schemas.ts
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,18 @@ const modelEndpointsSchema = z.object({
imagesEdits: z.object({}).optional(),
});

// One optional, non-negative rate per billing dimension. Both the base
// pricing object and each per-tier overlay are built from this shape, which
// keeps their dimension sets identical.
// A factory (rather than one shared schema) so every key owns its own
// schema instance.
const optionalNonNegativeRate = () => z.number().nonnegative().optional();
const pricingDimensionShape = {
  input: optionalNonNegativeRate(),
  output: optionalNonNegativeRate(),
  input_cache_read: optionalNonNegativeRate(),
  input_cache_write: optionalNonNegativeRate(),
  input_cache_write_1h: optionalNonNegativeRate(),
  input_image: optionalNonNegativeRate(),
  output_image: optionalNonNegativeRate(),
};

// Mirrors the runtime UpstreamModelConfig in @floway-dev/provider.
// Azure and custom upstreams share this per-model entry; the canonical
// per-model endpoint validation lives in the runtime validator.
Expand All @@ -70,13 +82,15 @@ const upstreamModelSchema = z.object({
endpoints: modelEndpointsSchema,
display_name: z.string().optional(),
cost: z.object({
input: z.number().optional(),
output: z.number().optional(),
input_cache_read: z.number().optional(),
input_cache_write: z.number().optional(),
input_cache_write_1h: z.number().optional(),
input_image: z.number().optional(),
output_image: z.number().optional(),
...pricingDimensionShape,
// See ModelPricing.tiers in @floway-dev/protocols/common for semantics.
tiers: z.record(
z.string().min(1),
z.object(pricingDimensionShape).refine(
t => Object.values(t).some(v => v !== undefined),
{ message: 'tier overlay must declare at least one rate' },
),
).optional(),
}).optional(),
flagOverrides: z.object({
enabled: z.boolean(),
Expand Down
5 changes: 4 additions & 1 deletion packages/gateway/src/control-plane/token-usage/aggregate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@ export interface DisplayUsageByUserRecord {

// Cost is pure addition over the dimension rows: Σ tokens × unit_price / 1e6.
// No subtraction is needed because the counts are disjoint and each dimension
// already carries its own resolved unit price snapshot.
// already carries its own resolved unit price snapshot. `record.cost` here
// is the per-row reconstruction of the per-dimension `unit_price` columns
// the repo writer already folded the bucket's tier into — so the dimension
// lookup is a direct hit, no tier resolution needed at read time.
const recordCostUsd = (record: UsageRecord): number => {
let total = 0;
for (const dimension of BILLING_DIMENSIONS) {
Expand Down
28 changes: 28 additions & 0 deletions packages/gateway/src/control-plane/token-usage/aggregate_test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ const baseRecord = (overrides: Partial<UsageRecord>): UsageRecord => ({
model: 'claude-opus-4-7',
upstream: 'up_copilot',
modelKey: 'claude-opus-4-7',
tier: null,
requests: 1,
tokens: { input: 100, output: 50 },
cost: opus47Pricing,
Expand Down Expand Up @@ -83,3 +84,30 @@ test('aggregateUsageForDisplay charges image dimensions separately', () => {
// 10 + 5 + 40 + 30 = $85.
assertAlmostEquals(out[0].cost, 85, 1e-9);
});

test('aggregateUsageForDisplay reads unit prices from the already-folded cost the repo writer hands back', () => {
  // By the time a UsageRecord reaches the aggregator, the repo write path
  // (`repo/sql.ts:dimensionRows`, `repo/memory.ts:dimensionEntries`) has
  // already resolved the bucket's tier into per-dimension unit prices. The
  // `cost` field is therefore the effective pricing for that bucket and the
  // aggregator performs no tier resolution of its own. The two records below
  // model that post-write shape: one fast-tier bucket, one base-tier bucket.
  // Opus 4.8: standard $5 / $25, fast $10 / $50.
  const cases = [
    {
      // 1M * $10 + 1M * $50 = $60.
      record: baseRecord({
        tier: 'fast',
        cost: { input: 10, output: 50 },
        tokens: { input: 1_000_000, output: 1_000_000 },
      }),
      expectedUsd: 60,
    },
    {
      // 1M * $5 + 1M * $25 = $30.
      record: baseRecord({
        tier: null,
        cost: { input: 5, output: 25 },
        tokens: { input: 1_000_000, output: 1_000_000 },
      }),
      expectedUsd: 30,
    },
  ];

  // Each record is aggregated on its own so the totals never mix tiers.
  for (const { record, expectedUsd } of cases) {
    const [row] = aggregateUsageForDisplay([record]);
    assertAlmostEquals(row.cost, expectedUsd, 1e-9);
  }
});
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ const seedUsage = async (
upstream: 'up_test',
modelKey: model,
hour,
tier: null,
requests,
tokens: { input: 100, output: 50 },
cost: null,
Expand Down
23 changes: 4 additions & 19 deletions packages/gateway/src/data-plane/llm/chat-completions/respond.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@ import { streamSSE } from 'hono/streaming';

import { CHAT_COMPLETIONS_MISSING_TERMINAL_MESSAGE, collectChatCompletionsProtocolEventsToResult } from './events/to-result.ts';
import { chatCompletionsProtocolFrameToSSEFrame } from './events/to-sse.ts';
import { tokenUsage } from '../../shared/telemetry/usage.ts';
import { tokenUsageFromChatCompletionsUsage } from './usage.ts';
import type { GatewayCtx } from '../shared/gateway-ctx.ts';
import { SourceStreamState, eventResultMetadata, forwardUpstreamHeaders, mergeForwardedUpstreamHeaders, plainResultToResponse, recordPerformance, recordUsage } from '../shared/respond.ts';
import { type StreamCompletion, writeSSEFrames } from '../shared/stream/sse.ts';
import type { ChatCompletionsStreamEvent, ChatCompletionsResult } from '@floway-dev/protocols/chat-completions';
import type { ChatCompletionsStreamEvent } from '@floway-dev/protocols/chat-completions';
import { chatCompletionsErrorPayloadMessage } from '@floway-dev/protocols/chat-completions';
import { type ProtocolFrame, sseCommentFrame, sseFrame } from '@floway-dev/protocols/common';
import { type ExecuteResult, type PlainResult, type InternalDebugError, toInternalDebugError } from '@floway-dev/provider';
Expand Down Expand Up @@ -44,7 +44,7 @@ export const respondChatCompletions = async (
try {
const response = await collectChatCompletionsProtocolEventsToResult(frames);
const metadata = await eventResultMetadata(result);
const usage = response.usage ? tokenUsageFromChatCompletionsUsage(response.usage) : null;
const usage = response.usage ? tokenUsageFromChatCompletionsUsage(response.usage, response.service_tier) : null;
await recordUsage(ctx, metadata.modelIdentity, usage);
recordPerformance(ctx, metadata.performance, state.failed);
return { success: true, response: Response.json(response, { headers: mergeForwardedUpstreamHeaders(undefined, result.headers) }) };
Expand Down Expand Up @@ -77,21 +77,6 @@ export const respondChatCompletions = async (
return { success: true, response };
};

// --- token usage ---

// OpenAI Chat usage reports prompt_tokens inclusive of cached and
// cache-creation tokens; subtract them to recover the disjoint bare input.
const tokenUsageFromChatCompletionsUsage = (u: NonNullable<ChatCompletionsResult['usage']>) => {
const cacheRead = u.prompt_tokens_details?.cached_tokens ?? 0;
const cacheWrite = u.prompt_tokens_details?.cache_creation_input_tokens ?? 0;
return tokenUsage({
input: u.prompt_tokens - cacheRead - cacheWrite,
input_cache_read: cacheRead,
input_cache_write: cacheWrite,
output: u.completion_tokens,
});
};

// --- error rendering ---

const internalChatCompletionsErrorPayload = (error: InternalDebugError) => ({
Expand Down Expand Up @@ -119,7 +104,7 @@ const observeChatCompletionsFrames = async function* (frames: AsyncIterable<Prot
const failed = isChatCompletionsFailureFrame(frame);
if (failed) state.failed = true;
if (observeUsage) {
state.rememberUsage(frame.type === 'event' && Array.isArray(frame.event.choices) && frame.event.choices.length === 0 && frame.event.usage ? tokenUsageFromChatCompletionsUsage(frame.event.usage) : null);
state.rememberUsage(frame.type === 'event' && Array.isArray(frame.event.choices) && frame.event.choices.length === 0 && frame.event.usage ? tokenUsageFromChatCompletionsUsage(frame.event.usage, frame.event.service_tier) : null);
}
if (isChatCompletionsTerminalFrame(frame) && !failed) state.completed = true;
yield frame;
Expand Down
19 changes: 19 additions & 0 deletions packages/gateway/src/data-plane/llm/chat-completions/usage.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import { billableServiceTier, tokenUsage } from '../../shared/telemetry/usage.ts';
import type { ChatCompletionsResult } from '@floway-dev/protocols/chat-completions';

// Converts an OpenAI Chat Completions `usage` block into the gateway's token
// usage shape. `prompt_tokens` is reported inclusive of cached and
// cache-creation tokens, so both are subtracted to leave the disjoint bare
// input count. The response's top-level `service_tier` echoes the tier that
// actually processed the request; it is passed through `billableServiceTier`
// so per-tier pricing overrides resolve at recording time.
// https://developers.openai.com/api/docs/guides/priority-processing
export const tokenUsageFromChatCompletionsUsage = (u: NonNullable<ChatCompletionsResult['usage']>, serviceTier: string | null | undefined) => {
  const details = u.prompt_tokens_details;
  // Either detail counter may be absent; treat a missing one as zero.
  const cachedReadTokens = details?.cached_tokens ?? 0;
  const cacheCreationTokens = details?.cache_creation_input_tokens ?? 0;
  const bareInputTokens = u.prompt_tokens - cachedReadTokens - cacheCreationTokens;

  return tokenUsage({
    input: bareInputTokens,
    input_cache_read: cachedReadTokens,
    input_cache_write: cacheCreationTokens,
    output: u.completion_tokens,
    tier: billableServiceTier(serviceTier),
  });
};
Loading